mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1373 more fixes to support supplementals
X-SVN-Rev: 7285
This commit is contained in:
parent
da0fef51a8
commit
c7903f1367
12 changed files with 334 additions and 302 deletions
|
@ -3,8 +3,8 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/NameUnicodeTransliterator.java,v $
|
||||
* $Date: 2001/11/21 20:56:50 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.6 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
@ -73,9 +73,10 @@ class NameUnicodeTransliterator extends Transliterator {
|
|||
int mode = 0;
|
||||
int ibuf = 0;
|
||||
int openPos = offsets.start; // position of openDelimiter
|
||||
|
||||
for (; cursor < limit; ++cursor) {
|
||||
char c = text.charAt(cursor);
|
||||
|
||||
int c;
|
||||
for (; cursor < limit; cursor+=UTF16.getCharCount(c)) {
|
||||
c = UTF16.charAt(text, cursor);
|
||||
|
||||
switch (mode) {
|
||||
case 0: // looking for open delimiter
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $
|
||||
* $Date: 2001/11/29 22:31:18 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -48,16 +48,20 @@ class StringMatcher implements UnicodeMatcher {
|
|||
int[] offset,
|
||||
int limit,
|
||||
boolean incremental) {
|
||||
// Note (1): We process text in 16-bit code units, rather than
|
||||
// 32-bit code points. This works because stand-ins are
|
||||
// always in the BMP and because we are doing a literal match
|
||||
// operation, which can be done 16-bits at a time.
|
||||
int i;
|
||||
int[] cursor = new int[] { offset[0] };
|
||||
if (limit < cursor[0]) {
|
||||
// Match in the reverse direction
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
if (subm == null) {
|
||||
if (cursor[0] >= limit &&
|
||||
keyChar == text.charAt(cursor[0])) {
|
||||
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
|
||||
--cursor[0];
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
|
@ -84,14 +88,14 @@ class StringMatcher implements UnicodeMatcher {
|
|||
// without completing our match.
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
if (subm == null) {
|
||||
// Don't need the cursor < limit check if
|
||||
// incremental is true (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (cursor[0] < limit &&
|
||||
keyChar == text.charAt(cursor[0])) {
|
||||
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
|
||||
++cursor[0];
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
|
@ -123,7 +127,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||
result.append('(');
|
||||
}
|
||||
for (int i=0; i<pattern.length(); ++i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher m = data.lookup(keyChar);
|
||||
if (m == null) {
|
||||
TransliterationRule.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
|
||||
|
|
|
@ -3,124 +3,128 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransformTransliterator.java,v $
|
||||
* $Date: 2001/11/17 20:45:35 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.4 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* An abstract class for transliterators based on a transform
|
||||
* operation. To create a transliterator that implements a
|
||||
* transformation, create a subclass of this class and implement the
|
||||
* abstract <code>transform()</code> and <code>hasTransform()</code>
|
||||
* methods.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
abstract class TransformTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Constructs a transliterator. For use by subclasses.
|
||||
*/
|
||||
protected TransformTransliterator(String id, UnicodeFilter f) {
|
||||
super(id, f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean incremental) {
|
||||
|
||||
int start;
|
||||
for (start = offsets.start; start < offsets.limit; ++start) {
|
||||
// Scan for the first character that is != its transform.
|
||||
// If there are none, we fall out without doing anything.
|
||||
char c = text.charAt(start);
|
||||
if (hasTransform(c)) {
|
||||
// There is a transforming character at start. Break
|
||||
// up the remaining string, from start to
|
||||
// offsets.limit, into segments of unfiltered and
|
||||
// filtered characters. Only transform the unfiltered
|
||||
// characters. As always, minimize the number of
|
||||
// calls to Replaceable.replace().
|
||||
|
||||
int len = offsets.limit - start;
|
||||
// assert(len >= 1);
|
||||
|
||||
char[] buf = new char[len];
|
||||
text.getChars(start, offsets.limit, buf, 0);
|
||||
|
||||
int segStart = 0;
|
||||
int segLimit;
|
||||
UnicodeFilter filt = getFilter();
|
||||
|
||||
// lenDelta is the accumulated length difference for
|
||||
// all transformed segments. It is new length - old
|
||||
// length.
|
||||
int lenDelta = 0;
|
||||
|
||||
// Set segStart, segLimit to the unfiltered segment
|
||||
// starting with start. If the filter is null, then
|
||||
// segStart/Limit will be set to the whole string,
|
||||
// that is, 0/len.
|
||||
do {
|
||||
// Set segLimit to the first filtered char at or
|
||||
// after segStart.
|
||||
segLimit = len;
|
||||
if (filt != null) {
|
||||
segLimit = segStart;
|
||||
while (segLimit < len && filt.contains(buf[segLimit])) {
|
||||
++segLimit;
|
||||
}
|
||||
}
|
||||
|
||||
// Transform the unfiltered chars between segStart
|
||||
// and segLimit.
|
||||
int segLen = segLimit - segStart;
|
||||
if (segLen != 0) {
|
||||
String newStr = transform(
|
||||
new String(buf, segStart, segLen));
|
||||
text.replace(start, start + segLen, newStr);
|
||||
start += newStr.length();
|
||||
lenDelta += newStr.length() - segLen;
|
||||
}
|
||||
|
||||
// Set segStart to the first unfiltered char at or
|
||||
// after segLimit.
|
||||
segStart = segLimit;
|
||||
if (filt != null) {
|
||||
while (segStart < len && !filt.contains(buf[segStart])) {
|
||||
++segStart;
|
||||
}
|
||||
}
|
||||
start += segStart - segLimit;
|
||||
|
||||
} while (segStart < len);
|
||||
|
||||
offsets.limit += lenDelta;
|
||||
offsets.contextLimit += lenDelta;
|
||||
offsets.start = offsets.limit;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// assert(start == offsets.limit);
|
||||
offsets.start = start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method to determine whether a
|
||||
* given character has a transform that is not equal to itself.
|
||||
* This is approximately equivalent to <code>c !=
|
||||
* transform(String.valueOf(c))</code>, where
|
||||
* <code>String.valueOf(c)</code> returns a String containing the
|
||||
* single character (not integer) <code>c</code>. Subclasses that
|
||||
* transform all their input can simply return <code>true</code>.
|
||||
*/
|
||||
protected abstract boolean hasTransform(int c);
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method to transform a string.
|
||||
*/
|
||||
protected abstract String transform(String s);
|
||||
abstract class TransformTransliterator {
|
||||
// Currently unused
|
||||
}
|
||||
|
||||
///**
|
||||
// * An abstract class for transliterators based on a transform
|
||||
// * operation. To create a transliterator that implements a
|
||||
// * transformation, create a subclass of this class and implement the
|
||||
// * abstract <code>transform()</code> and <code>hasTransform()</code>
|
||||
// * methods.
|
||||
// * @author Alan Liu
|
||||
// */
|
||||
//abstract class TransformTransliterator extends Transliterator {
|
||||
//
|
||||
// /**
|
||||
// * Constructs a transliterator. For use by subclasses.
|
||||
// */
|
||||
// protected TransformTransliterator(String id, UnicodeFilter f) {
|
||||
// super(id, f);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Implements {@link Transliterator#handleTransliterate}.
|
||||
// */
|
||||
// protected void handleTransliterate(Replaceable text,
|
||||
// Position offsets, boolean incremental) {
|
||||
//
|
||||
// int start;
|
||||
// for (start = offsets.start; start < offsets.limit; ++start) {
|
||||
// // Scan for the first character that is != its transform.
|
||||
// // If there are none, we fall out without doing anything.
|
||||
// char c = text.charAt(start);
|
||||
// if (hasTransform(c)) {
|
||||
// // There is a transforming character at start. Break
|
||||
// // up the remaining string, from start to
|
||||
// // offsets.limit, into segments of unfiltered and
|
||||
// // filtered characters. Only transform the unfiltered
|
||||
// // characters. As always, minimize the number of
|
||||
// // calls to Replaceable.replace().
|
||||
//
|
||||
// int len = offsets.limit - start;
|
||||
// // assert(len >= 1);
|
||||
//
|
||||
// char[] buf = new char[len];
|
||||
// text.getChars(start, offsets.limit, buf, 0);
|
||||
//
|
||||
// int segStart = 0;
|
||||
// int segLimit;
|
||||
// UnicodeFilter filt = getFilter();
|
||||
//
|
||||
// // lenDelta is the accumulated length difference for
|
||||
// // all transformed segments. It is new length - old
|
||||
// // length.
|
||||
// int lenDelta = 0;
|
||||
//
|
||||
// // Set segStart, segLimit to the unfiltered segment
|
||||
// // starting with start. If the filter is null, then
|
||||
// // segStart/Limit will be set to the whole string,
|
||||
// // that is, 0/len.
|
||||
// do {
|
||||
// // Set segLimit to the first filtered char at or
|
||||
// // after segStart.
|
||||
// segLimit = len;
|
||||
// if (filt != null) {
|
||||
// segLimit = segStart;
|
||||
// while (segLimit < len && filt.contains(buf[segLimit])) {
|
||||
// ++segLimit;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // Transform the unfiltered chars between segStart
|
||||
// // and segLimit.
|
||||
// int segLen = segLimit - segStart;
|
||||
// if (segLen != 0) {
|
||||
// String newStr = transform(
|
||||
// new String(buf, segStart, segLen));
|
||||
// text.replace(start, start + segLen, newStr);
|
||||
// start += newStr.length();
|
||||
// lenDelta += newStr.length() - segLen;
|
||||
// }
|
||||
//
|
||||
// // Set segStart to the first unfiltered char at or
|
||||
// // after segLimit.
|
||||
// segStart = segLimit;
|
||||
// if (filt != null) {
|
||||
// while (segStart < len && !filt.contains(buf[segStart])) {
|
||||
// ++segStart;
|
||||
// }
|
||||
// }
|
||||
// start += segStart - segLimit;
|
||||
//
|
||||
// } while (segStart < len);
|
||||
//
|
||||
// offsets.limit += lenDelta;
|
||||
// offsets.contextLimit += lenDelta;
|
||||
// offsets.start = offsets.limit;
|
||||
// return;
|
||||
// }
|
||||
// }
|
||||
// // assert(start == offsets.limit);
|
||||
// offsets.start = start;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Subclasses must implement this method to determine whether a
|
||||
// * given character has a transform that is not equal to itself.
|
||||
// * This is approximately equivalent to <code>c !=
|
||||
// * transform(String.valueOf(c))</code>, where
|
||||
// * <code>String.valueOf(c)</code> returns a String containing the
|
||||
// * single character (not integer) <code>c</code>. Subclasses that
|
||||
// * transform all their input can simply return <code>true</code>.
|
||||
// */
|
||||
// protected abstract boolean hasTransform(int c);
|
||||
//
|
||||
// /**
|
||||
// * Subclasses must implement this method to transform a string.
|
||||
// */
|
||||
// protected abstract String transform(String s);
|
||||
//}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||
* $Date: 2001/11/30 22:27:29 $
|
||||
* $Revision: 1.38 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.39 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -46,7 +46,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.38 $ $Date: 2001/11/30 22:27:29 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.39 $ $Date: 2001/12/03 21:33:58 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
|
@ -396,12 +396,17 @@ class TransliterationRule {
|
|||
// Backup oText by one
|
||||
oText = posBefore(text, pos.start);
|
||||
|
||||
// Note (1): We process text in 16-bit code units, rather than
|
||||
// 32-bit code points. This works because stand-ins are
|
||||
// always in the BMP and because we are doing a literal match
|
||||
// operation, which can be done 16-bits at a time.
|
||||
|
||||
for (oPattern=anteContextLength-1; oPattern>=0; --oPattern) {
|
||||
char keyChar = pattern.charAt(oPattern);
|
||||
char keyChar = pattern.charAt(oPattern); // See note (1)
|
||||
UnicodeMatcher matcher = data.lookup(keyChar);
|
||||
if (matcher == null) {
|
||||
if (oText >= pos.contextStart &&
|
||||
keyChar == text.charAt(oText)) {
|
||||
keyChar == text.charAt(oText)) { // See note (1)
|
||||
--oText;
|
||||
} else {
|
||||
return UnicodeMatcher.U_MISMATCH;
|
||||
|
@ -457,14 +462,14 @@ class TransliterationRule {
|
|||
// can match up to pos.contextLimit.
|
||||
int matchLimit = (oPattern < keyLength) ? pos.limit : pos.contextLimit;
|
||||
|
||||
char keyChar = pattern.charAt(anteContextLength + oPattern++);
|
||||
char keyChar = pattern.charAt(anteContextLength + oPattern++); // See note (1)
|
||||
UnicodeMatcher matcher = data.lookup(keyChar);
|
||||
if (matcher == null) {
|
||||
// Don't need the oText < pos.contextLimit check if
|
||||
// incremental is TRUE (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (oText < matchLimit &&
|
||||
keyChar == text.charAt(oText)) {
|
||||
keyChar == text.charAt(oText)) { // See note (1)
|
||||
++oText;
|
||||
} else {
|
||||
return UnicodeMatcher.U_MISMATCH;
|
||||
|
@ -716,6 +721,7 @@ class TransliterationRule {
|
|||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
for (int i=0; i<text.length(); ++i) {
|
||||
// Okay to process in 16-bit code units here
|
||||
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
}
|
||||
|
@ -757,7 +763,7 @@ class TransliterationRule {
|
|||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
char c = pattern.charAt(i);
|
||||
char c = pattern.charAt(i); // Ok to use 16-bits here
|
||||
UnicodeMatcher matcher = data.lookup(c);
|
||||
if (matcher == null) {
|
||||
appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
|
||||
|
@ -793,7 +799,7 @@ class TransliterationRule {
|
|||
if (i == cursor) {
|
||||
appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
char c = output.charAt(i);
|
||||
char c = output.charAt(i); // Ok to use 16-bits here
|
||||
int seg = data.lookupSegmentReference(c);
|
||||
if (seg < 0) {
|
||||
appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
|
||||
|
@ -872,6 +878,9 @@ class TransliterationRule {
|
|||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.39 2001/12/03 21:33:58 alan
|
||||
* jitterbug 1373: more fixes to support supplementals
|
||||
*
|
||||
* Revision 1.38 2001/11/30 22:27:29 alan
|
||||
* jitterbug 1560: fix double increment bug in getSourceSet
|
||||
*
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeNameTransliterator.java,v $
|
||||
* $Date: 2001/11/17 20:45:35 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 21:33:59 $
|
||||
* $Revision: 1.5 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
@ -63,16 +63,17 @@ class UnicodeNameTransliterator extends Transliterator {
|
|||
String name;
|
||||
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
int c = UTF16.charAt(text, cursor);
|
||||
if ((name=UCharacter.getName(c)) != null) {
|
||||
|
||||
str.setLength(1);
|
||||
str.append(name).append(closeDelimiter);
|
||||
|
||||
text.replace(cursor, cursor+1, str.toString());
|
||||
|
||||
int clen = UTF16.getCharCount(c);
|
||||
text.replace(cursor, cursor+clen, str.toString());
|
||||
len = str.length();
|
||||
cursor += len; // advance cursor by 1 and adjust for new text
|
||||
limit += len-1; // change in length is (len - 1)
|
||||
limit += len-clen; // change in length
|
||||
} else {
|
||||
++cursor;
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
|
||||
* $Date: 2001/12/03 20:26:24 $
|
||||
* $Revision: 1.52 $
|
||||
* $Date: 2001/12/03 21:33:59 $
|
||||
* $Revision: 1.53 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
|
|||
* Unicode property
|
||||
* </table>
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.52 $ $Date: 2001/12/03 20:26:24 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.53 $ $Date: 2001/12/03 21:33:59 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
|
@ -396,16 +396,13 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
applyPattern(pattern, pos, null, ignoreWhitespace);
|
||||
|
||||
int i = pos.getIndex();
|
||||
int n = pattern.length();
|
||||
|
||||
// Skip over trailing whitespace
|
||||
if (ignoreWhitespace) {
|
||||
while (i < n && Character.isWhitespace(pattern.charAt(i))) {
|
||||
++i;
|
||||
}
|
||||
i = Utility.skipWhitespace(pattern, i);
|
||||
}
|
||||
|
||||
if (i != n) {
|
||||
if (i != pattern.length()) {
|
||||
throw new IllegalArgumentException("Parse of \"" + pattern +
|
||||
"\" failed at " + i);
|
||||
}
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/NameUnicodeTransliterator.java,v $
|
||||
* $Date: 2001/11/21 20:56:50 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.6 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
@ -73,9 +73,10 @@ class NameUnicodeTransliterator extends Transliterator {
|
|||
int mode = 0;
|
||||
int ibuf = 0;
|
||||
int openPos = offsets.start; // position of openDelimiter
|
||||
|
||||
for (; cursor < limit; ++cursor) {
|
||||
char c = text.charAt(cursor);
|
||||
|
||||
int c;
|
||||
for (; cursor < limit; cursor+=UTF16.getCharCount(c)) {
|
||||
c = UTF16.charAt(text, cursor);
|
||||
|
||||
switch (mode) {
|
||||
case 0: // looking for open delimiter
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $
|
||||
* $Date: 2001/11/29 22:31:18 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -48,16 +48,20 @@ class StringMatcher implements UnicodeMatcher {
|
|||
int[] offset,
|
||||
int limit,
|
||||
boolean incremental) {
|
||||
// Note (1): We process text in 16-bit code units, rather than
|
||||
// 32-bit code points. This works because stand-ins are
|
||||
// always in the BMP and because we are doing a literal match
|
||||
// operation, which can be done 16-bits at a time.
|
||||
int i;
|
||||
int[] cursor = new int[] { offset[0] };
|
||||
if (limit < cursor[0]) {
|
||||
// Match in the reverse direction
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
if (subm == null) {
|
||||
if (cursor[0] >= limit &&
|
||||
keyChar == text.charAt(cursor[0])) {
|
||||
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
|
||||
--cursor[0];
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
|
@ -84,14 +88,14 @@ class StringMatcher implements UnicodeMatcher {
|
|||
// without completing our match.
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
if (subm == null) {
|
||||
// Don't need the cursor < limit check if
|
||||
// incremental is true (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (cursor[0] < limit &&
|
||||
keyChar == text.charAt(cursor[0])) {
|
||||
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
|
||||
++cursor[0];
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
|
@ -123,7 +127,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||
result.append('(');
|
||||
}
|
||||
for (int i=0; i<pattern.length(); ++i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
char keyChar = pattern.charAt(i); // OK; see note (1) above
|
||||
UnicodeMatcher m = data.lookup(keyChar);
|
||||
if (m == null) {
|
||||
TransliterationRule.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
|
||||
|
|
|
@ -3,124 +3,128 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransformTransliterator.java,v $
|
||||
* $Date: 2001/11/17 20:45:35 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.4 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* An abstract class for transliterators based on a transform
|
||||
* operation. To create a transliterator that implements a
|
||||
* transformation, create a subclass of this class and implement the
|
||||
* abstract <code>transform()</code> and <code>hasTransform()</code>
|
||||
* methods.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
abstract class TransformTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Constructs a transliterator. For use by subclasses.
|
||||
*/
|
||||
protected TransformTransliterator(String id, UnicodeFilter f) {
|
||||
super(id, f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean incremental) {
|
||||
|
||||
int start;
|
||||
for (start = offsets.start; start < offsets.limit; ++start) {
|
||||
// Scan for the first character that is != its transform.
|
||||
// If there are none, we fall out without doing anything.
|
||||
char c = text.charAt(start);
|
||||
if (hasTransform(c)) {
|
||||
// There is a transforming character at start. Break
|
||||
// up the remaining string, from start to
|
||||
// offsets.limit, into segments of unfiltered and
|
||||
// filtered characters. Only transform the unfiltered
|
||||
// characters. As always, minimize the number of
|
||||
// calls to Replaceable.replace().
|
||||
|
||||
int len = offsets.limit - start;
|
||||
// assert(len >= 1);
|
||||
|
||||
char[] buf = new char[len];
|
||||
text.getChars(start, offsets.limit, buf, 0);
|
||||
|
||||
int segStart = 0;
|
||||
int segLimit;
|
||||
UnicodeFilter filt = getFilter();
|
||||
|
||||
// lenDelta is the accumulated length difference for
|
||||
// all transformed segments. It is new length - old
|
||||
// length.
|
||||
int lenDelta = 0;
|
||||
|
||||
// Set segStart, segLimit to the unfiltered segment
|
||||
// starting with start. If the filter is null, then
|
||||
// segStart/Limit will be set to the whole string,
|
||||
// that is, 0/len.
|
||||
do {
|
||||
// Set segLimit to the first filtered char at or
|
||||
// after segStart.
|
||||
segLimit = len;
|
||||
if (filt != null) {
|
||||
segLimit = segStart;
|
||||
while (segLimit < len && filt.contains(buf[segLimit])) {
|
||||
++segLimit;
|
||||
}
|
||||
}
|
||||
|
||||
// Transform the unfiltered chars between segStart
|
||||
// and segLimit.
|
||||
int segLen = segLimit - segStart;
|
||||
if (segLen != 0) {
|
||||
String newStr = transform(
|
||||
new String(buf, segStart, segLen));
|
||||
text.replace(start, start + segLen, newStr);
|
||||
start += newStr.length();
|
||||
lenDelta += newStr.length() - segLen;
|
||||
}
|
||||
|
||||
// Set segStart to the first unfiltered char at or
|
||||
// after segLimit.
|
||||
segStart = segLimit;
|
||||
if (filt != null) {
|
||||
while (segStart < len && !filt.contains(buf[segStart])) {
|
||||
++segStart;
|
||||
}
|
||||
}
|
||||
start += segStart - segLimit;
|
||||
|
||||
} while (segStart < len);
|
||||
|
||||
offsets.limit += lenDelta;
|
||||
offsets.contextLimit += lenDelta;
|
||||
offsets.start = offsets.limit;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// assert(start == offsets.limit);
|
||||
offsets.start = start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method to determine whether a
|
||||
* given character has a transform that is not equal to itself.
|
||||
* This is approximately equivalent to <code>c !=
|
||||
* transform(String.valueOf(c))</code>, where
|
||||
* <code>String.valueOf(c)</code> returns a String containing the
|
||||
* single character (not integer) <code>c</code>. Subclasses that
|
||||
* transform all their input can simply return <code>true</code>.
|
||||
*/
|
||||
protected abstract boolean hasTransform(int c);
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method to transform a string.
|
||||
*/
|
||||
protected abstract String transform(String s);
|
||||
abstract class TransformTransliterator {
|
||||
// Currently unused
|
||||
}
|
||||
|
||||
///**
|
||||
// * An abstract class for transliterators based on a transform
|
||||
// * operation. To create a transliterator that implements a
|
||||
// * transformation, create a subclass of this class and implement the
|
||||
// * abstract <code>transform()</code> and <code>hasTransform()</code>
|
||||
// * methods.
|
||||
// * @author Alan Liu
|
||||
// */
|
||||
//abstract class TransformTransliterator extends Transliterator {
|
||||
//
|
||||
// /**
|
||||
// * Constructs a transliterator. For use by subclasses.
|
||||
// */
|
||||
// protected TransformTransliterator(String id, UnicodeFilter f) {
|
||||
// super(id, f);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Implements {@link Transliterator#handleTransliterate}.
|
||||
// */
|
||||
// protected void handleTransliterate(Replaceable text,
|
||||
// Position offsets, boolean incremental) {
|
||||
//
|
||||
// int start;
|
||||
// for (start = offsets.start; start < offsets.limit; ++start) {
|
||||
// // Scan for the first character that is != its transform.
|
||||
// // If there are none, we fall out without doing anything.
|
||||
// char c = text.charAt(start);
|
||||
// if (hasTransform(c)) {
|
||||
// // There is a transforming character at start. Break
|
||||
// // up the remaining string, from start to
|
||||
// // offsets.limit, into segments of unfiltered and
|
||||
// // filtered characters. Only transform the unfiltered
|
||||
// // characters. As always, minimize the number of
|
||||
// // calls to Replaceable.replace().
|
||||
//
|
||||
// int len = offsets.limit - start;
|
||||
// // assert(len >= 1);
|
||||
//
|
||||
// char[] buf = new char[len];
|
||||
// text.getChars(start, offsets.limit, buf, 0);
|
||||
//
|
||||
// int segStart = 0;
|
||||
// int segLimit;
|
||||
// UnicodeFilter filt = getFilter();
|
||||
//
|
||||
// // lenDelta is the accumulated length difference for
|
||||
// // all transformed segments. It is new length - old
|
||||
// // length.
|
||||
// int lenDelta = 0;
|
||||
//
|
||||
// // Set segStart, segLimit to the unfiltered segment
|
||||
// // starting with start. If the filter is null, then
|
||||
// // segStart/Limit will be set to the whole string,
|
||||
// // that is, 0/len.
|
||||
// do {
|
||||
// // Set segLimit to the first filtered char at or
|
||||
// // after segStart.
|
||||
// segLimit = len;
|
||||
// if (filt != null) {
|
||||
// segLimit = segStart;
|
||||
// while (segLimit < len && filt.contains(buf[segLimit])) {
|
||||
// ++segLimit;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // Transform the unfiltered chars between segStart
|
||||
// // and segLimit.
|
||||
// int segLen = segLimit - segStart;
|
||||
// if (segLen != 0) {
|
||||
// String newStr = transform(
|
||||
// new String(buf, segStart, segLen));
|
||||
// text.replace(start, start + segLen, newStr);
|
||||
// start += newStr.length();
|
||||
// lenDelta += newStr.length() - segLen;
|
||||
// }
|
||||
//
|
||||
// // Set segStart to the first unfiltered char at or
|
||||
// // after segLimit.
|
||||
// segStart = segLimit;
|
||||
// if (filt != null) {
|
||||
// while (segStart < len && !filt.contains(buf[segStart])) {
|
||||
// ++segStart;
|
||||
// }
|
||||
// }
|
||||
// start += segStart - segLimit;
|
||||
//
|
||||
// } while (segStart < len);
|
||||
//
|
||||
// offsets.limit += lenDelta;
|
||||
// offsets.contextLimit += lenDelta;
|
||||
// offsets.start = offsets.limit;
|
||||
// return;
|
||||
// }
|
||||
// }
|
||||
// // assert(start == offsets.limit);
|
||||
// offsets.start = start;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Subclasses must implement this method to determine whether a
|
||||
// * given character has a transform that is not equal to itself.
|
||||
// * This is approximately equivalent to <code>c !=
|
||||
// * transform(String.valueOf(c))</code>, where
|
||||
// * <code>String.valueOf(c)</code> returns a String containing the
|
||||
// * single character (not integer) <code>c</code>. Subclasses that
|
||||
// * transform all their input can simply return <code>true</code>.
|
||||
// */
|
||||
// protected abstract boolean hasTransform(int c);
|
||||
//
|
||||
// /**
|
||||
// * Subclasses must implement this method to transform a string.
|
||||
// */
|
||||
// protected abstract String transform(String s);
|
||||
//}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||
* $Date: 2001/11/30 22:27:29 $
|
||||
* $Revision: 1.38 $
|
||||
* $Date: 2001/12/03 21:33:58 $
|
||||
* $Revision: 1.39 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -46,7 +46,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.38 $ $Date: 2001/11/30 22:27:29 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.39 $ $Date: 2001/12/03 21:33:58 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
|
@ -396,12 +396,17 @@ class TransliterationRule {
|
|||
// Backup oText by one
|
||||
oText = posBefore(text, pos.start);
|
||||
|
||||
// Note (1): We process text in 16-bit code units, rather than
|
||||
// 32-bit code points. This works because stand-ins are
|
||||
// always in the BMP and because we are doing a literal match
|
||||
// operation, which can be done 16-bits at a time.
|
||||
|
||||
for (oPattern=anteContextLength-1; oPattern>=0; --oPattern) {
|
||||
char keyChar = pattern.charAt(oPattern);
|
||||
char keyChar = pattern.charAt(oPattern); // See note (1)
|
||||
UnicodeMatcher matcher = data.lookup(keyChar);
|
||||
if (matcher == null) {
|
||||
if (oText >= pos.contextStart &&
|
||||
keyChar == text.charAt(oText)) {
|
||||
keyChar == text.charAt(oText)) { // See note (1)
|
||||
--oText;
|
||||
} else {
|
||||
return UnicodeMatcher.U_MISMATCH;
|
||||
|
@ -457,14 +462,14 @@ class TransliterationRule {
|
|||
// can match up to pos.contextLimit.
|
||||
int matchLimit = (oPattern < keyLength) ? pos.limit : pos.contextLimit;
|
||||
|
||||
char keyChar = pattern.charAt(anteContextLength + oPattern++);
|
||||
char keyChar = pattern.charAt(anteContextLength + oPattern++); // See note (1)
|
||||
UnicodeMatcher matcher = data.lookup(keyChar);
|
||||
if (matcher == null) {
|
||||
// Don't need the oText < pos.contextLimit check if
|
||||
// incremental is TRUE (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (oText < matchLimit &&
|
||||
keyChar == text.charAt(oText)) {
|
||||
keyChar == text.charAt(oText)) { // See note (1)
|
||||
++oText;
|
||||
} else {
|
||||
return UnicodeMatcher.U_MISMATCH;
|
||||
|
@ -716,6 +721,7 @@ class TransliterationRule {
|
|||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
for (int i=0; i<text.length(); ++i) {
|
||||
// Okay to process in 16-bit code units here
|
||||
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
}
|
||||
|
@ -757,7 +763,7 @@ class TransliterationRule {
|
|||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
char c = pattern.charAt(i);
|
||||
char c = pattern.charAt(i); // Ok to use 16-bits here
|
||||
UnicodeMatcher matcher = data.lookup(c);
|
||||
if (matcher == null) {
|
||||
appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
|
||||
|
@ -793,7 +799,7 @@ class TransliterationRule {
|
|||
if (i == cursor) {
|
||||
appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
char c = output.charAt(i);
|
||||
char c = output.charAt(i); // Ok to use 16-bits here
|
||||
int seg = data.lookupSegmentReference(c);
|
||||
if (seg < 0) {
|
||||
appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
|
||||
|
@ -872,6 +878,9 @@ class TransliterationRule {
|
|||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.39 2001/12/03 21:33:58 alan
|
||||
* jitterbug 1373: more fixes to support supplementals
|
||||
*
|
||||
* Revision 1.38 2001/11/30 22:27:29 alan
|
||||
* jitterbug 1560: fix double increment bug in getSourceSet
|
||||
*
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeNameTransliterator.java,v $
|
||||
* $Date: 2001/11/17 20:45:35 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 21:33:59 $
|
||||
* $Revision: 1.5 $
|
||||
*/
|
||||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
@ -63,16 +63,17 @@ class UnicodeNameTransliterator extends Transliterator {
|
|||
String name;
|
||||
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
int c = UTF16.charAt(text, cursor);
|
||||
if ((name=UCharacter.getName(c)) != null) {
|
||||
|
||||
str.setLength(1);
|
||||
str.append(name).append(closeDelimiter);
|
||||
|
||||
text.replace(cursor, cursor+1, str.toString());
|
||||
|
||||
int clen = UTF16.getCharCount(c);
|
||||
text.replace(cursor, cursor+clen, str.toString());
|
||||
len = str.length();
|
||||
cursor += len; // advance cursor past the replacement text
|
||||
limit += len-1; // change in length is (len - 1)
|
||||
limit += len-clen; // change in length
|
||||
} else {
|
||||
++cursor;
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
|
||||
* $Date: 2001/12/03 20:26:24 $
|
||||
* $Revision: 1.52 $
|
||||
* $Date: 2001/12/03 21:33:59 $
|
||||
* $Revision: 1.53 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
|
|||
* Unicode property
|
||||
* </table>
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.52 $ $Date: 2001/12/03 20:26:24 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.53 $ $Date: 2001/12/03 21:33:59 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
|
@ -396,16 +396,13 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
applyPattern(pattern, pos, null, ignoreWhitespace);
|
||||
|
||||
int i = pos.getIndex();
|
||||
int n = pattern.length();
|
||||
|
||||
// Skip over trailing whitespace
|
||||
if (ignoreWhitespace) {
|
||||
while (i < n && Character.isWhitespace(pattern.charAt(i))) {
|
||||
++i;
|
||||
}
|
||||
i = Utility.skipWhitespace(pattern, i);
|
||||
}
|
||||
|
||||
if (i != n) {
|
||||
if (i != pattern.length()) {
|
||||
throw new IllegalArgumentException("Parse of \"" + pattern +
|
||||
"\" failed at " + i);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue