mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
Port fix for ICU (4c) jitterbug 243; Hex-Unicode and Unicode-Hex support for prefixes, suffixes, and digit counts through a pattern syntax
X-SVN-Rev: 971
This commit is contained in:
parent
258dbe98d4
commit
f7a4bbd75b
6 changed files with 1126 additions and 204 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2000/03/10 03:47:47 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2000/03/22 02:00:08 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -392,6 +392,26 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prefix, suffix support in hex transliterators
|
||||
*/
|
||||
public void TestJ243() {
|
||||
// Test default Hex-Unicode, which should handle
|
||||
// \\u, \\U, u+, and U+
|
||||
HexToUnicodeTransliterator hex = new HexToUnicodeTransliterator();
|
||||
expect(hex, "\\u0041+\\U0042,u+0043uu+0044z", "A+B,CuDz");
|
||||
|
||||
// Try a custom Hex-Unicode
|
||||
// \\uXXXX and &#xXXXX;
|
||||
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
|
||||
expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x0123",
|
||||
"abcd5fx0123");
|
||||
|
||||
// Try custom Unicode-Hex (default is tested elsewhere)
|
||||
UnicodeToHexTransliterator hex3 = new UnicodeToHexTransliterator("&\\#x###0;");
|
||||
expect(hex3, "012", "012");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/HexToUnicodeTransliterator.java,v $
|
||||
* $Date: 2000/03/10 04:07:20 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2000/03/22 01:59:55 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -14,16 +14,16 @@ package com.ibm.text;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from hexadecimal Unicode
|
||||
* escape sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. It recognizes the
|
||||
* A transliterator that converts from hexadecimal Unicode escape
|
||||
* sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. A default HexToUnicodeTransliterator recognizes the
|
||||
* prefixes "U+", "u+", "\U", and "\u". Hex values may be
|
||||
* upper- or lowercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
* upper- or lowercase. By calling the applyPattern() method, one
|
||||
* or more custom prefix/suffix pairs may be specified. See
|
||||
* applyPattern() for details.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.4 $ $Date: 2000/03/10 04:07:20 $
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.5 $ $Date: 2000/03/22 01:59:55 $
|
||||
*/
|
||||
public class HexToUnicodeTransliterator extends Transliterator {
|
||||
private static final String COPYRIGHT =
|
||||
|
@ -32,75 +32,345 @@ public class HexToUnicodeTransliterator extends Transliterator {
|
|||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Hex-Unicode";
|
||||
static final String _ID = "Hex-Unicode";
|
||||
|
||||
/**
|
||||
* This pattern encodes the following specs for the default constructor:
|
||||
* \\u0000
|
||||
* \\U0000
|
||||
* u+0000
|
||||
* U+0000
|
||||
* The multiple backslashes resolve to a single backslash
|
||||
* in the effective prefix.
|
||||
*/
|
||||
private static final String DEFAULT_PATTERN = "\\\\u0000;\\\\U0000;u+0000;U+0000";
|
||||
|
||||
// Character constants for special pattern characters
|
||||
private static final char SEMICOLON = ';';
|
||||
private static final char ZERO = '0';
|
||||
private static final char POUND = '#';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
/**
|
||||
* The pattern for this transliterator
|
||||
*/
|
||||
private String pattern;
|
||||
|
||||
/**
|
||||
* The processed pattern specification. See applyPattern() for
|
||||
* details.
|
||||
*/
|
||||
private char[] affixes;
|
||||
|
||||
/**
|
||||
* The number of different affix sets in affixes.
|
||||
*/
|
||||
private int affixCount;
|
||||
|
||||
/**
|
||||
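* Constructs a transliterator with no filter that recognizes the
* default pattern, i.e. the prefixes "\\u", "\\U", "u+", and "U+",
* each followed by exactly four hex digits.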
|
||||
*/
|
||||
public HexToUnicodeTransliterator() {
|
||||
super(_ID, null);
|
||||
applyPattern(DEFAULT_PATTERN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with no filter that recognizes the
* given pattern. See applyPattern() for the pattern syntax.
|
||||
*/
|
||||
public HexToUnicodeTransliterator(String thePattern) {
|
||||
this(thePattern, null);
|
||||
}
|
||||
|
||||
/**
|
||||
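* Constructs a transliterator that recognizes the given pattern
* and uses the given filter (which may be null). See
* applyPattern() for the pattern syntax.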
|
||||
*/
|
||||
public HexToUnicodeTransliterator(String thePattern,
|
||||
UnicodeFilter theFilter) {
|
||||
super(_ID, theFilter);
|
||||
applyPattern(thePattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the patterns recognized by this transliterator. One or
|
||||
* more patterns may be specified, separated by semicolons (';').
|
||||
* Each pattern contains zero or more prefix characters, one or
|
||||
* more digit characters, and zero or more suffix characters. The
|
||||
* digit characters indicate optional digits ('#') followed by
|
||||
* required digits ('0'). The total number of digits cannot
|
||||
* exceed 4, and there must be at least 1 required digit. Use a
|
||||
* backslash ('\\') to escape any of the special characters. An
|
||||
* empty pattern is allowed; it specifies a transliterator that
|
||||
* does nothing.
|
||||
*
|
||||
* <p>Example: "U+0000;<###0>" specifies two patterns. The first
|
||||
* has a prefix of "U+", exactly four digits, and no suffix. The
|
||||
* second has a prefix of "<", between one and four digits, and a
|
||||
* suffix of ">".
|
||||
*
|
||||
* <p><pre>
|
||||
* pattern := spec | ( pattern ';' spec )
|
||||
* spec := prefix-char* digit-spec suffix-char*
|
||||
* digit-spec := '#'* '0'+
|
||||
* prefix-char := [^special-char] | '\\' special-char
|
||||
* suffix-char := [^special-char] | '\\' special-char
|
||||
* special-char := ';' | '0' | '#' | '\\'
|
||||
* </pre>
|
||||
*/
|
||||
public void applyPattern(String pattern) {
|
||||
|
||||
/* The pattern is processed and stored in affixes. The pattern
|
||||
* consists of zero or more affixes. Each affix is parsed to
|
||||
* determine the prefix, suffix, minimum digit count, and maximum
|
||||
* digit count. These values are then stored as a four character
|
||||
* header. That is, their numeric values are cast to UChars and
|
||||
* stored in the string. Following these four characters, the prefix
|
||||
* characters, then suffix characters are stored. Each spec takes
|
||||
* n+4 characters, where n is the total length of the prefix and
|
||||
* suffix.
|
||||
*/
|
||||
|
||||
StringBuffer affixes = new StringBuffer();
|
||||
affixCount = 0;
|
||||
|
||||
/* The mode specifies where we are in each spec.
|
||||
* mode 0 = in prefix
|
||||
* mode 1 = in optional digits (#)
|
||||
* mode 2 = in required digits (0)
|
||||
* mode 3 = in suffix
|
||||
*/
|
||||
int mode = 0;
|
||||
|
||||
int prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
|
||||
int start = 0;
|
||||
|
||||
/* To make parsing easier, we append a virtual ';' at the end of
|
||||
* the pattern string, if there isn't one already. When we get to
|
||||
* the index pattern.length() (that is, one past the end), we
|
||||
* create a virtual ';' if necessary.
|
||||
*/
|
||||
char c = 0; // These are outside the loop so we can
|
||||
boolean isLiteral = false; // see the previous character...
|
||||
for (int i=0; i<=pattern.length(); ++i) {
|
||||
// Create the virtual trailing ';' if necessary
|
||||
if (i == pattern.length()) {
|
||||
// If the last character was not a non-literal ';'...
|
||||
if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
|
||||
c = SEMICOLON;
|
||||
isLiteral = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = false;
|
||||
}
|
||||
|
||||
if (c == BACKSLASH) {
|
||||
if ((i+1)<pattern.length()) {
|
||||
isLiteral = true;
|
||||
c = pattern.charAt(++i);
|
||||
} else {
|
||||
// Trailing '\\'
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
if (!isLiteral) {
|
||||
switch (c) {
|
||||
case POUND:
|
||||
// Seeing a '#' moves us from mode 0 (prefix) to mode 1
|
||||
// (optional digits).
|
||||
if (mode == 0) {
|
||||
++mode;
|
||||
} else if (mode != 1) {
|
||||
// Unquoted '#'
|
||||
throw new IllegalArgumentException("Unquoted '#'");
|
||||
}
|
||||
++maxDigits;
|
||||
break;
|
||||
case ZERO:
|
||||
// Seeing a '0' moves us to mode 2 (required digits)
|
||||
if (mode < 2) {
|
||||
mode = 2;
|
||||
} else if (mode != 2) {
|
||||
// Unquoted '0'
|
||||
throw new IllegalArgumentException("Unquoted '0'");
|
||||
}
|
||||
++minDigits;
|
||||
++maxDigits;
|
||||
break;
|
||||
case SEMICOLON:
|
||||
if (minDigits < 1 || maxDigits > 4
|
||||
// Invalid min/max digit count
|
||||
|| prefixLen > 0xFFFF || suffixLen > 0xFFFF) {
|
||||
// Suffix or prefix too long
|
||||
throw new IllegalArgumentException("Suffix or prefix too long");
|
||||
}
|
||||
// If there was no prefix and no suffix, then the
|
||||
// header will not have been allocated yet. We need
|
||||
// allocate the header now.
|
||||
if (start == affixes.length()) {
|
||||
affixes.append("AAAA");
|
||||
}
|
||||
// Fill in 4-character header
|
||||
affixes.setCharAt(start++, (char) prefixLen);
|
||||
affixes.setCharAt(start++, (char) suffixLen);
|
||||
affixes.setCharAt(start++, (char) minDigits);
|
||||
affixes.setCharAt(start, (char) maxDigits);
|
||||
start = affixes.length();
|
||||
++affixCount;
|
||||
prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
|
||||
break;
|
||||
default:
|
||||
isLiteral = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isLiteral) {
|
||||
if (start == affixes.length()) {
|
||||
// Make space for the header. Append any four
|
||||
// characters as place holders for the header values.
|
||||
// We fill these in when we parse the ';'.
|
||||
affixes.append("AAAA");
|
||||
}
|
||||
affixes.append(c);
|
||||
if (mode == 0) {
|
||||
++prefixLen;
|
||||
} else {
|
||||
// Any literal outside the prefix moves us into mode 3
|
||||
// (suffix)
|
||||
mode = 3;
|
||||
++suffixLen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We only modify the pattern and affixes member variables if
|
||||
// we get to this point, that is, if the parse succeeds.
|
||||
this.pattern = pattern;
|
||||
int len = affixes.length();
|
||||
this.affixes = new char[len];
|
||||
affixes.getChars(0, len, this.affixes, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return this transliterator's pattern.
|
||||
*/
|
||||
public String toPattern() {
|
||||
return pattern;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean incremental) {
|
||||
/**
|
||||
* Performs transliteration changing Unicode hexadecimal
|
||||
* escapes to characters. For example, "U+0040" -> '@'. A fixed
|
||||
* set of prefixes is recognized: "\u", "\U", "u+", "U+".
|
||||
*/
|
||||
Position offsets, boolean isIncremental) {
|
||||
int cursor = offsets.cursor;
|
||||
int limit = offsets.limit;
|
||||
int i, j, ipat;
|
||||
|
||||
int maxCursor = limit - 6;
|
||||
loop:
|
||||
while (cursor <= maxCursor) {
|
||||
char c = filteredCharAt(text, cursor + 5);
|
||||
int digit0 = Character.digit(c, 16);
|
||||
if (digit0 < 0) {
|
||||
if (c == '\\') {
|
||||
cursor += 5;
|
||||
} else if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += 4;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
// Loop over the specs in affixes. If affixCount is zero (an
|
||||
// empty pattern), then we do nothing. We exit this loop when
|
||||
// we match one of the specs. We exit this function (by
|
||||
// jumping to exit: below) if a partial match is detected and
|
||||
// isIncremental is true.
|
||||
for (j=0, ipat=0; j<affixCount; ++j) {
|
||||
|
||||
int u = digit0;
|
||||
// Read the header
|
||||
int prefixLen = affixes[ipat++];
|
||||
int suffixLen = affixes[ipat++];
|
||||
int minDigits = affixes[ipat++];
|
||||
int maxDigits = affixes[ipat++];
|
||||
|
||||
for (int i=4; i>=2; --i) {
|
||||
c = filteredCharAt(text, cursor + i);
|
||||
int digit = Character.digit(c, 16);
|
||||
if (digit < 0) {
|
||||
if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += i-1;
|
||||
} else {
|
||||
cursor += 6;
|
||||
// curs is a copy of cursor that is advanced over the
|
||||
// characters as we parse them.
|
||||
int curs = cursor;
|
||||
boolean match = true;
|
||||
|
||||
for (i=0; i<prefixLen; ++i) {
|
||||
if (curs >= limit) {
|
||||
if (i > 0) {
|
||||
// We've already matched a character. This is
|
||||
// a partial match, so we return if in
|
||||
// incremental mode. In non-incremental mode,
|
||||
// go to the next spec.
|
||||
if (isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
char c = filteredCharAt(text, curs++);
|
||||
if (c != affixes[ipat + i]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
continue loop;
|
||||
}
|
||||
u |= digit << (4 * (5-i));
|
||||
|
||||
if (match) {
|
||||
char u = 0;
|
||||
int digitCount = 0;
|
||||
for (;;) {
|
||||
if (curs >= limit) {
|
||||
// Check for partial match in incremental mode.
|
||||
if (curs > cursor && isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
break;
|
||||
}
|
||||
int digit = Character.digit(filteredCharAt(text, curs), 16);
|
||||
if (digit < 0) {
|
||||
break;
|
||||
}
|
||||
++curs;
|
||||
u <<= 4;
|
||||
u |= (char) digit;
|
||||
if (++digitCount == maxDigits) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
match = (digitCount >= minDigits);
|
||||
|
||||
if (match) {
|
||||
for (i=0; i<suffixLen; ++i) {
|
||||
if (curs >= limit) {
|
||||
// Check for partial match in incremental mode.
|
||||
if (curs > cursor && isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
char c = filteredCharAt(text, curs++);
|
||||
if (c != affixes[ipat + prefixLen + i]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (match) {
|
||||
// At this point, we have a match
|
||||
text.replace(cursor, curs, String.valueOf(u));
|
||||
limit -= curs - cursor - 1;
|
||||
// The following break statement leaves the
|
||||
// loop that is traversing the specs in
|
||||
// affixes. We then parse the next input
|
||||
// character.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ipat += prefixLen + suffixLen;
|
||||
}
|
||||
|
||||
c = filteredCharAt(text, cursor);
|
||||
char d = filteredCharAt(text, cursor + 1);
|
||||
if (((c == 'U' || c == 'u') && d == '+')
|
||||
|| (c == '\\' && (d == 'U' || d == 'u'))) {
|
||||
|
||||
// At this point, we have a match; replace cursor..cursor+5
|
||||
// with u.
|
||||
text.replace(cursor, cursor+6, String.valueOf((char) u));
|
||||
limit -= 5;
|
||||
maxCursor -= 5;
|
||||
|
||||
++cursor;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
++cursor;
|
||||
}
|
||||
|
||||
offsets.limit = limit;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodeToHexTransliterator.java,v $
|
||||
* $Date: 2000/03/10 04:07:25 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2000/03/22 01:59:55 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -19,47 +19,226 @@ import java.util.*;
|
|||
* prefix specified in the constructor and optionally converts the hex
|
||||
* digits to uppercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
* <p>The format of the output is set by a pattern. This pattern
|
||||
* follows the same syntax as <code>HexToUnicodeTransliterator</code>,
|
||||
* except it does not allow multiple specifications. The pattern sets
|
||||
* the prefix string, suffix string, and minimum and maximum digit
|
||||
* count. There are no setters or getters for these attributes; they
|
||||
* are set only through the pattern.
|
||||
*
|
||||
* <p>The setUppercase() and isUppercase() methods control whether 'a'
|
||||
* through 'f' or 'A' through 'F' are output as hex digits. This is
|
||||
* not controlled through the pattern; only through the methods. The
|
||||
* default is uppercase.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.5 $ $Date: 2000/03/10 04:07:25 $
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/03/22 01:59:55 $
|
||||
*/
|
||||
public class UnicodeToHexTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Unicode-Hex";
|
||||
|
||||
private String prefix;
|
||||
|
||||
private boolean uppercase;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static final String _ID = "Unicode-Hex";
|
||||
|
||||
private static final char[] HEX_DIGITS = {
|
||||
'0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
|
||||
'0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
|
||||
};
|
||||
|
||||
// Character constants for special pattern chars
|
||||
private static final char ZERO = '0';
|
||||
private static final char POUND = '#';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
/**
|
||||
* The pattern set by applyPattern() and returned by toPattern().
|
||||
*/
|
||||
private String pattern;
|
||||
|
||||
/**
|
||||
* The string preceding the hex digits, parsed from the pattern.
|
||||
*/
|
||||
private String prefix;
|
||||
|
||||
/**
|
||||
* The string following the hex digits, parsed from the pattern.
|
||||
*/
|
||||
private String suffix;
|
||||
|
||||
/**
|
||||
* The minimum number of hex digits to output, between 1 and 4,
|
||||
* inclusive. Parsed from the pattern.
|
||||
*/
|
||||
private int minDigits;
|
||||
|
||||
/**
|
||||
* If true, output uppercase hex digits; otherwise output
|
||||
* lowercase. Set by setUppercase() and returned by isUppercase().
|
||||
*/
|
||||
private boolean uppercase;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param pattern The pattern for this transliterator. See
|
||||
* applyPattern() for pattern syntax.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
* @param filter the filter for this transliterator, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String prefix, boolean uppercase,
|
||||
public UnicodeToHexTransliterator(String pattern, boolean uppercase,
|
||||
UnicodeFilter filter) {
|
||||
super(_ID, filter);
|
||||
this.prefix = prefix;
|
||||
this.uppercase = uppercase;
|
||||
applyPattern(pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an uppercase transliterator with no filter.
|
||||
* @param pattern The pattern for this transliterator. See
|
||||
* applyPattern() for pattern syntax.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String pattern) {
|
||||
this(pattern, true, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
* that outputs four uppercase hex digits.
|
||||
*/
|
||||
public UnicodeToHexTransliterator() {
|
||||
this("\\u", true, null);
|
||||
super(_ID, null);
|
||||
pattern = "\\\\u0000";
|
||||
prefix = "\\u";
|
||||
suffix = "";
|
||||
minDigits = 4;
|
||||
uppercase = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pattern recognized by this transliterator. The pattern
|
||||
* must contain zero or more prefix characters, one or more digit
|
||||
* characters, and zero or more suffix characters. The digit
|
||||
* characters indicate optional digits ('#') followed by required
|
||||
* digits ('0'). The total number of digits cannot exceed 4, and
|
||||
* there must be at least 1 required digit. Use a backslash ('\\') to
|
||||
* escape any of the special characters. An empty pattern is not
|
||||
* allowed.
|
||||
*
|
||||
* <p>Example: "U+0000" specifies a prefix of "U+", exactly four
|
||||
* digits, and no suffix. "<###0>" has a prefix of "<", between
|
||||
* one and four digits, and a suffix of ">".
|
||||
*
|
||||
* <p><pre>
|
||||
* pattern := prefix-char* digit-spec suffix-char*
|
||||
* digit-spec := '#'* '0'+
|
||||
* prefix-char := [^special-char] | '\\' special-char
|
||||
* suffix-char := [^special-char] | '\\' special-char
|
||||
* special-char := ';' | '0' | '#' | '\\'
|
||||
* </pre>
|
||||
*
|
||||
* <p>Limitations: There is no way to set the uppercase attribute
|
||||
* in the pattern. (applyPattern() does not alter the uppercase
|
||||
* attribute.)
|
||||
*/
|
||||
public void applyPattern(String thePattern) {
|
||||
StringBuffer prefixBuf = null;
|
||||
StringBuffer suffixBuf = null;
|
||||
int minDigits = 0;
|
||||
int maxDigits = 0;
|
||||
|
||||
/* The mode specifies where we are in each spec.
|
||||
* mode 0 = in prefix
|
||||
* mode 1 = in optional digits (#)
|
||||
* mode 2 = in required digits (0)
|
||||
* mode 3 = in suffix
|
||||
*/
|
||||
int mode = 0;
|
||||
|
||||
for (int i=0; i<thePattern.length(); ++i) {
|
||||
char c = thePattern.charAt(i);
|
||||
boolean isLiteral = false;
|
||||
if (c == BACKSLASH) {
|
||||
if ((i+1)<thePattern.length()) {
|
||||
isLiteral = true;
|
||||
c = thePattern.charAt(++i);
|
||||
} else {
|
||||
// Trailing '\\'
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
if (!isLiteral) {
|
||||
switch (c) {
|
||||
case POUND:
|
||||
// Seeing a '#' moves us from mode 0 (prefix) to mode 1
|
||||
// (optional digits).
|
||||
if (mode == 0) {
|
||||
++mode;
|
||||
} else if (mode != 1) {
|
||||
// Unquoted '#'
|
||||
throw new IllegalArgumentException("Unquoted '#'");
|
||||
}
|
||||
++maxDigits;
|
||||
break;
|
||||
case ZERO:
|
||||
// Seeing a '0' moves us to mode 2 (required digits)
|
||||
if (mode < 2) {
|
||||
mode = 2;
|
||||
} else if (mode != 2) {
|
||||
// Unquoted '0'
|
||||
throw new IllegalArgumentException("Unquoted '0'");
|
||||
}
|
||||
++minDigits;
|
||||
++maxDigits;
|
||||
break;
|
||||
default:
|
||||
isLiteral = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isLiteral) {
|
||||
if (mode == 0) {
|
||||
if (prefixBuf == null) {
|
||||
prefixBuf = new StringBuffer();
|
||||
}
|
||||
prefixBuf.append(c);
|
||||
} else {
|
||||
// Any literal outside the prefix moves us into mode 3
|
||||
// (suffix)
|
||||
mode = 3;
|
||||
if (suffixBuf == null) {
|
||||
suffixBuf = new StringBuffer();
|
||||
}
|
||||
suffixBuf.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (minDigits < 1 || maxDigits > 4) {
|
||||
// Invalid min/max digit count
|
||||
throw new IllegalArgumentException("Invalid min/max digit count");
|
||||
}
|
||||
|
||||
pattern = thePattern;
|
||||
prefix = (prefixBuf == null) ? "" : prefixBuf.toString();
|
||||
suffix = (suffixBuf == null) ? "" : suffixBuf.toString();
|
||||
this.minDigits = minDigits;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return this transliterator's pattern.
|
||||
*/
|
||||
public String toPattern() {
|
||||
return pattern;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -116,16 +295,28 @@ public class UnicodeToHexTransliterator extends Transliterator {
|
|||
int limit = offsets.limit;
|
||||
|
||||
UnicodeFilter filter = getFilter();
|
||||
StringBuffer hex = new StringBuffer(prefix);
|
||||
int prefixLen = prefix.length();
|
||||
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
if (filter != null && !filter.contains(c)) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
String hex = hex(c);
|
||||
text.replace(cursor, cursor+1, hex);
|
||||
|
||||
hex.setLength(prefixLen);
|
||||
boolean showRest = false;
|
||||
for (int i=3; i>=0; --i) {
|
||||
int d = (c >> (i*4)) & 0xF;
|
||||
if (showRest || (d != 0) || minDigits > i) {
|
||||
hex.append(HEX_DIGITS[uppercase ? (d|16) : d]);
|
||||
showRest = true;
|
||||
}
|
||||
}
|
||||
hex.append(suffix);
|
||||
|
||||
text.replace(cursor, cursor+1, hex.toString());
|
||||
int len = hex.length();
|
||||
cursor += len; // Advance cursor by 1 and adjust for new text
|
||||
--len;
|
||||
|
@ -135,24 +326,4 @@ public class UnicodeToHexTransliterator extends Transliterator {
|
|||
offsets.limit = limit;
|
||||
offsets.cursor = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Form escape sequence.
|
||||
*/
|
||||
private final String hex(char c) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append(prefix);
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
String h = Integer.toHexString(c);
|
||||
buf.append(uppercase ? h.toUpperCase() : h);
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||
* $Date: 2000/03/10 03:47:47 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2000/03/22 02:00:08 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -392,6 +392,26 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prefix, suffix support in hex transliterators
|
||||
*/
|
||||
public void TestJ243() {
|
||||
// Test default Hex-Unicode, which should handle
|
||||
// \\u, \\U, u+, and U+
|
||||
HexToUnicodeTransliterator hex = new HexToUnicodeTransliterator();
|
||||
expect(hex, "\\u0041+\\U0042,u+0043uu+0044z", "A+B,CuDz");
|
||||
|
||||
// Try a custom Hex-Unicode
|
||||
// \\uXXXX and &#xXXXX;
|
||||
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
|
||||
expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x0123",
|
||||
"abcd5fx0123");
|
||||
|
||||
// Try custom Unicode-Hex (default is tested elsewhere)
|
||||
UnicodeToHexTransliterator hex3 = new UnicodeToHexTransliterator("&\\#x###0;");
|
||||
expect(hex3, "012", "012");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/HexToUnicodeTransliterator.java,v $
|
||||
* $Date: 2000/03/10 04:07:20 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2000/03/22 01:59:55 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -14,16 +14,16 @@ package com.ibm.text;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from hexadecimal Unicode
|
||||
* escape sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. It recognizes the
|
||||
* A transliterator that converts from hexadecimal Unicode escape
|
||||
* sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. A default HexToUnicodeTransliterator recognizes the
|
||||
* prefixes "U+", "u+", "\U", and "\u". Hex values may be
|
||||
* upper- or lowercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
* upper- or lowercase. By calling the applyPattern() method, one
|
||||
* or more custom prefix/suffix pairs may be specified. See
|
||||
* applyPattern() for details.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.4 $ $Date: 2000/03/10 04:07:20 $
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.5 $ $Date: 2000/03/22 01:59:55 $
|
||||
*/
|
||||
public class HexToUnicodeTransliterator extends Transliterator {
|
||||
private static final String COPYRIGHT =
|
||||
|
@ -32,75 +32,345 @@ public class HexToUnicodeTransliterator extends Transliterator {
|
|||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Hex-Unicode";
|
||||
static final String _ID = "Hex-Unicode";
|
||||
|
||||
/**
|
||||
* This pattern encodes the following specs for the default constructor:
|
||||
* \\u0000
|
||||
* \\U0000
|
||||
* u+0000
|
||||
* U+0000
|
||||
* The multiple backslashes resolve to a single backslash
|
||||
* in the effective prefix.
|
||||
*/
|
||||
private static final String DEFAULT_PATTERN = "\\\\u0000;\\\\U0000;u+0000;U+0000";
|
||||
|
||||
// Character constants for special pattern characters
|
||||
private static final char SEMICOLON = ';';
|
||||
private static final char ZERO = '0';
|
||||
private static final char POUND = '#';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
/**
|
||||
* The pattern for this transliterator
|
||||
*/
|
||||
private String pattern;
|
||||
|
||||
/**
|
||||
* The processed pattern specification. See applyPattern() for
|
||||
* details.
|
||||
*/
|
||||
private char[] affixes;
|
||||
|
||||
/**
|
||||
* The number of different affix sets in affixes.
|
||||
*/
|
||||
private int affixCount;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
public HexToUnicodeTransliterator() {
|
||||
super(_ID, null);
|
||||
applyPattern(DEFAULT_PATTERN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
public HexToUnicodeTransliterator(String thePattern) {
|
||||
this(thePattern, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
public HexToUnicodeTransliterator(String thePattern,
|
||||
UnicodeFilter theFilter) {
|
||||
super(_ID, theFilter);
|
||||
applyPattern(thePattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the patterns recognized by this transliterator. One or
|
||||
* more patterns may be specified, separated by semicolons (';').
|
||||
* Each pattern contains zero or more prefix characters, one or
|
||||
* more digit characters, and zero or more suffix characters. The
|
||||
* digit characters indicate optional digits ('#') followed by
|
||||
* required digits ('0'). The total number of digits cannot
|
||||
* exceed 4, and there must be at least 1 required digit. Use a
|
||||
* backslash ('\\') to escape any of the special characters. An
|
||||
* empty pattern is allowed; it specifies a transliterator that
|
||||
* does nothing.
|
||||
*
|
||||
* <p>Example: "U+0000;<###0>" specifies two patterns. The first
|
||||
* has a prefix of "U+", exactly four digits, and no suffix. The
|
||||
* second has a prefix of "<", between one and four digits, and a
|
||||
* suffix of ">".
|
||||
*
|
||||
* <p><pre>
|
||||
* pattern := spec | ( pattern ';' spec )
|
||||
* spec := prefix-char* digit-spec suffix-char*
|
||||
* digit-spec := '#'* '0'+
|
||||
* prefix-char := [^special-char] | '\\' special-char
|
||||
* suffix-char := [^special-char] | '\\' special-char
|
||||
* special-char := ';' | '0' | '#' | '\\'
|
||||
* </pre>
|
||||
*/
|
||||
public void applyPattern(String pattern) {
|
||||
|
||||
/* The pattern is processed and stored in affixes. The pattern
|
||||
* consists of zero or more affixes. Each affix is parsed to
|
||||
* determine the prefix, suffix, minimum digit count, and maximum
|
||||
* digit count. These values are then stored as a four character
|
||||
* header. That is, their numeric values are cast to UChars and
|
||||
* stored in the string. Following these four characters, the prefix
|
||||
* characters, then suffix characters are stored. Each spec takes
|
||||
* n+4 characters, where n is the total length of the prefix and
|
||||
* suffix.
|
||||
*/
|
||||
|
||||
StringBuffer affixes = new StringBuffer();
|
||||
affixCount = 0;
|
||||
|
||||
/* The mode specifies where we are in each spec.
|
||||
* mode 0 = in prefix
|
||||
* mode 1 = in optional digits (#)
|
||||
* mode 2 = in required digits (0)
|
||||
* mode 3 = in suffix
|
||||
*/
|
||||
int mode = 0;
|
||||
|
||||
int prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
|
||||
int start = 0;
|
||||
|
||||
/* To make parsing easier, we append a virtual ';' at the end of
|
||||
* the pattern string, if there isn't one already. When we get to
|
||||
* the index pattern.length() (that is, one past the end), we
|
||||
* create a virtual ';' if necessary.
|
||||
*/
|
||||
char c = 0; // These are outside the loop so we can
|
||||
boolean isLiteral = false; // see the previous character...
|
||||
for (int i=0; i<=pattern.length(); ++i) {
|
||||
// Create the virtual trailing ';' if necessary
|
||||
if (i == pattern.length()) {
|
||||
// If the last character was not a non-literal ';'...
|
||||
if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
|
||||
c = SEMICOLON;
|
||||
isLiteral = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = false;
|
||||
}
|
||||
|
||||
if (c == BACKSLASH) {
|
||||
if ((i+1)<pattern.length()) {
|
||||
isLiteral = true;
|
||||
c = pattern.charAt(++i);
|
||||
} else {
|
||||
// Trailing '\\'
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
if (!isLiteral) {
|
||||
switch (c) {
|
||||
case POUND:
|
||||
// Seeing a '#' moves us from mode 0 (prefix) to mode 1
|
||||
// (optional digits).
|
||||
if (mode == 0) {
|
||||
++mode;
|
||||
} else if (mode != 1) {
|
||||
// Unquoted '#'
|
||||
throw new IllegalArgumentException("Unquoted '#'");
|
||||
}
|
||||
++maxDigits;
|
||||
break;
|
||||
case ZERO:
|
||||
// Seeing a '0' moves us to mode 2 (required digits)
|
||||
if (mode < 2) {
|
||||
mode = 2;
|
||||
} else if (mode != 2) {
|
||||
// Unquoted '0'
|
||||
throw new IllegalArgumentException("Unquoted '0'");
|
||||
}
|
||||
++minDigits;
|
||||
++maxDigits;
|
||||
break;
|
||||
case SEMICOLON:
|
||||
if (minDigits < 1 || maxDigits > 4
|
||||
// Invalid min/max digit count
|
||||
|| prefixLen > 0xFFFF || suffixLen > 0xFFFF) {
|
||||
// Suffix or prefix too long
|
||||
throw new IllegalArgumentException("Suffix or prefix too long");
|
||||
}
|
||||
// If there was no prefix and no suffix, then the
|
||||
// header will not have been allocated yet. We need
|
||||
// allocate the header now.
|
||||
if (start == affixes.length()) {
|
||||
affixes.append("AAAA");
|
||||
}
|
||||
// Fill in 4-character header
|
||||
affixes.setCharAt(start++, (char) prefixLen);
|
||||
affixes.setCharAt(start++, (char) suffixLen);
|
||||
affixes.setCharAt(start++, (char) minDigits);
|
||||
affixes.setCharAt(start, (char) maxDigits);
|
||||
start = affixes.length();
|
||||
++affixCount;
|
||||
prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
|
||||
break;
|
||||
default:
|
||||
isLiteral = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isLiteral) {
|
||||
if (start == affixes.length()) {
|
||||
// Make space for the header. Append any four
|
||||
// characters as place holders for the header values.
|
||||
// We fill these in when we parse the ';'.
|
||||
affixes.append("AAAA");
|
||||
}
|
||||
affixes.append(c);
|
||||
if (mode == 0) {
|
||||
++prefixLen;
|
||||
} else {
|
||||
// Any literal outside the prefix moves us into mode 3
|
||||
// (suffix)
|
||||
mode = 3;
|
||||
++suffixLen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We only modify the pattern and affixes member variables if
|
||||
// we get to this point, that is, if the parse succeeds.
|
||||
this.pattern = pattern;
|
||||
int len = affixes.length();
|
||||
this.affixes = new char[len];
|
||||
affixes.getChars(0, len, this.affixes, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return this transliterator's pattern: the string stored by applyPattern() after a successful parse.
|
||||
*/
|
||||
public String toPattern() {
|
||||
return pattern;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean incremental) {
|
||||
/**
|
||||
* Performs transliteration changing Unicode hexadecimal
|
||||
* escapes to characters. For example, "U+0040" -> '@'. A fixed
|
||||
* set of prefixes is recognized: "\u", "\U", "u+", "U+".
|
||||
*/
|
||||
Position offsets, boolean isIncremental) {
|
||||
int cursor = offsets.cursor;
|
||||
int limit = offsets.limit;
|
||||
int i, j, ipat;
|
||||
|
||||
int maxCursor = limit - 6;
|
||||
loop:
|
||||
while (cursor <= maxCursor) {
|
||||
char c = filteredCharAt(text, cursor + 5);
|
||||
int digit0 = Character.digit(c, 16);
|
||||
if (digit0 < 0) {
|
||||
if (c == '\\') {
|
||||
cursor += 5;
|
||||
} else if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += 4;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
// Loop over the specs in affixes. If affixCount is zero (an
|
||||
// empty pattern), then we do nothing. We exit this loop when
|
||||
// we match one of the specs. We exit this function (by
|
||||
// jumping to exit: below) if a partial match is detected and
|
||||
// isIncremental is true.
|
||||
for (j=0, ipat=0; j<affixCount; ++j) {
|
||||
|
||||
int u = digit0;
|
||||
// Read the header
|
||||
int prefixLen = affixes[ipat++];
|
||||
int suffixLen = affixes[ipat++];
|
||||
int minDigits = affixes[ipat++];
|
||||
int maxDigits = affixes[ipat++];
|
||||
|
||||
for (int i=4; i>=2; --i) {
|
||||
c = filteredCharAt(text, cursor + i);
|
||||
int digit = Character.digit(c, 16);
|
||||
if (digit < 0) {
|
||||
if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += i-1;
|
||||
} else {
|
||||
cursor += 6;
|
||||
// curs is a copy of cursor that is advanced over the
|
||||
// characters as we parse them.
|
||||
int curs = cursor;
|
||||
boolean match = true;
|
||||
|
||||
for (i=0; i<prefixLen; ++i) {
|
||||
if (curs >= limit) {
|
||||
if (i > 0) {
|
||||
// We've already matched a character. This is
|
||||
// a partial match, so we return if in
|
||||
// incremental mode. In non-incremental mode,
|
||||
// go to the next spec.
|
||||
if (isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
char c = filteredCharAt(text, curs++);
|
||||
if (c != affixes[ipat + i]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
continue loop;
|
||||
}
|
||||
u |= digit << (4 * (5-i));
|
||||
|
||||
if (match) {
|
||||
char u = 0;
|
||||
int digitCount = 0;
|
||||
for (;;) {
|
||||
if (curs >= limit) {
|
||||
// Check for partial match in incremental mode.
|
||||
if (curs > cursor && isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
break;
|
||||
}
|
||||
int digit = Character.digit(filteredCharAt(text, curs), 16);
|
||||
if (digit < 0) {
|
||||
break;
|
||||
}
|
||||
++curs;
|
||||
u <<= 4;
|
||||
u |= (char) digit;
|
||||
if (++digitCount == maxDigits) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
match = (digitCount >= minDigits);
|
||||
|
||||
if (match) {
|
||||
for (i=0; i<suffixLen; ++i) {
|
||||
if (curs >= limit) {
|
||||
// Check for partial match in incremental mode.
|
||||
if (curs > cursor && isIncremental) {
|
||||
break loop;
|
||||
}
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
char c = filteredCharAt(text, curs++);
|
||||
if (c != affixes[ipat + prefixLen + i]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (match) {
|
||||
// At this point, we have a match
|
||||
text.replace(cursor, curs, String.valueOf(u));
|
||||
limit -= curs - cursor - 1;
|
||||
// The following break statement leaves the
|
||||
// loop that is traversing the specs in
|
||||
// affixes. We then parse the next input
|
||||
// character.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ipat += prefixLen + suffixLen;
|
||||
}
|
||||
|
||||
c = filteredCharAt(text, cursor);
|
||||
char d = filteredCharAt(text, cursor + 1);
|
||||
if (((c == 'U' || c == 'u') && d == '+')
|
||||
|| (c == '\\' && (d == 'U' || d == 'u'))) {
|
||||
|
||||
// At this point, we have a match; replace cursor..cursor+5
|
||||
// with u.
|
||||
text.replace(cursor, cursor+6, String.valueOf((char) u));
|
||||
limit -= 5;
|
||||
maxCursor -= 5;
|
||||
|
||||
++cursor;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
++cursor;
|
||||
}
|
||||
|
||||
offsets.limit = limit;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeToHexTransliterator.java,v $
|
||||
* $Date: 2000/03/10 04:07:25 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2000/03/22 01:59:55 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -19,47 +19,226 @@ import java.util.*;
|
|||
* prefix specified in the constructor and optionally converts the hex
|
||||
* digits to uppercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
* <p>The format of the output is set by a pattern. This pattern
|
||||
* follows the same syntax as <code>HexToUnicodeTransliterator</code>,
|
||||
* except it does not allow multiple specifications. The pattern sets
|
||||
* the prefix string, suffix string, and minimum and maximum digit
|
||||
* count. There are no setters or getters for these attributes; they
|
||||
* are set only through the pattern.
|
||||
*
|
||||
* <p>The setUppercase() and isUppercase() methods control whether 'a'
|
||||
* through 'f' or 'A' through 'F' are output as hex digits. This is
|
||||
* not controlled through the pattern; only through the methods. The
|
||||
* default is uppercase.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.5 $ $Date: 2000/03/10 04:07:25 $
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/03/22 01:59:55 $
|
||||
*/
|
||||
public class UnicodeToHexTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Unicode-Hex";
|
||||
|
||||
private String prefix;
|
||||
|
||||
private boolean uppercase;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static final String _ID = "Unicode-Hex";
|
||||
|
||||
private static final char[] HEX_DIGITS = {
|
||||
'0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
|
||||
'0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
|
||||
};
|
||||
|
||||
// Character constants for special pattern chars
|
||||
private static final char ZERO = '0';
|
||||
private static final char POUND = '#';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
/**
|
||||
* The pattern set by applyPattern() and returned by toPattern().
|
||||
*/
|
||||
private String pattern;
|
||||
|
||||
/**
|
||||
* The string preceding the hex digits, parsed from the pattern.
|
||||
*/
|
||||
private String prefix;
|
||||
|
||||
/**
|
||||
* The string following the hex digits, parsed from the pattern.
|
||||
*/
|
||||
private String suffix;
|
||||
|
||||
/**
|
||||
* The minimum number of hex digits to output, between 1 and 4,
|
||||
* inclusive. Parsed from the pattern.
|
||||
*/
|
||||
private int minDigits;
|
||||
|
||||
/**
|
||||
* If true, output uppercase hex digits; otherwise output
|
||||
* lowercase. Set by setUppercase() and returned by isUppercase().
|
||||
*/
|
||||
private boolean uppercase;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param pattern The pattern for this transliterator. See
|
||||
* applyPattern() for pattern syntax.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
* @param filter the filter for this transliterator, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String prefix, boolean uppercase,
|
||||
public UnicodeToHexTransliterator(String pattern, boolean uppercase,
|
||||
UnicodeFilter filter) {
|
||||
super(_ID, filter);
|
||||
this.prefix = prefix;
|
||||
this.uppercase = uppercase;
|
||||
applyPattern(pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an uppercase transliterator with no filter by delegating to the three-argument constructor.
|
||||
* @param pattern The pattern for this transliterator. See
|
||||
* applyPattern() for pattern syntax.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String pattern) {
|
||||
this(pattern, true, null); // uppercase = true, filter = null
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
* that outputs four uppercase hex digits.
|
||||
*/
|
||||
public UnicodeToHexTransliterator() {
|
||||
this("\\u", true, null);
|
||||
super(_ID, null);
|
||||
pattern = "\\\\u0000";
|
||||
prefix = "\\u";
|
||||
suffix = "";
|
||||
minDigits = 4;
|
||||
uppercase = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pattern recognized by this transliterator. The pattern
|
||||
* must contain zero or more prefix characters, one or more digit
|
||||
* characters, and zero or more suffix characters. The digit
|
||||
* characters indicates optional digits ('#') followed by required
|
||||
* digits ('0'). The total number of digits cannot exceed 4, and
|
||||
* must be at least 1 required digit. Use a backslash ('\\') to
|
||||
* escape any of the special characters. An empty pattern is not
|
||||
* allowed.
|
||||
*
|
||||
* <p>Example: "U+0000" specifies a prefix of "U+", exactly four
|
||||
* digits, and no suffix. "<###0>" has a prefix of "<", between
|
||||
* one and four digits, and a suffix of ">".
|
||||
*
|
||||
* <p><pre>
|
||||
* pattern := prefix-char* digit-spec suffix-char*
|
||||
* digit-spec := '#'* '0'+
|
||||
* prefix-char := [^special-char] | '\\' special-char
|
||||
* suffix-char := [^special-char] | '\\' special-char
|
||||
* special-char := ';' | '0' | '#' | '\\'
|
||||
* </pre>
|
||||
*
|
||||
* <p>Limitations: There is no way to set the uppercase attribute
|
||||
* in the pattern. (applyPattern() does not alter the uppercase
|
||||
* attribute.)
|
||||
*/
|
||||
public void applyPattern(String thePattern) {
|
||||
StringBuffer prefixBuf = null;
|
||||
StringBuffer suffixBuf = null;
|
||||
int minDigits = 0;
|
||||
int maxDigits = 0;
|
||||
|
||||
/* The mode specifies where we are in each spec.
|
||||
* mode 0 = in prefix
|
||||
* mode 1 = in optional digits (#)
|
||||
* mode 2 = in required digits (0)
|
||||
* mode 3 = in suffix
|
||||
*/
|
||||
int mode = 0;
|
||||
|
||||
for (int i=0; i<thePattern.length(); ++i) {
|
||||
char c = thePattern.charAt(i);
|
||||
boolean isLiteral = false;
|
||||
if (c == BACKSLASH) {
|
||||
if ((i+1)<thePattern.length()) {
|
||||
isLiteral = true;
|
||||
c = thePattern.charAt(++i);
|
||||
} else {
|
||||
// Trailing '\\'
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
if (!isLiteral) {
|
||||
switch (c) {
|
||||
case POUND:
|
||||
// Seeing a '#' moves us from mode 0 (prefix) to mode 1
|
||||
// (optional digits).
|
||||
if (mode == 0) {
|
||||
++mode;
|
||||
} else if (mode != 1) {
|
||||
// Unquoted '#'
|
||||
throw new IllegalArgumentException("Unquoted '#'");
|
||||
}
|
||||
++maxDigits;
|
||||
break;
|
||||
case ZERO:
|
||||
// Seeing a '0' moves us to mode 2 (required digits)
|
||||
if (mode < 2) {
|
||||
mode = 2;
|
||||
} else if (mode != 2) {
|
||||
// Unquoted '0'
|
||||
throw new IllegalArgumentException("Unquoted '0'");
|
||||
}
|
||||
++minDigits;
|
||||
++maxDigits;
|
||||
break;
|
||||
default:
|
||||
isLiteral = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isLiteral) {
|
||||
if (mode == 0) {
|
||||
if (prefixBuf == null) {
|
||||
prefixBuf = new StringBuffer();
|
||||
}
|
||||
prefixBuf.append(c);
|
||||
} else {
|
||||
// Any literal outside the prefix moves us into mode 3
|
||||
// (suffix)
|
||||
mode = 3;
|
||||
if (suffixBuf == null) {
|
||||
suffixBuf = new StringBuffer();
|
||||
}
|
||||
suffixBuf.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (minDigits < 1 || maxDigits > 4) {
|
||||
// Invalid min/max digit count
|
||||
throw new IllegalArgumentException("Invalid min/max digit count");
|
||||
}
|
||||
|
||||
pattern = thePattern;
|
||||
prefix = (prefixBuf == null) ? "" : prefixBuf.toString();
|
||||
suffix = (suffixBuf == null) ? "" : suffixBuf.toString();
|
||||
this.minDigits = minDigits;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return this transliterator's pattern: the string stored by the last successful applyPattern() call, or the default assigned by the no-argument constructor.
|
||||
*/
|
||||
public String toPattern() {
|
||||
return pattern;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -116,16 +295,28 @@ public class UnicodeToHexTransliterator extends Transliterator {
|
|||
int limit = offsets.limit;
|
||||
|
||||
UnicodeFilter filter = getFilter();
|
||||
StringBuffer hex = new StringBuffer(prefix);
|
||||
int prefixLen = prefix.length();
|
||||
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
if (filter != null && !filter.contains(c)) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
String hex = hex(c);
|
||||
text.replace(cursor, cursor+1, hex);
|
||||
|
||||
hex.setLength(prefixLen);
|
||||
boolean showRest = false;
|
||||
for (int i=3; i>=0; --i) {
|
||||
int d = (c >> (i*4)) & 0xF;
|
||||
if (showRest || (d != 0) || minDigits > i) {
|
||||
hex.append(HEX_DIGITS[uppercase ? (d|16) : d]);
|
||||
showRest = true;
|
||||
}
|
||||
}
|
||||
hex.append(suffix);
|
||||
|
||||
text.replace(cursor, cursor+1, hex.toString());
|
||||
int len = hex.length();
|
||||
cursor += len; // Advance cursor by 1 and adjust for new text
|
||||
--len;
|
||||
|
@ -135,24 +326,4 @@ public class UnicodeToHexTransliterator extends Transliterator {
|
|||
offsets.limit = limit;
|
||||
offsets.cursor = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Form the escape sequence for c: the prefix followed by exactly four hex digits, zero-padded on the left and uppercased when the uppercase flag is set.
|
||||
*/
|
||||
private final String hex(char c) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append(prefix);
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
String h = Integer.toHexString(c);
|
||||
buf.append(uppercase ? h.toUpperCase() : h);
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue