ICU-60 add unescapeAt and unescape methods

X-SVN-Rev: 5905
This commit is contained in:
Alan Liu 2001-09-24 19:57:51 +00:00
parent a01b74ee85
commit 1260e882d6
2 changed files with 296 additions and 4 deletions
icu4j/src/com/ibm

View file

@ -5,12 +5,14 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $
* $Date: 2001/07/03 16:35:12 $
* $Revision: 1.6 $
* $Date: 2001/09/24 19:57:51 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
package com.ibm.util;
import com.ibm.text.UCharacter;
import com.ibm.text.UTF16;
public final class Utility {
@ -635,6 +637,140 @@ public final class Utility {
return buf.toString();
}
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
static private final char[] UNESCAPE_MAP = {
/*" 0x22, 0x22 */
/*' 0x27, 0x27 */
/*? 0x3F, 0x3F */
/*\ 0x5C, 0x5C */
/*a*/ 0x61, 0x07,
/*b*/ 0x62, 0x08,
/*f*/ 0x66, 0x0c,
/*n*/ 0x6E, 0x0a,
/*r*/ 0x72, 0x0d,
/*t*/ 0x74, 0x09,
/*v*/ 0x76, 0x0b
};
/**
* Convert an escape to a 32-bit code point value. We attempt
* to parallel the icu4c unesacpeAt() function.
* @param offset16 an array containing offset to the character
* <em>after</em> the backslash. Upon return offset16[0] will
* be updated to point after the escape sequence.
* @return character value from 0 to 10FFFF, or -1 on error.
*/
public static int unescapeAt(String s, int[] offset16) {
int c;
int result = 0;
int n = 0;
int minDig = 0;
int maxDig = 0;
int bitsPerDigit = 4;
int dig;
int i;
/* Check that offset is in range */
int offset = offset16[0];
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
/* Fetch first UChar after '\\' */
c = UTF16.charAt(s, offset);
offset += UTF16.getCharCount(c);
/* Convert hexadecimal and octal escapes */
switch (c) {
case 'u':
minDig = maxDig = 4;
break;
case 'U':
minDig = maxDig = 8;
break;
case 'x':
minDig = 1;
maxDig = 2;
break;
default:
dig = UCharacter.digit(c, 8);
if (dig >= 0) {
minDig = 1;
maxDig = 3;
n = 1; /* Already have first octal digit */
bitsPerDigit = 3;
result = dig;
}
break;
}
if (minDig != 0) {
while (offset < length && n < maxDig) {
// TEMPORARY
// TODO: Restore the char32-based code when UCharacter.digit
// is working (Bug 66).
//c = UTF16.charAt(s, offset);
//dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
c = s.charAt(offset);
dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
if (dig < 0) {
break;
}
result = (result << bitsPerDigit) | dig;
//offset += UTF16.getCharCount(c);
++offset;
++n;
}
if (n < minDig) {
return -1;
}
offset16[0] = offset;
return result;
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
offset16[0] = offset;
return UNESCAPE_MAP[i+1];
} else if (c < UNESCAPE_MAP[i]) {
break;
}
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
offset16[0] = offset;
return c;
}
/**
* Convert all escapes in a given string using unescapeAt().
* @exception IllegalArgumentException if an invalid escape is
* seen.
*/
public static String unescape(String s) {
StringBuffer buf = new StringBuffer();
int[] pos = new int[1];
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
pos[0] = i;
int e = unescapeAt(s, pos);
if (e < 0) {
throw new IllegalArgumentException("Invalid escape sequence " +
s.substring(i-1, Math.min(i+8, s.length())));
}
UTF16.append(buf, e);
i = pos[0];
} else {
buf.append(c);
}
}
return buf.toString();
}
/**
* Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
* "0041".
@ -689,6 +825,16 @@ public final class Utility {
return output;
}
/**
* Convert a integer to size width (minimum) hex uppercase digits.
* E.g., hex('a', 4, str) => "0041". If the integer requires more
* than width digits, more will be used.
*/
public static String hex(int ch, int width) {
String foo = Integer.toString(ch, 16).toUpperCase();
return "0000000".substring(foo.length() + 7 - width) + foo;
}
/**
* Convert a string to comma-separated groups of 4 hex uppercase
* digits. E.g., hex('ab') => "0041,0042". Append the output

View file

@ -5,12 +5,14 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/util/Attic/Utility.java,v $
* $Date: 2001/07/03 16:35:12 $
* $Revision: 1.6 $
* $Date: 2001/09/24 19:57:51 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
package com.ibm.util;
import com.ibm.text.UCharacter;
import com.ibm.text.UTF16;
public final class Utility {
@ -635,6 +637,140 @@ public final class Utility {
return buf.toString();
}
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
static private final char[] UNESCAPE_MAP = {
/*" 0x22, 0x22 */
/*' 0x27, 0x27 */
/*? 0x3F, 0x3F */
/*\ 0x5C, 0x5C */
/*a*/ 0x61, 0x07,
/*b*/ 0x62, 0x08,
/*f*/ 0x66, 0x0c,
/*n*/ 0x6E, 0x0a,
/*r*/ 0x72, 0x0d,
/*t*/ 0x74, 0x09,
/*v*/ 0x76, 0x0b
};
/**
* Convert an escape to a 32-bit code point value. We attempt
* to parallel the icu4c unesacpeAt() function.
* @param offset16 an array containing offset to the character
* <em>after</em> the backslash. Upon return offset16[0] will
* be updated to point after the escape sequence.
* @return character value from 0 to 10FFFF, or -1 on error.
*/
public static int unescapeAt(String s, int[] offset16) {
int c;
int result = 0;
int n = 0;
int minDig = 0;
int maxDig = 0;
int bitsPerDigit = 4;
int dig;
int i;
/* Check that offset is in range */
int offset = offset16[0];
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
/* Fetch first UChar after '\\' */
c = UTF16.charAt(s, offset);
offset += UTF16.getCharCount(c);
/* Convert hexadecimal and octal escapes */
switch (c) {
case 'u':
minDig = maxDig = 4;
break;
case 'U':
minDig = maxDig = 8;
break;
case 'x':
minDig = 1;
maxDig = 2;
break;
default:
dig = UCharacter.digit(c, 8);
if (dig >= 0) {
minDig = 1;
maxDig = 3;
n = 1; /* Already have first octal digit */
bitsPerDigit = 3;
result = dig;
}
break;
}
if (minDig != 0) {
while (offset < length && n < maxDig) {
// TEMPORARY
// TODO: Restore the char32-based code when UCharacter.digit
// is working (Bug 66).
//c = UTF16.charAt(s, offset);
//dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
c = s.charAt(offset);
dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
if (dig < 0) {
break;
}
result = (result << bitsPerDigit) | dig;
//offset += UTF16.getCharCount(c);
++offset;
++n;
}
if (n < minDig) {
return -1;
}
offset16[0] = offset;
return result;
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
offset16[0] = offset;
return UNESCAPE_MAP[i+1];
} else if (c < UNESCAPE_MAP[i]) {
break;
}
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
offset16[0] = offset;
return c;
}
/**
* Convert all escapes in a given string using unescapeAt().
* @exception IllegalArgumentException if an invalid escape is
* seen.
*/
public static String unescape(String s) {
StringBuffer buf = new StringBuffer();
int[] pos = new int[1];
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
pos[0] = i;
int e = unescapeAt(s, pos);
if (e < 0) {
throw new IllegalArgumentException("Invalid escape sequence " +
s.substring(i-1, Math.min(i+8, s.length())));
}
UTF16.append(buf, e);
i = pos[0];
} else {
buf.append(c);
}
}
return buf.toString();
}
/**
* Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
* "0041".
@ -689,6 +825,16 @@ public final class Utility {
return output;
}
/**
* Convert a integer to size width (minimum) hex uppercase digits.
* E.g., hex('a', 4, str) => "0041". If the integer requires more
* than width digits, more will be used.
*/
public static String hex(int ch, int width) {
String foo = Integer.toString(ch, 16).toUpperCase();
return "0000000".substring(foo.length() + 7 - width) + foo;
}
/**
* Convert a string to comma-separated groups of 4 hex uppercase
* digits. E.g., hex('ab') => "0041,0042". Append the output