mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-60 add unescapeAt and unescape methods
X-SVN-Rev: 5905
This commit is contained in:
parent
a01b74ee85
commit
1260e882d6
2 changed files with 296 additions and 4 deletions
icu4j/src/com/ibm
|
@ -5,12 +5,14 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $
|
||||
* $Date: 2001/07/03 16:35:12 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/09/24 19:57:51 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.util;
|
||||
import com.ibm.text.UCharacter;
|
||||
import com.ibm.text.UTF16;
|
||||
|
||||
public final class Utility {
|
||||
|
||||
|
@ -635,6 +637,140 @@ public final class Utility {
|
|||
return buf.toString();
|
||||
}
|
||||
|
||||
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
|
||||
static private final char[] UNESCAPE_MAP = {
|
||||
/*" 0x22, 0x22 */
|
||||
/*' 0x27, 0x27 */
|
||||
/*? 0x3F, 0x3F */
|
||||
/*\ 0x5C, 0x5C */
|
||||
/*a*/ 0x61, 0x07,
|
||||
/*b*/ 0x62, 0x08,
|
||||
/*f*/ 0x66, 0x0c,
|
||||
/*n*/ 0x6E, 0x0a,
|
||||
/*r*/ 0x72, 0x0d,
|
||||
/*t*/ 0x74, 0x09,
|
||||
/*v*/ 0x76, 0x0b
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt
|
||||
* to parallel the icu4c unesacpeAt() function.
|
||||
* @param offset16 an array containing offset to the character
|
||||
* <em>after</em> the backslash. Upon return offset16[0] will
|
||||
* be updated to point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
maxDig = 2;
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
n = 1; /* Already have first octal digit */
|
||||
bitsPerDigit = 3;
|
||||
result = dig;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
// TEMPORARY
|
||||
// TODO: Restore the char32-based code when UCharacter.digit
|
||||
// is working (Bug 66).
|
||||
|
||||
//c = UTF16.charAt(s, offset);
|
||||
//dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
c = s.charAt(offset);
|
||||
dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
//offset += UTF16.getCharCount(c);
|
||||
++offset;
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i+1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all escapes in a given string using unescapeAt().
|
||||
* @exception IllegalArgumentException if an invalid escape is
|
||||
* seen.
|
||||
*/
|
||||
public static String unescape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
int[] pos = new int[1];
|
||||
for (int i=0; i<s.length(); ) {
|
||||
char c = s.charAt(i++);
|
||||
if (c == '\\') {
|
||||
pos[0] = i;
|
||||
int e = unescapeAt(s, pos);
|
||||
if (e < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape sequence " +
|
||||
s.substring(i-1, Math.min(i+8, s.length())));
|
||||
}
|
||||
UTF16.append(buf, e);
|
||||
i = pos[0];
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a char to 4 hex uppercase digits.  E.g., hex('A') =>
|
||||
* "0041".
|
||||
|
@ -689,6 +825,16 @@ public final class Utility {
|
|||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a integer to size width (minimum) hex uppercase digits.
|
||||
* E.g., hex('a', 4, str) => "0041". If the integer requires more
|
||||
* than width digits, more will be used.
|
||||
*/
|
||||
public static String hex(int ch, int width) {
|
||||
String foo = Integer.toString(ch, 16).toUpperCase();
|
||||
return "0000000".substring(foo.length() + 7 - width) + foo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a string to comma-separated groups of 4 hex uppercase
|
||||
* digits.  E.g., hex('AB') => "0041,0042".  Append the output
|
||||
|
|
|
@ -5,12 +5,14 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/util/Attic/Utility.java,v $
|
||||
* $Date: 2001/07/03 16:35:12 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/09/24 19:57:51 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.util;
|
||||
import com.ibm.text.UCharacter;
|
||||
import com.ibm.text.UTF16;
|
||||
|
||||
public final class Utility {
|
||||
|
||||
|
@ -635,6 +637,140 @@ public final class Utility {
|
|||
return buf.toString();
|
||||
}
|
||||
|
||||
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
|
||||
static private final char[] UNESCAPE_MAP = {
|
||||
/*" 0x22, 0x22 */
|
||||
/*' 0x27, 0x27 */
|
||||
/*? 0x3F, 0x3F */
|
||||
/*\ 0x5C, 0x5C */
|
||||
/*a*/ 0x61, 0x07,
|
||||
/*b*/ 0x62, 0x08,
|
||||
/*f*/ 0x66, 0x0c,
|
||||
/*n*/ 0x6E, 0x0a,
|
||||
/*r*/ 0x72, 0x0d,
|
||||
/*t*/ 0x74, 0x09,
|
||||
/*v*/ 0x76, 0x0b
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt
|
||||
* to parallel the icu4c unesacpeAt() function.
|
||||
* @param offset16 an array containing offset to the character
|
||||
* <em>after</em> the backslash. Upon return offset16[0] will
|
||||
* be updated to point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
maxDig = 2;
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
n = 1; /* Already have first octal digit */
|
||||
bitsPerDigit = 3;
|
||||
result = dig;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
// TEMPORARY
|
||||
// TODO: Restore the char32-based code when UCharacter.digit
|
||||
// is working (Bug 66).
|
||||
|
||||
//c = UTF16.charAt(s, offset);
|
||||
//dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
c = s.charAt(offset);
|
||||
dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
//offset += UTF16.getCharCount(c);
|
||||
++offset;
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i+1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all escapes in a given string using unescapeAt().
|
||||
* @exception IllegalArgumentException if an invalid escape is
|
||||
* seen.
|
||||
*/
|
||||
public static String unescape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
int[] pos = new int[1];
|
||||
for (int i=0; i<s.length(); ) {
|
||||
char c = s.charAt(i++);
|
||||
if (c == '\\') {
|
||||
pos[0] = i;
|
||||
int e = unescapeAt(s, pos);
|
||||
if (e < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape sequence " +
|
||||
s.substring(i-1, Math.min(i+8, s.length())));
|
||||
}
|
||||
UTF16.append(buf, e);
|
||||
i = pos[0];
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a char to 4 hex uppercase digits.  E.g., hex('A') =>
|
||||
* "0041".
|
||||
|
@ -689,6 +825,16 @@ public final class Utility {
|
|||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a integer to size width (minimum) hex uppercase digits.
|
||||
* E.g., hex('a', 4, str) => "0041". If the integer requires more
|
||||
* than width digits, more will be used.
|
||||
*/
|
||||
public static String hex(int ch, int width) {
|
||||
String foo = Integer.toString(ch, 16).toUpperCase();
|
||||
return "0000000".substring(foo.length() + 7 - width) + foo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a string to comma-separated groups of 4 hex uppercase
|
||||
* digits.  E.g., hex('AB') => "0041,0042".  Append the output
|
||||
|
|
Loading…
Add table
Reference in a new issue