ICU-60 add unescapeAt and unescape methods

X-SVN-Rev: 5905
2025-04-13 08:53:20 +00:00 · 2001-09-24 19:57:51 +00:00 · 2001-09-24 19:57:51 +00:00 · 1260e882d6
commit 1260e882d6
parent a01b74ee85
2 changed files with 296 additions and 4 deletions
--- a/icu4j/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/src/com/ibm/icu/impl/Utility.java
@ -5,12 +5,14 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $ 
- * $Date: 2001/07/03 16:35:12 $ 
- * $Revision: 1.6 $
+ * $Date: 2001/09/24 19:57:51 $ 
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
 package com.ibm.util;
+import com.ibm.text.UCharacter;
+import com.ibm.text.UTF16;

 public final class Utility {

@ -635,6 +637,140 @@ public final class Utility {
        return buf.toString();
    }

+    /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
+    static private final char[] UNESCAPE_MAP = {
+        /*"   0x22, 0x22 */
+        /*'   0x27, 0x27 */
+        /*?   0x3F, 0x3F */
+        /*\   0x5C, 0x5C */
+        /*a*/ 0x61, 0x07,
+        /*b*/ 0x62, 0x08,
+        /*f*/ 0x66, 0x0c,
+        /*n*/ 0x6E, 0x0a,
+        /*r*/ 0x72, 0x0d,
+        /*t*/ 0x74, 0x09,
+        /*v*/ 0x76, 0x0b
+    };
+
+    /**
+     * Convert an escape to a 32-bit code point value.  We attempt
+     * to parallel the icu4c unesacpeAt() function.
+     * @param offset16 an array containing offset to the character
+     * <em>after</em> the backslash.  Upon return offset16[0] will
+     * be updated to point after the escape sequence.
+     * @return character value from 0 to 10FFFF, or -1 on error.
+     */
+    public static int unescapeAt(String s, int[] offset16) {
+        int c;
+        int result = 0;
+        int n = 0;
+        int minDig = 0;
+        int maxDig = 0;
+        int bitsPerDigit = 4; 
+        int dig;
+        int i;
+
+        /* Check that offset is in range */
+        int offset = offset16[0];
+        int length = s.length();
+        if (offset < 0 || offset >= length) {
+            return -1;
+        }
+
+        /* Fetch first UChar after '\\' */
+        c = UTF16.charAt(s, offset);
+        offset += UTF16.getCharCount(c);
+
+        /* Convert hexadecimal and octal escapes */
+        switch (c) {
+        case 'u':
+            minDig = maxDig = 4;
+            break;
+        case 'U':
+            minDig = maxDig = 8;
+            break;
+        case 'x':
+            minDig = 1;
+            maxDig = 2;
+            break;
+        default:
+            dig = UCharacter.digit(c, 8);
+            if (dig >= 0) {
+                minDig = 1;
+                maxDig = 3;
+                n = 1; /* Already have first octal digit */
+                bitsPerDigit = 3;
+                result = dig;
+            }
+            break;
+        }
+        if (minDig != 0) {
+            while (offset < length && n < maxDig) {
+                // TEMPORARY
+                // TODO: Restore the char32-based code when UCharacter.digit
+                // is working (Bug 66).
+
+                //c = UTF16.charAt(s, offset);
+                //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
+                c = s.charAt(offset);
+                dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
+                if (dig < 0) {
+                    break;
+                }
+                result = (result << bitsPerDigit) | dig;
+                //offset += UTF16.getCharCount(c);
+                ++offset;
+                ++n;
+            }
+            if (n < minDig) {
+                return -1;
+            }
+            offset16[0] = offset;
+            return result;
+        }
+
+        /* Convert C-style escapes in table */
+        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
+            if (c == UNESCAPE_MAP[i]) {
+                offset16[0] = offset;
+                return UNESCAPE_MAP[i+1];
+            } else if (c < UNESCAPE_MAP[i]) {
+                break;
+            }
+        }
+        
+        /* If no special forms are recognized, then consider
+         * the backslash to generically escape the next character. */        
+        offset16[0] = offset;
+        return c;
+    }
+
+    /**
+     * Convert all escapes in a given string using unescapeAt().
+     * @exception IllegalArgumentException if an invalid escape is
+     * seen.
+     */
+    public static String unescape(String s) {
+        StringBuffer buf = new StringBuffer();
+        int[] pos = new int[1];
+        for (int i=0; i<s.length(); ) {
+            char c = s.charAt(i++);
+            if (c == '\\') {
+                pos[0] = i;
+                int e = unescapeAt(s, pos);
+                if (e < 0) {
+                    throw new IllegalArgumentException("Invalid escape sequence " +
+                                                       s.substring(i-1, Math.min(i+8, s.length())));
+                }
+                UTF16.append(buf, e);
+                i = pos[0];
+            } else {
+                buf.append(c);
+            }
+        }
+        return buf.toString();
+    }
+
    /**
     * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
     * "0041".
@ -689,6 +825,16 @@ public final class Utility {
        return output;
    }

+    /**
+     * Convert a integer to size width (minimum) hex uppercase digits. 
+     * E.g., hex('a', 4, str) => "0041".  If the integer requires more
+     * than width digits, more will be used.
+     */
+    public static String hex(int ch, int width) {
+        String foo = Integer.toString(ch, 16).toUpperCase();
+        return "0000000".substring(foo.length() + 7 - width) + foo;
+    }
+
    /**
     * Convert a string to comma-separated groups of 4 hex uppercase
     * digits.  E.g., hex('ab') => "0041,0042".  Append the output
--- a/icu4j/src/com/ibm/util/Utility.java
+++ b/icu4j/src/com/ibm/util/Utility.java
@ -5,12 +5,14 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/util/Attic/Utility.java,v $ 
- * $Date: 2001/07/03 16:35:12 $ 
- * $Revision: 1.6 $
+ * $Date: 2001/09/24 19:57:51 $ 
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
 package com.ibm.util;
+import com.ibm.text.UCharacter;
+import com.ibm.text.UTF16;

 public final class Utility {

@ -635,6 +637,140 @@ public final class Utility {
        return buf.toString();
    }

+    /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
+    static private final char[] UNESCAPE_MAP = {
+        /*"   0x22, 0x22 */
+        /*'   0x27, 0x27 */
+        /*?   0x3F, 0x3F */
+        /*\   0x5C, 0x5C */
+        /*a*/ 0x61, 0x07,
+        /*b*/ 0x62, 0x08,
+        /*f*/ 0x66, 0x0c,
+        /*n*/ 0x6E, 0x0a,
+        /*r*/ 0x72, 0x0d,
+        /*t*/ 0x74, 0x09,
+        /*v*/ 0x76, 0x0b
+    };
+
+    /**
+     * Convert an escape to a 32-bit code point value.  We attempt
+     * to parallel the icu4c unesacpeAt() function.
+     * @param offset16 an array containing offset to the character
+     * <em>after</em> the backslash.  Upon return offset16[0] will
+     * be updated to point after the escape sequence.
+     * @return character value from 0 to 10FFFF, or -1 on error.
+     */
+    public static int unescapeAt(String s, int[] offset16) {
+        int c;
+        int result = 0;
+        int n = 0;
+        int minDig = 0;
+        int maxDig = 0;
+        int bitsPerDigit = 4; 
+        int dig;
+        int i;
+
+        /* Check that offset is in range */
+        int offset = offset16[0];
+        int length = s.length();
+        if (offset < 0 || offset >= length) {
+            return -1;
+        }
+
+        /* Fetch first UChar after '\\' */
+        c = UTF16.charAt(s, offset);
+        offset += UTF16.getCharCount(c);
+
+        /* Convert hexadecimal and octal escapes */
+        switch (c) {
+        case 'u':
+            minDig = maxDig = 4;
+            break;
+        case 'U':
+            minDig = maxDig = 8;
+            break;
+        case 'x':
+            minDig = 1;
+            maxDig = 2;
+            break;
+        default:
+            dig = UCharacter.digit(c, 8);
+            if (dig >= 0) {
+                minDig = 1;
+                maxDig = 3;
+                n = 1; /* Already have first octal digit */
+                bitsPerDigit = 3;
+                result = dig;
+            }
+            break;
+        }
+        if (minDig != 0) {
+            while (offset < length && n < maxDig) {
+                // TEMPORARY
+                // TODO: Restore the char32-based code when UCharacter.digit
+                // is working (Bug 66).
+
+                //c = UTF16.charAt(s, offset);
+                //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
+                c = s.charAt(offset);
+                dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
+                if (dig < 0) {
+                    break;
+                }
+                result = (result << bitsPerDigit) | dig;
+                //offset += UTF16.getCharCount(c);
+                ++offset;
+                ++n;
+            }
+            if (n < minDig) {
+                return -1;
+            }
+            offset16[0] = offset;
+            return result;
+        }
+
+        /* Convert C-style escapes in table */
+        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
+            if (c == UNESCAPE_MAP[i]) {
+                offset16[0] = offset;
+                return UNESCAPE_MAP[i+1];
+            } else if (c < UNESCAPE_MAP[i]) {
+                break;
+            }
+        }
+        
+        /* If no special forms are recognized, then consider
+         * the backslash to generically escape the next character. */        
+        offset16[0] = offset;
+        return c;
+    }
+
+    /**
+     * Convert all escapes in a given string using unescapeAt().
+     * @exception IllegalArgumentException if an invalid escape is
+     * seen.
+     */
+    public static String unescape(String s) {
+        StringBuffer buf = new StringBuffer();
+        int[] pos = new int[1];
+        for (int i=0; i<s.length(); ) {
+            char c = s.charAt(i++);
+            if (c == '\\') {
+                pos[0] = i;
+                int e = unescapeAt(s, pos);
+                if (e < 0) {
+                    throw new IllegalArgumentException("Invalid escape sequence " +
+                                                       s.substring(i-1, Math.min(i+8, s.length())));
+                }
+                UTF16.append(buf, e);
+                i = pos[0];
+            } else {
+                buf.append(c);
+            }
+        }
+        return buf.toString();
+    }
+
    /**
     * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
     * "0041".
@ -689,6 +825,16 @@ public final class Utility {
        return output;
    }

+    /**
+     * Convert a integer to size width (minimum) hex uppercase digits. 
+     * E.g., hex('a', 4, str) => "0041".  If the integer requires more
+     * than width digits, more will be used.
+     */
+    public static String hex(int ch, int width) {
+        String foo = Integer.toString(ch, 16).toUpperCase();
+        return "0000000".substring(foo.length() + 7 - width) + foo;
+    }
+
    /**
     * Convert a string to comma-separated groups of 4 hex uppercase
     * digits.  E.g., hex('ab') => "0041,0042".  Append the output