ICU-21645 reduce heap allocations in unescape() and other parsing

2025-04-13 17:01:16 +00:00 · 2021-06-19 20:25:33 +00:00 · 2021-06-19 20:25:33 +00:00 · ca3e86d119
commit ca3e86d119
parent d9aa1ecab0
8 changed files with 162 additions and 122 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java
@ -52,7 +52,7 @@ public class RuleCharacterIterator {
    /**
     * Current variable expansion, or null if none.
     */
-    private char[] buf;
+    private String buf;

    /**
     * Position within buf[].  Meaningless if buf == null.
@ -79,7 +79,7 @@ public class RuleCharacterIterator {
    /**
     * Bitmask option to enable parsing of escape sequences.  If (options &
     * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
-     * to its value.  Escapes are parsed using Utility.unescapeAt().
+     * to its value.  Escapes are parsed using Utility.unescapeAndLengthAt().
     */
    public static final int PARSE_ESCAPES   = 2;

@ -90,6 +90,13 @@ public class RuleCharacterIterator {
     */
    public static final int SKIP_WHITESPACE = 4;

+    /** For use with {@link #getPos(Position)} and {@link #setPos(Position)}. */
+    public static final class Position {
+        private String buf;    // saved variable expansion buffer, or null if none
+        private int bufPos;    // saved position within buf
+        private int posIndex;  // saved value of pos.getIndex()
+    }
+
    /**
     * Constructs an iterator over the given text, starting at the given
     * position.
@ -144,15 +151,17 @@ public class RuleCharacterIterator {
                    break;
                }
                bufPos = 0;
-                buf = sym.lookup(name);
-                if (buf == null) {
+                char[] chars = sym.lookup(name);
+                if (chars == null) {
+                    buf = null;
                    throw new IllegalArgumentException(
                                "Undefined variable: " + name);
                }
                // Handle empty variable value
-                if (buf.length == 0) {
+                buf = new String(chars);  // becomes the current expansion unless it is empty
+                if (chars.length == 0) {  // assign first so the null below is not overwritten
                    buf = null;
                }
                continue;
            }

@ -162,13 +171,14 @@ public class RuleCharacterIterator {
            }

            if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
-                int offset[] = new int[] { 0 };
-                c = Utility.unescapeAt(lookahead(), offset);
-                jumpahead(offset[0]);
-                isEscaped = true;
-                if (c < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(
+                        getCurrentBuffer(), getCurrentBufferPos());
+                if (cpAndLength < 0) {
                    throw new IllegalArgumentException("Invalid escape");
                }
+                c = Utility.cpFromCodePointAndLength(cpAndLength);
+                jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
+                isEscaped = true;
            }

            break;
@ -199,7 +209,7 @@ public class RuleCharacterIterator {
     * restore this iterator's position.  Usage idiom:
     *
     * RuleCharacterIterator iterator = ...;
-     * Object pos = iterator.getPos(null); // allocate position object
+     * Position pos = iterator.getPos(null); // allocate position object
     * for (;;) {
     *   pos = iterator.getPos(pos); // reuse position object
     *   int c = iterator.next(...);
@ -213,15 +223,13 @@ public class RuleCharacterIterator {
     * @return a position object which may be passed to setPos(),
     * either `p,' or if `p' == null, a newly-allocated object
     */
-    public Object getPos(Object p) {
+    public Position getPos(Position p) {
        if (p == null) {
-            return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
+            p = new Position();
        }
-        Object[] a = (Object[]) p;
-        a[0] = buf;
-        int[] v = (int[]) a[1];
-        v[0] = pos.getIndex();
-        v[1] = bufPos;
+        p.buf = buf;
+        p.bufPos = bufPos;
+        p.posIndex = pos.getIndex();
        return p;
    }

@ -230,12 +238,10 @@ public class RuleCharacterIterator {
     * returned the given object.
     * @param p a position object previously returned by getPos()
     */
-    public void setPos(Object p) {
-        Object[] a = (Object[]) p;
-        buf = (char[]) a[0];
-        int[] v = (int[]) a[1];
-        pos.setIndex(v[0]);
-        bufPos = v[1];
+    public void setPos(Position p) {
+        buf = p.buf;
+        pos.setIndex(p.posIndex);
+        bufPos = p.bufPos;
    }

    /**
@ -260,25 +266,35 @@ public class RuleCharacterIterator {
     * Returns a string containing the remainder of the characters to be
     * returned by this iterator, without any option processing.  If the
     * iterator is currently within a variable expansion, this will only
-     * extend to the end of the variable expansion.  This method is provided
-     * so that iterators may interoperate with string-based APIs.  The typical
-     * sequence of calls is to call skipIgnored(), then call lookahead(), then
-     * parse the string returned by lookahead(), then call jumpahead() to
+     * extend to the end of the variable expansion.
+     * Unlike the former lookahead(), this returns the whole current buffer; the remaining
+     * characters start at index getCurrentBufferPos(). Together these methods let iterators
+     * interoperate with string-based APIs. The typical sequence of calls is to call
+     * skipIgnored(), then these two methods, parse from that index, then call jumpahead() to
     * resynchronize the iterator.
     * @return a string containing the characters to be returned by future
     * calls to next()
     */
-    public String lookahead() {
+    public String getCurrentBuffer() {
        if (buf != null) {
-            return new String(buf, bufPos, buf.length - bufPos);
+            return buf;
        } else {
-            return text.substring(pos.getIndex());
+            return text;
+        }
+    }
+
+    public int getCurrentBufferPos() {
+        if (buf != null) {
+            return bufPos;
+        } else {
+            return pos.getIndex();
        }
    }

    /**
     * Advances the position by the given number of 16-bit code units.
-     * This is useful in conjunction with the lookahead() method.
+     * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
+     * (formerly lookahead()).
     * @param count the number of 16-bit code units to jump over
     */
    public void jumpahead(int count) {
@ -287,10 +303,10 @@ public class RuleCharacterIterator {
        }
        if (buf != null) {
            bufPos += count;
-            if (bufPos > buf.length) {
+            if (bufPos > buf.length()) {
                throw new IllegalArgumentException();
            }
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                buf = null;
            }
        } else {
@ -321,7 +337,7 @@ public class RuleCharacterIterator {
     */
    private int _current() {
        if (buf != null) {
-            return UTF16.charAt(buf, 0, buf.length, bufPos);
+            return UTF16.charAt(buf, bufPos);
        } else {
            int i = pos.getIndex();
            return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
@ -335,7 +351,7 @@ public class RuleCharacterIterator {
    private void _advance(int count) {
        if (buf != null) {
            bufPos += count;
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                buf = null;
            }
        } else {
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@ -778,14 +778,16 @@ public final class Utility {
    };

    /**
-     * Convert an escape to a 32-bit code point value.  We attempt
+     * Converts an escape to a code point value. We attempt
     * to parallel the icu4c unescapeAt() function.
-     * @param offset16 an array containing offset to the character
-     * <em>after</em> the backslash.  Upon return offset16[0] will
-     * be updated to point after the escape sequence.
-     * @return character value from 0 to 10FFFF, or -1 on error.
+     * This function returns an integer with
+     * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
+     * offset+length is the index after the escape sequence.
+     *
+     * @param offset the offset to the character <em>after</em> the backslash.
+     * @return the code point and length, or -1 on error.
     */
-    public static int unescapeAt(String s, int[] offset16) {
+    public static int unescapeAndLengthAt(CharSequence s, int offset) {
        int c;
        int result = 0;
        int n = 0;
@ -797,11 +799,11 @@ public final class Utility {
        boolean braces = false;

        /* Check that offset is in range */
-        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }
+        int start = offset;

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
@ -867,24 +869,24 @@ public final class Utility {
                int ahead = offset+1;
                c = s.charAt(offset); // [sic] get 16-bit code unit
                if (c == '\\' && ahead < length) {
-                    int o[] = new int[] { ahead };
-                    c = unescapeAt(s, o);
-                    ahead = o[0];
+                    int cpAndLength = unescapeAndLengthAt(s, ahead);
+                    if (cpAndLength >= 0) {
+                        c = cpAndLength >> 8;
+                        ahead += cpAndLength & 0xff;
+                    }
                }
                if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
                    offset = ahead;
                    result = Character.toCodePoint((char) result, (char) c);
                }
            }
-            offset16[0] = offset;
-            return result;
+            return codePointAndLength(result, start, offset);
        }

        /* Convert C-style escapes in table */
        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
-                offset16[0] = offset;
-                return UNESCAPE_MAP[i+1];
+                return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
            } else if (c < UNESCAPE_MAP[i]) {
                break;
            }
@ -893,64 +895,94 @@ public final class Utility {
        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
-            offset16[0] = offset + UTF16.getCharCount(c);
-            return 0x1F & c;
+            return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
-        offset16[0] = offset;
-        return c;
+        return codePointAndLength(c, start, offset);
+    }
+
+    private static int codePointAndLength(int c, int length) {
+        assert 0 <= c && c <= 0x10ffff;
+        assert 0 <= length && length <= 0xff;  // the length must fit into bits 7..0
+        return c << 8 | length;  // code point in the bits above 7, length in bits 7..0
+    }
+
+    private static int codePointAndLength(int c, int start, int limit) {
+        return codePointAndLength(c, limit - start);  // packs the distance start..limit as the length
+    }
+
+    public static int cpFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;  // a negative value is an error result, not a packed pair
+        return cpAndLength >> 8;  // drop the 8 length bits, leaving the code point
+    }
+
+    public static int lengthFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;  // a negative value is an error result, not a packed pair
+        return cpAndLength & 0xff;  // keep only the 8 length bits
    }

    /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
     * @exception IllegalArgumentException if an invalid escape is
     * seen.
     */
-    public static String unescape(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescape(CharSequence s) {
+        StringBuilder buf = null;
        for (int i=0; i<s.length(); ) {
            char c = s.charAt(i++);
            if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
-                    throw new IllegalArgumentException("Invalid escape sequence " +
-                            s.substring(i-1, Math.min(i+8, s.length())));
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
                }
-                buf.appendCodePoint(e);
-                i = pos[0];
-            } else {
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
+                    throw new IllegalArgumentException("Invalid escape sequence " +
+                            s.subSequence(i-1, Math.min(i+9, s.length())));
+                }
+                buf.appendCodePoint(cpAndLength >> 8);
+                i += cpAndLength & 0xff;
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                buf.append(c);
            }
        }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
        return buf.toString();
    }

    /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
     * Leave invalid escape sequences unchanged.
     */
-    public static String unescapeLeniently(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescapeLeniently(CharSequence s) {
+        StringBuilder buf = null;
        for (int i=0; i<s.length(); ) {
            char c = s.charAt(i++);
            if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+                }
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
                    buf.append(c);
                } else {
-                    buf.appendCodePoint(e);
-                    i = pos[0];
+                    buf.appendCodePoint(cpAndLength >> 8);
+                    i += cpAndLength & 0xff;
                }
-            } else {
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                buf.append(c);
            }
        }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
        return buf.toString();
    }

--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java
@ -87,7 +87,7 @@ public class TokenIterator {
    public int getLineNumber() {
        return reader.getLineNumber();
    }
-    
+
    /**
     * Return a string description of the position of the last line
     * returned by readLine() or readLineSkippingComments().
@ -95,7 +95,7 @@ public class TokenIterator {
    public String describePosition() {
        return reader.describePosition() + ':' + (lastpos+1);
    }
-    
+
    /**
     * Read the next token from 'this.line' and append it to
     * 'this.buf'.  Tokens are separated by Pattern_White_Space.  Tokens
@ -127,22 +127,17 @@ public class TokenIterator {
            buf.append(c);
            break;
        }
-        int[] posref = null;
        while (position < line.length()) {
            c = line.charAt(position); // 16-bit ok
            if (c == '\\') {
-                if (posref == null) {
-                    posref = new int[1];
-                }
-                posref[0] = position+1;
-                int c32 = Utility.unescapeAt(line, posref);
-                if (c32 < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
+                if (cpAndLength < 0) {
                    throw new RuntimeException("Invalid escape at " +
                                               reader.describePosition() + ':' +
                                               position);
                }
-                UTF16.append(buf, c32);
-                position = posref[0];
+                UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
+                position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
            } else if ((quote != 0 && c == quote) ||
                       (quote == 0 && PatternProps.isWhiteSpace(c))) {
                return ++position;
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@ -823,19 +823,18 @@ class RBBIRuleScanner {

            //
            //  check for backslash escaped characters.
-            //  Use String.unescapeAt() to handle them.
            //
            if (c.fChar == '\\') {
                c.fEscaped = true;
-                int[] unescapeIndex = new int[1];
-                unescapeIndex[0] = fNextIndex;
-                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
-                if (unescapeIndex[0] == fNextIndex) {
+                int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
+                if (cpAndLength < 0) {
                    error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
                }
+                c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
+                int length = Utility.lengthFromCodePointAndLength(cpAndLength);

-                fCharNum += unescapeIndex[0] - fNextIndex;
-                fNextIndex = unescapeIndex[0];
+                fCharNum += length;
+                fNextIndex += length;
            }
        }
        // putc(c.fChar, stdout);
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
        StringBuilder patBuf = new StringBuilder(), buf = null;
        boolean usePat = false;
        UnicodeSet scratch = null;
-        Object backup = null;
+        RuleCharacterIterator.Position backup = null;

        // mode: 0=before [, 1=between [...], 2=after ]
        // lastItem: 0=none, 1=char, 2=set
@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
            int iterOpts) {
        boolean result = false;
        iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
-        Object pos = chars.getPos(null);
+        RuleCharacterIterator.Position pos = chars.getPos(null);
        int c = chars.next(iterOpts);
        if (c == '[' || c == '\\') {
            int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
     */
    private void applyPropertyPattern(RuleCharacterIterator chars,
            Appendable rebuiltPat, SymbolTable symbols) {
-        String patStr = chars.lookahead();
-        ParsePosition pos = new ParsePosition(0);
+        String patStr = chars.getCurrentBuffer();
+        int start = chars.getCurrentBufferPos();
+        ParsePosition pos = new ParsePosition(start);
        applyPropertyPattern(patStr, pos, symbols);
-        if (pos.getIndex() == 0) {
+        int length = pos.getIndex() - start;
+        if (length == 0) {
            syntaxError(chars, "Invalid property pattern");
        }
-        chars.jumpahead(pos.getIndex());
-        append(rebuiltPat, patStr.substring(0, pos.getIndex()));
+        chars.jumpahead(length);
+        append(rebuiltPat, patStr.substring(start, pos.getIndex()));
    }

    //----------------------------------------------------------------
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java
@ -493,12 +493,12 @@ class TransliteratorParser {
                    if (pos == limit) {
                        syntaxError("Trailing backslash", rule, start);
                    }
-                    iref[0] = pos;
-                    int escaped = Utility.unescapeAt(rule, iref);
-                    pos = iref[0];
-                    if (escaped == -1) {
+                    int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
+                    if (cpAndLength < 0) {
                        syntaxError("Malformed escape", rule, start);
                    }
+                    int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
+                    pos += Utility.lengthFromCodePointAndLength(cpAndLength);
                    parser.checkVariableRange(escaped, rule, start);
                    UTF16.append(buf, escaped);
                    continue;
@ -902,16 +902,16 @@ class TransliteratorParser {
        boolean parsingIDs = true;
        int ruleCount = 0;

-        dataVector = new ArrayList<Data>();
-        idBlockVector = new ArrayList<String>();
+        dataVector = new ArrayList<>();
+        idBlockVector = new ArrayList<>();
        curData = null;
        direction = dir;
        compoundFilter = null;
-        variablesVector = new ArrayList<Object>();
-        variableNames = new HashMap<String, char[]>();
+        variablesVector = new ArrayList<>();
+        variableNames = new HashMap<>();
        parseData = new ParseData();

-        List<RuntimeException> errors = new ArrayList<RuntimeException>();
+        List<RuntimeException> errors = new ArrayList<>();
        int errorCount = 0;

        ruleArray.reset();
@ -1079,7 +1079,7 @@ class TransliteratorParser {
            Data data = dataVector.get(i);
            data.variables = new Object[variablesVector.size()];
            variablesVector.toArray(data.variables);
-            data.variableNames = new HashMap<String, char[]>();
+            data.variableNames = new HashMap<>();
            data.variableNames.putAll(variableNames);
        }
        variablesVector = null;
@ -1143,7 +1143,7 @@ class TransliteratorParser {

        // Set up segments data
        segmentStandins = new StringBuffer();
-        segmentObjects = new ArrayList<StringMatcher>();
+        segmentObjects = new ArrayList<>();

        RuleHalf left  = new RuleHalf();
        RuleHalf right = new RuleHalf();
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@ -344,14 +344,12 @@ public void TestExtended() {
                }

                // Let unescape handle the back slash.
-                int  charIdxAr[] = new int[1];
-                charIdxAr[0] = charIdx;
-                cp = Utility.unescapeAt(testString, charIdxAr);
-                if (cp != -1) {
+                int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
+                if (cpAndLength >= 0) {
                    // Escape sequence was recognized.  Insert the char
                    //   into the test data.
-                    charIdx = charIdxAr[0];
-                    tp.dataToBreak.appendCodePoint(cp);
+                    charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
+                    tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
                    for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
                        tp.srcLine[i] = lineNum;
                        tp.srcCol[i]  = column;
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk {

        // Regression test for ICU-21645
        String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
-        int[] offset16 = new int[] { 1 };  // after the backslash
        // This returned U+B2F00 for the first _two_ escapes.
-        int c = Utility.unescapeAt(s, offset16);
-        assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
-        assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+        int cpAndLength = Utility.unescapeAndLengthAt(s, 1);  // index 1 = after the backslash
+        assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
        String pattern = "[" + s + "]";
        // This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
        // and saw an invalid range of B2F00..1DF1E (start >= end).