diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java index 5e981655015..49c74f4d95b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java @@ -52,7 +52,7 @@ public class RuleCharacterIterator { /** * Current variable expansion, or null if none. */ - private char[] buf; + private String buf; /** * Position within buf[]. Meaningless if buf == null. @@ -79,7 +79,7 @@ public class RuleCharacterIterator { /** * Bitmask option to enable parsing of escape sequences. If (options & * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded - * to its value. Escapes are parsed using Utility.unescapeAt(). + * to its value. Escapes are parsed using Utility.unescapeAndLengthAt(). */ public static final int PARSE_ESCAPES = 2; @@ -90,6 +90,13 @@ public class RuleCharacterIterator { */ public static final int SKIP_WHITESPACE = 4; + /** For use with {@link #getPos(Position)} & {@link #setPos(Position)}. */ + public static final class Position { + private String buf; + private int bufPos; + private int posIndex; + }; + /** * Constructs an iterator over the given text, starting at the given * position. @@ -144,15 +151,17 @@ public class RuleCharacterIterator { break; } bufPos = 0; - buf = sym.lookup(name); - if (buf == null) { + char[] chars = sym.lookup(name); + if (chars == null) { + buf = null; throw new IllegalArgumentException( "Undefined variable: " + name); } // Handle empty variable value - if (buf.length == 0) { + if (chars.length == 0) { buf = null; } + buf = new String(chars); continue; } @@ -162,13 +171,14 @@ public class RuleCharacterIterator { } if (c == '\\' && (options & PARSE_ESCAPES) != 0) { - int offset[] = new int[] { 0 }; - c = Utility.unescapeAt(lookahead(), offset); - jumpahead(offset[0]); - isEscaped = true; - if (c < 0) { + int cpAndLength = Utility.unescapeAndLengthAt( + getCurrentBuffer(), getCurrentBufferPos()); + if (cpAndLength < 0) { throw new IllegalArgumentException("Invalid escape"); } + c = Utility.cpFromCodePointAndLength(cpAndLength); + jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength)); + isEscaped = true; } break; @@ -199,7 +209,7 @@ public class RuleCharacterIterator { * restore this iterator's position. Usage idiom: * * RuleCharacterIterator iterator = ...; - * Object pos = iterator.getPos(null); // allocate position object + * Position pos = iterator.getPos(null); // allocate position object * for (;;) { * pos = iterator.getPos(pos); // reuse position object * int c = iterator.next(...); @@ -213,15 +223,13 @@ public class RuleCharacterIterator { * @return a position object which may be passed to setPos(), * either `p,' or if `p' == null, a newly-allocated object */ - public Object getPos(Object p) { + public Position getPos(Position p) { if (p == null) { - return new Object[] {buf, new int[] {pos.getIndex(), bufPos}}; + p = new Position(); } - Object[] a = (Object[]) p; - a[0] = buf; - int[] v = (int[]) a[1]; - v[0] = pos.getIndex(); - v[1] = bufPos; + p.buf = buf; + p.bufPos = bufPos; + p.posIndex = pos.getIndex(); return p; } @@ -230,12 +238,10 @@ public class RuleCharacterIterator { * returned the given object. * @param p a position object previously returned by getPos() */ - public void setPos(Object p) { - Object[] a = (Object[]) p; - buf = (char[]) a[0]; - int[] v = (int[]) a[1]; - pos.setIndex(v[0]); - bufPos = v[1]; + public void setPos(Position p) { + buf = p.buf; + pos.setIndex(p.posIndex); + bufPos = p.bufPos; } /** @@ -260,25 +266,35 @@ public class RuleCharacterIterator { * Returns a string containing the remainder of the characters to be * returned by this iterator, without any option processing. If the * iterator is currently within a variable expansion, this will only - * extend to the end of the variable expansion. This method is provided - * so that iterators may interoperate with string-based APIs. The typical - * sequence of calls is to call skipIgnored(), then call lookahead(), then - * parse the string returned by lookahead(), then call jumpahead() to + * extend to the end of the variable expansion. + * This method, together with getCurrentBufferPos() (which replace the former lookahead()), + * is provided so that iterators may interoperate with string-based APIs. The typical + * sequence of calls is to call skipIgnored(), then call these methods, then + * parse that substring, then call jumpahead() to * resynchronize the iterator. * @return a string containing the characters to be returned by future * calls to next() */ - public String lookahead() { + public String getCurrentBuffer() { if (buf != null) { - return new String(buf, bufPos, buf.length - bufPos); + return buf; } else { - return text.substring(pos.getIndex()); + return text; + } + } + + public int getCurrentBufferPos() { + if (buf != null) { + return bufPos; + } else { + return pos.getIndex(); } } /** * Advances the position by the given number of 16-bit code units. - * This is useful in conjunction with the lookahead() method. + * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos() + * (formerly lookahead()). * @param count the number of 16-bit code units to jump over */ public void jumpahead(int count) { @@ -287,10 +303,10 @@ public class RuleCharacterIterator { } if (buf != null) { bufPos += count; - if (bufPos > buf.length) { + if (bufPos > buf.length()) { throw new IllegalArgumentException(); } - if (bufPos == buf.length) { + if (bufPos == buf.length()) { buf = null; } } else { @@ -321,7 +337,7 @@ public class RuleCharacterIterator { */ private int _current() { if (buf != null) { - return UTF16.charAt(buf, 0, buf.length, bufPos); + return UTF16.charAt(buf, bufPos); } else { int i = pos.getIndex(); return (i < text.length()) ? UTF16.charAt(text, i) : DONE; @@ -335,7 +351,7 @@ public class RuleCharacterIterator { private void _advance(int count) { if (buf != null) { bufPos += count; - if (bufPos == buf.length) { + if (bufPos == buf.length()) { buf = null; } } else { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java index cd906011552..25b02e50694 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java @@ -778,14 +778,16 @@ public final class Utility { }; /** - * Convert an escape to a 32-bit code point value. We attempt + * Converts an escape to a code point value. We attempt * to parallel the icu4c unescapeAt() function. - * @param offset16 an array containing offset to the character - * after the backslash. Upon return offset16[0] will - * be updated to point after the escape sequence. - * @return character value from 0 to 10FFFF, or -1 on error. + * This function returns an integer with + * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0). + * offset+length is the index after the escape sequence. + * + * @param offset the offset to the character after the backslash. + * @return the code point and length, or -1 on error. */ - public static int unescapeAt(String s, int[] offset16) { + public static int unescapeAndLengthAt(CharSequence s, int offset) { int c; int result = 0; int n = 0; @@ -797,11 +799,11 @@ public final class Utility { boolean braces = false; /* Check that offset is in range */ - int offset = offset16[0]; int length = s.length(); if (offset < 0 || offset >= length) { return -1; } + int start = offset; /* Fetch first UChar after '\\' */ c = Character.codePointAt(s, offset); @@ -867,24 +869,24 @@ public final class Utility { int ahead = offset+1; c = s.charAt(offset); // [sic] get 16-bit code unit if (c == '\\' && ahead < length) { - int o[] = new int[] { ahead }; - c = unescapeAt(s, o); - ahead = o[0]; + int cpAndLength = unescapeAndLengthAt(s, ahead); + if (cpAndLength >= 0) { + c = cpAndLength >> 8; + ahead += cpAndLength & 0xff; + } } if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) { offset = ahead; result = Character.toCodePoint((char) result, (char) c); } } - offset16[0] = offset; - return result; + return codePointAndLength(result, start, offset); } /* Convert C-style escapes in table */ for (i=0; i= 0; + return cpAndLength >> 8; + } + + public static int lengthFromCodePointAndLength(int cpAndLength) { + assert cpAndLength >= 0; + return cpAndLength & 0xff; } /** - * Convert all escapes in a given string using unescapeAt(). + * Convert all escapes in a given string using unescapeAndLengthAt(). * @exception IllegalArgumentException if an invalid escape is * seen. */ - public static String unescape(String s) { - StringBuilder buf = new StringBuilder(); - int[] pos = new int[1]; + public static String unescape(CharSequence s) { + StringBuilder buf = null; for (int i=0; i> 8); + i += cpAndLength & 0xff; + } else if (buf != null) { + // We could optimize this further by appending whole substrings between escapes. buf.append(c); } } + if (buf == null) { + // No escapes in s. + return s.toString(); + } return buf.toString(); } /** - * Convert all escapes in a given string using unescapeAt(). + * Convert all escapes in a given string using unescapeAndLengthAt(). * Leave invalid escape sequences unchanged. */ - public static String unescapeLeniently(String s) { - StringBuilder buf = new StringBuilder(); - int[] pos = new int[1]; + public static String unescapeLeniently(CharSequence s) { + StringBuilder buf = null; for (int i=0; i> 8); + i += cpAndLength & 0xff; } - } else { + } else if (buf != null) { + // We could optimize this further by appending whole substrings between escapes. buf.append(c); } } + if (buf == null) { + // No escapes in s. + return s.toString(); + } return buf.toString(); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java index d37ea468ff5..166948b417d 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java @@ -87,7 +87,7 @@ public class TokenIterator { public int getLineNumber() { return reader.getLineNumber(); } - + /** * Return a string description of the position of the last line * returned by readLine() or readLineSkippingComments(). @@ -95,7 +95,7 @@ public class TokenIterator { public String describePosition() { return reader.describePosition() + ':' + (lastpos+1); } - + /** * Read the next token from 'this.line' and append it to * 'this.buf'. Tokens are separated by Pattern_White_Space. Tokens @@ -127,22 +127,17 @@ public class TokenIterator { buf.append(c); break; } - int[] posref = null; while (position < line.length()) { c = line.charAt(position); // 16-bit ok if (c == '\\') { - if (posref == null) { - posref = new int[1]; - } - posref[0] = position+1; - int c32 = Utility.unescapeAt(line, posref); - if (c32 < 0) { + int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1); + if (cpAndLength < 0) { throw new RuntimeException("Invalid escape at " + reader.describePosition() + ':' + position); } - UTF16.append(buf, c32); - position = posref[0]; + UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength)); + position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength); } else if ((quote != 0 && c == quote) || (quote == 0 && PatternProps.isWhiteSpace(c))) { return ++position; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index c9a8aff5a6d..9249ba86edc 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -823,19 +823,18 @@ class RBBIRuleScanner { // // check for backslash escaped characters. - // Use String.unescapeAt() to handle them. // if (c.fChar == '\\') { c.fEscaped = true; - int[] unescapeIndex = new int[1]; - unescapeIndex[0] = fNextIndex; - c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); - if (unescapeIndex[0] == fNextIndex) { + int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex); + if (cpAndLength < 0) { error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); } + c.fChar = Utility.cpFromCodePointAndLength(cpAndLength); + int length = Utility.lengthFromCodePointAndLength(cpAndLength); - fCharNum += unescapeIndex[0] - fNextIndex; - fNextIndex = unescapeIndex[0]; + fCharNum += length; + fNextIndex += length; } } // putc(c.fChar, stdout); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 04154054ff8..d41ff99ea80 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa StringBuilder patBuf = new StringBuilder(), buf = null; boolean usePat = false; UnicodeSet scratch = null; - Object backup = null; + RuleCharacterIterator.Position backup = null; // mode: 0=before [, 1=between [...], 2=after ] // lastItem: 0=none, 1=char, 2=set @@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa int iterOpts) { boolean result = false; iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; - Object pos = chars.getPos(null); + RuleCharacterIterator.Position pos = chars.getPos(null); int c = chars.next(iterOpts); if (c == '[' || c == '\\') { int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); @@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa */ private void applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols) { - String patStr = chars.lookahead(); - ParsePosition pos = new ParsePosition(0); + String patStr = chars.getCurrentBuffer(); + int start = chars.getCurrentBufferPos(); + ParsePosition pos = new ParsePosition(start); applyPropertyPattern(patStr, pos, symbols); - if (pos.getIndex() == 0) { + int length = pos.getIndex() - start; + if (length == 0) { syntaxError(chars, "Invalid property pattern"); } - chars.jumpahead(pos.getIndex()); - append(rebuiltPat, patStr.substring(0, pos.getIndex())); + chars.jumpahead(length); + append(rebuiltPat, patStr.substring(start, pos.getIndex())); } //---------------------------------------------------------------- diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java index 9e300a729b2..f16a121e71b 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java @@ -493,12 +493,12 @@ class TransliteratorParser { if (pos == limit) { syntaxError("Trailing backslash", rule, start); } - iref[0] = pos; - int escaped = Utility.unescapeAt(rule, iref); - pos = iref[0]; - if (escaped == -1) { + int cpAndLength = Utility.unescapeAndLengthAt(rule, pos); + if (cpAndLength < 0) { syntaxError("Malformed escape", rule, start); } + int escaped = Utility.cpFromCodePointAndLength(cpAndLength); + pos += Utility.lengthFromCodePointAndLength(cpAndLength); parser.checkVariableRange(escaped, rule, start); UTF16.append(buf, escaped); continue; @@ -902,16 +902,16 @@ class TransliteratorParser { boolean parsingIDs = true; int ruleCount = 0; - dataVector = new ArrayList(); - idBlockVector = new ArrayList(); + dataVector = new ArrayList<>(); + idBlockVector = new ArrayList<>(); curData = null; direction = dir; compoundFilter = null; - variablesVector = new ArrayList(); - variableNames = new HashMap(); + variablesVector = new ArrayList<>(); + variableNames = new HashMap<>(); parseData = new ParseData(); - List errors = new ArrayList(); + List errors = new ArrayList<>(); int errorCount = 0; ruleArray.reset(); @@ -1079,7 +1079,7 @@ class TransliteratorParser { Data data = dataVector.get(i); data.variables = new Object[variablesVector.size()]; variablesVector.toArray(data.variables); - data.variableNames = new HashMap(); + data.variableNames = new HashMap<>(); data.variableNames.putAll(variableNames); } variablesVector = null; @@ -1143,7 +1143,7 @@ class TransliteratorParser { // Set up segments data segmentStandins = new StringBuffer(); - segmentObjects = new ArrayList(); + segmentObjects = new ArrayList<>(); RuleHalf left = new RuleHalf(); RuleHalf right = new RuleHalf(); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index 76c76b02680..4bea6cc0ec8 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -344,14 +344,12 @@ public void TestExtended() { } // Let unescape handle the back slash. - int charIdxAr[] = new int[1]; - charIdxAr[0] = charIdx; - cp = Utility.unescapeAt(testString, charIdxAr); - if (cp != -1) { + int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx); + if (cpAndLength >= 0) { // Escape sequence was recognized. Insert the char // into the test data. - charIdx = charIdxAr[0]; - tp.dataToBreak.appendCodePoint(cp); + charIdx += Utility.lengthFromCodePointAndLength(cpAndLength); + tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength)); for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { tp.srcLine[i] = lineNum; tp.srcCol[i] = column; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java index a7ee1d072d3..15d1e601049 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java @@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk { // Regression test for ICU-21645 String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E"; - int[] offset16 = new int[] { 1 }; // after the backslash // This returned U+B2F00 for the first _two_ escapes. - int c = Utility.unescapeAt(s, offset16); - assertEquals(s + " unescape at 1, code point", 0x1DA8B, c); - assertEquals(s + " unescape at 1, offset", 10, offset16[0]); + int cpAndLength = Utility.unescapeAndLengthAt(s, 1); // index 1 = after the backslash + assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength); String pattern = "[" + s + "]"; // This threw an IllegalArgumentException because the parser called Utility.unescapeAt() // and saw an invalid range of B2F00..1DF1E (start >= end).