ICU-21645 reduce heap allocations in unescape() and other parsing

This commit is contained in:
Markus Scherer 2021-06-19 20:25:33 +00:00
parent d9aa1ecab0
commit ca3e86d119
8 changed files with 162 additions and 122 deletions

View file

@ -52,7 +52,7 @@ public class RuleCharacterIterator {
/**
* Current variable expansion, or null if none.
*/
private char[] buf;
private String buf;
/**
* Position within buf[]. Meaningless if buf == null.
@ -79,7 +79,7 @@ public class RuleCharacterIterator {
/**
* Bitmask option to enable parsing of escape sequences. If (options &
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
* to its value. Escapes are parsed using Utility.unescapeAt().
* to its value. Escapes are parsed using Utility.unescapeAndLengthAt().
*/
public static final int PARSE_ESCAPES = 2;
@ -90,6 +90,13 @@ public class RuleCharacterIterator {
*/
public static final int SKIP_WHITESPACE = 4;
/** For use with {@link #getPos(Position)} & {@link #setPos(Position)}. */
public static final class Position {
private String buf;
private int bufPos;
private int posIndex;
};
/**
* Constructs an iterator over the given text, starting at the given
* position.
@ -144,15 +151,17 @@ public class RuleCharacterIterator {
break;
}
bufPos = 0;
buf = sym.lookup(name);
if (buf == null) {
char[] chars = sym.lookup(name);
if (chars == null) {
buf = null;
throw new IllegalArgumentException(
"Undefined variable: " + name);
}
// Handle empty variable value
if (buf.length == 0) {
if (chars.length == 0) {
buf = null;
}
buf = new String(chars);
continue;
}
@ -162,13 +171,14 @@ public class RuleCharacterIterator {
}
if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
int offset[] = new int[] { 0 };
c = Utility.unescapeAt(lookahead(), offset);
jumpahead(offset[0]);
isEscaped = true;
if (c < 0) {
int cpAndLength = Utility.unescapeAndLengthAt(
getCurrentBuffer(), getCurrentBufferPos());
if (cpAndLength < 0) {
throw new IllegalArgumentException("Invalid escape");
}
c = Utility.cpFromCodePointAndLength(cpAndLength);
jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
isEscaped = true;
}
break;
@ -199,7 +209,7 @@ public class RuleCharacterIterator {
* restore this iterator's position. Usage idiom:
*
* RuleCharacterIterator iterator = ...;
* Object pos = iterator.getPos(null); // allocate position object
* Position pos = iterator.getPos(null); // allocate position object
* for (;;) {
* pos = iterator.getPos(pos); // reuse position object
* int c = iterator.next(...);
@ -213,15 +223,13 @@ public class RuleCharacterIterator {
* @return a position object which may be passed to setPos(),
* either `p,' or if `p' == null, a newly-allocated object
*/
public Object getPos(Object p) {
public Position getPos(Position p) {
if (p == null) {
return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
p = new Position();
}
Object[] a = (Object[]) p;
a[0] = buf;
int[] v = (int[]) a[1];
v[0] = pos.getIndex();
v[1] = bufPos;
p.buf = buf;
p.bufPos = bufPos;
p.posIndex = pos.getIndex();
return p;
}
@ -230,12 +238,10 @@ public class RuleCharacterIterator {
* returned the given object.
* @param p a position object previously returned by getPos()
*/
public void setPos(Object p) {
Object[] a = (Object[]) p;
buf = (char[]) a[0];
int[] v = (int[]) a[1];
pos.setIndex(v[0]);
bufPos = v[1];
public void setPos(Position p) {
buf = p.buf;
pos.setIndex(p.posIndex);
bufPos = p.bufPos;
}
/**
@ -260,25 +266,35 @@ public class RuleCharacterIterator {
* Returns a string containing the remainder of the characters to be
* returned by this iterator, without any option processing. If the
* iterator is currently within a variable expansion, this will only
* extend to the end of the variable expansion. This method is provided
* so that iterators may interoperate with string-based APIs. The typical
* sequence of calls is to call skipIgnored(), then call lookahead(), then
* parse the string returned by lookahead(), then call jumpahead() to
* extend to the end of the variable expansion.
* This method, together with getCurrentBufferPos() (which replace the former lookahead()),
* is provided so that iterators may interoperate with string-based APIs. The typical
* sequence of calls is to call skipIgnored(), then call these methods, then
* parse that substring, then call jumpahead() to
* resynchronize the iterator.
* @return a string containing the characters to be returned by future
* calls to next()
*/
public String lookahead() {
public String getCurrentBuffer() {
if (buf != null) {
return new String(buf, bufPos, buf.length - bufPos);
return buf;
} else {
return text.substring(pos.getIndex());
return text;
}
}
public int getCurrentBufferPos() {
if (buf != null) {
return bufPos;
} else {
return pos.getIndex();
}
}
/**
* Advances the position by the given number of 16-bit code units.
* This is useful in conjunction with the lookahead() method.
* This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
* (formerly lookahead()).
* @param count the number of 16-bit code units to jump over
*/
public void jumpahead(int count) {
@ -287,10 +303,10 @@ public class RuleCharacterIterator {
}
if (buf != null) {
bufPos += count;
if (bufPos > buf.length) {
if (bufPos > buf.length()) {
throw new IllegalArgumentException();
}
if (bufPos == buf.length) {
if (bufPos == buf.length()) {
buf = null;
}
} else {
@ -321,7 +337,7 @@ public class RuleCharacterIterator {
*/
private int _current() {
if (buf != null) {
return UTF16.charAt(buf, 0, buf.length, bufPos);
return UTF16.charAt(buf, bufPos);
} else {
int i = pos.getIndex();
return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
@ -335,7 +351,7 @@ public class RuleCharacterIterator {
private void _advance(int count) {
if (buf != null) {
bufPos += count;
if (bufPos == buf.length) {
if (bufPos == buf.length()) {
buf = null;
}
} else {

View file

@ -778,14 +778,16 @@ public final class Utility {
};
/**
* Convert an escape to a 32-bit code point value. We attempt
* Converts an escape to a code point value. We attempt
* to parallel the icu4c unescapeAt() function.
* @param offset16 an array containing offset to the character
* <em>after</em> the backslash. Upon return offset16[0] will
* be updated to point after the escape sequence.
* @return character value from 0 to 10FFFF, or -1 on error.
* This function returns an integer with
* both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
* offset+length is the index after the escape sequence.
*
* @param offset the offset to the character <em>after</em> the backslash.
* @return the code point and length, or -1 on error.
*/
public static int unescapeAt(String s, int[] offset16) {
public static int unescapeAndLengthAt(CharSequence s, int offset) {
int c;
int result = 0;
int n = 0;
@ -797,11 +799,11 @@ public final class Utility {
boolean braces = false;
/* Check that offset is in range */
int offset = offset16[0];
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
int start = offset;
/* Fetch first UChar after '\\' */
c = Character.codePointAt(s, offset);
@ -867,24 +869,24 @@ public final class Utility {
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
if (c == '\\' && ahead < length) {
int o[] = new int[] { ahead };
c = unescapeAt(s, o);
ahead = o[0];
int cpAndLength = unescapeAndLengthAt(s, ahead);
if (cpAndLength >= 0) {
c = cpAndLength >> 8;
ahead += cpAndLength & 0xff;
}
}
if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
offset = ahead;
result = Character.toCodePoint((char) result, (char) c);
}
}
offset16[0] = offset;
return result;
return codePointAndLength(result, start, offset);
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
offset16[0] = offset;
return UNESCAPE_MAP[i+1];
return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
} else if (c < UNESCAPE_MAP[i]) {
break;
}
@ -893,64 +895,94 @@ public final class Utility {
/* Map \cX to control-X: X & 0x1F */
if (c == 'c' && offset < length) {
c = UTF16.charAt(s, offset);
offset16[0] = offset + UTF16.getCharCount(c);
return 0x1F & c;
return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
offset16[0] = offset;
return c;
return codePointAndLength(c, start, offset);
}
private static int codePointAndLength(int c, int length) {
assert 0 <= c && c <= 0x10ffff;
assert 0 <= length && length <= 0xff;
return c << 8 | length;
}
private static int codePointAndLength(int c, int start, int limit) {
return codePointAndLength(c, limit - start);
}
public static int cpFromCodePointAndLength(int cpAndLength) {
assert cpAndLength >= 0;
return cpAndLength >> 8;
}
public static int lengthFromCodePointAndLength(int cpAndLength) {
assert cpAndLength >= 0;
return cpAndLength & 0xff;
}
/**
* Convert all escapes in a given string using unescapeAt().
* Convert all escapes in a given string using unescapeAndLengthAt().
* @exception IllegalArgumentException if an invalid escape is
* seen.
*/
public static String unescape(String s) {
StringBuilder buf = new StringBuilder();
int[] pos = new int[1];
public static String unescape(CharSequence s) {
StringBuilder buf = null;
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
pos[0] = i;
int e = unescapeAt(s, pos);
if (e < 0) {
throw new IllegalArgumentException("Invalid escape sequence " +
s.substring(i-1, Math.min(i+8, s.length())));
if (buf == null) {
buf = new StringBuilder(s.length()).append(s, 0, i - 1);
}
buf.appendCodePoint(e);
i = pos[0];
} else {
int cpAndLength = unescapeAndLengthAt(s, i);
if (cpAndLength < 0) {
throw new IllegalArgumentException("Invalid escape sequence " +
s.subSequence(i-1, Math.min(i+9, s.length())));
}
buf.appendCodePoint(cpAndLength >> 8);
i += cpAndLength & 0xff;
} else if (buf != null) {
// We could optimize this further by appending whole substrings between escapes.
buf.append(c);
}
}
if (buf == null) {
// No escapes in s.
return s.toString();
}
return buf.toString();
}
/**
* Convert all escapes in a given string using unescapeAt().
* Convert all escapes in a given string using unescapeAndLengthAt().
* Leave invalid escape sequences unchanged.
*/
public static String unescapeLeniently(String s) {
StringBuilder buf = new StringBuilder();
int[] pos = new int[1];
public static String unescapeLeniently(CharSequence s) {
StringBuilder buf = null;
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
pos[0] = i;
int e = unescapeAt(s, pos);
if (e < 0) {
if (buf == null) {
buf = new StringBuilder(s.length()).append(s, 0, i - 1);
}
int cpAndLength = unescapeAndLengthAt(s, i);
if (cpAndLength < 0) {
buf.append(c);
} else {
buf.appendCodePoint(e);
i = pos[0];
buf.appendCodePoint(cpAndLength >> 8);
i += cpAndLength & 0xff;
}
} else {
} else if (buf != null) {
// We could optimize this further by appending whole substrings between escapes.
buf.append(c);
}
}
if (buf == null) {
// No escapes in s.
return s.toString();
}
return buf.toString();
}

View file

@ -87,7 +87,7 @@ public class TokenIterator {
public int getLineNumber() {
return reader.getLineNumber();
}
/**
* Return a string description of the position of the last line
* returned by readLine() or readLineSkippingComments().
@ -95,7 +95,7 @@ public class TokenIterator {
public String describePosition() {
return reader.describePosition() + ':' + (lastpos+1);
}
/**
* Read the next token from 'this.line' and append it to
* 'this.buf'. Tokens are separated by Pattern_White_Space. Tokens
@ -127,22 +127,17 @@ public class TokenIterator {
buf.append(c);
break;
}
int[] posref = null;
while (position < line.length()) {
c = line.charAt(position); // 16-bit ok
if (c == '\\') {
if (posref == null) {
posref = new int[1];
}
posref[0] = position+1;
int c32 = Utility.unescapeAt(line, posref);
if (c32 < 0) {
int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
if (cpAndLength < 0) {
throw new RuntimeException("Invalid escape at " +
reader.describePosition() + ':' +
position);
}
UTF16.append(buf, c32);
position = posref[0];
UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
} else if ((quote != 0 && c == quote) ||
(quote == 0 && PatternProps.isWhiteSpace(c))) {
return ++position;

View file

@ -823,19 +823,18 @@ class RBBIRuleScanner {
//
// check for backslash escaped characters.
// Use String.unescapeAt() to handle them.
//
if (c.fChar == '\\') {
c.fEscaped = true;
int[] unescapeIndex = new int[1];
unescapeIndex[0] = fNextIndex;
c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
if (unescapeIndex[0] == fNextIndex) {
int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
if (cpAndLength < 0) {
error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
}
c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
int length = Utility.lengthFromCodePointAndLength(cpAndLength);
fCharNum += unescapeIndex[0] - fNextIndex;
fNextIndex = unescapeIndex[0];
fCharNum += length;
fNextIndex += length;
}
}
// putc(c.fChar, stdout);

View file

@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
StringBuilder patBuf = new StringBuilder(), buf = null;
boolean usePat = false;
UnicodeSet scratch = null;
Object backup = null;
RuleCharacterIterator.Position backup = null;
// mode: 0=before [, 1=between [...], 2=after ]
// lastItem: 0=none, 1=char, 2=set
@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
int iterOpts) {
boolean result = false;
iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
Object pos = chars.getPos(null);
RuleCharacterIterator.Position pos = chars.getPos(null);
int c = chars.next(iterOpts);
if (c == '[' || c == '\\') {
int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*/
private void applyPropertyPattern(RuleCharacterIterator chars,
Appendable rebuiltPat, SymbolTable symbols) {
String patStr = chars.lookahead();
ParsePosition pos = new ParsePosition(0);
String patStr = chars.getCurrentBuffer();
int start = chars.getCurrentBufferPos();
ParsePosition pos = new ParsePosition(start);
applyPropertyPattern(patStr, pos, symbols);
if (pos.getIndex() == 0) {
int length = pos.getIndex() - start;
if (length == 0) {
syntaxError(chars, "Invalid property pattern");
}
chars.jumpahead(pos.getIndex());
append(rebuiltPat, patStr.substring(0, pos.getIndex()));
chars.jumpahead(length);
append(rebuiltPat, patStr.substring(start, pos.getIndex()));
}
//----------------------------------------------------------------

View file

@ -493,12 +493,12 @@ class TransliteratorParser {
if (pos == limit) {
syntaxError("Trailing backslash", rule, start);
}
iref[0] = pos;
int escaped = Utility.unescapeAt(rule, iref);
pos = iref[0];
if (escaped == -1) {
int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
if (cpAndLength < 0) {
syntaxError("Malformed escape", rule, start);
}
int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
pos += Utility.lengthFromCodePointAndLength(cpAndLength);
parser.checkVariableRange(escaped, rule, start);
UTF16.append(buf, escaped);
continue;
@ -902,16 +902,16 @@ class TransliteratorParser {
boolean parsingIDs = true;
int ruleCount = 0;
dataVector = new ArrayList<Data>();
idBlockVector = new ArrayList<String>();
dataVector = new ArrayList<>();
idBlockVector = new ArrayList<>();
curData = null;
direction = dir;
compoundFilter = null;
variablesVector = new ArrayList<Object>();
variableNames = new HashMap<String, char[]>();
variablesVector = new ArrayList<>();
variableNames = new HashMap<>();
parseData = new ParseData();
List<RuntimeException> errors = new ArrayList<RuntimeException>();
List<RuntimeException> errors = new ArrayList<>();
int errorCount = 0;
ruleArray.reset();
@ -1079,7 +1079,7 @@ class TransliteratorParser {
Data data = dataVector.get(i);
data.variables = new Object[variablesVector.size()];
variablesVector.toArray(data.variables);
data.variableNames = new HashMap<String, char[]>();
data.variableNames = new HashMap<>();
data.variableNames.putAll(variableNames);
}
variablesVector = null;
@ -1143,7 +1143,7 @@ class TransliteratorParser {
// Set up segments data
segmentStandins = new StringBuffer();
segmentObjects = new ArrayList<StringMatcher>();
segmentObjects = new ArrayList<>();
RuleHalf left = new RuleHalf();
RuleHalf right = new RuleHalf();

View file

@ -344,14 +344,12 @@ public void TestExtended() {
}
// Let unescape handle the back slash.
int charIdxAr[] = new int[1];
charIdxAr[0] = charIdx;
cp = Utility.unescapeAt(testString, charIdxAr);
if (cp != -1) {
int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
if (cpAndLength >= 0) {
// Escape sequence was recognized. Insert the char
// into the test data.
charIdx = charIdxAr[0];
tp.dataToBreak.appendCodePoint(cp);
charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;

View file

@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk {
// Regression test for ICU-21645
String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
int[] offset16 = new int[] { 1 }; // after the backslash
// This returned U+B2F00 for the first _two_ escapes.
int c = Utility.unescapeAt(s, offset16);
assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
int cpAndLength = Utility.unescapeAndLengthAt(s, 1); // index 1 = after the backslash
assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
String pattern = "[" + s + "]";
// This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
// and saw an invalid range of B2F00..1DF1E (start >= end).