mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 17:01:16 +00:00
ICU-21645 reduce heap allocations in unescape() and other parsing
This commit is contained in:
parent
d9aa1ecab0
commit
ca3e86d119
8 changed files with 162 additions and 122 deletions
|
@ -52,7 +52,7 @@ public class RuleCharacterIterator {
|
|||
/**
|
||||
* Current variable expansion, or null if none.
|
||||
*/
|
||||
private char[] buf;
|
||||
private String buf;
|
||||
|
||||
/**
|
||||
* Position within buf. Meaningless if buf == null.
|
||||
|
@ -79,7 +79,7 @@ public class RuleCharacterIterator {
|
|||
/**
|
||||
* Bitmask option to enable parsing of escape sequences. If (options &
|
||||
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
|
||||
* to its value. Escapes are parsed using Utility.unescapeAt().
|
||||
* to its value. Escapes are parsed using Utility.unescapeAndLengthAt().
|
||||
*/
|
||||
public static final int PARSE_ESCAPES = 2;
|
||||
|
||||
|
@ -90,6 +90,13 @@ public class RuleCharacterIterator {
|
|||
*/
|
||||
public static final int SKIP_WHITESPACE = 4;
|
||||
|
||||
/**
 * Saved iterator state for use with {@link #getPos(Position)} and {@link #setPos(Position)}.
 * getPos() fills in the three fields from the iterator; setPos() copies them back.
 */
|
||||
public static final class Position {
|
||||
// The iterator's variable-expansion buffer at the time of getPos(); may be null.
private String buf;
|
||||
// The iterator's position within buf at the time of getPos().
private int bufPos;
|
||||
// The index of the iterator's text position (pos.getIndex()) at the time of getPos().
private int posIndex;
|
||||
};
|
||||
|
||||
/**
|
||||
* Constructs an iterator over the given text, starting at the given
|
||||
* position.
|
||||
|
@ -144,15 +151,17 @@ public class RuleCharacterIterator {
|
|||
break;
|
||||
}
|
||||
bufPos = 0;
|
||||
buf = sym.lookup(name);
|
||||
if (buf == null) {
|
||||
char[] chars = sym.lookup(name);
|
||||
if (chars == null) {
|
||||
buf = null;
|
||||
throw new IllegalArgumentException(
|
||||
"Undefined variable: " + name);
|
||||
}
|
||||
// Handle empty variable value
|
||||
if (buf.length == 0) {
|
||||
if (chars.length == 0) {
|
||||
buf = null;
|
||||
}
|
||||
buf = new String(chars);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -162,13 +171,14 @@ public class RuleCharacterIterator {
|
|||
}
|
||||
|
||||
if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
|
||||
int offset[] = new int[] { 0 };
|
||||
c = Utility.unescapeAt(lookahead(), offset);
|
||||
jumpahead(offset[0]);
|
||||
isEscaped = true;
|
||||
if (c < 0) {
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(
|
||||
getCurrentBuffer(), getCurrentBufferPos());
|
||||
if (cpAndLength < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape");
|
||||
}
|
||||
c = Utility.cpFromCodePointAndLength(cpAndLength);
|
||||
jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
|
||||
isEscaped = true;
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -199,7 +209,7 @@ public class RuleCharacterIterator {
|
|||
* restore this iterator's position. Usage idiom:
|
||||
*
|
||||
* RuleCharacterIterator iterator = ...;
|
||||
* Object pos = iterator.getPos(null); // allocate position object
|
||||
* Position pos = iterator.getPos(null); // allocate position object
|
||||
* for (;;) {
|
||||
* pos = iterator.getPos(pos); // reuse position object
|
||||
* int c = iterator.next(...);
|
||||
|
@ -213,15 +223,13 @@ public class RuleCharacterIterator {
|
|||
* @return a position object which may be passed to setPos(),
|
||||
* either `p,' or if `p' == null, a newly-allocated object
|
||||
*/
|
||||
public Object getPos(Object p) {
|
||||
public Position getPos(Position p) {
|
||||
if (p == null) {
|
||||
return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
|
||||
p = new Position();
|
||||
}
|
||||
Object[] a = (Object[]) p;
|
||||
a[0] = buf;
|
||||
int[] v = (int[]) a[1];
|
||||
v[0] = pos.getIndex();
|
||||
v[1] = bufPos;
|
||||
p.buf = buf;
|
||||
p.bufPos = bufPos;
|
||||
p.posIndex = pos.getIndex();
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -230,12 +238,10 @@ public class RuleCharacterIterator {
|
|||
* returned the given object.
|
||||
* @param p a position object previously returned by getPos()
|
||||
*/
|
||||
public void setPos(Object p) {
|
||||
Object[] a = (Object[]) p;
|
||||
buf = (char[]) a[0];
|
||||
int[] v = (int[]) a[1];
|
||||
pos.setIndex(v[0]);
|
||||
bufPos = v[1];
|
||||
public void setPos(Position p) {
|
||||
buf = p.buf;
|
||||
pos.setIndex(p.posIndex);
|
||||
bufPos = p.bufPos;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -260,25 +266,35 @@ public class RuleCharacterIterator {
|
|||
* Returns the string currently being iterated over (the variable expansion if one
|
||||
* is active, otherwise the rule text), without any option processing. If the
|
||||
* iterator is currently within a variable expansion, this will only
|
||||
* extend to the end of the variable expansion. This method is provided
|
||||
* so that iterators may interoperate with string-based APIs. The typical
|
||||
* sequence of calls is to call skipIgnored(), then call lookahead(), then
|
||||
* parse the string returned by lookahead(), then call jumpahead() to
|
||||
* extend to the end of the variable expansion.
|
||||
* This method, together with getCurrentBufferPos() (which replace the former lookahead()),
|
||||
* is provided so that iterators may interoperate with string-based APIs. The typical
|
||||
* sequence of calls is to call skipIgnored(), then call these methods, then
|
||||
* parse that substring, then call jumpahead() to
|
||||
* resynchronize the iterator.
|
||||
* @return the string being iterated over; the characters to be returned by future
|
||||
* calls to next() start at index getCurrentBufferPos()
|
||||
*/
|
||||
public String lookahead() {
|
||||
public String getCurrentBuffer() {
|
||||
if (buf != null) {
|
||||
return new String(buf, bufPos, buf.length - bufPos);
|
||||
return buf;
|
||||
} else {
|
||||
return text.substring(pos.getIndex());
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the current index into the string returned by getCurrentBuffer().
 * While a variable expansion is active (buf != null) this is the position
 * within that expansion; otherwise it is the index of the text position.
 * @return the index of the next code unit to be read from the current buffer
 */
public int getCurrentBufferPos() {
|
||||
if (buf != null) {
|
||||
return bufPos;
|
||||
} else {
|
||||
return pos.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the position by the given number of 16-bit code units.
|
||||
* This is useful in conjunction with the lookahead() method.
|
||||
* This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
|
||||
* (formerly lookahead()).
|
||||
* @param count the number of 16-bit code units to jump over
|
||||
*/
|
||||
public void jumpahead(int count) {
|
||||
|
@ -287,10 +303,10 @@ public class RuleCharacterIterator {
|
|||
}
|
||||
if (buf != null) {
|
||||
bufPos += count;
|
||||
if (bufPos > buf.length) {
|
||||
if (bufPos > buf.length()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (bufPos == buf.length) {
|
||||
if (bufPos == buf.length()) {
|
||||
buf = null;
|
||||
}
|
||||
} else {
|
||||
|
@ -321,7 +337,7 @@ public class RuleCharacterIterator {
|
|||
*/
|
||||
private int _current() {
|
||||
if (buf != null) {
|
||||
return UTF16.charAt(buf, 0, buf.length, bufPos);
|
||||
return UTF16.charAt(buf, bufPos);
|
||||
} else {
|
||||
int i = pos.getIndex();
|
||||
return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
|
||||
|
@ -335,7 +351,7 @@ public class RuleCharacterIterator {
|
|||
private void _advance(int count) {
|
||||
if (buf != null) {
|
||||
bufPos += count;
|
||||
if (bufPos == buf.length) {
|
||||
if (bufPos == buf.length()) {
|
||||
buf = null;
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -778,14 +778,16 @@ public final class Utility {
|
|||
};
|
||||
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt
|
||||
* Converts an escape to a code point value. We attempt
|
||||
* to parallel the icu4c unescapeAt() function.
|
||||
* @param offset16 an array containing offset to the character
|
||||
* <em>after</em> the backslash. Upon return offset16[0] will
|
||||
* be updated to point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
* This function returns an integer with
|
||||
* both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
|
||||
* offset+length is the index after the escape sequence.
|
||||
*
|
||||
* @param offset the offset to the character <em>after</em> the backslash.
|
||||
* @return the code point and length, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
public static int unescapeAndLengthAt(CharSequence s, int offset) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
|
@ -797,11 +799,11 @@ public final class Utility {
|
|||
boolean braces = false;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
int start = offset;
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = Character.codePointAt(s, offset);
|
||||
|
@ -867,24 +869,24 @@ public final class Utility {
|
|||
int ahead = offset+1;
|
||||
c = s.charAt(offset); // [sic] get 16-bit code unit
|
||||
if (c == '\\' && ahead < length) {
|
||||
int o[] = new int[] { ahead };
|
||||
c = unescapeAt(s, o);
|
||||
ahead = o[0];
|
||||
int cpAndLength = unescapeAndLengthAt(s, ahead);
|
||||
if (cpAndLength >= 0) {
|
||||
c = cpAndLength >> 8;
|
||||
ahead += cpAndLength & 0xff;
|
||||
}
|
||||
}
|
||||
if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
|
||||
offset = ahead;
|
||||
result = Character.toCodePoint((char) result, (char) c);
|
||||
}
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
return codePointAndLength(result, start, offset);
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i+1];
|
||||
return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
|
@ -893,64 +895,94 @@ public final class Utility {
|
|||
/* Map \cX to control-X: X & 0x1F */
|
||||
if (c == 'c' && offset < length) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset16[0] = offset + UTF16.getCharCount(c);
|
||||
return 0x1F & c;
|
||||
return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
return codePointAndLength(c, start, offset);
|
||||
}
|
||||
|
||||
/**
 * Packs a code point and a length into a single int:
 * the code point is shifted left by 8 bits and the length occupies bits 7..0.
 * The assertions require c in 0..0x10ffff and length in 0..0xff,
 * so a valid packed value is never negative.
 * @param c the code point
 * @param length the length to store in the low 8 bits
 * @return the packed value
 */
private static int codePointAndLength(int c, int length) {
|
||||
assert 0 <= c && c <= 0x10ffff;
|
||||
assert 0 <= length && length <= 0xff;
|
||||
return c << 8 | length;
|
||||
}
|
||||
|
||||
/**
 * Packs a code point together with the length computed as limit - start.
 * Delegates to the two-argument overload.
 * @param c the code point
 * @param start the start index
 * @param limit the index after the last consumed code unit
 * @return the packed code point and length
 */
private static int codePointAndLength(int c, int start, int limit) {
|
||||
return codePointAndLength(c, limit - start);
|
||||
}
|
||||
|
||||
/**
 * Extracts the code point from a packed code-point-and-length value,
 * i.e. everything above the low 8 bits.
 * The value must be non-negative (asserted); a negative value is an error result,
 * not a packed pair.
 * @param cpAndLength a non-negative packed value
 * @return the code point
 */
public static int cpFromCodePointAndLength(int cpAndLength) {
|
||||
assert cpAndLength >= 0;
|
||||
return cpAndLength >> 8;
|
||||
}
|
||||
|
||||
/**
 * Extracts the length from a packed code-point-and-length value,
 * i.e. the low 8 bits.
 * The value must be non-negative (asserted); a negative value is an error result,
 * not a packed pair.
 * @param cpAndLength a non-negative packed value
 * @return the length, 0..0xff
 */
public static int lengthFromCodePointAndLength(int cpAndLength) {
|
||||
assert cpAndLength >= 0;
|
||||
return cpAndLength & 0xff;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all escapes in a given string using unescapeAt().
|
||||
* Convert all escapes in a given string using unescapeAndLengthAt().
|
||||
* @exception IllegalArgumentException if an invalid escape is
|
||||
* seen.
|
||||
*/
|
||||
public static String unescape(String s) {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
int[] pos = new int[1];
|
||||
public static String unescape(CharSequence s) {
|
||||
StringBuilder buf = null;
|
||||
for (int i=0; i<s.length(); ) {
|
||||
char c = s.charAt(i++);
|
||||
if (c == '\\') {
|
||||
pos[0] = i;
|
||||
int e = unescapeAt(s, pos);
|
||||
if (e < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape sequence " +
|
||||
s.substring(i-1, Math.min(i+8, s.length())));
|
||||
if (buf == null) {
|
||||
buf = new StringBuilder(s.length()).append(s, 0, i - 1);
|
||||
}
|
||||
buf.appendCodePoint(e);
|
||||
i = pos[0];
|
||||
} else {
|
||||
int cpAndLength = unescapeAndLengthAt(s, i);
|
||||
if (cpAndLength < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape sequence " +
|
||||
s.subSequence(i-1, Math.min(i+9, s.length())));
|
||||
}
|
||||
buf.appendCodePoint(cpAndLength >> 8);
|
||||
i += cpAndLength & 0xff;
|
||||
} else if (buf != null) {
|
||||
// We could optimize this further by appending whole substrings between escapes.
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
if (buf == null) {
|
||||
// No escapes in s.
|
||||
return s.toString();
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all escapes in a given string using unescapeAt().
|
||||
* Convert all escapes in a given string using unescapeAndLengthAt().
|
||||
* Leave invalid escape sequences unchanged.
|
||||
*/
|
||||
public static String unescapeLeniently(String s) {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
int[] pos = new int[1];
|
||||
public static String unescapeLeniently(CharSequence s) {
|
||||
StringBuilder buf = null;
|
||||
for (int i=0; i<s.length(); ) {
|
||||
char c = s.charAt(i++);
|
||||
if (c == '\\') {
|
||||
pos[0] = i;
|
||||
int e = unescapeAt(s, pos);
|
||||
if (e < 0) {
|
||||
if (buf == null) {
|
||||
buf = new StringBuilder(s.length()).append(s, 0, i - 1);
|
||||
}
|
||||
int cpAndLength = unescapeAndLengthAt(s, i);
|
||||
if (cpAndLength < 0) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.appendCodePoint(e);
|
||||
i = pos[0];
|
||||
buf.appendCodePoint(cpAndLength >> 8);
|
||||
i += cpAndLength & 0xff;
|
||||
}
|
||||
} else {
|
||||
} else if (buf != null) {
|
||||
// We could optimize this further by appending whole substrings between escapes.
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
if (buf == null) {
|
||||
// No escapes in s.
|
||||
return s.toString();
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ public class TokenIterator {
|
|||
public int getLineNumber() {
|
||||
return reader.getLineNumber();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a string description of the position of the last line
|
||||
* returned by readLine() or readLineSkippingComments().
|
||||
|
@ -95,7 +95,7 @@ public class TokenIterator {
|
|||
public String describePosition() {
|
||||
return reader.describePosition() + ':' + (lastpos+1);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read the next token from 'this.line' and append it to
|
||||
* 'this.buf'. Tokens are separated by Pattern_White_Space. Tokens
|
||||
|
@ -127,22 +127,17 @@ public class TokenIterator {
|
|||
buf.append(c);
|
||||
break;
|
||||
}
|
||||
int[] posref = null;
|
||||
while (position < line.length()) {
|
||||
c = line.charAt(position); // 16-bit ok
|
||||
if (c == '\\') {
|
||||
if (posref == null) {
|
||||
posref = new int[1];
|
||||
}
|
||||
posref[0] = position+1;
|
||||
int c32 = Utility.unescapeAt(line, posref);
|
||||
if (c32 < 0) {
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
|
||||
if (cpAndLength < 0) {
|
||||
throw new RuntimeException("Invalid escape at " +
|
||||
reader.describePosition() + ':' +
|
||||
position);
|
||||
}
|
||||
UTF16.append(buf, c32);
|
||||
position = posref[0];
|
||||
UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
|
||||
position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
|
||||
} else if ((quote != 0 && c == quote) ||
|
||||
(quote == 0 && PatternProps.isWhiteSpace(c))) {
|
||||
return ++position;
|
||||
|
|
|
@ -823,19 +823,18 @@ class RBBIRuleScanner {
|
|||
|
||||
//
|
||||
// check for backslash escaped characters.
|
||||
// Use Utility.unescapeAndLengthAt() to handle them.
|
||||
//
|
||||
if (c.fChar == '\\') {
|
||||
c.fEscaped = true;
|
||||
int[] unescapeIndex = new int[1];
|
||||
unescapeIndex[0] = fNextIndex;
|
||||
c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
|
||||
if (unescapeIndex[0] == fNextIndex) {
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
|
||||
if (cpAndLength < 0) {
|
||||
error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
|
||||
}
|
||||
c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
|
||||
int length = Utility.lengthFromCodePointAndLength(cpAndLength);
|
||||
|
||||
fCharNum += unescapeIndex[0] - fNextIndex;
|
||||
fNextIndex = unescapeIndex[0];
|
||||
fCharNum += length;
|
||||
fNextIndex += length;
|
||||
}
|
||||
}
|
||||
// putc(c.fChar, stdout);
|
||||
|
|
|
@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
StringBuilder patBuf = new StringBuilder(), buf = null;
|
||||
boolean usePat = false;
|
||||
UnicodeSet scratch = null;
|
||||
Object backup = null;
|
||||
RuleCharacterIterator.Position backup = null;
|
||||
|
||||
// mode: 0=before [, 1=between [...], 2=after ]
|
||||
// lastItem: 0=none, 1=char, 2=set
|
||||
|
@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
int iterOpts) {
|
||||
boolean result = false;
|
||||
iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
|
||||
Object pos = chars.getPos(null);
|
||||
RuleCharacterIterator.Position pos = chars.getPos(null);
|
||||
int c = chars.next(iterOpts);
|
||||
if (c == '[' || c == '\\') {
|
||||
int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
|
||||
|
@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
*/
|
||||
private void applyPropertyPattern(RuleCharacterIterator chars,
|
||||
Appendable rebuiltPat, SymbolTable symbols) {
|
||||
String patStr = chars.lookahead();
|
||||
ParsePosition pos = new ParsePosition(0);
|
||||
String patStr = chars.getCurrentBuffer();
|
||||
int start = chars.getCurrentBufferPos();
|
||||
ParsePosition pos = new ParsePosition(start);
|
||||
applyPropertyPattern(patStr, pos, symbols);
|
||||
if (pos.getIndex() == 0) {
|
||||
int length = pos.getIndex() - start;
|
||||
if (length == 0) {
|
||||
syntaxError(chars, "Invalid property pattern");
|
||||
}
|
||||
chars.jumpahead(pos.getIndex());
|
||||
append(rebuiltPat, patStr.substring(0, pos.getIndex()));
|
||||
chars.jumpahead(length);
|
||||
append(rebuiltPat, patStr.substring(start, pos.getIndex()));
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
|
|
@ -493,12 +493,12 @@ class TransliteratorParser {
|
|||
if (pos == limit) {
|
||||
syntaxError("Trailing backslash", rule, start);
|
||||
}
|
||||
iref[0] = pos;
|
||||
int escaped = Utility.unescapeAt(rule, iref);
|
||||
pos = iref[0];
|
||||
if (escaped == -1) {
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
|
||||
if (cpAndLength < 0) {
|
||||
syntaxError("Malformed escape", rule, start);
|
||||
}
|
||||
int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
|
||||
pos += Utility.lengthFromCodePointAndLength(cpAndLength);
|
||||
parser.checkVariableRange(escaped, rule, start);
|
||||
UTF16.append(buf, escaped);
|
||||
continue;
|
||||
|
@ -902,16 +902,16 @@ class TransliteratorParser {
|
|||
boolean parsingIDs = true;
|
||||
int ruleCount = 0;
|
||||
|
||||
dataVector = new ArrayList<Data>();
|
||||
idBlockVector = new ArrayList<String>();
|
||||
dataVector = new ArrayList<>();
|
||||
idBlockVector = new ArrayList<>();
|
||||
curData = null;
|
||||
direction = dir;
|
||||
compoundFilter = null;
|
||||
variablesVector = new ArrayList<Object>();
|
||||
variableNames = new HashMap<String, char[]>();
|
||||
variablesVector = new ArrayList<>();
|
||||
variableNames = new HashMap<>();
|
||||
parseData = new ParseData();
|
||||
|
||||
List<RuntimeException> errors = new ArrayList<RuntimeException>();
|
||||
List<RuntimeException> errors = new ArrayList<>();
|
||||
int errorCount = 0;
|
||||
|
||||
ruleArray.reset();
|
||||
|
@ -1079,7 +1079,7 @@ class TransliteratorParser {
|
|||
Data data = dataVector.get(i);
|
||||
data.variables = new Object[variablesVector.size()];
|
||||
variablesVector.toArray(data.variables);
|
||||
data.variableNames = new HashMap<String, char[]>();
|
||||
data.variableNames = new HashMap<>();
|
||||
data.variableNames.putAll(variableNames);
|
||||
}
|
||||
variablesVector = null;
|
||||
|
@ -1143,7 +1143,7 @@ class TransliteratorParser {
|
|||
|
||||
// Set up segments data
|
||||
segmentStandins = new StringBuffer();
|
||||
segmentObjects = new ArrayList<StringMatcher>();
|
||||
segmentObjects = new ArrayList<>();
|
||||
|
||||
RuleHalf left = new RuleHalf();
|
||||
RuleHalf right = new RuleHalf();
|
||||
|
|
|
@ -344,14 +344,12 @@ public void TestExtended() {
|
|||
}
|
||||
|
||||
// Let unescape handle the back slash.
|
||||
int charIdxAr[] = new int[1];
|
||||
charIdxAr[0] = charIdx;
|
||||
cp = Utility.unescapeAt(testString, charIdxAr);
|
||||
if (cp != -1) {
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
|
||||
if (cpAndLength >= 0) {
|
||||
// Escape sequence was recognized. Insert the char
|
||||
// into the test data.
|
||||
charIdx = charIdxAr[0];
|
||||
tp.dataToBreak.appendCodePoint(cp);
|
||||
charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
|
||||
tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
|
||||
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
|
||||
tp.srcLine[i] = lineNum;
|
||||
tp.srcCol[i] = column;
|
||||
|
|
|
@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk {
|
|||
|
||||
// Regression test for ICU-21645
|
||||
String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
|
||||
int[] offset16 = new int[] { 1 }; // after the backslash
|
||||
// This returned U+B2F00 for the first _two_ escapes.
|
||||
int c = Utility.unescapeAt(s, offset16);
|
||||
assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
|
||||
assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
|
||||
int cpAndLength = Utility.unescapeAndLengthAt(s, 1); // index 1 = after the backslash
|
||||
assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
|
||||
String pattern = "[" + s + "]";
|
||||
// This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
|
||||
// and saw an invalid range of B2F00..1DF1E (start >= end).
|
||||
|
|
Loading…
Add table
Reference in a new issue