diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java index e4260e43c4d..93b91381c4b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java @@ -40,6 +40,8 @@ import com.ibm.icu.util.Freezable; * @author markdavis */ public class UnicodeRegex implements Cloneable, Freezable, StringTransform { + private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})"); + // Note: we don't currently have any state, but intend to in the future, // particularly for the regex style supported. @@ -75,7 +77,7 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT *

Not thread-safe; create a separate copy for different threads. *

In the future, we may extend this to support other regex packages. * - * @regex A modified Java regex pattern, as in the input to + * @param regex A modified Java regex pattern, as in the input to * Pattern.compile(), except that all "character classes" are * processed as if they were UnicodeSet patterns. Example: * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. @@ -208,7 +210,7 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT */ public String compileBnf(List lines) { Map variables = getVariables(lines); - Set unused = new LinkedHashSet(variables.keySet()); + Set unused = new LinkedHashSet<>(variables.keySet()); // brute force replacement; do twice to allow for different order // later on can optimize for (int i = 0; i < 2; ++i) { @@ -343,7 +345,12 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT pos.setIndex(i); UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); x.complement().complement(); // hack to fix toPattern - result.append(x.toPattern(false)); + String pattern = x.toPattern(false); + // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex. 
+ if (pattern.contains("\\U")) { + pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}"); + } + result.append(pattern); i = pos.getIndex() - 1; // allow for the loop increment return i; } catch (Exception e) { @@ -370,7 +377,7 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT }; private Map getVariables(List lines) { - Map variables = new TreeMap(LongestFirst); + Map variables = new TreeMap<>(LongestFirst); String variable = null; StringBuffer definition = new StringBuffer(); int count = 0; diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java index 88e18b160a2..ea0afe876af 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java @@ -66,7 +66,8 @@ public class RegexUtilitiesTest extends TestFmwk { UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]"); boolean skip = TestFmwk.getExhaustiveness() < 10; for (int cp = 0; cp < 0x110000; ++cp) { - if (cp > 0xFF && skip && (cp % 37 != 0)) { + // Always test U+1FFFE so that UnicodeSet escaping of a supplementary noncharacter is covered. + if (cp > 0xFF && skip && (cp % 37 != 0) && cp != 0x1fffe) { continue; } String cpString = UTF16.valueOf(cp); @@ -82,7 +83,9 @@ public class RegexUtilitiesTest extends TestFmwk { String expected = "[" + s + "]"; // Try this first for faster testing. boolean ok = pattern.equals(expected); if (!ok) { - expected = new UnicodeSet(expected).toPattern(false); + // Escape like in UnicodeSet, and change supplementary escapes to Java regex syntax. + expected = new UnicodeSet(expected).toPattern(false). + replaceAll("\\\\U00([0-9a-fA-F]{6})", "\\\\x{$1}"); ok = pattern.equals(expected); } assertTrue("Doubled character works " + hex.transform(s), ok);