From 003c9da518a8ad3f0f2192b204382e606757bba2 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Thu, 10 Sep 2015 14:30:28 +0000 Subject: [PATCH] ICU-11738 Updated to handle string ranges. X-SVN-Rev: 37943 --- .../src/com/ibm/icu/impl/StringRange.java | 282 ++++++++++++++++++ .../core/src/com/ibm/icu/text/UnicodeSet.java | 135 ++++++--- .../ibm/icu/dev/test/lang/UnicodeSetTest.java | 36 ++- 3 files changed, 406 insertions(+), 47 deletions(-) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java new file mode 100644 index 00000000000..9b52085d81f --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java @@ -0,0 +1,282 @@ +/* + ******************************************************************************* + * Copyright (C) 1996-2015, Google, Inc., International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.impl; + +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.util.ICUException; + +@SuppressWarnings("deprecation") +public class StringRange { + private static final boolean DEBUG = false; + + public interface Adder { + /** + * @param start + * @param end may be null, for adding single string + */ + void add(String start, String end); + } + + public static final Comparator COMPARE_INT_ARRAYS = new Comparator() { + public int compare(int[] o1, int[] o2) { + int minIndex = Math.min(o1.length, o2.length); + for (int i = 0; i < minIndex; ++i) { + int diff = o1[i] - o2[i]; + if (diff != 0) { + return diff; + } + } + return o1.length - o2.length; + } + }; + + /** + * Compact the set of strings. + * @param source + * @param adder adds each pair to the output. See the {@link Adder} interface. + * @param shorterPairs use abc-d instead of abc-abd + * @param moreCompact use a more compact form, at the expense of more processing. If false, source must be sorted. + */ + public static void compact(Set source, Adder adder, boolean shorterPairs, boolean moreCompact) { + if (!moreCompact) { + String start = null; + String end = null; + int lastCp = 0; + int prefixLen = 0; + for (String s : source) { + if (start != null) { // We have something queued up + if (s.regionMatches(0, start, 0, prefixLen)) { + int currentCp = s.codePointAt(prefixLen); + if (currentCp == 1+lastCp && s.length() == prefixLen + Character.charCount(currentCp)) { + end = s; + lastCp = currentCp; + continue; + } + } + // We failed to find continuation. Add what we have and restart + adder.add(start, end == null ? null + : !shorterPairs ? end + : end.substring(prefixLen, end.length())); + } + // new possible range + start = s; + end = null; + lastCp = s.codePointBefore(s.length()); + prefixLen = s.length() - Character.charCount(lastCp); + } + adder.add(start, end == null ? null + : !shorterPairs ? end + : end.substring(prefixLen, end.length())); + } else { + // not a fast algorithm, but ok for now + // TODO rewire to use the first (slower) algorithm to generate the ranges, then compact them from there. + // first sort by lengths + Relation lengthToArrays = Relation.of(new TreeMap>(), TreeSet.class); + for (String s : source) { + Ranges item = new Ranges(s); + lengthToArrays.put(item.size(), item); + } + // then compact items of each length and emit compacted sets + for (Entry> entry : lengthToArrays.keyValuesSet()) { + LinkedList compacted = compact(entry.getKey(), entry.getValue()); + for (Ranges ranges : compacted) { + adder.add(ranges.start(), ranges.end(shorterPairs)); + } + } + } + } + + /** + * Faster but not as good compaction. Only looks at final codepoint. + * @param source + * @param adder + * @param shorterPairs + */ + public static void compact(Set source, Adder adder, boolean shorterPairs) { + compact(source,adder,shorterPairs,false); + } + + private static LinkedList compact(int size, Set inputRanges) { + LinkedList ranges = new LinkedList(inputRanges); + for (int i = size-1; i >= 0; --i) { + Ranges last = null; + for (Iterator it = ranges.iterator(); it.hasNext();) { + Ranges item = it.next(); + if (last == null) { + last = item; + } else if (last.merge(i, item)) { + it.remove(); + } else { + last = item; // go to next + } + } + }; + return ranges; + } + + static final class Range implements Comparable{ + int min; + int max; + public Range(int min, int max) { + this.min = min; + this.max = max; + } + @Override + public boolean equals(Object obj) { + return compareTo((Range)obj) == 0; + } + public int compareTo(Range that) { + int diff = min - that.min; + if (diff != 0) { + return diff; + } + return max - that.max; + } + @Override + public int hashCode() { + return min * 37 + max; + } + @Override + public String toString() { + StringBuilder result = new StringBuilder().appendCodePoint(min); + return min == max ? result.toString() : result.append('~').appendCodePoint(max).toString(); + } + } + + static final class Ranges implements Comparable { + private final Range[] ranges; + public Ranges(String s) { + int[] array = CharSequences.codePoints(s); + ranges = new Range[array.length]; + for (int i = 0; i < array.length; ++i) { + ranges[i] = new Range(array[i], array[i]); + } + } + public boolean merge(int pivot, Ranges other) { +// if (this.toString().equals("afz")) { +// int debug = 0; +// } + // we will merge items if the pivot is adjacent, and all other ranges are equal + for (int i = ranges.length-1; i >= 0; --i) { + if (i == pivot) { + if (ranges[i].max != other.ranges[i].min-1) { // not adjacent + return false; + } + } else { + if (!ranges[i].equals(other.ranges[i])) { + return false; + } + } + } + if (DEBUG) System.out.print("Merging: " + this + ", " + other); + ranges[pivot].max = other.ranges[pivot].max; + if (DEBUG) System.out.println(" => " + this); + return true; + } + + public String start() { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < ranges.length; ++i) { + result.appendCodePoint(ranges[i].min); + } + return result.toString(); + } + public String end(boolean mostCompact) { + int firstDiff = firstDifference(); + if (firstDiff == ranges.length) { + return null; + } + StringBuilder result = new StringBuilder(); + for (int i = mostCompact ? firstDiff : 0; i < ranges.length; ++i) { + result.appendCodePoint(ranges[i].max); + } + return result.toString(); + } + public int firstDifference() { + for (int i = 0; i < ranges.length; ++i) { + if (ranges[i].min != ranges[i].max){ + return i; + } + } + return ranges.length; + } + public Integer size() { + return ranges.length; + } + public int compareTo(Ranges other) { + int diff = ranges.length - other.ranges.length; + if (diff != 0) { + return diff; + } + for (int i = 0; i < ranges.length; ++i) { + diff = ranges[i].compareTo(other.ranges[i]); + if (diff != 0) { + return diff; + } + } + return 0; + } + @Override + public String toString() { + String start = start(); + String end = end(false); + return end == null ? start : start + "~" + end; + } + } + + public static Collection expand(String start, String end, boolean requireSameLength, Collection output) { + if (start == null || end == null) { + throw new ICUException("Range must have 2 valid strings"); + } + int[] startCps = CharSequences.codePoints(start); + int[] endCps = CharSequences.codePoints(end); + int startOffset = startCps.length - endCps.length; + + if (requireSameLength && startOffset != 0) { + throw new ICUException("Range must have equal-length strings"); + } else if (startOffset < 0) { + throw new ICUException("Range must have start-length ≥ end-length"); + } else if (endCps.length == 0) { + throw new ICUException("Range must have end-length > 0"); + } + + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < startOffset; ++i) { + builder.appendCodePoint(startCps[i]); + } + add(0, startOffset, startCps, endCps, builder, output); + return output; + } + + private static void add(int endIndex, int startOffset, int[] starts, int[] ends, StringBuilder builder, Collection output) { + int start = starts[endIndex+startOffset]; + int end = ends[endIndex]; + if (start > end) { + throw new ICUException("Range must have xᵢ ≤ yᵢ for each index i"); + } + boolean last = endIndex == ends.length - 1; + int startLen = builder.length(); + for (int i = start; i <= end; ++i) { + builder.appendCodePoint(i); + if (last) { + output.add(builder.toString()); + } else { + add(endIndex+1, startOffset, starts, ends, builder, output); + } + builder.setLength(startLen); + } + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 87374f2ba9f..1f4abe58d24 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -20,6 +20,7 @@ import com.ibm.icu.impl.Norm2AllModes; import com.ibm.icu.impl.PatternProps; import com.ibm.icu.impl.RuleCharacterIterator; import com.ibm.icu.impl.SortedSetRelation; +import com.ibm.icu.impl.StringRange; import com.ibm.icu.impl.UBiDiProps; import com.ibm.icu.impl.UCaseProps; import com.ibm.icu.impl.UCharacterProperty; @@ -772,19 +773,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa T result, boolean escapeUnprintable, boolean includeStrings) { try { result.append('['); - + int count = getRangeCount(); - + // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. if (count > 1 && getRangeStart(0) == MIN_VALUE && getRangeEnd(count-1) == MAX_VALUE) { - + // Emit the inverse result.append('^'); - + for (int i = 1; i < count; ++i) { int start = getRangeEnd(i-1)+1; int end = getRangeStart(i)-1; @@ -797,7 +798,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } } - + // Default; emit the ranges as pairs else { for (int i = 0; i < count; ++i) { @@ -812,7 +813,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } } - + if (includeStrings && strings.size() > 0) { for (String s : strings) { result.append('{'); @@ -2431,6 +2432,21 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa return this; } + // Add constants to make the code easier to follow + + static final int LAST0_START = 0, + LAST1_RANGE = 1, + LAST2_SET = 2; + + static final int MODE0_NONE = 0, + MODE1_INBRACKET = 1, + MODE2_OUTBRACKET = 2; + + static final int SETMODE0_NONE = 0, + SETMODE1_UNICODESET = 1, + SETMODE2_PROPERTYPAT = 2, + SETMODE3_PREPARSED = 3; + /** * Parse the pattern from the given RuleCharacterIterator. The * iterator is advanced over the parsed pattern. @@ -2465,14 +2481,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // mode: 0=before [, 1=between [...], 2=after ] // lastItem: 0=none, 1=char, 2=set - int lastItem = 0, lastChar = 0, mode = 0; + int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; char op = 0; boolean invert = false; clear(); + String lastString = null; - while (mode != 2 && !chars.atEnd()) { + while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { //Eclipse stated the following is "dead code" /* if (false) { @@ -2491,9 +2508,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // -------- Check for property pattern // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed - int setMode = 0; + int setMode = SETMODE0_NONE; if (resemblesPropertyPattern(chars, opts)) { - setMode = 2; + setMode = SETMODE2_PROPERTYPAT; } // -------- Parse '[' of opening delimiter OR nested set. @@ -2511,12 +2528,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa literal = chars.isEscaped(); if (c == '[' && !literal) { - if (mode == 1) { + if (mode == MODE1_INBRACKET) { chars.setPos(backup); // backup - setMode = 1; + setMode = SETMODE1_UNICODESET; } else { // Handle opening '[' delimiter - mode = 1; + mode = MODE1_INBRACKET; patBuf.append('['); backup = chars.getPos(backup); // prepare to backup c = chars.next(opts); @@ -2543,7 +2560,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa if (m != null) { try { nested = (UnicodeSet) m; - setMode = 3; + setMode = SETMODE3_PREPARSED; } catch (ClassCastException e) { syntaxError(chars, "Syntax error"); } @@ -2556,14 +2573,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // previously been parsed and was looked up in the symbol // table. - if (setMode != 0) { - if (lastItem == 1) { + if (setMode != SETMODE0_NONE) { + if (lastItem == LAST1_RANGE) { if (op != 0) { syntaxError(chars, "Char expected after operator"); } add_unchecked(lastChar, lastChar); _appendToPat(patBuf, lastChar, false); - lastItem = op = 0; + lastItem = LAST0_START; + op = 0; } if (op == '-' || op == '&') { @@ -2575,24 +2593,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa nested = scratch; } switch (setMode) { - case 1: + case SETMODE1_UNICODESET: nested.applyPattern(chars, symbols, patBuf, options); break; - case 2: + case SETMODE2_PROPERTYPAT: chars.skipIgnored(opts); nested.applyPropertyPattern(chars, patBuf, symbols); break; - case 3: // `nested' already parsed + case SETMODE3_PREPARSED: // `nested' already parsed nested._toPattern(patBuf, false); break; } usePat = true; - if (mode == 0) { + if (mode == MODE0_NONE) { // Entire pattern is a category; leave parse loop set(nested); - mode = 2; + mode = MODE2_OUTBRACKET; break; } @@ -2609,12 +2627,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } op = 0; - lastItem = 2; + lastItem = LAST2_SET; continue; } - if (mode == 0) { + if (mode == MODE0_NONE) { syntaxError(chars, "Missing '['"); } @@ -2625,7 +2643,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa if (!literal) { switch (c) { case ']': - if (lastItem == 1) { + if (lastItem == LAST1_RANGE) { add_unchecked(lastChar, lastChar); _appendToPat(patBuf, lastChar, false); } @@ -2637,11 +2655,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa syntaxError(chars, "Trailing '&'"); } patBuf.append(']'); - mode = 2; + mode = MODE2_OUTBRACKET; continue; case '-': if (op == 0) { - if (lastItem != 0) { + if (lastItem != LAST0_START) { + op = (char) c; + continue; + } else if (lastString != null) { op = (char) c; continue; } else { @@ -2651,15 +2672,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa literal = chars.isEscaped(); if (c == ']' && !literal) { patBuf.append("-]"); - mode = 2; + mode = MODE2_OUTBRACKET; continue; } } } - syntaxError(chars, "'-' not after char or set"); + syntaxError(chars, "'-' not after char, string, or set"); break; case '&': - if (lastItem == 2 && op == 0) { + if (lastItem == LAST2_SET && op == 0) { op = (char) c; continue; } @@ -2669,14 +2690,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa syntaxError(chars, "'^' not after '['"); break; case '{': - if (op != 0) { + if (op != 0 && op != '-') { syntaxError(chars, "Missing operand after operator"); } - if (lastItem == 1) { + if (lastItem == LAST1_RANGE) { add_unchecked(lastChar, lastChar); _appendToPat(patBuf, lastChar, false); } - lastItem = 0; + lastItem = LAST0_START; if (buf == null) { buf = new StringBuilder(); } else { @@ -2698,9 +2719,27 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // We have new string. Add it to set and continue; // we don't need to drop through to the further // processing - add(buf.toString()); + String curString = buf.toString(); + if (op == '-') { + int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); + int curSingle = CharSequences.getSingleCodePoint(curString); + if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { + add(lastSingle,curSingle); + } else { + try { + StringRange.expand(lastString, curString, true, strings); + } catch (Exception e) { + syntaxError(chars, e.getMessage()); + } + } + lastString = null; + op = 0; + } else { + add(curString); + lastString = curString; + } patBuf.append('{'); - _appendToPat(patBuf, buf.toString(), false); + _appendToPat(patBuf, curString, false); patBuf.append('}'); continue; case SymbolTable.SYMBOL_REF: @@ -2720,14 +2759,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa break; // literal '$' } if (anchor && op == 0) { - if (lastItem == 1) { + if (lastItem == LAST1_RANGE) { add_unchecked(lastChar, lastChar); _appendToPat(patBuf, lastChar, false); } add_unchecked(UnicodeMatcher.ETHER); usePat = true; patBuf.append(SymbolTable.SYMBOL_REF).append(']'); - mode = 2; + mode = MODE2_OUTBRACKET; continue; } syntaxError(chars, "Unquoted '$'"); @@ -2742,12 +2781,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // ("a"). switch (lastItem) { - case 0: - lastItem = 1; + case LAST0_START: + if (op == '-' && lastString != null) { + syntaxError(chars, "Invalid range"); + } + lastItem = LAST1_RANGE; lastChar = c; + lastString = null; break; - case 1: + case LAST1_RANGE: if (op == '-') { + if (lastString != null) { + syntaxError(chars, "Invalid range"); + } if (lastChar >= c) { // Don't allow redundant (a-a) or empty (b-a) ranges; // these are most likely typos. @@ -2757,24 +2803,25 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa _appendToPat(patBuf, lastChar, false); patBuf.append(op); _appendToPat(patBuf, c, false); - lastItem = op = 0; + lastItem = LAST0_START; + op = 0; } else { add_unchecked(lastChar, lastChar); _appendToPat(patBuf, lastChar, false); lastChar = c; } break; - case 2: + case LAST2_SET: if (op != 0) { syntaxError(chars, "Set expected after operator"); } lastChar = c; - lastItem = 1; + lastItem = LAST1_RANGE; break; } } - if (mode != 2) { + if (mode != MODE2_OUTBRACKET) { syntaxError(chars, "Missing ']'"); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java index 9e8ccd6c9d9..a1697d755b6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2014, International Business Machines Corporation and + * Copyright (C) 1996-2015, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ @@ -2611,7 +2611,7 @@ public class UnicodeSetTest extends TestFmwk { assertEquals("CharSequence remove", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").remove(new StringBuilder("abc")) ); assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").complement(new StringBuilder("abc")) ); assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").complement(new StringBuilder("abc")) ); - + assertEquals("CharSequence addAll", new UnicodeSet("[a-cABC]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("ABC")) ); assertEquals("CharSequence retainAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").retainAll(new StringBuilder("abcB")) ); assertEquals("CharSequence removeAll", new UnicodeSet("[Aab]"), new UnicodeSet("[a-cA]").removeAll(new StringBuilder("cC")) ); @@ -2621,7 +2621,7 @@ public class UnicodeSetTest extends TestFmwk { assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) ); assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) ); assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) ); - + // spanning assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), SpanCondition.SIMPLE) ); assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) ); @@ -2636,4 +2636,34 @@ public class UnicodeSetTest extends TestFmwk { assertEquals("CharSequence findLastIn", -1, new UnicodeSet("[a-cA]"). findLastIn(new StringBuilder("abc"), 1, true) ); assertEquals("CharSequence add", "c", new UnicodeSet("[abA]"). stripFrom(new StringBuilder("abc"), true)); } + + public void TestAStringRange() { + String[][] tests = { + {"[{ax}-{bz}]", "[{ax}{ay}{az}{bx}{by}{bz}]"}, + {"[{a}-{c}]", "[a-c]"}, + //{"[a-{c}]", "[a-c]"}, // don't handle these yet: enable once we do + //{"[{a}-c]", "[a-c]"}, // don't handle these yet: enable once we do + {"[{ax}-{by}-{cz}]", "Error: '-' not after char, string, or set at \"[{ax}-{by}-{|cz}]\""}, + {"[{a}-{bz}]", "Error: Range must have equal-length strings at \"[{a}-{bz}|]\""}, + {"[{ax}-{b}]", "Error: Range must have equal-length strings at \"[{ax}-{b}|]\""}, + {"[{ax}-bz]", "Error: Invalid range at \"[{ax}-b|z]\""}, + {"[ax-{bz}]", "Error: Range must have 2 valid strings at \"[ax-{bz}|]\""}, + {"[{bx}-{az}]", "Error: Range must have xᵢ ≤ yᵢ for each index i at \"[{bx}-{az}|]\""}, + }; + int i = 0; + for (String[] test : tests) { + String expected = test[1]; + if (test[1].startsWith("[")) { + expected = new UnicodeSet(expected).toPattern(false); + } + String actual; + try { + actual = new UnicodeSet(test[0]).toPattern(false); + } catch (Exception e) { + actual = e.getMessage(); + } + assertEquals("StringRange " + i, expected, actual); + ++i; + } + } }