ICU-9131 update trunk from branch, after fixes as per core review.

X-SVN-Rev: 36187
This commit is contained in:
Mark Davis 2014-08-18 12:58:44 +00:00
parent b31ff49acf
commit f7c551d636
9 changed files with 1408 additions and 453 deletions

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2009-2011, International Business Machines
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -10,23 +10,25 @@
package com.ibm.icu.impl;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.util.OutputInt;
/*
/**
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
*
* Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 3-byte characters: Use zero/one/mixed data
* per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. Supplementary characters: Call contains() on the
* parent set.
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
*/
public final class BMPSet {
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
/*
/**
* One boolean ('true' or 'false') per Latin-1 character.
*/
private boolean[] latin1Contains;
/*
/**
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
@ -36,7 +38,7 @@ public final class BMPSet {
*/
private int[] table7FF;
/*
/**
* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
@ -48,14 +50,14 @@ public final class BMPSet {
*/
private int[] bmpBlockBits;
/*
/**
* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
*/
private int[] list4kStarts;
/*
/**
* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
* supplementary code points. The list is terminated with list[listLength-1]=0x110000.
*/
@ -120,22 +122,24 @@ public final class BMPSet {
}
}
/*
/**
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
* spanCondition==0 or 1.
*
* @param start The start index
* @param end The end index
* @return The length of the span.
* @param outCount If not null: Receives the number of code points in the span.
* @return the limit (exclusive end) of the span
*
* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
* as usual in ICU.
*/
public final int span(CharSequence s, int start, int end, SpanCondition spanCondition) {
public final int span(CharSequence s, int start, SpanCondition spanCondition,
OutputInt outCount) {
char c, c2;
int i = start;
int limit = Math.min(s.length(), end);
int limit = s.length();
int numSupplementary = 0;
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
while (i < limit) {
@ -170,6 +174,7 @@ public final class BMPSet {
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++numSupplementary;
++i;
}
++i;
@ -208,15 +213,20 @@ public final class BMPSet {
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++numSupplementary;
++i;
}
++i;
}
}
return i - start;
if (outCount != null) {
int spanLength = i - start;
outCount.value = spanLength - numSupplementary; // number of code points
}
return i;
}
/*
/**
* Symmetrical with span().
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
* limit and spanCondition==0 or 1.
@ -226,7 +236,6 @@ public final class BMPSet {
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
char c, c2;
limit = Math.min(s.length(), limit);
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
for (;;) {
@ -311,7 +320,7 @@ public final class BMPSet {
return limit + 1;
}
/*
/**
* Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
*/
private static void set32x64Bits(int[] table, int start, int limit) {

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2009-2012, International Business Machines
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -13,6 +13,7 @@ import java.util.ArrayList;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.util.OutputInt;
/*
* Implement span() etc. for a set with strings.
@ -22,54 +23,69 @@ import com.ibm.icu.text.UnicodeSet.SpanCondition;
public class UnicodeSetStringSpan {
/*
* Which span() variant will be used? The object is either built for one variant and used once, or built for all and
* may be used many times.
* Which span() variant will be used? The object is either built for one variant and used once,
* or built for all and may be used many times.
*/
public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
public static final int FWD = 0x20;
public static final int BACK = 0x10;
public static final int UTF16 = 8;
// public static final int UTF16 = 8;
public static final int CONTAINED = 2;
public static final int NOT_CONTAINED = 1;
public static final int ALL = 0x3f;
public static final int ALL = 0x7f;
public static final int FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED;
public static final int FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED;
public static final int BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED;
public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;
public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;
public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;
public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;
public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
// Special spanLength short values. (since Java has no unsigned byte type)
// All code points in the string are contained in the parent set.
/**
* Special spanLength short values. (since Java has no unsigned byte type)
* All code points in the string are contained in the parent set.
*/
static final short ALL_CP_CONTAINED = 0xff;
// The spanLength is >=0xfe.
/** The spanLength is >=0xfe. */
static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
// Set for span(). Same as parent but without strings.
/** Set for span(). Same as parent but without strings. */
private UnicodeSet spanSet;
// Set for span(not contained).
// Same as spanSet, plus characters that start or end strings.
/**
* Set for span(not contained).
* Same as spanSet, plus characters that start or end strings.
*/
private UnicodeSet spanNotSet;
// The strings of the parent set.
/** The strings of the parent set. */
private ArrayList<String> strings;
// the lengths of span(), spanBack() etc. for each string.
/** The lengths of span(), spanBack() etc. for each string. */
private short[] spanLengths;
// Maximum lengths of relevant strings.
/** Maximum lengths of relevant strings. */
private int maxLength16;
// Set up for all variants of span()?
/** Are there strings that are not fully contained in the code point set? */
private boolean someRelevant;
/** Set up for all variants of span()? */
private boolean all;
// Span helper
/** Span helper */
private OffsetList offsets;
// Construct for all variants of span(), or only for any one variant.
// Initialize as little as possible, for single use.
/**
* Constructs for all variants of span(), or only for any one variant.
* Initializes as little as possible, for single use.
*/
public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
spanSet = new UnicodeSet(0, 0x10ffff);
// TODO: With Java 6, just take the parent set's strings as is,
// as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
// Then iterate via the first() and higher() methods.
// (We do not want to create multiple Iterator objects in each span().)
// See ICU ticket #7454.
strings = setStrings;
all = (which == ALL);
spanSet.retainAll(set);
@ -90,7 +106,7 @@ public class UnicodeSetStringSpan {
int stringsLength = strings.size();
int i, spanLength;
boolean someRelevant = false;
someRelevant = false;
for (i = 0; i < stringsLength; ++i) {
String string = strings.get(i);
int length16 = string.length();
@ -98,12 +114,11 @@ public class UnicodeSetStringSpan {
if (spanLength < length16) { // Relevant string.
someRelevant = true;
}
if ((0 != (which & UTF16)) && length16 > maxLength16) {
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
maxLength16 = length16;
}
}
if (!someRelevant) {
maxLength16 = 0;
if (!someRelevant && (which & WITH_COUNT) == 0) {
return;
}
@ -140,7 +155,7 @@ public class UnicodeSetStringSpan {
int length16 = string.length();
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
if (spanLength < length16) { // Relevant string.
if (0 != (which & UTF16)) {
if (true /* 0 != (which & UTF16) */) {
if (0 != (which & CONTAINED)) {
if (0 != (which & FWD)) {
spanLengths[i] = makeSpanLengthByte(spanLength);
@ -188,10 +203,12 @@ public class UnicodeSetStringSpan {
* Constructs a copy of an existing UnicodeSetStringSpan.
* Assumes which==ALL for a frozen set.
*/
public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings) {
public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
final ArrayList<String> newParentSetStrings) {
spanSet = otherStringSpan.spanSet;
strings = newParentSetStrings;
maxLength16 = otherStringSpan.maxLength16;
someRelevant = otherStringSpan.someRelevant;
all = true;
if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {
spanNotSet = spanSet;
@ -203,22 +220,25 @@ public class UnicodeSetStringSpan {
spanLengths = otherStringSpan.spanLengths.clone();
}
/*
/**
* Do the strings need to be checked in span() etc.?
*
* @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).
* @return true if strings need to be checked (call span() here),
* false if not (use a BMPSet for best performance).
*/
public boolean needsStringSpanUTF16() {
return (maxLength16 != 0);
return someRelevant;
}
// For fast UnicodeSet::contains(c).
/** For fast UnicodeSet::contains(c). */
public boolean contains(int c) {
    // Delegate to spanSet, the code point set that mirrors the parent set without its strings.
    final boolean inSpanSet = spanSet.contains(c);
    return inSpanSet;
}
// Add a starting or ending string character to the spanNotSet
// so that a character span ends before any string.
/**
* Adds a starting or ending string character to the spanNotSet
* so that a character span ends before any string.
*/
private void addToSpanNotSet(int c) {
if (spanNotSet == null || spanNotSet == spanSet) {
if (spanSet.contains(c)) {
@ -230,12 +250,14 @@ public class UnicodeSetStringSpan {
}
/*
* Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and
* in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done
* Note: In span() when spanLength==0
* (after a string match, or at the beginning after an empty code point span)
* and in spanNot() and spanNotUTF8(),
* string matching could use a binary search because all string matches are done
* from the same start index.
*
*
* For UTF-8, this would require a comparison function that returns UTF-16 order.
*
*
* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
* with strings have very few very short strings. For cases with many strings, it might be better to use a different
* API and implementation with a DFA (state machine).
@ -244,84 +266,119 @@ public class UnicodeSetStringSpan {
/*
* Algorithm for span(SpanCondition.CONTAINED)
*
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
* remember to continue after it. + Either recursively span for each code point or string match, or recursively span
* for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the
* longest recursive span (the farthest end point). + If there is no match at the current position, neither for the
* code point there nor for any set string, then stop and return the longest recursive span length.
*
* Theoretical algorithm:
* - Iterate through the string, and at each code point boundary:
* + If the code point there is in the set, then remember to continue after it.
* + If a set string matches at the current position, then remember to continue after it.
* + Either recursively span for each code point or string match, or recursively span
* for all but the shortest one and iteratively continue the span with the shortest local match.
* + Remember the longest recursive span (the farthest end point).
* + If there is no match at the current position,
* neither for the code point there nor for any set string,
* then stop and return the longest recursive span length.
*
* Optimized implementation:
*
* (We assume that most sets will have very few very short strings. A span using a string-less set is extremely
* fast.)
*
* Create and cache a spanSet which contains all of the single code points of the original set but none of its
* strings.
*
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
* a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings
* that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)
* because the recursive algorithm would continue after them anyway and find the longest recursive match from their
* end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after
* spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after
* spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set
* strings from there. + If at least one more set string matched after a previous string match, then test if the
* code point after the previous string match is also contained in the set. Continue the loop with the shortest end
* point of either this code point or a matching set string. + If no more set string matched after a previous string
* match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,
* otherwise continue the loop.
*
*
* (We assume that most sets will have very few very short strings.
* A span using a string-less set is extremely fast.)
*
* Create and cache a spanSet which contains all of the single code points of the original set
* but none of its strings.
*
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
* - Loop:
* + Try to match each set string at the end of the spanLength.
* ~ Set strings that start with set-contained code points
* must be matched with a partial overlap
* because the recursive algorithm would have tried to match them at every position.
* ~ Set strings that entirely consist of set-contained code points
* are irrelevant for span(SpanCondition.CONTAINED)
* because the recursive algorithm would continue after them anyway and
* find the longest recursive match from their end.
* ~ Rather than recursing, note each end point of a set string match.
* + If no set string matched after spanSet.span(),
* then return with where the spanSet.span() ended.
* + If at least one set string matched after spanSet.span(),
* then pop the shortest string match end point and continue the loop,
* trying to match all set strings from there.
* + If at least one more set string matched after a previous string match, then test if the
* code point after the previous string match is also contained in the set.
* Continue the loop with the shortest end point of
* either this code point or a matching set string.
* + If no more set string matched after a previous string match,
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
* Stop if spanLength==0, otherwise continue the loop.
*
* By noting each end point of a set string match, the function visits each string position at most once and
* finishes in linear time.
*
* The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes
* in exponential time.
*
* The recursive algorithm may visit the same string position many times
* if multiple paths lead to it and finishes in exponential time.
*/
/*
* Algorithm for span(SIMPLE)
*
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
* remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is
* no match at the current position, then stop and return the current position.
*
* Theoretical algorithm:
* - Iterate through the string, and at each code point boundary:
* + If the code point there is in the set, then remember to continue after it.
* + If a set string matches at the current position, then remember to continue after it.
* + Continue from the farthest match position and ignore all others.
* + If there is no match at the current position, then stop and return the current position.
*
* Optimized implementation:
*
*
* (Same assumption and spanSet as above.)
*
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
* a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that
* entirely consist of set-contained code points must be matched with a full overlap because the longest-match
* algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the
* code point span because the standard algorithm would then have continued after the set string match anyway. ~
* Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string
* matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string
* matched, then continue the loop after the longest match from the earliest position. + If no more set string
* matched after a previous string match, then try another
* spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the
* loop.
*
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
* - Loop:
* + Try to match each set string at the end of the spanLength.
* ~ Set strings that start with set-contained code points
* must be matched with a partial overlap
* because the standard algorithm would have tried to match them earlier.
* ~ Set strings that entirely consist of set-contained code points
* must be matched with a full overlap because the longest-match algorithm
* would hide set string matches that end earlier.
* Such set strings need not be matched earlier inside the code point span
* because the standard algorithm would then have
* continued after the set string match anyway.
* ~ Remember the longest set string match (farthest end point)
* from the earliest starting point.
* + If no set string matched after spanSet.span(),
* then return with where the spanSet.span() ended.
* + If at least one set string matched,
* then continue the loop after the longest match from the earliest position.
* + If no more set string matched after a previous string match,
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
* Stop if spanLength==0, otherwise continue the loop.
*/
/**
* Span a string.
* Spans a string.
*
* @param s The string to be spanned
* @param start The start index that the span begins
* @param spanCondition The span condition
* @return the length of the span
* @return the limit (exclusive end) of the span
*/
public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {
public int span(CharSequence s, int start, SpanCondition spanCondition) {
if (spanCondition == SpanCondition.NOT_CONTAINED) {
return spanNot(s, start, length);
return spanNot(s, start, null);
}
int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);
if (spanLength == length) {
return length;
int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
if (spanLimit == s.length()) {
return spanLimit;
}
return spanWithStrings(s, start, spanLimit, spanCondition);
}
/**
* Synchronized method for complicated spans using the offsets.
* Avoids synchronization for simple cases.
*
* @param spanLimit = spanSet.span(s, start, CONTAINED)
*/
private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
SpanCondition spanCondition) {
// Consider strings; they may overlap with the span.
int initSize = 0;
if (spanCondition == SpanCondition.CONTAINED) {
@ -329,7 +386,9 @@ public class UnicodeSetStringSpan {
initSize = maxLength16;
}
offsets.setMaxLength(initSize);
int pos = start + spanLength, rest = length - spanLength;
int length = s.length();
int pos = spanLimit, rest = length - spanLimit;
int spanLength = spanLimit - start;
int i, stringsLength = strings.size();
for (;;) {
if (spanCondition == SpanCondition.CONTAINED) {
@ -429,7 +488,7 @@ public class UnicodeSetStringSpan {
// Otherwise, an unlimited code point span is only tried again when no
// strings match, and if such a non-initial span fails we stop.
if (offsets.isEmpty()) {
return pos - start; // No strings matched after a span.
return pos; // No strings matched after a span.
}
// Match strings from after the next string match.
} else {
@ -437,11 +496,12 @@ public class UnicodeSetStringSpan {
if (offsets.isEmpty()) {
// No more strings matched after a previous string match.
// Try another code point span from after the last string match.
spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);
spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
spanLength = spanLimit - pos;
if (spanLength == rest || // Reached the end of the string, or
spanLength == 0 // neither strings nor span progressed.
) {
return pos + spanLength - start;
return spanLimit;
}
pos += spanLength;
rest -= spanLength;
@ -467,13 +527,110 @@ public class UnicodeSetStringSpan {
// Match strings from after the next string match.
}
}
int minOffset = offsets.popMinimum();
int minOffset = offsets.popMinimum(null);
pos += minOffset;
rest -= minOffset;
spanLength = 0; // Match strings from after a string match.
}
}
/**
 * Spans a string and reports the smallest number of set elements
 * on any path across the span.
 *
 * <p>Strings that are fully contained in code point spans cannot be ignored here,
 * because they influence the count.
 *
 * <p>A set without any fully-contained strings could be handled with the same
 * optimization as span(), but such sets are likely rare, and this is at least still linear.
 *
 * @param s The string to be spanned
 * @param start The start index that the span begins
 * @param spanCondition The span condition
 * @param outCount Receives the count
 * @return the limit (exclusive end) of the span
 */
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
        OutputInt outCount) {
    if (spanCondition == SpanCondition.NOT_CONTAINED) {
        return spanNot(s, start, outCount);
    }
    if (spanCondition == SpanCondition.CONTAINED) {
        // Strings may overlap with the span, and they may result in
        // a smaller count than with just code points.
        return spanContainedAndCount(s, start, outCount);
    }

    // SIMPLE: always take the longest match (not synchronized, does not use offsets).
    final int numStrings = strings.size();
    final int limit = s.length();
    int index = start;
    int remaining = limit - start;
    int matched = 0;
    while (remaining != 0) {
        // Length of the code point at index if the set contains it, otherwise 0.
        int longest = Math.max(spanOne(spanSet, s, index, remaining), 0);

        // Look for a set string that matches here and reaches farther.
        // Indexed loop on purpose: no Iterator object is created per position.
        for (int k = 0; k < numStrings; ++k) {
            String candidate = strings.get(k);
            int candidateLength = candidate.length();
            if (candidateLength <= longest || candidateLength > remaining) {
                continue; // Cannot improve on the current match, or does not fit.
            }
            if (matches16CPB(s, index, limit, candidate, candidateLength)) {
                longest = candidateLength;
            }
        }

        if (longest == 0) {
            break; // Nothing matches beyond index: the span ends here.
        }

        // Consume the longest match as one set element.
        ++matched;
        index += longest;
        remaining -= longest;
    }
    outCount.value = matched;
    return index;
}
/**
* Handles the SpanCondition.CONTAINED case of spanAndCount().
*
* <p>At each position, every way to continue (the next code point if it is in spanSet,
* and every set string that matches there) is recorded in the shared offsets list
* together with the element count it would lead to (count + 1). The loop then
* resumes from the smallest recorded offset until no continuation remains or
* the end of the string is reached.
*
* <p>The method is synchronized; presumably because it mutates the shared
* offsets helper (NOTE(review): confirm against the OffsetList implementation).
*
* @param s The string to be spanned
* @param start The start index that the span begins
* @param outCount Receives the count; must not be null because it is written unconditionally
* @return the limit (exclusive end) of the span
*/
private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
// Use offset list to try all possibilities.
offsets.setMaxLength(maxLength16);
int stringsLength = strings.size();
int length = s.length();
int pos = start;
// rest is the number of code units from pos to the end of s.
int rest = length - start;
int count = 0;
while (rest != 0) {
// Try to match the next code point.
// spanOne() returns a positive length only if the code point at pos is in spanSet.
int cpLength = spanOne(spanSet, s, pos, rest);
if (cpLength > 0) {
offsets.addOffsetAndCount(cpLength, count + 1);
}
// Try to match all of the strings.
for (int i = 0; i < stringsLength; ++i) {
String string = strings.get(i);
int length16 = string.length();
// Note: If the strings were sorted by length, then we could also
// avoid trying to match if there is already a match of the same length.
// The hasCountAtOffset() test runs before the string comparison, so the
// comparison is skipped when that test already rules out this offset
// (NOTE(review): exact hasCountAtOffset() semantics live in OffsetList).
if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
matches16CPB(s, pos, length, string, length16)) {
offsets.addOffsetAndCount(length16, count + 1);
}
}
// We are done if there is no match beyond pos.
if (offsets.isEmpty()) {
outCount.value = count;
return pos;
}
// Continue from the nearest match.
// popMinimum() returns the offset to advance by; the count for that offset
// is read back from outCount right after the call.
int minOffset = offsets.popMinimum(outCount);
count = outCount.value;
pos += minOffset;
rest -= minOffset;
}
// Reached the end of the string.
outCount.value = count;
return pos;
}
/**
* Span a string backwards.
*
@ -638,59 +795,72 @@ public class UnicodeSetStringSpan {
// Match strings from before the next string match.
}
}
pos -= offsets.popMinimum();
pos -= offsets.popMinimum(null);
spanLength = 0; // Match strings from before a string match.
}
}
/*
/**
* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
*
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
* is in the set, then return with the current position. + If a set string matches at the current position, then
* return with the current position.
*
* Theoretical algorithm:
* - Iterate through the string, and at each code point boundary:
* + If the code point there is in the set, then return with the current position.
* + If a set string matches at the current position, then return with the current position.
*
* Optimized implementation:
*
*
* (Same assumption as for span() above.)
*
* Create and cache a spanNotSet which contains all of the single code points of the original set but none of its
* strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for
* spanNotBack().)
*
*
* Create and cache a spanNotSet which contains
* all of the single code points of the original set but none of its strings.
* For each set string add its initial code point to the spanNotSet.
* (Also add its final code point for spanNotBack().)
*
* - Loop:
* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
* + If the current code point is in the original set, then return the current position.
* + If any set string matches at the current position, then return the current position.
* + If there is no match at the current position, neither for the code point
* there nor for any set string, then skip this code point and continue the loop. This happens for
* set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set
* string.
* there nor for any set string, then skip this code point and continue the loop.
* This happens for set-string-initial code points that were added to spanNotSet
* when there is not actually a match for such a set string.
*
* @return the length of the span
* @param s The string to be spanned
* @param start The start index that the span begins
* @param outCount If not null: Receives the number of code points across the span.
* @return the limit (exclusive end) of the span
*/
private int spanNot(CharSequence s, int start, int length) {
int pos = start, rest = length;
int i, stringsLength = strings.size();
private int spanNot(CharSequence s, int start, OutputInt outCount) {
int length = s.length();
int pos = start, rest = length - start;
int stringsLength = strings.size();
int count = 0;
do {
// Span until we find a code point from the set,
// or a code point that starts or ends some string.
i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);
if (i == rest) {
int spanLimit;
if (outCount == null) {
spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
} else {
spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
outCount.value = count = count + outCount.value;
}
if (spanLimit == length) {
return length; // Reached the end of the string.
}
pos += i;
rest -= i;
pos = spanLimit;
rest = length - spanLimit;
// Check whether the current code point is in the original set,
// without the string starts and ends.
int cpLength = spanOne(spanSet, s, pos, rest);
if (cpLength > 0) {
return pos - start; // There is a set element at pos.
return pos; // There is a set element at pos.
}
// Try to match the strings at pos.
for (i = 0; i < stringsLength; ++i) {
for (int i = 0; i < stringsLength; ++i) {
if (spanLengths[i] == ALL_CP_CONTAINED) {
continue; // Irrelevant string.
}
@ -698,7 +868,7 @@ public class UnicodeSetStringSpan {
int length16 = string.length();
if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
return pos - start; // There is a set element at pos.
return pos; // There is a set element at pos.
}
}
@ -707,7 +877,11 @@ public class UnicodeSetStringSpan {
// cpLength<0
pos -= cpLength;
rest += cpLength;
++count;
} while (rest != 0);
if (outCount != null) {
outCount.value = count;
}
return length; // Reached the end of the string.
}
@ -773,20 +947,24 @@ public class UnicodeSetStringSpan {
* Compare 16-bit Unicode strings (which may be malformed UTF-16)
* at code point boundaries.
* That is, each edge of a match must not be in the middle of a surrogate pair.
* @param s The string to match in.
* @param start The start index of s.
* @param slength The length of s from start.
* @param limit The limit of the subsequence of s being spanned.
* @param t The substring to be matched in s.
* @param tlength The length of t.
*/
static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {
return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))
&& !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))
&& matches16(s, start, t, tlength);
static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
return matches16(s, start, t, tlength)
&& !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
Character.isLowSurrogate(s.charAt(start)))
&& !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
Character.isLowSurrogate(s.charAt(start + tlength)));
}
// Does the set contain the next code point?
// If so, return its length; otherwise return its negative length.
/**
* Does the set contain the next code point?
* If so, return its length; otherwise return its negative length.
*/
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
char c = s.charAt(start);
if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
@ -811,47 +989,57 @@ public class UnicodeSetStringSpan {
return set.contains(c) ? 1 : -1;
}
/*
/**
* Helper class for UnicodeSetStringSpan.
*
* List of offsets from the current position from where to try matching a code point or a string. Store offsets rather
* than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in
* spanBack()).
*
* Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that
* is, there are normally no gaps of hundreds or thousands of offset values.
*
* The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the
* list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the
* list.
*
* Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH
* <p>List of offsets from the current position from where to try matching
* a code point or a string.
* Stores offsets rather than indexes to simplify the code and use the same list
* for both increments (in span()) and decrements (in spanBack()).
*
* <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
* are relatively dense, that is,
* there are normally no gaps of hundreds or thousands of offset values.
*
* <p>This class optionally also tracks the minimum non-negative count for each position,
* intended to count the smallest number of elements of any path leading to that position.
*
* <p>The implementation uses a circular buffer of count integers,
* each indicating whether the corresponding offset is in the list,
* and its path element count.
* This avoids inserting into a sorted list of offsets (or absolute indexes)
* and physically moving part of the list.
*
* <p>Note: In principle, the caller should setMaxLength() to
* the maximum of the max string length and U16_LENGTH/U8_LENGTH
* to account for "long" single code points.
*
* Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single
* integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when
* lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.
*
* <p>Note: An earlier version did not track counts and stored only byte flags.
* With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
* the list could be stored as bit flags in a single integer.
* Rather than handling a circular buffer with a start list index,
* the integer would simply be shifted when lower offsets are removed.
* UnicodeSet does not have a limit on the lengths of strings.
*/
static class OffsetList {
private boolean[] list;
private static final class OffsetList {
private int[] list;
private int length;
private int start;
public OffsetList() {
list = new boolean[16]; // default size
list = new int[16]; // default size
}
public void setMaxLength(int maxLength) {
if (maxLength > list.length) {
list = new boolean[maxLength];
list = new int[maxLength];
}
clear();
}
public void clear() {
for (int i = list.length; i-- > 0;) {
list[i] = false;
list[i] = 0;
}
start = length = 0;
}
@ -860,55 +1048,97 @@ public class UnicodeSetStringSpan {
return (length == 0);
}
// Reduce all stored offsets by delta, used when the current position
// moves by delta.
// There must not be any offsets lower than delta.
// If there is an offset equal to delta, it is removed.
// delta=[1..maxLength]
/**
* Reduces all stored offsets by delta, used when the current position moves by delta.
* There must not be any offsets lower than delta.
* If there is an offset equal to delta, it is removed.
*
* @param delta [1..maxLength]
*/
public void shift(int delta) {
int i = start + delta;
if (i >= list.length) {
i -= list.length;
}
if (list[i]) {
list[i] = false;
if (list[i] != 0) {
list[i] = 0;
--length;
}
start = i;
}
// Add an offset. The list must not contain it yet.
// offset=[1..maxLength]
/**
* Adds an offset. The list must not contain it yet.
* @param offset [1..maxLength]
*/
public void addOffset(int offset) {
int i = start + offset;
if (i >= list.length) {
i -= list.length;
}
list[i] = true;
assert list[i] == 0;
list[i] = 1;
++length;
}
// offset=[1..maxLength]
/**
 * Records an offset together with a path element count.
 * If the offset is not yet stored, it is inserted with the given count;
 * if it is already stored, the smaller of the stored and the given count is kept.
 *
 * @param offset [1..maxLength]
 * @param count the element count for this offset; must be positive
 *              because 0 marks an unused slot
 */
public void addOffsetAndCount(int offset, int count) {
    assert count > 0;
    // Map the offset onto the circular buffer.
    int slot = start + offset;
    if (slot >= list.length) {
        slot -= list.length;
    }
    final int stored = list[slot];
    if (stored == 0) {
        // New offset: the list grows by one entry.
        ++length;
        list[slot] = count;
    } else if (count < stored) {
        // Known offset: keep the minimum count.
        list[slot] = count;
    }
}
/**
* @param offset [1..maxLength]
*/
public boolean containsOffset(int offset) {
int i = start + offset;
if (i >= list.length) {
i -= list.length;
}
return list[i];
return list[i] != 0;
}
// Find the lowest stored offset from a non-empty list, remove it,
// and reduce all other offsets by this minimum.
// Returns [1..maxLength].
public int popMinimum() {
/**
 * Tests whether the offset is stored with a count that is no larger than the given one.
 *
 * @param offset [1..maxLength]
 * @param count the count to compare the stored count against
 * @return true if the offset is in the list and its stored count is &lt;= count
 */
public boolean hasCountAtOffset(int offset, int count) {
    // Map the offset onto the circular buffer.
    int slot = start + offset;
    if (slot >= list.length) {
        slot -= list.length;
    }
    final int stored = list[slot];
    if (stored == 0) {
        return false; // 0 means the offset is not in the list.
    }
    return stored <= count;
}
/**
* Finds the lowest stored offset from a non-empty list, removes it,
* and reduces all other offsets by this minimum.
* @return min=[1..maxLength]
*/
public int popMinimum(OutputInt outCount) {
// Look for the next offset in list[start+1..list.length-1].
int i = start, result;
while (++i < list.length) {
if (list[i]) {
list[i] = false;
int count = list[i];
if (count != 0) {
list[i] = 0;
--length;
result = i - start;
start = i;
if (outCount != null) { outCount.value = count; }
return result;
}
}
@ -918,12 +1148,14 @@ public class UnicodeSetStringSpan {
// Since the list is not empty, there will be one.
result = list.length - start;
i = 0;
while (!list[i]) {
int count;
while ((count = list[i]) == 0) {
++i;
}
list[i] = false;
list[i] = 0;
--length;
start = i;
if (outCount != null) { outCount.value = count; }
return result + i;
}
}

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -2612,6 +2612,61 @@ public final class UTF16 {
}
}
/**
 * Returns the code point of a CharSequence that consists of exactly one code point.
 * A one-char sequence yields that char's value as is; a two-char sequence yields
 * a code point only if it reads as a single supplementary code point (above U+FFFF).
 *
 * @param s to test; may be null
 * @return the code point if s is non-null and consists of a single code point,
 *         otherwise -1 (null, empty, or more than one code point)
 */
public static int getSingleCodePoint(CharSequence s) {
    if (s == null) {
        return -1;
    }
    switch (s.length()) {
    case 1:
        return s.charAt(0);
    case 2:
        // Two chars form one code point only when they are a surrogate pair.
        final int first = UTF16.charAt(s, 0);
        return first > 0xFFFF ? first : -1;
    default:
        // Empty, or too long to be a single code point.
        return -1;
    }
}
/**
 * Compares a code point to a string without having to create a new string.
 * Returns the same results as a code point comparison of UTF16.valueOf(codePoint) and s.toString().
 * More specifically, if
 * <pre>
 * sc = new StringComparator(true,false,0);
 * fast = UTF16.compareCodePoint(codePoint, charSequence)
 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
 * </pre>
 * then
 * <pre>
 * Integer.signum(fast) == Integer.signum(slower)
 * </pre>
 * @param codePoint to test
 * @param s to test; null is handled like the empty string
 * @return 1 if s is null or empty; otherwise the non-zero difference between codePoint and
 *         the first code point of s; otherwise 0 if s consists of exactly that code point,
 *         or -1 if s continues after it
 */
public static int compareCodePoint(int codePoint, CharSequence s) {
    final int sLength = (s == null) ? 0 : s.length();
    if (sLength == 0) {
        // Any code point sorts after the empty string.
        return 1;
    }
    final int delta = codePoint - Character.codePointAt(s, 0);
    if (delta != 0) {
        return delta;
    }
    // The first code points agree, so s is equal only if nothing follows.
    final boolean nothingFollows = (sLength == Character.charCount(codePoint));
    return nothingFollows ? 0 : -1;
}
// private data members -------------------------------------------------
/**

View file

@ -29,6 +29,7 @@ import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.Freezable;
import com.ibm.icu.util.OutputInt;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
@ -265,11 +266,20 @@ import com.ibm.icu.util.VersionInfo;
* </tr>
* </table>
* </blockquote>
* <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
* <p>To iterate over contents of UnicodeSet, the following are available:
* <ul><li>{@link #ranges()} to iterate through the ranges</li>
* <li>{@link #strings()} to iterate through the strings</li>
* <li>{@link #iterator()} to iterate through the entire contents in a single loop.
* That method is, however, not particularly efficient, since it "boxes" each code point into a String.
* </ul>
* All of the above can be used in <b>for</b> loops.
* The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
* @stable ICU 2.0
* @see UnicodeSetIterator
* @see UnicodeSetSpanner
*/
public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
@ -283,7 +293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @stable ICU 4.8
*/
public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
@ -338,7 +348,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*/
private static UnicodeSet INCLUSIONS[] = null;
private BMPSet bmpSet; // The set is frozen iff either bmpSet or stringSpan is not null.
private BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
private UnicodeSetStringSpan stringSpan;
//----------------------------------------------------------------
// Public API
@ -492,6 +502,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @stable ICU 2.0
*/
public Object clone() {
if (isFrozen()) {
return this;
}
UnicodeSet result = new UnicodeSet(this);
result.bmpSet = this.bmpSet;
result.stringSpan = this.stringSpan;
@ -588,27 +601,30 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
/**
* Append the <code>toPattern()</code> representation of a
* string to the given <code>StringBuffer</code>.
* @return the given buffer, to allow call chaining
*/
private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
private static StringBuffer _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
int cp;
for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
cp = s.codePointAt(i);
_appendToPat(buf, cp, escapeUnprintable);
}
return buf;
}
/**
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
* @return the given buffer, to allow call chaining
*/
private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
private static StringBuffer _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
// "Utility.isUnprintable(c)" seems redundant since the call
// "Utility.escapeUnprintable(buf, c)" does it again inside the if statement
if (escapeUnprintable && Utility.isUnprintable(c)) {
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
// unprintable
if (Utility.escapeUnprintable(buf, c)) {
return;
return buf;
}
}
// Okay to let ':' pass through
@ -633,6 +649,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
break;
}
UTF16.append(buf, c);
return buf;
}
/**
@ -1279,9 +1296,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
/**
* Utility for getting code point from single code point CharSequence.
* See the public UTF16.getSingleCodePoint()
* @return a code point IF the string consists of a single one.
* otherwise returns -1.
* @param string to test
* @param s to test
*/
private static int getSingleCP(CharSequence s) {
if (s.length() < 1) {
@ -1322,7 +1341,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet retainAll(String s) {
public final UnicodeSet retainAll(CharSequence s) {
return retainAll(fromAll(s));
}
@ -1333,7 +1352,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet complementAll(String s) {
public final UnicodeSet complementAll(CharSequence s) {
return complementAll(fromAll(s));
}
@ -1344,7 +1363,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet removeAll(String s) {
public final UnicodeSet removeAll(CharSequence s) {
return removeAll(fromAll(s));
}
@ -1369,7 +1388,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return a newly created set containing the given string
* @stable ICU 2.0
*/
public static UnicodeSet from(String s) {
public static UnicodeSet from(CharSequence s) {
return new UnicodeSet().add(s);
}
@ -1380,7 +1399,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return a newly created set containing the given characters
* @stable ICU 2.0
*/
public static UnicodeSet fromAll(String s) {
public static UnicodeSet fromAll(CharSequence s) {
return new UnicodeSet().addAll(s);
}
@ -1428,13 +1447,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Retain the specified string in this set if it is present.
* Upon return this set will be empty if it did not contain s, or
* will only contain s if it did contain s.
* @param s the string to be retained
* @param cs the string to be retained
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet retain(String s) {
int cp = getSingleCP(s);
public final UnicodeSet retain(CharSequence cs) {
int cp = getSingleCP(cs);
if (cp < 0) {
String s = cs.toString();
boolean isIn = strings.contains(s);
if (isIn && size() == 1) {
return this;
@ -1494,7 +1515,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet remove(String s) {
public final UnicodeSet remove(CharSequence s) {
int cp = getSingleCP(s);
if (cp < 0) {
strings.remove(s);
@ -1571,14 +1592,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return this object, for chaining
* @stable ICU 2.0
*/
public final UnicodeSet complement(String s) {
public final UnicodeSet complement(CharSequence s) {
checkFrozen();
int cp = getSingleCP(s);
if (cp < 0) {
if (strings.contains(s)) {
strings.remove(s);
} else {
strings.add(s);
strings.add(s.toString());
}
pat = null;
} else {
@ -1804,11 +1825,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return <tt>true</tt> if this set contains the specified string
* @stable ICU 2.0
*/
public final boolean contains(String s) {
public final boolean contains(CharSequence s) {
int cp = getSingleCP(s);
if (cp < 0) {
return strings.contains(s);
return strings.contains(s.toString());
} else {
return contains(cp);
}
@ -2072,7 +2093,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return true if the test condition is met
* @stable ICU 2.0
*/
public boolean containsNone(String s) {
public boolean containsNone(CharSequence s) {
return span(s, SpanCondition.NOT_CONTAINED) == s.length();
}
@ -2106,7 +2127,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @return true if the condition is met
* @stable ICU 2.0
*/
public final boolean containsSome(String s) {
public final boolean containsSome(CharSequence s) {
return !containsNone(s);
}
@ -2344,7 +2365,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
StringBuffer rebuiltPat = new StringBuffer();
RuleCharacterIterator chars =
new RuleCharacterIterator(pattern, symbols, pos);
new RuleCharacterIterator(pattern, symbols, pos);
applyPattern(chars, symbols, rebuiltPat, options);
if (chars.inVariable()) {
syntaxError(chars, "Extra chars in variable value");
@ -2388,7 +2409,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// Recognized special forms for chars, sets: c-c s-s s&s
int opts = RuleCharacterIterator.PARSE_VARIABLES |
RuleCharacterIterator.PARSE_ESCAPES;
RuleCharacterIterator.PARSE_ESCAPES;
if ((options & IGNORE_SPACE) != 0) {
opts |= RuleCharacterIterator.SKIP_WHITESPACE;
}
@ -2740,7 +2761,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
private static void syntaxError(RuleCharacterIterator chars, String msg) {
throw new IllegalArgumentException("Error: " + msg + " at \"" +
Utility.escape(chars.toString()) +
'"');
'"');
}
/**
@ -2771,23 +2792,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
/**
* Add the contents of the collection (as strings) into this UnicodeSet.
* Add the contents of the collection (as strings) into this UnicodeSet.
* The collection must not contain null.
* @param source the collection to add
* @return a reference to this object
* @stable ICU 4.4
*/
public UnicodeSet add(Collection<?> source) {
public UnicodeSet add(Iterable<?> source) {
return addAll(source);
}
/**
* Add the contents of the UnicodeSet (as strings) into a collection.
* Add a collection (as strings) into this UnicodeSet.
* Uses standard naming convention.
* @param source the collection to add
* @return a reference to this object
* @stable ICU 4.4
*/
public UnicodeSet addAll(Collection<?> source) {
public UnicodeSet addAll(Iterable<?> source) {
checkFrozen();
for (Object o : source) {
add(o.toString());
@ -3104,7 +3126,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// Reference comparison ok; VersionInfo caches and reuses
// unique objects.
return v != NO_VERSION &&
v.compareTo(version) <= 0;
v.compareTo(version) <= 0;
}
}
@ -3297,7 +3319,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
return applyPropertyAlias(propertyAlias, valueAlias, null);
}
/**
* Modifies this set to contain those code points which have the
* given value for the given property. Prior contents of this
@ -3321,7 +3343,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
&& ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
return this;
}
if (XSYMBOL_TABLE != null) {
if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
return this;
@ -3476,8 +3498,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// Look for an opening [:, [:^, \p, or \P
return pattern.regionMatches(pos, "[:", 0, 2) ||
pattern.regionMatches(true, pos, "\\p", 0, 2) ||
pattern.regionMatches(pos, "\\N", 0, 2);
pattern.regionMatches(true, pos, "\\p", 0, 2) ||
pattern.regionMatches(pos, "\\N", 0, 2);
}
/**
@ -3879,17 +3901,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// Optimize contains() and span() and similar functions.
if (!strings.isEmpty()) {
stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
if (!stringSpan.needsStringSpanUTF16()) {
// All strings are irrelevant for span() etc. because
// all of each string's code points are contained in this set.
// Do not check needsStringSpanUTF8() because UTF-8 has at most as
// many relevant strings as UTF-16.
// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
stringSpan = null;
}
}
if (stringSpan == null) {
// No span-relevant strings: Optimize for code point spans.
if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
// Optimize for code point spans.
// There are no strings, or
// all strings are irrelevant for span() etc. because
// all of each string's code points are contained in this set.
// However, fully contained strings are relevant for spanAndCount(),
// so we create both objects.
bmpSet = new BMPSet(list, len);
}
}
@ -3898,7 +3917,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
/**
* Span a string using this UnicodeSet.
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
* @param s The string to be spanned
* @param spanCondition The span condition
* @return the length of the span
@ -3912,7 +3931,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Span a string using this UnicodeSet.
* If the start index is less than 0, span will start from 0.
* If the start index is greater than the string length, span returns the string length.
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
* @param s The string to be spanned
* @param start The start index that the span begins
* @param spanCondition The span condition
@ -3927,52 +3946,97 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
return end;
}
if (bmpSet != null) {
return start + bmpSet.span(s, start, end, spanCondition);
// Frozen set without strings, or no string is relevant for span().
return bmpSet.span(s, start, spanCondition, null);
}
int len = end - start;
if (stringSpan != null) {
return start + stringSpan.span(s, start, len, spanCondition);
return stringSpan.span(s, start, spanCondition);
} else if (!strings.isEmpty()) {
int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
: UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
if (strSpan.needsStringSpanUTF16()) {
return start + strSpan.span(s, start, len, spanCondition);
return strSpan.span(s, start, spanCondition);
}
}
return spanCodePointsAndCount(s, start, spanCondition, null);
}
/**
 * Same as span() but also counts the smallest number of set elements on any path across the span.
 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 * @param s The string to be spanned
 * @param start The start index that the span begins; a negative value is treated as 0
 * @param spanCondition The span condition
 * @param outCount An output-only object (must not be null) for returning the count.
 *                 It is set to 0 when there is nothing to span (start at or beyond the end of s).
 * @return the limit (exclusive end) of the span
 * @throws IllegalArgumentException if outCount is null
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
    if (outCount == null) {
        throw new IllegalArgumentException("outCount must not be null");
    }
    int end = s.length();
    if (start < 0) {
        start = 0;
    }
    // Checked independently of the pinning above, so that a negative start on an
    // empty string also returns here instead of reaching the span loops.
    if (start >= end) {
        outCount.value = 0; // Empty span: always write the output.
        return end;
    }
    if (stringSpan != null) {
        // We might also have bmpSet != null,
        // but fully-contained strings are relevant for counting elements.
        return stringSpan.spanAndCount(s, start, spanCondition, outCount);
    } else if (bmpSet != null) {
        return bmpSet.span(s, start, spanCondition, outCount);
    } else if (!strings.isEmpty()) {
        // Not frozen, but the set has strings: build a one-off string spanner that tracks counts.
        int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
                : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
        which |= UnicodeSetStringSpan.WITH_COUNT;
        UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
        return strSpan.spanAndCount(s, start, spanCondition, outCount);
    }
    // Not frozen and no strings: span code point by code point.
    return spanCodePointsAndCount(s, start, spanCondition, outCount);
}
private int spanCodePointsAndCount(CharSequence s, int start,
SpanCondition spanCondition, OutputInt outCount) {
// Pin to 0/1 values.
boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
int c;
int next = start;
int length = s.length();
int count = 0;
do {
c = Character.codePointAt(s, next);
if (spanContained != contains(c)) {
break;
}
next = Character.offsetByCodePoints(s, next, 1);
} while (next < end);
++count;
next += Character.charCount(c);
} while (next < length);
if (outCount != null) { outCount.value = count; }
return next;
}
/**
* Span a string backwards (from the end) using this UnicodeSet.
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
* @param s The string to be spanned
* @param spanCondition The span condition
* @return The string index which starts the span (i.e. inclusive).
* @stable ICU 4.4
*/
public int spanBack(CharSequence s, SpanCondition spanCondition) {
return spanBack(s, s.length(), spanCondition);
return spanBack(s, s.length(), spanCondition);
}
/**
* Span a string backwards (from the fromIndex) using this UnicodeSet.
* If the fromIndex is less than 0, spanBack will return 0.
* If fromIndex is greater than the string length, spanBack will start from the string length.
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
* @param s The string to be spanned
* @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
* @param spanCondition The span condition
@ -3987,6 +4051,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
fromIndex = s.length();
}
if (bmpSet != null) {
// Frozen set without strings, or no string is relevant for spanBack().
return bmpSet.spanBack(s, fromIndex, spanCondition);
}
if (stringSpan != null) {
@ -3994,7 +4059,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
} else if (!strings.isEmpty()) {
int which = (spanCondition == SpanCondition.NOT_CONTAINED)
? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
: UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
: UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
if (strSpan.needsStringSpanUTF16()) {
return strSpan.spanBack(s, fromIndex, spanCondition);
@ -4011,20 +4076,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
if (spanContained != contains(c)) {
break;
}
prev = Character.offsetByCodePoints(s, prev, -1);
prev -= Character.charCount(c);
} while (prev > 0);
return prev;
}
/**
* Clone a thawed version of this class, according to the Freezable interface.
* @return this
* @return the clone, not frozen
* @stable ICU 4.4
*/
public UnicodeSet cloneAsThawed() {
UnicodeSet result = (UnicodeSet) clone();
result.bmpSet = null;
result.stringSpan = null;
UnicodeSet result = new UnicodeSet(this);
assert !result.isFrozen();
return result;
}
@ -4039,6 +4103,80 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// Additional methods for integration with Generics and Collections
// ************************
/**
 * A struct-like class used for iteration through ranges, for faster iteration than by String.
 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
 */
public static class EntryRange {
    /**
     * The starting code point of the range.
     */
    public int codepoint;
    /**
     * The ending code point of the range
     */
    public int codepointEnd;

    /**
     * Formats the range in pattern notation without escaping unprintable characters:
     * a single code point when start and end are the same, otherwise "start-end".
     */
    @Override
    public String toString() {
        StringBuffer text = new StringBuffer();
        _appendToPat(text, codepoint, false);
        if (codepoint != codepointEnd) {
            text.append('-');
            _appendToPat(text, codepointEnd, false);
        }
        return text.toString();
    }
}
/**
 * Provide for faster iteration than by String. Returns an iterable over the code point ranges of this set.
 * The UnicodeSet must not be altered during the iteration.
 * The EntryRange is the same each time; the contents are just reset.
 * <br><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
 *
 * <pre>
 * // Sample code
 * for (EntryRange range : us1.ranges()) {
 *     // do something with code points between range.codepoint and range.codepointEnd;
 * }
 * for (String s : us1.strings()) {
 *     // do something with each string;
 * }
 * </pre>
 *
 * @return a new iterable over the ranges; each call creates a fresh EntryRanges object
 */
public Iterable<EntryRange> ranges() {
return new EntryRanges();
}
/**
 * Iterates over the code point ranges of the enclosing set.
 * The object is both the Iterable and its Iterator: iterator() returns this object,
 * so each instance supports a single pass. Every call to next() returns the same
 * EntryRange object with its fields overwritten.
 */
private class EntryRanges implements Iterable<EntryRange>, Iterator<EntryRange> {
    /** Index into the enclosing set's list of the next range start. */
    int pos;
    /** The one EntryRange that is refilled and returned by every next() call. */
    EntryRange result = new EntryRange();

    public Iterator<EntryRange> iterator() {
        return this;
    }

    public boolean hasNext() {
        // A range needs two entries (start, limit).
        // NOTE(review): presumably list[len-1] is a terminator entry, hence len-1 -- confirm.
        return pos < len-1;
    }

    /**
     * Advances to the next range.
     * @return the shared EntryRange, reset to the next range
     * @throws java.util.NoSuchElementException if there are no more ranges,
     *         as specified by the Iterator contract
     */
    public EntryRange next() {
        if (pos >= len-1) {
            throw new java.util.NoSuchElementException();
        }
        result.codepoint = list[pos++];
        // The list stores the exclusive limit of the range; codepointEnd is inclusive.
        result.codepointEnd = list[pos++]-1;
        return result;
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
* Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
* @see java.util.Set#iterator()
@ -4129,8 +4267,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #containsAll(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public boolean containsAll(Collection<String> collection) {
for (String o : collection) {
public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
for (T o : collection) {
if (!contains(o)) {
return false;
}
@ -4142,8 +4280,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #containsNone(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public boolean containsNone(Collection<String> collection) {
for (String o : collection) {
public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
for (T o : collection) {
if (contains(o)) {
return false;
}
@ -4155,7 +4293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #containsAll(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public final boolean containsSome(Collection<String> collection) {
public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
return !containsNone(collection);
}
@ -4163,9 +4301,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #addAll(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public UnicodeSet addAll(String... collection) {
public <T extends CharSequence> UnicodeSet addAll(T... collection) {
checkFrozen();
for (String str : collection) {
for (T str : collection) {
add(str);
}
return this;
@ -4176,9 +4314,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #removeAll(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public UnicodeSet removeAll(Collection<String> collection) {
public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
checkFrozen();
for (String o : collection) {
for (T o : collection) {
remove(o);
}
return this;
@ -4188,7 +4326,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @see #retainAll(com.ibm.icu.text.UnicodeSet)
* @stable ICU 4.4
*/
public UnicodeSet retainAll(Collection<String> collection) {
public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
checkFrozen();
// TODO optimize
UnicodeSet toRetain = new UnicodeSet();
@ -4277,7 +4415,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @stable ICU 4.4
*/
public static int compare(String string, int codePoint) {
public static int compare(CharSequence string, int codePoint) {
return CharSequences.compare(string, codePoint);
}
@ -4288,7 +4426,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Note that this (=String) order is UTF-16 order -- *not* code point order.
* @stable ICU 4.4
*/
public static int compare(int codePoint, String string) {
public static int compare(int codePoint, CharSequence string) {
return -CharSequences.compare(string, codePoint);
}
@ -4304,7 +4442,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
return compare(collection1.iterator(), collection2.iterator());
}
/**
* Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
* like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
@ -4378,7 +4516,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* </pre>
* @stable ICU 4.4
*/
public Iterable<String> strings() {
public Collection<String> strings() {
return Collections.unmodifiableSortedSet(strings);
}
@ -4417,7 +4555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
* If there is no match, length is returned.
* @internal
* @deprecated This API is ICU internal only.
* @deprecated This API is ICU internal only. Use span instead.
*/
@Deprecated
public int findIn(CharSequence value, int fromIndex, boolean findNot) {
@ -4438,7 +4576,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* If there is no match, -1 is returned.
* BEFORE index is not in the UnicodeSet.
* @internal
* @deprecated This API is ICU internal only.
* @deprecated This API is ICU internal only. Use spanBack instead.
*/
@Deprecated
public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
@ -4460,7 +4598,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
* @return The string after it has been stripped.
* @internal
* @deprecated This API is ICU internal only.
* @deprecated This API is ICU internal only. Use replaceFrom.
*/
@Deprecated
public String stripFrom(CharSequence source, boolean matches) {
@ -4593,6 +4731,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*/
@Deprecated
public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
XSYMBOL_TABLE = xSymbolTable;
}
}

View file

@ -0,0 +1,333 @@
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.util.OutputInt;
/**
 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
 */
public class UnicodeSetSpanner {

    private final UnicodeSet unicodeSet;

    /**
     * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
     * can be used with a non-frozen version to avoid the cost of freezing.
     *
     * @param source
     *            the original UnicodeSet; it is stored by reference, not copied
     */
    public UnicodeSetSpanner(UnicodeSet source) {
        unicodeSet = source;
    }

    /**
     * Returns the UnicodeSet used for processing. It is frozen iff the original was.
     *
     * @return the construction set.
     */
    public UnicodeSet getUnicodeSet() {
        return unicodeSet;
    }

    /**
     * Two spanners are equal iff their UnicodeSets are equal.
     *
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
    }

    /**
     * Returns the hash code of the underlying UnicodeSet, consistent with {@link #equals(Object)}.
     *
     * @see java.lang.Object#hashCode()
     */
    @Override
    public int hashCode() {
        return unicodeSet.hashCode();
    }

    /**
     * Options for replaceFrom and countIn to control how to treat each matched span. The name is from "quantifier"
     * as used in regex, since it is similar to whether one is replacing [abc] by x, or [abc]* by x.
     */
    public enum Quantifier {
        /**
         * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
         * code points.
         */
        SPAN,
        /**
         * Use the smallest number of elements in the spanned range for counting and modification. In other words,
         * the "longest matches" are used where possible. If there are no strings, this will be the same as code
         * points.
         * <p>For example, in the string "abab":
         * <ul>
         * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
         * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
         * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
         * </ul>
         */
        MIN_ELEMENTS,
        // Note: could in the future have an additional option MAX_ELEMENTS
    }

    /**
     * Returns the condition used to step over the text that lies between spans of {@code condition}.
     * Only NOT_CONTAINED is complemented by CONTAINED; every other condition (including SIMPLE) is
     * complemented by NOT_CONTAINED. Pairing SIMPLE with CONTAINED would make both spans "contained"
     * spans, so the scanning loops would never advance past non-matching text.
     */
    private static SpanCondition complementOf(SpanCondition condition) {
        return condition == SpanCondition.NOT_CONTAINED ? SpanCondition.CONTAINED : SpanCondition.NOT_CONTAINED;
    }

    /**
     * Returns the number of matching characters found in a character sequence, counting by
     * Quantifier.MIN_ELEMENTS using SpanCondition.CONTAINED.
     *
     * @param sequence
     *            the sequence to count characters in
     * @return the count. Zero if there are none.
     */
    public int countIn(CharSequence sequence) {
        return countIn(sequence, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
    }

    /**
     * Returns the number of matching characters found in a character sequence, using SpanCondition.CONTAINED.
     *
     * @param sequence
     *            the sequence to count characters in
     * @param quantifier
     *            whether to treat the entire span as a match, or individual elements
     * @return the count. Zero if there are none.
     */
    public int countIn(CharSequence sequence, Quantifier quantifier) {
        return countIn(sequence, quantifier, SpanCondition.CONTAINED);
    }

    /**
     * Returns the number of matching characters found in a character sequence.
     *
     * @param sequence
     *            the sequence to count characters in
     * @param quantifier
     *            whether to treat the entire span as a match (SPAN), or to count the elements within it
     *            (MIN_ELEMENTS)
     * @param countSpan
     *            the spanCondition to use. CONTAINED means only count the code points in the CONTAINED span;
     *            NOT_CONTAINED is the reverse.
     * @return the count. Zero if there are none.
     */
    public int countIn(CharSequence sequence, Quantifier quantifier, SpanCondition countSpan) {
        int count = 0;
        int start = 0;
        final SpanCondition skipSpan = complementOf(countSpan);
        final int length = sequence.length();
        OutputInt spanCount = new OutputInt();
        while (start != length) {
            // Step over the text that is not to be counted.
            int endSkipped = unicodeSet.span(sequence, start, skipSpan);
            if (endSkipped == length) {
                break;
            }
            // Measure the span to be counted; spanCount receives the number of elements in it.
            start = unicodeSet.spanAndCount(sequence, endSkipped, countSpan, spanCount);
            count += quantifier == Quantifier.SPAN ? 1 : spanCount.value;
        }
        return count;
    }

    /**
     * Delete all the matching spans in sequence, using SpanCondition.CONTAINED.
     *
     * @param sequence
     *            charsequence to replace matching spans in.
     * @return modified string.
     */
    public String deleteFrom(CharSequence sequence) {
        return replaceFrom(sequence, "", Quantifier.SPAN, SpanCondition.CONTAINED);
    }

    /**
     * Delete all matching spans in sequence, according to modifySpan.
     *
     * @param sequence
     *            charsequence to replace matching spans in.
     * @param modifySpan
     *            specify whether to modify the matching spans (CONTAINED) or the non-matching (NOT_CONTAINED)
     * @return modified string.
     */
    public String deleteFrom(CharSequence sequence, SpanCondition modifySpan) {
        return replaceFrom(sequence, "", Quantifier.SPAN, modifySpan);
    }

    /**
     * Replace all matching spans in sequence by the replacement,
     * counting by Quantifier.MIN_ELEMENTS using SpanCondition.CONTAINED.
     *
     * @param sequence
     *            charsequence to replace matching spans in.
     * @param replacement
     *            replacement sequence. To delete, use ""
     * @return modified string.
     */
    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
        return replaceFrom(sequence, replacement, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
    }

    /**
     * Replace all matching spans in sequence by replacement, according to the Quantifier, using
     * SpanCondition.CONTAINED.
     *
     * @param sequence
     *            charsequence to replace matching spans in.
     * @param replacement
     *            replacement sequence. To delete, use ""
     * @param quantifier
     *            whether to treat the entire span as a match, or individual elements
     * @return modified string.
     */
    public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier) {
        return replaceFrom(sequence, replacement, quantifier, SpanCondition.CONTAINED);
    }

    /**
     * Replace all matching spans in sequence by replacement, according to the operations quantifier and modifySpan.
     *
     * @param sequence
     *            charsequence to replace matching spans in.
     * @param replacement
     *            replacement sequence. To delete, use ""
     * @param quantifier
     *            specify whether to replace each span once (SPAN) or once per element in the span (MIN_ELEMENTS).
     * @param modifySpan
     *            specify whether to modify the matching spans (CONTAINED) or the non-matching
     *            (NOT_CONTAINED)
     * @return modified string.
     */
    public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier,
            SpanCondition modifySpan) {
        final SpanCondition copySpan = complementOf(modifySpan);
        final boolean remove = replacement.length() == 0;
        // TODO, we can optimize this to avoid this allocation unless needed
        StringBuilder result = new StringBuilder();
        final int length = sequence.length();
        OutputInt spanCount = new OutputInt();
        for (int endCopy = 0; endCopy != length;) {
            int endModify = unicodeSet.spanAndCount(sequence, endCopy, modifySpan, spanCount);
            if (remove || endModify == 0) {
                // Nothing to emit: either we are deleting, or the sequence does not
                // start with text to modify (empty leading span).
            } else if (quantifier == Quantifier.SPAN) {
                result.append(replacement);
            } else {
                for (int i = spanCount.value; i > 0; --i) {
                    result.append(replacement);
                }
            }
            if (endModify == length) {
                break;
            }
            // Copy the following unmodified text through unchanged.
            endCopy = unicodeSet.span(sequence, endModify, copySpan);
            result.append(sequence.subSequence(endModify, endCopy));
        }
        return result.toString();
    }

    /**
     * Options for the trim() method.
     */
    public enum TrimOption {
        /**
         * Trim leading spans (subject to INVERT).
         */
        LEADING,
        /**
         * Trim leading and trailing spans (subject to INVERT).
         */
        BOTH,
        /**
         * Trim trailing spans (subject to INVERT).
         */
        TRAILING;
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start
     * and end of the string, using TrimOption.BOTH and SpanCondition.CONTAINED. For example:
     *
     * <pre>
     * {@code
     *
     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab")}
     * </pre>
     *
     * ... returns {@code "cat"}.
     *
     * @param sequence
     *            the sequence to trim
     * @return a subsequence
     */
    public CharSequence trim(CharSequence sequence) {
        return trim(sequence, TrimOption.BOTH, SpanCondition.CONTAINED);
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start or
     * end of the string, using the trimOption and SpanCondition.CONTAINED. For example:
     *
     * <pre>
     * {@code
     *
     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab", TrimOption.LEADING)}
     * </pre>
     *
     * ... returns {@code "catbab"}.
     *
     * @param sequence
     *            the sequence to trim
     * @param trimOption
     *            LEADING, TRAILING, or BOTH
     * @return a subsequence
     */
    public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
        return trim(sequence, trimOption, SpanCondition.CONTAINED);
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start or
     * end of the string, depending on the trimOption and modifySpan. For example:
     *
     * <pre>
     * {@code
     *
     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab", TrimOption.LEADING, SpanCondition.CONTAINED)}
     * </pre>
     *
     * ... returns {@code "catbab"}.
     *
     * @param sequence
     *            the sequence to trim
     * @param trimOption
     *            LEADING, TRAILING, or BOTH
     * @param modifySpan
     *            CONTAINED or NOT_CONTAINED
     * @return a subsequence; the original sequence itself if nothing was trimmed, or "" if everything was
     */
    public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition modifySpan) {
        int endLeadContained, startTrailContained;
        final int length = sequence.length();
        if (trimOption != TrimOption.TRAILING) {
            endLeadContained = unicodeSet.span(sequence, modifySpan);
            if (endLeadContained == length) {
                return "";
            }
        } else {
            endLeadContained = 0;
        }
        if (trimOption != TrimOption.LEADING) {
            startTrailContained = unicodeSet.spanBack(sequence, modifySpan);
        } else {
            startTrailContained = length;
        }
        if (startTrailContained < endLeadContained) {
            // The leading and trailing spans overlap, which can happen when the set contains
            // overlapping strings (e.g. {ab} and {ba} on "aba"). Nothing is left between them;
            // return empty rather than let subSequence() throw.
            return "";
        }
        return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
                endLeadContained, startTrailContained);
    }
}

View file

@ -0,0 +1,58 @@
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.util;
/**
 * Minimal mutable holder for an {@code int}, intended for use as an output parameter.
 * Serves the same purpose as <code>Output&lt;Integer&gt;</code> while avoiding auto-boxing.
 *
 * @internal but could become public
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public class OutputInt {

    /**
     * The held value; read and written directly by callers.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public int value;

    /**
     * Creates a holder whose value is 0.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public OutputInt() {
        this(0);
    }

    /**
     * Creates a holder with the supplied starting value.
     *
     * @param value the initial value
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public OutputInt(int value) {
        this.value = value;
    }

    /**
     * Returns the decimal string form of the held value.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    @Override
    public String toString() {
        return String.valueOf(value);
    }
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -13,6 +13,7 @@ import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UTF16.StringComparator;
/**
* Testing class for UTF16
@ -1560,6 +1561,39 @@ public final class UTF16Test extends TestFmwk
}
}
/**
 * Checks UTF16.getSingleCodePoint and UTF16.compareCodePoint against reference results:
 * the single-code-point value is derived from UTF16.countCodePoint and String.codePointAt,
 * and the comparison sign is derived from a code-point-order UTF16.StringComparator.
 * A null entry in the data is passed through to the methods under test and treated as ""
 * when computing the reference values.
 */
public void TestUtilities() {
    // The supplementary character U+1F600 is written with escapes so that the test data
    // does not depend on the encoding used to read or compile this source file.
    String[] tests = {
            "a",
            "\uFFFF",
            "\uD83D\uDE00",
            "\uD800",
            "\uDC00",
            "\uDBFF\uDfff",
            "",
            "\u0000",
            "\uDC00\uD800",
            "ab",
            "\uD83D\uDE00a",
            null,
    };
    StringComparator sc = new UTF16.StringComparator(true,false,0);
    for (String item1 : tests) {
        String nonNull1 = item1 == null ? "" : item1;
        int count = UTF16.countCodePoint(nonNull1);
        // Only strings of exactly one code point have a single-code-point value; others map to -1.
        int expected = count != 1 ? -1 : nonNull1.codePointAt(0);
        assertEquals("codepoint test " + Utility.hex(nonNull1), expected, UTF16.getSingleCodePoint(item1));
        if (expected == -1) {
            continue;
        }
        // item1 is a single code point: compare it, as an int, against every test string.
        for (String item2 : tests) {
            String nonNull2 = item2 == null ? "" : item2;
            int scValue = Integer.signum(sc.compare(nonNull1, nonNull2));
            int fValue = Integer.signum(UTF16.compareCodePoint(expected, item2));
            assertEquals("comparison " + Utility.hex(nonNull1) + ", " + Utility.hex(nonNull2), scValue, fValue);
        }
    }
}
public void TestNewString() {
final int[] codePoints = {
UCharacter.toCodePoint(UCharacter.MIN_HIGH_SURROGATE, UCharacter.MAX_LOW_SURROGATE),
@ -1568,6 +1602,7 @@ public final class UTF16Test extends TestFmwk
'A',
-1,
};
final String cpString = "" +
UCharacter.MIN_HIGH_SURROGATE +

View file

@ -1,17 +1,19 @@
/*
*******************************************************************************
* Copyright (C) 2009-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.lang;
import java.util.Collection;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.OutputInt;
/**
* @test
@ -41,7 +43,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
}
pos = set.span(string, 1, SpanCondition.SIMPLE);
if (pos != 3) {
errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)",
set.toString(), string, pos));
}
}
@ -129,33 +131,15 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
// more complex test. --------------------------------------------------------
// Make the strings in a UnicodeSet easily accessible.
static class UnicodeSetWithStrings {
private static class UnicodeSetWithStrings {
private UnicodeSet set;
private String strings[];
private Collection<String> setStrings;
private int stringsLength;
private boolean hasSurrogates;
public UnicodeSetWithStrings(final UnicodeSet normalSet) {
set = normalSet;
stringsLength = 0;
hasSurrogates = false;
strings = new String[20];
int size = set.size();
if (size > 0 && set.charAt(size - 1) < 0) {
// If a set's last element is not a code point, then it must contain strings.
// Iterate over the set, skip all code point ranges, and cache the strings.
UnicodeSetIterator iter = new UnicodeSetIterator(set);
while (iter.nextRange() && stringsLength < strings.length) {
if (iter.codepoint == UnicodeSetIterator.IS_STRING) {
// Store the pointer to the set's string element
// which we happen to know is a stable pointer.
strings[stringsLength] = iter.getString();
++stringsLength;
}
}
}
setStrings = normalSet.strings();
stringsLength = setStrings.size();
}
public final UnicodeSet getSet() {
@ -166,34 +150,9 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
return (stringsLength > 0);
}
public boolean hasStringsWithSurrogates() {
return hasSurrogates;
public Iterable<String> strings() {
return setStrings;
}
}
static class UnicodeSetWithStringsIterator {
private UnicodeSetWithStrings fSet;
private int nextStringIndex;
public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) {
fSet = set;
nextStringIndex = 0;
}
public void reset() {
nextStringIndex = 0;
}
public final String nextString() {
if (nextStringIndex < fSet.stringsLength) {
return fSet.strings[nextStringIndex++];
} else {
return null;
}
}
}
// Compare 16-bit Unicode strings (which may be malformed UTF-16)
@ -231,7 +190,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
}
return prev;
} else if (spanCondition == SpanCondition.NOT_CONTAINED) {
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
int c;
int start, next;
for (start = next = 0; start < length;) {
@ -240,9 +198,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
if (realSet.contains(c)) {
break;
}
String str;
iter.reset();
while ((str = iter.nextString()) != null) {
for (String str : set.strings()) {
if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
// spanNeedsStrings=true;
return start;
@ -252,7 +208,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
}
return start;
} else /* CONTAINED or SIMPLE */{
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
int c;
int start, next, maxSpanLimit = 0;
for (start = next = 0; start < length;) {
@ -261,9 +216,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
if (!realSet.contains(c)) {
next = start; // Do not span this single, not-contained code point.
}
String str;
iter.reset();
while ((str = iter.nextString()) != null) {
for (String str : set.strings()) {
if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
// spanNeedsStrings=true;
int matchLimit = start + str.length();
@ -336,7 +289,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
} while (prev > 0);
return prev;
} else if (spanCondition == SpanCondition.NOT_CONTAINED) {
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
int c;
int prev = length, length0 = length;
do {
@ -344,9 +296,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
if (realSet.contains(c)) {
break;
}
String str;
iter.reset();
while ((str = iter.nextString()) != null) {
for (String str : set.strings()) {
if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
// spanNeedsStrings=true;
return prev;
@ -356,7 +306,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
} while (prev > 0);
return prev;
} else /* SpanCondition.CONTAINED or SIMPLE */{
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
int c;
int prev = length, minSpanStart = length, length0 = length;
do {
@ -365,9 +314,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
if (!realSet.contains(c)) {
length = prev; // Do not span this single, not-contained code point.
}
String str;
iter.reset();
while ((str = iter.nextString()) != null) {
for (String str : set.strings()) {
if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
// spanNeedsStrings=true;
int matchStart = prev - str.length();
@ -616,7 +563,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
* input expectCount<0).
*/
void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
int expectLimits[], int expectCount, // TODO
int expectLimits[], int expectCount,
final String testName, int index) {
int[] limits = new int[500];
int limitsCount;
@ -1129,4 +1076,54 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
}
}
/**
 * Exercises UnicodeSet.spanAndCount() on three sets, first unfrozen and then frozen,
 * checking both the returned span limit and the element count written to the OutputInt.
 */
public void TestSpanAndCount() {
    // a set with no strings
    UnicodeSet lettersAbc = new UnicodeSet('a', 'c');
    // a set with an "irrelevant" string (fully contained in the code point set)
    UnicodeSet lineEndings = new UnicodeSet().add('\n').add('\r').add("\r\n");
    // a set with no "irrelevant" string but some interesting overlaps
    UnicodeSet overlapping = new UnicodeSet().add('a').add("ab").add("abc").add("cd");
    String text = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde";
    OutputInt elements = new OutputInt();

    // Pass 0 runs against the mutable sets; pass 1 freezes them and repeats the same checks.
    for (int pass = 0; pass < 2; ++pass) {
        String suffix = "";
        if (pass == 1) {
            lettersAbc.freeze();
            lineEndings.freeze();
            overlapping.freeze();
            suffix = " (frozen)";
        }

        int limit = lettersAbc.spanAndCount(text, 8, SpanCondition.SIMPLE, elements);
        assertEquals("abc span[8, 11[" + suffix, 11, limit);
        assertEquals("abc count=3" + suffix, 3, elements.value);

        limit = lettersAbc.spanAndCount(text, 2, SpanCondition.NOT_CONTAINED, elements);
        assertEquals("no abc span[2, 8[" + suffix, 8, limit);
        assertEquals("no abc count=5" + suffix, 5, elements.value);

        limit = lineEndings.spanAndCount(text, 2, SpanCondition.CONTAINED, elements);
        assertEquals("line endings span[2, 6[" + suffix, 6, limit);
        assertEquals("line endings count=3" + suffix, 3, elements.value);

        limit = overlapping.spanAndCount(text, 2, SpanCondition.NOT_CONTAINED, elements);
        assertEquals("no ab+cd span[2, 8[" + suffix, 8, limit);
        assertEquals("no ab+cd count=5" + suffix, 5, elements.value);

        limit = overlapping.spanAndCount(text, 8, SpanCondition.CONTAINED, elements);
        assertEquals("ab+cd span[8, 12[" + suffix, 12, limit);
        assertEquals("ab+cd count=2" + suffix, 2, elements.value);

        limit = overlapping.spanAndCount(text, 8, SpanCondition.SIMPLE, elements);
        assertEquals("1x abc span[8, 11[" + suffix, 11, limit);
        assertEquals("1x abc count=1" + suffix, 1, elements.value);
    }
}
}

View file

@ -11,6 +11,7 @@ import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@ -22,6 +23,7 @@ import java.util.SortedSet;
import java.util.TreeSet;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
@ -33,6 +35,11 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.ComparisonStyle;
import com.ibm.icu.text.UnicodeSet.EntryRange;
import com.ibm.icu.text.UnicodeSetSpanner;
import com.ibm.icu.text.UnicodeSetSpanner.Quantifier;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSetSpanner.TrimOption;
import com.ibm.icu.text.UnicodeSetIterator;
/**
@ -1256,10 +1263,10 @@ public class UnicodeSetTest extends TestFmwk {
String pat = "";
try {
String name =
(j==0) ? UScript.getName(i) : UScript.getShortName(i);
pat = "[:" + name + ":]";
UnicodeSet set = new UnicodeSet(pat);
logln("Ok: " + pat + " -> " + set.toPattern(false));
(j==0) ? UScript.getName(i) : UScript.getShortName(i);
pat = "[:" + name + ":]";
UnicodeSet set = new UnicodeSet(pat);
logln("Ok: " + pat + " -> " + set.toPattern(false));
} catch (IllegalArgumentException e) {
if (pat.length() == 0) {
errln("FAIL (in UScript): No name for script " + i);
@ -1330,9 +1337,9 @@ public class UnicodeSetTest extends TestFmwk {
// The following pattern must contain at least one range "c-d"
// where c or d is a Pattern_White_Space.
String pattern =
"[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
"[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
String exp =
"[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
"[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
// We test this with two passes; in the second pass we
// pre-unescape the pattern. Since U+200E is Pattern_White_Space,
// this fails -- which is what we expect.
@ -1563,7 +1570,7 @@ public class UnicodeSetTest extends TestFmwk {
mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet<String>()));
assertEquals("remove all", mod1, mod2);
}
public void TestComparison() {
UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze();
UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze();
@ -1579,7 +1586,7 @@ public class UnicodeSetTest extends TestFmwk {
List<UnicodeSet> sorted = new ArrayList(new TreeSet<UnicodeSet>(unsorted));
assertNotEquals("compareTo-shorter-first", unsorted, sorted);
assertEquals("compareTo-shorter-first", goalShortest, sorted);
TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
public int compare(UnicodeSet o1, UnicodeSet o2) {
// TODO Auto-generated method stub
@ -1616,34 +1623,34 @@ public class UnicodeSetTest extends TestFmwk {
// now compare all the combinations. If any of them is a code point, use it.
int maxErrorCount = 0;
compare:
for (String last : target) {
for (String curr : target) {
int lastCount = Character.codePointCount(last, 0, last.length());
int currCount = Character.codePointCount(curr, 0, curr.length());
int comparison;
if (lastCount == 1) {
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
} else if (currCount == 1) {
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
} else {
continue;
}
if (comparison != last.compareTo(curr)) {
// repeat for debugging
for (String last : target) {
for (String curr : target) {
int lastCount = Character.codePointCount(last, 0, last.length());
int currCount = Character.codePointCount(curr, 0, curr.length());
int comparison;
if (lastCount == 1) {
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
} else if (currCount == 1) {
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
} else {
continue;
}
if (maxErrorCount++ > 10) {
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others...");
break compare;
if (comparison != last.compareTo(curr)) {
// repeat for debugging
if (lastCount == 1) {
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
} else if (currCount == 1) {
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
}
if (maxErrorCount++ > 10) {
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others...");
break compare;
}
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr);
}
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr);
}
}
}
//compare(Iterable<T>, Iterable<T>)
int max = 10;
List<String> test1 = new ArrayList<String>(max);
@ -1669,7 +1676,7 @@ public class UnicodeSetTest extends TestFmwk {
// check to make sure right exceptions are thrown
Class expected = IllegalArgumentException.class;
Class actual;
try {
actual = null;
@SuppressWarnings("unused")
@ -1678,7 +1685,7 @@ public class UnicodeSetTest extends TestFmwk {
actual = e.getClass();
}
assertEquals("exception if odd", expected, actual);
try {
actual = null;
@SuppressWarnings("unused")
@ -1687,7 +1694,7 @@ public class UnicodeSetTest extends TestFmwk {
actual = e.getClass();
}
assertEquals("exception for start/end problem", expected, actual);
try {
actual = null;
@SuppressWarnings("unused")
@ -1696,7 +1703,7 @@ public class UnicodeSetTest extends TestFmwk {
actual = e.getClass();
}
assertEquals("exception for end/start problem", expected, actual);
CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]"));
CheckRangeSpeed(1000, new UnicodeSet("[:letter:]"));
}
@ -1731,14 +1738,14 @@ public class UnicodeSetTest extends TestFmwk {
double rangeConstructorTime = (middle - start)/iterations;
double patternConstructorTime = (end - middle)/iterations;
String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t"
+ percent.format(rangeConstructorTime/patternConstructorTime-1);
+ percent.format(rangeConstructorTime/patternConstructorTime-1);
if (rangeConstructorTime < 2*patternConstructorTime) {
logln(message);
} else {
errln(message);
}
}
NumberFormat percent = NumberFormat.getPercentInstance();
{
percent.setMaximumFractionDigits(2);
@ -1806,69 +1813,69 @@ public class UnicodeSetTest extends TestFmwk {
}
}
// The following code block is commented out to eliminate PrettyPrinter dependencies
// The following code block is commented out to eliminate PrettyPrinter dependencies
// String[] prettyData = {
// "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case
// "[:any:]",
// "[:whitespace:]",
// "[:linebreak=AL:]",
// };
//
// public void TestPrettyPrinting() {
// try{
// PrettyPrinter pp = new PrettyPrinter();
//
// int i = 0;
// for (; i < prettyData.length; ++i) {
// UnicodeSet test = new UnicodeSet(prettyData[i]);
// checkPrettySet(pp, i, test);
// }
// Random random = new Random(0);
// UnicodeSet test = new UnicodeSet();
//
// // To keep runtimes under control, make the number of random test cases
// // to try depend on the test framework's exhaustive setting.
// // params.inclusions = 5: default exhaustive value
// // params.inclusions = 10: max exhaustive value.
// int iterations = 50;
// if (params.inclusion > 5) {
// iterations = (params.inclusion-5) * 200;
// }
// for (; i < iterations; ++i) {
// double start = random.nextGaussian() * 0x10000;
// if (start < 0) start = - start;
// if (start > 0x10FFFF) {
// start = 0x10FFFF;
// }
// double end = random.nextGaussian() * 0x100;
// if (end < 0) end = -end;
// end = start + end;
// if (end > 0x10FFFF) {
// end = 0x10FFFF;
// }
// test.complement((int)start, (int)end);
// checkPrettySet(pp, i, test);
// }
// }catch(RuntimeException ex){
// warnln("Could not load Collator");
// }
// }
//
// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) {
// String pretty = pp.toPattern(test);
// UnicodeSet retry = new UnicodeSet(pretty);
// if (!test.equals(retry)) {
// errln(i + ". Failed test: " + test + " != " + pretty);
// } else {
// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty));
// }
// }
//
// private String truncate(String string) {
// if (string.length() <= 100) return string;
// return string.substring(0,97) + "...";
// }
// String[] prettyData = {
// "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case
// "[:any:]",
// "[:whitespace:]",
// "[:linebreak=AL:]",
// };
//
// public void TestPrettyPrinting() {
// try{
// PrettyPrinter pp = new PrettyPrinter();
//
// int i = 0;
// for (; i < prettyData.length; ++i) {
// UnicodeSet test = new UnicodeSet(prettyData[i]);
// checkPrettySet(pp, i, test);
// }
// Random random = new Random(0);
// UnicodeSet test = new UnicodeSet();
//
// // To keep runtimes under control, make the number of random test cases
// // to try depend on the test framework's exhaustive setting.
// // params.inclusions = 5: default exhaustive value
// // params.inclusions = 10: max exhaustive value.
// int iterations = 50;
// if (params.inclusion > 5) {
// iterations = (params.inclusion-5) * 200;
// }
// for (; i < iterations; ++i) {
// double start = random.nextGaussian() * 0x10000;
// if (start < 0) start = - start;
// if (start > 0x10FFFF) {
// start = 0x10FFFF;
// }
// double end = random.nextGaussian() * 0x100;
// if (end < 0) end = -end;
// end = start + end;
// if (end > 0x10FFFF) {
// end = 0x10FFFF;
// }
// test.complement((int)start, (int)end);
// checkPrettySet(pp, i, test);
// }
// }catch(RuntimeException ex){
// warnln("Could not load Collator");
// }
// }
//
// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) {
// String pretty = pp.toPattern(test);
// UnicodeSet retry = new UnicodeSet(pretty);
// if (!test.equals(retry)) {
// errln(i + ". Failed test: " + test + " != " + pretty);
// } else {
// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty));
// }
// }
//
// private String truncate(String string) {
// if (string.length() <= 100) return string;
// return string.substring(0,97) + "...";
// }
public class TokenSymbolTable implements SymbolTable {
HashMap contents = new HashMap();
@ -1944,7 +1951,7 @@ public class UnicodeSetTest extends TestFmwk {
UnicodeSet set = new UnicodeSet(DATA[i]);
expectContainment(set,
CharsToUnicodeString("abc\\U00010000"),
"\uD800;\uDC00"); // split apart surrogate-pair
"\uD800;\uDC00"); // split apart surrogate-pair
if (set.size() != 4) {
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
set.size() + ", expected 4"));
@ -2171,16 +2178,16 @@ public class UnicodeSetTest extends TestFmwk {
// Now see if the expected relation is true
int status = (minus12.size() != 0 ? 4 : 0)
| (intersection.size() != 0 ? 2 : 0)
| (minus21.size() != 0 ? 1 : 0);
| (intersection.size() != 0 ? 2 : 0)
| (minus21.size() != 0 ? 1 : 0);
if (status != relation) {
errln("FAIL relation incorrect" + message
+ "; desired = " + RELATION_NAME[relation]
+ "; found = " + RELATION_NAME[status]
+ "; set1 = " + set1.toPattern(true)
+ "; set2 = " + set2.toPattern(true)
);
+ "; found = " + RELATION_NAME[status]
+ "; set1 = " + set1.toPattern(true)
+ "; set2 = " + set2.toPattern(true)
);
}
}
@ -2234,7 +2241,7 @@ public class UnicodeSetTest extends TestFmwk {
errln("FAIL " + message
+ "; source = " + s.toPattern(true)
+ "; result = " + t.toPattern(true)
);
);
return false;
}
return true;
@ -2379,7 +2386,7 @@ public class UnicodeSetTest extends TestFmwk {
errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception.");
}
}
/* Tests the method public UnicodeSet add(Collection<?> source) */
public void TestAddCollection() {
UnicodeSet us = new UnicodeSet();
@ -2390,9 +2397,99 @@ public class UnicodeSetTest extends TestFmwk {
} catch (Exception e) {
}
}
/**
 * Verifies that the predefined {@code UnicodeSet.EMPTY} and {@code UnicodeSet.ALL_CODE_POINTS}
 * constants compare equal to freshly constructed sets with the same contents.
 */
public void TestConstants() {
    // Reference sets built the long way, to compare against the constants.
    final UnicodeSet freshEmpty = new UnicodeSet();
    final UnicodeSet freshAll = new UnicodeSet(0, 0x10FFFF);

    assertEquals("Empty", freshEmpty, UnicodeSet.EMPTY);
    assertEquals("All", freshAll, UnicodeSet.ALL_CODE_POINTS);
}
/**
 * Exercises {@code UnicodeSet.ranges()} and {@code UnicodeSet.strings()}.
 *
 * <p>First checks the joined form of the ranges of a small set, then, for several patterns,
 * walks a {@code UnicodeSetIterator} via {@code nextRange()} in lock-step with the two
 * iterables and checks that both report the same ranges and strings, and that the iterator
 * is exhausted afterwards.
 */
public void TestIteration() {
    UnicodeSet us1 = new UnicodeSet("[abcM{xy}]");
    assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", "));

    // Sample code
    for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) {
        // do something with code points between range.codepoint and range.codepointEnd;
    }
    for (@SuppressWarnings("unused") String s : us1.strings()) {
        // do something with each string;
    }

    String[] tests = {
        "[M-Qzab{XY}{ZW}]",
        "[]",
        "[a]",
        "[a-c]",
        "[{XY}]",
    };
    for (String test : tests) {
        UnicodeSet us = new UnicodeSet(test);
        UnicodeSetIterator it = new UnicodeSetIterator(us);
        for (EntryRange range : us.ranges()) {
            final String title = range.toString();
            logln(title);
            // The iterator must still have an element for every range the iterable yields;
            // otherwise the field comparisons below would look at stale iterator state.
            assertTrue(title + ": iterator has a corresponding range", it.nextRange());
            assertEquals(title, it.codepoint, range.codepoint);
            assertEquals(title, it.codepointEnd, range.codepointEnd);
        }
        for (String s : us.strings()) {
            // Same lock-step check for the strings that follow the ranges.
            assertTrue("strings: iterator has a corresponding string", it.nextRange());
            assertEquals("strings", it.string, s);
        }
        // Nothing may be left once all ranges and strings have been consumed.
        assertFalse("", it.next());
    }
}
/**
 * Exercises {@code deleteFrom}, {@code trim} and {@code replaceFrom} of
 * {@code UnicodeSetSpanner} against a punctuation set, case-property sets and a set that
 * contains only the string "ab".
 */
public void TestReplaceAndDelete() {
    // Input in which every letter is surrounded by runs of '_' and '.'.
    final String padded = "_._a_._b_._c_._";

    // Set of the two padding characters.
    final UnicodeSetSpanner padding = new UnicodeSetSpanner(new UnicodeSet("[._]"));
    assertEquals("", "abc", padding.deleteFrom(padded));
    assertEquals("", "_.__.__.__._", padding.deleteFrom(padded, SpanCondition.NOT_CONTAINED));

    assertEquals("", "a_._b_._c", padding.trim(padded));
    assertEquals("", "a_._b_._c_._", padding.trim(padded, TrimOption.LEADING));
    assertEquals("", "_._a_._b_._c", padding.trim(padded, TrimOption.TRAILING));

    assertEquals("", "a??b??c", padding.replaceFrom("a_._b_._c", "??", Quantifier.SPAN));
    assertEquals("", "a??b??c", padding.replaceFrom(padding.trim(padded), "??", Quantifier.SPAN));
    // Per the expected values: without a quantifier each padding character gets its own
    // replacement, with Quantifier.SPAN each run gets a single one.
    assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", padding.replaceFrom(padded, "XY"));
    assertEquals("", "XYaXYbXYcXY", padding.replaceFrom(padded, "XY", Quantifier.SPAN));

    // Keep only uppercase characters.
    final UnicodeSetSpanner upper = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}"));
    assertEquals("", "TQBF", upper.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));

    // Extend the set obtained from the previous spanner with lowercase characters.
    final UnicodeSetSpanner cased = new UnicodeSetSpanner(
            upper.getUnicodeSet().addAll(new UnicodeSet("\\p{lowercase}")));
    assertEquals("", "TheQuickBrownFox", cased.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));

    // Set whose only element is the string "ab".
    final UnicodeSetSpanner abOnly = new UnicodeSetSpanner(new UnicodeSet("[{ab}]"));
    assertEquals("", "XXc acb", abOnly.replaceFrom("ababc acb", "X"));
    assertEquals("", "Xc acb", abOnly.replaceFrom("ababc acb", "X", Quantifier.SPAN));
}
/**
 * Runs {@code checkCodePoints} on a combining-mark sequence and on two pairs of
 * supplementary code points, each time expecting a single match under
 * {@code Quantifier.MIN_ELEMENTS} and the default expected replacement.
 */
public void TestCodePoints() {
    // Each row is {string put into the set, string appended after it in the input}.
    final String[][] cases = {
        {"x\u0308", "z\u0308"},
        {"𣿡", "𣿢"},
        {"👦", "👧"},
    };
    for (String[] pair : cases) {
        checkCodePoints(pair[0], pair[1], Quantifier.MIN_ELEMENTS, null, 1);
    }
}
/**
 * Builds a spanner over the set containing only the string {@code a}, then checks
 * {@code countIn} and {@code replaceFrom} on the concatenation {@code a + b}.
 *
 * @param a                string that is the sole element of the set
 * @param b                string appended to {@code a} to form the input
 * @param quantifier       quantifier passed to both {@code countIn} and {@code replaceFrom}
 * @param expectedReplaced expected result of replacing with "-"; when {@code null},
 *                         {@code "-" + b} is expected
 * @param expectedCount    expected result of {@code countIn}
 */
private void checkCodePoints(String a, String b, Quantifier quantifier, String expectedReplaced, int expectedCount) {
    final String input = a + b;
    // Shared prefix of both assertion messages.
    final String spannerLabel = "new UnicodeSetSpanner(\"[{" + a + "}]\")";
    final UnicodeSetSpanner spanner = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));

    final int actualCount = spanner.countIn(input, quantifier);
    assertEquals(spannerLabel + ".countIn(\"" + input + "\")", expectedCount, actualCount);

    // Fall back to the default expectation when the caller passes none.
    final String wantedReplacement = (expectedReplaced != null) ? expectedReplaced : "-" + b;
    final String actualReplacement = spanner.replaceFrom(input, "-", quantifier);
    assertEquals(spannerLabel + ".replaceFrom(\"" + input + "\", \"-\")", wantedReplacement, actualReplacement);
}
}