mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-9131 update trunk from branch, after fixes as per core review.
X-SVN-Rev: 36187
This commit is contained in:
parent
b31ff49acf
commit
f7c551d636
9 changed files with 1408 additions and 453 deletions
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2011, International Business Machines
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -10,23 +10,25 @@
|
|||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
||||
import com.ibm.icu.util.OutputInt;
|
||||
|
||||
/*
|
||||
/**
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
|
||||
*
|
||||
* Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 3-byte characters: Use zero/one/mixed data
|
||||
* per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. Supplementary characters: Call contains() on the
|
||||
* parent set.
|
||||
* Latin-1: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
|
||||
* Supplementary characters: Call contains() on the parent set.
|
||||
*/
|
||||
public final class BMPSet {
|
||||
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
|
||||
|
||||
/*
|
||||
/**
|
||||
* One boolean ('true' or 'false') per Latin-1 character.
|
||||
*/
|
||||
private boolean[] latin1Contains;
|
||||
|
||||
/*
|
||||
/**
|
||||
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
|
||||
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
|
||||
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
|
||||
|
@ -36,7 +38,7 @@ public final class BMPSet {
|
|||
*/
|
||||
private int[] table7FF;
|
||||
|
||||
/*
|
||||
/**
|
||||
* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
|
||||
* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
|
||||
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
|
||||
|
@ -48,14 +50,14 @@ public final class BMPSet {
|
|||
*/
|
||||
private int[] bmpBlockBits;
|
||||
|
||||
/*
|
||||
/**
|
||||
* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
|
||||
* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
|
||||
* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
|
||||
*/
|
||||
private int[] list4kStarts;
|
||||
|
||||
/*
|
||||
/**
|
||||
* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
|
||||
* supplementary code points. The list is terminated with list[listLength-1]=0x110000.
|
||||
*/
|
||||
|
@ -120,22 +122,24 @@ public final class BMPSet {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
|
||||
* spanCondition==0 or 1.
|
||||
*
|
||||
* @param start The start index
|
||||
* @param end The end index
|
||||
* @return The length of the span.
|
||||
* @param outCount If not null: Receives the number of code points in the span.
|
||||
* @return the limit (exclusive end) of the span
|
||||
*
|
||||
* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
|
||||
* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
|
||||
* as usual in ICU.
|
||||
*/
|
||||
public final int span(CharSequence s, int start, int end, SpanCondition spanCondition) {
|
||||
public final int span(CharSequence s, int start, SpanCondition spanCondition,
|
||||
OutputInt outCount) {
|
||||
char c, c2;
|
||||
int i = start;
|
||||
int limit = Math.min(s.length(), end);
|
||||
int limit = s.length();
|
||||
int numSupplementary = 0;
|
||||
if (SpanCondition.NOT_CONTAINED != spanCondition) {
|
||||
// span
|
||||
while (i < limit) {
|
||||
|
@ -170,6 +174,7 @@ public final class BMPSet {
|
|||
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++numSupplementary;
|
||||
++i;
|
||||
}
|
||||
++i;
|
||||
|
@ -208,15 +213,20 @@ public final class BMPSet {
|
|||
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++numSupplementary;
|
||||
++i;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
return i - start;
|
||||
if (outCount != null) {
|
||||
int spanLength = i - start;
|
||||
outCount.value = spanLength - numSupplementary; // number of code points
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Symmetrical with span().
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
|
||||
* limit and spanCondition==0 or 1.
|
||||
|
@ -226,7 +236,6 @@ public final class BMPSet {
|
|||
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
|
||||
char c, c2;
|
||||
|
||||
limit = Math.min(s.length(), limit);
|
||||
if (SpanCondition.NOT_CONTAINED != spanCondition) {
|
||||
// span
|
||||
for (;;) {
|
||||
|
@ -311,7 +320,7 @@ public final class BMPSet {
|
|||
return limit + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
|
||||
*/
|
||||
private static void set32x64Bits(int[] table, int start, int limit) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2012, International Business Machines
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -13,6 +13,7 @@ import java.util.ArrayList;
|
|||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
||||
import com.ibm.icu.util.OutputInt;
|
||||
|
||||
/*
|
||||
* Implement span() etc. for a set with strings.
|
||||
|
@ -22,54 +23,69 @@ import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
|||
public class UnicodeSetStringSpan {
|
||||
|
||||
/*
|
||||
* Which span() variant will be used? The object is either built for one variant and used once, or built for all and
|
||||
* may be used many times.
|
||||
* Which span() variant will be used? The object is either built for one variant and used once,
|
||||
* or built for all and may be used many times.
|
||||
*/
|
||||
public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
|
||||
public static final int FWD = 0x20;
|
||||
public static final int BACK = 0x10;
|
||||
public static final int UTF16 = 8;
|
||||
// public static final int UTF16 = 8;
|
||||
public static final int CONTAINED = 2;
|
||||
public static final int NOT_CONTAINED = 1;
|
||||
|
||||
public static final int ALL = 0x3f;
|
||||
public static final int ALL = 0x7f;
|
||||
|
||||
public static final int FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED;
|
||||
public static final int FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED;
|
||||
public static final int BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED;
|
||||
public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;
|
||||
public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;
|
||||
public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;
|
||||
public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;
|
||||
public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
|
||||
|
||||
// Special spanLength short values. (since Java has not unsigned byte type)
|
||||
// All code points in the string are contained in the parent set.
|
||||
/**
|
||||
* Special spanLength short values. (since Java has no unsigned byte type)
|
||||
* All code points in the string are contained in the parent set.
|
||||
*/
|
||||
static final short ALL_CP_CONTAINED = 0xff;
|
||||
// The spanLength is >=0xfe.
|
||||
/** The spanLength is >=0xfe. */
|
||||
static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
|
||||
|
||||
// Set for span(). Same as parent but without strings.
|
||||
/** Set for span(). Same as parent but without strings. */
|
||||
private UnicodeSet spanSet;
|
||||
|
||||
// Set for span(not contained).
|
||||
// Same as spanSet, plus characters that start or end strings.
|
||||
/**
|
||||
* Set for span(not contained).
|
||||
* Same as spanSet, plus characters that start or end strings.
|
||||
*/
|
||||
private UnicodeSet spanNotSet;
|
||||
|
||||
// The strings of the parent set.
|
||||
/** The strings of the parent set. */
|
||||
private ArrayList<String> strings;
|
||||
|
||||
// the lengths of span(), spanBack() etc. for each string.
|
||||
/** The lengths of span(), spanBack() etc. for each string. */
|
||||
private short[] spanLengths;
|
||||
|
||||
// Maximum lengths of relevant strings.
|
||||
/** Maximum lengths of relevant strings. */
|
||||
private int maxLength16;
|
||||
|
||||
// Set up for all variants of span()?
|
||||
/** Are there strings that are not fully contained in the code point set? */
|
||||
private boolean someRelevant;
|
||||
|
||||
/** Set up for all variants of span()? */
|
||||
private boolean all;
|
||||
|
||||
// Span helper
|
||||
/** Span helper */
|
||||
private OffsetList offsets;
|
||||
|
||||
// Construct for all variants of span(), or only for any one variant.
|
||||
// Initialize as little as possible, for single use.
|
||||
/**
|
||||
* Constructs for all variants of span(), or only for any one variant.
|
||||
* Initializes as little as possible, for single use.
|
||||
*/
|
||||
public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
|
||||
spanSet = new UnicodeSet(0, 0x10ffff);
|
||||
// TODO: With Java 6, just take the parent set's strings as is,
|
||||
// as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
|
||||
// Then iterate via the first() and higher() methods.
|
||||
// (We do not want to create multiple Iterator objects in each span().)
|
||||
// See ICU ticket #7454.
|
||||
strings = setStrings;
|
||||
all = (which == ALL);
|
||||
spanSet.retainAll(set);
|
||||
|
@ -90,7 +106,7 @@ public class UnicodeSetStringSpan {
|
|||
int stringsLength = strings.size();
|
||||
|
||||
int i, spanLength;
|
||||
boolean someRelevant = false;
|
||||
someRelevant = false;
|
||||
for (i = 0; i < stringsLength; ++i) {
|
||||
String string = strings.get(i);
|
||||
int length16 = string.length();
|
||||
|
@ -98,12 +114,11 @@ public class UnicodeSetStringSpan {
|
|||
if (spanLength < length16) { // Relevant string.
|
||||
someRelevant = true;
|
||||
}
|
||||
if ((0 != (which & UTF16)) && length16 > maxLength16) {
|
||||
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
|
||||
maxLength16 = length16;
|
||||
}
|
||||
}
|
||||
if (!someRelevant) {
|
||||
maxLength16 = 0;
|
||||
if (!someRelevant && (which & WITH_COUNT) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -140,7 +155,7 @@ public class UnicodeSetStringSpan {
|
|||
int length16 = string.length();
|
||||
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
|
||||
if (spanLength < length16) { // Relevant string.
|
||||
if (0 != (which & UTF16)) {
|
||||
if (true /* 0 != (which & UTF16) */) {
|
||||
if (0 != (which & CONTAINED)) {
|
||||
if (0 != (which & FWD)) {
|
||||
spanLengths[i] = makeSpanLengthByte(spanLength);
|
||||
|
@ -188,10 +203,12 @@ public class UnicodeSetStringSpan {
|
|||
* Constructs a copy of an existing UnicodeSetStringSpan.
|
||||
* Assumes which==ALL for a frozen set.
|
||||
*/
|
||||
public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings) {
|
||||
public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
|
||||
final ArrayList<String> newParentSetStrings) {
|
||||
spanSet = otherStringSpan.spanSet;
|
||||
strings = newParentSetStrings;
|
||||
maxLength16 = otherStringSpan.maxLength16;
|
||||
someRelevant = otherStringSpan.someRelevant;
|
||||
all = true;
|
||||
if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {
|
||||
spanNotSet = spanSet;
|
||||
|
@ -203,22 +220,25 @@ public class UnicodeSetStringSpan {
|
|||
spanLengths = otherStringSpan.spanLengths.clone();
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Do the strings need to be checked in span() etc.?
|
||||
*
|
||||
* @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).
|
||||
* @return true if strings need to be checked (call span() here),
|
||||
* false if not (use a BMPSet for best performance).
|
||||
*/
|
||||
public boolean needsStringSpanUTF16() {
|
||||
return (maxLength16 != 0);
|
||||
return someRelevant;
|
||||
}
|
||||
|
||||
// For fast UnicodeSet::contains(c).
|
||||
/** For fast UnicodeSet::contains(c). */
|
||||
public boolean contains(int c) {
|
||||
return spanSet.contains(c);
|
||||
}
|
||||
|
||||
// Add a starting or ending string character to the spanNotSet
|
||||
// so that a character span ends before any string.
|
||||
/**
|
||||
* Adds a starting or ending string character to the spanNotSet
|
||||
* so that a character span ends before any string.
|
||||
*/
|
||||
private void addToSpanNotSet(int c) {
|
||||
if (spanNotSet == null || spanNotSet == spanSet) {
|
||||
if (spanSet.contains(c)) {
|
||||
|
@ -230,12 +250,14 @@ public class UnicodeSetStringSpan {
|
|||
}
|
||||
|
||||
/*
|
||||
* Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and
|
||||
* in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done
|
||||
* Note: In span() when spanLength==0
|
||||
* (after a string match, or at the beginning after an empty code point span)
|
||||
* and in spanNot() and spanNotUTF8(),
|
||||
* string matching could use a binary search because all string matches are done
|
||||
* from the same start index.
|
||||
*
|
||||
*
|
||||
* For UTF-8, this would require a comparison function that returns UTF-16 order.
|
||||
*
|
||||
*
|
||||
* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
|
||||
* with strings have very few very short strings. For cases with many strings, it might be better to use a different
|
||||
* API and implementation with a DFA (state machine).
|
||||
|
@ -244,84 +266,119 @@ public class UnicodeSetStringSpan {
|
|||
/*
|
||||
* Algorithm for span(SpanCondition.CONTAINED)
|
||||
*
|
||||
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
|
||||
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
|
||||
* remember to continue after it. + Either recursively span for each code point or string match, or recursively span
|
||||
* for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the
|
||||
* longest recursive span (the farthest end point). + If there is no match at the current position, neither for the
|
||||
* code point there nor for any set string, then stop and return the longest recursive span length.
|
||||
*
|
||||
* Theoretical algorithm:
|
||||
* - Iterate through the string, and at each code point boundary:
|
||||
* + If the code point there is in the set, then remember to continue after it.
|
||||
* + If a set string matches at the current position, then remember to continue after it.
|
||||
* + Either recursively span for each code point or string match, or recursively span
|
||||
* for all but the shortest one and iteratively continue the span with the shortest local match.
|
||||
* + Remember the longest recursive span (the farthest end point).
|
||||
* + If there is no match at the current position,
|
||||
* neither for the code point there nor for any set string,
|
||||
* then stop and return the longest recursive span length.
|
||||
*
|
||||
* Optimized implementation:
|
||||
*
|
||||
* (We assume that most sets will have very few very short strings. A span using a string-less set is extremely
|
||||
* fast.)
|
||||
*
|
||||
* Create and cache a spanSet which contains all of the single code points of the original set but none of its
|
||||
* strings.
|
||||
*
|
||||
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
|
||||
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
|
||||
* a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings
|
||||
* that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)
|
||||
* because the recursive algorithm would continue after them anyway and find the longest recursive match from their
|
||||
* end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after
|
||||
* spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after
|
||||
* spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set
|
||||
* strings from there. + If at least one more set string matched after a previous string match, then test if the
|
||||
* code point after the previous string match is also contained in the set. Continue the loop with the shortest end
|
||||
* point of either this code point or a matching set string. + If no more set string matched after a previous string
|
||||
* match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,
|
||||
* otherwise continue the loop.
|
||||
*
|
||||
*
|
||||
* (We assume that most sets will have very few very short strings.
|
||||
* A span using a string-less set is extremely fast.)
|
||||
*
|
||||
* Create and cache a spanSet which contains all of the single code points of the original set
|
||||
* but none of its strings.
|
||||
*
|
||||
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
|
||||
* - Loop:
|
||||
* + Try to match each set string at the end of the spanLength.
|
||||
* ~ Set strings that start with set-contained code points
|
||||
* must be matched with a partial overlap
|
||||
* because the recursive algorithm would have tried to match them at every position.
|
||||
* ~ Set strings that entirely consist of set-contained code points
|
||||
* are irrelevant for span(SpanCondition.CONTAINED)
|
||||
* because the recursive algorithm would continue after them anyway and
|
||||
* find the longest recursive match from their end.
|
||||
* ~ Rather than recursing, note each end point of a set string match.
|
||||
* + If no set string matched after spanSet.span(),
|
||||
* then return with where the spanSet.span() ended.
|
||||
* + If at least one set string matched after spanSet.span(),
|
||||
* then pop the shortest string match end point and continue the loop,
|
||||
* trying to match all set strings from there.
|
||||
* + If at least one more set string matched after a previous string match, then test if the
|
||||
* code point after the previous string match is also contained in the set.
|
||||
* Continue the loop with the shortest end point of
|
||||
* either this code point or a matching set string.
|
||||
* + If no more set string matched after a previous string match,
|
||||
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
|
||||
* Stop if spanLength==0, otherwise continue the loop.
|
||||
*
|
||||
* By noting each end point of a set string match, the function visits each string position at most once and
|
||||
* finishes in linear time.
|
||||
*
|
||||
* The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes
|
||||
* in exponential time.
|
||||
*
|
||||
* The recursive algorithm may visit the same string position many times
|
||||
* if multiple paths lead to it and finishes in exponential time.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Algorithm for span(SIMPLE)
|
||||
*
|
||||
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
|
||||
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
|
||||
* remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is
|
||||
* no match at the current position, then stop and return the current position.
|
||||
*
|
||||
* Theoretical algorithm:
|
||||
* - Iterate through the string, and at each code point boundary:
|
||||
* + If the code point there is in the set, then remember to continue after it.
|
||||
* + If a set string matches at the current position, then remember to continue after it.
|
||||
* + Continue from the farthest match position and ignore all others.
|
||||
* + If there is no match at the current position, then stop and return the current position.
|
||||
*
|
||||
* Optimized implementation:
|
||||
*
|
||||
*
|
||||
* (Same assumption and spanSet as above.)
|
||||
*
|
||||
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
|
||||
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
|
||||
* a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that
|
||||
* entirely consist of set-contained code points must be matched with a full overlap because the longest-match
|
||||
* algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the
|
||||
* code point span because the standard algorithm would then have continued after the set string match anyway. ~
|
||||
* Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string
|
||||
* matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string
|
||||
* matched, then continue the loop after the longest match from the earliest position. + If no more set string
|
||||
* matched after a previous string match, then try another
|
||||
* spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the
|
||||
* loop.
|
||||
*
|
||||
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
|
||||
* - Loop:
|
||||
* + Try to match each set string at the end of the spanLength.
|
||||
* ~ Set strings that start with set-contained code points
|
||||
* must be matched with a partial overlap
|
||||
* because the standard algorithm would have tried to match them earlier.
|
||||
* ~ Set strings that entirely consist of set-contained code points
|
||||
* must be matched with a full overlap because the longest-match algorithm
|
||||
* would hide set string matches that end earlier.
|
||||
* Such set strings need not be matched earlier inside the code point span
|
||||
* because the standard algorithm would then have
|
||||
* continued after the set string match anyway.
|
||||
* ~ Remember the longest set string match (farthest end point)
|
||||
* from the earliest starting point.
|
||||
* + If no set string matched after spanSet.span(),
|
||||
* then return with where the spanSet.span() ended.
|
||||
* + If at least one set string matched,
|
||||
* then continue the loop after the longest match from the earliest position.
|
||||
* + If no more set string matched after a previous string match,
|
||||
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
|
||||
* Stop if spanLength==0, otherwise continue the loop.
|
||||
*/
|
||||
/**
|
||||
* Span a string.
|
||||
* Spans a string.
|
||||
*
|
||||
* @param s The string to be spanned
|
||||
* @param start The index at which the span begins
|
||||
* @param spanCondition The span condition
|
||||
* @return the length of the span
|
||||
* @return the limit (exclusive end) of the span
|
||||
*/
|
||||
public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {
|
||||
public int span(CharSequence s, int start, SpanCondition spanCondition) {
|
||||
if (spanCondition == SpanCondition.NOT_CONTAINED) {
|
||||
return spanNot(s, start, length);
|
||||
return spanNot(s, start, null);
|
||||
}
|
||||
int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);
|
||||
if (spanLength == length) {
|
||||
return length;
|
||||
int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
|
||||
if (spanLimit == s.length()) {
|
||||
return spanLimit;
|
||||
}
|
||||
return spanWithStrings(s, start, spanLimit, spanCondition);
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronized method for complicated spans using the offsets.
|
||||
* Avoids synchronization for simple cases.
|
||||
*
|
||||
* @param spanLimit = spanSet.span(s, start, CONTAINED)
|
||||
*/
|
||||
private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
|
||||
SpanCondition spanCondition) {
|
||||
// Consider strings; they may overlap with the span.
|
||||
int initSize = 0;
|
||||
if (spanCondition == SpanCondition.CONTAINED) {
|
||||
|
@ -329,7 +386,9 @@ public class UnicodeSetStringSpan {
|
|||
initSize = maxLength16;
|
||||
}
|
||||
offsets.setMaxLength(initSize);
|
||||
int pos = start + spanLength, rest = length - spanLength;
|
||||
int length = s.length();
|
||||
int pos = spanLimit, rest = length - spanLimit;
|
||||
int spanLength = spanLimit - start;
|
||||
int i, stringsLength = strings.size();
|
||||
for (;;) {
|
||||
if (spanCondition == SpanCondition.CONTAINED) {
|
||||
|
@ -429,7 +488,7 @@ public class UnicodeSetStringSpan {
|
|||
// Otherwise, an unlimited code point span is only tried again when no
|
||||
// strings match, and if such a non-initial span fails we stop.
|
||||
if (offsets.isEmpty()) {
|
||||
return pos - start; // No strings matched after a span.
|
||||
return pos; // No strings matched after a span.
|
||||
}
|
||||
// Match strings from after the next string match.
|
||||
} else {
|
||||
|
@ -437,11 +496,12 @@ public class UnicodeSetStringSpan {
|
|||
if (offsets.isEmpty()) {
|
||||
// No more strings matched after a previous string match.
|
||||
// Try another code point span from after the last string match.
|
||||
spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);
|
||||
spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
|
||||
spanLength = spanLimit - pos;
|
||||
if (spanLength == rest || // Reached the end of the string, or
|
||||
spanLength == 0 // neither strings nor span progressed.
|
||||
) {
|
||||
return pos + spanLength - start;
|
||||
return spanLimit;
|
||||
}
|
||||
pos += spanLength;
|
||||
rest -= spanLength;
|
||||
|
@ -467,13 +527,110 @@ public class UnicodeSetStringSpan {
|
|||
// Match strings from after the next string match.
|
||||
}
|
||||
}
|
||||
int minOffset = offsets.popMinimum();
|
||||
int minOffset = offsets.popMinimum(null);
|
||||
pos += minOffset;
|
||||
rest -= minOffset;
|
||||
spanLength = 0; // Match strings from after a string match.
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Spans a string and counts the smallest number of set elements on any path across the span.
|
||||
*
|
||||
* <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
|
||||
*
|
||||
* <p>If the set does not have any fully-contained strings, then we could optimize this
|
||||
* like span(), but such sets are likely rare, and this is at least still linear.
|
||||
*
|
||||
* @param s The string to be spanned
|
||||
* @param start The index at which the span begins
|
||||
* @param spanCondition The span condition
|
||||
* @param outCount Receives the number of set elements counted across the span
|
||||
* @return the limit (exclusive end) of the span
|
||||
*/
|
||||
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
|
||||
OutputInt outCount) {
|
||||
if (spanCondition == SpanCondition.NOT_CONTAINED) {
|
||||
return spanNot(s, start, outCount);
|
||||
}
|
||||
// Consider strings; they may overlap with the span,
|
||||
// and they may result in a smaller count than with just code points.
|
||||
if (spanCondition == SpanCondition.CONTAINED) {
|
||||
return spanContainedAndCount(s, start, outCount);
|
||||
}
|
||||
// SIMPLE (not synchronized, does not use offsets)
|
||||
int stringsLength = strings.size();
|
||||
int length = s.length();
|
||||
int pos = start;
|
||||
int rest = length - start;
|
||||
int count = 0;
|
||||
while (rest != 0) {
|
||||
// Try to match the next code point.
|
||||
int cpLength = spanOne(spanSet, s, pos, rest);
|
||||
int maxInc = (cpLength > 0) ? cpLength : 0;
|
||||
// Try to match all of the strings.
|
||||
for (int i = 0; i < stringsLength; ++i) {
|
||||
String string = strings.get(i);
|
||||
int length16 = string.length();
|
||||
if (maxInc < length16 && length16 <= rest &&
|
||||
matches16CPB(s, pos, length, string, length16)) {
|
||||
maxInc = length16;
|
||||
}
|
||||
}
|
||||
// We are done if there is no match beyond pos.
|
||||
if (maxInc == 0) {
|
||||
outCount.value = count;
|
||||
return pos;
|
||||
}
|
||||
// Continue from the longest match.
|
||||
++count;
|
||||
pos += maxInc;
|
||||
rest -= maxInc;
|
||||
}
|
||||
outCount.value = count;
|
||||
return pos;
|
||||
}
|
||||
|
||||
private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
|
||||
// Use offset list to try all possibilities.
|
||||
offsets.setMaxLength(maxLength16);
|
||||
int stringsLength = strings.size();
|
||||
int length = s.length();
|
||||
int pos = start;
|
||||
int rest = length - start;
|
||||
int count = 0;
|
||||
while (rest != 0) {
|
||||
// Try to match the next code point.
|
||||
int cpLength = spanOne(spanSet, s, pos, rest);
|
||||
if (cpLength > 0) {
|
||||
offsets.addOffsetAndCount(cpLength, count + 1);
|
||||
}
|
||||
// Try to match all of the strings.
|
||||
for (int i = 0; i < stringsLength; ++i) {
|
||||
String string = strings.get(i);
|
||||
int length16 = string.length();
|
||||
// Note: If the strings were sorted by length, then we could also
|
||||
// avoid trying to match if there is already a match of the same length.
|
||||
if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
|
||||
matches16CPB(s, pos, length, string, length16)) {
|
||||
offsets.addOffsetAndCount(length16, count + 1);
|
||||
}
|
||||
}
|
||||
// We are done if there is no match beyond pos.
|
||||
if (offsets.isEmpty()) {
|
||||
outCount.value = count;
|
||||
return pos;
|
||||
}
|
||||
// Continue from the nearest match.
|
||||
int minOffset = offsets.popMinimum(outCount);
|
||||
count = outCount.value;
|
||||
pos += minOffset;
|
||||
rest -= minOffset;
|
||||
}
|
||||
outCount.value = count;
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Span a string backwards.
|
||||
*
|
||||
|
@ -638,59 +795,72 @@ public class UnicodeSetStringSpan {
|
|||
// Match strings from before the next string match.
|
||||
}
|
||||
}
|
||||
pos -= offsets.popMinimum();
|
||||
pos -= offsets.popMinimum(null);
|
||||
spanLength = 0; // Match strings from before a string match.
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
|
||||
*
|
||||
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
|
||||
* is in the set, then return with the current position. + If a set string matches at the current position, then
|
||||
* return with the current position.
|
||||
*
|
||||
* Theoretical algorithm:
|
||||
* - Iterate through the string, and at each code point boundary:
|
||||
* + If the code point there is in the set, then return with the current position.
|
||||
* + If a set string matches at the current position, then return with the current position.
|
||||
*
|
||||
* Optimized implementation:
|
||||
*
|
||||
*
|
||||
* (Same assumption as for span() above.)
|
||||
*
|
||||
* Create and cache a spanNotSet which contains all of the single code points of the original set but none of its
|
||||
* strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for
|
||||
* spanNotBack().)
|
||||
*
|
||||
*
|
||||
* Create and cache a spanNotSet which contains
|
||||
* all of the single code points of the original set but none of its strings.
|
||||
* For each set string add its initial code point to the spanNotSet.
|
||||
* (Also add its final code point for spanNotBack().)
|
||||
*
|
||||
* - Loop:
|
||||
* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
|
||||
* + If the current code point is in the original set, then return the current position.
|
||||
* + If any set string matches at the current position, then return the current position.
|
||||
* + If there is no match at the current position, neither for the code point
|
||||
* there nor for any set string, then skip this code point and continue the loop. This happens for
|
||||
* set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set
|
||||
* string.
|
||||
* there nor for any set string, then skip this code point and continue the loop.
|
||||
* This happens for set-string-initial code points that were added to spanNotSet
|
||||
* when there is not actually a match for such a set string.
|
||||
*
|
||||
* @return the length of the span
|
||||
* @param s The string to be spanned
|
||||
* @param start The start index that the span begins
|
||||
* @param outCount If not null: Receives the number of code points across the span.
|
||||
* @return the limit (exclusive end) of the span
|
||||
*/
|
||||
private int spanNot(CharSequence s, int start, int length) {
|
||||
int pos = start, rest = length;
|
||||
int i, stringsLength = strings.size();
|
||||
private int spanNot(CharSequence s, int start, OutputInt outCount) {
|
||||
int length = s.length();
|
||||
int pos = start, rest = length - start;
|
||||
int stringsLength = strings.size();
|
||||
int count = 0;
|
||||
do {
|
||||
// Span until we find a code point from the set,
|
||||
// or a code point that starts or ends some string.
|
||||
i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);
|
||||
if (i == rest) {
|
||||
int spanLimit;
|
||||
if (outCount == null) {
|
||||
spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
|
||||
} else {
|
||||
spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
|
||||
outCount.value = count = count + outCount.value;
|
||||
}
|
||||
if (spanLimit == length) {
|
||||
return length; // Reached the end of the string.
|
||||
}
|
||||
pos += i;
|
||||
rest -= i;
|
||||
pos = spanLimit;
|
||||
rest = length - spanLimit;
|
||||
|
||||
// Check whether the current code point is in the original set,
|
||||
// without the string starts and ends.
|
||||
int cpLength = spanOne(spanSet, s, pos, rest);
|
||||
if (cpLength > 0) {
|
||||
return pos - start; // There is a set element at pos.
|
||||
return pos; // There is a set element at pos.
|
||||
}
|
||||
|
||||
// Try to match the strings at pos.
|
||||
for (i = 0; i < stringsLength; ++i) {
|
||||
for (int i = 0; i < stringsLength; ++i) {
|
||||
if (spanLengths[i] == ALL_CP_CONTAINED) {
|
||||
continue; // Irrelevant string.
|
||||
}
|
||||
|
@ -698,7 +868,7 @@ public class UnicodeSetStringSpan {
|
|||
|
||||
int length16 = string.length();
|
||||
if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
|
||||
return pos - start; // There is a set element at pos.
|
||||
return pos; // There is a set element at pos.
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -707,7 +877,11 @@ public class UnicodeSetStringSpan {
|
|||
// cpLength<0
|
||||
pos -= cpLength;
|
||||
rest += cpLength;
|
||||
++count;
|
||||
} while (rest != 0);
|
||||
if (outCount != null) {
|
||||
outCount.value = count;
|
||||
}
|
||||
return length; // Reached the end of the string.
|
||||
}
|
||||
|
||||
|
@ -773,20 +947,24 @@ public class UnicodeSetStringSpan {
|
|||
* Compare 16-bit Unicode strings (which may be malformed UTF-16)
|
||||
* at code point boundaries.
|
||||
* That is, each edge of a match must not be in the middle of a surrogate pair.
|
||||
* @param s The string to match in.
|
||||
* @param start The start index of s.
|
||||
* @param slength The length of s from start.
|
||||
* @param limit The limit of the subsequence of s being spanned.
|
||||
* @param t The substring to be matched in s.
|
||||
* @param tlength The length of t.
|
||||
*/
|
||||
static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {
|
||||
return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
|
||||
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))
|
||||
&& !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&
|
||||
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))
|
||||
&& matches16(s, start, t, tlength);
|
||||
static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
|
||||
return matches16(s, start, t, tlength)
|
||||
&& !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
|
||||
Character.isLowSurrogate(s.charAt(start)))
|
||||
&& !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
|
||||
Character.isLowSurrogate(s.charAt(start + tlength)));
|
||||
}
|
||||
|
||||
// Does the set contain the next code point?
|
||||
// If so, return its length; otherwise return its negative length.
|
||||
/**
|
||||
* Does the set contain the next code point?
|
||||
* If so, return its length; otherwise return its negative length.
|
||||
*/
|
||||
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
|
||||
char c = s.charAt(start);
|
||||
if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
|
||||
|
@ -811,47 +989,57 @@ public class UnicodeSetStringSpan {
|
|||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
/**
|
||||
* Helper class for UnicodeSetStringSpan.
|
||||
*
|
||||
* List of offsets from the current position from where to try matching a code point or a string. Store offsets rather
|
||||
* than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in
|
||||
* spanBack()).
|
||||
*
|
||||
* Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that
|
||||
* is, there are normally no gaps of hundreds or thousands of offset values.
|
||||
*
|
||||
* The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the
|
||||
* list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the
|
||||
* list.
|
||||
*
|
||||
* Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH
|
||||
* <p>List of offsets from the current position from where to try matching
|
||||
* a code point or a string.
|
||||
* Stores offsets rather than indexes to simplify the code and use the same list
|
||||
* for both increments (in span()) and decrements (in spanBack()).
|
||||
*
|
||||
* <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
|
||||
* are relatively dense, that is,
|
||||
* there are normally no gaps of hundreds or thousands of offset values.
|
||||
*
|
||||
* <p>This class optionally also tracks the minimum non-negative count for each position,
|
||||
* intended to count the smallest number of elements of any path leading to that position.
|
||||
*
|
||||
* <p>The implementation uses a circular buffer of count integers,
|
||||
* each indicating whether the corresponding offset is in the list,
|
||||
* and its path element count.
|
||||
* This avoids inserting into a sorted list of offsets (or absolute indexes)
|
||||
* and physically moving part of the list.
|
||||
*
|
||||
* <p>Note: In principle, the caller should setMaxLength() to
|
||||
* the maximum of the max string length and U16_LENGTH/U8_LENGTH
|
||||
* to account for "long" single code points.
|
||||
*
|
||||
* Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single
|
||||
* integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when
|
||||
* lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.
|
||||
*
|
||||
* <p>Note: An earlier version did not track counts and stored only byte flags.
|
||||
* With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
|
||||
* the list could be stored as bit flags in a single integer.
|
||||
* Rather than handling a circular buffer with a start list index,
|
||||
* the integer would simply be shifted when lower offsets are removed.
|
||||
* UnicodeSet does not have a limit on the lengths of strings.
|
||||
*/
|
||||
static class OffsetList {
|
||||
private boolean[] list;
|
||||
private static final class OffsetList {
|
||||
private int[] list;
|
||||
private int length;
|
||||
private int start;
|
||||
|
||||
public OffsetList() {
|
||||
list = new boolean[16]; // default size
|
||||
list = new int[16]; // default size
|
||||
}
|
||||
|
||||
public void setMaxLength(int maxLength) {
|
||||
if (maxLength > list.length) {
|
||||
list = new boolean[maxLength];
|
||||
list = new int[maxLength];
|
||||
}
|
||||
clear();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
for (int i = list.length; i-- > 0;) {
|
||||
list[i] = false;
|
||||
list[i] = 0;
|
||||
}
|
||||
start = length = 0;
|
||||
}
|
||||
|
@ -860,55 +1048,97 @@ public class UnicodeSetStringSpan {
|
|||
return (length == 0);
|
||||
}
|
||||
|
||||
// Reduce all stored offsets by delta, used when the current position
|
||||
// moves by delta.
|
||||
// There must not be any offsets lower than delta.
|
||||
// If there is an offset equal to delta, it is removed.
|
||||
// delta=[1..maxLength]
|
||||
/**
|
||||
* Reduces all stored offsets by delta, used when the current position moves by delta.
|
||||
* There must not be any offsets lower than delta.
|
||||
* If there is an offset equal to delta, it is removed.
|
||||
*
|
||||
* @param delta [1..maxLength]
|
||||
*/
|
||||
public void shift(int delta) {
|
||||
int i = start + delta;
|
||||
if (i >= list.length) {
|
||||
i -= list.length;
|
||||
}
|
||||
if (list[i]) {
|
||||
list[i] = false;
|
||||
if (list[i] != 0) {
|
||||
list[i] = 0;
|
||||
--length;
|
||||
}
|
||||
start = i;
|
||||
}
|
||||
|
||||
// Add an offset. The list must not contain it yet.
|
||||
// offset=[1..maxLength]
|
||||
/**
|
||||
* Adds an offset. The list must not contain it yet.
|
||||
* @param offset [1..maxLength]
|
||||
*/
|
||||
public void addOffset(int offset) {
|
||||
int i = start + offset;
|
||||
if (i >= list.length) {
|
||||
i -= list.length;
|
||||
}
|
||||
list[i] = true;
|
||||
assert list[i] == 0;
|
||||
list[i] = 1;
|
||||
++length;
|
||||
}
|
||||
|
||||
// offset=[1..maxLength]
|
||||
/**
|
||||
* Adds an offset and updates its count.
|
||||
* The list may already contain the offset.
|
||||
* @param offset [1..maxLength]
|
||||
*/
|
||||
public void addOffsetAndCount(int offset, int count) {
|
||||
assert count > 0;
|
||||
int i = start + offset;
|
||||
if (i >= list.length) {
|
||||
i -= list.length;
|
||||
}
|
||||
if (list[i] == 0) {
|
||||
list[i] = count;
|
||||
++length;
|
||||
} else if (count < list[i]) {
|
||||
list[i] = count;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param offset [1..maxLength]
|
||||
*/
|
||||
public boolean containsOffset(int offset) {
|
||||
int i = start + offset;
|
||||
if (i >= list.length) {
|
||||
i -= list.length;
|
||||
}
|
||||
return list[i];
|
||||
return list[i] != 0;
|
||||
}
|
||||
|
||||
// Find the lowest stored offset from a non-empty list, remove it,
|
||||
// and reduce all other offsets by this minimum.
|
||||
// Returns [1..maxLength].
|
||||
public int popMinimum() {
|
||||
/**
|
||||
* @param offset [1..maxLength]
|
||||
*/
|
||||
public boolean hasCountAtOffset(int offset, int count) {
|
||||
int i = start + offset;
|
||||
if (i >= list.length) {
|
||||
i -= list.length;
|
||||
}
|
||||
int oldCount = list[i];
|
||||
return oldCount != 0 && oldCount <= count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the lowest stored offset from a non-empty list, removes it,
|
||||
* and reduces all other offsets by this minimum.
|
||||
* @return min=[1..maxLength]
|
||||
*/
|
||||
public int popMinimum(OutputInt outCount) {
|
||||
// Look for the next offset in list[start+1..list.length-1].
|
||||
int i = start, result;
|
||||
while (++i < list.length) {
|
||||
if (list[i]) {
|
||||
list[i] = false;
|
||||
int count = list[i];
|
||||
if (count != 0) {
|
||||
list[i] = 0;
|
||||
--length;
|
||||
result = i - start;
|
||||
start = i;
|
||||
if (outCount != null) { outCount.value = count; }
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -918,12 +1148,14 @@ public class UnicodeSetStringSpan {
|
|||
// Since the list is not empty, there will be one.
|
||||
result = list.length - start;
|
||||
i = 0;
|
||||
while (!list[i]) {
|
||||
int count;
|
||||
while ((count = list[i]) == 0) {
|
||||
++i;
|
||||
}
|
||||
list[i] = false;
|
||||
list[i] = 0;
|
||||
--length;
|
||||
start = i;
|
||||
if (outCount != null) { outCount.value = count; }
|
||||
return result + i;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2012, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -2612,6 +2612,61 @@ public final class UTF16 {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility for getting a code point from a CharSequence that contains exactly one code point.
|
||||
* @return a code point IF the string is non-null and consists of a single code point.
|
||||
* otherwise returns -1.
|
||||
* @param s to test
|
||||
*/
|
||||
public static int getSingleCodePoint(CharSequence s) {
|
||||
if (s == null || s.length() == 0) {
|
||||
return -1;
|
||||
} else if (s.length() == 1) {
|
||||
return s.charAt(0);
|
||||
} else if (s.length() > 2) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// at this point, len = 2
|
||||
int cp = UTF16.charAt(s, 0);
|
||||
if (cp > 0xFFFF) { // is surrogate pair
|
||||
return cp;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility for comparing a code point to a string without having to create a new string. Returns the same results
|
||||
* as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
|
||||
* <pre>
|
||||
* sc = new StringComparator(true,false,0);
|
||||
* fast = UTF16.compare(codePoint, charSequence)
|
||||
* slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
|
||||
* </pre>
|
||||
* then
|
||||
* </pre>
|
||||
* Integer.signum(fast) == Integer.signum(slower)
|
||||
* </pre>
|
||||
* @param codePoint to test
|
||||
* @param s to test
|
||||
* @return equivalent of code point comparator comparing two strings.
|
||||
*/
|
||||
public static int compareCodePoint(int codePoint, CharSequence s) {
|
||||
if (s == null) {
|
||||
return 1;
|
||||
}
|
||||
final int strLen = s.length();
|
||||
if (strLen == 0) {
|
||||
return 1;
|
||||
}
|
||||
int second = Character.codePointAt(s, 0);
|
||||
int diff = codePoint - second;
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
return strLen == Character.charCount(codePoint) ? 0 : -1;
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
/**
|
||||
|
|
|
@ -29,6 +29,7 @@ import com.ibm.icu.lang.UCharacter;
|
|||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.util.Freezable;
|
||||
import com.ibm.icu.util.OutputInt;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
|
@ -265,11 +266,20 @@ import com.ibm.icu.util.VersionInfo;
|
|||
* </tr>
|
||||
* </table>
|
||||
* </blockquote>
|
||||
* <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
|
||||
* <p>To iterate over contents of UnicodeSet, the following are available:
|
||||
* <ul><li>{@link #ranges()} to iterate through the ranges</li>
|
||||
* <li>{@link #strings()} to iterate through the strings</li>
|
||||
* <li>{@link #iterator()} to iterate through the entire contents in a single loop.
|
||||
* That method is, however, not particularly efficient, since it "boxes" each code point into a String.
|
||||
* </ul>
|
||||
* All of the above can be used in <b>for</b> loops.
|
||||
* The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
* @see UnicodeSetIterator
|
||||
* @see UnicodeSetSpanner
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
|
||||
|
||||
|
@ -283,7 +293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @stable ICU 4.8
|
||||
*/
|
||||
public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
|
||||
|
||||
|
||||
private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing
|
||||
|
||||
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
|
||||
|
@ -338,7 +348,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
*/
|
||||
private static UnicodeSet INCLUSIONS[] = null;
|
||||
|
||||
private BMPSet bmpSet; // The set is frozen iff either bmpSet or stringSpan is not null.
|
||||
private BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
|
||||
private UnicodeSetStringSpan stringSpan;
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
|
@ -492,6 +502,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @stable ICU 2.0
|
||||
*/
|
||||
public Object clone() {
|
||||
if (isFrozen()) {
|
||||
return this;
|
||||
}
|
||||
UnicodeSet result = new UnicodeSet(this);
|
||||
result.bmpSet = this.bmpSet;
|
||||
result.stringSpan = this.stringSpan;
|
||||
|
@ -588,27 +601,30 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
/**
|
||||
* Append the <code>toPattern()</code> representation of a
|
||||
* string to the given <code>StringBuffer</code>.
|
||||
* @return the same <code>buf</code> that was passed in
|
||||
*/
|
||||
private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
|
||||
private static StringBuffer _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
|
||||
cp = s.codePointAt(i);
|
||||
_appendToPat(buf, cp, escapeUnprintable);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the <code>toPattern()</code> representation of a
|
||||
* character to the given <code>StringBuffer</code>.
|
||||
* @return the same <code>buf</code> that was passed in
|
||||
*/
|
||||
private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
|
||||
private static StringBuffer _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
|
||||
// "Utility.isUnprintable(c)" seems redundant since the call
|
||||
// "Utility.escapeUnprintable(buf, c)" does it again inside the if statement
|
||||
if (escapeUnprintable && Utility.isUnprintable(c)) {
|
||||
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
|
||||
// unprintable
|
||||
if (Utility.escapeUnprintable(buf, c)) {
|
||||
return;
|
||||
return buf;
|
||||
}
|
||||
}
|
||||
// Okay to let ':' pass through
|
||||
|
@ -633,6 +649,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
break;
|
||||
}
|
||||
UTF16.append(buf, c);
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1279,9 +1296,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
}
|
||||
|
||||
/**
|
||||
* Utility for getting code point from single code point CharSequence.
|
||||
* See the public UTF16.getSingleCodePoint()
|
||||
* @return a code point IF the string consists of a single one.
|
||||
* otherwise returns -1.
|
||||
* @param string to test
|
||||
* @param s to test
|
||||
*/
|
||||
private static int getSingleCP(CharSequence s) {
|
||||
if (s.length() < 1) {
|
||||
|
@ -1322,7 +1341,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet retainAll(String s) {
|
||||
public final UnicodeSet retainAll(CharSequence s) {
|
||||
return retainAll(fromAll(s));
|
||||
}
|
||||
|
||||
|
@ -1333,7 +1352,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet complementAll(String s) {
|
||||
public final UnicodeSet complementAll(CharSequence s) {
|
||||
return complementAll(fromAll(s));
|
||||
}
|
||||
|
||||
|
@ -1344,7 +1363,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet removeAll(String s) {
|
||||
public final UnicodeSet removeAll(CharSequence s) {
|
||||
return removeAll(fromAll(s));
|
||||
}
|
||||
|
||||
|
@ -1369,7 +1388,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return a newly created set containing the given string
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public static UnicodeSet from(String s) {
|
||||
public static UnicodeSet from(CharSequence s) {
|
||||
return new UnicodeSet().add(s);
|
||||
}
|
||||
|
||||
|
@ -1380,7 +1399,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return a newly created set containing the given characters
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public static UnicodeSet fromAll(String s) {
|
||||
public static UnicodeSet fromAll(CharSequence s) {
|
||||
return new UnicodeSet().addAll(s);
|
||||
}
|
||||
|
||||
|
@ -1428,13 +1447,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* Retain the specified string in this set if it is present.
|
||||
* Upon return this set will be empty if it did not contain s, or
|
||||
* will only contain s if it did contain s.
|
||||
* @param s the string to be retained
|
||||
* @param cs the string to be retained
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet retain(String s) {
|
||||
int cp = getSingleCP(s);
|
||||
public final UnicodeSet retain(CharSequence cs) {
|
||||
|
||||
int cp = getSingleCP(cs);
|
||||
if (cp < 0) {
|
||||
String s = cs.toString();
|
||||
boolean isIn = strings.contains(s);
|
||||
if (isIn && size() == 1) {
|
||||
return this;
|
||||
|
@ -1494,7 +1515,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet remove(String s) {
|
||||
public final UnicodeSet remove(CharSequence s) {
|
||||
int cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
strings.remove(s);
|
||||
|
@ -1571,14 +1592,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final UnicodeSet complement(String s) {
|
||||
public final UnicodeSet complement(CharSequence s) {
|
||||
checkFrozen();
|
||||
int cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
if (strings.contains(s)) {
|
||||
strings.remove(s);
|
||||
} else {
|
||||
strings.add(s);
|
||||
strings.add(s.toString());
|
||||
}
|
||||
pat = null;
|
||||
} else {
|
||||
|
@ -1804,11 +1825,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return <tt>true</tt> if this set contains the specified string
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final boolean contains(String s) {
|
||||
public final boolean contains(CharSequence s) {
|
||||
|
||||
int cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
return strings.contains(s);
|
||||
return strings.contains(s.toString());
|
||||
} else {
|
||||
return contains(cp);
|
||||
}
|
||||
|
@ -2072,7 +2093,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return true if the test condition is met
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean containsNone(String s) {
|
||||
public boolean containsNone(CharSequence s) {
|
||||
return span(s, SpanCondition.NOT_CONTAINED) == s.length();
|
||||
}
|
||||
|
||||
|
@ -2106,7 +2127,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @return true if the condition is met
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public final boolean containsSome(String s) {
|
||||
public final boolean containsSome(CharSequence s) {
|
||||
return !containsNone(s);
|
||||
}
|
||||
|
||||
|
@ -2344,7 +2365,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
|
||||
StringBuffer rebuiltPat = new StringBuffer();
|
||||
RuleCharacterIterator chars =
|
||||
new RuleCharacterIterator(pattern, symbols, pos);
|
||||
new RuleCharacterIterator(pattern, symbols, pos);
|
||||
applyPattern(chars, symbols, rebuiltPat, options);
|
||||
if (chars.inVariable()) {
|
||||
syntaxError(chars, "Extra chars in variable value");
|
||||
|
@ -2388,7 +2409,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// Recognized special forms for chars, sets: c-c s-s s&s
|
||||
|
||||
int opts = RuleCharacterIterator.PARSE_VARIABLES |
|
||||
RuleCharacterIterator.PARSE_ESCAPES;
|
||||
RuleCharacterIterator.PARSE_ESCAPES;
|
||||
if ((options & IGNORE_SPACE) != 0) {
|
||||
opts |= RuleCharacterIterator.SKIP_WHITESPACE;
|
||||
}
|
||||
|
@ -2740,7 +2761,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
private static void syntaxError(RuleCharacterIterator chars, String msg) {
|
||||
throw new IllegalArgumentException("Error: " + msg + " at \"" +
|
||||
Utility.escape(chars.toString()) +
|
||||
'"');
|
||||
'"');
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2771,23 +2792,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
}
|
||||
|
||||
/**
|
||||
* Add the contents of the collection (as strings) into this UnicodeSet.
|
||||
* Add the contents of the collection (as strings) into this UnicodeSet.
|
||||
* The collection must not contain null.
|
||||
* @param source the collection to add
|
||||
* @return a reference to this object
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet add(Collection<?> source) {
|
||||
public UnicodeSet add(Iterable<?> source) {
|
||||
return addAll(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the contents of the UnicodeSet (as strings) into a collection.
|
||||
* Add a collection (as strings) into this UnicodeSet.
|
||||
* Uses standard naming convention.
|
||||
* @param source the collection whose elements (as strings) are added to this set
|
||||
* @return a reference to this object
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet addAll(Collection<?> source) {
|
||||
public UnicodeSet addAll(Iterable<?> source) {
|
||||
checkFrozen();
|
||||
for (Object o : source) {
|
||||
add(o.toString());
|
||||
|
@ -3104,7 +3126,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// Reference comparison ok; VersionInfo caches and reuses
|
||||
// unique objects.
|
||||
return v != NO_VERSION &&
|
||||
v.compareTo(version) <= 0;
|
||||
v.compareTo(version) <= 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3297,7 +3319,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
|
||||
return applyPropertyAlias(propertyAlias, valueAlias, null);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Modifies this set to contain those code points which have the
|
||||
* given value for the given property. Prior contents of this
|
||||
|
@ -3321,7 +3343,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
&& ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
if (XSYMBOL_TABLE != null) {
|
||||
if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
|
||||
return this;
|
||||
|
@ -3476,8 +3498,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
return pattern.regionMatches(pos, "[:", 0, 2) ||
|
||||
pattern.regionMatches(true, pos, "\\p", 0, 2) ||
|
||||
pattern.regionMatches(pos, "\\N", 0, 2);
|
||||
pattern.regionMatches(true, pos, "\\p", 0, 2) ||
|
||||
pattern.regionMatches(pos, "\\N", 0, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -3879,17 +3901,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// Optimize contains() and span() and similar functions.
|
||||
if (!strings.isEmpty()) {
|
||||
stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
|
||||
if (!stringSpan.needsStringSpanUTF16()) {
|
||||
// All strings are irrelevant for span() etc. because
|
||||
// all of each string's code points are contained in this set.
|
||||
// Do not check needsStringSpanUTF8() because UTF-8 has at most as
|
||||
// many relevant strings as UTF-16.
|
||||
// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
|
||||
stringSpan = null;
|
||||
}
|
||||
}
|
||||
if (stringSpan == null) {
|
||||
// No span-relevant strings: Optimize for code point spans.
|
||||
if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
|
||||
// Optimize for code point spans.
|
||||
// There are no strings, or
|
||||
// all strings are irrelevant for span() etc. because
|
||||
// all of each string's code points are contained in this set.
|
||||
// However, fully contained strings are relevant for spanAndCount(),
|
||||
// so we create both objects.
|
||||
bmpSet = new BMPSet(list, len);
|
||||
}
|
||||
}
|
||||
|
@ -3898,7 +3917,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
|
||||
/**
|
||||
* Span a string using this UnicodeSet.
|
||||
*
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
* @param s The string to be spanned
|
||||
* @param spanCondition The span condition
|
||||
* @return the length of the span
|
||||
|
@ -3912,7 +3931,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* Span a string using this UnicodeSet.
|
||||
* If the start index is less than 0, span will start from 0.
|
||||
* If the start index is greater than the string length, span returns the string length.
|
||||
*
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
* @param s The string to be spanned
|
||||
* @param start The start index that the span begins
|
||||
* @param spanCondition The span condition
|
||||
|
@ -3927,52 +3946,97 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
return end;
|
||||
}
|
||||
if (bmpSet != null) {
|
||||
return start + bmpSet.span(s, start, end, spanCondition);
|
||||
// Frozen set without strings, or no string is relevant for span().
|
||||
return bmpSet.span(s, start, spanCondition, null);
|
||||
}
|
||||
int len = end - start;
|
||||
if (stringSpan != null) {
|
||||
return start + stringSpan.span(s, start, len, spanCondition);
|
||||
return stringSpan.span(s, start, spanCondition);
|
||||
} else if (!strings.isEmpty()) {
|
||||
int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
|
||||
: UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
|
||||
if (strSpan.needsStringSpanUTF16()) {
|
||||
return start + strSpan.span(s, start, len, spanCondition);
|
||||
return strSpan.span(s, start, spanCondition);
|
||||
}
|
||||
}
|
||||
|
||||
return spanCodePointsAndCount(s, start, spanCondition, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as span() but also counts the smallest number of set elements on any path across the span.
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
* @param outCount An output-only object (must not be null) for returning the count.
|
||||
* @return the limit (exclusive end) of the span
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
|
||||
if (outCount == null) {
|
||||
throw new IllegalArgumentException("outCount must not be null");
|
||||
}
|
||||
int end = s.length();
|
||||
if (start < 0) {
|
||||
start = 0;
|
||||
} else if (start >= end) {
|
||||
return end;
|
||||
}
|
||||
if (stringSpan != null) {
|
||||
// We might also have bmpSet != null,
|
||||
// but fully-contained strings are relevant for counting elements.
|
||||
return stringSpan.spanAndCount(s, start, spanCondition, outCount);
|
||||
} else if (bmpSet != null) {
|
||||
return bmpSet.span(s, start, spanCondition, outCount);
|
||||
} else if (!strings.isEmpty()) {
|
||||
int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
|
||||
: UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
|
||||
which |= UnicodeSetStringSpan.WITH_COUNT;
|
||||
UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
|
||||
return strSpan.spanAndCount(s, start, spanCondition, outCount);
|
||||
}
|
||||
|
||||
return spanCodePointsAndCount(s, start, spanCondition, outCount);
|
||||
}
|
||||
|
||||
private int spanCodePointsAndCount(CharSequence s, int start,
|
||||
SpanCondition spanCondition, OutputInt outCount) {
|
||||
// Pin to 0/1 values.
|
||||
boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
|
||||
|
||||
int c;
|
||||
int next = start;
|
||||
int length = s.length();
|
||||
int count = 0;
|
||||
do {
|
||||
c = Character.codePointAt(s, next);
|
||||
if (spanContained != contains(c)) {
|
||||
break;
|
||||
}
|
||||
next = Character.offsetByCodePoints(s, next, 1);
|
||||
} while (next < end);
|
||||
++count;
|
||||
next += Character.charCount(c);
|
||||
} while (next < length);
|
||||
if (outCount != null) { outCount.value = count; }
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* Span a string backwards (from the end) using this UnicodeSet.
|
||||
*
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
* @param s The string to be spanned
|
||||
* @param spanCondition The span condition
|
||||
* @return The string index which starts the span (i.e. inclusive).
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public int spanBack(CharSequence s, SpanCondition spanCondition) {
|
||||
return spanBack(s, s.length(), spanCondition);
|
||||
return spanBack(s, s.length(), spanCondition);
|
||||
}
|
||||
|
||||
/**
|
||||
* Span a string backwards (from the fromIndex) using this UnicodeSet.
|
||||
* If the fromIndex is less than 0, spanBack will return 0.
|
||||
* If fromIndex is greater than the string length, spanBack will start from the string length.
|
||||
*
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
* @param s The string to be spanned
|
||||
* @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
|
||||
* @param spanCondition The span condition
|
||||
|
@ -3987,6 +4051,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
fromIndex = s.length();
|
||||
}
|
||||
if (bmpSet != null) {
|
||||
// Frozen set without strings, or no string is relevant for spanBack().
|
||||
return bmpSet.spanBack(s, fromIndex, spanCondition);
|
||||
}
|
||||
if (stringSpan != null) {
|
||||
|
@ -3994,7 +4059,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
} else if (!strings.isEmpty()) {
|
||||
int which = (spanCondition == SpanCondition.NOT_CONTAINED)
|
||||
? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
|
||||
: UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
|
||||
: UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
|
||||
if (strSpan.needsStringSpanUTF16()) {
|
||||
return strSpan.spanBack(s, fromIndex, spanCondition);
|
||||
|
@ -4011,20 +4076,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
if (spanContained != contains(c)) {
|
||||
break;
|
||||
}
|
||||
prev = Character.offsetByCodePoints(s, prev, -1);
|
||||
prev -= Character.charCount(c);
|
||||
} while (prev > 0);
|
||||
return prev;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone a thawed version of this class, according to the Freezable interface.
|
||||
* @return this
|
||||
* @return the clone, not frozen
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet cloneAsThawed() {
|
||||
UnicodeSet result = (UnicodeSet) clone();
|
||||
result.bmpSet = null;
|
||||
result.stringSpan = null;
|
||||
UnicodeSet result = new UnicodeSet(this);
|
||||
assert !result.isFrozen();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -4039,6 +4103,80 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// Additional methods for integration with Generics and Collections
|
||||
// ************************
|
||||
|
||||
/**
|
||||
* A struct-like class used for iteration through ranges, for faster iteration than by String.
|
||||
* Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
|
||||
*/
|
||||
public static class EntryRange {
|
||||
/**
|
||||
* The starting code point of the range.
|
||||
*/
|
||||
public int codepoint;
|
||||
/**
|
||||
* The ending code point of the range
|
||||
*/
|
||||
public int codepointEnd;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuffer b = new StringBuffer();
|
||||
return (
|
||||
codepoint == codepointEnd ? _appendToPat(b, codepoint, false)
|
||||
: _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false))
|
||||
.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides faster iteration than by String. Returns an iterable over the code point ranges. The UnicodeSet
|
||||
* must not be altered during the iteration. The EntryRange is the same each time; the contents are just reset.
|
||||
* <br><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
|
||||
*
|
||||
* <pre>
|
||||
* // Sample code
|
||||
* for (EntryRange range : us1.ranges()) {
|
||||
* // do something with code points between range.codepoint and range.codepointEnd;
|
||||
* }
|
||||
* for (String s : us1.strings()) {
|
||||
* // do something with each string;
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
public Iterable<EntryRange> ranges() {
|
||||
return new EntryRanges();
|
||||
}
|
||||
|
||||
private class EntryRanges implements Iterable<EntryRange>, Iterator<EntryRange> {
|
||||
int pos;
|
||||
EntryRange result = new EntryRange();
|
||||
// Iterator<String> stringIterator = strings == null ? null : strings.iterator();
|
||||
|
||||
public Iterator<EntryRange> iterator() {
|
||||
return this;
|
||||
}
|
||||
public boolean hasNext() {
|
||||
return pos < len-1
|
||||
// || (stringIterator != null && stringIterator.hasNext())
|
||||
;
|
||||
}
|
||||
public EntryRange next() {
|
||||
if (pos < len-1) {
|
||||
result.codepoint = list[pos++];
|
||||
result.codepointEnd = list[pos++]-1;
|
||||
// result.string = null;
|
||||
} else {
|
||||
throw new ArrayIndexOutOfBoundsException(pos);
|
||||
// result.codepoint = -1;
|
||||
// result.string = stringIterator.next();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
|
||||
* @see java.util.Set#iterator()
|
||||
|
@ -4129,8 +4267,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #containsAll(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public boolean containsAll(Collection<String> collection) {
|
||||
for (String o : collection) {
|
||||
public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
|
||||
for (T o : collection) {
|
||||
if (!contains(o)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -4142,8 +4280,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #containsNone(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public boolean containsNone(Collection<String> collection) {
|
||||
for (String o : collection) {
|
||||
public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
|
||||
for (T o : collection) {
|
||||
if (contains(o)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -4155,7 +4293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #containsAll(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public final boolean containsSome(Collection<String> collection) {
|
||||
public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
|
||||
return !containsNone(collection);
|
||||
}
|
||||
|
||||
|
@ -4163,9 +4301,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #addAll(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet addAll(String... collection) {
|
||||
public <T extends CharSequence> UnicodeSet addAll(T... collection) {
|
||||
checkFrozen();
|
||||
for (String str : collection) {
|
||||
for (T str : collection) {
|
||||
add(str);
|
||||
}
|
||||
return this;
|
||||
|
@ -4176,9 +4314,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #removeAll(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet removeAll(Collection<String> collection) {
|
||||
public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
|
||||
checkFrozen();
|
||||
for (String o : collection) {
|
||||
for (T o : collection) {
|
||||
remove(o);
|
||||
}
|
||||
return this;
|
||||
|
@ -4188,7 +4326,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @see #retainAll(com.ibm.icu.text.UnicodeSet)
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public UnicodeSet retainAll(Collection<String> collection) {
|
||||
public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
|
||||
checkFrozen();
|
||||
// TODO optimize
|
||||
UnicodeSet toRetain = new UnicodeSet();
|
||||
|
@ -4277,7 +4415,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
|
||||
public static int compare(String string, int codePoint) {
|
||||
public static int compare(CharSequence string, int codePoint) {
|
||||
return CharSequences.compare(string, codePoint);
|
||||
}
|
||||
|
||||
|
@ -4288,7 +4426,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* Note that this (=String) order is UTF-16 order -- *not* code point order.
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public static int compare(int codePoint, String string) {
|
||||
public static int compare(int codePoint, CharSequence string) {
|
||||
return -CharSequences.compare(string, codePoint);
|
||||
}
|
||||
|
||||
|
@ -4304,7 +4442,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
|
||||
return compare(collection1.iterator(), collection2.iterator());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
|
||||
* like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
|
||||
|
@ -4378,7 +4516,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* </pre>
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public Iterable<String> strings() {
|
||||
public Collection<String> strings() {
|
||||
return Collections.unmodifiableSortedSet(strings);
|
||||
}
|
||||
|
||||
|
@ -4417,7 +4555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
|
||||
* If there is no match, length is returned.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
* @deprecated This API is ICU internal only. Use span instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public int findIn(CharSequence value, int fromIndex, boolean findNot) {
|
||||
|
@ -4438,7 +4576,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* If there is no match, -1 is returned.
|
||||
* BEFORE index is not in the UnicodeSet.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
* @deprecated This API is ICU internal only. Use spanBack instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
|
||||
|
@ -4460,7 +4598,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
|
||||
* @return The string after it has been stripped.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
* @deprecated This API is ICU internal only. Use replaceFrom.
|
||||
*/
|
||||
@Deprecated
|
||||
public String stripFrom(CharSequence source, boolean matches) {
|
||||
|
@ -4593,6 +4731,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
*/
|
||||
@Deprecated
|
||||
public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
|
||||
INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
|
||||
XSYMBOL_TABLE = xSymbolTable;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
||||
import com.ibm.icu.util.OutputInt;
|
||||
|
||||
/**
|
||||
* A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
|
||||
* An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
|
||||
*/
|
||||
public class UnicodeSetSpanner {
|
||||
|
||||
private final UnicodeSet unicodeSet;
|
||||
|
||||
/**
|
||||
* Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
|
||||
* can be used with a non-frozen version to avoid the cost of freezing.
|
||||
*
|
||||
* @param source
|
||||
* the original UnicodeSet
|
||||
*/
|
||||
public UnicodeSetSpanner(UnicodeSet source) {
|
||||
unicodeSet = source;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the UnicodeSet used for processing. It is frozen iff the original was.
|
||||
*
|
||||
* @return the construction set.
|
||||
*/
|
||||
public UnicodeSet getUnicodeSet() {
|
||||
return unicodeSet;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Object#equals(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Object#hashCode()
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return unicodeSet.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for replaceFrom and countIn to control how to treat each matched span. The name is from "quantifier" as used in regex,
|
||||
* since it is similar to whether one is replacing [abc] by x, or [abc]* by x.
|
||||
*
|
||||
*/
|
||||
public enum Quantifier {
|
||||
/**
|
||||
* Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
|
||||
* code points.
|
||||
*
|
||||
*/
|
||||
SPAN,
|
||||
/**
|
||||
* Use the smallest number of elements in the spanned range for counting and modification. In other words, the "longest matches" are
|
||||
* used where possible. If there are no strings, this will be the same as code points.
|
||||
* <p>For example, in the string "abab":
|
||||
* <ul>
|
||||
* <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
|
||||
* <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
|
||||
* <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
|
||||
* </ul>
|
||||
*/
|
||||
MIN_ELEMENTS,
|
||||
// Note: could in the future have an additional option MAX_ELEMENTS
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of matching characters found in a character sequence, counting by Quantifier.MIN_ELEMENTS using SpanCondition.CONTAINED.
|
||||
*
|
||||
* @param sequence
|
||||
* the sequence to count characters in
|
||||
* @return the count. Zero if there are none.
|
||||
*/
|
||||
public int countIn(CharSequence sequence) {
|
||||
return countIn(sequence, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of matching characters found in a character sequence, counting by the given quantifier using SpanCondition.CONTAINED.
|
||||
*
|
||||
* @param sequence
|
||||
* the sequence to count characters in
|
||||
* @return the count. Zero if there are none.
|
||||
*/
|
||||
public int countIn(CharSequence sequence, Quantifier quantifier) {
|
||||
return countIn(sequence, quantifier, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of matching characters found in a character sequence.
|
||||
*
|
||||
* @param sequence
|
||||
* the sequence to count characters in
|
||||
* @param quantifier
|
||||
* (optional) whether to treat the entire span as a match, or individual code points
|
||||
* @param countSpan
|
||||
* (optional) the spanCondition to use. CONTAINED means only count the code points in the CONTAINED span;
|
||||
* NOT_CONTAINED is the reverse.
|
||||
* @return the count. Zero if there are none.
|
||||
*/
|
||||
public int countIn(CharSequence sequence, Quantifier quantifier, SpanCondition countSpan) {
|
||||
int count = 0;
|
||||
int start = 0;
|
||||
SpanCondition skipSpan = countSpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
|
||||
: SpanCondition.CONTAINED;
|
||||
final int length = sequence.length();
|
||||
OutputInt spanCount = new OutputInt();
|
||||
while (start != length) {
|
||||
int endNotContained = unicodeSet.span(sequence, start, skipSpan);
|
||||
if (endNotContained == length) {
|
||||
break;
|
||||
}
|
||||
start = unicodeSet.spanAndCount(sequence, endNotContained, countSpan, spanCount);
|
||||
count += quantifier == Quantifier.SPAN ? 1 : spanCount.value;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all the matching spans in sequence, using SpanCondition.CONTAINED
|
||||
*
|
||||
* @param sequence
|
||||
* charsequence to replace matching spans in.
|
||||
* @return modified string.
|
||||
*/
|
||||
public String deleteFrom(CharSequence sequence) {
|
||||
return replaceFrom(sequence, "", Quantifier.SPAN, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all matching spans in sequence, according to the operations.
|
||||
*
|
||||
* @param sequence
|
||||
* charsequence to replace matching spans in.
|
||||
* @param modifySpan
|
||||
* specify whether to modify the matching spans (CONTAINED) or the non-matching (NOT_CONTAINED)
|
||||
* @return modified string.
|
||||
*/
|
||||
public String deleteFrom(CharSequence sequence, SpanCondition modifySpan) {
|
||||
return replaceFrom(sequence, "", Quantifier.SPAN, modifySpan);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace all matching spans in sequence by the replacement,
|
||||
* counting by Quantifier.MIN_ELEMENTS using SpanCondition.CONTAINED.
|
||||
*
|
||||
* @param sequence
|
||||
* charsequence to replace matching spans in.
|
||||
* @param replacement
|
||||
* replacement sequence. To delete, use ""
|
||||
* @return modified string.
|
||||
*/
|
||||
public String replaceFrom(CharSequence sequence, CharSequence replacement) {
|
||||
return replaceFrom(sequence, replacement, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace all matching spans in sequence by replacement, according to the Quantifier, using SpanCondition.CONTAINED.
|
||||
*
|
||||
* @param sequence
|
||||
* charsequence to replace matching spans in.
|
||||
* @param replacement
|
||||
* replacement sequence. To delete, use ""
|
||||
* @param quantifier
|
||||
* whether to treat the entire span as a match, or individual code points
|
||||
* @return modified string.
|
||||
*/
|
||||
public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier) {
|
||||
return replaceFrom(sequence, replacement, quantifier, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace all matching spans in sequence by replacement, according to the operations quantifier and modifySpan.
|
||||
*
|
||||
* @param sequence
|
||||
* charsequence to replace matching spans in.
|
||||
* @param replacement
|
||||
* replacement sequence. To delete, use ""
|
||||
* @param modifySpan
|
||||
* (optional) specify whether to modify the matching spans (CONTAINED) or the non-matching
|
||||
* (NOT_CONTAINED)
|
||||
* @param quantifier
|
||||
* (optional) specify whether to collapse or do codepoint by codepoint.
|
||||
* @return modified string.
|
||||
*/
|
||||
public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier,
|
||||
SpanCondition modifySpan) {
|
||||
SpanCondition copySpan = modifySpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
|
||||
: SpanCondition.CONTAINED;
|
||||
final boolean remove = replacement.length() == 0;
|
||||
StringBuilder result = new StringBuilder();
|
||||
// TODO, we can optimize this to
|
||||
// avoid this allocation unless needed
|
||||
|
||||
final int length = sequence.length();
|
||||
OutputInt spanCount = new OutputInt();
|
||||
for (int endCopy = 0; endCopy != length;) {
|
||||
int endModify = unicodeSet.spanAndCount(sequence, endCopy, modifySpan, spanCount);
|
||||
if (remove || endModify == 0) {
|
||||
// do nothing
|
||||
} else if (quantifier == Quantifier.SPAN) {
|
||||
result.append(replacement);
|
||||
} else {
|
||||
for (int i = spanCount.value; i > 0; --i) {
|
||||
result.append(replacement);
|
||||
}
|
||||
}
|
||||
if (endModify == length) {
|
||||
break;
|
||||
}
|
||||
endCopy = unicodeSet.span(sequence, endModify, copySpan);
|
||||
result.append(sequence.subSequence(endModify, endCopy));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for the trim() method
|
||||
*
|
||||
*/
|
||||
public enum TrimOption {
|
||||
/**
|
||||
* Trim leading spans (subject to INVERT).
|
||||
*
|
||||
*/
|
||||
LEADING,
|
||||
/**
|
||||
* Trim leading and trailing spans (subject to INVERT).
|
||||
*
|
||||
*/
|
||||
BOTH,
|
||||
/**
|
||||
* Trim trailing spans (subject to INVERT).
|
||||
*
|
||||
*/
|
||||
TRAILING;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start and
|
||||
* end of the string, using TrimOption.BOTH and SpanCondition.CONTAINED. For example:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
*
|
||||
* new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab")}
|
||||
* </pre>
|
||||
*
|
||||
* ... returns {@code "cat"}.
|
||||
*
|
||||
*/
|
||||
public CharSequence trim(CharSequence sequence) {
|
||||
return trim(sequence, TrimOption.BOTH, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
|
||||
* end of the string, using the trimOption and SpanCondition.CONTAINED. For example:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
*
|
||||
* new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab", TrimOption.LEADING)}
|
||||
* </pre>
|
||||
*
|
||||
* ... returns {@code "catbab"}.
|
||||
*
|
||||
*/
|
||||
public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
|
||||
return trim(sequence, trimOption, SpanCondition.CONTAINED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
|
||||
* end of the string, depending on the trimOption and modifySpan. For example:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
*
|
||||
* new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab", TrimOption.LEADING, SpanCondition.CONTAINED)}
|
||||
* </pre>
|
||||
*
|
||||
* ... returns {@code "catbab"}.
|
||||
*
|
||||
* @param sequence
|
||||
* the sequence to trim
|
||||
* @param trimOption
|
||||
* (optional) LEADING, TRAILING, or BOTH
|
||||
* @param modifySpan
|
||||
* (optional) CONTAINED or NOT_CONTAINED
|
||||
* @return a subsequence
|
||||
*/
|
||||
public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition modifySpan) {
|
||||
int endLeadContained, startTrailContained;
|
||||
final int length = sequence.length();
|
||||
if (trimOption != TrimOption.TRAILING) {
|
||||
endLeadContained = unicodeSet.span(sequence, modifySpan);
|
||||
if (endLeadContained == length) {
|
||||
return "";
|
||||
}
|
||||
} else {
|
||||
endLeadContained = 0;
|
||||
}
|
||||
if (trimOption != TrimOption.LEADING) {
|
||||
startTrailContained = unicodeSet.spanBack(sequence, modifySpan);
|
||||
} else {
|
||||
startTrailContained = length;
|
||||
}
|
||||
return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
|
||||
endLeadContained, startTrailContained);
|
||||
}
|
||||
|
||||
}
|
58
icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java
Normal file
58
icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java
Normal file
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.util;
|
||||
|
||||
/**
|
||||
* Simple struct-like class for int output parameters.
|
||||
* Like {@code Output<Integer>} but without auto-boxing.
|
||||
*
|
||||
* @internal but could become public
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public class OutputInt {
|
||||
/**
|
||||
* The value field.
|
||||
*
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public int value;
|
||||
|
||||
/**
|
||||
* Constructs an <code>OutputInt</code> with value 0.
|
||||
*
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public OutputInt() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an <code>OutputInt</code> with the given value.
|
||||
*
|
||||
* @param value the initial value
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public OutputInt(int value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public String toString() {
|
||||
return Integer.toString(value);
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -13,6 +13,7 @@ import com.ibm.icu.impl.Utility;
|
|||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UTF16.StringComparator;
|
||||
|
||||
/**
|
||||
* Testing class for UTF16
|
||||
|
@ -1560,6 +1561,39 @@ public final class UTF16Test extends TestFmwk
|
|||
}
|
||||
}
|
||||
|
||||
public void TestUtilities() {
|
||||
String[] tests = {
|
||||
"a",
|
||||
"\uFFFF",
|
||||
"😀",
|
||||
"\uD800",
|
||||
"\uDC00",
|
||||
"\uDBFF\uDfff",
|
||||
"",
|
||||
"\u0000",
|
||||
"\uDC00\uD800",
|
||||
"ab",
|
||||
"😀a",
|
||||
null,
|
||||
};
|
||||
StringComparator sc = new UTF16.StringComparator(true,false,0);
|
||||
for (String item1 : tests) {
|
||||
String nonNull1 = item1 == null ? "" : item1;
|
||||
int count = UTF16.countCodePoint(nonNull1);
|
||||
int expected = count == 0 || count > 1 ? -1 : nonNull1.codePointAt(0);
|
||||
assertEquals("codepoint test " + Utility.hex(nonNull1), expected, UTF16.getSingleCodePoint(item1));
|
||||
if (expected == -1) {
|
||||
continue;
|
||||
}
|
||||
for (String item2 : tests) {
|
||||
String nonNull2 = item2 == null ? "" : item2;
|
||||
int scValue = Integer.signum(sc.compare(nonNull1, nonNull2));
|
||||
int fValue = Integer.signum(UTF16.compareCodePoint(expected, item2));
|
||||
assertEquals("comparison " + Utility.hex(nonNull1) + ", " + Utility.hex(nonNull2), scValue, fValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestNewString() {
|
||||
final int[] codePoints = {
|
||||
UCharacter.toCodePoint(UCharacter.MIN_HIGH_SURROGATE, UCharacter.MAX_LOW_SURROGATE),
|
||||
|
@ -1568,6 +1602,7 @@ public final class UTF16Test extends TestFmwk
|
|||
'A',
|
||||
-1,
|
||||
};
|
||||
|
||||
|
||||
final String cpString = "" +
|
||||
UCharacter.MIN_HIGH_SURROGATE +
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2011, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
* Copyright (C) 2009-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.lang;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.OutputInt;
|
||||
|
||||
/**
|
||||
* @test
|
||||
|
@ -41,7 +43,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
}
|
||||
pos = set.span(string, 1, SpanCondition.SIMPLE);
|
||||
if (pos != 3) {
|
||||
errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
|
||||
errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)",
|
||||
set.toString(), string, pos));
|
||||
}
|
||||
}
|
||||
|
@ -129,33 +131,15 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
// more complex test. --------------------------------------------------------
|
||||
|
||||
// Make the strings in a UnicodeSet easily accessible.
|
||||
static class UnicodeSetWithStrings {
|
||||
|
||||
private static class UnicodeSetWithStrings {
|
||||
private UnicodeSet set;
|
||||
|
||||
private String strings[];
|
||||
private Collection<String> setStrings;
|
||||
private int stringsLength;
|
||||
private boolean hasSurrogates;
|
||||
|
||||
public UnicodeSetWithStrings(final UnicodeSet normalSet) {
|
||||
set = normalSet;
|
||||
stringsLength = 0;
|
||||
hasSurrogates = false;
|
||||
strings = new String[20];
|
||||
int size = set.size();
|
||||
if (size > 0 && set.charAt(size - 1) < 0) {
|
||||
// If a set's last element is not a code point, then it must contain strings.
|
||||
// Iterate over the set, skip all code point ranges, and cache the strings.
|
||||
UnicodeSetIterator iter = new UnicodeSetIterator(set);
|
||||
while (iter.nextRange() && stringsLength < strings.length) {
|
||||
if (iter.codepoint == UnicodeSetIterator.IS_STRING) {
|
||||
// Store the pointer to the set's string element
|
||||
// which we happen to know is a stable pointer.
|
||||
strings[stringsLength] = iter.getString();
|
||||
++stringsLength;
|
||||
}
|
||||
}
|
||||
}
|
||||
setStrings = normalSet.strings();
|
||||
stringsLength = setStrings.size();
|
||||
}
|
||||
|
||||
public final UnicodeSet getSet() {
|
||||
|
@ -166,34 +150,9 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
return (stringsLength > 0);
|
||||
}
|
||||
|
||||
public boolean hasStringsWithSurrogates() {
|
||||
return hasSurrogates;
|
||||
public Iterable<String> strings() {
|
||||
return setStrings;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class UnicodeSetWithStringsIterator {
|
||||
|
||||
private UnicodeSetWithStrings fSet;
|
||||
private int nextStringIndex;
|
||||
|
||||
public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) {
|
||||
fSet = set;
|
||||
nextStringIndex = 0;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
nextStringIndex = 0;
|
||||
}
|
||||
|
||||
public final String nextString() {
|
||||
if (nextStringIndex < fSet.stringsLength) {
|
||||
return fSet.strings[nextStringIndex++];
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Compare 16-bit Unicode strings (which may be malformed UTF-16)
|
||||
|
@ -231,7 +190,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
}
|
||||
return prev;
|
||||
} else if (spanCondition == SpanCondition.NOT_CONTAINED) {
|
||||
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
|
||||
int c;
|
||||
int start, next;
|
||||
for (start = next = 0; start < length;) {
|
||||
|
@ -240,9 +198,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
if (realSet.contains(c)) {
|
||||
break;
|
||||
}
|
||||
String str;
|
||||
iter.reset();
|
||||
while ((str = iter.nextString()) != null) {
|
||||
for (String str : set.strings()) {
|
||||
if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
|
||||
// spanNeedsStrings=true;
|
||||
return start;
|
||||
|
@ -252,7 +208,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
}
|
||||
return start;
|
||||
} else /* CONTAINED or SIMPLE */{
|
||||
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
|
||||
int c;
|
||||
int start, next, maxSpanLimit = 0;
|
||||
for (start = next = 0; start < length;) {
|
||||
|
@ -261,9 +216,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
if (!realSet.contains(c)) {
|
||||
next = start; // Do not span this single, not-contained code point.
|
||||
}
|
||||
String str;
|
||||
iter.reset();
|
||||
while ((str = iter.nextString()) != null) {
|
||||
for (String str : set.strings()) {
|
||||
if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
|
||||
// spanNeedsStrings=true;
|
||||
int matchLimit = start + str.length();
|
||||
|
@ -336,7 +289,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
} while (prev > 0);
|
||||
return prev;
|
||||
} else if (spanCondition == SpanCondition.NOT_CONTAINED) {
|
||||
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
|
||||
int c;
|
||||
int prev = length, length0 = length;
|
||||
do {
|
||||
|
@ -344,9 +296,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
if (realSet.contains(c)) {
|
||||
break;
|
||||
}
|
||||
String str;
|
||||
iter.reset();
|
||||
while ((str = iter.nextString()) != null) {
|
||||
for (String str : set.strings()) {
|
||||
if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
|
||||
// spanNeedsStrings=true;
|
||||
return prev;
|
||||
|
@ -356,7 +306,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
} while (prev > 0);
|
||||
return prev;
|
||||
} else /* SpanCondition.CONTAINED or SIMPLE */{
|
||||
UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
|
||||
int c;
|
||||
int prev = length, minSpanStart = length, length0 = length;
|
||||
do {
|
||||
|
@ -365,9 +314,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
if (!realSet.contains(c)) {
|
||||
length = prev; // Do not span this single, not-contained code point.
|
||||
}
|
||||
String str;
|
||||
iter.reset();
|
||||
while ((str = iter.nextString()) != null) {
|
||||
for (String str : set.strings()) {
|
||||
if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
|
||||
// spanNeedsStrings=true;
|
||||
int matchStart = prev - str.length();
|
||||
|
@ -616,7 +563,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
* input expectCount<0).
|
||||
*/
|
||||
void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
|
||||
int expectLimits[], int expectCount, // TODO
|
||||
int expectLimits[], int expectCount,
|
||||
final String testName, int index) {
|
||||
int[] limits = new int[500];
|
||||
int limitsCount;
|
||||
|
@ -1129,4 +1076,54 @@ public class UnicodeSetStringSpanTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
public void TestSpanAndCount() {
|
||||
// a set with no strings
|
||||
UnicodeSet abc = new UnicodeSet('a', 'c');
|
||||
// a set with an "irrelevant" string (fully contained in the code point set)
|
||||
UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n");
|
||||
// a set with no "irrelevant" string but some interesting overlaps
|
||||
UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd");
|
||||
String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde";
|
||||
OutputInt count = new OutputInt();
|
||||
assertEquals("abc span[8, 11[", 11,
|
||||
abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
|
||||
assertEquals("abc count=3", 3, count.value);
|
||||
assertEquals("no abc span[2, 8[", 8,
|
||||
abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
|
||||
assertEquals("no abc count=5", 5, count.value);
|
||||
assertEquals("line endings span[2, 6[", 6,
|
||||
crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
|
||||
assertEquals("line endings count=3", 3, count.value);
|
||||
assertEquals("no ab+cd span[2, 8[", 8,
|
||||
ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
|
||||
assertEquals("no ab+cd count=5", 5, count.value);
|
||||
assertEquals("ab+cd span[8, 12[", 12,
|
||||
ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
|
||||
assertEquals("ab+cd count=2", 2, count.value);
|
||||
assertEquals("1x abc span[8, 11[", 11,
|
||||
ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
|
||||
assertEquals("1x abc count=1", 1, count.value);
|
||||
|
||||
abc.freeze();
|
||||
crlf.freeze();
|
||||
ab_cd.freeze();
|
||||
assertEquals("abc span[8, 11[ (frozen)", 11,
|
||||
abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
|
||||
assertEquals("abc count=3 (frozen)", 3, count.value);
|
||||
assertEquals("no abc span[2, 8[ (frozen)", 8,
|
||||
abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
|
||||
assertEquals("no abc count=5 (frozen)", 5, count.value);
|
||||
assertEquals("line endings span[2, 6[ (frozen)", 6,
|
||||
crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
|
||||
assertEquals("line endings count=3 (frozen)", 3, count.value);
|
||||
assertEquals("no ab+cd span[2, 8[ (frozen)", 8,
|
||||
ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
|
||||
assertEquals("no ab+cd count=5 (frozen)", 5, count.value);
|
||||
assertEquals("ab+cd span[8, 12[ (frozen)", 12,
|
||||
ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
|
||||
assertEquals("ab+cd count=2 (frozen)", 2, count.value);
|
||||
assertEquals("1x abc span[8, 11[ (frozen)", 11,
|
||||
ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
|
||||
assertEquals("1x abc count=1 (frozen)", 1, count.value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.text.ParsePosition;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -22,6 +23,7 @@ import java.util.SortedSet;
|
|||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.dev.util.CollectionUtilities;
|
||||
import com.ibm.icu.impl.SortedSetRelation;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
@ -33,6 +35,11 @@ import com.ibm.icu.text.UTF16;
|
|||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSet.ComparisonStyle;
|
||||
import com.ibm.icu.text.UnicodeSet.EntryRange;
|
||||
import com.ibm.icu.text.UnicodeSetSpanner;
|
||||
import com.ibm.icu.text.UnicodeSetSpanner.Quantifier;
|
||||
import com.ibm.icu.text.UnicodeSet.SpanCondition;
|
||||
import com.ibm.icu.text.UnicodeSetSpanner.TrimOption;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
|
||||
/**
|
||||
|
@ -1256,10 +1263,10 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
String pat = "";
|
||||
try {
|
||||
String name =
|
||||
(j==0) ? UScript.getName(i) : UScript.getShortName(i);
|
||||
pat = "[:" + name + ":]";
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
logln("Ok: " + pat + " -> " + set.toPattern(false));
|
||||
(j==0) ? UScript.getName(i) : UScript.getShortName(i);
|
||||
pat = "[:" + name + ":]";
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
logln("Ok: " + pat + " -> " + set.toPattern(false));
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (pat.length() == 0) {
|
||||
errln("FAIL (in UScript): No name for script " + i);
|
||||
|
@ -1330,9 +1337,9 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
// The following pattern must contain at least one range "c-d"
|
||||
// where c or d is a Pattern_White_Space.
|
||||
String pattern =
|
||||
"[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
|
||||
"[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
|
||||
String exp =
|
||||
"[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
|
||||
"[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
|
||||
// We test this with two passes; in the second pass we
|
||||
// pre-unescape the pattern. Since U+200E is Pattern_White_Space,
|
||||
// this fails -- which is what we expect.
|
||||
|
@ -1563,7 +1570,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet<String>()));
|
||||
assertEquals("remove all", mod1, mod2);
|
||||
}
|
||||
|
||||
|
||||
public void TestComparison() {
|
||||
UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze();
|
||||
UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze();
|
||||
|
@ -1579,7 +1586,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
List<UnicodeSet> sorted = new ArrayList(new TreeSet<UnicodeSet>(unsorted));
|
||||
assertNotEquals("compareTo-shorter-first", unsorted, sorted);
|
||||
assertEquals("compareTo-shorter-first", goalShortest, sorted);
|
||||
|
||||
|
||||
TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
|
||||
public int compare(UnicodeSet o1, UnicodeSet o2) {
|
||||
// TODO Auto-generated method stub
|
||||
|
@ -1616,34 +1623,34 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
// now compare all the combinations. If any of them is a code point, use it.
|
||||
int maxErrorCount = 0;
|
||||
compare:
|
||||
for (String last : target) {
|
||||
for (String curr : target) {
|
||||
int lastCount = Character.codePointCount(last, 0, last.length());
|
||||
int currCount = Character.codePointCount(curr, 0, curr.length());
|
||||
int comparison;
|
||||
if (lastCount == 1) {
|
||||
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
|
||||
} else if (currCount == 1) {
|
||||
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
if (comparison != last.compareTo(curr)) {
|
||||
// repeat for debugging
|
||||
for (String last : target) {
|
||||
for (String curr : target) {
|
||||
int lastCount = Character.codePointCount(last, 0, last.length());
|
||||
int currCount = Character.codePointCount(curr, 0, curr.length());
|
||||
int comparison;
|
||||
if (lastCount == 1) {
|
||||
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
|
||||
} else if (currCount == 1) {
|
||||
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
if (maxErrorCount++ > 10) {
|
||||
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others...");
|
||||
break compare;
|
||||
if (comparison != last.compareTo(curr)) {
|
||||
// repeat for debugging
|
||||
if (lastCount == 1) {
|
||||
comparison = UnicodeSet.compare(last.codePointAt(0), curr);
|
||||
} else if (currCount == 1) {
|
||||
comparison = UnicodeSet.compare(last, curr.codePointAt(0));
|
||||
}
|
||||
if (maxErrorCount++ > 10) {
|
||||
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others...");
|
||||
break compare;
|
||||
}
|
||||
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr);
|
||||
}
|
||||
errln(maxErrorCount + " Failure in comparing " + last + " & " + curr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//compare(Iterable<T>, Iterable<T>)
|
||||
int max = 10;
|
||||
List<String> test1 = new ArrayList<String>(max);
|
||||
|
@ -1669,7 +1676,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
// check to make sure right exceptions are thrown
|
||||
Class expected = IllegalArgumentException.class;
|
||||
Class actual;
|
||||
|
||||
|
||||
try {
|
||||
actual = null;
|
||||
@SuppressWarnings("unused")
|
||||
|
@ -1678,7 +1685,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
actual = e.getClass();
|
||||
}
|
||||
assertEquals("exception if odd", expected, actual);
|
||||
|
||||
|
||||
try {
|
||||
actual = null;
|
||||
@SuppressWarnings("unused")
|
||||
|
@ -1687,7 +1694,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
actual = e.getClass();
|
||||
}
|
||||
assertEquals("exception for start/end problem", expected, actual);
|
||||
|
||||
|
||||
try {
|
||||
actual = null;
|
||||
@SuppressWarnings("unused")
|
||||
|
@ -1696,7 +1703,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
actual = e.getClass();
|
||||
}
|
||||
assertEquals("exception for end/start problem", expected, actual);
|
||||
|
||||
|
||||
CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]"));
|
||||
CheckRangeSpeed(1000, new UnicodeSet("[:letter:]"));
|
||||
}
|
||||
|
@ -1731,14 +1738,14 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
double rangeConstructorTime = (middle - start)/iterations;
|
||||
double patternConstructorTime = (end - middle)/iterations;
|
||||
String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t"
|
||||
+ percent.format(rangeConstructorTime/patternConstructorTime-1);
|
||||
+ percent.format(rangeConstructorTime/patternConstructorTime-1);
|
||||
if (rangeConstructorTime < 2*patternConstructorTime) {
|
||||
logln(message);
|
||||
} else {
|
||||
errln(message);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
NumberFormat percent = NumberFormat.getPercentInstance();
|
||||
{
|
||||
percent.setMaximumFractionDigits(2);
|
||||
|
@ -1806,69 +1813,69 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
// The following code block is commented out to eliminate PrettyPrinter dependencies
|
||||
// The following code block is commented out to eliminate PrettyPrinter dependencies
|
||||
|
||||
// String[] prettyData = {
|
||||
// "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case
|
||||
// "[:any:]",
|
||||
// "[:whitespace:]",
|
||||
// "[:linebreak=AL:]",
|
||||
// };
|
||||
//
|
||||
// public void TestPrettyPrinting() {
|
||||
// try{
|
||||
// PrettyPrinter pp = new PrettyPrinter();
|
||||
//
|
||||
// int i = 0;
|
||||
// for (; i < prettyData.length; ++i) {
|
||||
// UnicodeSet test = new UnicodeSet(prettyData[i]);
|
||||
// checkPrettySet(pp, i, test);
|
||||
// }
|
||||
// Random random = new Random(0);
|
||||
// UnicodeSet test = new UnicodeSet();
|
||||
//
|
||||
// // To keep runtimes under control, make the number of random test cases
|
||||
// // to try depend on the test framework's exhaustive setting.
|
||||
// // params.inclusions = 5: default exhaustive value
|
||||
// // params.inclusions = 10: max exhaustive value.
|
||||
// int iterations = 50;
|
||||
// if (params.inclusion > 5) {
|
||||
// iterations = (params.inclusion-5) * 200;
|
||||
// }
|
||||
// for (; i < iterations; ++i) {
|
||||
// double start = random.nextGaussian() * 0x10000;
|
||||
// if (start < 0) start = - start;
|
||||
// if (start > 0x10FFFF) {
|
||||
// start = 0x10FFFF;
|
||||
// }
|
||||
// double end = random.nextGaussian() * 0x100;
|
||||
// if (end < 0) end = -end;
|
||||
// end = start + end;
|
||||
// if (end > 0x10FFFF) {
|
||||
// end = 0x10FFFF;
|
||||
// }
|
||||
// test.complement((int)start, (int)end);
|
||||
// checkPrettySet(pp, i, test);
|
||||
// }
|
||||
// }catch(RuntimeException ex){
|
||||
// warnln("Could not load Collator");
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) {
|
||||
// String pretty = pp.toPattern(test);
|
||||
// UnicodeSet retry = new UnicodeSet(pretty);
|
||||
// if (!test.equals(retry)) {
|
||||
// errln(i + ". Failed test: " + test + " != " + pretty);
|
||||
// } else {
|
||||
// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private String truncate(String string) {
|
||||
// if (string.length() <= 100) return string;
|
||||
// return string.substring(0,97) + "...";
|
||||
// }
|
||||
// String[] prettyData = {
|
||||
// "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case
|
||||
// "[:any:]",
|
||||
// "[:whitespace:]",
|
||||
// "[:linebreak=AL:]",
|
||||
// };
|
||||
//
|
||||
// public void TestPrettyPrinting() {
|
||||
// try{
|
||||
// PrettyPrinter pp = new PrettyPrinter();
|
||||
//
|
||||
// int i = 0;
|
||||
// for (; i < prettyData.length; ++i) {
|
||||
// UnicodeSet test = new UnicodeSet(prettyData[i]);
|
||||
// checkPrettySet(pp, i, test);
|
||||
// }
|
||||
// Random random = new Random(0);
|
||||
// UnicodeSet test = new UnicodeSet();
|
||||
//
|
||||
// // To keep runtimes under control, make the number of random test cases
|
||||
// // to try depend on the test framework's exhaustive setting.
|
||||
// // params.inclusions = 5: default exhaustive value
|
||||
// // params.inclusions = 10: max exhaustive value.
|
||||
// int iterations = 50;
|
||||
// if (params.inclusion > 5) {
|
||||
// iterations = (params.inclusion-5) * 200;
|
||||
// }
|
||||
// for (; i < iterations; ++i) {
|
||||
// double start = random.nextGaussian() * 0x10000;
|
||||
// if (start < 0) start = - start;
|
||||
// if (start > 0x10FFFF) {
|
||||
// start = 0x10FFFF;
|
||||
// }
|
||||
// double end = random.nextGaussian() * 0x100;
|
||||
// if (end < 0) end = -end;
|
||||
// end = start + end;
|
||||
// if (end > 0x10FFFF) {
|
||||
// end = 0x10FFFF;
|
||||
// }
|
||||
// test.complement((int)start, (int)end);
|
||||
// checkPrettySet(pp, i, test);
|
||||
// }
|
||||
// }catch(RuntimeException ex){
|
||||
// warnln("Could not load Collator");
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) {
|
||||
// String pretty = pp.toPattern(test);
|
||||
// UnicodeSet retry = new UnicodeSet(pretty);
|
||||
// if (!test.equals(retry)) {
|
||||
// errln(i + ". Failed test: " + test + " != " + pretty);
|
||||
// } else {
|
||||
// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private String truncate(String string) {
|
||||
// if (string.length() <= 100) return string;
|
||||
// return string.substring(0,97) + "...";
|
||||
// }
|
||||
|
||||
public class TokenSymbolTable implements SymbolTable {
|
||||
HashMap contents = new HashMap();
|
||||
|
@ -1944,7 +1951,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
UnicodeSet set = new UnicodeSet(DATA[i]);
|
||||
expectContainment(set,
|
||||
CharsToUnicodeString("abc\\U00010000"),
|
||||
"\uD800;\uDC00"); // split apart surrogate-pair
|
||||
"\uD800;\uDC00"); // split apart surrogate-pair
|
||||
if (set.size() != 4) {
|
||||
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
|
||||
set.size() + ", expected 4"));
|
||||
|
@ -2171,16 +2178,16 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
|
||||
// Now see if the expected relation is true
|
||||
int status = (minus12.size() != 0 ? 4 : 0)
|
||||
| (intersection.size() != 0 ? 2 : 0)
|
||||
| (minus21.size() != 0 ? 1 : 0);
|
||||
| (intersection.size() != 0 ? 2 : 0)
|
||||
| (minus21.size() != 0 ? 1 : 0);
|
||||
|
||||
if (status != relation) {
|
||||
errln("FAIL relation incorrect" + message
|
||||
+ "; desired = " + RELATION_NAME[relation]
|
||||
+ "; found = " + RELATION_NAME[status]
|
||||
+ "; set1 = " + set1.toPattern(true)
|
||||
+ "; set2 = " + set2.toPattern(true)
|
||||
);
|
||||
+ "; found = " + RELATION_NAME[status]
|
||||
+ "; set1 = " + set1.toPattern(true)
|
||||
+ "; set2 = " + set2.toPattern(true)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2234,7 +2241,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
errln("FAIL " + message
|
||||
+ "; source = " + s.toPattern(true)
|
||||
+ "; result = " + t.toPattern(true)
|
||||
);
|
||||
);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -2379,7 +2386,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Tests the method public UnicodeSet add(Collection<?> source) */
|
||||
public void TestAddCollection() {
|
||||
UnicodeSet us = new UnicodeSet();
|
||||
|
@ -2390,9 +2397,99 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void TestConstants() {
|
||||
assertEquals("Empty", new UnicodeSet(), UnicodeSet.EMPTY);
|
||||
assertEquals("All", new UnicodeSet(0,0x10FFFF), UnicodeSet.ALL_CODE_POINTS);
|
||||
}
|
||||
|
||||
public void TestIteration() {
|
||||
UnicodeSet us1 = new UnicodeSet("[abcM{xy}]");
|
||||
assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", "));
|
||||
|
||||
// Sample code
|
||||
for (EntryRange range : us1.ranges()) {
|
||||
// do something with code points between range.codepoint and range.codepointEnd;
|
||||
}
|
||||
for (String s : us1.strings()) {
|
||||
// do something with each string;
|
||||
}
|
||||
|
||||
String[] tests = {
|
||||
"[M-Qzab{XY}{ZW}]",
|
||||
"[]",
|
||||
"[a]",
|
||||
"[a-c]",
|
||||
"[{XY}]",
|
||||
};
|
||||
for (String test : tests) {
|
||||
UnicodeSet us = new UnicodeSet(test);
|
||||
UnicodeSetIterator it = new UnicodeSetIterator(us);
|
||||
for (EntryRange range : us.ranges()) {
|
||||
final String title = range.toString();
|
||||
logln(title);
|
||||
it.nextRange();
|
||||
assertEquals(title, it.codepoint, range.codepoint);
|
||||
assertEquals(title, it.codepointEnd, range.codepointEnd);
|
||||
// if (range.codepoint != -1) {
|
||||
// } else {
|
||||
// assertEquals(title, it.string, range.string);
|
||||
// }
|
||||
}
|
||||
for (String s : us.strings()) {
|
||||
it.nextRange();
|
||||
assertEquals("strings", it.string, s);
|
||||
}
|
||||
assertFalse("", it.next());
|
||||
}
|
||||
}
|
||||
|
||||
public void TestReplaceAndDelete() {
|
||||
UnicodeSetSpanner m;
|
||||
|
||||
m = new UnicodeSetSpanner(new UnicodeSet("[._]"));
|
||||
assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._"));
|
||||
assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED));
|
||||
|
||||
assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._"));
|
||||
assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING));
|
||||
assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING));
|
||||
|
||||
assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", Quantifier.SPAN));
|
||||
assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", Quantifier.SPAN));
|
||||
assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY"));
|
||||
assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", Quantifier.SPAN));
|
||||
|
||||
m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}"));
|
||||
assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));
|
||||
|
||||
m = new UnicodeSetSpanner(m.getUnicodeSet().addAll(new UnicodeSet("\\p{lowercase}")));
|
||||
assertEquals("", "TheQuickBrownFox", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));
|
||||
|
||||
m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]"));
|
||||
assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X"));
|
||||
assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", Quantifier.SPAN));
|
||||
}
|
||||
|
||||
public void TestCodePoints() {
|
||||
// test supplementary code points and string clusters
|
||||
checkCodePoints("x\u0308", "z\u0308", Quantifier.MIN_ELEMENTS, null, 1);
|
||||
checkCodePoints("𣿡", "𣿢", Quantifier.MIN_ELEMENTS, null, 1);
|
||||
checkCodePoints("👦", "👧", Quantifier.MIN_ELEMENTS, null, 1);
|
||||
}
|
||||
|
||||
private void checkCodePoints(String a, String b, Quantifier quantifier, String expectedReplaced, int expectedCount) {
|
||||
final String ab = a+b;
|
||||
UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
|
||||
expectedCount,
|
||||
m.countIn(ab, quantifier));
|
||||
|
||||
if (expectedReplaced == null) {
|
||||
expectedReplaced = "-" + b;
|
||||
}
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")",
|
||||
expectedReplaced, m.replaceFrom(ab, "-", quantifier));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue