ICU-7015 Compact collation syntax (lists and ranges): svn merge -r 27727:28181 icu4j/branches/umesh/collation

X-SVN-Rev: 28192
This commit is contained in:
Umesh Nair 2010-06-14 23:39:34 +00:00
parent 59ac0e22f9
commit 3e24713df1
2 changed files with 889 additions and 468 deletions

View file

@ -37,18 +37,26 @@ final class CollationRuleParser
*/
CollationRuleParser(String rules) throws ParseException
{
// Prepares m_copySet_ and m_removeSet_.
extractSetsFromRules(rules);
// Save the rules as a long string. The StringBuilder object is
// used to store the result of token parsing as well.
m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
m_rules_ = m_source_.toString();
// Index of the next unparsed character.
m_current_ = 0;
// Index of the next unwritten character in the parsed result.
m_extraCurrent_ = m_source_.length();
m_variableTop_ = null;
m_parsedToken_ = new ParsedToken();
m_hashTable_ = new HashMap<Token, Token>();
m_options_ = new OptionSet(RuleBasedCollator.UCA_);
m_listHeader_ = new TokenListHeader[512];
m_resultLength_ = 0;
m_prevStrength_ = TOKEN_UNSET_;
// call assembleTokenList() manually, so that we can
// init a parser and manually parse tokens
//assembleTokenList();
@ -411,11 +419,6 @@ final class CollationRuleParser
* the UCA.
*/
UnicodeSet m_removeSet_;
/**
* Stores the previous token's strength when making a list of same level
* differences.
*/
private int m_prevStrength_;
/*
* This is space for the extra strings that need to be unquoted during the
@ -675,6 +678,24 @@ final class CollationRuleParser
= RuleBasedCollator.UCA_.getCollationElementIterator("");
private int m_utilCEBuffer_[] = new int[2];
private boolean m_isStarred_;
private int m_currentStarredCharIndex_;
private int m_lastStarredCharIndex_;
private int m_currentRangeCp_;
private int m_lastRangeCp_;
private boolean m_inRange_;
private int m_previousCp_;
private boolean m_savedIsStarred_;
// private methods -------------------------------------------------------
/**
@ -688,7 +709,9 @@ final class CollationRuleParser
int sourcelimit = m_source_.length();
int expandNext = 0;
while (m_current_ < sourcelimit) {
m_isStarred_ = false;
while (m_current_ < sourcelimit || m_isStarred_) {
m_parsedToken_.m_prefixOffset_ = 0;
if (parseNextToken(lastToken == null) < 0) {
// we have reached the end
@ -734,6 +757,8 @@ final class CollationRuleParser
if (sourceToken.m_strength_ != TOKEN_RESET_
&& lastToken != sourceToken) {
// otherwise remove sourceToken from where it was.
// Take care of the next node
if (sourceToken.m_next_ != null) {
if (sourceToken.m_next_.m_strength_
> sourceToken.m_strength_) {
@ -744,14 +769,20 @@ final class CollationRuleParser
= sourceToken.m_previous_;
}
else {
// sourcetoken is the last token.
// Redefine the tail token.
sourceToken.m_listHeader_.m_last_
= sourceToken.m_previous_;
}
// Take care of the previous node.
if (sourceToken.m_previous_ != null) {
sourceToken.m_previous_.m_next_
= sourceToken.m_next_;
}
else {
// sourcetoken is the first token.
// Redefine the head node.
sourceToken.m_listHeader_.m_first_
= sourceToken.m_next_;
}
@ -1148,18 +1179,150 @@ final class CollationRuleParser
}
/**
* Getting the next token
* Parses the next token.
*
* It updates/accesses the following member variables:
* m_current_: Index to the next unparsed character (not code point)
* in the character array (a StringBuilder object) m_source_.
* m_parsedToken_: The parsed token. The following of the token are updated.
* .m_strength: The strength of the token.
* .m_charsOffset, m_charsLen_: Index to the first character (after operators),
* and number of characters in the token.
* This may be in the main string, or in the appended string.
* .m_extensionOffset_, .m_extensionLen_:
* .m_flags:
* .m_prefixOffset, .m_prefixLen: Used when "|" is used to specify "context before".
* .m_indirectIndex:
* @param startofrules
* flag indicating if we are at the start of rules
* @return the offset of the rules
* @return the offset of the next unparsed char
* @exception ParseException
* thrown when rule parsing fails
*/
@SuppressWarnings("fallthrough")
private int parseNextToken(boolean startofrules) throws ParseException
{
// parsing part
if (m_inRange_) {
// We are not done processing a range. Continue it.
return processNextCodePointInRange();
} else if (m_isStarred_) {
// We are not done processing a starred token. Continue it.
return processNextTokenInTheStarredList();
}
// Get the next token.
int nextOffset = parseNextTokenInternal(startofrules);
// If the next token is starred and/or in range, we need to handle it here.
if (m_inRange_) {
// A new range has started.
// Check whether it is a chain of ranges with more than one hyphen.
if (m_lastRangeCp_ > 0 && m_lastRangeCp_ == m_previousCp_) {
throw new ParseException("Chained range syntax", m_current_);
}
// The current token is the first character of the second code point of the range.
// Process just that, and then proceed with the star.
m_lastRangeCp_ = m_source_.codePointAt(this.m_parsedToken_.m_charsOffset_);
if (m_lastRangeCp_ <= m_previousCp_) {
throw new ParseException("Invalid range", m_current_);
}
// Set current range code point to process the range loop
m_currentRangeCp_ = m_previousCp_ + 1;
// Set current starred char index to continue processing the starred
// expression after the range is done.
m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_
+ Character.charCount(m_lastRangeCp_);
m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;
return processNextCodePointInRange();
} else if (m_isStarred_) {
// We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
// [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
// separated into several tokens and returned.
m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_;
m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;
return processNextTokenInTheStarredList();
}
return nextOffset;
}
private int processNextCodePointInRange() throws ParseException {
int nChars = Character.charCount(m_currentRangeCp_);
m_source_.appendCodePoint(m_currentRangeCp_);
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
m_parsedToken_.m_charsLen_ = nChars;
m_extraCurrent_ += nChars;
++m_currentRangeCp_;
if (m_currentRangeCp_ > m_lastRangeCp_) {
// All the code points in the range are processed.
// Turn the range flag off.
m_inRange_ = false;
// If there is a starred portion remaining in the current
// parsed token, resume the starred operation.
if (m_currentStarredCharIndex_ <= m_lastStarredCharIndex_) {
m_isStarred_ = true;
} else {
m_isStarred_ = false;
}
} else {
m_previousCp_ = m_currentRangeCp_;
}
return m_current_;
}
/**
* Extracts the next token from the starred token from
* m_currentStarredCharIndex_ and returns it.
* @return the offset of the next unparsed char
* @throws ParseException
*/
private int processNextTokenInTheStarredList() throws ParseException {
// Extract the characters corresponding to the next code point.
int cp = m_source_.codePointAt(m_currentStarredCharIndex_);
int nChars = Character.charCount(cp);
m_parsedToken_.m_charsLen_ = nChars;
m_parsedToken_.m_charsOffset_ = m_currentStarredCharIndex_;
m_currentStarredCharIndex_ += nChars;
// When we are done parsing the starred string, turn the flag off so that
// the normal processing is restored.
if (m_currentStarredCharIndex_ > m_lastStarredCharIndex_) {
m_isStarred_ = false;
}
m_previousCp_ = cp;
return m_current_;
}
private int resetToTop(boolean top, boolean variableTop,
int extensionOffset, int newExtensionLen,
byte byteBefore) throws ParseException {
m_parsedToken_.m_indirectIndex_ = 5;
top = doSetTop();
return doEndParseNextToken(TOKEN_RESET_,
top,
extensionOffset,
newExtensionLen,
variableTop, byteBefore);
}
/**
* Gets the next token and sets the necessary internal variables.
* This function parses a starred string as a single token, which will be separated
* in the calling function.
* @param startofrules Boolean value indicating whether this is the first rule
* @return the offset of the next unparsed char
* @throws ParseException
*/
private int parseNextTokenInternal(boolean startofrules) throws ParseException {
boolean variabletop = false;
boolean top = false;
boolean inchars = true;
@ -1171,11 +1334,7 @@ final class CollationRuleParser
int /*charsoffset = 0,*/ extensionoffset = 0;
int newstrength = TOKEN_UNSET_;
m_parsedToken_.m_charsLen_ = 0;
m_parsedToken_.m_charsOffset_ = 0;
m_parsedToken_.m_prefixOffset_ = 0;
m_parsedToken_.m_prefixLen_ = 0;
m_parsedToken_.m_indirectIndex_ = 0;
initializeParsedToken();
int limit = m_rules_.length();
while (m_current_ < limit) {
@ -1186,10 +1345,10 @@ final class CollationRuleParser
}
else {
if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
if (m_parsedToken_.m_charsLen_ == 0) {
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
}
m_parsedToken_.m_charsLen_ ++;
if (m_parsedToken_.m_charsLen_ == 0) {
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
}
m_parsedToken_.m_charsLen_ ++;
}
else {
if (newextensionlen == 0) {
@ -1233,20 +1392,13 @@ final class CollationRuleParser
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
m_parsedToken_.m_indirectIndex_ = 5;
top = doSetTop();
return doEndParseNextToken(TOKEN_RESET_,
top,
extensionoffset,
newextensionlen,
variabletop, before);
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.IDENTICAL;
if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
m_current_++;
m_prevStrength_ = newstrength;
}else{
m_prevStrength_ = TOKEN_UNSET_;
m_isStarred_ = true;
}
break;
case 0x002C : // ','
@ -1259,16 +1411,10 @@ final class CollationRuleParser
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
m_parsedToken_.m_indirectIndex_ = 5;
top = doSetTop();
return doEndParseNextToken(TOKEN_RESET_,
top,
extensionoffset,
newextensionlen,
variabletop, before);
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.TERTIARY;
m_prevStrength_ = TOKEN_UNSET_;
break;
case 0x003B : // ';'
if (newstrength != TOKEN_UNSET_) {
@ -1278,18 +1424,12 @@ final class CollationRuleParser
newextensionlen,
variabletop, before);
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
m_parsedToken_.m_indirectIndex_ = 5;
top = doSetTop();
return doEndParseNextToken(TOKEN_RESET_,
top,
extensionoffset,
newextensionlen,
variabletop, before);
//if we start with strength, we'll reset to top
if(startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.SECONDARY;
m_prevStrength_ = TOKEN_UNSET_;
break;
case 0x003C : // '<'
if (newstrength != TOKEN_UNSET_) {
@ -1299,15 +1439,10 @@ final class CollationRuleParser
newextensionlen,
variabletop, before);
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
m_parsedToken_.m_indirectIndex_ = 5;
top = doSetTop();
return doEndParseNextToken(TOKEN_RESET_,
top,
extensionoffset,
newextensionlen,
variabletop, before);
// if we start with strength, we'll reset to top
if (startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
// before this, do a scan to verify whether this is
// another strength
@ -1324,14 +1459,12 @@ final class CollationRuleParser
else { // just one
newstrength = Collator.PRIMARY;
}
if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
m_current_++;
m_prevStrength_ = newstrength;
}else{
m_prevStrength_ = TOKEN_UNSET_;
m_isStarred_ = true;
}
break;
case 0x0026 : // '&'
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
@ -1341,7 +1474,6 @@ final class CollationRuleParser
variabletop, before);
}
newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
m_prevStrength_ = TOKEN_UNSET_;
break;
case 0x005b : // '['
// options - read an option, analyze it
@ -1413,12 +1545,8 @@ final class CollationRuleParser
// found a quote, we're gonna start copying
case 0x0027 : //'\''
if (newstrength == TOKEN_UNSET_) {
if (m_prevStrength_ == TOKEN_UNSET_) {
// quote is illegal until we have a strength
throwParseException(m_rules_, m_current_);
}else{
newstrength = m_prevStrength_;
}
// quote is illegal until we have a strength
throwParseException(m_rules_, m_current_);
}
inquote = true;
if (inchars) { // we're doing characters
@ -1426,6 +1554,9 @@ final class CollationRuleParser
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
}
if (m_parsedToken_.m_charsLen_ != 0) {
// We are processing characters in quote together.
// Copy whatever is in the current token, so that
// the unquoted string can be appended to that.
m_source_.append(m_source_.substring(
m_current_ - m_parsedToken_.m_charsLen_,
m_current_));
@ -1495,6 +1626,26 @@ final class CollationRuleParser
// skip whitespace between '|' and the character
} while (UCharacterProperty.isRuleWhiteSpace(ch));
break;
case 0x002D : // '-', indicates a range.
if (newstrength != TOKEN_UNSET_) {
m_savedIsStarred_ = m_isStarred_;
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
m_isStarred_ = m_savedIsStarred_;
// Ranges are valid only in starred tokens.
if (!m_isStarred_) {
throwParseException(m_rules_, m_current_);
}
newstrength = m_parsedToken_.m_strength_;
m_inRange_ = true;
break;
case 0x0023: // '#' // this is a comment, skip everything through the end of line
do {
m_current_ ++;
@ -1505,14 +1656,10 @@ final class CollationRuleParser
break;
default :
if (newstrength == TOKEN_UNSET_) {
if(m_prevStrength_ == TOKEN_UNSET_){
throwParseException(m_rules_, m_current_);
}else{
newstrength = m_prevStrength_;
}
throwParseException(m_rules_, m_current_);
}
if (isSpecialChar(ch) && (inquote == false)) {
throwParseException(m_rules_, m_current_);
throwParseException(m_rules_, m_current_);
}
if (ch == 0x0000 && m_current_ + 1 == limit) {
break;
@ -1522,16 +1669,6 @@ final class CollationRuleParser
m_parsedToken_.m_charsOffset_ = m_current_;
}
m_parsedToken_.m_charsLen_++;
if(m_prevStrength_ != TOKEN_UNSET_){
char[] fullchar = Character.toChars(Character.codePointAt(m_source_, m_current_));
m_current_ += fullchar.length;
m_parsedToken_.m_charsLen_ += fullchar.length - 1;
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
}
else {
if (newextensionlen == 0) {
@ -1556,6 +1693,18 @@ final class CollationRuleParser
variabletop, before);
}
/**
*
*/
private void initializeParsedToken() {
m_parsedToken_.m_charsLen_ = 0;
m_parsedToken_.m_charsOffset_ = 0;
m_parsedToken_.m_prefixOffset_ = 0;
m_parsedToken_.m_prefixLen_ = 0;
m_parsedToken_.m_indirectIndex_ = 0;
}
/**
* End the next parse token
* @param newstrength new strength
@ -1941,7 +2090,7 @@ final class CollationRuleParser
else if (i == 7) { // variable top
return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
}
else if (i == 8) { // rearange
else if (i == 8) { // rearrange
return TOKEN_SUCCESS_MASK_;
}
else if (i == 9) { // before
@ -2086,7 +2235,7 @@ final class CollationRuleParser
int setStart = 0;
int i = 0;
while(i < rules.length()) {
if(rules.charAt(i) == 0x005B) {
if(rules.charAt(i) == 0x005B) { // [
optionNumber = readOption(rules, i+1, rules.length());
setStart = m_optionarg_;
if(optionNumber == 13) { /* copy - parts of UCA to tailoring */