diff --git a/icu4c/source/i18n/rbt_data.cpp b/icu4c/source/i18n/rbt_data.cpp index d1a9eab3aae..9abf69a0e54 100644 --- a/icu4c/source/i18n/rbt_data.cpp +++ b/icu4c/source/i18n/rbt_data.cpp @@ -43,6 +43,14 @@ TransliterationRuleData::defineVariable(const UnicodeString& name, UChar standIn, UnicodeSet* adoptedSet, UErrorCode& status) { + defineVariable(name, standIn, status); + defineSet(standIn, adoptedSet, status); +} + +void +TransliterationRuleData::defineSet(UChar standIn, + UnicodeSet* adoptedSet, + UErrorCode& status) { if (U_FAILURE(status)) { return; } @@ -50,9 +58,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name, status = U_MEMORY_ALLOCATION_ERROR; return; } - uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF, - (void*) standIn, - &status); uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF), adoptedSet, &status); diff --git a/icu4c/source/i18n/rbt_data.h b/icu4c/source/i18n/rbt_data.h index 0ab7b18ec8e..7ba43fcb3e1 100644 --- a/icu4c/source/i18n/rbt_data.h +++ b/icu4c/source/i18n/rbt_data.h @@ -72,6 +72,10 @@ public: UnicodeSet* adoptedSet, UErrorCode& status); + void defineSet(UChar standIn, + UnicodeSet* adoptedSet, + UErrorCode& status); + UChar lookupVariable(const UnicodeString& name, UErrorCode& status) const; diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index 99945b15730..ef20b261822 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -13,35 +13,30 @@ #include "unirange.h" #include "rbt_data.h" #include "unicode/uniset.h" +#include "cstring.h" +#include "unicode/parsepos.h" // Operators const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '='; const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>'; const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<'; -const char* TransliterationRuleParser::OPERATORS = "=><"; +const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op +const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3); // Other special characters const UChar TransliterationRuleParser::QUOTE = '\''; -const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{'; -const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}'; -const UChar TransliterationRuleParser::CONTEXT_OPEN = '['; -const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']'; -const UChar TransliterationRuleParser::CURSOR_POS = '|'; +const UChar TransliterationRuleParser::ESCAPE = '\\'; +const UChar TransliterationRuleParser::END_OF_RULE = ';'; const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#'; +const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{'; +const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}'; +const UChar TransliterationRuleParser::CONTEXT_OPEN = '('; +const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')'; +const UChar TransliterationRuleParser::SET_OPEN = '['; +const UChar TransliterationRuleParser::SET_CLOSE = ']'; +const UChar TransliterationRuleParser::CURSOR_POS = '|'; -/** - * Specials must be quoted in rules to be used as literals. - * Specials may not occur in variable names. - * - * This string is a superset of OPERATORS. - */ -const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><"; - -/** - * Specials that must be quoted in variable definitions. - */ -const char* TransliterationRuleParser::DEF_SPECIALS = "'{}"; TransliterationRuleData* TransliterationRuleParser::parse(const UnicodeString& rules, @@ -84,465 +79,339 @@ void TransliterationRuleParser::parseRules(void) { determineVariableRange(); - int32_t n = rules.length(); - int32_t i = 0; - while (i0 && rules.charAt(limit-1) == '\\') { - limit = rules.indexOf('\n', limit+1); + int32_t pos = 0; + int32_t limit = rules.length(); + while (pos < limit && U_SUCCESS(status)) { + UChar c = rules.charAt(pos++); + if (Unicode::isWhitespace(c)) { + // Ignore leading whitespace. Note that this is not + // Unicode spaces, but Java spaces -- a subset, + // representing whitespace likely to be seen in code. + continue; } - - if (limit == -1) { - limit = n; + // Skip lines starting with the comment character + if (c == RULE_COMMENT_CHAR) { + pos = rules.indexOf("\n", pos) + 1; + if (pos == 0) { + break; // No "\n" found; rest of rule is a commnet + } + continue; // Either fall out or restart with next line } - // Skip over empty lines and line starting with # - if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) { - applyRule(i, limit); - } - i = limit + 1; + // We've found the start of a rule. c is its first + // character, and pos points past c. Lexically parse the + // rule into component pieces. + pos = parseRule(--pos, limit); + } + + // Index the rules + if (U_SUCCESS(status)) { + data->ruleSet.freeze(*data, status); } - - data->ruleSet.freeze(); } /** - * Parse the given substring as a rule, and append it to the rules currently - * represented in this object. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @exception IllegalArgumentException if there is a syntax error in the - * rules + * MAIN PARSER. Parse the next rule in the given rule string, starting + * at pos. Return the index after the last character parsed. Do not + * parse characters at or after limit. + * + * Important: The character at pos must be a non-whitespace character + * that is not the comment character. + * + * This method handles quoting, escaping, and whitespace removal. It + * parses the end-of-rule character. It recognizes context and cursor + * indicators. Once it does a lexical breakdown of the rule at pos, it + * creates a rule object and adds it to our rule list. */ -void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) { - /* General description of parsing: Initially, rules contain two types of - * quoted characters. First, there are variable references, such as - * "{alpha}". Second, there are quotes, such as "'<'" or "''". One of - * the first steps in parsing a rule is to resolve such quoted matter. - * Quotes are removed early, leaving unquoted literal matter. Variable - * references are resolved and replaced by single characters. In some - * instances these characters represent themselves; in others, they - * stand for categories of characters. Character categories are either - * predefined (e.g., "{Lu}"), or are defined by the user using a - * statement (e.g., "vowels:aeiouAEIOU"). - * - * Another early step in parsing is to split each rule into component - * pieces. These pieces are, for every rule, a left-hand side, a right- - * hand side, and an operator. The left- and right-hand sides may not - * be empty, except for the output patterns of forward and reverse - * rules. In addition to this partitioning, the match patterns of - * forward and reverse rules must be partitioned into antecontext, - * postcontext, and literal pattern, where the context portions may or - * may not be present. Finally, output patterns must have the cursor - * indicator '|' detected and removed, with its position recorded. - * - * Quote removal, variable resolution, and sub-pattern splitting must - * all happen at once. This is due chiefly to the quoting mechanism, - * which allows special characters to appear at arbitrary positions in - * the final unquoted text. (For this reason, alteration of the rule - * language is somewhat clumsy; it entails reassessment and revision of - * the parsing methods as a whole.) - * - * After this processing of rules is complete, the final end products - * are unquoted pieces of text of various types, and an integer cursor - * position, if one is specified. These processed raw materials are now - * easy to deal with; other classes such as UnicodeSet and - * TransliterationRule need know nothing of quoting or variables. - */ +int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { + // Locate the left side, operator, and right side + int32_t start = pos; + UChar op = 0; + + UnicodeString buf; + int32_t cursor = -1; // position of cursor in buf + int32_t ante = -1; // position of ante context marker ')' in buf + int32_t post = -1; // position of post context marker '(' in buf + int32_t postClose = -1; // position of post context close ')' in buf + + // Assigned to buf and its adjuncts after the LHS has been + // parsed. Thereafter, buf etc. refer to the RHS. UnicodeString left; - UnicodeString right; - UnicodeString anteContext; - UnicodeString postContext; - int32_t cursorPos; + int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1; - UChar op = parseRule(start, limit, left, right, - anteContext, postContext, cursorPos); + UnicodeString scratch; - if (U_FAILURE(status)) { - return; + while (pos < limit) { + UChar c = rules.charAt(pos++); + if (Unicode::isWhitespace(c)) { + // Ignore whitespace. Note that this is not Unicode + // spaces, but Java spaces -- a subset, representing + // whitespace likely to be seen in code. + continue; + } + // Handle escapes + if (c == ESCAPE) { + if (pos == limit) { + return syntaxError("Trailing backslash", rules, start); + } + // Parse \uXXXX escapes + c = rules.charAt(pos++); + if (c == 'u') { + if ((pos+4) > limit) { + return syntaxError("Malformed Unicode escape", rules, start); + } + c = (UChar)0x0000; + for (int32_t plim=pos+4; pos= 0) { + if (op != 0) { + return syntaxError("Unquoted special", rules, start); + } + // Found an operator char. Check for forward-reverse operator. + if (c == REVERSE_RULE_OP && + (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) { + ++pos; + op = FWDREV_RULE_OP; + } else { + op = c; + } + left = buf; // lhs + leftCursor = cursor; + leftAnte = ante; + leftPost = post; + leftPostClose = postClose; + + buf.truncate(0); + cursor = ante = post = postClose = -1; + continue; + } + if (c == END_OF_RULE) { + break; + } + switch (c) { + case VARIABLE_REF_OPEN: + { + int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos); + if (pos == j || j < 0) { // empty or unterminated + return syntaxError("Malformed variable reference", rules, start); + } + scratch.truncate(0); + rules.extractBetween(pos, j, scratch); + pos = j+1; + UChar v = data->lookupVariable(scratch, status); + if (U_FAILURE(status)) { + return syntaxError("Undefined variable", rules, start); + } + buf.append(v); + } + break; + case CONTEXT_OPEN: + if (post >= 0) { + return syntaxError("Multiple post contexts", rules, start); + } + // Ignore CONTEXT_OPEN if buffer length is zero -- that means + // this is the optional opening delimiter for the ante context. + if (buf.length() > 0) { + post = buf.length(); + } + break; + case CONTEXT_CLOSE: + if (postClose >= 0) { + return syntaxError("Unexpected ')'", rules, start); + } + if (post >= 0) { + // This is probably the optional closing delimiter + // for the post context; save the pos and check later. + postClose = buf.length(); + } else if (ante >= 0) { + return syntaxError("Multiple ante contexts", rules, start); + } else { + ante = buf.length(); + } + break; + case SET_OPEN: { + ParsePosition pp(pos-1); // Backup to opening '[' + buf.append(registerSet(new UnicodeSet(rules, pp, data, status))); + if (U_FAILURE(status)) { + return syntaxError("Invalid set", rules, start); + } + pos = pp.getIndex(); } + break; + case VARIABLE_REF_CLOSE: + case SET_CLOSE: + return syntaxError("Unquoted special", rules, start); + case CURSOR_POS: + if (cursor >= 0) { + return syntaxError("Multiple cursors", rules, start); + } + cursor = buf.length(); + break; + default: + buf.append(c); + break; + } } + if (op == 0) { + return syntaxError("No operator", rules, start); + } + + // Check context close parameters + if ((leftPostClose >= 0 && leftPostClose != left.length()) || + (postClose >= 0 && postClose != buf.length())) { + return syntaxError("Extra text after ]", rules, start); + } + + // Context is only allowed on the input side; that is, the left side + // for forward rules. Cursors are only allowed on the output side; + // that is, the right side for forward rules. Bidirectional rules + // ignore elements that do not apply. switch (op) { case VARIABLE_DEF_OP: - applyVariableDef(left, right); + // LHS is the name. RHS is a single character, either a literal + // or a set (already parsed). If RHS is longer than one + // character, it is either a multi-character string, or multiple + // sets, or a mixture of chars and sets -- syntax error. + if (buf.length() != 1) { + return syntaxError("Malformed RHS", rules, start); + } + if (data->isVariableDefined(left)) { + return syntaxError("Duplicate definition", rules, start); + } + data->defineVariable(left, buf.charAt(0), status); break; + case FORWARD_RULE_OP: if (direction == RuleBasedTransliterator::FORWARD) { + if (ante >= 0 || post >= 0 || leftCursor >= 0) { + return syntaxError("Malformed rule", rules, start); + } data->ruleSet.addRule(new TransliterationRule( - left, right, - anteContext, postContext, - cursorPos, status), - status); + left, leftAnte, leftPost, + buf, cursor, status), status); } // otherwise ignore the rule; it's not the direction we want break; + case REVERSE_RULE_OP: if (direction == RuleBasedTransliterator::REVERSE) { + if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { + return syntaxError("Malformed rule", rules, start); + } data->ruleSet.addRule(new TransliterationRule( - right, left, - anteContext, postContext, - cursorPos, status), - status); + buf, ante, post, + left, leftCursor, status), status); } // otherwise ignore the rule; it's not the direction we want break; - } -} -/** - * Add a variable definition. - * @param name the name of the variable. It must not already be defined. - * @param pattern the value of the variable. It may be a single character - * or a pattern describing a character set. - * @exception IllegalArgumentException if there is a syntax error - */ -void TransliterationRuleParser::applyVariableDef(const UnicodeString& name, - const UnicodeString& pattern) { - validateVariableName(name); - - if (U_FAILURE(status)) { - return; - } - - if (data->isVariableDefined(name)) { - // throw new IllegalArgumentException("Duplicate variable definition: " - // + name + '=' + pattern); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } -//! if (UnicodeSet.getCategoryID(name) >= 0) { -//! throw new IllegalArgumentException("Reserved variable name: " -//! + name); -//! } - if (pattern.length() < 1) { - // throw new IllegalArgumentException("Variable definition missing: " - // + name); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - if (pattern.length() == 1) { - // Got a single character variable definition - //$ data->variableNames.put(name, new Character(pattern.charAt(0))); - data->defineVariable(name, pattern.charAt(0), status); - } else { - // Got more than one character; parse it as a category - if (variableNext >= variableLimit) { - //$ throw new RuntimeException("Private use variables exhausted"); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; + case FWDREV_RULE_OP: + if (direction == RuleBasedTransliterator::FORWARD) { + // The output side is the right; trim off any context + if (post >= 0) { + buf.remove(post); + } + if (ante >= 0) { + buf.removeBetween(0, ante); + } + data->ruleSet.addRule(new TransliterationRule( + left, leftAnte, leftPost, + buf, cursor, status), status); + } else { + // The output side is the left; trim off any context + if (leftPost >= 0) { + left.remove(leftPost); + } + if (leftAnte >= 0) { + left.removeBetween(0, leftAnte); + } + data->ruleSet.addRule(new TransliterationRule( + buf, ante, post, + left, leftCursor, status), status); } - //$ Character c = new Character(variableNext++); - //$ data->variableNames.put(name, c); - //$ data->setVariables.put(c, new UnicodeSet(pattern)); - data->defineVariable(name, variableNext++, - new UnicodeSet(pattern, status), - status); + break; } + + return pos; } /** - * Given a rule, parses it into three pieces: The left side, the right side, - * and the operator. Returns the operator. Quotes and variable references - * are resolved; the otuput text in all StringBuffer parameters - * is literal text. This method delegates to other parsing methods to - * handle the match pattern, output pattern, and other sub-patterns in the - * rule. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param left left side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param right right side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param anteContext the preceding context of the match pattern, - * if there is one, is appended to this buffer - * @param postContext the following context of the match pattern, - * if there is one, is appended to this buffer - * @param cursorPos if there is a cursor in the output pattern, its - * offset is stored in cursorPos, otherwise set to -1. - * @return The operator character, one of the characters in OPERATORS. + * Called by main parser upon syntax error. Search the rule string + * for the probable end of the rule. Of course, if the error is that + * the end of rule marker is missing, then the rule end will not be found. + * In any case the rule start will be correctly reported. + * @param msg error description + * @param rule pattern string + * @param start position of first character of current rule */ -UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit, - UnicodeString& left, - UnicodeString& right, - UnicodeString& anteContext, - UnicodeString& postContext, - int32_t& cursorPos) { - /* Parse the rule into three pieces -- left, operator, and right, - * parsing out quotes. The result is that left and right will have - * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted - * operators throw an exception. Two quotes inside or outside - * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". - */ - int32_t i = quotedIndexOf(rules, start, limit, OPERATORS); - if (i < 0) { - //$ throw new IllegalArgumentException( - //$ "Syntax error: " - //$ + rules.substring(start, limit)); +int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/, + const UnicodeString& /*rule*/, + int32_t start) { +//| int end = quotedIndexOf(rule, start, rule.length(), ";"); +//| if (end < 0) { +//| end = rule.length(); +//| } +//| throw new IllegalArgumentException(msg + " in " + +//| rule.substring(start, end)); + status = U_ILLEGAL_ARGUMENT_ERROR; + return start; +} + +/** + * Allocate a private-use substitution character for the given set, + * register it in the setVariables hash, and return the substitution + * character. + */ +UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) { + if (variableNext >= variableLimit) { + // throw new RuntimeException("Private use variables exhausted"); status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - cursorPos = -1; - UChar c = rules.charAt(i); - switch (c) { - case FORWARD_RULE_OP: - if (i == start) { - //$ throw new IllegalArgumentException( - //$ "Empty left side: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - parseMatchPattern(start, i, left, anteContext, postContext); - if (i != (limit-1)) { - parseOutputPattern(i+1, limit, right, cursorPos); - } - break; - case REVERSE_RULE_OP: - if (i == (limit-1)) { - //$ throw new IllegalArgumentException( - //$ "Empty right side: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - if (i != start) { - parseOutputPattern(start, i, left, cursorPos); - } - parseMatchPattern(i+1, limit, right, anteContext, postContext); - break; - default: - if (i == start || i == (limit-1)) { - //$ throw new IllegalArgumentException( - //$ "Empty left or right side: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - parseSubPattern(start, i, left); - parseDefPattern(i+1, limit, right); - break; - } + UChar c = variableNext++; + data->defineSet(c, adoptedSet, status); return c; } -/** - * Parses the match pattern of a forward or reverse rule. Given the raw - * match pattern, return the match text and the context on both sides, if - * any. Resolves all quotes and variables. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the key to be matched will be appended to this buffer - * @param anteContext the preceding context, if any, will be appended - * to this buffer. - * @param postContext the following context, if any, will be appended - * to this buffer. - */ -void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit, - UnicodeString& text, - UnicodeString& anteContext, - UnicodeString& postContext) { - if (start >= limit) { - //$ throw new IllegalArgumentException( - //$ "Empty expression in rule: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - //$ if (anteContext != 0) { - // Ignore optional opening and closing context characters - if (rules.charAt(start) == CONTEXT_OPEN) { - ++start; - } - if (rules.charAt(limit-1) == CONTEXT_CLOSE) { - --limit; - } - // The four possibilities are: - // key - // anteContext]key - // anteContext]key[postContext - // key[postContext - int32_t ante = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE); - int32_t post = quotedIndexOf(rules, start, limit, CONTEXT_OPEN); - if (ante >= 0 && post >= 0 && ante > post) { - //$ throw new IllegalArgumentException( - //$ "Syntax error in context specifier: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - if (ante >= 0) { - parseSubPattern(start, ante, anteContext); - start = ante+1; - } - if (post >= 0) { - parseSubPattern(post+1, limit, postContext); - limit = post; - } - //$ } - parseSubPattern(start, limit, text); -} - -void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit, - UnicodeString& text) { - parseSubPattern(start, limit, text, 0, SPECIALS); -} - -/** - * Parse a variable definition sub pattern. This kind of sub - * pattern differs in the set of characters that are considered - * special. In particular, the '[' and ']' characters are not - * special, since these are used in UnicodeSet patterns. - */ -void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit, - UnicodeString& text) { - parseSubPattern(start, limit, text, 0, DEF_SPECIALS); -} - -/** - * Parses the output pattern of a forward or reverse rule. Given the - * output pattern, return the output text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - */ -void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit, - UnicodeString& text, - int32_t& cursorPos) { - parseSubPattern(start, limit, text, &cursorPos, SPECIALS); -} - -/** - * Parses a sub-pattern of a rule. Return the text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - * @param specials characters that must be quoted; typically either - * SPECIALS or DEF_SPECIALS. - */ -void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit, - UnicodeString& text, - int32_t* cursorPos, - const UnicodeString& specials) { - bool_t inQuote = FALSE; - - if (start >= limit) { - //$ throw new IllegalArgumentException("Empty expression in rule"); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - if (cursorPos != 0) { - *cursorPos = -1; - } - for (int32_t i=start; ilookupVariable(name, status); - if (U_FAILURE(status)) { - return; - } - text.append(ch); - i = j; - } else if (c == CURSOR_POS && cursorPos != 0) { - if (*cursorPos >= 0) { - //$ throw new IllegalArgumentException("Multiple cursors: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - *cursorPos = text.length(); - } else if (specials.indexOf(c) >= 0) { - //$ throw new IllegalArgumentException("Unquoted special character: " - //$ + rules.substring(start, limit)); - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } else { - text.append(c); - } - } -} - -void TransliterationRuleParser::validateVariableName(const UnicodeString& name) { - if (indexOf(name, SPECIALS) >= 0) { - //throw new IllegalArgumentException( - // "Special character in variable name: " - // + name); - status = U_ILLEGAL_ARGUMENT_ERROR; - } -} - -/** - * Returns the single character value of the given variable name. Defined - * names are recognized. - * - * NO LONGER SUPPORTED: - * If a Unicode category name is given, a standard character variable - * in the range firstCategoryVariable to lastCategoryVariable is returned, - * with value firstCategoryVariable + n, where n is the category - * number. - * @exception IllegalArgumentException if the name is unknown. - */ -//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) { -//$ UChar ch = data->lookupVariable(name, status); -//$ //! if (ch == null) { -//$ //! int id = UnicodeSet.getCategoryID(name); -//$ //! if (id >= 0) { -//$ //! ch = new Character((char) (firstCategoryVariable + id)); -//$ //! data->variableNames.put(name, ch); -//$ //! data->setVariables.put(ch, new UnicodeSet(id)); -//$ //! } -//$ //! } -//$ if (ch == 0) { -//$ throw new IllegalArgumentException("Undefined variable: " -//$ + name); -//$ } -//$ return ch; -//$ } - /** * Determines what part of the private use region of Unicode we can use for * variable stand-ins. The correct way to do this is as follows: Parse each @@ -599,43 +468,3 @@ int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text, } return -1; } - -/** - * Returns the index of the first character in a set. Unlike - * String.indexOf(), this method searches not for a single character, but - * for any character of the string setOfChars. - * @param text text to be searched - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param setOfChars string with one or more distinct characters - * @return Offset of the first character in setOfChars - * found, or -1 if not found. - * @see #quotedIndexOf - */ -int32_t TransliterationRuleParser::indexOf(const UnicodeString& text, - int32_t start, int32_t limit, - const UnicodeString& setOfChars) { - for (int32_t i=start; i= 0) { - return i; - } - } - return -1; -} - -/** - * Returns the index of the first character in a set. Unlike - * String.indexOf(), this method searches not for a single character, but - * for any character of the string setOfChars. - * @param text text to be searched - * @param setOfChars string with one or more distinct characters - * @return Offset of the first character in setOfChars - * found, or -1 if not found. - * @see #quotedIndexOf - */ -int32_t TransliterationRuleParser::indexOf(const UnicodeString& text, - const UnicodeString& setOfChars) { - return indexOf(text, 0, text.length(), setOfChars); -} diff --git a/icu4c/source/i18n/rbt_pars.h b/icu4c/source/i18n/rbt_pars.h index 33e3b713f34..be6ad0b0ddf 100644 --- a/icu4c/source/i18n/rbt_pars.h +++ b/icu4c/source/i18n/rbt_pars.h @@ -11,6 +11,7 @@ #include "unicode/rbt.h" class TransliterationRuleData; +class UnicodeSet; class TransliterationRuleParser { @@ -49,29 +50,21 @@ class TransliterationRuleParser { static const UChar VARIABLE_DEF_OP; static const UChar FORWARD_RULE_OP; static const UChar REVERSE_RULE_OP; - static const char* OPERATORS; - + static const UChar FWDREV_RULE_OP; // internal rep of <> op + static const UnicodeString OPERATORS; // Other special characters static const UChar QUOTE; + static const UChar ESCAPE; + static const UChar END_OF_RULE; + static const UChar RULE_COMMENT_CHAR; static const UChar VARIABLE_REF_OPEN; static const UChar VARIABLE_REF_CLOSE; static const UChar CONTEXT_OPEN; static const UChar CONTEXT_CLOSE; + static const UChar SET_OPEN; + static const UChar SET_CLOSE; static const UChar CURSOR_POS; - static const UChar RULE_COMMENT_CHAR; - - - /** - * Specials must be quoted in rules to be used as literals. - * Specials may not occur in variable names. - */ - static const char* SPECIALS; - - /** - * Specials that must be quoted in variable definitions. - */ - static const char* DEF_SPECIALS; public: @@ -100,140 +93,38 @@ private: void parseRules(void); /** - * Parse the given substring as a rule, and append it to the rules currently - * represented in this object. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @exception IllegalArgumentException if there is a syntax error in the - * rules - */ - void applyRule(int32_t start, int32_t limit); - - /** - * Add a variable definition. - * @param name the name of the variable. It must not already be defined. - * @param pattern the value of the variable. It may be a single character - * or a pattern describing a character set. - * @exception IllegalArgumentException if there is a syntax error - */ - void applyVariableDef(const UnicodeString& name, - const UnicodeString& pattern); - - /** - * Given a rule, parses it into three pieces: The left side, the right side, - * and the operator. Returns the operator. Quotes and variable references - * are resolved; the otuput text in all StringBuffer parameters - * is literal text. This method delegates to other parsing methods to - * handle the match pattern, output pattern, and other sub-patterns in the - * rule. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param left left side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param right right side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param anteContext the preceding context of the match pattern, - * if there is one, is appended to this buffer - * @param postContext the following context of the match pattern, - * if there is one, is appended to this buffer - * @param cursorPos if there is a cursor in the output pattern, its - * offset is stored in cursorPos[0] - * @return The operator character, one of the characters in OPERATORS. - */ - UChar parseRule(int32_t start, int32_t limit, - UnicodeString& left, UnicodeString& right, - UnicodeString& anteContext, - UnicodeString& postContext, - int32_t& cursorPos); - - /** - * Parses the match pattern of a forward or reverse rule. Given the raw - * match pattern, return the match text and the context on both sides, if - * any. Resolves all quotes and variables. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the key to be matched will be appended to this buffer - * @param anteContext the preceding context, if any, will be appended - * to this buffer. - * @param postContext the following context, if any, will be appended - * to this buffer. - */ - void parseMatchPattern(int32_t start, int32_t limit, - UnicodeString& text, - UnicodeString& anteContext, - UnicodeString& postContext); - - void parseSubPattern(int32_t start, int32_t limit, - UnicodeString& text); - - /** - * Parse a variable definition sub pattern. This kind of sub - * pattern differs in the set of characters that are considered - * special. In particular, the '[' and ']' characters are not - * special, since these are used in UnicodeSet patterns. - */ - void parseDefPattern(int32_t start, int32_t limit, - UnicodeString& text); - - /** - * Parses the output pattern of a forward or reverse rule. Given the - * output pattern, return the output text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - */ - void parseOutputPattern(int32_t start, int32_t limit, - UnicodeString& text, - int32_t& cursorPos); - - /** - * Parses a sub-pattern of a rule. Return the text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - * @param specials characters that must be quoted; typically either - * SPECIALS or DEF_SPECIALS. - */ - void parseSubPattern(int32_t start, int32_t limit, - UnicodeString& text, - int32_t* cursorPos, - const UnicodeString& specials); - - void validateVariableName(const UnicodeString& name); - - /** - * Returns the single character value of the given variable name. Defined - * names are recognized. + * MAIN PARSER. Parse the next rule in the given rule string, starting + * at pos. Return the index after the last character parsed. Do not + * parse characters at or after limit. * - * NO LONGER SUPPORTED: - * If a Unicode category name is given, a standard character variable - * in the range firstCategoryVariable to lastCategoryVariable is returned, - * with value firstCategoryVariable + n, where n is the category - * number. - * @exception IllegalArgumentException if the name is unknown. + * Important: The character at pos must be a non-whitespace character + * that is not the comment character. + * + * This method handles quoting, escaping, and whitespace removal. It + * parses the end-of-rule character. It recognizes context and cursor + * indicators. Once it does a lexical breakdown of the rule at pos, it + * creates a rule object and adds it to our rule list. */ - //$ Character getVariableDef(const UnicodeString& name); + int32_t parseRule(int32_t pos, int32_t limit); + /** + * Called by main parser upon syntax error. Search the rule string + * for the probable end of the rule. Of course, if the error is that + * the end of rule marker is missing, then the rule end will not be found. + * In any case the rule start will be correctly reported. + * @param msg error description + * @param rule pattern string + * @param start position of first character of current rule + */ + int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start); + + /** + * Allocate a private-use substitution character for the given set, + * register it in the setVariables hash, and return the substitution + * character. + */ + UChar registerSet(UnicodeSet* adoptedSet); + /** * Determines what part of the private use region of Unicode we can use for * variable stand-ins. The correct way to do this is as follows: Parse each @@ -263,38 +154,6 @@ private: static int32_t quotedIndexOf(const UnicodeString& text, int32_t start, int32_t limit, const UnicodeString& setOfChars); - - /** - * Returns the index of the first character in a set. Unlike - * String.indexOf(), this method searches not for a single character, but - * for any character of the string setOfChars. - * @param text text to be searched - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param setOfChars string with one or more distinct characters - * @return Offset of the first character in setOfChars - * found, or -1 if not found. - * @see #quotedIndexOf - */ - static int32_t indexOf(const UnicodeString& text, - int32_t start, int32_t limit, - const UnicodeString& setOfChars); - - /** - * Returns the index of the first character in a set. Unlike - * String.indexOf(), this method searches not for a single character, but - * for any character of the string setOfChars. - * @param text text to be searched - * @param setOfChars string with one or more distinct characters - * @return Offset of the first character in setOfChars - * found, or -1 if not found. - * @see #quotedIndexOf - */ - static int32_t indexOf(const UnicodeString& text, - const UnicodeString& setOfChars); - }; #endif diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 518b98f3d83..3e27f6962f1 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -25,6 +25,7 @@ * after the key * @param cursorPos a position for the cursor after the output * is emitted. If less than zero, then the cursor is placed after the + * output; that is, -1 is equivalent to * output.length(). If greater than * output.length() then an exception is thrown. @@ -37,55 +38,93 @@ TransliterationRule::TransliterationRule(const UnicodeString& theKey, const UnicodeString& thePostContext, int32_t theCursorPos, UErrorCode &status) : - key(theKey), output(theOutput), - anteContext(theAnteContext), - postContext(thePostContext), - cursorPos(theCursorPos), - maskKey(0) { - + output(theOutput), + cursorPos(theCursorPos) +{ if (U_FAILURE(status)) { return; } - + anteContextLength = theAnteContext.length(); + keyLength = theKey.length(); + pattern = theAnteContext; + pattern.append(theKey).append(thePostContext); if (cursorPos < 0) { cursorPos = output.length(); } if (cursorPos > output.length()) { status = U_ILLEGAL_ARGUMENT_ERROR; } - /* The mask key is needed when we are adding individual rules to a rule - * set, for performance. Here are the numbers: Without mask key, 13.0 - * seconds. With mask key, 6.2 seconds. However, once the rules have - * been added to the set, then they can be discarded to free up space. - * This is what the freeze() method does. After freeze() has been - * called, the method masks() must NOT be called. - */ - maskKey = new UnicodeString(key); - if (maskKey == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - maskKey->append(postContext); - } } -TransliterationRule::~TransliterationRule() { - delete maskKey; +/** + * Construct a new rule with the given input, output text, and other + * attributes. A cursor position may be specified for the output text. + * @param input input string, including key and optional ante and + * post context + * @param anteContextPos offset into input to end of ante context, or -1 if + * none. Must be <= input.length() if not -1. + * @param postContextPos offset into input to start of post context, or -1 + * if none. Must be <= input.length() if not -1, and must be >= + * anteContextPos. + * @param output output string + * @param cursorPos offset into output at which cursor is located, or -1 if + * none. If less than zero, then the cursor is placed after the + * output; that is, -1 is equivalent to + * output.length(). If greater than + * output.length() then an exception is thrown. + */ +TransliterationRule::TransliterationRule(const UnicodeString& input, + int32_t anteContextPos, int32_t postContextPos, + const UnicodeString& output, + int32_t cursorPos, + UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + // Do range checks only when warranted to save time + if (anteContextPos < 0) { + anteContextLength = 0; + } else { + if (anteContextPos > input.length()) { + // throw new IllegalArgumentException("Invalid ante context"); + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + anteContextLength = anteContextPos; + } + if (postContextPos < 0) { + keyLength = input.length() - anteContextLength; + } else { + if (postContextPos < anteContextLength || + postContextPos > input.length()) { + // throw new IllegalArgumentException("Invalid post context"); + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + keyLength = postContextPos - anteContextLength; + } + if (cursorPos < 0) { + this->cursorPos = output.length(); + } else { + if (cursorPos > output.length()) { + // throw new IllegalArgumentException("Invalid cursor position"); + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + this->cursorPos = cursorPos; + } + pattern = input; + this->output = output; } +TransliterationRule::~TransliterationRule() {} + /** * Return the length of the key. Equivalent to getKey().length(). * @return the length of the match key. */ int32_t TransliterationRule::getKeyLength(void) const { - return key.length(); -} - -/** - * Return the key. - * @return the match key. - */ -const UnicodeString& TransliterationRule::getKey(void) const { - return key; + return keyLength; } /** @@ -110,7 +149,45 @@ int32_t TransliterationRule::getCursorPos(void) const { * getMaximumContextLength(). */ int32_t TransliterationRule::getAnteContextLength(void) const { - return anteContext.length(); + return anteContextLength; +} + +/** + * Internal method. Returns 8-bit index value for this rule. + * This is the low byte of the first character of the key, + * unless the first character of the key is a set. If it's a + * set, or otherwise can match multiple keys, the index value is -1. + */ +int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return -1; + } + UChar c = pattern.charAt(anteContextLength); + return data.lookupSet(c) == NULL ? (c & 0xFF) : -1; +} + +/** + * Internal method. Returns true if this rule matches the given + * index value. The index value is an 8-bit integer, 0..255, + * representing the low byte of the first character of the key. + * It matches this rule if it matches the first character of the + * key, or if the first character of the key is a set, and the set + * contains any character with a low byte equal to the index + * value. If the rule contains only ante context, as in foo)>bar, + * then it will match any key. + */ +bool_t TransliterationRule::matchesIndexValue(uint8_t v, + const TransliterationRuleData& data) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return TRUE; + } + UChar c = pattern.charAt(anteContextLength); + UnicodeSet* set = data.lookupSet(c); + return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v); } /** @@ -118,43 +195,37 @@ int32_t TransliterationRule::getAnteContextLength(void) const { * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". * "[c]a>x" masks "[dc]a>y". - * - *

This method must not be called after freeze() is called. */ bool_t TransliterationRule::masks(const TransliterationRule& r2) const { - /* There are three cases of masking. In each instance, rule1 - * masks rule2. + /* Rule r1 masks rule r2 if the string formed of the + * antecontext, key, and postcontext overlaps in the following + * way: * - * 1. KEY mask: len(key1) < len(key2), key2 starts with key1. - * - * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2), - * prefix2 ends with prefix1, suffix2 starts with suffix1. - * - * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2), - * prefix2 ends with prefix1, suffix2 starts with suffix1. + * r1: aakkkpppp + * r2: aaakkkkkpppp + * ^ + * + * The strings must be aligned at the first character of the + * key. The length of r1 to the left of the alignment point + * must be <= the length of r2 to the left; ditto for the + * right. The characters of r1 must equal (or be a superset + * of) the corresponding characters of r2. The superset + * operation should be performed to check for UnicodeSet + * masking. */ /* LIMITATION of the current mask algorithm: Some rule * maskings are currently not detected. For example, - * "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking, - * we need a subset operator on UnicodeSet objects, which we - * currently do not have. This can be added later. + * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO */ - return ((maskKey->length() < r2.maskKey->length() && - r2.maskKey->startsWith(*maskKey)) || - (r2.anteContext.length() != 0 && *maskKey == *r2.maskKey && - ((anteContext.length() == 0) || - (anteContext.length() < r2.anteContext.length() && - r2.anteContext.endsWith(anteContext))))); -} -/** - * Free up space. Once this method is called, masks() must NOT be called. - * If it is called, an exception will be thrown. - */ -void TransliterationRule::freeze(void) { - delete maskKey; - maskKey = 0; + int32_t len = pattern.length(); + int32_t left = anteContextLength; + int32_t left2 = r2.anteContextLength; + int32_t right = len - left; + int32_t right2 = r2.pattern.length() - left2; + return left <= left2 && right <= right2 && + 0 == r2.pattern.compare(left2 - left, len, pattern); } /** @@ -186,17 +257,10 @@ bool_t TransliterationRule::matches(const UnicodeString& text, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - return - (anteContext.length() == 0 - || regionMatches(text, start, limit, result, - cursor - anteContext.length(), - anteContext, data, filter)) && - regionMatches(text, start, limit, result, cursor, - key, data, filter) && - (postContext.length() == 0 - || regionMatches(text, start, limit, result, - cursor + key.length(), - postContext, data, filter)); + // Match anteContext, key, and postContext + return regionMatches(text, start, limit, result, + cursor - anteContextLength, + pattern, data, filter); } /** @@ -219,15 +283,10 @@ bool_t TransliterationRule::matches(const Replaceable& text, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - return - (anteContext.length() == 0 - || regionMatches(text, start, limit, cursor - anteContext.length(), - anteContext, data, filter)) && - regionMatches(text, start, limit, cursor, - key, data, filter) && - (postContext.length() == 0 - || regionMatches(text, start, limit, cursor + key.length(), - postContext, data, filter)); + // Match anteContext, key, and postContext + return regionMatches(text, start, limit, + cursor - anteContextLength, + pattern, data, filter); } /** @@ -260,28 +319,10 @@ int32_t TransliterationRule::getMatchDegree(const Replaceable& text, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - if (anteContext.length() != 0 - && !regionMatches(text, start, limit, cursor - anteContext.length(), - anteContext, data, filter)) { - return MISMATCH; - } - int32_t len = getRegionMatchLength(text, start, limit, cursor, - key, data, filter); - if (len < 0) { - return MISMATCH; - } - if (len < key.length()) { - return PARTIAL_MATCH; - } - if (postContext.length() == 0) { - return FULL_MATCH; - } - len = getRegionMatchLength(text, start, limit, - cursor + key.length(), - postContext, data, filter); - return (len < 0) ? MISMATCH - : ((len == postContext.length()) ? FULL_MATCH - : PARTIAL_MATCH); + int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength, + pattern, data, filter); + return len < anteContextLength ? MISMATCH : + (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH); } /** diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h index ed957a22354..84b6fe1010a 100644 --- a/icu4c/source/i18n/rbt_rule.h +++ b/icu4c/source/i18n/rbt_rule.h @@ -72,9 +72,13 @@ public: private: /** - * The string that must be matched. + * The string that must be matched, consisting of the anteContext, key, + * and postContext, concatenated together, in that order. Some components + * may be empty (zero length). + * @see anteContextLength + * @see keyLength */ - UnicodeString key; + UnicodeString pattern; /** * The string that is emitted if the key, anteContext, and postContext @@ -83,16 +87,18 @@ private: UnicodeString output; /** - * The string that must match before the key. If empty, then - * there is no matching requirement before the key. + * The length of the string that must match before the key. If + * zero, then there is no matching requirement before the key. + * Substring [0,anteContextLength) of pattern is the anteContext. */ - UnicodeString anteContext; + int32_t anteContextLength; /** - * The string that must match after the key. If empty, then there - * is no matching requirement after the key. + * The length of the key. Substring [anteContextLength, + * anteContextLength + keyLength) is the key. + */ - UnicodeString postContext; + int32_t keyLength; /** * The position of the cursor after emitting the output string, from 0 to @@ -101,12 +107,6 @@ private: */ int32_t cursorPos; - /** - * A string used to implement masks(). - * @see #freeze - */ - UnicodeString* maskKey; - public: /** @@ -134,6 +134,29 @@ public: int32_t theCursorPos, UErrorCode &status); + /** + * Construct a new rule with the given input, output text, and other + * attributes. A cursor position may be specified for the output text. + * @param input input string, including key and optional ante and + * post context + * @param anteContextPos offset into input to end of ante context, or -1 if + * none. Must be <= input.length() if not -1. + * @param postContextPos offset into input to start of post context, or -1 + * if none. Must be <= input.length() if not -1, and must be >= + * anteContextPos. + * @param output output string + * @param cursorPos offset into output at which cursor is located, or -1 if + * none. If less than zero, then the cursor is placed after the + * output; that is, -1 is equivalent to + * output.length(). If greater than + * output.length() then an exception is thrown. + */ + TransliterationRule(const UnicodeString& input, + int32_t anteContextPos, int32_t postContextPos, + const UnicodeString& output, + int32_t cursorPos, + UErrorCode& status); + /** * Destructor. */ @@ -145,12 +168,6 @@ public: */ virtual int32_t getKeyLength(void) const; - /** - * Return the key. - * @return the match key. - */ - virtual const UnicodeString& getKey(void) const; - /** * Return the output string. * @return the output string. @@ -170,22 +187,39 @@ public: */ virtual int32_t getAnteContextLength(void) const; +private: + friend class TransliterationRuleSet; + + /** + * Internal method. Returns 8-bit index value for this rule. + * This is the low byte of the first character of the key, + * unless the first character of the key is a set. If it's a + * set, or otherwise can match multiple keys, the index value is -1. + */ + int16_t getIndexValue(const TransliterationRuleData& data); + + /** + * Internal method. Returns true if this rule matches the given + * index value. The index value is an 8-bit integer, 0..255, + * representing the low byte of the first character of the key. + * It matches this rule if it matches the first character of the + * key, or if the first character of the key is a set, and the set + * contains any character with a low byte equal to the index + * value. If the rule contains only ante context, as in foo)>bar, + * then it will match any key. + */ + bool_t matchesIndexValue(uint8_t v, + const TransliterationRuleData& data); + +public: /** * Return true if this rule masks another rule. If r1 masks r2 then * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". * "[c]a>x" masks "[dc]a>y". - * - *

This method must not be called after freeze() is called. */ virtual bool_t masks(const TransliterationRule& r2) const; - /** - * Free up space. Once this method is called, masks() must NOT be called. - * If it is called, an exception will be thrown. - */ - virtual void freeze(void); - /** * Return true if this rule matches the given text. The text being matched * occupies a virtual buffer consisting of the contents of diff --git a/icu4c/source/i18n/rbt_set.cpp b/icu4c/source/i18n/rbt_set.cpp index 489832747b7..a1ff2e46008 100644 --- a/icu4c/source/i18n/rbt_set.cpp +++ b/icu4c/source/i18n/rbt_set.cpp @@ -30,6 +30,16 @@ */ TransliterationRuleSet::TransliterationRuleSet() { maxContextLength = 0; + ruleVector = new UVector(); + rules = NULL; +} + +/** + * Destructor. + */ +TransliterationRuleSet::~TransliterationRuleSet() { + delete ruleVector; + delete[] rules; } /** @@ -45,31 +55,22 @@ int32_t TransliterationRuleSet::getMaximumContextLength(void) const { * significant. * *

Once freeze() is called, this method must not be called. - * @param rule the rule to add + * @param adoptedRule the rule to add */ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, UErrorCode& status) { - - // Build time, no checking : 3562 ms - // Build time, with checking: 6234 ms - if (U_FAILURE(status)) { delete adoptedRule; return; } - - for (int32_t i=0; imasks(*adoptedRule)) { - //throw new IllegalArgumentException("Rule " + rule + - // " must precede " + r); - status = U_ILLEGAL_ARGUMENT_ERROR; - delete adoptedRule; - return; - } + if (ruleVector == NULL) { + // throw new IllegalArgumentException("Cannot add rules after freezing"); + status = U_ILLEGAL_ARGUMENT_ERROR; + delete adoptedRule; + return; } + ruleVector->addElement(adoptedRule); - rules.addElement(adoptedRule); int32_t len; if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) { maxContextLength = len; @@ -77,13 +78,109 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, } /** - * Free up space. Once this method is called, addRule() must NOT - * be called again. + * Close this rule set to further additions, check it for masked rules, + * and index it to optimize performance. Once this method is called, + * addRule() can no longer be called. + * @exception IllegalArgumentException if some rules are masked */ -void TransliterationRuleSet::freeze(void) { - for (int32_t i=0; ifreeze(); +void TransliterationRuleSet::freeze(const TransliterationRuleData& data, + UErrorCode& status) { + if (U_FAILURE(status)) { + return; } + + /* Construct the rule array and index table. We reorder the + * rules by sorting them into 256 bins. Each bin contains all + * rules matching the index value for that bin. A rule + * matches an index value if string whose first key character + * has a low byte equal to the index value can match the rule. + * + * Each bin contains zero or more rules, in the same order + * they were found originally. However, the total rules in + * the bins may exceed the number in the original vector, + * since rules that have a variable as their first key + * character will generally fall into more than one bin. + * + * That is, each bin contains all rules that either have that + * first index value as their first key character, or have + * a set containing the index value as their first character. + */ + int32_t n = ruleVector->size(); + int32_t j; + int16_t x; + UVector v(2*n); // heuristic; adjust as needed + + /* Precompute the index values. This saves a LOT of time. + */ + int16_t* indexValue = new int16_t[n]; + for (j=0; jelementAt(j); + indexValue[j] = r->getIndexValue(data); + } + for (x=0; x<256; ++x) { + index[x] = v.size(); + for (j=0; j= 0) { + if (indexValue[j] == x) { + v.addElement(ruleVector->elementAt(j)); + } + } else { + // If the indexValue is < 0, then the first key character is + // a set, and we must use the more time-consuming + // matchesIndexValue check. In practice this happens + // rarely, so we seldom tread this code path. + TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j); + if (r->matchesIndexValue((uint8_t)x, data)) { + v.addElement(r); + } + } + } + } + delete[] indexValue; + index[256] = v.size(); + + /* Freeze things into an array. + */ + rules = new TransliterationRule*[v.size()]; + for (j=0; jmasks(*r2)) { +//| if (errors == null) { +//| errors = new StringBuffer(); +//| } else { +//| errors.append("\n"); +//| } +//| errors.append("Rule " + r1 + " masks " + r2); + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + } + } + } + + //if (errors != null) { + // throw new IllegalArgumentException(errors.toString()); + //} } /** @@ -119,15 +216,18 @@ TransliterationRuleSet::findMatch(const UnicodeString& text, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - for (int32_t i=0; imatches(text, start, limit, result, - cursor, data, filter)) { - return rule; + /* We only need to check our indexed bin of the rule table, + * based on the low byte of the first key character. + */ + int32_t rlen = result.length(); + int16_t x = 0xFF & (cursor < rlen ? result.charAt(cursor) + : text.charAt(cursor - rlen + start)); + for (int32_t i=index[x]; imatches(text, start, limit, result, cursor, data, filter)) { + return rules[i]; } } - return 0; + return NULL; } /** @@ -154,15 +254,16 @@ TransliterationRuleSet::findMatch(const Replaceable& text, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - for (int32_t i=0; imatches(text, start, limit, cursor, - data, filter)) { - return rule; + /* We only need to check our indexed bin of the rule table, + * based on the low byte of the first key character. + */ + int16_t x = text.charAt(cursor) & 0xFF; + for (int32_t i=index[x]; imatches(text, start, limit, cursor, data, filter)) { + return rules[i]; } } - return 0; + return NULL; } /** @@ -199,19 +300,22 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text, const TransliterationRuleData& data, bool_t& isPartial, const UnicodeFilter* filter) const { + + /* We only need to check our indexed bin of the rule table, + * based on the low byte of the first key character. + */ isPartial = FALSE; - for (int32_t i=0; igetMatchDegree(text, start, limit, cursor, - data, filter); + int16_t x = text.charAt(cursor) & 0xFF; + for (int32_t i=index[x]; igetMatchDegree(text, start, limit, cursor, + data, filter); switch (match) { case TransliterationRule::FULL_MATCH: - return rule; + return rules[i]; case TransliterationRule::PARTIAL_MATCH: isPartial = TRUE; - return 0; + return NULL; } } - return 0; + return NULL; } diff --git a/icu4c/source/i18n/rbt_set.h b/icu4c/source/i18n/rbt_set.h index d6602bc4bab..3b9aa5c9d06 100644 --- a/icu4c/source/i18n/rbt_set.h +++ b/icu4c/source/i18n/rbt_set.h @@ -30,15 +30,30 @@ class UnicodeString; */ class TransliterationRuleSet { /** - * Vector of rules, in the order added. + * Vector of rules, in the order added. This is only used while the rule + * set is getting built. After that, freeze() reorders and indexes the + * rules, and this Vector is freed. */ - UVector rules; + UVector* ruleVector; /** * Length of the longest preceding context */ int32_t maxContextLength; + /** + * Sorted and indexed table of rules. This is created by freeze() from + * the rules in ruleVector. + */ + TransliterationRule** rules; + + /** + * Index table. For text having a first character c, compute x = c&0xFF. + * Now use rules[index[x]..index[x+1]-1]. This index table is created by + * freeze(). + */ + int32_t index[257]; + public: /** @@ -46,6 +61,11 @@ public: */ TransliterationRuleSet(); + /** + * Destructor. + */ + virtual ~TransliterationRuleSet(); + /** * Return the maximum context length. * @return the length of the longest preceding context. @@ -57,16 +77,19 @@ public: * significant. * *

Once freeze() is called, this method must not be called. - * @param rule the rule to add + * @param adoptedRule the rule to add */ virtual void addRule(TransliterationRule* adoptedRule, UErrorCode& status); /** - * Free up space. Once this method is called, addRule() must NOT - * be called again. + * Close this rule set to further additions, check it for masked rules, + * and index it to optimize performance. Once this method is called, + * addRule() can no longer be called. + * @exception IllegalArgumentException if some rules are masked */ - virtual void freeze(void); + virtual void freeze(const TransliterationRuleData& data, + UErrorCode& status); /** * Attempt to find a matching rule at the specified point in the text. The diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp index 7a8540a7c1c..dc772d1b278 100644 --- a/icu4c/source/i18n/uniset.cpp +++ b/icu4c/source/i18n/uniset.cpp @@ -14,7 +14,7 @@ // N.B.: This mapping is different in ICU and Java const UnicodeString UnicodeSet::CATEGORY_NAMES( - "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf"); + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", ""); /** * A cache mapping character category integers, as returned by @@ -28,7 +28,7 @@ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE = * Delimiter string used in patterns to close a category reference: * ":]". Example: "[:Lu:]". */ -const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", ""); +const UnicodeString UnicodeSet::CATEGORY_CLOSE = UNICODE_STRING(":]", 2); /** * Delimiter char beginning a variable reference: @@ -69,23 +69,20 @@ UnicodeSet::UnicodeSet() : pairs() {} * white space. See the class description for the syntax of the * pattern language. * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored, except those preceded by '\\'. Spaces are - * those characters for which Character.isSpaceChar() - * is true. * @exception IllegalArgumentException if the pattern * contains a syntax error. */ -UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, - UErrorCode& status) : pairs() { - applyPattern(pattern, ignoreSpaces, status); -} - UnicodeSet::UnicodeSet(const UnicodeString& pattern, UErrorCode& status) : pairs() { applyPattern(pattern, status); } +UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, + const TransliterationRuleData* data, + UErrorCode& status) { + parse(pairs, pattern, pos, data, status); +} + /** * Constructs a set from the given Unicode character category. * @param category an integer indicating the character category as @@ -164,50 +161,24 @@ int32_t UnicodeSet::hashCode(void) const { * contains a syntax error. */ void UnicodeSet::applyPattern(const UnicodeString& pattern, - bool_t ignoreSpaces, UErrorCode& status) { if (U_FAILURE(status)) { return; } ParsePosition pos(0); - UnicodeString* pat = (UnicodeString*) &pattern; + parse(pairs, pattern, pos, NULL, status); - // To ignore spaces, create a new pattern without spaces. We - // have to process all '\' escapes. If '\' is encountered, - // insert it and the following character (if any -- let parse - // deal with any syntax errors) in the pattern. This allows - // escaped spaces. - if (ignoreSpaces) { - pat = new UnicodeString(); - for (int32_t i=0; iappend(c); - c = pattern.charAt(++i); - // Fall through and append the following char - } - pat->append(c); - } + // Skip over trailing whitespace + int32_t i = pos.getIndex(); + int32_t n = pattern.length(); + while (ilength() && - Unicode::isWhitespace(pat->charAt(pos.getIndex()))) { - pos.setIndex(pos.getIndex() + 1); - } - - if (pos.getIndex() != pat->length()) { + if (i != n) { status = U_ILLEGAL_ARGUMENT_ERROR; } - if (pat != &pattern) { - delete pat; - } } /** @@ -279,6 +250,34 @@ bool_t UnicodeSet::contains(UChar c) const { return contains(c, c); } +/** + * Returns true if this set contains any character whose low byte + * is the given value. This is used by RuleBasedTransliterator for + * indexing. + */ +bool_t UnicodeSet::containsIndexValue(uint8_t v) const { + /* The index value v, in the range [0,255], is contained in this set if + * it is contained in any pair of this set. Pairs either have the high + * bytes equal, or unequal. If the high bytes are equal, then we have + * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= + * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. + * Then v is contained if xx <= v || v <= yy. (This is identical to the + * time zone month containment logic.) + */ + for (int32_t i=0; i