ICU-329 parse engines need better error reporting

X-SVN-Rev: 958
This commit is contained in:
Alan Liu 2000-03-18 01:42:45 +00:00
parent 24bb0f4fce
commit af7124308c
7 changed files with 145 additions and 69 deletions

View file

@ -868,6 +868,25 @@ InputPath=.\unicode\numfmt.h
# End Source File
# Begin Source File
SOURCE=.\unicode\parseerr.h
!IF "$(CFG)" == "i18n - Win32 Release"
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\unicode\parseerr.h
"..\..\include\unicode\parseerr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unicode\parseerr.h ..\..\include\unicode
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\unicode\parsepos.h
!IF "$(CFG)" == "i18n - Win32 Release"

View file

@ -15,11 +15,12 @@
void RuleBasedTransliterator::_construct(const UnicodeString& rules,
Direction direction,
UErrorCode& status) {
UErrorCode& status,
ParseError* parseError) {
data = 0;
isDataOwned = TRUE;
if (U_SUCCESS(status)) {
data = TransliterationRuleParser::parse(rules, direction);
data = TransliterationRuleParser::parse(rules, direction, parseError);
if (data == 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {

View file

@ -16,6 +16,7 @@
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"
#include "unicode/parseerr.h"
// Operators
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
@ -91,8 +92,9 @@ void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction) {
TransliterationRuleParser parser(rules, direction);
RuleBasedTransliterator::Direction direction,
ParseError* parseError) {
TransliterationRuleParser parser(rules, direction, parseError);
parser.parseRules();
if (U_FAILURE(parser.status)) {
delete parser.data;
@ -108,8 +110,9 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
*/
TransliterationRuleParser::TransliterationRuleParser(
const UnicodeString& theRules,
RuleBasedTransliterator::Direction theDirection) :
rules(theRules), direction(theDirection), data(0) {
RuleBasedTransliterator::Direction theDirection,
ParseError* theParseError) :
rules(theRules), direction(theDirection), data(0), parseError(theParseError) {
parseData = new ParseData(0, &setVariablesVector);
}
@ -139,6 +142,9 @@ void TransliterationRuleParser::parseRules(void) {
parseData->data = data;
setVariablesVector.removeAllElements();
if (parseError != 0) {
parseError->code = 0;
}
determineVariableRange();
int32_t pos = 0;
@ -225,19 +231,19 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
return syntaxError("Trailing backslash", rules, start);
return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start);
}
// Parse \uXXXX escapes
c = rules.charAt(pos++);
if (c == 0x0075/*u*/) {
if ((pos+4) > limit) {
return syntaxError("Malformed Unicode escape", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
}
c = (UChar)0x0000;
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
if (digit<0) {
return syntaxError("Malformed Unicode escape", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
}
c = (UChar) ((c << 4) | digit);
}
@ -261,7 +267,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
*/
for (;;) {
if (iq < 0) {
return syntaxError("Unterminated quote", rules, start);
return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, iq, scratch);
@ -280,7 +286,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
}
if (OPERATORS.indexOf(c) >= 0) {
if (op != 0) {
return syntaxError("Unquoted special", rules, start);
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
}
// Found an operator char. Check for forward-reverse operator.
if (c == REVERSE_RULE_OP &&
@ -308,21 +314,21 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
{
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
return syntaxError("Malformed variable reference", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, j, scratch);
pos = j+1;
UChar v = data->lookupVariable(scratch, status);
if (U_FAILURE(status)) {
return syntaxError("Undefined variable", rules, start);
return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start);
}
buf.append(v);
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
return syntaxError("Multiple post contexts", rules, start);
return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
@ -332,14 +338,14 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
return syntaxError("Unexpected ')'", rules, start);
return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
return syntaxError("Multiple ante contexts", rules, start);
return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start);
} else {
ante = buf.length();
}
@ -348,16 +354,16 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
ParsePosition pp(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
if (U_FAILURE(status)) {
return syntaxError("Invalid set", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start);
}
pos = pp.getIndex(); }
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
return syntaxError("Unquoted special", rules, start);
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
case CURSOR_POS:
if (cursor >= 0) {
return syntaxError("Multiple cursors", rules, start);
return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start);
}
cursor = buf.length();
break;
@ -367,13 +373,13 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
}
}
if (op == 0) {
return syntaxError("No operator", rules, start);
return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start);
}
// Check context close parameters
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
(postClose >= 0 && postClose != buf.length())) {
return syntaxError("Extra text after ]", rules, start);
return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start);
}
// Context is only allowed on the input side; that is, the left side
@ -388,10 +394,10 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
// character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error.
if (buf.length() != 1) {
return syntaxError("Malformed RHS", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start);
}
if (data->isVariableDefined(left)) {
return syntaxError("Duplicate definition", rules, start);
return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start);
}
data->defineVariable(left, buf.charAt(0), status);
break;
@ -399,7 +405,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
case FORWARD_RULE_OP:
if (direction == RuleBasedTransliterator::FORWARD) {
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
return syntaxError("Malformed rule", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
@ -410,7 +416,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
case REVERSE_RULE_OP:
if (direction == RuleBasedTransliterator::REVERSE) {
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
return syntaxError("Malformed rule", rules, start);
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
buf, ante, post,
@ -457,15 +463,19 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
* @param rule pattern string
* @param start position of first character of current rule
*/
int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
const UnicodeString& /*rule*/,
int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
const UnicodeString& rule,
int32_t start) {
//| int end = quotedIndexOf(rule, start, rule.length(), ";");
//| if (end < 0) {
//| end = rule.length();
//| }
//| throw new IllegalArgumentException(msg + " in " +
//| rule.substring(start, end));
if (parseError != 0) {
parseError->code = parseErrorCode;
parseError->line = 0; // We don't return a line #
parseError->offset = start; // Character offset from rule start
int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE);
if (end < 0) {
end = rule.length();
}
rule.extractBetween(start, end, parseError->context); // Current rule
}
status = U_ILLEGAL_ARGUMENT_ERROR;
return start;
}
@ -512,30 +522,21 @@ void TransliterationRuleParser::determineVariableRange(void) {
}
/**
* Returns the index of the first character in a set, ignoring quoted text.
* Returns the index of a character, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for "h". Unlike String.indexOf(), this method searches
* not for a single character, but for any character of the string
* <code>setOfChars</code>.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #indexOf
* found by a search for 'h'.
*/
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars) {
UChar charToFind) {
for (int32_t i=start; i<limit; ++i) {
UChar c = text.charAt(i);
if (c == QUOTE) {
if (c == ESCAPE) {
++i;
} else if (c == QUOTE) {
while (++i < limit
&& text.charAt(i) != QUOTE) {}
} else if (setOfChars.indexOf(c) >= 0) {
} else if (c == charToFind) {
return i;
}
}

View file

@ -10,6 +10,7 @@
#include "unicode/rbt.h"
#include "uvector.h"
#include "unicode/parseerr.h"
class TransliterationRuleData;
class UnicodeSet;
@ -33,6 +34,12 @@ class TransliterationRuleParser {
*/
UErrorCode status;
/**
* Pointer to user structure in which to return parse error information.
* May be NULL.
*/
ParseError* parseError;
/**
* Temporary symbol table used during parsing.
*/
@ -84,7 +91,8 @@ public:
static TransliterationRuleData*
parse(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction);
RuleBasedTransliterator::Direction direction,
ParseError* parseError = 0);
private:
@ -94,7 +102,8 @@ private:
* rules
*/
TransliterationRuleParser(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction);
RuleBasedTransliterator::Direction direction,
ParseError* parseError = 0);
/**
* Destructor.
@ -135,7 +144,7 @@ private:
* @param rule pattern string
* @param start position of first character of current rule
*/
int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
int32_t syntaxError(int32_t parseErrorCode, const UnicodeString&, int32_t start);
/**
* Allocate a private-use substitution character for the given set,
@ -155,24 +164,20 @@ private:
void determineVariableRange(void);
/**
* Returns the index of the first character in a set, ignoring quoted text.
* Returns the index of a character, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for "h". Unlike String.indexOf(), this method searches
* not for a single character, but for any character of the string
* <code>setOfChars</code>.
* found by a search for 'h'.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #indexOf
* @param c character to search for
* @return Offset of the first instance of c, or -1 if not found.
*/
static int32_t quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars);
UChar c);
};
#endif

View file

@ -534,7 +534,8 @@ Transliterator* Transliterator::createInverse(void) const {
* @see #getID
*/
Transliterator* Transliterator::createInstance(const UnicodeString& ID,
Transliterator::Direction dir) {
Transliterator::Direction dir,
ParseError* parseError) {
if (ID.indexOf(ID_DELIM) >= 0) {
return new CompoundTransliterator(ID, dir, 0);
}
@ -546,10 +547,10 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
ID.extractBetween(i+1, ID.length(), inverseID);
ID.extractBetween(0, i, right);
inverseID.append(ID_SEP).append(right);
t = _createInstance(inverseID);
t = _createInstance(inverseID, parseError);
}
} else {
t = _createInstance(ID);
t = _createInstance(ID, parseError);
}
return t;
}
@ -607,7 +608,8 @@ inline int32_t Transliterator::hash(const UnicodeString& str) {
* Returns a transliterator object given its ID. Unlike getInstance(),
* this method returns null if it cannot make use of the given ID.
*/
Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {
Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
ParseError* parseError) {
UErrorCode status = U_ZERO_ERROR;
if (!cacheInitialized) {
@ -660,8 +662,9 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {
data = TransliterationRuleParser::parse(*rules, isReverse
? RuleBasedTransliterator::REVERSE
: RuleBasedTransliterator::FORWARD);
: RuleBasedTransliterator::FORWARD,
parseError);
// Double check to see if someone has modified the entry
// since we last looked at it.
if (entry->entryType != CacheEntry::RBT_DATA) {

View file

@ -51,6 +51,7 @@ TransliteratorTest::runIndexedTest(int32_t index, bool_t exec,
CASE(11,TestPatternQuoting);
CASE(12,TestJ277);
CASE(13,TestJ243);
CASE(14,TestJ329);
default: name = ""; break;
}
}
@ -65,11 +66,15 @@ void TransliteratorTest::TestInstantiation() {
i + ") returned empty string");
continue;
}
Transliterator* t = Transliterator::createInstance(id);
ParseError parseError;
Transliterator* t = Transliterator::createInstance(id,
Transliterator::FORWARD, &parseError);
name.truncate(0);
Transliterator::getDisplayName(id, name);
if (t == 0) {
errln(UnicodeString("FAIL: Couldn't create ") + id);
errln(UnicodeString("FAIL: Couldn't create ") + id +
", parse error " + parseError.code + ", line " +
parseError.line + ", offset " + parseError.offset);
// When createInstance fails, it deletes the failing
// entry from the available ID list. We detect this
// here by looking for a change in countAvailableIDs.
@ -577,6 +582,43 @@ void TransliteratorTest::TestJ243(void) {
expect(hex3, "012", "&#x30;&#x31;&#x32;");
}
/**
* Parsers need better syntax error messages.
*/
void TransliteratorTest::TestJ329(void) {
struct { bool_t containsErrors; const char* rule; } DATA[] = {
{ FALSE, "a > b; c > d" },
{ TRUE, "a > b; no operator; c > d" },
};
int32_t DATA_length = sizeof(DATA) / sizeof(DATA[0]);
for (int32_t i=0; i<DATA_length; ++i) {
UErrorCode status = U_ZERO_ERROR;
ParseError parseError;
RuleBasedTransliterator rbt("<ID>",
DATA[i].rule,
Transliterator::FORWARD,
0,
parseError,
status);
bool_t gotError = U_FAILURE(status);
UnicodeString desc(DATA[i].rule);
desc.append(gotError ? " -> error" : " -> no error");
if (gotError) {
desc = desc + ", ParseError code=" + parseError.code +
" line=" + parseError.line +
" offset=" + parseError.offset +
" context=" + parseError.context;
}
if (gotError == DATA[i].containsErrors) {
logln(UnicodeString("Ok: ") + desc);
} else {
errln(UnicodeString("FAIL: ") + desc);
}
}
}
//======================================================================
// Support methods
//======================================================================

View file

@ -91,6 +91,11 @@ class TransliteratorTest : public IntlTest {
*/
void TestJ243(void);
/**
* Parsers need better syntax error messages.
*/
void TestJ329(void);
//======================================================================
// Support methods
//======================================================================