ICU-4220 Add a rule stripping function for transliterators.

X-SVN-Rev: 17288
This commit is contained in:
George Rhoten 2005-03-08 07:04:29 +00:00
parent ebcb00caff
commit 26f029b88d
2 changed files with 106 additions and 0 deletions

View file

@ -1545,4 +1545,95 @@ Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& i
U_NAMESPACE_END
/**
 * Copy transliterator rules from source to target while removing the parts
 * that are not needed to interpret them: comments, line breaks, spaces
 * (U+0020) that follow a line break or precede a comment, and backslashes
 * that sit directly before a line break.  A \uXXXX escape is replaced by the
 * character it names unless that character is rule white space, a control
 * character or punctuation, in which case the escape is copied unchanged.
 *
 * Text between quote characters is copied as is, except that a line break
 * always ends the quoted state and is still removed.
 *
 * @param source    Rules to strip; exactly sourceLen UChars are examined.
 * @param sourceLen Number of UChars in source.
 * @param target    Output buffer with room for at least sourceLen UChars.
 *                  It is zero-filled first, and NUL-terminated only when the
 *                  result is shorter than sourceLen.
 * @param status    Set to U_PARSE_ERROR when a \u escape cannot be parsed;
 *                  not read or modified otherwise.
 * @return The number of UChars written to target, or 0 on a parse error.
 */
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) {
    const UChar *targetStart = target;
    const UChar *sourceLimit = source+sourceLen;
    UChar *targetLimit = target+sourceLen;
    UChar32 c = 0;
    UBool quoted = FALSE;
    int32_t index;

    uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);

    /* read the rules into the buffer */
    while (source < sourceLimit)
    {
        index=0;
        U16_NEXT_UNSAFE(source, index, c);
        source+=index;
        if(c == QUOTE) {
            quoted = (UBool)!quoted;
        }
        else if (!quoted) {
            if (c == RULE_COMMENT_CHAR) {
                /* skip comments and all preceding spaces */
                while (targetStart < target && *(target - 1) == 0x0020) {
                    target--;
                }
                /* Consume the comment through its line break.  The last line
                 * of the rules may be a comment without a line break, so the
                 * scan must also stop at the end of the input.
                 */
                while (source < sourceLimit) {
                    c = *(source++);
                    if (c == CR || c == LF) {
                        break;
                    }
                }
                continue;
            }
            else if (c == ESCAPE && source < sourceLimit) {
                /* Only look at the escaped character when there is one; a
                 * backslash at the very end of the input is copied as is.
                 */
                UChar32 c2 = *source;
                if (c2 == CR || c2 == LF) {
                    /* A backslash at the end of a line. */
                    /* Since we're stripping lines, ignore the backslash. */
                    source++;
                    continue;
                }
                /* \u seen. \U isn't unescaped.  The 'u' and its four hex
                 * digits (5 UChars) must all lie inside the input.
                 */
                if (c2 == 0x0075 && (sourceLimit - source) >= 5) {
                    int32_t escapeOffset = 0;
                    UnicodeString escapedStr(source, 5);
                    c2 = escapedStr.unescapeAt(escapeOffset);

                    if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0)
                    {
                        *status = U_PARSE_ERROR;
                        return 0;
                    }
                    if (!uprv_isRuleWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
                        /* It was escaped for a reason. Write what it was supposed to be. */
                        source+=5;
                        c = c2;
                    }
                }
                else if (c2 == QUOTE) {
                    /* \' seen. Make sure we don't do anything when we see it again. */
                    quoted = (UBool)!quoted;
                }
            }
        }
        if (c == CR || c == LF)
        {
            /* Drop the line break together with any further line breaks and
             * spaces (U+0020) that follow it.  A line break also ends the
             * quoted state.
             */
            quoted = FALSE;
            while (source < sourceLimit) {
                c = *(source);
                if (c != CR && c != LF && c != 0x0020) {
                    break;
                }
                source++;
            }
            continue;
        }

        /* Append c, split into a surrogate pair if c > 0xffff */
        index=0;
        U16_APPEND_UNSAFE(target, index, c);
        target+=index;
    }
    /* Terminate only when the stripped rules left room in the buffer. */
    if (target < targetLimit) {
        *target = 0;
    }
    return (int32_t)(target-targetStart);
}
#endif /* #if !UCONFIG_NO_TRANSLITERATION */

View file

@ -10,6 +10,7 @@
#include "unicode/utypes.h"
#ifdef XP_CPLUSPLUS
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uobject.h"
@ -345,5 +346,19 @@ private:
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
#endif /* #ifdef XP_CPLUSPLUS */
/**
 * Strip/convert the following from the transliterator rules:
 * comments (from the comment character through the end of its line,
 *     together with any U+0020 spaces immediately before the comment)
 * newlines (CR and LF)
 * spaces (U+0020) at the beginning of a line
 * a backslash placed directly before a newline
 * unescape \u notation (only \uXXXX, not \U; the escape is kept as written
 *     when the character is rule white space, a control or punctuation)
 *
 * Comments and escapes are not interpreted inside quoted text; a newline
 * always ends the quoted state and is still removed.
 *
 * The target must be equal in size as the source.
 *
 * @param source    The rules to strip; sourceLen UChars are read.
 * @param sourceLen The number of UChars in source.
 * @param target    The output buffer, at least sourceLen UChars long.  It is
 *                  zero-filled before use and NUL-terminated only when the
 *                  result is shorter than sourceLen.
 * @param status    Set to U_PARSE_ERROR when a \u escape cannot be parsed.
 *                  It is not read or changed otherwise.
 * @return The number of UChars written to target, or 0 on a parse error.
 * @internal
 */
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
#endif