ICU-4220 Add a rule stripping function for transliterators.

X-SVN-Rev: 17288
2025-04-08 06:53:45 +00:00 · 2005-03-08 07:04:29 +00:00 · 2005-03-08 07:04:29 +00:00 · 26f029b88d
commit 26f029b88d
parent ebcb00caff
2 changed files with 106 additions and 0 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -1545,4 +1545,95 @@ Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& i

 U_NAMESPACE_END

+/* Copies the rules in source[0..sourceLen) to target, dropping comments, line breaks
+ * and the spaces next to them, and replacing \uXXXX by the character itself unless
+ * that character is white space, a control or punctuation. Reads stay inside the
+ * source range. Returns the number of UChars written; a malformed \u escape sets
+ * *status to U_PARSE_ERROR and returns 0. target needs room for sourceLen UChars. */
+U_CAPI int32_t
+utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) {
+    const UChar *targetStart = target;
+    const UChar *sourceLimit = source+sourceLen;
+    UChar *targetLimit = target+sourceLen;
+    UChar32 c = 0;
+    UBool quoted = FALSE;
+    int32_t index;
+
+    uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);
+    /* read the rules into the buffer; every read is checked against sourceLimit */
+    while (source < sourceLimit) {
+        index=0;
+        U16_NEXT(source, index, (int32_t)(sourceLimit-source), c);
+        source+=index;
+        if(c == QUOTE) {
+            quoted = (UBool)!quoted;
+        }
+        else if (!quoted) {
+            if (c == RULE_COMMENT_CHAR) {
+                /* skip comments and all preceding spaces */
+                while (targetStart < target && *(target - 1) == 0x0020) {
+                    target--;
+                }
+                /* Skip the comment text; its line break (if any) is handled further down. */
+                while (source < sourceLimit) {
+                    c = *(source++);
+                    if (c == CR || c == LF) {
+                        break;
+                    }
+                }
+                if (c != CR && c != LF) {
+                    break; /* no line break: the comment was the last thing in the input */
+                }
+            }
+            else if (c == ESCAPE && source < sourceLimit) {
+                UChar32   c2 = *source;
+                if (c2 == CR || c2 == LF) {
+                    /* A backslash at the end of a line. Since we're stripping lines, ignore it. */
+                    source++;
+                    continue;
+                }
+                /* \u seen (\U isn't unescaped); the u and its four hex digits must all be in range. */
+                if (c2 == 0x0075 && sourceLimit-source >= 5) {
+                    int32_t escapeOffset = 0;
+                    UnicodeString escapedStr(source, 5);
+                    c2 = escapedStr.unescapeAt(escapeOffset);
+                    if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) {
+                        *status = U_PARSE_ERROR;
+                        return 0;
+                    }
+                    if (!uprv_isRuleWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
+                        /* It was escaped for a reason. Write what it was supposed to be. */
+                        source+=5;
+                        c = c2;
+                    }
+                }
+                else if (c2 == QUOTE) {
+                    /* \' seen. Make sure we don't do anything when we see it again. */
+                    quoted = (UBool)!quoted;
+                }
+            }
+        }
+        if (c == CR || c == LF) {
+            /* Drop the line break together with any further line breaks and leading spaces. */
+            quoted = FALSE;
+            while (source < sourceLimit) {
+                c = *(source);
+                if (c != CR && c != LF && c != 0x0020) {
+                    break;
+                }
+                source++;
+            }
+            continue;
+        }
+        /* Append c, as a surrogate pair when c > 0xffff. */
+        index=0;
+        U16_APPEND_UNSAFE(target, index, c);
+        target+=index;
+    }
+    if (target < targetLimit) {
+        *target = 0;
+    }
+    return (int32_t)(target-targetStart);
+}
+
 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -10,6 +10,7 @@

 #include "unicode/utypes.h"

+#ifdef XP_CPLUSPLUS
 #if !UCONFIG_NO_TRANSLITERATION

 #include "unicode/uobject.h"
@ -345,5 +346,19 @@ private:
 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
+#endif /* #ifdef XP_CPLUSPLUS */
+
+/**
+ * Strip/convert the following from the transliterator rules:
+ * comments and the spaces before them, newlines, spaces at the beginning of a
+ * line, and \uXXXX notation (unescaped unless white space, control or punctuation).
+ * @param source the rules; sourceLen is their length in UChars
+ * @param target receives the result; it must be at least as large as the source
+ * @param status set to U_PARSE_ERROR when a \u escape cannot be unescaped
+ * @return the number of UChars written to target, 0 on error
+ * @internal
+ */
+U_CAPI int32_t
+utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

 #endif