ICU-1130 use perl \N{name} notation in Any-Name and Name-Any

X-SVN-Rev: 9849
2025-04-10 07:39:16 +00:00 · 2002-09-06 23:30:29 +00:00 · 2002-09-06 23:30:29 +00:00 · 85d23479b6
commit 85d23479b6
parent c8f160bcab
5 changed files with 143 additions and 127 deletions
--- a/icu4c/source/i18n/name2uni.cpp
+++ b/icu4c/source/i18n/name2uni.cpp
@ -10,11 +10,11 @@

 #include "unicode/unifilt.h"
 #include "unicode/uchar.h"
+#include "unicode/uniset.h"
 #include "name2uni.h"
+#include "cmemory.h"
 #include "uprops.h"
-
-// As of Unicode 3.0.0, the longest name is 83 characters long.
-#define LONGEST_NAME 83
+#include "util.h"

 U_NAMESPACE_BEGIN

@ -22,25 +22,17 @@ const char NameUnicodeTransliterator::fgClassID=0;

 const char NameUnicodeTransliterator::_ID[] = "Name-Any";

-/**
- * Constructs a transliterator.
- */
-NameUnicodeTransliterator::NameUnicodeTransliterator(
-                                 UChar32 openDelim, UChar32 closeDelim,
-                                 UnicodeFilter* adoptedFilter) :
-    Transliterator(_ID, adoptedFilter),
-    openDelimiter(openDelim),
-    closeDelimiter(closeDelim) {
-}
+static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
+static const UChar OPEN_DELIM  = 92;  // '\\' first char of OPEN
+static const UChar CLOSE_DELIM = 125; // '}'
+static const UChar SPACE       = 32;  // ' '

 /**
 * Constructs a transliterator with the default delimiters '{' and
 * '}'.
 */
 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
-    Transliterator(_ID, adoptedFilter),
-    openDelimiter((UChar) 0x007B /*{*/),
-    closeDelimiter((UChar) 0x007D /*}*/) {
+    Transliterator(_ID, adoptedFilter) {
 }

 /**
@ -52,9 +44,7 @@ NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
 * Copy constructor.
 */
 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
-    Transliterator(o),
-    openDelimiter(o.openDelimiter),
-    closeDelimiter(o.closeDelimiter) {}
+    Transliterator(o) {}

 /**
 * Assignment operator.
@ -62,8 +52,6 @@ NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliter
 NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
                             const NameUnicodeTransliterator& o) {
    Transliterator::operator=(o);
-    openDelimiter = o.openDelimiter;
-    closeDelimiter = o.closeDelimiter;
    return *this;
 }

@ -79,13 +67,30 @@ Transliterator* NameUnicodeTransliterator::clone(void) const {
 */
 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
-    // Accomodate the longest possible name plus padding
-    UChar buf[LONGEST_NAME + 8];
-    char cbuf[LONGEST_NAME + 8]; // Default converter
+    // The failure mode, here and below, is to behave like Any-Null,
+    // if either there is no name data (max len == 0) or there is no
+    // memory (malloc() => NULL).

-    // The only characters used in names are (as of Unicode 3.0.0):
-    //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
-    // (first character is a space).
+    int32_t maxLen = uprv_getMaxCharNameLength();
+    if (maxLen == 0) {
+        offsets.start = offsets.limit;
+        return;
+    }
+
+    // Accommodate the longest possible name
+    ++maxLen; // allow for temporary trailing space
+    char* cbuf = (char*) uprv_malloc(maxLen);
+    if (cbuf == NULL) {
+        offsets.start = offsets.limit;
+        return;
+    }
+
+    UnicodeString openPat(TRUE, OPEN, -1);
+    UnicodeString str, name;
+
+    // Get the legal character set
+    UnicodeSet legal;
+    uprv_getCharNameCharacters((USet*) &legal); // USet* == UnicodeSet*
    
    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;
@ -94,63 +99,77 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
-    int32_t ibuf = 0;
-    int32_t openPos = offsets.start; // position of openDelimiter
-
-    UnicodeString str;
+    int32_t openPos = -1; // open delim candidate pos

    UChar32 c;
-    for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {
+    while (cursor < limit) {
        c = text.char32At(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
-            if (c == openDelimiter) {
+            if (c == OPEN_DELIM) { // quick check first
                openPos = cursor;
-                mode = 1;
-                ibuf = 0;
+                int32_t i =
+                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
+                if (i >= 0 && i < limit) {
+                    mode = 1;
+                    name.truncate(0);
+                    cursor = i;
+                    continue; // *** reprocess char32At(cursor)
+                }
            }
            break;

        case 1: // after open delimiter
-            // Look for [-a-zA-Z0-9<>].  If \s+ is found, convert it
+            // Look for legal chars.  If \s+ is found, convert it
            // to a single space.  If closeDelimiter is found, exit
            // the loop.  If any other character is found, exit the
-            // loop.  If the limit is found, exit the loop.
+            // loop.  If the limit is reached, exit the loop.
+
+            // Convert \s+ => SPACE.  This assumes there are no
+            // runs of >1 space characters in names.
            if (uprv_isRuleWhiteSpace(c)) {
                // Ignore leading whitespace
-                if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
-                    buf[ibuf++] = (UChar)0x0020 /* */;
-                    // If we go a bit past the longest possible name then abort
-                    if (ibuf == (LONGEST_NAME + 4)) {
+                if (name.length() > 0 &&
+                    name.charAt(name.length()-1) != SPACE) {
+                    name.append(SPACE);
+                    // If we are too long then abort.  maxLen includes
+                    // temporary trailing space, so use '>'.
+                    if (name.length() > maxLen) {
                        mode = 0;
                    }
                }
-                continue;
+                break;
            }

-            if (c == closeDelimiter) {
+            if (c == CLOSE_DELIM) {
+
+                int32_t len = name.length();
+
                // Delete trailing space, if any
-                if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
-                    --ibuf;
+                if (len > 0 &&
+                    name.charAt(len-1) == SPACE) {
+                    --len;
                }
-                buf[ibuf] = 0; // Add terminating zero
+
+                name.extract(0, len, cbuf, "");
+
                UErrorCode status = U_ZERO_ERROR;
-
-                UChar32 ch;
-
-                u_UCharsToChars(buf, cbuf, ibuf+1);
-                ch = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
+                c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
                if (U_SUCCESS(status)) {
                    // Lookup succeeded
+
+                    // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
+                    cursor++; // advance over CLOSE_DELIM
+
                    str.truncate(0);
-                    str.append(ch);
-                    text.handleReplaceBetween(openPos, cursor+1, str);
+                    str.append(c);
+                    text.handleReplaceBetween(openPos, cursor, str);

                    // Adjust indices for the change in the length of
                    // the string.  Do not assume that str.length() ==
                    // 1, in case of surrogates.
-                    int32_t delta = cursor + 1 - openPos - str.length();
+                    int32_t delta = cursor - openPos - str.length();
                    cursor -= delta;
                    limit -= delta;
                    // assert(cursor == openPos + str.length());
@ -158,18 +177,18 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
-                continue;
+                openPos = -1; // close off candidate
+                continue; // *** reprocess char32At(cursor)
            }
            
-            // Check if c =~ [-A-Za-z0-9<> ]
-            if (c == (UChar)0x002D ||
-                (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
-                (c >= (UChar)0x0061 && c <= (UChar)0x007A) ||
-                (c >= (UChar)0x0030 && c <= (UChar)0x0039) ||
-                c == (UChar)0x003C || c == (UChar)0x003E) {
-                buf[ibuf++] = (char) c;
-                // If we go a bit past the longest possible name then abort
-                if (ibuf == (LONGEST_NAME + 4)) {
+            // Check if c is a legal char.  We assume here that
+            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
+            // name, we don't have to go back to openPos+1.
+            if (legal.contains(c)) {
+                name.append(c);
+                // If we go past the longest possible name then abort.
+                // maxLen includes temporary trailing space, so use '>='.
+                if (name.length() >= maxLen) {
                    mode = 0;
                }
            }
@ -182,13 +201,17 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos

            break;
        }
+
+        cursor += UTF_CHAR_LENGTH(c);
    }
        
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
-    // open delimiter, if we are in mode 1.
-    offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
+    // open delimiter candidate.
+    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
+
+    uprv_free(cbuf);
 }

 U_NAMESPACE_END
--- a/icu4c/source/i18n/name2uni.h
+++ b/icu4c/source/i18n/name2uni.h
@ -16,14 +16,12 @@ U_NAMESPACE_BEGIN

 /**
 * A transliterator that performs name to character mapping.
+ * It recognizes the Perl syntax \N{name}.
 * @author Alan Liu
 * @draft ICU 2.0
 */
 class U_I18N_API NameUnicodeTransliterator : public Transliterator {

-    UChar32 openDelimiter;
-    UChar32 closeDelimiter;
-
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
@ -34,17 +32,6 @@ class U_I18N_API NameUnicodeTransliterator : public Transliterator {

    /**
     * Constructs a transliterator.
-     * @param openDelimiter    the open delimiter character.
-     * @param closeDelimiter   the close delimiter character.
-     * @param adoptedFilter    the filter for this transliterator.
-     * @draft ICU 2.0
-     */
-    NameUnicodeTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
-                              UnicodeFilter* adoptedFilter = 0);
-
-    /**
-     * Constructs a transliterator with the default delimiters '{' and
-     * '}'.
     * @param adoptedFilter    the filter for this transliterator.
     * @draft ICU 2.0
     */
--- a/icu4c/source/i18n/uni2name.cpp
+++ b/icu4c/source/i18n/uni2name.cpp
@ -12,6 +12,8 @@
 #include "unicode/uchar.h"
 #include "uni2name.h"
 #include "cstring.h"
+#include "cmemory.h"
+#include "uprops.h"

 U_NAMESPACE_BEGIN

@ -19,25 +21,15 @@ const char UnicodeNameTransliterator::fgClassID=0;

 const char UnicodeNameTransliterator::_ID[] = "Any-Name";

+static const UChar OPEN_DELIM[] = {92,78,123,0}; // "\N{"
+static const UChar CLOSE_DELIM  = 125; // "}"
+#define OPEN_DELIM_LEN 3
+
 /**
 * Constructs a transliterator.
 */
-UnicodeNameTransliterator::UnicodeNameTransliterator(
-                                 UChar32 openDelim, UChar32 closeDelim,
-                                 UnicodeFilter* adoptedFilter) :
-    Transliterator(_ID, adoptedFilter),
-    openDelimiter(openDelim),
-    closeDelimiter(closeDelim) {
-}
-
-/**
- * Constructs a transliterator with the default delimiters '{' and
- * '}'.
- */
 UnicodeNameTransliterator::UnicodeNameTransliterator(UnicodeFilter* adoptedFilter) :
-    Transliterator(_ID, adoptedFilter),
-    openDelimiter((UChar) 0x007B /*{*/),
-    closeDelimiter((UChar) 0x007D /*}*/) {
+    Transliterator(_ID, adoptedFilter) {
 }

 /**
@ -49,9 +41,7 @@ UnicodeNameTransliterator::~UnicodeNameTransliterator() {}
 * Copy constructor.
 */
 UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliterator& o) :
-    Transliterator(o),
-    openDelimiter(o.openDelimiter),
-    closeDelimiter(o.closeDelimiter) {}
+    Transliterator(o) {}

 /**
 * Assignment operator.
@ -59,8 +49,6 @@ UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliter
 UnicodeNameTransliterator& UnicodeNameTransliterator::operator=(
                             const UnicodeNameTransliterator& o) {
    Transliterator::operator=(o);
-    openDelimiter = o.openDelimiter;
-    closeDelimiter = o.closeDelimiter;
    return *this;
 }

@ -78,15 +66,27 @@ Transliterator* UnicodeNameTransliterator::clone(void) const {
 */
 void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool /*isIncremental*/) const {
-    // As of Unicode 3.0.0, the longest name is 83 characters long.
-    // Adjust this buffer size as needed.
+    // The failure mode, here and below, is to behave like Any-Null,
+    // if either there is no name data (max len == 0) or there is no
+    // memory (malloc() => NULL).

-    char buf[128];
+    int32_t maxLen = uprv_getMaxCharNameLength();
+    if (maxLen == 0) {
+        offsets.start = offsets.limit;
+        return;
+    }
+
+    // Accommodate the longest possible name plus padding
+    char* buf = (char*) uprv_malloc(maxLen);
+    if (buf == NULL) {
+        offsets.start = offsets.limit;
+        return;
+    }
    
    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

-    UnicodeString str(openDelimiter);
+    UnicodeString str(FALSE, OPEN_DELIM, OPEN_DELIM_LEN);
    UErrorCode status;
    int32_t len;

@ -94,11 +94,11 @@ void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPos
        UChar32 c = text.char32At(cursor);
        int32_t clen = UTF_CHAR_LENGTH(c);
        status = U_ZERO_ERROR;
-        if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, sizeof(buf), &status)) >0 && !U_FAILURE(status)) {
-            str.truncate(1);
-            str.append(UnicodeString(buf, len, "")).append(closeDelimiter);
+        if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, maxLen, &status)) >0 && !U_FAILURE(status)) {
+            str.truncate(OPEN_DELIM_LEN);
+            str.append(UnicodeString(buf, len, "")).append(CLOSE_DELIM);
            text.handleReplaceBetween(cursor, cursor+clen, str);
-            len += 2; // adjust for delimiters
+            len += OPEN_DELIM_LEN + 1; // adjust for delimiters
            cursor += len; // advance cursor and adjust for new text
            limit += len-clen; // change in length
        } else {
@ -109,6 +109,8 @@ void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPos
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    offsets.start = cursor;
+
+    uprv_free(buf);
 }

 U_NAMESPACE_END
--- a/icu4c/source/i18n/uni2name.h
+++ b/icu4c/source/i18n/uni2name.h
@ -16,27 +16,15 @@ U_NAMESPACE_BEGIN

 /**
 * A transliterator that performs character to name mapping.
+ * It generates the Perl syntax \N{name}.
 * @author Alan Liu
 */
 class U_I18N_API UnicodeNameTransliterator : public Transliterator {

-    UChar32 openDelimiter;
-    UChar32 closeDelimiter;
-
 public:

    /**
     * Constructs a transliterator.
-     * @param openDelimiter  the open delimiter character.
-     * @param closeDelimiter the close delimiter character.
-     * @param adoptedFilter  the filter to be adopted.
-     */
-    UnicodeNameTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
-                                UnicodeFilter* adoptedFilter = 0);
-
-    /**
-     * Constructs a transliterator with the default delimiters '{' and
-     * '}'.
     * @param adoptedFilter the filter to be adopted.
     */
    UnicodeNameTransliterator(UnicodeFilter* adoptedFilter = 0);
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -1180,13 +1180,28 @@ void TransliteratorTest::TestNameMap(void) {
        return;
    }

+    // Careful:  CharsToUnicodeString will convert "\\N" => "N"; use "\\\\N" for \N
    expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
-           CharsToUnicodeString("{NO-BREAK SPACE}abc{CJK UNIFIED IDEOGRAPH-4E01}{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{END OF TRANSMISSION}{CHARACTER TABULATION}{<control-0081>}{<noncharacter-FFFF>}"));
-    expect(*name2uni, "{ NO-BREAK SPACE}abc{  CJK UNIFIED  IDEOGRAPH-4E01  }{x{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{END OF TRANSMISSION}{CHARACTER TABULATION}{<control-0081>}{<noncharacter-FFFF>}{<control-0004>}{",
-           CharsToUnicodeString("\\u00A0abc\\u4E01{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004{"));
+           CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{END OF TRANSMISSION}\\\\N{CHARACTER TABULATION}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
+    expect(*name2uni, "{\\N { NO-BREAK SPACE}abc\\N{  CJK UNIFIED  IDEOGRAPH-4E01  }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{END OF TRANSMISSION}\\N{CHARACTER TABULATION}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{",
+           CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));

    delete uni2name;
    delete name2uni;
+
+    // round trip
+    Transliterator* t =
+        Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
+    if (t==0) {
+        errln("FAIL: createInstance returned NULL");
+        delete t;
+        return;
+    }
+
+    // Careful:  CharsToUnicodeString will convert "\\N" => "N"; use "\\\\N" for \N
+    UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
+    expect(*t, s, s);
+    delete t;
 }

 /**
@ -1678,7 +1693,7 @@ void TransliteratorTest::TestSupplemental() {

    expectT("Any-Name",
           CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
-           "{GOTHIC LETTER AHSA}{TAG LATIN SMALL LETTER A}{NO-BREAK SPACE}");
+           "\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}");

    expectT("Any-Hex/Unicode",
           CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
@ -3541,12 +3556,13 @@ void TransliteratorTest::TestUserFunction() {
    _TUFReg("Any-gif", t, 0);

    t = Transliterator::createFromRules("RemoveCurly",
-                                        "[\\{\\}] > ;",
+                                        "[\\{\\}] > ; '\\N' > ;",
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
        goto FAIL;
    }
+    expect(*t, "\\N{name}", "name");
    _TUFReg("Any-RemoveCurly", t, 1);

    logln("Trying &hex");
@ -3588,7 +3604,7 @@ void TransliteratorTest::TestUserFunction() {

    // Test that filters are allowed after &
    t = Transliterator::createFromRules("test",
-                                        "(.) > &Hex($1) ' ' &[\\{\\}]Remove(&Name($1)) ' ';",
+                                        "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));