ICU-3383 add utility to 'autoquote' unquoted apostrophes in message format

X-SVN-Rev: 17720
2025-04-14 17:24:01 +00:00 · 2005-05-27 22:07:16 +00:00 · 2005-05-27 22:07:16 +00:00 · 70537cf0ca
commit 70537cf0ca
parent dc7aed1d04
6 changed files with 207 additions and 1 deletions
--- a/icu4c/source/i18n/msgfmt.cpp
+++ b/icu4c/source/i18n/msgfmt.cpp
@ -31,6 +31,7 @@
 #include "unicode/ustring.h"
 #include "unicode/ucnv_err.h"
 #include "unicode/uchar.h"
+#include "unicode/umsg.h"
 #include "unicode/rbnf.h"
 #include "ustrfmt.h"
 #include "cmemory.h"
@ -1169,6 +1170,29 @@ MessageFormat::parseObject( const UnicodeString& source,
        result.adoptArray(tmpResult, cnt);
 }
  
+// Static wrapper around umsg_autoQuoteApostrophe(); the returned string is
+// bogus whenever status ends up as a failure.
+UnicodeString
+MessageFormat::autoQuoteApostrophe(const UnicodeString& pattern, UErrorCode& status) {
+  UnicodeString result;
+  if (U_SUCCESS(status)) {
+    int32_t plen = pattern.length();
+    const UChar* pat = pattern.getBuffer();
+    int32_t blen = plen * 2 + 1; // space for null termination, convenience
+    UChar* buf = result.getBuffer(blen);
+    if (buf == NULL) {
+      status = U_MEMORY_ALLOCATION_ERROR;
+    } else {
+      int32_t len = umsg_autoQuoteApostrophe(pat, plen, buf, blen, &status);
+      result.releaseBuffer(U_SUCCESS(status) ? len : 0); // never leave the buffer open
+    }
+  }
+  if (U_FAILURE(status)) {
+    result.setToBogus();
+  }
+  return result;
+}
+
 // -------------------------------------

 static Format* makeRBNF(URBNFRuleSetTag tag, const Locale& locale, const UnicodeString& defaultRuleSet, UErrorCode& ec) {
--- a/icu4c/source/i18n/umsg.cpp
+++ b/icu4c/source/i18n/umsg.cpp
@ -30,6 +30,7 @@
 #include "unicode/unistr.h"
 #include "cpputils.h"
 #include "uassert.h"
+#include "ustr_imp.h"

 U_NAMESPACE_USE

@ -601,5 +602,106 @@ umsg_vparse(const UMessageFormat *fmt,
    delete [] args;
 }

+#define SINGLE_QUOTE      ((UChar)0x0027)
+#define CURLY_BRACE_LEFT  ((UChar)0x007B)
+#define CURLY_BRACE_RIGHT ((UChar)0x007D)
+
+#define STATE_INITIAL 0
+#define STATE_SINGLE_QUOTE 1
+#define STATE_IN_QUOTE 2
+#define STATE_MSG_ELEMENT 3
+
+#define MAppend(c) if (len < blen) buffer[len++] = c; else len++
+
+
+// Double every lone apostrophe in pattern so that only an apostrophe directly
+// before '{' or '}' keeps acting as a quote; see umsg.h for the full contract.
+int32_t umsg_autoQuoteApostrophe(const UChar* pattern, int32_t plen,
+                                 UChar* buffer, int32_t blen,
+                                 UErrorCode* ec)
+{
+  int32_t state = STATE_INITIAL;
+  int32_t braceCount = 0; // depth of nested '{' while in STATE_MSG_ELEMENT
+  int32_t len = 0;        // chars produced so far; MAppend lets it pass blen
+
+  if (ec == NULL || U_FAILURE(*ec)) {
+    return -1;
+  }
+  // A negative capacity is never valid; a NULL buffer is only valid with blen 0.
+  if (pattern == NULL || plen < -1 || blen < 0 || (buffer == NULL && blen > 0)) {
+    *ec = U_ILLEGAL_ARGUMENT_ERROR;
+    return -1;
+  }
+  // plen == -1 means the pattern is NUL-terminated.
+  if (plen == -1) {
+    plen = u_strlen(pattern);
+  }
+
+  for (int32_t i = 0; i < plen; ++i) {
+    UChar c = pattern[i];
+    switch (state) {
+    case STATE_INITIAL: // plain text outside quotes and message elements
+      switch (c) {
+      case SINGLE_QUOTE:
+        state = STATE_SINGLE_QUOTE;
+        break;
+      case CURLY_BRACE_LEFT:
+        state = STATE_MSG_ELEMENT;
+        ++braceCount;
+        break;
+      }
+      break;
+    // After an apostrophe: '' is a pair, '{ or '} starts a quote, else double it.
+    case STATE_SINGLE_QUOTE:
+      switch (c) {
+      case SINGLE_QUOTE:
+        state = STATE_INITIAL;
+        break;
+      case CURLY_BRACE_LEFT:
+      case CURLY_BRACE_RIGHT:
+        state = STATE_IN_QUOTE;
+        break;
+      default:
+        MAppend(SINGLE_QUOTE);
+        state = STATE_INITIAL;
+        break;
+      }
+      break;
+    // Quoted text: copy everything through the closing apostrophe.
+    case STATE_IN_QUOTE:
+      switch (c) {
+      case SINGLE_QUOTE:
+        state = STATE_INITIAL;
+        break;
+      }
+      break;
+    // Message element: apostrophes pass through; leave at the matching '}'.
+    case STATE_MSG_ELEMENT:
+      switch (c) {
+      case CURLY_BRACE_LEFT:
+        ++braceCount;
+        break;
+      case CURLY_BRACE_RIGHT:
+        if (--braceCount == 0) {
+          state = STATE_INITIAL;
+        }
+        break;
+      }
+      break;
+
+    default: // Never happens.
+      break;
+    }
+    // The current char is always copied; the states only insert extra quotes.
+    MAppend(c);
+  }
+
+  // End of scan: close an apostrophe or quote that was left open.
+  if (state == STATE_SINGLE_QUOTE || state == STATE_IN_QUOTE) {
+    MAppend(SINGLE_QUOTE);
+  }
+
+  return u_terminateUChars(buffer, blen, len, ec);
+}

 #endif /* #if !UCONFIG_NO_FORMATTING */
--- a/icu4c/source/i18n/unicode/msgfmt.h
+++ b/icu4c/source/i18n/unicode/msgfmt.h
@ -573,6 +573,26 @@ public:
                             Formattable& result,
                             ParsePosition& pos) const;

+    /**
+     * Convert an 'apostrophe-friendly' pattern into a standard
+     * pattern.  Standard patterns treat all apostrophes as
+     * quotes, which is problematic in some languages, e.g.
+     * French, where apostrophe is commonly used.  This utility
+     * assumes that only an unpaired apostrophe immediately before
+     * a brace is a true quote.  Other unpaired apostrophes are paired,
+     * and the resulting standard pattern string is returned.
+     *
+     * <p><b>Note</b> it is not guaranteed that the returned pattern
+     * is indeed a valid pattern.  The only effect is to convert
+     * between patterns having different quoting semantics.
+     * @param pattern the 'apostrophe-friendly' pattern to convert
+     * @param status in/out error code; nothing is converted if it is already a failure
+     * @return the standard equivalent of the original pattern, or a bogus string on failure
+     * @draft ICU 3.4
+     */
+    static UnicodeString autoQuoteApostrophe(const UnicodeString& pattern, 
+        UErrorCode& status);
+
    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
     * This method is to implement a simple version of RTTI, since not all
--- a/icu4c/source/i18n/unicode/umsg.h
+++ b/icu4c/source/i18n/unicode/umsg.h
@ -597,6 +597,36 @@ umsg_vparse(const UMessageFormat *fmt,
            UErrorCode     *status);


+    /**
+     * Convert an 'apostrophe-friendly' pattern into a standard
+     * pattern.  Standard patterns treat all apostrophes as
+     * quotes, which is problematic in some languages, e.g.
+     * French, where apostrophe is commonly used.  This utility
+     * assumes that only an unpaired apostrophe immediately before
+     * a brace is a true quote.  Other unpaired apostrophes are paired,
+     * and the resulting standard pattern string is returned.
+     *
+     * <p><b>Note</b> it is not guaranteed that the returned pattern
+     * is indeed a valid pattern.  The only effect is to convert
+     * between patterns having different quoting semantics.
+     *
+     * @param pattern the 'apostrophe-friendly' pattern to convert
+     * @param plen the length of pattern, or -1 if unknown and pattern is null-terminated
+     * @param buffer the buffer for the result, or NULL if preflight only
+     * @param blen the length of the buffer, or 0 if preflighting
+     * @param ec the error code
+     * @return the length of the resulting text, not including the trailing
+     *        null; the null is written only if buffer has room for it.
+     *        Returns -1 if ec is NULL or a failure, or an argument is illegal.
+     * @draft ICU 3.4
+     */
+U_DRAFT int32_t U_EXPORT2
+umsg_autoQuoteApostrophe(const UChar* pattern,
+                         int32_t plen,
+                         UChar* buffer,
+                         int32_t blen,
+                         UErrorCode* ec);
+
 #endif /* #if !UCONFIG_NO_FORMATTING */

 #endif
--- a/icu4c/source/test/intltest/tmsgfmt.cpp
+++ b/icu4c/source/test/intltest/tmsgfmt.cpp
@ -28,6 +28,7 @@
 #include "unicode/numfmt.h"
 #include "unicode/choicfmt.h"
 #include "unicode/gregocal.h"
+#include <stdio.h>

 void
 TestMessageFormat::runIndexedTest(int32_t index, UBool exec,
@ -53,6 +54,7 @@ TestMessageFormat::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(17,TestUnlimitedArgsAndSubformats);
        TESTCASE(18,TestRBNF);
        TESTCASE(19,TestTurkishCasing);
+        TESTCASE(20,testAutoQuoteApostrophe);
        default: name = ""; break;
    }
 }
@ -193,7 +195,6 @@ void TestMessageFormat::testBug2()
 #include "unicode/datefmt.h"
 #include <stdlib.h>
 #include <stdio.h>
-#include <string.h>

 IntlTest&
 operator<<( IntlTest&           stream,
@ -1211,4 +1212,32 @@ void TestMessageFormat::TestRBNF(void) {
    delete numFmt;
 }

+void TestMessageFormat::testAutoQuoteApostrophe(void) {
+    const char* patterns[] = { // pattern, expected pattern
+        "'", "''",
+        "''", "''",
+        "'{", "'{'",
+        "' {", "'' {",
+        "'a", "''a",
+        "'{'a", "'{'a",
+        "'{a'", "'{a'",
+        "'{}", "'{}'",
+        "{'", "{'",
+        "{'a", "{'a",
+        "{'a{}'a}'a", "{'a{}'a}''a",
+    };
+    int32_t pattern_count = sizeof(patterns)/sizeof(patterns[0]);
+    // Each input must auto-quote to its expected pattern; entries are (input, expected) pairs.
+    for (int i = 0; i < pattern_count; i += 2) {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString result = MessageFormat::autoQuoteApostrophe(patterns[i], status);
+        UnicodeString target(patterns[i+1]);
+        if (target != result) {
+            char buf[128]; // C strings only: a UnicodeString must not be passed through "%s"
+            sprintf(buf, "[%2d] \"%s\" : \"%s\" != \"", i/2, patterns[i], patterns[i+1]);
+            errln(UnicodeString(buf).append(result).append(UnicodeString("\"\n")));
+        }
+    }
+}
+
 #endif /* #if !UCONFIG_NO_FORMATTING */
--- a/icu4c/source/test/intltest/tmsgfmt.h
+++ b/icu4c/source/test/intltest/tmsgfmt.h
@ -86,6 +86,7 @@ public:
    void testParse(void);
    void testAdopt(void);
    void TestTurkishCasing(void);
+    void testAutoQuoteApostrophe(void);

 private:
 };