ICU-9538 add script metadata properties API

X-SVN-Rev: 33255
2025-04-07 06:25:30 +00:00 · 2013-02-17 00:49:18 +00:00 · 2013-02-17 00:49:18 +00:00 · ebbc5423ef
commit ebbc5423ef
parent b1309f6306
10 changed files with 523 additions and 8 deletions
--- a/icu4c/source/common/Makefile.in
+++ b/icu4c/source/common/Makefile.in
@ -1,6 +1,6 @@
 #******************************************************************************
 #
-#   Copyright (C) 1999-2012, International Business Machines
+#   Copyright (C) 1999-2013, International Business Machines
 #   Corporation and others.  All Rights Reserved.
 #
 #******************************************************************************
@ -97,7 +97,7 @@ unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase
 normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \
 chariter.o schriter.o uchriter.o uiter.o \
 patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
-uscript.o usc_impl.o unames.o \
+uscript.o uscript_props.o usc_impl.o unames.o \
 utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
 uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o \
 rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
--- a/icu4c/source/common/common.vcxproj
+++ b/icu4c/source/common/common.vcxproj
@ -389,6 +389,7 @@
    <ClCompile Include="uprops.cpp" />
    <ClCompile Include="usc_impl.c" />
    <ClCompile Include="uscript.c" />
+    <ClCompile Include="uscript_props.cpp" />
    <ClCompile Include="uset.cpp" />
    <ClCompile Include="uset_props.cpp" />
    <ClCompile Include="usetiter.cpp" />
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@ -406,6 +406,9 @@
    <ClCompile Include="uscript.c">
      <Filter>properties &amp; sets</Filter>
    </ClCompile>
+    <ClCompile Include="uscript_props.cpp">
+      <Filter>properties &amp; sets</Filter>
+    </ClCompile>
    <ClCompile Include="uset.cpp">
      <Filter>properties &amp; sets</Filter>
    </ClCompile>
--- a/icu4c/source/common/unicode/uscript.h
+++ b/icu4c/source/common/unicode/uscript.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
- *   Copyright (C) 1997-2012, International Business Machines
+ *   Copyright (C) 1997-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
@ -512,4 +512,111 @@ uscript_getScriptExtensions(UChar32 c,
                            UErrorCode *errorCode);
 #endif  /* U_HIDE_DRAFT_API */

+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Script usage constants.
+ * See UAX #31 Unicode Identifier and Pattern Syntax.
+ * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers
+ *
+ * @draft ICU 51
+ */
+typedef enum UScriptUsage {  /* values 0..5 in declaration order; uscript_getUsage() casts its stored usage field to this type, so keep the order */
+    /** Not encoded in Unicode. @draft ICU 51 */
+    USCRIPT_USAGE_NOT_ENCODED,
+    /** Unknown script usage. @draft ICU 51 */
+    USCRIPT_USAGE_UNKNOWN,
+    /** Candidate for Exclusion from Identifiers. @draft ICU 51 */
+    USCRIPT_USAGE_EXCLUDED,
+    /** Limited Use script. @draft ICU 51 */
+    USCRIPT_USAGE_LIMITED_USE,
+    /** Aspirational Use script. @draft ICU 51 */
+    USCRIPT_USAGE_ASPIRATIONAL,
+    /** Recommended script. @draft ICU 51 */
+    USCRIPT_USAGE_RECOMMENDED
+} UScriptUsage;
+
+/**
+ * Writes the script sample character string.
+ * This string normally consists of one code point but might be longer.
+ * The string is empty if the script is not encoded.
+ *
+ * @param script script code
+ * @param dest output string array
+ * @param capacity number of UChars in the dest array
+ * @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input
+ * @return the string length, even if U_BUFFER_OVERFLOW_ERROR
+ * @draft ICU 51
+ */
+U_DRAFT int32_t U_EXPORT2
+uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+class UnicodeString;
+U_NAMESPACE_END
+
+/**
+ * Returns the script sample character string.
+ * This string normally consists of one code point but might be longer.
+ * The string is empty if the script is not encoded.
+ *
+ * @param script script code
+ * @return the sample character string
+ * @draft ICU 51
+ */
+U_COMMON_API icu::UnicodeString U_EXPORT2
+uscript_getSampleUnicodeString(UScriptCode script);
+
+#endif
+
+/**
+ * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
+ * Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode.
+ *
+ * @param script script code
+ * @return script usage
+ * @see UScriptUsage
+ * @draft ICU 51
+ */
+U_DRAFT UScriptUsage U_EXPORT2
+uscript_getUsage(UScriptCode script);
+
+/**
+ * Returns TRUE if the script is written right-to-left.
+ * For example, Arab and Hebr.
+ *
+ * @param script script code
+ * @return TRUE if the script is right-to-left
+ * @draft ICU 51
+ */
+U_DRAFT UBool U_EXPORT2
+uscript_isRightToLeft(UScriptCode script);
+
+/**
+ * Returns TRUE if the script allows line breaks between letters (excluding hyphenation).
+ * Such a script typically requires dictionary-based line breaking.
+ * For example, Hani and Thai.
+ *
+ * @param script script code
+ * @return TRUE if the script allows line breaks between letters
+ * @draft ICU 51
+ */
+U_DRAFT UBool U_EXPORT2
+uscript_breaksBetweenLetters(UScriptCode script);
+
+/**
+ * Returns TRUE if case distinctions are customary in modern (or most recent) usage of the script.
+ * For example, Latn and Cyrl.
+ *
+ * @param script script code
+ * @return TRUE if the script is cased
+ * @draft ICU 51
+ */
+U_DRAFT UBool U_EXPORT2
+uscript_isCased(UScriptCode script);
+
+#endif  /* U_HIDE_DRAFT_API */
+
 #endif
--- a/icu4c/source/common/uscript_props.cpp
+++ b/icu4c/source/common/uscript_props.cpp
@ -0,0 +1,267 @@
+/*
+*******************************************************************************
+*   Copyright (C) 2013, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*******************************************************************************
+*   file name:  uscript_props.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2013feb16
+*   created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/unistr.h"
+#include "unicode/uscript.h"
+#include "unicode/utf16.h"
+#include "ustr_imp.h"
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+namespace {
+
+// Script metadata (script properties).
+// See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt
+
+// 0 = NOT_ENCODED, no sample character, default false script properties.
+// Bits 20.. 0: sample character
+
+// Bits 23..21: usage
+const int32_t UNKNOWN = 1 << 21;  // usage field 1, decoded as USCRIPT_USAGE_UNKNOWN
+const int32_t EXCLUSION = 2 << 21;  // usage field 2, decoded as USCRIPT_USAGE_EXCLUDED
+const int32_t LIMITED_USE = 3 << 21;  // usage field 3, decoded as USCRIPT_USAGE_LIMITED_USE
+const int32_t ASPIRATIONAL = 4 << 21;  // usage field 4, decoded as USCRIPT_USAGE_ASPIRATIONAL
+const int32_t RECOMMENDED = 5 << 21;  // usage field 5, decoded as USCRIPT_USAGE_RECOMMENDED
+
+// Bits 31..24: Single-bit flags
+const int32_t RTL = 1 << 24;  // tested by uscript_isRightToLeft()
+const int32_t LB_LETTERS = 1 << 25;  // tested by uscript_breaksBetweenLetters()
+const int32_t CASED = 1 << 26;  // tested by uscript_isCased()
+
+const int32_t SCRIPT_PROPS[] = {  // one packed word per script, indexed by UScriptCode value; 0 = not encoded
+    // Begin copy-paste output from
+    // icu/tools/trunk/unicode/py/parsescriptmetadata.py
+    0x0040 | UNKNOWN,  // Zyyy
+    0x0308 | UNKNOWN,  // Zinh
+    0x0628 | RECOMMENDED | RTL,  // Arab
+    0x0531 | RECOMMENDED | CASED,  // Armn
+    0x0995 | RECOMMENDED,  // Beng
+    0x3105 | RECOMMENDED | LB_LETTERS,  // Bopo
+    0x13C4 | LIMITED_USE,  // Cher
+    0x03E2 | EXCLUSION | CASED,  // Copt
+    0x042F | RECOMMENDED | CASED,  // Cyrl
+    0x10414 | EXCLUSION | CASED,  // Dsrt
+    0x0905 | RECOMMENDED,  // Deva
+    0x12A0 | RECOMMENDED,  // Ethi
+    0x10D3 | RECOMMENDED,  // Geor
+    0x10330 | EXCLUSION,  // Goth
+    0x03A9 | RECOMMENDED | CASED,  // Grek
+    0x0A95 | RECOMMENDED,  // Gujr
+    0x0A15 | RECOMMENDED,  // Guru
+    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hani
+    0xAC00 | RECOMMENDED,  // Hang
+    0x05D0 | RECOMMENDED | RTL,  // Hebr
+    0x304B | RECOMMENDED | LB_LETTERS,  // Hira
+    0x0C95 | RECOMMENDED,  // Knda
+    0x30AB | RECOMMENDED | LB_LETTERS,  // Kana
+    0x1780 | RECOMMENDED | LB_LETTERS,  // Khmr
+    0x0EA5 | RECOMMENDED | LB_LETTERS,  // Laoo
+    0x004C | RECOMMENDED | CASED,  // Latn
+    0x0D15 | RECOMMENDED,  // Mlym
+    0x1826 | ASPIRATIONAL,  // Mong
+    0x1000 | RECOMMENDED | LB_LETTERS,  // Mymr
+    0x168F | EXCLUSION,  // Ogam
+    0x10300 | EXCLUSION,  // Ital
+    0x0B15 | RECOMMENDED,  // Orya
+    0x16A0 | EXCLUSION,  // Runr
+    0x0D85 | RECOMMENDED,  // Sinh
+    0x0710 | LIMITED_USE | RTL,  // Syrc
+    0x0B95 | RECOMMENDED,  // Taml
+    0x0C15 | RECOMMENDED,  // Telu
+    0x078C | RECOMMENDED | RTL,  // Thaa
+    0x0E17 | RECOMMENDED | LB_LETTERS,  // Thai
+    0x0F40 | RECOMMENDED,  // Tibt
+    0x14C0 | ASPIRATIONAL,  // Cans
+    0xA288 | ASPIRATIONAL | LB_LETTERS,  // Yiii
+    0x1703 | EXCLUSION,  // Tglg
+    0x1723 | EXCLUSION,  // Hano
+    0x1743 | EXCLUSION,  // Buhd
+    0x1763 | EXCLUSION,  // Tagb
+    0x2800 | UNKNOWN,  // Brai
+    0x10800 | EXCLUSION | RTL,  // Cprt
+    0x1900 | LIMITED_USE,  // Limb
+    0x10000 | EXCLUSION,  // Linb
+    0x10480 | EXCLUSION,  // Osma
+    0x10450 | EXCLUSION,  // Shaw
+    0x1950 | LIMITED_USE | LB_LETTERS,  // Tale
+    0x10380 | EXCLUSION,  // Ugar
+    0,
+    0x1A00 | EXCLUSION,  // Bugi
+    0x2C00 | EXCLUSION | CASED,  // Glag
+    0x10A00 | EXCLUSION | RTL,  // Khar
+    0xA800 | LIMITED_USE,  // Sylo
+    0x1980 | LIMITED_USE | LB_LETTERS,  // Talu
+    0x2D30 | ASPIRATIONAL,  // Tfng
+    0x103A0 | EXCLUSION,  // Xpeo
+    0x1B05 | LIMITED_USE | LB_LETTERS,  // Bali
+    0x1BC0 | LIMITED_USE,  // Batk
+    0,
+    0x11005 | EXCLUSION,  // Brah
+    0xAA00 | LIMITED_USE,  // Cham
+    0,
+    0,
+    0,
+    0,
+    0x13153 | EXCLUSION,  // Egyp
+    0,
+    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hans
+    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hant
+    0,
+    0,
+    0,
+    0xA984 | LIMITED_USE | LB_LETTERS,  // Java
+    0xA90A | LIMITED_USE,  // Kali
+    0,
+    0,
+    0x1C00 | LIMITED_USE,  // Lepc
+    0,
+    0x0840 | LIMITED_USE | RTL,  // Mand
+    0,
+    0x10980 | EXCLUSION | RTL,  // Mero
+    0x07CA | LIMITED_USE | RTL,  // Nkoo
+    0x10C00 | EXCLUSION | RTL,  // Orkh
+    0,
+    0xA840 | EXCLUSION,  // Phag
+    0x10900 | EXCLUSION | RTL,  // Phnx
+    0x16F00 | ASPIRATIONAL,  // Plrd
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0xA549 | LIMITED_USE,  // Vaii
+    0,
+    0x12000 | EXCLUSION,  // Xsux
+    0,
+    0xFFFF | UNKNOWN,  // Zzzz
+    0x102A0 | EXCLUSION,  // Cari
+    0x304B | RECOMMENDED | LB_LETTERS,  // Jpan
+    0x1A20 | LIMITED_USE | LB_LETTERS,  // Lana
+    0x10280 | EXCLUSION,  // Lyci
+    0x10920 | EXCLUSION | RTL,  // Lydi
+    0x1C5A | LIMITED_USE,  // Olck
+    0xA930 | EXCLUSION,  // Rjng
+    0xA882 | LIMITED_USE,  // Saur
+    0,
+    0x1B83 | LIMITED_USE,  // Sund
+    0,
+    0xABC0 | LIMITED_USE,  // Mtei
+    0x10840 | EXCLUSION | RTL,  // Armi
+    0x10B00 | EXCLUSION | RTL,  // Avst
+    0x11103 | LIMITED_USE,  // Cakm
+    0xAC00 | RECOMMENDED,  // Kore
+    0x11083 | EXCLUSION,  // Kthi
+    0,
+    0x10B60 | EXCLUSION | RTL,  // Phli
+    0,
+    0,
+    0x10B40 | EXCLUSION | RTL,  // Prti
+    0x0800 | EXCLUSION | RTL,  // Samr
+    0xAA80 | LIMITED_USE | LB_LETTERS,  // Tavt
+    0,
+    0,
+    0xA6A0 | LIMITED_USE,  // Bamu
+    0xA4D0 | LIMITED_USE,  // Lisu
+    0,
+    0x10A60 | EXCLUSION | RTL,  // Sarb
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0x109A0 | EXCLUSION | RTL,  // Merc
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0x11183 | EXCLUSION,  // Shrd
+    0x110D0 | EXCLUSION,  // Sora
+    0x11680 | EXCLUSION,  // Takr
+    0,
+    0,
+    0,
+    0,
+    0,
+    // End copy-paste from parsescriptmetadata.py
+};
+
+int32_t getScriptProps(UScriptCode script) {
+    // Codes that are negative or past the end of the table yield 0, i.e. "not encoded".
+    if (script < 0 || LENGTHOF(SCRIPT_PROPS) <= script) {
+        return 0;
+    }
+    return SCRIPT_PROPS[script];
+}
+
+}  // namespace
+
+U_CAPI int32_t U_EXPORT2
+uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    // Reject a negative capacity, and a NULL buffer unless the capacity is 0.
+    if(capacity < 0 || (dest == NULL && capacity > 0)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    // Bits 20..0 of the properties word hold the sample code point; 0 means there is none.
+    const int32_t sample = getScriptProps(script) & 0x1fffff;
+    const int32_t length = (sample == 0) ? 0 : U16_LENGTH(sample);
+    if(0 < length && length <= capacity) {
+        int32_t destIndex = 0;
+        U16_APPEND_UNSAFE(dest, destIndex, sample);
+    }
+    // The shared helper takes care of termination and the resulting status; the full length is returned.
+    return u_terminateUChars(dest, capacity, length, pErrorCode);
+}
+
+U_COMMON_API icu::UnicodeString U_EXPORT2
+uscript_getSampleUnicodeString(UScriptCode script) {
+    const int32_t codePoint = getScriptProps(script) & 0x1fffff;  // bits 20..0; 0 = no sample
+    icu::UnicodeString result;
+    if(codePoint != 0) {
+        result.append(codePoint);
+    }
+    return result;
+}
+
+U_CAPI UScriptUsage U_EXPORT2
+uscript_getUsage(UScriptCode script) {
+    return static_cast<UScriptUsage>((getScriptProps(script) >> 21) & 7);  // usage field, bits 23..21
+}
+
+U_CAPI UBool U_EXPORT2
+uscript_isRightToLeft(UScriptCode script) {
+    return static_cast<UBool>(0 != (RTL & getScriptProps(script)));  // flag bit 24
+}
+
+U_CAPI UBool U_EXPORT2
+uscript_breaksBetweenLetters(UScriptCode script) {
+    return static_cast<UBool>(0 != (LB_LETTERS & getScriptProps(script)));  // flag bit 25
+}
+
+U_CAPI UBool U_EXPORT2
+uscript_isCased(UScriptCode script) {
+    return static_cast<UBool>(0 != (CASED & getScriptProps(script)));  // flag bit 26
+}
--- a/icu4c/source/test/cintltst/cucdapi.c
+++ b/icu4c/source/test/cintltst/cucdapi.c
@ -1,5 +1,5 @@
 /********************************************************************
- * Copyright (c) 1997-2012, International Business Machines
+ * Copyright (c) 1997-2013, International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************/

@ -527,6 +527,68 @@ void TestGetScriptExtensions() {
    }
 }

+void TestScriptMetadataAPI() {
+    /* API & code coverage for the uscript_ script metadata functions. More testing in intltest/ucdtest.cpp. */
+    UErrorCode errorCode=U_ZERO_ERROR;
+    UChar sample[8];
+
+    if(uscript_getSampleString(USCRIPT_LATIN, sample, LENGTHOF(sample), &errorCode)!=1 ||
+            U_FAILURE(errorCode) ||
+            uscript_getScript(sample[0], &errorCode)!=USCRIPT_LATIN ||
+            sample[1]!=0) {
+        log_err("uscript_getSampleString(Latn) failed - %s\n", u_errorName(errorCode));
+    }
+    sample[0]=0xfffe;  /* sentinel: a capacity-0 call must leave the buffer untouched */
+    if(uscript_getSampleString(USCRIPT_LATIN, sample, 0, &errorCode)!=1 ||
+            errorCode!=U_BUFFER_OVERFLOW_ERROR ||
+            sample[0]!=0xfffe) {
+        log_err("uscript_getSampleString(Latn, capacity=0) failed - %s\n", u_errorName(errorCode));
+    }
+    errorCode=U_ZERO_ERROR;  /* clear the overflow error expected above */
+    if(uscript_getSampleString(USCRIPT_INVALID_CODE, sample, LENGTHOF(sample), &errorCode)!=0 ||
+            U_FAILURE(errorCode) ||
+            sample[0]!=0) {
+        log_err("uscript_getSampleString(invalid) failed - %s\n", u_errorName(errorCode));
+    }
+    sample[0]=0xfffe;  /* sentinel again: an empty result with capacity 0 writes nothing */
+    if(uscript_getSampleString(USCRIPT_CODE_LIMIT, sample, 0, &errorCode)!=0 ||
+            errorCode!=U_STRING_NOT_TERMINATED_WARNING ||
+            sample[0]!=0xfffe) {
+        log_err("uscript_getSampleString(limit, capacity=0) failed - %s\n", u_errorName(errorCode));
+    }
+
+    if(uscript_getUsage(USCRIPT_LATIN)!=USCRIPT_USAGE_RECOMMENDED ||
+            uscript_getUsage(USCRIPT_YI)!=USCRIPT_USAGE_ASPIRATIONAL ||
+            uscript_getUsage(USCRIPT_CHEROKEE)!=USCRIPT_USAGE_LIMITED_USE ||
+            uscript_getUsage(USCRIPT_COPTIC)!=USCRIPT_USAGE_EXCLUDED ||
+            uscript_getUsage(USCRIPT_CIRTH)!=USCRIPT_USAGE_NOT_ENCODED ||
+            uscript_getUsage(USCRIPT_INVALID_CODE)!=USCRIPT_USAGE_NOT_ENCODED ||
+            uscript_getUsage(USCRIPT_CODE_LIMIT)!=USCRIPT_USAGE_NOT_ENCODED) {  /* limit value, as in the sample-string checks */
+        log_err("uscript_getUsage() failed\n");
+    }
+
+    if(uscript_isRightToLeft(USCRIPT_LATIN) ||
+            uscript_isRightToLeft(USCRIPT_CIRTH) ||
+            !uscript_isRightToLeft(USCRIPT_ARABIC) ||
+            !uscript_isRightToLeft(USCRIPT_HEBREW)) {
+        log_err("uscript_isRightToLeft() failed\n");
+    }
+
+    if(uscript_breaksBetweenLetters(USCRIPT_LATIN) ||
+            uscript_breaksBetweenLetters(USCRIPT_CIRTH) ||
+            !uscript_breaksBetweenLetters(USCRIPT_HAN) ||
+            !uscript_breaksBetweenLetters(USCRIPT_THAI)) {
+        log_err("uscript_breaksBetweenLetters() failed\n");
+    }
+
+    if(uscript_isCased(USCRIPT_CIRTH) ||
+            uscript_isCased(USCRIPT_HAN) ||
+            !uscript_isCased(USCRIPT_LATIN) ||
+            !uscript_isCased(USCRIPT_GREEK)) {
+        log_err("uscript_isCased() failed\n");
+    }
+}
+
 void TestBinaryValues() {
    /*
     * Unicode 5.1 explicitly defines binary property value aliases.
--- a/icu4c/source/test/cintltst/cucdapi.h
+++ b/icu4c/source/test/cintltst/cucdapi.h
@ -1,10 +1,11 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (c) 2003-2010, International Business Machines Corporation and
+ * Copyright (c) 2003-2013, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

 void TestUScriptCodeAPI(void);
 void TestHasScript(void);
 void TestGetScriptExtensions(void);
+void TestScriptMetadataAPI(void);
 void TestBinaryValues(void);
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (c) 1997-2012, International Business Machines Corporation and
+ * Copyright (c) 1997-2013, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /*******************************************************************************
@ -184,6 +184,7 @@ void addUnicodeTest(TestNode** root)
    addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
    addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
    addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
+    addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
    addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
    addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
    addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
--- a/icu4c/source/test/intltest/ucdtest.cpp
+++ b/icu4c/source/test/intltest/ucdtest.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 1997-2011, International Business Machines Corporation and
+ * Copyright (c) 1997-2013, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -8,6 +8,7 @@
 #include "unicode/uchar.h"
 #include "unicode/uniset.h"
 #include "unicode/putil.h"
+#include "unicode/uscript.h"
 #include "cstring.h"
 #include "hash.h"
 #include "patternprops.h"
@ -59,6 +60,7 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
    TESTCASE_AUTO(TestBinaryValues);
    TESTCASE_AUTO(TestConsistency);
    TESTCASE_AUTO(TestPatternProperties);
+    TESTCASE_AUTO(TestScriptMetadata);
    TESTCASE_AUTO_END;
 }

@ -426,3 +428,73 @@ UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
    }
    return same;
 }
+
+namespace {
+
+/**
+ * Maps a special script code to the most common script of its encoded characters.
+ */
+UScriptCode getCharScript(UScriptCode script) {
+    // Simplified and Traditional Han both map to plain Han.
+    if(script == USCRIPT_SIMPLIFIED_HAN || script == USCRIPT_TRADITIONAL_HAN) {
+        return USCRIPT_HAN;
+    }
+    if(script == USCRIPT_JAPANESE) {
+        return USCRIPT_HIRAGANA;
+    }
+    if(script == USCRIPT_KOREAN) {
+        return USCRIPT_HANGUL;
+    }
+    return script;  // every other code maps to itself
+}
+
+}  // namespace
+
+void UnicodeTest::TestScriptMetadata() {
+    IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
+    UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);  // RTL scripts remove their characters below
+    // So far, sample characters are uppercase.
+    // Georgian is special.
+    UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);  // cased scripts remove their characters below
+    for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
+        UScriptCode sc = (UScriptCode)sci;
+        // Run the test with -v to see which script has failures:
+        // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL
+        logln(uscript_getShortName(sc));
+        UScriptUsage usage = uscript_getUsage(sc);
+        UnicodeString sample = uscript_getSampleUnicodeString(sc);
+        UnicodeSet scriptSet;  // all code points whose Script property value is sc
+        scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
+        if(usage == USCRIPT_USAGE_NOT_ENCODED) {  // expect no sample, no flags, no characters
+            assertTrue("not encoded, no sample", sample.isEmpty());
+            assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
+            assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
+            assertFalse("not encoded, not cased", uscript_isCased(sc));
+            assertTrue("not encoded, no characters", scriptSet.isEmpty());
+        } else {
+            assertFalse("encoded, has a sample character", sample.isEmpty());
+            UChar32 firstChar = sample.char32At(0);
+            UScriptCode charScript = getCharScript(sc);  // special codes (Hans, Jpan, ...) are compared via a mapped script
+            assertEquals("script(sample(script))",
+                         charScript, uscript_getScript(firstChar, errorCode));
+            assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc));
+            assertEquals("cased vs. set", cased.contains(firstChar), uscript_isCased(sc));
+            assertEquals("encoded, has characters", sc == charScript, !scriptSet.isEmpty());  // mapped special codes expect an empty set
+            if(uscript_isRightToLeft(sc)) {
+                rtl.removeAll(scriptSet);  // must come after the contains() check above
+            }
+            if(uscript_isCased(sc)) {
+                cased.removeAll(scriptSet);  // must come after the contains() check above
+            }
+        }
+    }
+    UnicodeString pattern;  // anything still left in either set was not claimed by a flagged script
+    assertEquals("no remaining RTL characters",
+                 UnicodeString("[]"), rtl.toPattern(pattern));
+    assertEquals("no remaining cased characters",
+                 UnicodeString("[]"), cased.toPattern(pattern));
+
+    assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
+    assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
+    assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
+}
--- a/icu4c/source/test/intltest/ucdtest.h
+++ b/icu4c/source/test/intltest/ucdtest.h
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 1997-2011, International Business Machines Corporation and
+ * Copyright (c) 1997-2013, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -37,6 +37,7 @@ public:
    void TestBinaryValues();
    void TestConsistency();
    void TestPatternProperties();
+    void TestScriptMetadata();

 private: