ICU-7772 Fast string direction detection

X-SVN-Rev: 28665
2025-04-10 07:39:16 +00:00 · 2010-09-21 07:12:28 +00:00 · 2010-09-21 07:12:28 +00:00 · 12de1d68b7
commit 12de1d68b7
parent 52d02d3a55
4 changed files with 222 additions and 7 deletions
--- a/icu4c/source/common/ubidi.c
+++ b/icu4c/source/common/ubidi.c
@ -325,6 +325,34 @@ ubidi_getReorderingOptions(UBiDi *pBiDi) {
    }
 }

+U_CAPI UBiDiDirection U_EXPORT2
+ubidi_getBaseDirection(const UChar *text, int32_t length) {
+    /* Report the direction of the first character of type L, R or AL;
+       UBIDI_NEUTRAL if text is NULL, length < -1, or no such character exists. */
+    int32_t pos = 0;
+    UChar32 c;
+
+    if (text == NULL || length < -1) {
+        return UBIDI_NEUTRAL;
+    }
+    if (length == -1) {
+        length = u_strlen(text); /* -1 requests a zero-terminated scan */
+    }
+    while (pos < length) {
+        U16_NEXT(text, pos, length, c); /* U16_NEXT advances pos */
+        switch (u_charDirection(c)) {
+        case U_LEFT_TO_RIGHT:
+            return UBIDI_LTR;
+        case U_RIGHT_TO_LEFT:
+        case U_RIGHT_TO_LEFT_ARABIC:
+            return UBIDI_RTL;
+        default:
+            break; /* any other bidi class: keep scanning */
+        }
+    }
+    return UBIDI_NEUTRAL;
+}
+
 /* perform (P2)..(P3) ------------------------------------------------------- */

 /*
--- a/icu4c/source/common/unicode/ubidi.h
+++ b/icu4c/source/common/unicode/ubidi.h
@ -416,12 +416,44 @@ typedef uint8_t UBiDiLevel;
 * @stable ICU 2.0
 */
 enum UBiDiDirection {
-    /** All left-to-right text. This is a 0 value. @stable ICU 2.0 */
-    UBIDI_LTR,
-    /** All right-to-left text. This is a 1 value. @stable ICU 2.0 */
-    UBIDI_RTL,
-    /** Mixed-directional text. @stable ICU 2.0 */
-    UBIDI_MIXED
+  /** Left-to-right text. This is a 0 value.
+   * <ul>
+   * <li>As return value for <code>ubidi_getDirection()</code>, it means
+   *     that the source string contains no right-to-left characters, or
+   *     that the source string is empty and the paragraph level is even.
+   * <li> As return value for <code>ubidi_getBaseDirection()</code>, it
+   *      means that the first strong character of the source string has
+   *      a left-to-right direction.
+   * </ul>
+   * @stable ICU 2.0
+   */
+  UBIDI_LTR,
+  /** Right-to-left text. This is a 1 value.
+   * <ul>
+   * <li>As return value for <code>ubidi_getDirection()</code>, it means
+   *     that the source string contains no left-to-right characters, or
+   *     that the source string is empty and the paragraph level is odd.
+   * <li> As return value for <code>ubidi_getBaseDirection()</code>, it
+   *      means that the first strong character of the source string has
+   *      a right-to-left direction.
+   * </ul>
+   * @stable ICU 2.0
+   */
+  UBIDI_RTL,
+  /** Mixed-directional text.
+   * <p>As return value for <code>ubidi_getDirection()</code>, it means
+   *    that the source string contains both left-to-right and right-to-left
+   *    characters. <code>ubidi_getBaseDirection()</code> never returns it.
+   * @stable ICU 2.0
+   */
+  UBIDI_MIXED,
+  /** No strongly directional text.
+   * <p>As return value for <code>ubidi_getBaseDirection()</code>, it means
+   *    that the source string is NULL or empty, that its length is less than -1,
+   *    or that it contains neither left-to-right nor right-to-left characters.
+   * @draft ICU 4.6
+   */
+  UBIDI_NEUTRAL
 };

 /** @stable ICU 2.0 */
@ -1159,6 +1191,7 @@ ubidi_setLine(const UBiDi *pParaBiDi,
 *         that indicates if the entire text
 *         represented by this object is unidirectional,
 *         and which direction, or if it is mixed-directional.
+ * Note: the value <code>UBIDI_NEUTRAL</code> is never returned from this function.
 *
 * @see UBiDiDirection
 * @stable ICU 2.0
@ -1166,6 +1199,36 @@ ubidi_setLine(const UBiDi *pParaBiDi,
 U_STABLE UBiDiDirection U_EXPORT2
 ubidi_getDirection(const UBiDi *pBiDi);

+/**
+ * Gets the base direction of the text provided according
+ * to the Unicode Bidirectional Algorithm. The base direction
+ * is derived from the first character in the string with bidirectional
+ * character type L, R, or AL. If the first such character has type L,
+ * <code>UBIDI_LTR</code> is returned. If the first such character has
+ * type R or AL, <code>UBIDI_RTL</code> is returned. If the string does
+ * not contain any character of these types, then
+ * <code>UBIDI_NEUTRAL</code> is returned.
+ *
+ * This is a lightweight function for use when only the base direction
+ * is needed and no further bidi processing of the text is needed.
+ *
+ * @param text is a pointer to the text whose base
+ *             direction is needed; if it is <code>NULL</code>,
+ *             <code>UBIDI_NEUTRAL</code> is returned.
+ * Note: the text must be (at least) @c length long.
+ * @param length is the length of the text; if <code>length==-1</code>
+ *               then the text must be zero-terminated; if it is less
+ *               than -1, <code>UBIDI_NEUTRAL</code> is returned.
+ *
+ * @return  <code>UBIDI_LTR</code>, <code>UBIDI_RTL</code>, or
+ *          <code>UBIDI_NEUTRAL</code>; never <code>UBIDI_MIXED</code>.
+ *
+ * @see UBiDiDirection
+ * @draft ICU 4.6
+ */
+U_DRAFT UBiDiDirection U_EXPORT2
+ubidi_getBaseDirection(const UChar *text,  int32_t length );
+
 /**
 * Get the pointer to the text.
 *
@ -1398,7 +1461,8 @@ ubidi_countRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
 *
 * @return the directionality of the run,
 *         <code>UBIDI_LTR==0</code> or <code>UBIDI_RTL==1</code>,
- *         never <code>UBIDI_MIXED</code>.
+ *         never <code>UBIDI_MIXED</code>,
+ *         never <code>UBIDI_NEUTRAL</code>.
 *
 * @see ubidi_countRuns
 *
--- a/icu4c/source/common/unicode/urename.h
+++ b/icu4c/source/common/unicode/urename.h
@ -475,6 +475,7 @@
 #define ubidi_setReorderingOptions U_ICU_ENTRY_POINT_RENAME(ubidi_setReorderingOptions)
 #define ubidi_writeReordered U_ICU_ENTRY_POINT_RENAME(ubidi_writeReordered)
 #define ubidi_writeReverse U_ICU_ENTRY_POINT_RENAME(ubidi_writeReverse)
+#define ubidi_getBaseDirection U_ICU_ENTRY_POINT_RENAME(ubidi_getBaseDirection)
 #define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode)
 #define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close)
 #define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable)
--- a/icu4c/source/test/cintltst/cbiditst.c
+++ b/icu4c/source/test/cintltst/cbiditst.c
@ -72,6 +72,8 @@ static void testFailureRecovery(void);

 static void testMultipleParagraphs(void);

+static void testGetBaseDirection(void);
+
 /* new BIDI API */
 static void testReorderingMode(void);
 static void testReorderRunsOnly(void);
@ -122,6 +124,7 @@ addComplexTest(TestNode** root) {
    addTest(root, doTashkeelSpecialVLTRArabicShapingTest, "complex/arabic-shaping/tashkeel");
    addTest(root, doLOGICALArabicDeShapingTest, "complex/arabic-shaping/unshaping");
    addTest(root, doArabicShapingTestForBug5421, "complex/arabic-shaping/bug-5421");
+    addTest(root, testGetBaseDirection, "complex/bidi/testGetBaseDirection");
 }

 static void
@ -1132,6 +1135,125 @@ _testReordering(UBiDi *pBiDi, int testNumber) {
        return;     \
    }               \

+#define STRING_TEST_CASE(s) { (s), LENGTHOF(s) }
+
+/* Test ubidi_getBaseDirection(): first a table of strings passed with explicit
+   lengths, then NULL text, an invalid length and length==-1 (zero-terminated). */
+static void testGetBaseDirection(void) {
+    UBiDiDirection dir;
+    int i;
+
+/* Test Data */
+    static const UChar
+/*Mixed Start with L*/
+    stringMixedEnglishFirst[]={ 0x61, 0x627, 0x32, 0x6f3, 0x61, 0x34, 0 },
+/*Mixed Start with AL*/
+    stringMixedArabicFirst[]={ 0x661, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
+/*Mixed Start with R*/
+    stringMixedHebrewFirst[]={ 0x05EA, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
+/*All AL (Arabic, Persian)*/
+    stringPersian[]={0x0698, 0x067E, 0x0686, 0x06AF, 0},
+/*All R (Hebrew etc.)*/
+    stringHebrew[]={0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
+/*All L (English)*/
+    stringEnglish[]={0x71, 0x61, 0x66, 0},
+/*Mixed Start with weak AL and then L*/
+    stringStartWeakAL[]={ 0x0663, 0x71, 0x61, 0x66, 0},
+/*Mixed Start with weak L and then AL*/
+    stringStartWeakL[]={0x31, 0x0698, 0x067E, 0x0686, 0x06AF, 0},
+/*Empty*/
+    stringEmpty[]={0},
+/*Surrogate Char.*/
+    stringSurrogateChar[]={0xD800, 0xDC00, 0},
+/*Invalid UChar (explicit cast: the -1 is deliberately narrowed to UChar)*/
+    stringInvalidUchar[]={(UChar)-1},
+/*All weak L (English Digits)*/
+    stringAllEnglishDigits[]={0x31, 0x32, 0x33, 0},
+/*All weak AL (Arabic Digits)*/
+    stringAllArabicDigits[]={0x0663, 0x0664, 0x0665, 0},
+/*First L (English) others are R (Hebrew etc.) */
+    stringFirstL[] = {0x71, 0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
+/*Last R (Hebrew etc.) others are weak L (English Digits)*/
+    stringLastR[] = {0x31, 0x32, 0x33, 0x05F1, 0};
+
+    static const struct {
+        const UChar *s;
+        int32_t length;
+    } testCases[]={
+        STRING_TEST_CASE(stringMixedEnglishFirst),
+        STRING_TEST_CASE(stringMixedArabicFirst),
+        STRING_TEST_CASE(stringMixedHebrewFirst),
+        STRING_TEST_CASE(stringPersian),
+        STRING_TEST_CASE(stringHebrew),
+        STRING_TEST_CASE(stringEnglish),
+        STRING_TEST_CASE(stringStartWeakAL),
+        STRING_TEST_CASE(stringStartWeakL),
+        STRING_TEST_CASE(stringEmpty),
+        STRING_TEST_CASE(stringSurrogateChar),
+        STRING_TEST_CASE(stringInvalidUchar),
+        STRING_TEST_CASE(stringAllEnglishDigits),
+        STRING_TEST_CASE(stringAllArabicDigits),
+        STRING_TEST_CASE(stringFirstL),
+        STRING_TEST_CASE(stringLastR),
+    };
+
+/* Expected results, in the same order as testCases[] */
+    static const UBiDiDirection expectedDir[] ={
+        UBIDI_LTR, UBIDI_RTL, UBIDI_RTL,
+        UBIDI_RTL, UBIDI_RTL, UBIDI_LTR,
+        UBIDI_LTR, UBIDI_RTL, UBIDI_NEUTRAL,
+        UBIDI_LTR, UBIDI_NEUTRAL, UBIDI_NEUTRAL,
+        UBIDI_NEUTRAL, UBIDI_LTR, UBIDI_RTL
+    };
+
+    log_verbose("testGetBaseDirection() with %u test cases ---\n", LENGTHOF(testCases));
+/* Run Tests */
+    for(i=0; i<LENGTHOF(testCases); ++i) {
+        dir = ubidi_getBaseDirection(testCases[i].s, testCases[i].length );
+        log_verbose("Testing case %d\tReceived dir %d\n", i, dir);
+        if (dir != expectedDir[i])
+            log_err("\nFailed getBaseDirection case %d Expected  %d \tReceived %d\n",
+            i, expectedDir[i], dir);
+    }
+
+/* Misc. tests (the two literals in each log_err form ONE format string) */
+/* NULL string */
+    dir = ubidi_getBaseDirection(NULL, 3);
+    if (dir != UBIDI_NEUTRAL )
+        log_err("\nFailed getBaseDirection for NULL string "
+                "\nExpected  %d \nReceived %d", UBIDI_NEUTRAL, dir);
+/*All L- English string and length=-3 */
+    dir = ubidi_getBaseDirection( stringEnglish, -3);
+    if (dir != UBIDI_NEUTRAL )
+        log_err("\nFailed getBaseDirection for string w length= -3 "
+                "\nExpected  %d \nReceived %d", UBIDI_NEUTRAL, dir);
+/*All L- English string and length=-1 */
+    dir = ubidi_getBaseDirection( stringEnglish, -1);
+    if (dir != UBIDI_LTR )
+        log_err("\nFailed getBaseDirection for English string w length= -1 "
+                "\nExpected  %d \nReceived %d", UBIDI_LTR, dir);
+/*All AL- Persian string and length=-1 */
+    dir = ubidi_getBaseDirection( stringPersian, -1);
+    if (dir != UBIDI_RTL )
+        log_err("\nFailed getBaseDirection for Persian string w length= -1 "
+                "\nExpected  %d \nReceived %d", UBIDI_RTL, dir);
+/*All R- Hebrew string and length=-1 */
+    dir = ubidi_getBaseDirection( stringHebrew, -1);
+    if (dir != UBIDI_RTL )
+        log_err("\nFailed getBaseDirection for Hebrew string w length= -1 "
+                "\nExpected  %d \nReceived %d", UBIDI_RTL, dir);
+/*All weak L- English digits string and length=-1 */
+    dir = ubidi_getBaseDirection(stringAllEnglishDigits, -1);
+    if (dir != UBIDI_NEUTRAL )
+        log_err("\nFailed getBaseDirection for English digits string w length= -1 "
+                "\nExpected  %d \nReceived %d", UBIDI_NEUTRAL, dir);
+/*All weak AL- Arabic digits string and length=-1 */
+    dir = ubidi_getBaseDirection(stringAllArabicDigits, -1);
+    if (dir != UBIDI_NEUTRAL )
+        log_err("\nFailed getBaseDirection for Arabic string w length= -1 "
+                "\nExpected  %d \nReceived %d", UBIDI_NEUTRAL, dir);
+}
+

 static void doMisc(void) {
 /* Miscellaneous tests to exercize less popular code paths */