ICU-7772 Fast string direction detection

X-SVN-Rev: 28665
This commit is contained in:
Jinglun Li 2010-09-21 07:12:28 +00:00
parent 52d02d3a55
commit 12de1d68b7
4 changed files with 222 additions and 7 deletions

View file

@ -325,6 +325,34 @@ ubidi_getReorderingOptions(UBiDi *pBiDi) {
}
}
/*
 * Report the direction of the first strongly directional character in text.
 *
 * text   - UTF-16 string to inspect; may be NULL.
 * length - number of UChars in text, or -1 if text is zero-terminated.
 *
 * Returns UBIDI_LTR when the first strong character is left-to-right,
 * UBIDI_RTL when it is right-to-left (either U_RIGHT_TO_LEFT or
 * U_RIGHT_TO_LEFT_ARABIC), and UBIDI_NEUTRAL when no such character is
 * found or the arguments are rejected (NULL text, length below -1).
 */
U_CAPI UBiDiDirection U_EXPORT2
ubidi_getBaseDirection(const UChar *text,
                       int32_t length) {
    int32_t pos = 0;
    UChar32 c;

    /* reject a missing string and any negative length other than -1 */
    if (text == NULL || length < -1) {
        return UBIDI_NEUTRAL;
    }

    /* -1 asks us to measure the zero-terminated string ourselves */
    if (length == -1) {
        length = u_strlen(text);
    }

    while (pos < length) {
        /* U16_NEXT reads one code point and advances pos past it */
        U16_NEXT(text, pos, length, c);

        switch (u_charDirection(c)) {
        case U_LEFT_TO_RIGHT:
            return UBIDI_LTR;
        case U_RIGHT_TO_LEFT:
        case U_RIGHT_TO_LEFT_ARABIC:
            return UBIDI_RTL;
        default:
            /* any other direction value: keep scanning */
            break;
        }
    }

    return UBIDI_NEUTRAL;
}
/* perform (P2)..(P3) ------------------------------------------------------- */
/*

View file

@ -416,12 +416,44 @@ typedef uint8_t UBiDiLevel;
* @stable ICU 2.0
*/
enum UBiDiDirection {
    /** Left-to-right text. This is a 0 value.
     * <ul>
     * <li>As return value for <code>ubidi_getDirection()</code>, it means
     *     that the source string contains no right-to-left characters, or
     *     that the source string is empty and the paragraph level is even.</li>
     * <li>As return value for <code>ubidi_getBaseDirection()</code>, it
     *     means that the first strong character of the source string has
     *     a left-to-right direction.</li>
     * </ul>
     * @stable ICU 2.0
     */
    UBIDI_LTR,
    /** Right-to-left text. This is a 1 value.
     * <ul>
     * <li>As return value for <code>ubidi_getDirection()</code>, it means
     *     that the source string contains no left-to-right characters, or
     *     that the source string is empty and the paragraph level is odd.</li>
     * <li>As return value for <code>ubidi_getBaseDirection()</code>, it
     *     means that the first strong character of the source string has
     *     a right-to-left direction.</li>
     * </ul>
     * @stable ICU 2.0
     */
    UBIDI_RTL,
    /** Mixed-directional text.
     * <p>As return value for <code>ubidi_getDirection()</code>, it means
     *    that the source string contains both left-to-right and
     *    right-to-left characters.
     * @stable ICU 2.0
     */
    UBIDI_MIXED,
    /** No strongly directional text.
     * <p>As return value for <code>ubidi_getBaseDirection()</code>, it means
     *    that the source string is missing or empty, or contains neither
     *    left-to-right nor right-to-left characters.
     * @draft ICU 4.6
     */
    UBIDI_NEUTRAL
};
/** @stable ICU 2.0 */
@ -1159,6 +1191,7 @@ ubidi_setLine(const UBiDi *pParaBiDi,
* that indicates if the entire text
* represented by this object is unidirectional,
* and which direction, or if it is mixed-directional.
* Note - The value <code>UBIDI_NEUTRAL</code> is never returned from this method.
*
* @see UBiDiDirection
* @stable ICU 2.0
@ -1166,6 +1199,36 @@ ubidi_setLine(const UBiDi *pParaBiDi,
U_STABLE UBiDiDirection U_EXPORT2
ubidi_getDirection(const UBiDi *pBiDi);
/**
 * Gets the base direction of the text provided according
 * to the Unicode Bidirectional Algorithm. The base direction
 * is derived from the first character in the string with bidirectional
 * character type L, R, or AL. If the first such character has type L,
 * <code>UBIDI_LTR</code> is returned. If the first such character has
 * type R or AL, <code>UBIDI_RTL</code> is returned. If the string does
 * not contain any character of these types, then
 * <code>UBIDI_NEUTRAL</code> is returned.
 *
 * This is a lightweight function for use when only the base direction
 * is needed and no further bidi processing of the text is needed.
 *
 * Invalid arguments are not reported as errors: if <code>text</code> is
 * <code>NULL</code> or <code>length</code> is less than -1, the function
 * returns <code>UBIDI_NEUTRAL</code>.
 *
 * @param text is a pointer to the text whose base
 *             direction is needed.
 *             Note: the text must be (at least) @c length long.
 *
 * @param length is the length of the text, in <code>UChar</code> units;
 *               if <code>length==-1</code> then the text
 *               must be zero-terminated.
 *
 * @return <code>UBIDI_LTR</code>, <code>UBIDI_RTL</code>, or
 *         <code>UBIDI_NEUTRAL</code>; never <code>UBIDI_MIXED</code>.
 *
 * @see UBiDiDirection
 * @draft ICU 4.6
 */
U_DRAFT UBiDiDirection U_EXPORT2
ubidi_getBaseDirection(const UChar *text, int32_t length );
/**
* Get the pointer to the text.
*
@ -1398,7 +1461,8 @@ ubidi_countRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
*
* @return the directionality of the run,
* <code>UBIDI_LTR==0</code> or <code>UBIDI_RTL==1</code>,
* never <code>UBIDI_MIXED</code>.
* never <code>UBIDI_MIXED</code>,
* never <code>UBIDI_NEUTRAL</code>.
*
* @see ubidi_countRuns
*

View file

@ -475,6 +475,7 @@
#define ubidi_setReorderingOptions U_ICU_ENTRY_POINT_RENAME(ubidi_setReorderingOptions)
#define ubidi_writeReordered U_ICU_ENTRY_POINT_RENAME(ubidi_writeReordered)
#define ubidi_writeReverse U_ICU_ENTRY_POINT_RENAME(ubidi_writeReverse)
#define ubidi_getBaseDirection U_ICU_ENTRY_POINT_RENAME(ubidi_getBaseDirection)
#define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode)
#define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close)
#define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable)

View file

@ -72,6 +72,8 @@ static void testFailureRecovery(void);
static void testMultipleParagraphs(void);
static void testGetBaseDirection(void);
/* new BIDI API */
static void testReorderingMode(void);
static void testReorderRunsOnly(void);
@ -122,6 +124,7 @@ addComplexTest(TestNode** root) {
addTest(root, doTashkeelSpecialVLTRArabicShapingTest, "complex/arabic-shaping/tashkeel");
addTest(root, doLOGICALArabicDeShapingTest, "complex/arabic-shaping/unshaping");
addTest(root, doArabicShapingTestForBug5421, "complex/arabic-shaping/bug-5421");
addTest(root, testGetBaseDirection, "complex/bidi/testGetBaseDirection");
}
static void
@ -1132,6 +1135,125 @@ _testReordering(UBiDi *pBiDi, int testNumber) {
return; \
} \
#define STRING_TEST_CASE(s) { (s), LENGTHOF(s) } /* { string, length } initializer for one test-case entry; length is whatever LENGTHOF(s) yields */
static void testGetBaseDirection(void) {
UBiDiDirection dir;
int i;
/* Test Data */
static const UChar
/*Mixed Start with L*/
stringMixedEnglishFirst[]={ 0x61, 0x627, 0x32, 0x6f3, 0x61, 0x34, 0 },
/*Mixed Start with AL*/
stringMixedArabicFirst[]={ 0x661, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
/*Mixed Start with R*/
stringMixedHebrewFirst[]={ 0x05EA, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
/*All AL (Arabic. Persian)*/
stringPersian[]={0x0698, 0x067E, 0x0686, 0x06AF, 0},
/*All R (Hebrew etc.)*/
stringHebrew[]={0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
/*All L (English)*/
stringEnglish[]={0x71, 0x61, 0x66, 0},
/*Mixed Start with weak AL an then L*/
stringStartWeakAL[]={ 0x0663, 0x71, 0x61, 0x66, 0},
/*Mixed Start with weak L and then AL*/
stringStartWeakL[]={0x31, 0x0698, 0x067E, 0x0686, 0x06AF, 0},
/*Empty*/
stringEmpty[]={0},
/*Surrogate Char.*/
stringSurrogateChar[]={0xD800, 0xDC00, 0},
/*Invalid UChar*/
stringInvalidUchar[]={-1},
/*All weak L (English Digits)*/
stringAllEnglishDigits[]={0x31, 0x32, 0x33, 0},
/*All weak AL (Arabic Digits)*/
stringAllArabicDigits[]={0x0663, 0x0664, 0x0665, 0},
/*First L (English) others are R (Hebrew etc.) */
stringFirstL[] = {0x71, 0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
/*Last R (Hebrew etc.) others are weak L (English Digits)*/
stringLastR[] = {0x31, 0x32, 0x33, 0x05F1, 0};
static const struct {
const UChar *s;
int32_t length;
} testCases[]={
STRING_TEST_CASE(stringMixedEnglishFirst),
STRING_TEST_CASE(stringMixedArabicFirst),
STRING_TEST_CASE(stringMixedHebrewFirst),
STRING_TEST_CASE(stringPersian),
STRING_TEST_CASE(stringHebrew),
STRING_TEST_CASE(stringEnglish),
STRING_TEST_CASE(stringStartWeakAL),
STRING_TEST_CASE(stringStartWeakL),
STRING_TEST_CASE(stringEmpty),
STRING_TEST_CASE(stringSurrogateChar),
STRING_TEST_CASE(stringInvalidUchar),
STRING_TEST_CASE(stringAllEnglishDigits),
STRING_TEST_CASE(stringAllArabicDigits),
STRING_TEST_CASE(stringFirstL),
STRING_TEST_CASE(stringLastR),
};
/* Expected results */
static const UBiDiDirection expectedDir[] ={
UBIDI_LTR, UBIDI_RTL, UBIDI_RTL,
UBIDI_RTL, UBIDI_RTL, UBIDI_LTR,
UBIDI_LTR, UBIDI_RTL, UBIDI_NEUTRAL,
UBIDI_LTR, UBIDI_NEUTRAL, UBIDI_NEUTRAL,
UBIDI_NEUTRAL, UBIDI_LTR, UBIDI_RTL
};
log_verbose("testGetBaseDirection() with %u test cases ---\n",
LENGTHOF(testCases));
/* Run Tests */
for(i=0; i<LENGTHOF(testCases); ++i) {
dir = ubidi_getBaseDirection(testCases[i].s, testCases[i].length );
log_verbose("Testing case %d\tReceived dir %d\n", i, dir);
if (dir != expectedDir[i])
log_err("\nFailed getBaseDirection case %d Expected %d \tReceived %d\n",
i, expectedDir[i], dir);
}
/* Misc. tests */
/* NULL string */
dir = ubidi_getBaseDirection(NULL, 3);
if (dir != UBIDI_NEUTRAL )
log_err("\nFailed getBaseDirection for NULL string " ,
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
/*All L- English string and length=-3 */
dir = ubidi_getBaseDirection( stringEnglish, -3);
if (dir != UBIDI_NEUTRAL )
log_err("\nFailed getBaseDirection for string w length= -3 ",
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
/*All L- English string and length=-1 */
dir = ubidi_getBaseDirection( stringEnglish, -1);
if (dir != UBIDI_LTR )
log_err("\nFailed getBaseDirection for English string w length= -1 ",
"\nExpected %d \nReceived %d", UBIDI_LTR, dir);
/*All AL- Persian string and length=-1 */
dir = ubidi_getBaseDirection( stringPersian, -1);
if (dir != UBIDI_RTL )
log_err("\nFailed getBaseDirection for Persian string w length= -1 ",
"\nExpected %d \nReceived %d", UBIDI_RTL, dir);
/*All R- Hebrew string and length=-1 */
dir = ubidi_getBaseDirection( stringHebrew, -1);
if (dir != UBIDI_RTL )
log_err("\nFailed getBaseDirection for Hebrew string w length= -1 ",
"\nExpected %d \nReceived %d", UBIDI_RTL, dir);
/*All weak L- English digits string and length=-1 */
dir = ubidi_getBaseDirection(stringAllEnglishDigits, -1);
if (dir != UBIDI_NEUTRAL )
log_err("\nFailed getBaseDirection for English digits string w length= -1 ",
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
/*All weak AL- Arabic digits string and length=-1 */
dir = ubidi_getBaseDirection(stringAllArabicDigits, -1);
if (dir != UBIDI_NEUTRAL )
log_err("\nFailed getBaseDirection for Arabic string w length= -1 ",
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
}
static void doMisc(void) {
/* Miscellaneous tests to exercize less popular code paths */