mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-7772 Fast string direction detection
X-SVN-Rev: 28665
This commit is contained in:
parent
52d02d3a55
commit
12de1d68b7
4 changed files with 222 additions and 7 deletions
|
@ -325,6 +325,34 @@ ubidi_getReorderingOptions(UBiDi *pBiDi) {
|
|||
}
|
||||
}
|
||||
|
||||
U_CAPI UBiDiDirection U_EXPORT2
|
||||
ubidi_getBaseDirection(const UChar *text,
|
||||
int32_t length){
|
||||
|
||||
int32_t i;
|
||||
UChar32 uchar;
|
||||
UCharDirection dir;
|
||||
|
||||
if( text==NULL || length<-1 ){
|
||||
return UBIDI_NEUTRAL;
|
||||
}
|
||||
|
||||
if(length==-1) {
|
||||
length=u_strlen(text);
|
||||
}
|
||||
|
||||
for( i = 0 ; i < length; ) {
|
||||
/* i is incremented by U16_NEXT */
|
||||
U16_NEXT(text, i, length, uchar);
|
||||
dir = u_charDirection(uchar);
|
||||
if( dir == U_LEFT_TO_RIGHT )
|
||||
return UBIDI_LTR;
|
||||
if( dir == U_RIGHT_TO_LEFT || dir ==U_RIGHT_TO_LEFT_ARABIC )
|
||||
return UBIDI_RTL;
|
||||
}
|
||||
return UBIDI_NEUTRAL;
|
||||
}
|
||||
|
||||
/* perform (P2)..(P3) ------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
|
|
|
@ -416,12 +416,44 @@ typedef uint8_t UBiDiLevel;
|
|||
* @stable ICU 2.0
|
||||
*/
|
||||
enum UBiDiDirection {
    /** Left-to-right text. This is a 0 value.
     * <ul>
     * <li>As return value for <code>ubidi_getDirection()</code>, it means
     *     that the source string contains no right-to-left characters, or
     *     that the source string is empty and the paragraph level is even.
     * <li>As return value for <code>ubidi_getBaseDirection()</code>, it
     *     means that the first strong character of the source string has
     *     a left-to-right direction.
     * </ul>
     * @stable ICU 2.0
     */
    UBIDI_LTR,
    /** Right-to-left text. This is a 1 value.
     * <ul>
     * <li>As return value for <code>ubidi_getDirection()</code>, it means
     *     that the source string contains no left-to-right characters, or
     *     that the source string is empty and the paragraph level is odd.
     * <li>As return value for <code>ubidi_getBaseDirection()</code>, it
     *     means that the first strong character of the source string has
     *     a right-to-left direction.
     * </ul>
     * @stable ICU 2.0
     */
    UBIDI_RTL,
    /** Mixed-directional text.
     * <p>As return value for <code>ubidi_getDirection()</code>, it means
     *    that the source string contains both left-to-right and
     *    right-to-left characters.
     * @stable ICU 2.0
     */
    UBIDI_MIXED,
    /** No strongly directional text.
     * <p>As return value for <code>ubidi_getBaseDirection()</code>, it means
     *    that the source string is missing or empty, or contains neither
     *    left-to-right nor right-to-left characters.
     * @draft ICU 4.6
     */
    UBIDI_NEUTRAL
};
|
||||
|
||||
/** @stable ICU 2.0 */
|
||||
|
@ -1159,6 +1191,7 @@ ubidi_setLine(const UBiDi *pParaBiDi,
|
|||
* that indicates if the entire text
|
||||
* represented by this object is unidirectional,
|
||||
* and which direction, or if it is mixed-directional.
|
||||
* Note - The value <code>UBIDI_NEUTRAL</code> is never returned from this function.
|
||||
*
|
||||
* @see UBiDiDirection
|
||||
* @stable ICU 2.0
|
||||
|
@ -1166,6 +1199,36 @@ ubidi_setLine(const UBiDi *pParaBiDi,
|
|||
U_STABLE UBiDiDirection U_EXPORT2
|
||||
ubidi_getDirection(const UBiDi *pBiDi);
|
||||
|
||||
/**
|
||||
* Gets the base direction of the text provided according
|
||||
* to the Unicode Bidirectional Algorithm. The base direction
|
||||
* is derived from the first character in the string with bidirectional
|
||||
* character type L, R, or AL. If the first such character has type L,
|
||||
* <code>UBIDI_LTR</code> is returned. If the first such character has
|
||||
* type R or AL, <code>UBIDI_RTL</code> is returned. If the string does
|
||||
* not contain any character of these types, then
|
||||
* <code>UBIDI_NEUTRAL</code> is returned.
|
||||
*
|
||||
* This is a lightweight function for use when only the base direction
|
||||
* is needed and no further bidi processing of the text is needed.
|
||||
*
|
||||
* @param text is a pointer to the text whose base
|
||||
* direction is needed.
|
||||
* Note: the text must be (at least) @c length long.
|
||||
*
|
||||
* @param length is the length of the text;
|
||||
* if <code>length==-1</code> then the text
|
||||
* must be zero-terminated.
|
||||
*
|
||||
* @return <code>UBIDI_LTR</code>, <code>UBIDI_RTL</code>,
|
||||
* <code>UBIDI_NEUTRAL</code>
|
||||
*
|
||||
* @see UBiDiDirection
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
U_DRAFT UBiDiDirection U_EXPORT2
|
||||
ubidi_getBaseDirection(const UChar *text, int32_t length );
|
||||
|
||||
/**
|
||||
* Get the pointer to the text.
|
||||
*
|
||||
|
@ -1398,7 +1461,8 @@ ubidi_countRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
|
|||
*
|
||||
* @return the directionality of the run,
|
||||
* <code>UBIDI_LTR==0</code> or <code>UBIDI_RTL==1</code>,
|
||||
* never <code>UBIDI_MIXED</code>,
* never <code>UBIDI_NEUTRAL</code>.
|
||||
*
|
||||
* @see ubidi_countRuns
|
||||
*
|
||||
|
|
|
@ -475,6 +475,7 @@
|
|||
#define ubidi_setReorderingOptions U_ICU_ENTRY_POINT_RENAME(ubidi_setReorderingOptions)
|
||||
#define ubidi_writeReordered U_ICU_ENTRY_POINT_RENAME(ubidi_writeReordered)
|
||||
#define ubidi_writeReverse U_ICU_ENTRY_POINT_RENAME(ubidi_writeReverse)
|
||||
#define ubidi_getBaseDirection U_ICU_ENTRY_POINT_RENAME(ubidi_getBaseDirection)
|
||||
#define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode)
|
||||
#define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close)
|
||||
#define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable)
|
||||
|
|
|
@ -72,6 +72,8 @@ static void testFailureRecovery(void);
|
|||
|
||||
static void testMultipleParagraphs(void);
|
||||
|
||||
static void testGetBaseDirection(void);
|
||||
|
||||
/* new BIDI API */
|
||||
static void testReorderingMode(void);
|
||||
static void testReorderRunsOnly(void);
|
||||
|
@ -122,6 +124,7 @@ addComplexTest(TestNode** root) {
|
|||
addTest(root, doTashkeelSpecialVLTRArabicShapingTest, "complex/arabic-shaping/tashkeel");
|
||||
addTest(root, doLOGICALArabicDeShapingTest, "complex/arabic-shaping/unshaping");
|
||||
addTest(root, doArabicShapingTestForBug5421, "complex/arabic-shaping/bug-5421");
|
||||
addTest(root, testGetBaseDirection, "complex/bidi/testGetBaseDirection");
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1132,6 +1135,125 @@ _testReordering(UBiDi *pBiDi, int testNumber) {
|
|||
return; \
|
||||
} \
|
||||
|
||||
#define STRING_TEST_CASE(s) { (s), LENGTHOF(s) }
|
||||
|
||||
static void testGetBaseDirection(void) {
|
||||
UBiDiDirection dir;
|
||||
int i;
|
||||
|
||||
/* Test Data */
|
||||
static const UChar
|
||||
/*Mixed Start with L*/
|
||||
stringMixedEnglishFirst[]={ 0x61, 0x627, 0x32, 0x6f3, 0x61, 0x34, 0 },
|
||||
/*Mixed Start with AL*/
|
||||
stringMixedArabicFirst[]={ 0x661, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
|
||||
/*Mixed Start with R*/
|
||||
stringMixedHebrewFirst[]={ 0x05EA, 0x627, 0x662, 0x6f3, 0x61, 0x664, 0 },
|
||||
/*All AL (Arabic. Persian)*/
|
||||
stringPersian[]={0x0698, 0x067E, 0x0686, 0x06AF, 0},
|
||||
/*All R (Hebrew etc.)*/
|
||||
stringHebrew[]={0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
|
||||
/*All L (English)*/
|
||||
stringEnglish[]={0x71, 0x61, 0x66, 0},
|
||||
/*Mixed Start with weak AL an then L*/
|
||||
stringStartWeakAL[]={ 0x0663, 0x71, 0x61, 0x66, 0},
|
||||
/*Mixed Start with weak L and then AL*/
|
||||
stringStartWeakL[]={0x31, 0x0698, 0x067E, 0x0686, 0x06AF, 0},
|
||||
/*Empty*/
|
||||
stringEmpty[]={0},
|
||||
/*Surrogate Char.*/
|
||||
stringSurrogateChar[]={0xD800, 0xDC00, 0},
|
||||
/*Invalid UChar*/
|
||||
stringInvalidUchar[]={-1},
|
||||
/*All weak L (English Digits)*/
|
||||
stringAllEnglishDigits[]={0x31, 0x32, 0x33, 0},
|
||||
/*All weak AL (Arabic Digits)*/
|
||||
stringAllArabicDigits[]={0x0663, 0x0664, 0x0665, 0},
|
||||
/*First L (English) others are R (Hebrew etc.) */
|
||||
stringFirstL[] = {0x71, 0x0590, 0x05D5, 0x05EA, 0x05F1, 0},
|
||||
/*Last R (Hebrew etc.) others are weak L (English Digits)*/
|
||||
stringLastR[] = {0x31, 0x32, 0x33, 0x05F1, 0};
|
||||
|
||||
static const struct {
|
||||
const UChar *s;
|
||||
int32_t length;
|
||||
} testCases[]={
|
||||
STRING_TEST_CASE(stringMixedEnglishFirst),
|
||||
STRING_TEST_CASE(stringMixedArabicFirst),
|
||||
STRING_TEST_CASE(stringMixedHebrewFirst),
|
||||
STRING_TEST_CASE(stringPersian),
|
||||
STRING_TEST_CASE(stringHebrew),
|
||||
STRING_TEST_CASE(stringEnglish),
|
||||
STRING_TEST_CASE(stringStartWeakAL),
|
||||
STRING_TEST_CASE(stringStartWeakL),
|
||||
STRING_TEST_CASE(stringEmpty),
|
||||
STRING_TEST_CASE(stringSurrogateChar),
|
||||
STRING_TEST_CASE(stringInvalidUchar),
|
||||
STRING_TEST_CASE(stringAllEnglishDigits),
|
||||
STRING_TEST_CASE(stringAllArabicDigits),
|
||||
STRING_TEST_CASE(stringFirstL),
|
||||
STRING_TEST_CASE(stringLastR),
|
||||
};
|
||||
|
||||
/* Expected results */
|
||||
static const UBiDiDirection expectedDir[] ={
|
||||
UBIDI_LTR, UBIDI_RTL, UBIDI_RTL,
|
||||
UBIDI_RTL, UBIDI_RTL, UBIDI_LTR,
|
||||
UBIDI_LTR, UBIDI_RTL, UBIDI_NEUTRAL,
|
||||
UBIDI_LTR, UBIDI_NEUTRAL, UBIDI_NEUTRAL,
|
||||
UBIDI_NEUTRAL, UBIDI_LTR, UBIDI_RTL
|
||||
};
|
||||
|
||||
log_verbose("testGetBaseDirection() with %u test cases ---\n",
|
||||
LENGTHOF(testCases));
|
||||
/* Run Tests */
|
||||
for(i=0; i<LENGTHOF(testCases); ++i) {
|
||||
dir = ubidi_getBaseDirection(testCases[i].s, testCases[i].length );
|
||||
log_verbose("Testing case %d\tReceived dir %d\n", i, dir);
|
||||
if (dir != expectedDir[i])
|
||||
log_err("\nFailed getBaseDirection case %d Expected %d \tReceived %d\n",
|
||||
i, expectedDir[i], dir);
|
||||
}
|
||||
|
||||
/* Misc. tests */
|
||||
/* NULL string */
|
||||
dir = ubidi_getBaseDirection(NULL, 3);
|
||||
if (dir != UBIDI_NEUTRAL )
|
||||
log_err("\nFailed getBaseDirection for NULL string " ,
|
||||
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
|
||||
/*All L- English string and length=-3 */
|
||||
dir = ubidi_getBaseDirection( stringEnglish, -3);
|
||||
if (dir != UBIDI_NEUTRAL )
|
||||
log_err("\nFailed getBaseDirection for string w length= -3 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
|
||||
/*All L- English string and length=-1 */
|
||||
dir = ubidi_getBaseDirection( stringEnglish, -1);
|
||||
if (dir != UBIDI_LTR )
|
||||
log_err("\nFailed getBaseDirection for English string w length= -1 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_LTR, dir);
|
||||
/*All AL- Persian string and length=-1 */
|
||||
dir = ubidi_getBaseDirection( stringPersian, -1);
|
||||
if (dir != UBIDI_RTL )
|
||||
log_err("\nFailed getBaseDirection for Persian string w length= -1 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_RTL, dir);
|
||||
/*All R- Hebrew string and length=-1 */
|
||||
dir = ubidi_getBaseDirection( stringHebrew, -1);
|
||||
if (dir != UBIDI_RTL )
|
||||
log_err("\nFailed getBaseDirection for Hebrew string w length= -1 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_RTL, dir);
|
||||
/*All weak L- English digits string and length=-1 */
|
||||
dir = ubidi_getBaseDirection(stringAllEnglishDigits, -1);
|
||||
if (dir != UBIDI_NEUTRAL )
|
||||
log_err("\nFailed getBaseDirection for English digits string w length= -1 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
|
||||
/*All weak AL- Arabic digits string and length=-1 */
|
||||
dir = ubidi_getBaseDirection(stringAllArabicDigits, -1);
|
||||
if (dir != UBIDI_NEUTRAL )
|
||||
log_err("\nFailed getBaseDirection for Arabic string w length= -1 ",
|
||||
"\nExpected %d \nReceived %d", UBIDI_NEUTRAL, dir);
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void doMisc(void) {
|
||||
/* Miscellaneous tests to exercize less popular code paths */
|
||||
|
|
Loading…
Add table
Reference in a new issue