mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
X-SVN-Rev: 30369
This commit is contained in:
parent
05870d9516
commit
5163aad948
10 changed files with 263 additions and 83 deletions
|
@ -74,6 +74,11 @@ might have to #include some other header
|
|||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/*
|
||||
* Forward declarations
|
||||
*/
|
||||
static UDataMemory *udata_findCachedData(const char *path);
|
||||
|
||||
/***********************************************************************
|
||||
*
|
||||
* static (Global) data
|
||||
|
@ -123,7 +128,26 @@ udata_cleanup(void)
|
|||
return TRUE; /* Everything was cleaned up */
|
||||
}
|
||||
|
||||
static UBool U_CALLCONV
|
||||
findCommonICUDataByName(const char *inBasename)
|
||||
{
|
||||
UBool found = FALSE;
|
||||
int32_t i;
|
||||
|
||||
UDataMemory *pData = udata_findCachedData(inBasename);
|
||||
if (pData == NULL)
|
||||
return FALSE;
|
||||
|
||||
for (i = 0; i < LENGTHOF(gCommonICUDataArray); ++i) {
|
||||
if ((gCommonICUDataArray[i] != NULL) && (gCommonICUDataArray[i]->pHeader == pData->pHeader)) {
|
||||
/* The data pointer is already in the array. */
|
||||
found = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
|
@ -786,8 +810,6 @@ static UBool extendICUData(UErrorCode *pErr)
|
|||
umtx_lock(&extendICUDataMutex);
|
||||
#endif
|
||||
if(!gHaveTriedToLoadCommonData) {
|
||||
gHaveTriedToLoadCommonData = TRUE;
|
||||
|
||||
/* See if we can explicitly open a .dat file for the ICUData. */
|
||||
pData = openCommonData(
|
||||
U_ICUDATA_NAME, /* "icudt20l" , for example. */
|
||||
|
@ -806,12 +828,20 @@ static UBool extendICUData(UErrorCode *pErr)
|
|||
/* fields in the UDataMemory that we're assigning */
|
||||
/* to CommonICUData. */
|
||||
|
||||
didUpdate =
|
||||
didUpdate = /* no longer using this result */
|
||||
setCommonICUData(&copyPData,/* The new common data. */
|
||||
FALSE, /* No warnings if write didn't happen */
|
||||
pErr); /* setCommonICUData honors errors; NOP if error set */
|
||||
}
|
||||
|
||||
gHaveTriedToLoadCommonData = TRUE;
|
||||
}
|
||||
|
||||
didUpdate = findCommonICUDataByName(U_ICUDATA_NAME); /* Return 'true' when a racing thread writes out the extended */
|
||||
/* data after another thread has failed to see it (in openCommonData), so */
|
||||
/* extended data can be examined. */
|
||||
/* Also handles a race through here before gHaveTriedToLoadCommonData is set. */
|
||||
|
||||
#if MAP_IMPLEMENTATION==MAP_STDIO
|
||||
umtx_unlock(&extendICUDataMutex);
|
||||
#endif
|
||||
|
@ -996,6 +1026,7 @@ static UDataMemory *doLoadFromCommonData(UBool isICUData, const char * /*pkgName
|
|||
const DataHeader *pHeader;
|
||||
UDataMemory *pCommonData;
|
||||
int32_t commonDataIndex;
|
||||
UBool checkedExtendedICUData = FALSE;
|
||||
/* try to get common data. The loop is for platforms such as the 390 that do
|
||||
* not initially load the full set of ICU data. If the lookup of an ICU data item
|
||||
* fails, the full (but slower to load) set is loaded, and the loop repeats,
|
||||
|
@ -1038,7 +1069,8 @@ static UDataMemory *doLoadFromCommonData(UBool isICUData, const char * /*pkgName
|
|||
return NULL;
|
||||
} else if (pCommonData != NULL) {
|
||||
++commonDataIndex; /* try the next data package */
|
||||
} else if (extendICUData(subErrorCode)) {
|
||||
} else if ((!checkedExtendedICUData) && extendICUData(subErrorCode)) {
|
||||
checkedExtendedICUData = TRUE;
|
||||
/* try this data package slot again: it changed from NULL to non-NULL */
|
||||
} else {
|
||||
return NULL;
|
||||
|
|
|
@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
|
|||
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
|
||||
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
|
||||
bytestrietest.o ucharstrietest.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
|
||||
itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
|
||||
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
|
||||
jamotest.o srchtest.o reptest.o regextst.o \
|
||||
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
|
||||
|
|
140
source/test/intltest/dicttest.cpp
Normal file
140
source/test/intltest/dicttest.cpp
Normal file
|
@ -0,0 +1,140 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2011-2011, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
************************************************************************
|
||||
* Date Name Description
|
||||
* 05/14/2011 grhoten Creation.
|
||||
************************************************************************/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "dicttest.h"
|
||||
#include "textfile.h"
|
||||
#include "uvector.h"
|
||||
#include "unicode/rbbi.h"
|
||||
|
||||
void DictionaryWordTest::TestThaiBreaks() {
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
BreakIterator* b;
|
||||
Locale locale = Locale("th");
|
||||
int32_t p, index;
|
||||
UChar c[]= {
|
||||
0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
|
||||
0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
|
||||
0x0E16, 0x0E49, 0x0E33, 0x0000
|
||||
};
|
||||
int32_t expectedWordResult[] = {
|
||||
2, 3, 6, 10, 11, 15, 17, 20, 22
|
||||
};
|
||||
int32_t expectedLineResult[] = {
|
||||
3, 6, 11, 15, 17, 20, 22
|
||||
};
|
||||
|
||||
int32_t size = u_strlen(c);
|
||||
UnicodeString text=UnicodeString(c);
|
||||
|
||||
b = BreakIterator::createWordInstance(locale, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
b->setText(text);
|
||||
p = index = 0;
|
||||
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||||
if (p != expectedWordResult[index++]) {
|
||||
errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
|
||||
}
|
||||
}
|
||||
delete b;
|
||||
|
||||
b = BreakIterator::createLineInstance(locale, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Unable to create thai line break iterator.");
|
||||
return;
|
||||
}
|
||||
b->setText(text);
|
||||
p = index = 0;
|
||||
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||||
if (p != expectedLineResult[index++]) {
|
||||
errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
|
||||
}
|
||||
}
|
||||
|
||||
delete b;
|
||||
}
|
||||
|
||||
#define DICTIONARY_TEST_FILE "wordsegments.txt"
|
||||
|
||||
void DictionaryWordTest::TestWordBoundaries() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test",
|
||||
u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
// Due to how the word break iterator works,
|
||||
// scripts for languages that use no spaces should use the correct dictionary by default.
|
||||
BreakIterator *wb = BreakIterator::createWordInstance("en", status);
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("Word break iterator can not be opened: %s; skipping test",
|
||||
u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t pos, pIdx;
|
||||
int32_t testLines = 0;
|
||||
UnicodeString phrase;
|
||||
while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
|
||||
UVector breaks(status);
|
||||
|
||||
for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
|
||||
if (phrase.charAt(pIdx) == 0x007C /* | */) {
|
||||
breaks.addElement(pIdx, status);
|
||||
phrase.remove(pIdx, 1);
|
||||
}
|
||||
}
|
||||
breaks.addElement(pIdx, status);
|
||||
|
||||
wb->setText(phrase);
|
||||
int32_t brkArrPos = 0;
|
||||
while ((pos=wb->next())!=BreakIterator::DONE) {
|
||||
int32_t expectedPos = breaks.elementAti(brkArrPos);
|
||||
if (expectedPos != pos) {
|
||||
errln("Incorrect forward word break on line %d. Expected: %d Got: %d",
|
||||
phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
|
||||
}
|
||||
brkArrPos++;
|
||||
}
|
||||
brkArrPos = breaks.size() - 1;
|
||||
while ((pos=wb->previous())!=BreakIterator::DONE) {
|
||||
brkArrPos--;
|
||||
int32_t expectedPos = breaks.elementAti(brkArrPos);
|
||||
if (expectedPos != pos) {
|
||||
errln("Incorrect backward word break on line %d. Expected: %d Got: %d",
|
||||
phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
|
||||
}
|
||||
}
|
||||
testLines++;
|
||||
}
|
||||
delete wb;
|
||||
logln("%d tests were run.", testLines);
|
||||
}
|
||||
|
||||
/**
 * Test dispatcher for DictionaryWordTest.
 *
 * Logs the suite banner when exec is true, then hands index/exec/name to the
 * TESTCASE_AUTO_* macros (defined outside this file), which list
 * TestThaiBreaks and TestWordBoundaries in that order. The last parameter is
 * unused here.
 */
void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
{
    if (exec) {
        logln("TestSuite DictionaryWordTest: ");
    }

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestThaiBreaks);
    TESTCASE_AUTO(TestWordBoundaries);
    TESTCASE_AUTO_END;
}
|
||||
|
||||
|
||||
#endif
|
31
source/test/intltest/dicttest.h
Normal file
31
source/test/intltest/dicttest.h
Normal file
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2011-2011, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
************************************************************************
|
||||
* Date Name Description
|
||||
* 05/14/2011 grhoten Creation.
|
||||
************************************************************************/
|
||||
|
||||
#ifndef DICTTEST_H
|
||||
#define DICTTEST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "intltest.h"
|
||||
|
||||
|
||||
class DictionaryWordTest: public IntlTest {
|
||||
public:
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
void TestWordBoundaries();
|
||||
void TestThaiBreaks();
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif
|
||||
|
|
@ -224,6 +224,7 @@
|
|||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="bytestrietest.cpp" />
|
||||
<ClCompile Include="dicttest.cpp" />
|
||||
<ClCompile Include="ucharstrietest.cpp" />
|
||||
<ClCompile Include="itrbbi.cpp" />
|
||||
<ClCompile Include="rbbiapts.cpp" />
|
||||
|
@ -389,6 +390,7 @@
|
|||
<ClCompile Include="bidiconf.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="dicttest.h" />
|
||||
<ClInclude Include="itrbbi.h" />
|
||||
<ClInclude Include="rbbiapts.h" />
|
||||
<ClInclude Include="rbbitst.h" />
|
||||
|
@ -533,4 +535,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -444,6 +444,9 @@
|
|||
<ClCompile Include="alphaindextst.cpp">
|
||||
<Filter>collation</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="dicttest.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="itrbbi.h">
|
||||
|
@ -812,5 +815,8 @@
|
|||
<ClInclude Include="alphaindextst.h">
|
||||
<Filter>collation</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="dicttest.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1998-2001, International Business Machines Corporation
|
||||
* Copyright (C) 1998-2011, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -19,28 +19,27 @@
|
|||
#include "itrbbi.h"
|
||||
#include "rbbiapts.h"
|
||||
#include "rbbitst.h"
|
||||
#include "dicttest.h"
|
||||
|
||||
/*
 * TESTCLASS(n, classname) expands to one complete `case n:` clause of a
 * switch: it stores the stringified class name in `name` and, when `exec` is
 * true, logs a banner, default-constructs a `classname` object and runs it
 * through callTest(t, par).
 *
 * The expansion site must have `name`, `exec` and `par` in scope. The macro
 * ends in `break` without a semicolon, so it is used as a statement:
 *     TESTCLASS(0, SomeTest);
 */
#define TESTCLASS(n,classname)          \
    case n:                             \
        name = #classname;              \
        if (exec) {                     \
            logln(#classname "---");    \
            logln("");                  \
            classname t;                \
            callTest(t, par);           \
        }                               \
        break
|
||||
|
||||
|
||||
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
|
||||
{
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
switch (index) {
|
||||
case 0:
|
||||
name = "RBBIAPITest";
|
||||
if (exec) {
|
||||
logln("RBBIAPITest--"); logln("");
|
||||
RBBIAPITest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 1:
|
||||
name = "RBBITest";
|
||||
if (exec) {
|
||||
logln("RBBITest---"); logln("");
|
||||
RBBITest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
TESTCLASS(0, RBBIAPITest);
|
||||
TESTCLASS(1, RBBITest);
|
||||
TESTCLASS(2, DictionaryWordTest);
|
||||
default: name=""; break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
#if !UCONFIG_NO_FILE_IO
|
||||
case 21: name = "TestBug5775";
|
||||
if (exec) TestBug5775(); break;
|
||||
case 22: name = "TestThaiBreaks";
|
||||
if (exec) TestThaiBreaks(); break;
|
||||
case 23: name = "TestTailoredBreaks";
|
||||
case 22: name = "TestTailoredBreaks";
|
||||
if (exec) TestTailoredBreaks(); break;
|
||||
#else
|
||||
case 21: case 22: case 23: name = "skip";
|
||||
case 21: case 22: name = "skip";
|
||||
break;
|
||||
#endif
|
||||
case 24: name = "TestDictRules";
|
||||
case 23: name = "TestDictRules";
|
||||
if (exec) TestDictRules(); break;
|
||||
case 25: name = "TestBug5532";
|
||||
case 24: name = "TestBug5532";
|
||||
if (exec) TestBug5532(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
|
@ -1810,56 +1808,6 @@ end_test:
|
|||
#endif
|
||||
}
|
||||
|
||||
void RBBITest::TestThaiBreaks() {
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
BreakIterator* b;
|
||||
Locale locale = Locale("th");
|
||||
int32_t p, index;
|
||||
UChar c[]= {
|
||||
0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
|
||||
0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
|
||||
0x0E16, 0x0E49, 0x0E33, 0x0000
|
||||
};
|
||||
int32_t expectedWordResult[] = {
|
||||
2, 3, 6, 10, 11, 15, 17, 20, 22
|
||||
};
|
||||
int32_t expectedLineResult[] = {
|
||||
3, 6, 11, 15, 17, 20, 22
|
||||
};
|
||||
|
||||
int32_t size = u_strlen(c);
|
||||
UnicodeString text=UnicodeString(c);
|
||||
|
||||
b = BreakIterator::createWordInstance(locale, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
b->setText(text);
|
||||
p = index = 0;
|
||||
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||||
if (p != expectedWordResult[index++]) {
|
||||
errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
|
||||
}
|
||||
}
|
||||
delete b;
|
||||
|
||||
b = BreakIterator::createLineInstance(locale, status);
|
||||
if (U_FAILURE(status)) {
|
||||
printf("Unable to create thai line break iterator.\n");
|
||||
return;
|
||||
}
|
||||
b->setText(text);
|
||||
p = index = 0;
|
||||
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||||
if (p != expectedLineResult[index++]) {
|
||||
errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
|
||||
}
|
||||
}
|
||||
|
||||
delete b;
|
||||
}
|
||||
|
||||
// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
|
||||
// Words don't include colon or period (cldrbug #1969).
|
||||
static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types.";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*************************************************************************
|
||||
* Copyright (c) 1999-2010, International Business Machines
|
||||
* Copyright (c) 1999-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -68,7 +68,6 @@ public:
|
|||
void TestTrieDict();
|
||||
void TestUnicodeFiles();
|
||||
void TestBug5775();
|
||||
void TestThaiBreaks();
|
||||
void TestTailoredBreaks();
|
||||
void TestDictRules();
|
||||
void TestBug5532();
|
||||
|
|
23
source/test/testdata/wordsegments.txt
vendored
Normal file
23
source/test/testdata/wordsegments.txt
vendored
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Copyright (C) 2011-2011, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file name: wordsegments.txt
|
||||
# encoding: UTF-8
|
||||
#
|
||||
# created on: 2011may14
|
||||
# created by: George Rhoten
|
||||
# created by: Nathan Wells
|
||||
#
|
||||
# Word boundary test data for languages that contain no spaces.
|
||||
# Boundaries are delimited with the | character so that it's easier to debug.
|
||||
#
|
||||
# If you have test data with zero width spaces to delimit the words, use the following command example.
|
||||
# Be sure to copy the zero width space in the sed command.
|
||||
# echo 'សូមចំណាយពេលបន្តិចដើម្បីអធិស្ឋានអរព្រះគុណដល់ព្រះអង្គ' | sed 's//\|/g'
|
||||
#
|
||||
|
||||
# Thai
|
||||
กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
|
||||
|
||||
# Khmer
|
||||
សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ
|
Loading…
Add table
Reference in a new issue