mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-1126 Add title break iterator
X-SVN-Rev: 7801
This commit is contained in:
parent
7aadc85a12
commit
13e01fb91d
13 changed files with 223 additions and 8 deletions
|
@ -152,6 +152,28 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
|
|||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a simple text boundary for title casing breaks.
|
||||
BreakIterator*
|
||||
BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
|
||||
{
|
||||
// WARNING: This routine is currently written specifically to handle only the
|
||||
// default rules files. This function will have to be made fully general
|
||||
// at some time in the future!
|
||||
BreakIterator* result = NULL;
|
||||
static const char filename[] = "title";
|
||||
|
||||
if (U_FAILURE(status))
|
||||
return NULL;
|
||||
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
|
||||
|
||||
if (!U_FAILURE(status)) {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
// -------------------------------------
|
||||
|
||||
// Gets all the available locales that have localized text boundary data.
|
||||
const Locale*
|
||||
BreakIterator::getAvailableLocales(int32_t& count)
|
||||
|
|
|
@ -466,7 +466,12 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
// to the last saved lookup-state position
|
||||
if (tables->isLookaheadState(state)) {
|
||||
if (tables->isEndState(state)) {
|
||||
result = lookaheadResult;
|
||||
if (lookaheadResult > 0) {
|
||||
result = lookaheadResult;
|
||||
}
|
||||
else {
|
||||
result = text->getIndex() + 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
lookaheadResult = text->getIndex() + 1;
|
||||
|
@ -658,5 +663,12 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
|
|||
return localIterator;
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
// Debug-only helper (this definition sits inside #ifdef RBBI_DEBUG):
// forwards to debugDumpTables() on the tables object held by this iterator.
void RuleBasedBreakIterator::debugDumpTables() const {
|
||||
tables->debugDumpTables();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,6 +10,10 @@
|
|||
#include "ucmp8.h"
|
||||
#include "cmemory.h"
|
||||
#include "rbbi_tbl.h"
|
||||
#include "unicode/unistr.h"
|
||||
#ifdef RBBI_DEBUG
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -146,5 +150,97 @@ UBool
|
|||
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
|
||||
return lookaheadStates[state];
|
||||
}
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
//
|
||||
// debugDumpTables
|
||||
//
|
||||
void RuleBasedBreakIteratorTables::debugDumpTables() const {
|
||||
printf("Character Classes:\n");
|
||||
int currentCharClass = 257;
|
||||
int startCurrentRange = 0;
|
||||
int initialStringLength = 0;
|
||||
char buf[80];
|
||||
|
||||
UnicodeString *charClassRanges = new UnicodeString[numCategories];
|
||||
|
||||
for (int i = 0; i < 0xffff; i++) {
|
||||
if ( ucmp8_get(charCategoryTable, i) != currentCharClass) {
|
||||
if (currentCharClass != 257) {
|
||||
// Complete the output of the previous range.
|
||||
if (i != startCurrentRange+1) {
|
||||
sprintf(buf, "-%x", i-1);
|
||||
charClassRanges[currentCharClass].append(buf);
|
||||
}
|
||||
if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
|
||||
charClassRanges[currentCharClass].append("\n ");
|
||||
}
|
||||
}
|
||||
|
||||
// Output the start of the new range.
|
||||
currentCharClass = ucmp8_get(charCategoryTable, i);
|
||||
startCurrentRange = i;
|
||||
initialStringLength = charClassRanges[currentCharClass].length();
|
||||
if (charClassRanges[currentCharClass].length() > 0)
|
||||
charClassRanges[currentCharClass].append(", ");
|
||||
sprintf(buf, "%x", i);
|
||||
charClassRanges[currentCharClass].append(buf);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<numCategories; i++) {
|
||||
printf("%d: ", i);
|
||||
// Write out the chars in the UnicodeStrings.
|
||||
// We know we didn't put anything into them except for plain ascii chars.
|
||||
for (int j=0; j<charClassRanges[i].length(); j++) {
|
||||
putchar(charClassRanges[i].charAt(j));
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
delete [] charClassRanges;
|
||||
|
||||
|
||||
// State table length might be too big by one, because the only indication
|
||||
// we have is the pointer to the start of the next item in the memory
|
||||
// image, the backwardsStateTable, which is 4 byte aligned.
|
||||
//
|
||||
int stateTableLength = backwardsStateTable - stateTable;
|
||||
if ((stateTableLength % numCategories) == 1) {
|
||||
stateTableLength -= 1;
|
||||
}
|
||||
|
||||
printf("\n\nState Table. *: end state %%: look ahead state\n");
|
||||
printf("C:\t");
|
||||
for (int i = 0; i < numCategories; i++) {
|
||||
printf("%d\t", i);
|
||||
}
|
||||
printf("\n=================================================");
|
||||
|
||||
for (int i = 0; i < stateTableLength; i++) {
|
||||
if (i % numCategories == 0) {
|
||||
putchar('\n');
|
||||
if (endStates[i / numCategories])
|
||||
putchar('*');
|
||||
else
|
||||
putchar(' ');
|
||||
if (lookaheadStates[i / numCategories]) {
|
||||
putchar('%');
|
||||
}
|
||||
else
|
||||
putchar(' ');
|
||||
printf("%d:\t", i / numCategories);
|
||||
}
|
||||
if (stateTable[i] == 0) {
|
||||
printf(".\t");
|
||||
} else {
|
||||
printf("%d\t", stateTable[i]);
|
||||
}
|
||||
}
|
||||
printf("\n\n\n");
|
||||
}
|
||||
#endif // RBBI_DEBUG
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -198,6 +198,14 @@ protected:
|
|||
*/
|
||||
virtual UBool isLookaheadState(int32_t state) const;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
//
|
||||
// Print out state table and character classes.
|
||||
// For debugging only.
|
||||
//
|
||||
void debugDumpTables() const;
|
||||
#endif
|
||||
|
||||
friend class RuleBasedBreakIterator;
|
||||
friend class DictionaryBasedBreakIterator;
|
||||
};
|
||||
|
|
|
@ -43,6 +43,10 @@ ubrk_open(UBreakIteratorType type,
|
|||
case UBRK_SENTENCE:
|
||||
result = BreakIterator::createSentenceInstance(Locale(locale), *status);
|
||||
break;
|
||||
|
||||
case UBRK_TITLE:
|
||||
result = BreakIterator::createTitleInstance(Locale(locale), *status);
|
||||
break;
|
||||
}
|
||||
|
||||
// check for allocation error
|
||||
|
|
|
@ -419,6 +419,26 @@ public:
|
|||
static BreakIterator* createSentenceInstance(const Locale& where,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Create a BreakIterator for title-casing breaks using the specified locale.
|
||||
* Returns an instance of a BreakIterator implementing title breaks.
|
||||
* @param where the locale.
|
||||
* @return A BreakIterator for title-breaks. The UErrorCode& status
|
||||
* parameter is used to return status information to the user.
|
||||
* To check whether the construction succeeded or not, you should check
|
||||
* the value of U_SUCCESS(err). If you wish more detailed information, you
|
||||
* can check for informational error results which still indicate success.
|
||||
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
|
||||
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
|
||||
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
|
||||
* used; neither the requested locale nor any of its fall back locales
|
||||
* could be found.
|
||||
* The caller owns the returned object and is responsible for deleting it.
|
||||
* @stable
|
||||
*/
|
||||
static BreakIterator* createTitleInstance(const Locale& where,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Get the set of Locales for which TextBoundaries are installed
|
||||
* @param count the output parameter of number of elements in the locale list
|
||||
|
|
|
@ -428,6 +428,9 @@ RuleBasedBreakIterator(UDataMemory* image);
|
|||
virtual BreakIterator * createBufferClone(void *stackBuffer,
|
||||
int32_t &BufferSize,
|
||||
UErrorCode &status);
|
||||
#ifdef RBBI_DEBUG
|
||||
void debugDumpTables() const;
|
||||
#endif
|
||||
|
||||
|
||||
protected:
|
||||
|
|
|
@ -41,6 +41,11 @@
|
|||
* stored as a base character and a diacritical mark. What users
|
||||
* consider to be a character can differ between languages.
|
||||
* <P>
|
||||
* Title boundary analysis locates all positions,
|
||||
* typically starts of words, that should be set to Title Case
|
||||
* when title casing the text.
|
||||
* <P>
|
||||
*
|
||||
* This is the interface for all text boundaries.
|
||||
* <P>
|
||||
* Examples:
|
||||
|
@ -177,7 +182,9 @@ enum UBreakIteratorType {
|
|||
/** Line breaks */
|
||||
UBRK_LINE,
|
||||
/** Sentence breaks */
|
||||
UBRK_SENTENCE
|
||||
UBRK_SENTENCE,
|
||||
/** Title Case breaks */
|
||||
UBRK_TITLE
|
||||
};
|
||||
typedef enum UBreakIteratorType UBreakIteratorType;
|
||||
|
||||
|
|
|
@ -132,7 +132,7 @@ TEST_DAT_FILES=$(TESTBUILDDIR)/test.dat
|
|||
|
||||
## BRK files
|
||||
# ALL of these files can be deleted (the following BRK files) - they are copied
|
||||
BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/word_th.brk
|
||||
BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/title.brk $(BUILDDIR)/word_th.brk
|
||||
# don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
|
||||
|
||||
## UCM files
|
||||
|
|
|
@ -220,7 +220,7 @@ testdata: ucadata.dat $(TRANSLIT_FILES) $(RB_FILES) {"$(ICUTOOLS)\genrb\$(CFG)"
|
|||
@echo building testdata...
|
||||
nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)"
|
||||
|
||||
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
|
||||
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\title.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
|
||||
|
||||
#invoke pkgdata for ICU common data
|
||||
# pkgdata will drop all output files (.dat, .dll, .lib) into the target (ICUDBLD) directory.
|
||||
|
@ -266,6 +266,9 @@ $(BRK_FILES:.brk" =.brk"
|
|||
"$(ICUDBLD)\word.brk" : "$(ICUBRK)\wordLE.brk"
|
||||
copy "$(ICUBRK)\wordLE.brk" "$(ICUDBLD)\word.brk"
|
||||
|
||||
"$(ICUDBLD)\title.brk" : "$(ICUBRK)\titleLE.brk"
|
||||
copy "$(ICUBRK)\titleLE.brk" "$(ICUDBLD)\title.brk"
|
||||
|
||||
"$(ICUDBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk"
|
||||
copy "$(ICUBRK)\line_thLE.brk" "$(ICUDBLD)\line_th.brk"
|
||||
|
||||
|
|
|
@ -708,6 +708,33 @@ void RBBITest::TestHindiWordBreak()
|
|||
delete e;
|
||||
delete hindiWordData;
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestTitleBreak()
|
||||
{
|
||||
UErrorCode status= U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
// titleI->debugDumpTables();
|
||||
|
||||
Vector *titleData = new Vector();
|
||||
titleData->addElement(" ");
|
||||
titleData->addElement("This ");
|
||||
titleData->addElement("is ");
|
||||
titleData->addElement("a ");
|
||||
titleData->addElement("simple ");
|
||||
titleData->addElement("sample ");
|
||||
titleData->addElement("sentence. ");
|
||||
titleData->addElement("This ");
|
||||
|
||||
generalIteratorTest(*titleI, titleData);
|
||||
delete titleI;
|
||||
delete titleData;
|
||||
}
|
||||
|
||||
/*
|
||||
//Bug: if there is no word break before and after danda when it is followed by a space
|
||||
void RBBITest::TestDanda()
|
||||
|
@ -979,6 +1006,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
if(exec) TestHindiCharacterBreak(); break;
|
||||
case 5: name = "TestHindiWordBreak";
|
||||
if(exec) TestHindiWordBreak(); break;
|
||||
case 6: name = "TestTitleBreak";
|
||||
if(exec) TestTitleBreak(); break;
|
||||
|
||||
// case 6: name = "TestDanda()";
|
||||
// if(exec) TestDanda(); break;
|
||||
// case 7: name = "TestHindiCharacterWrapping()";
|
||||
|
@ -1069,9 +1099,11 @@ Vector* RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& te
|
|||
while (p != RuleBasedBreakIterator::DONE) {
|
||||
p = bi.next();
|
||||
if (p != RuleBasedBreakIterator::DONE) {
|
||||
if (p <= lastP)
|
||||
if (p <= lastP) {
|
||||
errln((UnicodeString)"next() failed to move forward: next() on position "
|
||||
+ lastP + (UnicodeString)" yielded " + p);
|
||||
break;
|
||||
}
|
||||
|
||||
text.extractBetween(lastP, p, selection);
|
||||
result->addElement(selection);
|
||||
|
@ -1097,16 +1129,20 @@ Vector* RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString&
|
|||
while (p != RuleBasedBreakIterator::DONE) {
|
||||
p = bi.previous();
|
||||
if (p != RuleBasedBreakIterator::DONE) {
|
||||
if (p >= lastP)
|
||||
if (p >= lastP) {
|
||||
errln((UnicodeString)"previous() failed to move backward: previous() on position "
|
||||
+ lastP + (UnicodeString)" yielded " + p);
|
||||
break;
|
||||
}
|
||||
text.extractBetween(p, lastP, selection);
|
||||
result->insertElementAt(selection, 0);
|
||||
}
|
||||
else {
|
||||
if (lastP != 0)
|
||||
if (lastP != 0) {
|
||||
errln((UnicodeString)"previous() returned DONE prematurely: offset was "
|
||||
+ lastP + (UnicodeString)" instead of 0");
|
||||
break;
|
||||
}
|
||||
}
|
||||
lastP = p;
|
||||
}
|
||||
|
|
|
@ -51,6 +51,10 @@ public:
|
|||
* Tests Hindi (Devanagari) word iteration
|
||||
**/
|
||||
void TestHindiWordBreak(void);
|
||||
/**
|
||||
* Tests Title Case break iteration
|
||||
**/
|
||||
void TestTitleBreak(void);
|
||||
/**
|
||||
* Test Hindi Danda, i.e., make sure we have a break point before and after danda
|
||||
**/
|
||||
|
|
|
@ -99,7 +99,7 @@ all-local: build-local
|
|||
|
||||
DAT_FILES=uprops.dat unames.dat cnvalias.dat tz.dat
|
||||
# ALL of these files can be deleted (the following BRK files) - they are copied
|
||||
BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk word_th.brk
|
||||
BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk title.brk word_th.brk
|
||||
# don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
|
||||
|
||||
DATAFILESD=$(DAT_FILES:%=$(OBJDATADIR)/%)
|
||||
|
|
Loading…
Add table
Reference in a new issue