ICU-9353 merge dbbi-tries work into the trunk

X-SVN-Rev: 32184
This commit is contained in:
Maxime Serrano 2012-08-16 23:01:49 +00:00
parent 8bcdfa544d
commit c64c0299d7
43 changed files with 328856 additions and 2877 deletions

4
.gitattributes vendored
View file

@ -187,8 +187,8 @@ icu4c/source/tools/gencmn/gencmn.vcxproj -text
icu4c/source/tools/gencmn/gencmn.vcxproj.filters -text
icu4c/source/tools/gencnval/gencnval.vcxproj -text
icu4c/source/tools/gencnval/gencnval.vcxproj.filters -text
icu4c/source/tools/genctd/genctd.vcxproj -text
icu4c/source/tools/genctd/genctd.vcxproj.filters -text
icu4c/source/tools/gendict/gendict.vcxproj -text
icu4c/source/tools/gendict/gendict.vcxproj.filters -text
icu4c/source/tools/gennorm2/gennorm2.vcxproj -text
icu4c/source/tools/genrb/derb.vcxproj -text
icu4c/source/tools/genrb/derb.vcxproj.filters -text

15
.gitignore vendored
View file

@ -709,21 +709,6 @@ icu4c/source/tools/gencnval/gencnval.vcproj.*.*.user
icu4c/source/tools/gencnval/release
icu4c/source/tools/gencnval/x64
icu4c/source/tools/gencnval/x86
icu4c/source/tools/genctd/*.d
icu4c/source/tools/genctd/*.o
icu4c/source/tools/genctd/*.pdb
icu4c/source/tools/genctd/*.plg
icu4c/source/tools/genctd/*.vcxproj.user
icu4c/source/tools/genctd/Debug
icu4c/source/tools/genctd/Makefile
icu4c/source/tools/genctd/Release
icu4c/source/tools/genctd/debug
icu4c/source/tools/genctd/genctd
icu4c/source/tools/genctd/genctd.1
icu4c/source/tools/genctd/genctd.vcproj.*.*.user
icu4c/source/tools/genctd/release
icu4c/source/tools/genctd/x64
icu4c/source/tools/genctd/x86
icu4c/source/tools/gennorm2/*.d
icu4c/source/tools/gennorm2/*.o
icu4c/source/tools/gennorm2/*.pdb

View file

@ -52,7 +52,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iotest", "..\test\iotest\io
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icupkg", "..\tools\icupkg\icupkg.vcxproj", "{62D4B15D-7A90-4ECB-BA19-5E021D6A21BC}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "genctd", "..\tools\genctd\genctd.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
Project("{9D4211F7-2C77-439C-82F0-30A4E43BA569}") = "gendict", "..\tools\gendict\gendict.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "letest", "..\test\letest\letest.vcxproj", "{67351485-4D18-4245-BE39-A7EF0675ACD2}"
EndProject

View file

@ -90,6 +90,7 @@ bytestream.o stringpiece.o \
stringtriebuilder.o bytestriebuilder.o \
bytestrie.o bytestrieiterator.o \
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
dictionarydata.o \
appendable.o ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
@ -98,7 +99,7 @@ chariter.o schriter.o uchriter.o uiter.o \
patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
uscript.o usc_impl.o unames.o \
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
uidna.o usprep.o uts46.o punycode.o \

View file

@ -1,6 +1,6 @@
/*
************************************************************************************
* Copyright (C) 2006-2011, International Business Machines Corporation
* Copyright (C) 2006-2012, International Business Machines Corporation
* and others. All Rights Reserved.
************************************************************************************
*/
@ -11,7 +11,6 @@
#include "brkeng.h"
#include "dictbe.h"
#include "triedict.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
@ -20,6 +19,9 @@
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "dictionarydata.h"
#include "uvector.h"
#include "umutex.h"
#include "uresimp.h"
@ -219,21 +221,45 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType);
if (dict != NULL) {
DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
if (m != NULL) {
const LanguageBreakEngine *engine = NULL;
switch(code) {
case USCRIPT_THAI:
engine = new ThaiBreakEngine(dict, status);
engine = new ThaiBreakEngine(m, status);
break;
case USCRIPT_KHMER:
engine = new KhmerBreakEngine(dict, status);
engine = new KhmerBreakEngine(m, status);
break;
case USCRIPT_HANGUL:
engine = new CjkBreakEngine(m, kKorean, status);
break;
// use same BreakEngine and dictionary for both Chinese and Japanese
case USCRIPT_HIRAGANA:
case USCRIPT_KATAKANA:
case USCRIPT_HAN:
engine = new CjkBreakEngine(m, kChineseJapanese, status);
break;
#if 0
// TODO: Have to get some characters with script=common handled
// by CjkBreakEngine (e.g. U+309B). Simply subjecting
// them to CjkBreakEngine does not work. The engine has to
// special-case them.
case USCRIPT_COMMON:
{
UBlockCode block = ublock_getCode(code);
if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
engine = new CjkBreakEngine(dict, kChineseJapanese, status);
break;
}
#endif
default:
break;
}
if (engine == NULL) {
delete dict;
delete m;
}
else if (U_FAILURE(status)) {
delete engine;
@ -245,45 +271,61 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
return NULL;
}
const CompactTrieDictionary *
ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
UErrorCode status = U_ZERO_ERROR;
// Open root from brkitr tree.
char dictnbuff[256];
char ext[4]={'\0'};
// open root from brkitr tree.
char dictnbuf[256];
char ext[6] = {'\0'};
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
int32_t dictnlength = 0;
const UChar *dictfname = ures_getString(b, &dictnlength, &status);
if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuf)) {
dictnlength = 0;
status = U_BUFFER_OVERFLOW_ERROR;
}
if (U_SUCCESS(status) && dictfname) {
UChar* extStart=u_strchr(dictfname, 0x002e);
UChar *extStart = u_strchr(dictfname, 0x002e);
int len = 0;
if(extStart!=NULL){
len = (int)(extStart-dictfname);
u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
u_UCharsToChars(dictfname, dictnbuff, len);
if (extStart != NULL) {
len = (int)(extStart - dictfname);
u_UCharsToChars(extStart+1, ext, sizeof(ext)); // null-terminates the buffer
u_UCharsToChars(dictfname, dictnbuf, len);
}
dictnbuff[len]=0; // nul terminate
dictnbuf[len] = '\0'; // null-terminate
}
ures_close(b);
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuf, &status);
if (U_SUCCESS(status)) {
const CompactTrieDictionary *dict = new CompactTrieDictionary(
file, status);
if (U_SUCCESS(status) && dict == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
// build trie
const uint8_t *data = (const uint8_t *)udata_getMemory(file);
const int32_t *indexes = (const int32_t *)data;
const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
DictionaryMatcher *m = NULL;
if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
const char *characters = (const char *)(data + offset);
m = new BytesDictionaryMatcher(characters, transform, file);
}
if (U_FAILURE(status)) {
delete dict;
dict = NULL;
else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
const UChar *characters = (const UChar *)(data + offset);
m = new UCharsDictionaryMatcher(characters, file);
}
return dict;
if (m == NULL) {
// no matcher exists to take ownership - either we are an invalid
// type or memory allocation failed
udata_close(file);
}
return m;
} else if (dictfname != NULL) {
// we don't have a dictionary matcher.
// returning NULL here will cause us to fail to find a dictionary break engine, as expected
status = U_ZERO_ERROR;
return NULL;
}
return NULL;
}

View file

@ -1,6 +1,6 @@
/**
************************************************************************************
* Copyright (C) 2006-2007, International Business Machines Corporation and others. *
* Copyright (C) 2006-2012, International Business Machines Corporation and others. *
* All Rights Reserved. *
************************************************************************************
*/
@ -17,7 +17,7 @@ U_NAMESPACE_BEGIN
class UnicodeSet;
class UStack;
class CompactTrieDictionary;
class DictionaryMatcher;
/*******************************************************************
* LanguageBreakEngine
@ -259,8 +259,7 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
protected:
protected:
/**
* <p>Create a LanguageBreakEngine for the set of characters to which
* the supplied character belongs, for the specified break type.</p>
@ -273,17 +272,15 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
/**
* <p>Create a CompactTrieDictionary for the specified script and break type.</p>
*
* @param script An ISO 15924 script code that identifies the dictionary to be
* created.
* @param breakType The kind of text break for which a dictionary is
* sought.
* @return A CompactTrieDictionary with the desired characteristics, or 0.
*/
virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
/**
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
* @param script An ISO 15924 script code that identifies the dictionary to be
* created.
* @param breakType The kind of text break for which a dictionary is
* sought.
* @return A DictionaryMatcher with the desired characteristics, or NULL.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
};
U_NAMESPACE_END

View file

@ -248,7 +248,7 @@
<ClCompile Include="rbbisetb.cpp" />
<ClCompile Include="rbbistbl.cpp" />
<ClCompile Include="rbbitblb.cpp" />
<ClCompile Include="triedict.cpp" />
<ClCompile Include="dictionarydata.cpp" />
<ClCompile Include="ubrk.cpp" />
<ClCompile Include="ucol_swp.cpp">
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
@ -520,7 +520,7 @@
<ClInclude Include="rbbiscan.h" />
<ClInclude Include="rbbisetb.h" />
<ClInclude Include="rbbitblb.h" />
<ClInclude Include="triedict.h" />
<ClInclude Include="dictionarydata.h" />
<CustomBuild Include="unicode\ubrk.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2008,2011, International Business Machines Corporation *
* Copyright (C) 2006-2008,2012, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
@ -15,7 +15,10 @@
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"
U_NAMESPACE_BEGIN
@ -23,10 +26,6 @@ U_NAMESPACE_BEGIN
******************************************************************
*/
/*DictionaryBreakEngine::DictionaryBreakEngine() {
fTypes = 0;
}*/
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
fTypes = breakTypes;
}
@ -87,11 +86,6 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
fSet.compact();
}
/*void
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
fTypes = breakTypes;
}*/
/*
******************************************************************
*/
@ -105,34 +99,34 @@ DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
#define POSSIBLE_WORD_LIST_MAX 20
class PossibleWord {
private:
// list of word candidate lengths, in increasing length order
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
int count; // Count of candidates
int32_t prefix; // The longest match with a dictionary word
int32_t offset; // Offset in the text of these candidates
int mark; // The preferred candidate's offset
int current; // The candidate we're currently looking at
private:
// list of word candidate lengths, in increasing length order
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
int count; // Count of candidates
int32_t prefix; // The longest match with a dictionary word
int32_t offset; // Offset in the text of these candidates
int mark; // The preferred candidate's offset
int current; // The candidate we're currently looking at
public:
PossibleWord();
~PossibleWord();
public:
PossibleWord();
~PossibleWord();
// Fill the list of candidates if needed, select the longest, and return the number found
int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
// Fill the list of candidates if needed, select the longest, and return the number found
int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
// Select the currently marked candidate, point after it in the text, and invalidate self
int32_t acceptMarked( UText *text );
// Select the currently marked candidate, point after it in the text, and invalidate self
int32_t acceptMarked( UText *text );
// Back up from the current candidate to the next shorter one; return TRUE if that exists
// and point the text after it
UBool backUp( UText *text );
// Back up from the current candidate to the next shorter one; return TRUE if that exists
// and point the text after it
UBool backUp( UText *text );
// Return the longest prefix this candidate location shares with a dictionary word
int32_t longestPrefix();
// Return the longest prefix this candidate location shares with a dictionary word
int32_t longestPrefix();
// Mark the current candidate as the one we like
void markCurrent();
// Mark the current candidate as the one we like
void markCurrent();
};
inline
@ -145,7 +139,7 @@ PossibleWord::~PossibleWord() {
}
inline int
PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
int32_t start = (int32_t)utext_getNativeIndex(text);
if (start != offset) {
@ -211,7 +205,7 @@ PossibleWord::markCurrent() {
// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
fDictionary(adoptDictionary)
{
@ -266,10 +260,9 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
// If we found exactly one, use that
if (candidates == 1) {
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
wordsFound += 1;
}
// If there was more than one, see which one can take us forward the most words
else if (candidates > 1) {
// If we're already at the end of the range, we're done
@ -278,7 +271,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
}
do {
int wordsMatched = 1;
if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
if (wordsMatched < 2) {
// Followed by another dictionary word; mark first word as a good candidate
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
@ -293,17 +286,17 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
// See if any of the possible second words is followed by a third word
do {
// If we find a third word, stop right away
if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
words[wordsFound % THAI_LOOKAHEAD].markCurrent();
goto foundBest;
}
}
while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
}
}
while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
wordsFound += 1;
}
@ -316,7 +309,7 @@ foundBest:
// if it is a dictionary word, do nothing. If it isn't, then if there is
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
&& (wordLength == 0
|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
@ -339,8 +332,8 @@ foundBest:
// two characters after uc were not 0x0E4C THANTHAKHAT before
// checking the dictionary. That is just a performance filter,
// but it's not clear it's faster than checking the trie.
int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
utext_setNativeIndex(text, current+wordLength+chars);
int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
utext_setNativeIndex(text, current + wordLength + chars);
if (candidates > 0) {
break;
}
@ -438,8 +431,8 @@ foundBest:
// Minimum number of characters for two words
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
KhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
fDictionary(adoptDictionary)
{
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
@ -511,10 +504,10 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
}
do {
int wordsMatched = 1;
if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
if (wordsMatched < 2) {
// Followed by another dictionary word; mark first word as a good candidate
words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
wordsMatched = 2;
}
@ -526,17 +519,17 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
// See if any of the possible second words is followed by a third word
do {
// If we find a third word, stop right away
if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
goto foundBest;
}
}
while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text));
while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
}
}
while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text));
while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
wordsFound += 1;
}
@ -549,9 +542,9 @@ foundBest:
// if it is a dictionary word, do nothing. If it isn't, then if there is
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
&& (wordLength == 0
|| words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
|| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
//TODO: This section will need a rework for UText.
int32_t remaining = rangeEnd - (current+wordLength);
@ -568,7 +561,7 @@ foundBest:
}
if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
// Maybe. See if it's in the dictionary.
int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
utext_setNativeIndex(text, current+wordLength+chars);
if (candidates > 0) {
break;
@ -651,6 +644,296 @@ foundBest:
return wordsFound;
}
/*
 ******************************************************************
 * CjkBreakEngine
 */

// Sentinel "unreachable" cost used by the dynamic-programming pass in
// divideUpDictionaryRange (same value as UINT32_MAX).
static const uint32_t kuint32max = 0xFFFFFFFF;

/**
 * Creates a dictionary-based break engine for CJK text.
 *
 * @param adoptDictionary The dictionary matcher to use. Ownership is
 *                        adopted; it is deleted in the destructor.
 * @param type            kKorean or kChineseJapanese; selects the set of
 *                        characters this engine claims to handle.
 * @param status          Standard ICU in/out error code; the character
 *                        sets are only installed on success.
 */
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { // Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            // Add the prolonged sound marks as individual code points.
            // The previous code passed UNICODE_STRING_SIMPLE("\\uff70\\u30fc")
            // to add(); UNICODE_STRING_SIMPLE does not unescape \u sequences,
            // and add(UnicodeString) inserts a single multi-character string
            // element, so neither code point was actually handled.
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}
// Destructor. The engine owns the DictionaryMatcher adopted in the
// constructor, so it is released here.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary.
static const int kMaxKatakanaLength = 8;        // longest run with a tabulated cost
static const int kMaxKatakanaGroupLength = 20;  // longest run considered a single candidate word
static const uint32_t maxSnlp = 255;            // highest (least likely) per-character cost

/**
 * Returns the heuristic cost of treating a katakana run of the given
 * length as one word. Any length outside [0, kMaxKatakanaLength] gets
 * the maximum tabulated cost (8192).
 */
static inline uint32_t getKatakanaCost(int wordLength) {
    // TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1] =
        {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    // Guard both ends of the range: the original only checked the upper
    // bound, so a negative wordLength would have read katakanaCost[-1]
    // out of bounds (undefined behavior). No current caller passes a
    // negative length, but the guard makes the helper safe on its own.
    if (wordLength < 0 || wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
// True for UTF-16 code units in the fullwidth Katakana block
// (U+30A1..U+30FE, excluding U+30FB KATAKANA MIDDLE DOT) or the
// halfwidth Katakana range (U+FF66..U+FF9F).
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        // Fullwidth block: everything except the middle dot counts.
        return value != 0x30FBu;
    }
    // Halfwidth forms.
    return value >= 0xFF66u && value <= 0xFF9Fu;
}
// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange. Requests of up to N elements are served from
// an internal stack array; larger requests fall back to the heap.
// Contents are uninitialized; T is assumed to be a trivial (POD) type,
// since no constructors run on the heap path.
template<class T, size_t N>
class AutoBuffer {
public:
    // Allocates room for at least 'size' elements.
    // NOTE(review): if uprv_malloc fails, buffer is left NULL and callers
    // index it unchecked — TODO consider reporting U_MEMORY_ALLOCATION_ERROR.
    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
        if (size > N) {
            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
            capacity = size;
        }
    }
    ~AutoBuffer() {
        if (buffer != stackBuffer)
            uprv_free(buffer);
    }

    // Raw pointer to the current storage.
    T* elems() {
        return buffer;
    }

    const T& operator[] (size_t i) const {
        return buffer[i];
    }

    T& operator[] (size_t i) {
        return buffer[i];
    }

    // Resize without copy: grows capacity to at least 'size'; existing
    // contents are NOT preserved. Never shrinks.
    void resize(size_t size) {
        if (size <= capacity)
            return;
        if (buffer != stackBuffer)
            uprv_free(buffer);
        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
        capacity = size;
    }

private:
    T stackBuffer[N];
    T* buffer;
    AutoBuffer();
    // Not copyable: the implicitly generated copy constructor and
    // assignment operator would copy the raw 'buffer' pointer, leading
    // to a double free (and, on the stack path, to aliasing another
    // object's stackBuffer). Declared but not defined — C++03-style
    // suppression, matching this code's pre-C++11 vintage.
    AutoBuffer(const AutoBuffer &other);
    AutoBuffer &operator=(const AutoBuffer &other);
    size_t capacity;
};
/*
 * Divide up a range of CJK characters handled by this engine, using a
 * shortest-normalized-language-path (snlp) dynamic program over the
 * NFKC-normalized form of the text, plus a heuristic for katakana runs.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to do.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    UNormalizationMode norm_mode = UNORM_NFKC;
    // Already-normalized input can be used as-is, avoiding a copy.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = UTF-16 offset (in the raw extracted string) just
    // after the i-th code point of the (normalized) text.
    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Identity mapping: walk code points of the input directly.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        // Normalize, then record where each normalized code point maps
        // back into the raw input (via the incremental Normalizer).
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    // NOTE(review): the early returns above and below leave normalizedText
    // without a matching utext_close; presumably a UTEXT_INITIALIZER that
    // failed to open holds no resources — confirm.
    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched. kuint32max == unreachable.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // Per-position dictionary match results (lengths and snlp values).
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions not reachable by any segmentation so far.
        if (bestSnlp[i] == kuint32max)
            continue;

        // 'count' is filled in by matches(); presumably it is always set
        // when the call returns — TODO confirm against DictionaryMatcher.
        int count;
        // limit maximum word length matched to size of current substring
        // NOTE(review): signed/unsigned mix — 'i + maxWordSize' is size_t
        // while numChars is int; safe here since both are non-negative.
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax: extend the best path ending at i by each candidate word.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only start a run at its first katakana character.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs at or beyond kMaxKatakanaGroupLength are not treated
            // as a single word.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006,2011, International Business Machines Corporation *
* Copyright (C) 2006,2012, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
@ -16,7 +16,7 @@
U_NAMESPACE_BEGIN
class TrieWordDictionary;
class DictionaryMatcher;
/*******************************************************************
* DictionaryBreakEngine
@ -65,31 +65,31 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
*/
virtual ~DictionaryBreakEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles( UChar32 c, int32_t breakType ) const;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A UText representing the text. The iterator is left at
* the end of the run of characters which the engine is capable of handling
* that starts from the first (or last) character in the range.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
@ -114,7 +114,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
// virtual void setBreakTypes( uint32_t breakTypes );
/**
* <p>Divide up a range of known dictionary characters.</p>
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
@ -135,7 +135,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
/**
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
* TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
* dictionary and heuristics to determine Thai-specific breaks.</p>
*
* <p>After it is constructed a ThaiBreakEngine may be shared between
* threads without synchronization.</p>
@ -152,17 +152,17 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
UnicodeSet fBeginWordSet;
UnicodeSet fSuffixSet;
UnicodeSet fMarkSet;
const TrieWordDictionary *fDictionary;
DictionaryMatcher *fDictionary;
public:
/**
* <p>Default constructor.</p>
*
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
*/
ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
@ -171,7 +171,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
protected:
/**
* <p>Divide up a range of known dictionary characters.</p>
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
@ -186,6 +186,66 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
};
/*******************************************************************
* CjkBreakEngine
*/
// Indicates the language/script family a CjkBreakEngine instance handles.
// Korean and Chinese/Japanese use different dictionaries and character sets
// (see the fHangulWordSet vs. fHan/fKatakana/fHiragana sets below).
enum LanguageType {
    kKorean,
    kChineseJapanese
};
/**
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
* dictionary with costs associated with each word and
* Viterbi decoding to determine CJK-specific breaks.</p>
*/
class CjkBreakEngine : public DictionaryBreakEngine {
 protected:
    /**
     * The set of characters handled by this engine
     * @internal
     */
    UnicodeSet                fHangulWordSet;
    UnicodeSet                fHanWordSet;
    UnicodeSet                fKatakanaWordSet;
    UnicodeSet                fHiraganaWordSet;

    // Owned; adopted in the constructor and deleted by the destructor.
    DictionaryMatcher  *fDictionary;

 public:
    /**
     * <p>Default constructor.</p>
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted. The DictionaryMatcher must contain costs for each word
     * in order for the dictionary to work properly.
     */
    CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);

    /**
     * <p>Virtual destructor.</p>
     */
    virtual ~CjkBreakEngine();

 protected:
    /**
     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output of C array of int32_t break positions, or 0
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
            int32_t rangeStart,
            int32_t rangeEnd,
            UStack &foundBreaks ) const;

};
/*******************************************************************
* KhmerBreakEngine
@ -209,7 +269,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
const TrieWordDictionary *fDictionary;
DictionaryMatcher *fDictionary;
public:
@ -219,7 +279,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
*/
KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>

View file

@ -0,0 +1,218 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
// Closes the UDataMemory adopted at construction; the UChar trie text read
// by matches() lives inside that mapped data -- TODO confirm against callers.
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
    udata_close(file);
}
// Type tag for this matcher: it walks a UChars-serialized trie.
int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
}
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int *lengths, int &count, int limit, int32_t *values) const {
UCharsTrie uct(characters);
UChar32 c = utext_next32(text);
if (c < 0) {
return 0;
}
UStringTrieResult result = uct.first(c);
int32_t numChars = 1;
count = 0;
for (;;) {
if (USTRINGTRIE_HAS_VALUE(result)) {
if (count < limit) {
if (values != NULL) {
values[count] = uct.getValue();
}
lengths[count++] = numChars;
}
if (result == USTRINGTRIE_FINAL_VALUE) {
break;
}
}
else if (result == USTRINGTRIE_NO_MATCH) {
break;
}
// TODO: why do we have a text limit if the UText knows its length?
if (numChars >= maxLength) {
break;
}
c = utext_next32(text);
if (c < 0) {
break;
}
++numChars;
result = uct.next(c);
}
return numChars;
}
// Closes the UDataMemory adopted at construction; the byte trie text read
// by matches() lives inside that mapped data -- TODO confirm against callers.
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
    udata_close(file);
}
// Maps a code point to the single-byte value used in the BytesTrie,
// applying the dictionary's offset transform when one is configured.
// Returns U_SENTINEL for code points the transform cannot represent.
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    // No offset transform configured: the byte value is the code point itself.
    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        return c;
    }
    // ZWJ / ZWNJ get fixed byte values outside the offset window.
    if (c == 0x200D) {   // U+200D ZERO WIDTH JOINER
        return 0xFF;
    }
    if (c == 0x200C) {   // U+200C ZERO WIDTH NON-JOINER
        return 0xFE;
    }
    int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    // Only offset..offset+0xFD is representable.
    return (0 <= delta && delta <= 0xFD) ? (UChar32)delta : U_SENTINEL;
}
// Type tag for this matcher: it walks a byte-serialized trie.
int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
}
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int *lengths, int &count, int limit, int32_t *values) const {
BytesTrie bt(characters);
UChar32 c = utext_next32(text);
if (c < 0) {
return 0;
}
UStringTrieResult result = bt.first(transform(c));
int32_t numChars = 1;
count = 0;
for (;;) {
if (USTRINGTRIE_HAS_VALUE(result)) {
if (count < limit) {
if (values != NULL) {
values[count] = bt.getValue();
}
lengths[count++] = numChars;
}
if (result == USTRINGTRIE_FINAL_VALUE) {
break;
}
}
else if (result == USTRINGTRIE_NO_MATCH) {
break;
}
// TODO: why do we have a text limit if the UText knows its length?
if (numChars >= maxLength) {
break;
}
c = utext_next32(text);
if (c < 0) {
break;
}
++numChars;
result = bt.next(transform(c));
}
return numChars;
}
U_NAMESPACE_END
U_NAMESPACE_USE
// Byte-order swapper for .dict dictionary data files (format "Dict", v1).
// Follows the standard ICU UDataSwapFn contract: when length < 0 this only
// preflights (returns the total size without writing), otherwise it swaps
// the header, the indexes[] array, and the trie body into outData.
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;
    const uint8_t *inBytes;
    uint8_t *outBytes;
    const int32_t *inIndexes;
    int32_t indexes[DictionaryData::IX_COUNT];
    int32_t i, offset, size;

    // Swap the generic ICU data header first; this also validates pErrorCode.
    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;

    // Verify the data format signature "Dict" (0x44 0x69 0x63 0x74), v1.
    pInfo = (const UDataInfo *)((const char *)inData + 4);
    if (!(pInfo->dataFormat[0] == 0x44 &&
          pInfo->dataFormat[1] == 0x69 &&
          pInfo->dataFormat[2] == 0x63 &&
          pInfo->dataFormat[3] == 0x74 &&
          pInfo->formatVersion[0] == 1)) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes = (const uint8_t *)inData + headerSize;
    outBytes = (uint8_t *)outData + headerSize;
    inIndexes = (const int32_t *)inBytes;

    // When transforming (length >= 0), make sure at least the indexes fit.
    if (length >= 0) {
        length -= headerSize;
        if (length < (int32_t)(sizeof(indexes))) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input byte order so offsets/sizes are usable here.
    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    size = indexes[DictionaryData::IX_TOTAL_SIZE];
    if (length >= 0) {
        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
        // Copy everything first; parts that need swapping are overwritten below.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        offset = 0;
        // Swap the int32_t indexes[] themselves.
        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
        offset = (int32_t)sizeof(indexes);
        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];

        // The trie body: UChars need 16-bit swapping, bytes are order-free.
        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
            // nothing to do
        } else {
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }

        // these next two sections are empty in the current format,
        // but may be used later.
        offset = nextOffset;
        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
        offset = nextOffset;
        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
        offset = nextOffset;
    }
    return headerSize + size;
}

View file

@ -0,0 +1,160 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* dictionarydata.h
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
#ifndef __DICTIONARYDATA_H__
#define __DICTIONARYDATA_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utext.h"
#include "unicode/udata.h"
#include "udataswp.h"
#include "unicode/uobject.h"
#include "unicode/ustringtrie.h"
U_NAMESPACE_BEGIN
class UCharsTrie;
class BytesTrie;
// Constants describing the layout of .dict dictionary data files.
// This class is never instantiated; it only scopes the constants used by
// the matchers in this header and by udict_swap() / the gendict tool.
class U_COMMON_API DictionaryData : public UMemory {
public:
    // Low bits of indexes[IX_TRIE_TYPE]: which trie serialization is stored.
    static const int32_t TRIE_TYPE_BYTES = 0;
    static const int32_t TRIE_TYPE_UCHARS = 1;
    static const int32_t TRIE_TYPE_MASK = 7;
    // Flag bit: the trie maps words to values rather than all-zero.
    static const int32_t TRIE_HAS_VALUES = 8;

    // indexes[IX_TRANSFORM] values: how Unicode is mapped to trie bytes
    // (only meaningful for TRIE_TYPE_BYTES; see transform() in the .cpp).
    static const int32_t TRANSFORM_NONE = 0;
    static const int32_t TRANSFORM_TYPE_OFFSET = 0x1000000;
    static const int32_t TRANSFORM_TYPE_MASK = 0x7f000000;
    // Lower 21 bits of the transform constant hold the offset code point.
    static const int32_t TRANSFORM_OFFSET_MASK = 0x1fffff;

    enum {
        // Byte offsets from the start of the data, after the generic header.
        IX_STRING_TRIE_OFFSET,
        IX_RESERVED1_OFFSET,
        IX_RESERVED2_OFFSET,
        IX_TOTAL_SIZE,
        // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
        IX_TRIE_TYPE,
        // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
        IX_TRANSFORM,
        IX_RESERVED6,
        IX_RESERVED7,
        IX_COUNT
    };
};
/**
* Wrapper class around generic dictionaries, implementing matches().
* getType() should return a TRIE_TYPE_??? constant from DictionaryData.
*
* All implementations of this interface must be threadsafe if they are to be used inside of the
* dictionary-based break iteration code.
*/
class U_COMMON_API DictionaryMatcher {
public:
// this should emulate CompactTrieDictionary::matches()
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const = 0;
/** @return DictionaryData::TRIE_TYPE_XYZ */
virtual int32_t getType() const = 0;
};
// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
public:
// constructs a new UCharsDictionaryMatcher.
// The UDataMemory * will be closed on this object's destruction.
UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
~UCharsDictionaryMatcher();
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const;
virtual int32_t getType() const;
private:
const UChar *characters;
UDataMemory *file;
};
// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
public:
// constructs a new BytesTrieDictionaryMatcher
// the transform constant should be the constant read from the file, not a masked version!
// the UDataMemory * fed in here will be closed on this object's destruction
BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f) : characters(c), transformConstant(t), file(f) { }
~BytesDictionaryMatcher();
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const;
virtual int32_t getType() const;
private:
UChar32 transform(UChar32 c) const;
const char *characters;
int32_t transformConstant;
UDataMemory *file;
};
U_NAMESPACE_END
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
/**
* Format of dictionary .dict data files.
* Format version 1.0.
*
* A dictionary .dict data file contains a byte-serialized BytesTrie or
* a UChars-serialized UCharsTrie.
* Such files are used in dictionary-based break iteration (DBBI).
*
* For a BytesTrie, a transformation type is specified for
* transforming Unicode strings into byte sequences.
*
* A .dict file begins with a standard ICU data file header
* (DataHeader, see ucmndata.h and unicode/udata.h).
* The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
*
* After the header, the file contains the following parts.
* Constants are defined in the DictionaryData class.
*
* For the data structure of BytesTrie & UCharsTrie see
* http://site.icu-project.org/design/struct/tries
* and the bytestrie.h and ucharstrie.h header files.
*
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
*
* The first four indexes are byte offsets in ascending order.
* Each byte offset marks the start of the next part in the data file,
* and the end of the previous one.
* When two consecutive byte offsets are the same, then the corresponding part is empty.
* Byte offsets are offsets from after the header,
* that is, from the beginning of the indexes[].
* Each part starts at an offset with proper alignment for its data.
* If necessary, the previous part may include padding bytes to achieve this alignment.
*
* trieType=indexes[IX_TRIE_TYPE] defines the trie type.
* transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
* If the transformation type is TRANSFORM_TYPE_OFFSET,
* then the lower 21 bits contain the offset code point.
* Each code point c is mapped to byte b = (c - offset).
* Code points outside the range offset..(offset+0xff) cannot be mapped
* and do not occur in the dictionary.
*
* stringTrie; -- a serialized BytesTrie or UCharsTrie
*
* The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
* or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
*/
#endif /* !UCONFIG_NO_BREAK_ITERATION */
#endif /* __DICTIONARYDATA_H__ */

View file

@ -1615,10 +1615,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
int32_t endPos,
UBool reverse) {
// Reset the old break cache first.
uint32_t dictionaryCount = fDictionaryCharCount;
reset();
if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
// note: code segment below assumes that dictionary chars are in the
// startPos-endPos range
// value returned should be next character in sequence
if ((endPos - startPos) <= 1) {
return (reverse ? startPos : endPos);
}
@ -1771,7 +1773,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
// proposed break by one of the breaks we found. Use following() and
// preceding() to do the work. They should never recurse in this case.
if (reverse) {
return preceding(endPos - 1);
return preceding(endPos);
}
else {
return following(startPos);
@ -1861,7 +1863,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
//-------------------------------------------------------------------------------
//
// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
// the characer c.
// the character c.
//
//-------------------------------------------------------------------------------
const LanguageBreakEngine *

File diff suppressed because it is too large Load diff

View file

@ -1,346 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#ifndef TRIEDICT_H
#define TRIEDICT_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
struct UEnumeration;
struct UDataSwapper;
struct UDataMemory;
/**
* <p>UDataSwapFn function for use in swapping a compact dictionary.</p>
*
* @param ds Pointer to UDataSwapper containing global data about the
* transformation and function pointers for handling primitive
* types.
* @param inData Pointer to the input data to be transformed or examined.
* @param length Length of the data, counting bytes. May be -1 for preflighting.
* If length>=0, then transform the data.
* If length==-1, then only determine the length of the data.
* The length cannot be determined from the data itself for all
* types of data (e.g., not for simple arrays of integers).
* @param outData Pointer to the output data buffer.
* If length>=0 (transformation), then the output buffer must
* have a capacity of at least length.
* If length==-1, then outData will not be used and can be NULL.
* @param pErrorCode ICU UErrorCode parameter, must not be NULL and must
* fulfill U_SUCCESS on input.
* @return The actual length of the data.
*
* @see UDataSwapper
*/
U_CAPI int32_t U_EXPORT2
triedict_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
U_NAMESPACE_BEGIN
class StringEnumeration;
struct CompactTrieHeader;
/*******************************************************************
* TrieWordDictionary
*/
/**
* <p>TrieWordDictionary is an abstract class that represents a word
* dictionary based on a trie. The base protocol is read-only.
* Subclasses may allow writing.</p>
*/
class U_COMMON_API TrieWordDictionary : public UMemory {
public:
/**
* <p>Default constructor.</p>
*
*/
TrieWordDictionary();
/**
* <p>Virtual destructor.</p>
*/
virtual ~TrieWordDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param start The current position in text.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit ) const = 0;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const = 0;
};
/*******************************************************************
* MutableTrieDictionary
*/
/**
* <p>MutableTrieDictionary is a TrieWordDictionary that allows words to be
* added.</p>
*/
struct TernaryNode; // Forwards declaration
class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
private:
/**
* The root node of the trie
* @internal
*/
TernaryNode *fTrie;
/**
* A UText for internal use
* @internal
*/
UText *fIter;
friend class CompactTrieDictionary; // For fast conversion
public:
/**
* <p>Constructor.</p>
*
* @param median A UChar around which to balance the trie. Ideally, it should
* begin at least one word that is near the median of the set in the dictionary
* @param status A status code recording the success of the call.
*/
MutableTrieDictionary( UChar median, UErrorCode &status );
/**
* <p>Virtual destructor.</p>
*/
virtual ~MutableTrieDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit ) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const;
/**
* <p>Add one word to the dictionary.</p>
*
* @param word A UChar buffer containing the word.
* @param length The length of the word.
* @param status The resultant status
*/
virtual void addWord( const UChar *word,
int32_t length,
UErrorCode &status);
#if 0
/**
* <p>Add all strings from a UEnumeration to the dictionary.</p>
*
* @param words A UEnumeration that will return the desired words.
* @param status The resultant status
*/
virtual void addWords( UEnumeration *words, UErrorCode &status );
#endif
protected:
/**
* <p>Search the dictionary for matches.</p>
*
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @param parent The parent of the current node
* @param pMatched The returned parent node matched the input
* @return The number of characters in text that were matched.
*/
virtual int32_t search( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit,
TernaryNode *&parent,
UBool &pMatched ) const;
private:
/**
* <p>Private constructor. The root node it not allocated.</p>
*
* @param status A status code recording the success of the call.
*/
MutableTrieDictionary( UErrorCode &status );
};
/*******************************************************************
* CompactTrieDictionary
*/
/**
* <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
* to save space.</p>
*/
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
private:
/**
* The root node of the trie
*/
const CompactTrieHeader *fData;
/**
* A UBool indicating whether or not we own the fData.
*/
UBool fOwnData;
UDataMemory *fUData;
public:
/**
* <p>Construct a dictionary from a UDataMemory.</p>
*
* @param data A pointer to a UDataMemory, which is adopted
* @param status A status code giving the result of the constructor
*/
CompactTrieDictionary(UDataMemory *dataObj, UErrorCode &status);
/**
* <p>Construct a dictionary from raw saved data.</p>
*
* @param data A pointer to the raw data, which is still owned by the caller
* @param status A status code giving the result of the constructor
*/
CompactTrieDictionary(const void *dataObj, UErrorCode &status);
/**
* <p>Construct a dictionary from a MutableTrieDictionary.</p>
*
* @param dict The dictionary to use as input.
* @param status A status code recording the success of the call.
*/
CompactTrieDictionary( const MutableTrieDictionary &dict, UErrorCode &status );
/**
* <p>Virtual destructor.</p>
*/
virtual ~CompactTrieDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
int32_t rangeEnd,
int32_t *lengths,
int &count,
int limit ) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const;
/**
* <p>Return the size of the compact data.</p>
*
* @return The size of the dictionary's compact data.
*/
virtual uint32_t dataSize() const;
/**
* <p>Return a void * pointer to the compact data, platform-endian.</p>
*
* @return The data for the compact dictionary, suitable for passing to the
* constructor.
*/
virtual const void *data() const;
/**
* <p>Return a MutableTrieDictionary clone of this dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A MutableTrieDictionary with the same data as this dictionary
*/
virtual MutableTrieDictionary *cloneMutable( UErrorCode &status ) const;
private:
/**
* <p>Convert a MutableTrieDictionary into a compact data blob.</p>
*
* @param dict The dictionary to convert.
* @param status A status code recording the success of the call.
* @return A single data blob starting with a CompactTrieHeader.
*/
static CompactTrieHeader *compactMutableTrieDictionary( const MutableTrieDictionary &dict,
UErrorCode &status );
};
U_NAMESPACE_END
/* TRIEDICT_H */
#endif

View file

@ -7498,7 +7498,7 @@ echo "CXXFLAGS=$CXXFLAGS"
# output the Makefiles
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gendict/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@ -8244,7 +8244,7 @@ do
"tools/genccode/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genccode/Makefile" ;;
"tools/gencmn/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencmn/Makefile" ;;
"tools/gencnval/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencnval/Makefile" ;;
"tools/genctd/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genctd/Makefile" ;;
"tools/gendict/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gendict/Makefile" ;;
"tools/gentest/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
"tools/gennorm2/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm2/Makefile" ;;
"tools/genbrk/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genbrk/Makefile" ;;

View file

@ -1229,7 +1229,7 @@ AC_CONFIG_FILES([icudefs.mk \
tools/genccode/Makefile \
tools/gencmn/Makefile \
tools/gencnval/Makefile \
tools/genctd/Makefile \
tools/gendict/Makefile \
tools/gentest/Makefile \
tools/gennorm2/Makefile \
tools/genbrk/Makefile \

View file

@ -250,10 +250,10 @@ BREAK_TREE=brkitr
ALL_BRK_SOURCE= $(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=$(BREAK_TREE)/%.brk)
BRK_FILES=$(ALL_BRK_SOURCE:%.txt=$(BRKBLDDIR)/%.brk)
ifdef BRK_CTD_SOURCE
ALL_CTD_SOURCE=$(BRK_CTD_SOURCE) $(BRK_CTD_SOURCE_LOCAL)
CTD_FILES_SHORT=$(ALL_CTD_SOURCE:%.txt=$(BREAK_TREE)/%.ctd)
CTD_FILES=$(ALL_CTD_SOURCE:%.txt=$(BRKBLDDIR)/%.ctd)
ifdef BRK_DICT_SOURCE
ALL_DICT_SOURCE=$(BRK_DICT_SOURCE) $(BRK_DICT_SOURCE_LOCAL)
DICT_FILES_SHORT=$(ALL_DICT_SOURCE:%.txt=$(BREAK_TREE)/%.dict)
DICT_FILES=$(ALL_DICT_SOURCE:%.txt=$(BRKBLDDIR)/%.dict)
endif
ifdef BRK_RES_SOURCE
BRS_SRC= root.txt $(BRK_RES_SOURCE) $(BRK_RES_SOURCE_LOCAL)
@ -417,11 +417,11 @@ SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp)
SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp)
## All generated files
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(CNV_FILES_SPECIAL) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(CNV_FILES_SPECIAL) $(BRK_FILES) $(DICT_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) $(CURR_INDEX_FILE) $(LANG_INDEX_FILE) $(REGION_INDEX_FILE) $(ZONE_INDEX_FILE) $(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE)
# a list to use in the .lst files (package-relative)
COLL_FILES_LIST=$(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT)
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT)
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(DICT_FILES_SHORT)
LOCALE_FILES_LIST= $(RES_FILES_SHORT) $(LANG_FILES_SHORT) $(REGION_FILES_SHORT) $(ZONE_FILES_SHORT)
MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CNV_FILES_SHORT_SPECIAL) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
UNI_CORE_DATA=pnames.icu uprops.icu ucase.icu ubidi.icu
@ -516,11 +516,20 @@ $(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BU
$(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genbrk$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/genbrk -c -i $(BUILDDIR) -r $< -o $@
#################################################### CTD
# CTD FILES
#################################################### DICT
# DICT FILES
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
# .dict file generated regardless of whether dictionary file exists
$(BRKBLDDIR)/%.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --uchars -c -i $(BUILDDIR) $(BRKSRCDIR)/$(*F).txt $@
$(BRKBLDDIR)/thaidict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e00 -c -i $(BUILDDIR) $(BRKSRCDIR)/thaidict.txt $(BRKBLDDIR)/thaidict.dict
# TODO: figure out why combining characters are here?
$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
#################################################### CFU
# CFU FILES

View file

@ -33,15 +33,14 @@ BRK_RES_SYNTHETIC_ALIAS =
BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)
# List of compact trie dictionary files (ctd).
BRK_CTD_SOURCE = thaidict.txt khmerdict.txt
# List of dictionary files (dict).
BRK_DICT_SOURCE = thaidict.txt khmerdict.txt cjdict.txt
# List of break iterator files (brk).
BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt word_ja.txt line_ja.txt char.txt word.txt line.txt sent.txt title.txt
BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt line_ja.txt char.txt word.txt line.txt sent.txt title.txt
# Ordinary resources
BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt\
fi.txt ja.txt
fi.txt

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
 Copyright (c) 2011-2012 International Business Machines Corporation
and others. All Rights Reserved.
 # Copyright (c) 2011-2012 International Business Machines Corporation
# and others. All Rights Reserved.
កក
កកកុញ
@ -23380,7 +23380,7 @@
ថ្ងៃមានឫក្ស
ថ្ងៃមិញ
ថ្ងៃមុខ
ថ្ងៃមុន
ថ្ងៃមុន
ថ្ងៃមួយ
ថ្ងៃម្សិល
ថ្ងៃម្សិលមិញ

View file

@ -16,7 +16,10 @@ root{
word:process(dependency){"word.brk"}
}
dictionaries{
Khmr:process(dependency){"khmerdict.ctd"}
Thai:process(dependency){"thaidict.ctd"}
Khmr:process(dependency){"khmerdict.dict"}
Thai:process(dependency){"thaidict.dict"}
Hani:process(dependency){"cjdict.dict"}
Hira:process(dependency){"cjdict.dict"}
Kata:process(dependency){"cjdict.dict"}
}
}

View file

@ -1,5 +1,5 @@
 Copyright (c) 2006 International Business Machines Corporation,
Apple Computer, Inc., and others. All Rights Reserved.
 # Copyright (c) 2006-2012 International Business Machines Corporation,
# Apple Computer, Inc., and others. All Rights Reserved.
กก
กกขนาก
กกช้าง
@ -5400,7 +5400,7 @@
ดิ้นรน
ดิ้ว
ดี
ดี.ซี.
# ดี.ซี. -- TODO: why does this have full stop in it?
ดีกรี
ดีงู
ดีฉัน
@ -15972,8 +15972,8 @@
วิ่งเปี้ยว
วิ่น
วี
วี.ดี.
วี.ไอ.พี.
# วี.ดี. # TODO: why do these have full stops?
# วี.ไอ.พี.
วีค
วีจิ
วีชนี
@ -16357,9 +16357,9 @@
ษัษฐ
ษัษฐี
ษิโณทก
ส.ธรนินทร์
ส.ธรรมภักดี
ส.นิยม
# ส.ธรนินทร์ -- TODO: why do these have full stops?
# ส.ธรรมภักดี
# ส.นิยม
สก
สกฏ
สกฏภาร
@ -23311,7 +23311,7 @@
เห่า
เห้งเจีย
เอ
เอ.ยู.เอ.
# เอ.ยู.เอ. -- TODO: why do we have a full stop?
เอก
เอกจิต
เอกฉันท์

View file

@ -1,5 +1,5 @@
#
# Copyright (C) 2002-2011, International Business Machines Corporation
# Copyright (C) 2002-2012, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
@ -29,7 +29,9 @@ $LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$Hiragana = [:Hiragana:];
$Katakana = [\p{Word_Break = Katakana}];
$Han = [:Han:];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
@ -43,15 +45,22 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$KanaKanji $HangulSyllable];
$dictionary = [$ComplexContext $dictionaryCJK];
# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
@ -60,7 +69,6 @@ $MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@ -78,13 +86,14 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
[^$CR $LF $Newline]? ($Extend | $Format)+;
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HangulSyllable {200};
$KatakanaEx {400}; # note: these status values override those from rule 5
$HiraganaEx {400}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
#
@ -113,20 +122,25 @@ $NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
# rule 13
$KatakanaEx $KatakanaEx {300};
# to be consistent with $KanaKanji $KanaKanji, changed
# from 300 to 400.
# See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx $KatakanaEx {400};
# rule 13a/b
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$KatakanaEx $ExtendNumLetEx {400}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
## -------------------------------------------------
@ -139,13 +153,14 @@ $BackNumericEx = ($Format | $Extend)* $Numeric;
$BackMidNumEx = ($Format | $Extend)* $MidNum;
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
# rule 3
$LF $CR;
# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;
($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;
# rule 5
@ -181,6 +196,10 @@ $BackKatakanaEx $BackKatakanaEx;
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
## -------------------------------------------------
!!safe_reverse;

View file

@ -236,7 +236,7 @@ CNV_FILES_SPECIAL=$(UCM_SOURCE_SPECIAL:.ucm=.cnv)
!IF EXISTS("$(ICUSRCDATA)\$(ICUBRK)\brklocal.mk")
!INCLUDE "$(ICUSRCDATA)\$(ICUBRK)\brklocal.mk"
BRK_SOURCE=$(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
BRK_CTD_SOURCE=$(BRK_CTD_SOURCE) $(BRK_CTD_SOURCE_LOCAL)
BRK_DICT_SOURCE=$(BRK_DICT_SOURCE) $(BRK_DICT_SOURCE_LOCAL)
BRK_RES_SOURCE=$(BRK_RES_SOURCE) $(BRK_RES_SOURCE_LOCAL)
!ELSE
!MESSAGE Information: cannot find "brklocal.mk". Not building user-additional break iterator files.
@ -252,10 +252,10 @@ BRK_FILES=$(ICUBRK)\$(BRK_SOURCE:.txt =.brk brkitr\)
BRK_FILES=$(BRK_FILES:.txt=.brk)
BRK_FILES=$(BRK_FILES:brkitr\ =brkitr\)
!IFDEF BRK_CTD_SOURCE
BRK_CTD_FILES = $(ICUBRK)\$(BRK_CTD_SOURCE:.txt =.ctd brkitr\)
BRK_CTD_FILES = $(BRK_CTD_FILES:.txt=.ctd)
BRK_CTD_FILES = $(BRK_CTD_FILES:brkitr\ =)
!IFDEF BRK_DICT_SOURCE
BRK_DICT_FILES = $(ICUBRK)\$(BRK_DICT_SOURCE:.txt =.dict brkitr\)
BRK_DICT_FILES = $(BRK_DICT_FILES:.txt=.dict)
BRK_DICT_FILES = $(BRK_DICT_FILES:brkitr\ =)
!ENDIF
!IFDEF BRK_RES_SOURCE
@ -360,6 +360,9 @@ ZONE_SOURCE=$(ZONE_SOURCE) $(ZONE_SOURCE_LOCAL)
!MESSAGE Warning: cannot find "zone\resfiles.mk"
!ENDIF
BRK_DICT_FILES = $(ICUBRK)\$(BRK_DICT_SOURCE:.txt =.dict brkitr\)
BRK_DICT_FILES = $(BRK_DICT_FILES:.txt=.dict)
BRK_DICT_FILES = $(BRK_DICT_FILES:brkitr\ =)
!IFDEF ZONE_SOURCE
ZONE_FILES = zone\root.txt $(ZONE_ALIAS_SOURCE) $(ZONE_SOURCE)
ZONE_RES_FILES = $(ZONE_FILES:.txt =.res zone\)
@ -602,7 +605,7 @@ icu4j-data-install :
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
-@erase "$(ICUTMP)\$(ICUPKG).dat"
!ELSE
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_DICT_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
@echo Building icu data
cd "$(ICUBLD_PKG)"
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
@ -637,7 +640,7 @@ $(TRANSLIT_RES_FILES:.res =.res
)
$(BRK_FILES:.brk =.brk
)
$(BRK_CTD_FILES:.ctd =.ctd
$(BRK_DICT_FILES:.dict =.dict
)
$(BRK_RES_FILES:.res =.res
)
@ -696,7 +699,6 @@ CLEAN : GODATA
-@erase "zone\*.txt"
@cd "$(ICUBLD_PKG)\$(ICUBRK)"
-@erase "*.brk"
-@erase "*.ctd"
-@erase "*.res"
-@erase "*.txt"
@cd "$(ICUBLD_PKG)\$(ICUCOL)"
@ -735,10 +737,10 @@ CLEAN : GODATA
@echo Creating $@
@"$(ICUTOOLS)\genbrk\$(CFG)\genbrk" -c -r $< -o $@ -d"$(ICUBLD_PKG)" -i "$(ICUBLD_PKG)"
# RBBI .ctd file generation.
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)}.txt.ctd:
@echo Creating $@
@"$(ICUTOOLS)\genctd\$(CFG)\genctd" -c -o $@ -d"$(ICUBLD_PKG)" -i "$(ICUBLD_PKG)" $<
# RBBI .dict file generation.
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)}.txt.dict:
@echo Creating $@
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --uchars -i "$(ICUBLD_PKG)" $< $(ICUBLD_PKG)\$@
!IFNDEF ICUDATA_SOURCE_ARCHIVE
# Rule for creating converters

View file

@ -209,7 +209,7 @@
<Project>{8b41752b-5a52-41e4-b7e0-07921c0cc6bf}</Project>
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>
</ProjectReference>
<ProjectReference Include="..\tools\genctd\genctd.vcxproj">
<ProjectReference Include="..\tools\gendict\gendict.vcxproj">
<Project>{9d4211f7-2c77-439c-82f0-30a4e43ba569}</Project>
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>
</ProjectReference>

View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Copyright (c) 2010-2011 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2010-2012 International Business Machines Corporation and others. All rights reserved.
-->
<!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
[
@ -24,8 +24,11 @@
<icu:title icu:dependency="title.brk"/>
</icu:boundaries>
<icu:dictionaries>
<icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
<icu:dictionary type="Khmr" icu:dependency="khmerdict.ctd"/>
<icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
<icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
<icu:dictionary type="Hani" icu:dependency="cjdict.dict"/>
<icu:dictionary type="Hira" icu:dependency="cjdict.dict"/>
<icu:dictionary type="Kata" icu:dependency="cjdict.dict"/>
</icu:dictionaries>
</icu:breakIteratorData>
</special>

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2011, International Business Machines Corporation and
* Copyright (c) 1997-2012, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -768,7 +768,7 @@ typedef struct {
static const RBBITailoringTest tailoringTests[] = {
{ "en", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
{ "th", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
{ "en_US_POSIX", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
{ "en", UBRK_LINE, heTest, heTestOffs_heFwd, heTestOffs_heRev, sizeof(heTestOffs_heFwd)/sizeof(heTestOffs_heFwd[0]) },
{ "he", UBRK_LINE, heTest, heTestOffs_heFwd, heTestOffs_heRev, sizeof(heTestOffs_heFwd)/sizeof(heTestOffs_heFwd[0]) },
{ "en", UBRK_LINE, fiTest, fiTestOffs_enFwd, fiTestOffs_enRev, sizeof(fiTestOffs_enFwd)/sizeof(fiTestOffs_enFwd[0]) },

View file

@ -2184,26 +2184,7 @@ static void TestResourceLevelAliasing(void) {
} else if(seqLen != strLen || u_strncmp(sequence, string, seqLen) != 0) {
log_err("Referencing alias didn't get the right string (3)\n");
}
{
UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
const UChar *got = NULL, *exp=NULL;
int32_t gotLen = 0, expLen=0;
ja = ures_getByKey(ja, "boundaries", ja, &status);
exp = tres_getString(ja, -1, "word", &expLen, &status);
tb = ures_getByKey(aliasB, "boundaries", tb, &status);
got = tres_getString(tb, -1, "word", &gotLen, &status);
if(U_FAILURE(status)) {
log_err("%s trying to read str boundaries\n", u_errorName(status));
} else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
log_err("Referencing alias didn't get the right data\n");
}
ures_close(ja);
status = U_ZERO_ERROR;
}
/* simple alias */
testtypes = ures_open(testdatapath, "testtypes", &status);
strcpy(buffer, "menu/file/open");

View file

@ -1236,11 +1236,9 @@ static const struct {
}
};
/* Unfortunately, trie dictionaries are in a C++ header */
int32_t
triedict_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/* Unfortunately, dictionaries are in a C++ header */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
/* test cases for maximum data swapping code coverage */
static const struct {
@ -1305,7 +1303,7 @@ static const struct {
#if !UCONFIG_NO_BREAK_ITERATION
{"char", "brk", ubrk_swap},
{"thaidict", "ctd", triedict_swap},
{"thaidict", "dict",udict_swap},
#endif
#if 0
@ -1658,7 +1656,7 @@ TestSwapData() {
nm=swapCases[i].name+1;
uprv_strcpy(name, "testdata");
} else if (uprv_strcmp(swapCases[i].type, "brk")==0
|| uprv_strcmp(swapCases[i].type, "ctd")==0) {
|| uprv_strcmp(swapCases[i].type, "dict")==0) {
pkg=U_ICUDATA_BRKITR;
nm=swapCases[i].name;
uprv_strcpy(name, U_ICUDATA_BRKITR);

View file

@ -1,5 +1,5 @@
/********************************************************************
* Copyright (c) 1999-2011, International Business Machines
* Copyright (c) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
* Date Name Description
@ -157,10 +157,13 @@ void RBBIAPITest::TestBoilerPlate()
if(*a!=*b){
errln("Failed: boilerplate method operator!= does not return correct results");
}
BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
if(a && c){
if(*c==*a){
errln("Failed: boilerplate method opertator== does not return correct results");
// Japanese word break iterators are identical to root with
// a dictionary-based break iterator
BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
BreakIterator* d = BreakIterator::createCharacterInstance(Locale("root"),status);
if(c && d){
if(*c!=*d){
errln("Failed: boilerplate method operator== does not return correct results");
}
}else{
errln("creation of break iterator failed");
@ -168,6 +171,7 @@ void RBBIAPITest::TestBoilerPlate()
delete a;
delete b;
delete c;
delete d;
}
void RBBIAPITest::TestgetRules()
@ -636,21 +640,21 @@ void RBBIAPITest::TestQuoteGrouping() {
//
void RBBIAPITest::TestRuleStatus() {
UChar str[30];
u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
// 012345678901234567 8 9 0 1 2 3 4 5 6
// Ideographic Katakana Hiragana
//no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
// changed UBRK_WORD_KANA to UBRK_WORD_IDEO
u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
// 012345678901234567 8 9 0
// Katakana
str, 30);
UnicodeString testString1(str);
int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
UBRK_WORD_IDEO, UBRK_WORD_NONE};
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
UErrorCode status=U_ZERO_ERROR;
@ -871,7 +875,6 @@ void RBBIAPITest::TestRegistration() {
#if !UCONFIG_NO_SERVICE
UErrorCode status = U_ZERO_ERROR;
BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
// ok to not delete these if we exit because of error?
BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
BreakIterator* root_word = BreakIterator::createWordInstance("", status);
@ -879,6 +882,7 @@ void RBBIAPITest::TestRegistration() {
if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) {
dataerrln("Error creating instances of break interactors - %s", u_errorName(status));
delete ja_word;
delete ja_char;
delete root_word;
@ -889,9 +893,11 @@ void RBBIAPITest::TestRegistration() {
URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
{
#if 0 // With a dictionary based word breaking, ja_word is identical to root.
if (ja_word && *ja_word == *root_word) {
errln("japan not different from root");
}
#endif
}
{

View file

@ -33,10 +33,11 @@
#include <string.h>
#include "uvector.h"
#include "uvectr32.h"
#include "triedict.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "unicode/numfmt.h"
#include "unicode/uscript.h"
#define TEST_ASSERT(x) {if (!(x)) { \
errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
@ -111,8 +112,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
case 16: name = "TestMonkey";
if(exec) TestMonkey(params); break;
case 16:
name = "TestMonkey"; if(exec) TestMonkey(params); break;
#else
case 16:
name = "skip"; break;
@ -130,8 +131,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
break;
case 19: name = "TestDebug";
if(exec) TestDebug(); break;
case 20: name = "TestTrieDict";
if(exec) TestTrieDict(); break;
case 20: name = "skip";
break;
#if !UCONFIG_NO_FILE_IO
case 21: name = "TestBug5775";
@ -428,227 +429,6 @@ void RBBITest::TestBug3818() {
delete bi;
}
void RBBITest::TestTrieDict() {
UErrorCode status = U_ZERO_ERROR;
//
// Open and read the test data file.
//
const char *testDataDirectory = IntlTest::getSourceTestData(status);
char testFileName[1000];
if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
errln("Can't open test data. Path too long.");
return;
}
strcpy(testFileName, testDataDirectory);
strcat(testFileName, "riwords.txt");
// Items needing deleting at the end
MutableTrieDictionary *mutableDict = NULL;
CompactTrieDictionary *compactDict = NULL;
UnicodeSet *breaks = NULL;
UChar *testFile = NULL;
StringEnumeration *enumer1 = NULL;
StringEnumeration *enumer2 = NULL;
MutableTrieDictionary *mutable2 = NULL;
StringEnumeration *cloneEnum = NULL;
CompactTrieDictionary *compact2 = NULL;
const UnicodeString *originalWord = NULL;
const UnicodeString *cloneWord = NULL;
UChar *current;
UChar *word;
UChar uc;
int32_t wordLen;
int32_t wordCount;
int32_t testCount;
int len;
testFile = ReadAndConvertFile(testFileName, len, NULL, status);
if (U_FAILURE(status)) {
goto cleanup; /* something went wrong, error already output */
}
mutableDict = new MutableTrieDictionary(0x0E1C, status);
if (U_FAILURE(status)) {
errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
goto cleanup;
}
breaks = new UnicodeSet;
breaks->add(0x000A); // Line Feed
breaks->add(0x000D); // Carriage Return
breaks->add(0x2028); // Line Separator
breaks->add(0x2029); // Paragraph Separator
// Now add each non-comment line of the file as a word.
current = testFile;
word = current;
uc = *current++;
wordLen = 0;
wordCount = 0;
while (uc) {
if (uc == 0x0023) { // #comment line, skip
while (uc && !breaks->contains(uc)) {
uc = *current++;
}
}
else while (uc && !breaks->contains(uc)) {
++wordLen;
uc = *current++;
}
if (wordLen > 0) {
mutableDict->addWord(word, wordLen, status);
if (U_FAILURE(status)) {
errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
goto cleanup;
}
wordCount += 1;
}
// Find beginning of next line
while (uc && breaks->contains(uc)) {
uc = *current++;
}
word = current-1;
wordLen = 0;
}
if (wordCount < 50) {
errln("Word count (%d) unreasonably small\n", wordCount);
goto cleanup;
}
enumer1 = mutableDict->openWords(status);
if (U_FAILURE(status)) {
errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
goto cleanup;
}
testCount = 0;
if (wordCount != (testCount = enumer1->count(status))) {
errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
testCount, wordCount, u_errorName(status));
goto cleanup;
}
// Now compact it
compactDict = new CompactTrieDictionary(*mutableDict, status);
if (U_FAILURE(status)) {
errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
goto cleanup;
}
enumer2 = compactDict->openWords(status);
if (U_FAILURE(status)) {
errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
goto cleanup;
}
if (wordCount != (testCount = enumer2->count(status))) {
errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
testCount, wordCount, u_errorName(status));
goto cleanup;
}
if (typeid(*enumer1) == typeid(*enumer2)) {
errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
}
delete enumer1;
enumer1 = NULL;
delete enumer2;
enumer2 = NULL;
// Now un-compact it
mutable2 = compactDict->cloneMutable(status);
if (U_FAILURE(status)) {
errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
goto cleanup;
}
cloneEnum = mutable2->openWords(status);
if (U_FAILURE(status)) {
errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
goto cleanup;
}
if (wordCount != (testCount = cloneEnum->count(status))) {
errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
testCount, wordCount, u_errorName(status));
goto cleanup;
}
// Compact original dictionary to clone. Note that we can only compare the same kind of
// dictionary as the order of the enumerators is not guaranteed to be the same between
// different kinds
enumer1 = mutableDict->openWords(status);
if (U_FAILURE(status)) {
errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
goto cleanup;
}
originalWord = enumer1->snext(status);
cloneWord = cloneEnum->snext(status);
while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
if (*originalWord != *cloneWord) {
errln("Original and cloned MutableTrieDictionary word mismatch\n");
goto cleanup;
}
originalWord = enumer1->snext(status);
cloneWord = cloneEnum->snext(status);
}
if (U_FAILURE(status)) {
errln("Enumeration failed: %s\n", u_errorName(status));
goto cleanup;
}
if (originalWord != cloneWord) {
errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
goto cleanup;
}
// Test the data copying constructor for CompactTrieDict, and the data access APIs.
compact2 = new CompactTrieDictionary(compactDict->data(), status);
if (U_FAILURE(status)) {
errln("CompactTrieDictionary(const void *,...) failed\n");
goto cleanup;
}
if (compact2->dataSize() == 0) {
errln("CompactTrieDictionary->dataSize() == 0\n");
goto cleanup;
}
// Now count the words via the second dictionary
delete enumer1;
enumer1 = compact2->openWords(status);
if (U_FAILURE(status)) {
errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
goto cleanup;
}
if (wordCount != (testCount = enumer1->count(status))) {
errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
testCount, wordCount, u_errorName(status));
goto cleanup;
}
cleanup:
delete compactDict;
delete mutableDict;
delete breaks;
delete[] testFile;
delete enumer1;
delete mutable2;
delete cloneEnum;
delete compact2;
}
//----------------------------------------------------------------------------
//
// generalIteratorTest Given a break iterator and a set of test data,
@ -2215,6 +1995,8 @@ private:
UnicodeSet *fNewlineSet;
UnicodeSet *fKatakanaSet;
UnicodeSet *fALetterSet;
// TODO(jungshik): Do we still need this change?
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fMidNumLetSet;
UnicodeSet *fMidLetterSet;
UnicodeSet *fMidNumSet;
@ -2223,6 +2005,7 @@ private:
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
UnicodeSet *fDictionaryCjkSet;
RegexMatcher *fMatcher;
@ -2239,11 +2022,25 @@ RBBIWordMonkey::RBBIWordMonkey()
fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
// Exclude Hangul syllables from ALetterSet during testing.
// Leave CJK dictionary characters out from the monkey tests!
#if 0
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
"[\\p{Line_Break = Complex_Context}"
"-\\p{Grapheme_Cluster_Break = Extend}"
"-\\p{Grapheme_Cluster_Break = Control}"
"]]",
status);
#endif
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
fALetterSet->removeAll(*fDictionaryCjkSet);
fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
// TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
// we should figure out why
fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
@ -2268,13 +2065,14 @@ RBBIWordMonkey::RBBIWordMonkey()
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionaryCjkSet);
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
fSets->addElement(fCRSet, status);
fSets->addElement(fLFSet, status);
fSets->addElement(fNewlineSet, status);
fSets->addElement(fALetterSet, status);
fSets->addElement(fKatakanaSet, status);
//fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
fSets->addElement(fMidLetterSet, status);
fSets->addElement(fMidNumLetSet, status);
fSets->addElement(fMidNumSet, status);
@ -3547,6 +3345,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("happy break test previous() failed: expected %d but got %d",
forward[count], i);
break;
@ -3580,23 +3379,25 @@ void RBBITest::TestWordBreaks(void)
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
// Replaced any C+J characters in a row with a random sequence of characters
// of the same length to make our C+J segmentation not get in the way.
static const char *strlist[] =
{
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
"\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
"\\u90ca\\u3588\\u009c\\u0953\\u194b",
"\\uac00\\u3588\\u009c\\u0953\\u194b",
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
"\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
"\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
"\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
@ -3608,7 +3409,7 @@ void RBBITest::TestWordBreaks(void)
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\u18f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
@ -3618,7 +3419,7 @@ void RBBITest::TestWordBreaks(void)
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
"\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
"\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
};
@ -3673,12 +3474,12 @@ void RBBITest::TestWordBoundary(void)
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
"\\U000e0065\\u302c\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2010-2011, International Business Machines
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: dicttrieperf.cpp
@ -34,7 +34,6 @@
#include "charstr.h"
#include "package.h"
#include "toolutil.h"
#include "triedict.h"
#include "ucbuf.h" // struct ULine
#include "uoptions.h"
#include "uvectr32.h"
@ -337,56 +336,6 @@ protected:
const DictionaryTriePerfTest &perf;
};
class CompactTrieDictLookup : public DictLookup {
public:
CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest), ctd(NULL) {
IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
// U+0E1C is the median code unit, from
// the UCharsTrie root node (split-branch node) for thaidict.txt.
MutableTrieDictionary builder(0xe1c, errorCode);
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
builder.addWord(lines[i].name, lines[i].len, errorCode);
}
ctd=new CompactTrieDictionary(builder, errorCode);
int32_t length=(int32_t)ctd->dataSize();
printf("size of CompactTrieDict: %6ld bytes\n", (long)length);
}
virtual ~CompactTrieDictLookup() {
delete ctd;
}
virtual void call(UErrorCode *pErrorCode) {
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count;
ctd->matches(&text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
protected:
CompactTrieDictionary *ctd;
};
// Closely imitate CompactTrieDictionary::matches().
// Note: CompactTrieDictionary::matches() is part of its trie implementation,
// and while it loops over the text, it knows the current state.
@ -695,30 +644,24 @@ UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
if(hasFile()) {
switch(index) {
case 0:
name="compacttriematches";
if(exec) {
return new CompactTrieDictLookup(*this);
}
break;
case 1:
name="ucharstriematches";
if(exec) {
return new UCharsTrieDictMatches(*this);
}
break;
case 2:
case 1:
name="ucharstriecontains";
if(exec) {
return new UCharsTrieDictContains(*this);
}
break;
case 3:
case 2:
name="bytestriematches";
if(exec) {
return new BytesTrieDictMatches(*this);
}
break;
case 4:
case 3:
name="bytestriecontains";
if(exec) {
return new BytesTrieDictContains(*this);

View file

@ -170,7 +170,23 @@
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
# Hiragana & Katakana stay together, but separates from each other and Latin.
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
# *** what to do about theoretical combos of chars? i.e. hiragana + accent
#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
<data>•芽キャベツ<400>芽キャベツ<400></data>
# more Japanese tests
# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
# and the Katakana block are not treated correctly. Enable this later.
#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
# Testing of word boundary for dictionary word containing both kanji and kana
<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
# Testing of Chinese segmentation (taken from a Chinese news article)
<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>
# Words with interior formatting characters
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
@ -178,6 +194,9 @@
# to test for bug #4097779
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
# <data>•ISN'<200> •19<100>日<400></data>
# why was this added with the dbbi stuff?
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages
@ -187,9 +206,15 @@
# precomposed syllables...
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
# more Korean tests (Jamo not tested here, not counted as dictionary characters)
# Disable them now because we don't include a Korean dictionary.
#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
<data>•\u06c9<200>\uc799<200>\ufffa•</data>
<data>•\u06c9\uc799\ufffa<200></data>
#
# Try some words from other scripts.
@ -506,8 +531,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
# conjoining jamo...
# TODO: rules update needed
#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
@ -713,7 +737,7 @@ Bangkok)•</data>
<locale ja>
<line>
<data>•\u3041\u3043\u3045\u31f1•</data>
<data>•\u3041\u3043\u3045\u31f1•</data>
<locale en>
<line>
<data>•\u3041\u3043\u3045\u31f1•</data>
@ -721,19 +745,20 @@ Bangkok)•</data>
# The following data was originally in RBBITest::TestJapaneseWordBreak()
<locale ja>
<word>
<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>
# UBreakIteratorType UBRK_WORD, Locale "ja"
# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
# modified to work with dbbi code - should verify
<locale ja>
<word>
<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
<data>•私<400>達<400>に<400>一<400><400><400>の<400>コンピュ<400>ー<400>タ<400>が<400>ある<400>。<0>奈々<400>は<400>ワ<400>ー<400>ド<400>で<400>ある<400>。•</data>
<locale root>
<word>
<data>•私<400>達<400>に<300>一<400><400><400><400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
<data>•私<400>達<400>に<400>一<400><400><400>の<400>コンピュ<400>ー<400>タ<400>が<400>ある<400>。<0>奈々<400>は<400>ワ<400>ー<400>ド<400>で<400>ある<400>。•</data>
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
# Add break after Greek question mark (cldrbug #2069).
@ -778,12 +803,6 @@ Bangkok)•</data>
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
<locale root>
<char>
<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
# Finnish line breaking
#
# These rules deal with hyphens when there is a space on the leading side.

View file

@ -1,6 +1,6 @@
//*******************************************************************************
//*
//* Copyright (C) 2002-2009, International Business Machines
//* Copyright (C) 2002-2012, International Business Machines
//* Corporation and others. All Rights Reserved.
//*
//*******************************************************************************
@ -28,7 +28,7 @@ testaliases:table(nofallback)
LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
// aliasing using position
boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle
boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle
// aliasing arrays
zoneTests {

View file

@ -1,5 +1,5 @@
## Makefile.in for ICU tools
## Copyright (c) 1999-2011, International Business Machines Corporation and
## Copyright (c) 1999-2012, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
@ -13,9 +13,9 @@ include $(top_builddir)/icudefs.mk
## Build directory information
subdir = tools
SUBDIRS = toolutil ctestfw makeconv genrb genbrk genctd \
SUBDIRS = toolutil ctestfw makeconv genrb genbrk \
gencnval gensprep icuinfo genccode gencmn icupkg pkgdata \
gentest gennorm2 gencfu
gentest gennorm2 gencfu gendict
## List of phony targets
.PHONY : all all-local all-recursive install install-local \

View file

@ -1,111 +0,0 @@
.\" Hey, Emacs! This is -*-nroff-*- you know...
.\"
.\" genctd.1: manual page for the genctd utility
.\"
.\" Copyright (C) 2006-2007 International Business Machines Corporation and others
.\"
.TH GENCTD 1 "8 March 2006" "ICU MANPAGE" "ICU @VERSION@ Manual"
.SH NAME
.B genctd
\- Compiles word list into ICU compact trie dictionary
.SH SYNOPSIS
.B genctd
[
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BR "\-V\fP, \fB\-\-version"
]
[
.BR "\-c\fP, \fB\-\-copyright"
]
[
.BR "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-d\fP, \fB\-\-destdir" " destination"
]
[
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
]
.BI "\-o\fP, \fB\-\-out" " output\-file"
.IR " dictionary\-file"
.SH DESCRIPTION
.B genctd
reads the word list from
.I dictionary-file
and creates a compact trie dictionary file. Normally this data file has the
.B .ctd
extension.
.PP
Words begin at the beginning of a line and are terminated by the first whitespace.
Lines that begin with whitespace are ignored.
.SH OPTIONS
.TP
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
Print help about usage and exit.
.TP
.BR "\-V\fP, \fB\-\-version"
Print the version of
.B genctd
and exit.
.TP
.BR "\-c\fP, \fB\-\-copyright"
Embeds the standard ICU copyright into the
.IR output-file .
.TP
.BR "\-v\fP, \fB\-\-verbose"
Display extra informative messages during execution.
.TP
.BI "\-d\fP, \fB\-\-destdir" " destination"
Set the destination directory of the
.IR output-file
to
.IR destination .
.TP
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
Look for any necessary ICU data files in
.IR directory .
For example, the file
.B pnames.icu
must be located when ICU's data is not built as a shared library.
The default ICU data directory is specified by the environment variable
.BR ICU_DATA .
Most configurations of ICU do not require this argument.
.TP
.BI " dictionary\-file"
The source file to read.
.TP
.BI "\-o\fP, \fB\-\-out" " output\-file"
The output data file to write.
.SH CAVEATS
When the
.IR dictionary-file
contains a byte order mark (BOM) at the beginning of the file, which is the Unicode character
.B U+FEFF,
then the
.IR dictionary-file
is interpreted as Unicode. Without the BOM,
the file is interpreted in the current operating system default codepage.
In order to eliminate any ambiguity of the encoding for how the
.IR rule-file
was written, it is recommended that you write this file in UTF-8
with the BOM.
.SH ENVIRONMENT
.TP 10
.B ICU_DATA
Specifies the directory containing ICU data. Defaults to
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
Some tools in ICU depend on the presence of the trailing slash. It is thus
important to make sure that it is present if
.B ICU_DATA
is set.
.SH AUTHORS
Deborah Goldsmith
.SH VERSION
1.0
.SH COPYRIGHT
Copyright (C) 2006 International Business Machines Corporation and others
.SH SEE ALSO
.BR http://www.icu-project.org/userguide/boundaryAnalysis.html

View file

@ -1,396 +0,0 @@
/*
**********************************************************************
* Copyright (C) 2002-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File genctd.c
*/
//--------------------------------------------------------------------
//
// Tool for generating CompactTrieDictionary data files (.ctd files).
//
// Usage: genctd [options] -o output-file.ctd input-file
//
// options: -v verbose
// -? or -h help
//
// The input file is a plain text file containing words, one per line.
// Words end at the first whitespace; lines beginning with whitespace
// are ignored.
// The file can be encoded as utf-8, or utf-16 (either endian), or
// in the default code page (platform dependent.). utf encoded
// files must include a BOM.
//
//--------------------------------------------------------------------
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "uoptions.h"
#include "unewdata.h"
#include "ucmndata.h"
#include "rbbidata.h"
#include "triedict.h"
#include "cmemory.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
U_NAMESPACE_USE
static char *progName;
static UOption options[]={
UOPTION_HELP_H, /* 0 */
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
UOPTION_ICUDATADIR, /* 4 */
UOPTION_DESTDIR, /* 5 */
UOPTION_COPYRIGHT, /* 6 */
};
void usageAndDie(int retCode) {
printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
printf("\tRead in word list and write out compact trie dictionary\n"
"options:\n"
"\t-h or -? or --help this usage text\n"
"\t-V or --version show a version message\n"
"\t-c or --copyright include a copyright notice\n"
"\t-v or --verbose turn on verbose output\n"
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
"\t followed by path, defaults to %s\n"
"\t-d or --destdir destination directory, followed by the path\n",
u_getDataDirectory());
exit (retCode);
}
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
/* dummy UDataInfo cf. udata.h */
static UDataInfo dummyDataInfo = {
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,
{ 0, 0, 0, 0 }, /* dummy dataFormat */
{ 0, 0, 0, 0 }, /* dummy formatVersion */
{ 0, 0, 0, 0 } /* dummy dataVersion */
};
#else
//
// Set up the ICU data header, defined in ucmndata.h
//
DataHeader dh ={
{sizeof(DataHeader), // Struct MappedData
0xda,
0x27},
{ // struct UDataInfo
sizeof(UDataInfo), // size
0, // reserved
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0, // reserved
{ 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
{ 1, 0, 0, 0 }, // 1.0.0.0
{ 0, 0, 0, 0 }, // Irrelevant for this data type
}};
#endif
//----------------------------------------------------------------------------
//
// main for genctd
//
//----------------------------------------------------------------------------
int main(int argc, char **argv) {
UErrorCode status = U_ZERO_ERROR;
const char *wordFileName;
const char *outFileName;
const char *outDir = NULL;
const char *copyright = NULL;
//
// Pick up and check the command line arguments,
// using the standard ICU tool utils option handling.
//
U_MAIN_INIT_ARGS(argc, argv);
progName = argv[0];
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
if(argc<0) {
// Unrecognized option
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
if(options[0].doesOccur || options[1].doesOccur) {
// -? or -h for help.
usageAndDie(0);
}
if (!options[3].doesOccur || argc < 2) {
fprintf(stderr, "input and output file must both be specified.\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
outFileName = options[3].value;
wordFileName = argv[1];
if (options[4].doesOccur) {
u_setDataDirectory(options[4].value);
}
status = U_ZERO_ERROR;
/* Combine the directory with the file name */
if(options[5].doesOccur) {
outDir = options[5].value;
}
if (options[6].doesOccur) {
copyright = U_COPYRIGHT_STRING;
}
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
UNewDataMemory *pData;
char msg[1024];
/* write message with just the name */
sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
fprintf(stderr, "%s\n", msg);
/* write the dummy data file */
pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
udata_writeBlock(pData, msg, strlen(msg));
udata_finish(pData, &status);
return (int)status;
#else
/* Initialize ICU */
u_init(&status);
if (U_FAILURE(status)) {
fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
argv[0], u_errorName(status));
exit(1);
}
status = U_ZERO_ERROR;
//
// Read in the dictionary source file
//
long result;
long wordFileSize;
FILE *file;
char *wordBufferC;
file = fopen(wordFileName, "rb");
if( file == 0 ) {
fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
exit(-1);
}
fseek(file, 0, SEEK_END);
wordFileSize = ftell(file);
fseek(file, 0, SEEK_SET);
wordBufferC = new char[wordFileSize+10];
result = (long)fread(wordBufferC, 1, wordFileSize, file);
if (result != wordFileSize) {
fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
exit (-1);
}
wordBufferC[wordFileSize]=0;
fclose(file);
//
// Look for a Unicode Signature (BOM) on the word file
//
int32_t signatureLength;
const char * wordSourceC = wordBufferC;
const char* encoding = ucnv_detectUnicodeSignature(
wordSourceC, wordFileSize, &signatureLength, &status);
if (U_FAILURE(status)) {
exit(status);
}
if(encoding!=NULL ){
wordSourceC += signatureLength;
wordFileSize -= signatureLength;
}
//
// Open a converter to take the rule file to UTF-16
//
UConverter* conv;
conv = ucnv_open(encoding, &status);
if (U_FAILURE(status)) {
fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
exit(status);
}
//
// Convert the words to UChar.
// Preflight first to determine required buffer size.
//
uint32_t destCap = ucnv_toUChars(conv,
NULL, // dest,
0, // destCapacity,
wordSourceC,
wordFileSize,
&status);
if (status != U_BUFFER_OVERFLOW_ERROR) {
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
exit(status);
};
status = U_ZERO_ERROR;
UChar *wordSourceU = new UChar[destCap+1];
ucnv_toUChars(conv,
wordSourceU, // dest,
destCap+1,
wordSourceC,
wordFileSize,
&status);
if (U_FAILURE(status)) {
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
exit(status);
};
ucnv_close(conv);
// Get rid of the original file buffer
delete[] wordBufferC;
// Create a MutableTrieDictionary, and loop through all the lines, inserting
// words.
// First, pick a median character.
UChar *current = wordSourceU + (destCap/2);
UChar uc = *current++;
UnicodeSet breaks;
breaks.add(0x000A); // Line Feed
breaks.add(0x000D); // Carriage Return
breaks.add(0x2028); // Line Separator
breaks.add(0x2029); // Paragraph Separator
do {
// Look for line break
while (uc && !breaks.contains(uc)) {
uc = *current++;
}
// Now skip to first non-line-break
while (uc && breaks.contains(uc)) {
uc = *current++;
}
}
while (uc && (breaks.contains(uc) || u_isspace(uc)));
MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
if (U_FAILURE(status)) {
fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
exit(status);
}
// Now add the words. Words are non-space characters at the beginning of
// lines, and must be at least one UChar.
current = wordSourceU;
UChar *candidate = current;
uc = *current++;
int32_t length = 0;
while (uc) {
while (uc && !u_isspace(uc)) {
++length;
uc = *current++;
}
if (length > 0) {
mtd->addWord(candidate, length, status);
if (U_FAILURE(status)) {
fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
u_errorName(status));
exit(status);
}
}
// Find beginning of next line
while (uc && !breaks.contains(uc)) {
uc = *current++;
}
while (uc && breaks.contains(uc)) {
uc = *current++;
}
candidate = current-1;
length = 0;
}
// Get rid of the Unicode text buffer
delete[] wordSourceU;
// Now, create a CompactTrieDictionary from the mutable dictionary
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
if (U_FAILURE(status)) {
fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
exit(status);
}
// Get rid of the MutableTrieDictionary
delete mtd;
//
// Get the binary data from the dictionary.
//
uint32_t outDataSize = ctd->dataSize();
const uint8_t *outData = (const uint8_t *)ctd->data();
//
// Create the output file
//
size_t bytesWritten;
UNewDataMemory *pData;
pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
if(U_FAILURE(status)) {
fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
outFileName, u_errorName(status));
exit(status);
}
// Write the data itself.
udata_writeBlock(pData, outData, outDataSize);
// finish up
bytesWritten = udata_finish(pData, &status);
if(U_FAILURE(status)) {
fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
exit(status);
}
if (bytesWritten != outDataSize) {
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
exit(-1);
}
// Get rid of the CompactTrieDictionary
delete ctd;
u_cleanup();
printf("genctd: tool completed successfully.\n");
return 0;
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}

View file

@ -1,5 +1,5 @@
## Makefile.in for ICU - tools/genctd
## Copyright (c) 2002-2011 International Business Machines Corporation and
## Makefile.in for ICU - tools/gendict
## Copyright (c) 2002-2012 International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
@ -11,9 +11,9 @@ top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = tools/genctd
subdir = tools/gendict
TARGET_STUB_NAME = genctd
TARGET_STUB_NAME = gendict
SECTION = 1
@ -29,7 +29,7 @@ TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = genctd.o
OBJECTS = gendict.o
DEPS = $(OBJECTS:.o=.d)

View file

@ -0,0 +1,131 @@
.\" Hey, Emacs! This is -*-nroff-*- you know...
.\"
.\" gendict.1: manual page for the gendict utility
.\"
.\" Copyright (C) 2012 International Business Machines Corporation and others
.\"
.TH GENDICT 1 "1 June 2012" "ICU MANPAGE" "ICU @VERSION@ Manual"
.SH NAME
.B gendict
\- Compiles word list into ICU string trie dictionary
.SH SYNOPSIS
.B gendict
[
.BR "\fB\-\-uchars"
|
.BR "\fB\-\-bytes"
.BI "\fB\-\-transform" " transform"
]
[
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BR "\-V\fP, \fB\-\-version"
]
[
.BR "\-c\fP, \fB\-\-copyright"
]
[
.BR "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
]
.IR " input-file"
.IR " output\-file"
.SH DESCRIPTION
.B gendict
reads the word list from
.I dictionary-file
and creates a string trie dictionary file. Normally this data file has the
.B .dict
extension.
.PP
Words begin at the beginning of a line and are terminated by the first whitespace.
Lines that begin with whitespace are ignored.
.SH OPTIONS
.TP
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
Print help about usage and exit.
.TP
.BR "\-V\fP, \fB\-\-version"
Print the version of
.B gendict
and exit.
.TP
.BR "\-c\fP, \fB\-\-copyright"
Embeds the standard ICU copyright into the
.IR output-file .
.TP
.BR "\-v\fP, \fB\-\-verbose"
Display extra informative messages during execution.
.TP
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
Look for any necessary ICU data files in
.IR directory .
For example, the file
.B pnames.icu
must be located when ICU's data is not built as a shared library.
The default ICU data directory is specified by the environment variable
.BR ICU_DATA .
Most configurations of ICU do not require this argument.
.TP
.BR "\fB\-\-uchars"
Set the output trie type to UChar. Mutually exclusive with
.BR --bytes.
.TP
.BR "\fB\-\-bytes"
Set the output trie type to Bytes. Mutually exclusive with
.BR --uchars.
.TP
.BR "\fB\-\-transform"
Set the transform type. Should only be specified with
.BR --bytes.
Currently supported transforms are:
.BR offset-<hex-number>,
which specifies an offset to subtract from all input characters.
It should be noted that the offset transform also maps U+200D
to 0xFF and U+200C to 0xFE, in order to offer compatibility to
languages that require these characters.
A transform must be specified for a bytes trie, and when applied
to the non-value characters in the
.IR input-file
must produce output between 0x00 and 0xFF.
.TP
.BI " input\-file"
The source file to read.
.TP
.BI " output\-file"
The file to write the output dictionary to.
.SH CAVEATS
The
.IR input-file
is assumed to be encoded in UTF-8.
The integers in the
.IR input-file
that are used as values must be made up of ASCII digits. They
may be specified either in hex, by using a 0x prefix, or in
decimal.
Either
.BI --bytes
or
.BI --uchars
must be specified.
.SH ENVIRONMENT
.TP 10
.B ICU_DATA
Specifies the directory containing ICU data. Defaults to
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
Some tools in ICU depend on the presence of the trailing slash. It is thus
important to make sure that it is present if
.B ICU_DATA
is set.
.SH AUTHORS
Maxime Serrano
.SH VERSION
1.0
.SH COPYRIGHT
Copyright (C) 2012 International Business Machines Corporation and others
.SH SEE ALSO
.BR http://www.icu-project.org/userguide/boundaryAnalysis.html

View file

@ -0,0 +1,448 @@
/*
**********************************************************************
* Copyright (C) 2002-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File gendict.cpp
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/bytestriebuilder.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/ucnv.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "uoptions.h"
#include "unewdata.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucbuf.h"
#include "toolutil.h"
#include "cstring.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
U_NAMESPACE_USE
static char *progName;  // argv[0]; used in usage and error messages

static UOption options[]={
/* Entries below must stay in the same order as the `arguments` enum. */
UOPTION_HELP_H,             /* 0 */
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE,            /* 2 */
UOPTION_ICUDATADIR,         /* 3 */
UOPTION_COPYRIGHT,          /* 4 */
{ "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 5 */
{ "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
{ "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 7 */
};

/* Symbolic indexes into options[]; order must match the array above. */
enum arguments {
ARG_HELP = 0,
ARG_QMARK,
ARG_VERBOSE,
ARG_ICUDATADIR,
ARG_COPYRIGHT,
ARG_UCHARS,
ARG_BYTES,
ARG_TRANSFORM
};
// prints out the standard usage method describing command line arguments,
// then bails out with the desired exit code
// Prints the standard usage text describing the command line arguments —
// to stdout when help was requested (retCode is a success code), to stderr
// on a usage error — and then exits the process with retCode.
// NOTE: the former "-V or --version" line was removed: no UOPTION_VERSION
// entry exists in options[], so -V was rejected as an unknown option.
static void usageAndDie(UErrorCode retCode) {
    FILE *out = U_SUCCESS(retCode) ? stdout : stderr;
    fprintf(out, "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
    fprintf(out,
            "\tRead in a word list and write out a string trie dictionary\n"
            "options:\n"
            "\t-h or -? or --help  this usage text\n"
            "\t-c or --copyright   include a copyright notice\n"
            "\t-v or --verbose     turn on verbose output\n"
            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
            "\t                    followed by path, defaults to %s\n"
            "\t--uchars            output a UCharsTrie (mutually exclusive with -b!)\n"
            "\t--bytes             output a BytesTrie (mutually exclusive with -u!)\n"
            "\t--transform         the kind of transform to use (eg --transform offset-40A3,\n"
            "\t                    which specifies an offset transform with constant 0x40A3)\n",
            u_getDataDirectory());
    exit(retCode);
}
/* UDataInfo cf. udata.h — header written into the output .dict file. */
static UDataInfo dataInfo = {
sizeof(UDataInfo),
0,                          /* reservedWord */
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,                          /* reservedByte */
{ 0x44, 0x69, 0x63, 0x74 }, /* dataFormat="Dict" */
{ 1, 0, 0, 0 },             /* format version */
{ 0, 0, 0, 0 }              /* data version */
};
// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
// Exactly one of the two builders is allocated, depending on the
// constructor's isBytesTrie flag; all other members configure the
// byte transform used only for the bytes-trie case.
// may want to put this somewhere in ICU, as it could be useful outside
// of this tool?
class DataDict {
private:
    BytesTrieBuilder *bt;       // owned; non-NULL iff building a BytesTrie
    UCharsTrieBuilder *ut;      // owned; non-NULL iff building a UCharsTrie
    UChar32 transformConstant;  // offset subtracted from code points by the offset transform
    int32_t transformType;      // DictionaryData::TRANSFORM_NONE or TRANSFORM_TYPE_OFFSET

    // This class owns raw builder pointers, so the compiler-generated copy
    // would double-delete them (Rule of Three). Declared private and left
    // undefined to forbid copying (pre-C++11 idiom).
    DataDict(const DataDict &other);
    DataDict &operator=(const DataDict &other);

public:
    // constructs a new data dictionary. if there is an error,
    // it will be returned in status
    // isBytesTrie != 0 will produce a BytesTrieBuilder,
    // isBytesTrie == 0 will produce a UCharsTrieBuilder
    DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL),
        transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
        if (isBytesTrie) {
            bt = new BytesTrieBuilder(status);
        } else {
            ut = new UCharsTrieBuilder(status);
        }
    }

    ~DataDict() {
        delete bt;
        delete ut;
    }

private:
    // Maps one code point to a single trie byte.
    // Under the offset transform: U+200D -> 0xFF and U+200C -> 0xFE
    // (reserved compatibility mappings), everything else must fall into
    // [transformConstant, transformConstant + 0xFD] or the tool exits.
    char transform(UChar32 c, UErrorCode &status) {
        if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
            if (c == 0x200D) { return (char)0xFF; }
            else if (c == 0x200C) { return (char)0xFE; }
            int32_t delta = c - transformConstant;
            if (delta < 0 || 0xFD < delta) {
                fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
                        (long)c, (long)transformConstant);
                exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
            }
            return (char)delta;
        } else { // no such transform type
            status = U_INTERNAL_PROGRAM_ERROR;
            return (char)c; // it should be noted this transform type will not generally work
        }
    }

    // Applies the single-code-point transform to every code point of word,
    // appending the resulting bytes to buf.
    void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
        UChar32 c = 0;
        int32_t len = word.length();
        for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
            c = word.char32At(i);
            buf.append(transform(c, errorCode), errorCode);
        }
    }

public:
    // sets the desired transformation data.
    // should be populated from a command line argument
    // so far the only acceptable format is offset-<hex constant>
    // eventually others (mask-<hex constant>?) may be enabled
    // more complex functions may be more difficult
    void setTransform(const char *t) {
        if (strncmp(t, "offset-", 7) == 0) {
            char *end;
            unsigned long base = uprv_strtoul(t + 7, &end, 16);
            // 0x10FF80 = 0x10FFFF - 0x7F: largest offset that still leaves
            // room for a full byte range below the top of Unicode.
            if (end == (t + 7) || *end != 0 || base > 0x10FF80) {
                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
            }
            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
            transformConstant = (UChar32)base;
        }
        else {
            fprintf(stderr, "Invalid transform specified: %s\n", t);
            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        }
    }

    // add a word to the trie
    void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
        if (bt) {
            CharString buf;
            transform(word, buf, status);
            bt->add(buf.toStringPiece(), value, status);
        }
        if (ut) { ut->add(word, value, status); }
    }

    // if we are a bytestrie, give back the StringPiece representing the serialized version of us
    StringPiece serializeBytes(UErrorCode &status) {
        return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
    }

    // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
    void serializeUChars(UnicodeString &s, UErrorCode &status) {
        ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
    }

    // Packs the transform type flag and its constant into the single
    // int32_t stored in the dictionary header (IX_TRANSFORM).
    int32_t getTransform() {
        return (int32_t)(transformType | transformConstant);
    }
};
static const UChar LINEFEED_CHARACTER = 0x000A;
static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;

// Reads the next line from f into fileLine as a read-only alias of the
// buffer's storage (no copy). Truncates at a '#' comment if present,
// otherwise strips trailing CR/LF; trailing whitespace is always removed.
// Returns FALSE at end of input or on error.
static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
    int32_t length;
    const UChar *start = ucbuf_readline(f, &length, errorCode);
    if (start == NULL || errorCode.isFailure()) {
        return FALSE;
    }
    // A comment swallows the rest of the line (including its terminator);
    // otherwise drop the CR/LF terminator explicitly.
    const UChar *hash = u_memchr(start, 0x23, length); // '#'
    if (hash != NULL) {
        length = (int32_t)(hash - start);
    } else {
        while (length > 0) {
            UChar last = start[length - 1];
            if (last != CARRIAGE_RETURN_CHARACTER && last != LINEFEED_CHARACTER) {
                break;
            }
            --length;
        }
    }
    while (length > 0 && u_isspace(start[length - 1])) {
        --length;
    }
    fileLine.setTo(FALSE, start, length); // FALSE: alias, do not copy
    return TRUE;
}
//----------------------------------------------------------------------------
//
// main for gendict
//
//----------------------------------------------------------------------------
int main(int argc, char **argv) {
//
// Pick up and check the command line arguments,
// using the standard ICU tool utils option handling.
//
U_MAIN_INIT_ARGS(argc, argv);
progName = argv[0];
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
if(argc<0) {
// Unrecognized option
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
// -? or -h for help.
usageAndDie(U_ZERO_ERROR);
}
UBool verbose = options[ARG_VERBOSE].doesOccur;
if (argc < 3) {
fprintf(stderr, "input and output file must both be specified.\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
const char *outFileName = argv[2];
const char *wordFileName = argv[1];
if (options[ARG_ICUDATADIR].doesOccur) {
u_setDataDirectory(options[ARG_ICUDATADIR].value);
}
const char *copyright = NULL;
if (options[ARG_COPYRIGHT].doesOccur) {
copyright = U_COPYRIGHT_STRING;
}
if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
fprintf(stderr, "you must specify exactly one type of trie to output!\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
UBool isBytesTrie = options[ARG_BYTES].doesOccur;
if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
IcuToolErrorCode status("gendict/main()");
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
UNewDataMemory *pData;
char msg[1024];
/* write message with just the name */ // potential for a buffer overflow here...
sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
fprintf(stderr, "%s\n", msg);
/* write the dummy data file */
pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &status);
udata_writeBlock(pData, msg, strlen(msg));
udata_finish(pData, &status);
return (int)status;
#else
// Read in the dictionary source file
if (verbose) { printf("Opening file %s...\n", wordFileName); }
const char *codepage = "UTF-8";
UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
if (status.isFailure()) {
fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
exit(status.reset());
}
if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
DataDict dict(isBytesTrie, status);
if (status.isFailure()) {
fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
exit(status.reset());
}
if (options[ARG_TRANSFORM].doesOccur) {
dict.setTransform(options[ARG_TRANSFORM].value);
}
UnicodeString fileLine;
if (verbose) { puts("Adding words to dictionary..."); }
UBool hasValues = FALSE;
UBool hasValuelessContents = FALSE;
int lineCount = 0;
UBool isOk = TRUE;
while (readLine(f, fileLine, status)) {
lineCount++;
if (fileLine.isEmpty()) continue;
// Parse word [spaces value].
int32_t keyLen;
for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
if (keyLen == 0) {
fprintf(stderr, "Error: no word on line %i!\n", lineCount);
isOk = FALSE;
continue;
}
int32_t valueStart;
for (valueStart = keyLen;
valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
++valueStart) {}
if (keyLen < valueStart) {
int32_t valueLength = fileLine.length() - valueStart;
if (valueLength > 15) {
fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
isOk = FALSE;
continue;
}
char s[16];
fileLine.extract(valueStart, valueLength, s, 16, US_INV);
char *end;
unsigned long value = uprv_strtoul(s, &end, 0);
if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) {
fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
isOk = FALSE;
continue;
}
dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
hasValues = TRUE;
} else {
dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
hasValuelessContents = FALSE;
}
if (status.isFailure()) {
fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
status.errorName(), lineCount);
exit(status.reset());
}
}
if (!isOk && status.isSuccess()) {
status.set(U_ILLEGAL_ARGUMENT_ERROR);
}
if (hasValues && hasValuelessContents) {
fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
}
if (verbose) { puts("Serializing data..."); }
int32_t outDataSize;
const void *outData;
UnicodeString usp;
if (isBytesTrie) {
StringPiece sp = dict.serializeBytes(status);
outDataSize = sp.size();
outData = sp.data();
} else {
dict.serializeUChars(usp, status);
outDataSize = usp.length() * U_SIZEOF_UCHAR;
outData = usp.getBuffer();
}
if (status.isFailure()) {
fprintf(stderr, "gendict: got failure of type %s while serializing\n", status.errorName());
exit(status.reset());
}
if (verbose) { puts("Opening output file..."); }
UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
exit(status.reset());
}
if (verbose) { puts("Writing to output file..."); }
int32_t indexes[DictionaryData::IX_COUNT] = {
DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
};
int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
indexes[DictionaryData::IX_TOTAL_SIZE] = size;
indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
if (hasValues) {
indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
}
indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, outData, outDataSize);
size_t bytesWritten = udata_finish(pData, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
exit(status.reset());
}
if (bytesWritten != (size_t)size) {
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
exit(U_INTERNAL_PROGRAM_ERROR);
}
puts("gendict: tool completed successfully.");
#ifdef TEST_GENDICT
if (isBytesTrie) {
BytesTrie::Iterator it(outData, outDataSize, status);
while (it.hasNext()) {
it.next(status);
const StringPiece s = it.getString();
int32_t val = it.getValue();
printf("%s -> %i\n", s.data(), val);
}
} else {
UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
while (it.hasNext()) {
it.next(status);
const UnicodeString s = it.getString();
int32_t val = it.getValue();
char tmp[1024];
s.extract(0, s.length(), tmp, 1024);
printf("%s -> %i\n", tmp, val);
}
}
#endif
return 0;
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}

View file

@ -84,7 +84,7 @@
<Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs>
</CustomBuildStep>
<Midl>
<TypeLibraryName>.\x86\Release/genctd.tlb</TypeLibraryName>
<TypeLibraryName>.\x86\Release/gendict.tlb</TypeLibraryName>
</Midl>
<ClCompile>
<AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
@ -94,7 +94,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking>
<DisableLanguageExtensions>true</DisableLanguageExtensions>
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
<PrecompiledHeaderOutputFile>.\x86\Release/genctd.pch</PrecompiledHeaderOutputFile>
<PrecompiledHeaderOutputFile>.\x86\Release/gendict.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>.\x86\Release/</AssemblerListingLocation>
<ObjectFileName>.\x86\Release/</ObjectFileName>
<ProgramDataBaseFileName>.\x86\Release/</ProgramDataBaseFileName>
@ -107,9 +107,9 @@
<Culture>0x0409</Culture>
</ResourceCompile>
<Link>
<OutputFile>.\x86\Release/genctd.exe</OutputFile>
<OutputFile>.\x86\Release/gendict.exe</OutputFile>
<SuppressStartupBanner>true</SuppressStartupBanner>
<ProgramDatabaseFile>.\x86\Release/genctd.pdb</ProgramDatabaseFile>
<ProgramDatabaseFile>.\x86\Release/gendict.pdb</ProgramDatabaseFile>
<SubSystem>Console</SubSystem>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<DataExecutionPrevention>
@ -123,7 +123,7 @@
<Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs>
</CustomBuildStep>
<Midl>
<TypeLibraryName>.\x86\Debug/genctd.tlb</TypeLibraryName>
<TypeLibraryName>.\x86\Debug/gendict.tlb</TypeLibraryName>
</Midl>
<ClCompile>
<Optimization>Disabled</Optimization>
@ -134,7 +134,7 @@
<BufferSecurityCheck>true</BufferSecurityCheck>
<DisableLanguageExtensions>true</DisableLanguageExtensions>
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
<PrecompiledHeaderOutputFile>.\x86\Debug/genctd.pch</PrecompiledHeaderOutputFile>
<PrecompiledHeaderOutputFile>.\x86\Debug/gendict.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>.\x86\Debug/</AssemblerListingLocation>
<ObjectFileName>.\x86\Debug/</ObjectFileName>
<ProgramDataBaseFileName>.\x86\Debug/</ProgramDataBaseFileName>
@ -149,10 +149,10 @@
<Culture>0x0409</Culture>
</ResourceCompile>
<Link>
<OutputFile>.\x86\Debug/genctd.exe</OutputFile>
<OutputFile>.\x86\Debug/gendict.exe</OutputFile>
<SuppressStartupBanner>true</SuppressStartupBanner>
<GenerateDebugInformation>true</GenerateDebugInformation>
<ProgramDatabaseFile>.\x86\Debug/genctd.pdb</ProgramDatabaseFile>
<ProgramDatabaseFile>.\x86\Debug/gendict.pdb</ProgramDatabaseFile>
<SubSystem>Console</SubSystem>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<DataExecutionPrevention>
@ -167,7 +167,7 @@
</CustomBuildStep>
<Midl>
<TargetEnvironment>X64</TargetEnvironment>
<TypeLibraryName>.\x64\Release/genctd.tlb</TypeLibraryName>
<TypeLibraryName>.\x64\Release/gendict.tlb</TypeLibraryName>
</Midl>
<ClCompile>
<AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
@ -177,7 +177,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking>
<DisableLanguageExtensions>true</DisableLanguageExtensions>
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
<PrecompiledHeaderOutputFile>.\x64\Release/genctd.pch</PrecompiledHeaderOutputFile>
<PrecompiledHeaderOutputFile>.\x64\Release/gendict.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>.\x64\Release/</AssemblerListingLocation>
<ObjectFileName>.\x64\Release/</ObjectFileName>
<ProgramDataBaseFileName>.\x64\Release/</ProgramDataBaseFileName>
@ -190,9 +190,9 @@
<Culture>0x0409</Culture>
</ResourceCompile>
<Link>
<OutputFile>.\x64\Release/genctd.exe</OutputFile>
<OutputFile>.\x64\Release/gendict.exe</OutputFile>
<SuppressStartupBanner>true</SuppressStartupBanner>
<ProgramDatabaseFile>.\x64\Release/genctd.pdb</ProgramDatabaseFile>
<ProgramDatabaseFile>.\x64\Release/gendict.pdb</ProgramDatabaseFile>
<SubSystem>Console</SubSystem>
<TargetMachine>MachineX64</TargetMachine>
</Link>
@ -205,7 +205,7 @@
</CustomBuildStep>
<Midl>
<TargetEnvironment>X64</TargetEnvironment>
<TypeLibraryName>.\x64\Debug/genctd.tlb</TypeLibraryName>
<TypeLibraryName>.\x64\Debug/gendict.tlb</TypeLibraryName>
</Midl>
<ClCompile>
<Optimization>Disabled</Optimization>
@ -216,7 +216,7 @@
<BufferSecurityCheck>true</BufferSecurityCheck>
<DisableLanguageExtensions>true</DisableLanguageExtensions>
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
<PrecompiledHeaderOutputFile>.\x64\Debug/genctd.pch</PrecompiledHeaderOutputFile>
<PrecompiledHeaderOutputFile>.\x64\Debug/gendict.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>.\x64\Debug/</AssemblerListingLocation>
<ObjectFileName>.\x64\Debug/</ObjectFileName>
<ProgramDataBaseFileName>.\x64\Debug/</ProgramDataBaseFileName>
@ -231,16 +231,16 @@
<Culture>0x0409</Culture>
</ResourceCompile>
<Link>
<OutputFile>.\x64\Debug/genctd.exe</OutputFile>
<OutputFile>.\x64\Debug/gendict.exe</OutputFile>
<SuppressStartupBanner>true</SuppressStartupBanner>
<GenerateDebugInformation>true</GenerateDebugInformation>
<ProgramDatabaseFile>.\x64\Debug/genctd.pdb</ProgramDatabaseFile>
<ProgramDatabaseFile>.\x64\Debug/gendict.pdb</ProgramDatabaseFile>
<SubSystem>Console</SubSystem>
<TargetMachine>MachineX64</TargetMachine>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="genctd.cpp" />
<ClCompile Include="gendict.cpp" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\common\common.vcxproj">

View file

@ -2,21 +2,21 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{13ddeaaf-33bc-4f07-a772-cd365dd75257}</UniqueIdentifier>
<UniqueIdentifier>{570fb8ae-ac18-467d-8502-470a241a60d4}</UniqueIdentifier>
<Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{259ce86d-ab79-4867-b42f-d114c3b8ed6e}</UniqueIdentifier>
<UniqueIdentifier>{7b2185f2-4ff9-4419-b596-0a21e37414c9}</UniqueIdentifier>
<Extensions>h;hpp;hxx;hm;inl</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{3b1a7423-5627-4cf4-a0d5-29ad34d9e5ac}</UniqueIdentifier>
<UniqueIdentifier>{1dc5e7e3-4d1b-4031-a31f-c39b3a3e283a}</UniqueIdentifier>
<Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="genctd.cpp">
<ClCompile Include="gendict.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>
</Project>

View file

@ -54,8 +54,8 @@
#include "sprpimpl.h"
#include "propname.h"
#include "rbbidata.h"
#include "triedict.h"
#include "utrie2.h"
#include "dictionarydata.h"
/* swapping implementations in i18n */
@ -734,7 +734,7 @@ static const struct {
#endif
#if !UCONFIG_NO_BREAK_ITERATION
{ { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */
{ { 0x54, 0x72, 0x44, 0x63 }, triedict_swap }, /* dataFormat="TrDc " */
{ { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */
#endif
{ { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */