mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-9353 merge dbbi-tries work into the trunk
X-SVN-Rev: 32184
This commit is contained in:
parent
8bcdfa544d
commit
c64c0299d7
43 changed files with 328856 additions and 2877 deletions
4
.gitattributes
vendored
4
.gitattributes
vendored
|
@ -187,8 +187,8 @@ icu4c/source/tools/gencmn/gencmn.vcxproj -text
|
|||
icu4c/source/tools/gencmn/gencmn.vcxproj.filters -text
|
||||
icu4c/source/tools/gencnval/gencnval.vcxproj -text
|
||||
icu4c/source/tools/gencnval/gencnval.vcxproj.filters -text
|
||||
icu4c/source/tools/genctd/genctd.vcxproj -text
|
||||
icu4c/source/tools/genctd/genctd.vcxproj.filters -text
|
||||
icu4c/source/tools/gendict/gendict.vcxproj -text
|
||||
icu4c/source/tools/gendict/gendict.vcxproj.filters -text
|
||||
icu4c/source/tools/gennorm2/gennorm2.vcxproj -text
|
||||
icu4c/source/tools/genrb/derb.vcxproj -text
|
||||
icu4c/source/tools/genrb/derb.vcxproj.filters -text
|
||||
|
|
15
.gitignore
vendored
15
.gitignore
vendored
|
@ -709,21 +709,6 @@ icu4c/source/tools/gencnval/gencnval.vcproj.*.*.user
|
|||
icu4c/source/tools/gencnval/release
|
||||
icu4c/source/tools/gencnval/x64
|
||||
icu4c/source/tools/gencnval/x86
|
||||
icu4c/source/tools/genctd/*.d
|
||||
icu4c/source/tools/genctd/*.o
|
||||
icu4c/source/tools/genctd/*.pdb
|
||||
icu4c/source/tools/genctd/*.plg
|
||||
icu4c/source/tools/genctd/*.vcxproj.user
|
||||
icu4c/source/tools/genctd/Debug
|
||||
icu4c/source/tools/genctd/Makefile
|
||||
icu4c/source/tools/genctd/Release
|
||||
icu4c/source/tools/genctd/debug
|
||||
icu4c/source/tools/genctd/genctd
|
||||
icu4c/source/tools/genctd/genctd.1
|
||||
icu4c/source/tools/genctd/genctd.vcproj.*.*.user
|
||||
icu4c/source/tools/genctd/release
|
||||
icu4c/source/tools/genctd/x64
|
||||
icu4c/source/tools/genctd/x86
|
||||
icu4c/source/tools/gennorm2/*.d
|
||||
icu4c/source/tools/gennorm2/*.o
|
||||
icu4c/source/tools/gennorm2/*.pdb
|
||||
|
|
|
@ -52,7 +52,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iotest", "..\test\iotest\io
|
|||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icupkg", "..\tools\icupkg\icupkg.vcxproj", "{62D4B15D-7A90-4ECB-BA19-5E021D6A21BC}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "genctd", "..\tools\genctd\genctd.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
|
||||
Project("{9D4211F7-2C77-439C-82F0-30A4E43BA569}") = "gendict", "..\tools\gendict\gendict.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "letest", "..\test\letest\letest.vcxproj", "{67351485-4D18-4245-BE39-A7EF0675ACD2}"
|
||||
EndProject
|
||||
|
|
|
@ -90,6 +90,7 @@ bytestream.o stringpiece.o \
|
|||
stringtriebuilder.o bytestriebuilder.o \
|
||||
bytestrie.o bytestrieiterator.o \
|
||||
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
|
||||
dictionarydata.o \
|
||||
appendable.o ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
|
||||
utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
|
||||
unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
|
||||
|
@ -98,7 +99,7 @@ chariter.o schriter.o uchriter.o uiter.o \
|
|||
patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
|
||||
uscript.o usc_impl.o unames.o \
|
||||
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
|
||||
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
|
||||
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
|
||||
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
|
||||
uidna.o usprep.o uts46.o punycode.o \
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
************************************************************************************
|
||||
* Copyright (C) 2006-2011, International Business Machines Corporation
|
||||
* Copyright (C) 2006-2012, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
************************************************************************************
|
||||
*/
|
||||
|
@ -11,7 +11,6 @@
|
|||
|
||||
#include "brkeng.h"
|
||||
#include "dictbe.h"
|
||||
#include "triedict.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
|
@ -20,6 +19,9 @@
|
|||
#include "unicode/putil.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "dictionarydata.h"
|
||||
#include "uvector.h"
|
||||
#include "umutex.h"
|
||||
#include "uresimp.h"
|
||||
|
@ -219,21 +221,45 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
UScriptCode code = uscript_getScript(c, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType);
|
||||
if (dict != NULL) {
|
||||
DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
|
||||
if (m != NULL) {
|
||||
const LanguageBreakEngine *engine = NULL;
|
||||
switch(code) {
|
||||
case USCRIPT_THAI:
|
||||
engine = new ThaiBreakEngine(dict, status);
|
||||
engine = new ThaiBreakEngine(m, status);
|
||||
break;
|
||||
case USCRIPT_KHMER:
|
||||
engine = new KhmerBreakEngine(dict, status);
|
||||
engine = new KhmerBreakEngine(m, status);
|
||||
break;
|
||||
|
||||
case USCRIPT_HANGUL:
|
||||
engine = new CjkBreakEngine(m, kKorean, status);
|
||||
break;
|
||||
|
||||
// use same BreakEngine and dictionary for both Chinese and Japanese
|
||||
case USCRIPT_HIRAGANA:
|
||||
case USCRIPT_KATAKANA:
|
||||
case USCRIPT_HAN:
|
||||
engine = new CjkBreakEngine(m, kChineseJapanese, status);
|
||||
break;
|
||||
#if 0
|
||||
// TODO: Have to get some characters with script=common handled
|
||||
// by CjkBreakEngine (e.g. U+309B). Simply subjecting
|
||||
// them to CjkBreakEngine does not work. The engine has to
|
||||
// special-case them.
|
||||
case USCRIPT_COMMON:
|
||||
{
|
||||
UBlockCode block = ublock_getCode(code);
|
||||
if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
|
||||
engine = new CjkBreakEngine(dict, kChineseJapanese, status);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (engine == NULL) {
|
||||
delete dict;
|
||||
delete m;
|
||||
}
|
||||
else if (U_FAILURE(status)) {
|
||||
delete engine;
|
||||
|
@ -245,45 +271,61 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
const CompactTrieDictionary *
|
||||
ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
|
||||
DictionaryMatcher *
|
||||
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// Open root from brkitr tree.
|
||||
char dictnbuff[256];
|
||||
char ext[4]={'\0'};
|
||||
|
||||
// open root from brkitr tree.
|
||||
char dictnbuf[256];
|
||||
char ext[6] = {'\0'};
|
||||
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
|
||||
b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
|
||||
b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
|
||||
int32_t dictnlength = 0;
|
||||
const UChar *dictfname = ures_getString(b, &dictnlength, &status);
|
||||
if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
|
||||
if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuf)) {
|
||||
dictnlength = 0;
|
||||
status = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
if (U_SUCCESS(status) && dictfname) {
|
||||
UChar* extStart=u_strchr(dictfname, 0x002e);
|
||||
UChar *extStart = u_strchr(dictfname, 0x002e);
|
||||
int len = 0;
|
||||
if(extStart!=NULL){
|
||||
len = (int)(extStart-dictfname);
|
||||
u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
|
||||
u_UCharsToChars(dictfname, dictnbuff, len);
|
||||
if (extStart != NULL) {
|
||||
len = (int)(extStart - dictfname);
|
||||
u_UCharsToChars(extStart+1, ext, sizeof(ext)); // null-terminates the buffer
|
||||
u_UCharsToChars(dictfname, dictnbuf, len);
|
||||
}
|
||||
dictnbuff[len]=0; // nul terminate
|
||||
dictnbuf[len] = '\0'; // null-terminate
|
||||
}
|
||||
ures_close(b);
|
||||
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
|
||||
|
||||
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuf, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
const CompactTrieDictionary *dict = new CompactTrieDictionary(
|
||||
file, status);
|
||||
if (U_SUCCESS(status) && dict == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
// build trie
|
||||
const uint8_t *data = (const uint8_t *)udata_getMemory(file);
|
||||
const int32_t *indexes = (const int32_t *)data;
|
||||
const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
|
||||
const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
|
||||
DictionaryMatcher *m = NULL;
|
||||
if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
|
||||
const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
|
||||
const char *characters = (const char *)(data + offset);
|
||||
m = new BytesDictionaryMatcher(characters, transform, file);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
delete dict;
|
||||
dict = NULL;
|
||||
else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
|
||||
const UChar *characters = (const UChar *)(data + offset);
|
||||
m = new UCharsDictionaryMatcher(characters, file);
|
||||
}
|
||||
return dict;
|
||||
if (m == NULL) {
|
||||
// no matcher exists to take ownership - either we are an invalid
|
||||
// type or memory allocation failed
|
||||
udata_close(file);
|
||||
}
|
||||
return m;
|
||||
} else if (dictfname != NULL) {
|
||||
// we don't have a dictionary matcher.
|
||||
// returning NULL here will cause us to fail to find a dictionary break engine, as expected
|
||||
status = U_ZERO_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
************************************************************************************
|
||||
* Copyright (C) 2006-2007, International Business Machines Corporation and others. *
|
||||
* Copyright (C) 2006-2012, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
************************************************************************************
|
||||
*/
|
||||
|
@ -17,7 +17,7 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
class UnicodeSet;
|
||||
class UStack;
|
||||
class CompactTrieDictionary;
|
||||
class DictionaryMatcher;
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakEngine
|
||||
|
@ -259,8 +259,7 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
|
|||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
|
||||
|
||||
protected:
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Create a LanguageBreakEngine for the set of characters to which
|
||||
* the supplied character belongs, for the specified break type.</p>
|
||||
|
@ -273,17 +272,15 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
|
|||
*/
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
|
||||
|
||||
/**
|
||||
* <p>Create a CompactTrieDictionary for the specified script and break type.</p>
|
||||
*
|
||||
* @param script An ISO 15924 script code that identifies the dictionary to be
|
||||
* created.
|
||||
* @param breakType The kind of text break for which a dictionary is
|
||||
* sought.
|
||||
* @return A CompactTrieDictionary with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
|
||||
|
||||
/**
|
||||
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
|
||||
* @param script An ISO 15924 script code that identifies the dictionary to be
|
||||
* created.
|
||||
* @param breakType The kind of text break for which a dictionary is
|
||||
* sought.
|
||||
* @return A DictionaryMatcher with the desired characteristics, or NULL.
|
||||
*/
|
||||
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -248,7 +248,7 @@
|
|||
<ClCompile Include="rbbisetb.cpp" />
|
||||
<ClCompile Include="rbbistbl.cpp" />
|
||||
<ClCompile Include="rbbitblb.cpp" />
|
||||
<ClCompile Include="triedict.cpp" />
|
||||
<ClCompile Include="dictionarydata.cpp" />
|
||||
<ClCompile Include="ubrk.cpp" />
|
||||
<ClCompile Include="ucol_swp.cpp">
|
||||
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
|
@ -520,7 +520,7 @@
|
|||
<ClInclude Include="rbbiscan.h" />
|
||||
<ClInclude Include="rbbisetb.h" />
|
||||
<ClInclude Include="rbbitblb.h" />
|
||||
<ClInclude Include="triedict.h" />
|
||||
<ClInclude Include="dictionarydata.h" />
|
||||
<CustomBuild Include="unicode\ubrk.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006-2008,2011, International Business Machines Corporation *
|
||||
* Copyright (C) 2006-2008,2012, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,7 +15,10 @@
|
|||
#include "unicode/chariter.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "uvector.h"
|
||||
#include "triedict.h"
|
||||
#include "uassert.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictionarydata.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -23,10 +26,6 @@ U_NAMESPACE_BEGIN
|
|||
******************************************************************
|
||||
*/
|
||||
|
||||
/*DictionaryBreakEngine::DictionaryBreakEngine() {
|
||||
fTypes = 0;
|
||||
}*/
|
||||
|
||||
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
|
||||
fTypes = breakTypes;
|
||||
}
|
||||
|
@ -87,11 +86,6 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
|
|||
fSet.compact();
|
||||
}
|
||||
|
||||
/*void
|
||||
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
|
||||
fTypes = breakTypes;
|
||||
}*/
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
@ -105,34 +99,34 @@ DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
|
|||
#define POSSIBLE_WORD_LIST_MAX 20
|
||||
|
||||
class PossibleWord {
|
||||
private:
|
||||
// list of word candidate lengths, in increasing length order
|
||||
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
|
||||
int count; // Count of candidates
|
||||
int32_t prefix; // The longest match with a dictionary word
|
||||
int32_t offset; // Offset in the text of these candidates
|
||||
int mark; // The preferred candidate's offset
|
||||
int current; // The candidate we're currently looking at
|
||||
private:
|
||||
// list of word candidate lengths, in increasing length order
|
||||
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
|
||||
int count; // Count of candidates
|
||||
int32_t prefix; // The longest match with a dictionary word
|
||||
int32_t offset; // Offset in the text of these candidates
|
||||
int mark; // The preferred candidate's offset
|
||||
int current; // The candidate we're currently looking at
|
||||
|
||||
public:
|
||||
PossibleWord();
|
||||
~PossibleWord();
|
||||
public:
|
||||
PossibleWord();
|
||||
~PossibleWord();
|
||||
|
||||
// Fill the list of candidates if needed, select the longest, and return the number found
|
||||
int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
|
||||
// Fill the list of candidates if needed, select the longest, and return the number found
|
||||
int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
|
||||
|
||||
// Select the currently marked candidate, point after it in the text, and invalidate self
|
||||
int32_t acceptMarked( UText *text );
|
||||
// Select the currently marked candidate, point after it in the text, and invalidate self
|
||||
int32_t acceptMarked( UText *text );
|
||||
|
||||
// Back up from the current candidate to the next shorter one; return TRUE if that exists
|
||||
// and point the text after it
|
||||
UBool backUp( UText *text );
|
||||
// Back up from the current candidate to the next shorter one; return TRUE if that exists
|
||||
// and point the text after it
|
||||
UBool backUp( UText *text );
|
||||
|
||||
// Return the longest prefix this candidate location shares with a dictionary word
|
||||
int32_t longestPrefix();
|
||||
// Return the longest prefix this candidate location shares with a dictionary word
|
||||
int32_t longestPrefix();
|
||||
|
||||
// Mark the current candidate as the one we like
|
||||
void markCurrent();
|
||||
// Mark the current candidate as the one we like
|
||||
void markCurrent();
|
||||
};
|
||||
|
||||
inline
|
||||
|
@ -145,7 +139,7 @@ PossibleWord::~PossibleWord() {
|
|||
}
|
||||
|
||||
inline int
|
||||
PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
|
||||
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
|
||||
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
|
||||
int32_t start = (int32_t)utext_getNativeIndex(text);
|
||||
if (start != offset) {
|
||||
|
@ -211,7 +205,7 @@ PossibleWord::markCurrent() {
|
|||
// Minimum number of characters for two words
|
||||
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
|
||||
|
||||
ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
|
||||
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
|
@ -266,10 +260,9 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
|
||||
// If we found exactly one, use that
|
||||
if (candidates == 1) {
|
||||
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// If there was more than one, see which one can take us forward the most words
|
||||
else if (candidates > 1) {
|
||||
// If we're already at the end of the range, we're done
|
||||
|
@ -278,7 +271,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
}
|
||||
do {
|
||||
int wordsMatched = 1;
|
||||
if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
|
||||
if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
|
||||
if (wordsMatched < 2) {
|
||||
// Followed by another dictionary word; mark first word as a good candidate
|
||||
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
|
||||
|
@ -293,17 +286,17 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
|
||||
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
|
||||
if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
|
||||
words[wordsFound % THAI_LOOKAHEAD].markCurrent();
|
||||
goto foundBest;
|
||||
}
|
||||
}
|
||||
while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
|
||||
while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
|
||||
}
|
||||
}
|
||||
while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
|
||||
while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
|
||||
foundBest:
|
||||
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
|
@ -316,7 +309,7 @@ foundBest:
|
|||
// if it is a dictionary word, do nothing. If it isn't, then if there is
|
||||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
&& (wordLength == 0
|
||||
|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
|
@ -339,8 +332,8 @@ foundBest:
|
|||
// two characters after uc were not 0x0E4C THANTHAKHAT before
|
||||
// checking the dictionary. That is just a performance filter,
|
||||
// but it's not clear it's faster than checking the trie.
|
||||
int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
utext_setNativeIndex(text, current+wordLength+chars);
|
||||
int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
utext_setNativeIndex(text, current + wordLength + chars);
|
||||
if (candidates > 0) {
|
||||
break;
|
||||
}
|
||||
|
@ -438,8 +431,8 @@ foundBest:
|
|||
// Minimum number of characters for two words
|
||||
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
|
||||
|
||||
KhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
|
||||
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
|
@ -511,10 +504,10 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
}
|
||||
do {
|
||||
int wordsMatched = 1;
|
||||
if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
|
||||
if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
|
||||
if (wordsMatched < 2) {
|
||||
// Followed by another dictionary word; mark first word as a good candidate
|
||||
words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
|
||||
words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
|
||||
wordsMatched = 2;
|
||||
}
|
||||
|
||||
|
@ -526,17 +519,17 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
|
||||
words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
|
||||
if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
|
||||
words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
|
||||
goto foundBest;
|
||||
}
|
||||
}
|
||||
while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text));
|
||||
while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
|
||||
}
|
||||
}
|
||||
while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text));
|
||||
while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
|
||||
foundBest:
|
||||
wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
|
||||
wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
|
@ -549,9 +542,9 @@ foundBest:
|
|||
// if it is a dictionary word, do nothing. If it isn't, then if there is
|
||||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
&& (wordLength == 0
|
||||
|| words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
|
||||
|| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
//TODO: This section will need a rework for UText.
|
||||
int32_t remaining = rangeEnd - (current+wordLength);
|
||||
|
@ -568,7 +561,7 @@ foundBest:
|
|||
}
|
||||
if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
|
||||
// Maybe. See if it's in the dictionary.
|
||||
int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
utext_setNativeIndex(text, current+wordLength+chars);
|
||||
if (candidates > 0) {
|
||||
break;
|
||||
|
@ -651,6 +644,296 @@ foundBest:
|
|||
return wordsFound;
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
* CjkBreakEngine
|
||||
*/
|
||||
static const uint32_t kuint32max = 0xFFFFFFFF;
|
||||
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
|
||||
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
|
||||
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
|
||||
fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
|
||||
fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
|
||||
|
||||
if (U_SUCCESS(status)) {
|
||||
// handle Korean and Japanese/Chinese using different dictionaries
|
||||
if (type == kKorean) {
|
||||
setCharacters(fHangulWordSet);
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet;
|
||||
cjSet.addAll(fHanWordSet);
|
||||
cjSet.addAll(fKatakanaWordSet);
|
||||
cjSet.addAll(fHiraganaWordSet);
|
||||
cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));
|
||||
setCharacters(cjSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CjkBreakEngine::~CjkBreakEngine(){
|
||||
delete fDictionary;
|
||||
}
|
||||
|
||||
// The katakanaCost values below are based on the length frequencies of all
|
||||
// katakana phrases in the dictionary
|
||||
static const int kMaxKatakanaLength = 8;
|
||||
static const int kMaxKatakanaGroupLength = 20;
|
||||
static const uint32_t maxSnlp = 255;
|
||||
|
||||
static inline uint32_t getKatakanaCost(int wordLength){
|
||||
//TODO: fill array with actual values from dictionary!
|
||||
static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
|
||||
= {8192, 984, 408, 240, 204, 252, 300, 372, 480};
|
||||
return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
|
||||
}
|
||||
|
||||
static inline bool isKatakana(uint16_t value) {
|
||||
return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
|
||||
(value >= 0xFF66u && value <= 0xFF9fu);
|
||||
}
|
||||
|
||||
// A very simple helper class to streamline the buffer handling in
|
||||
// divideUpDictionaryRange.
|
||||
template<class T, size_t N>
|
||||
class AutoBuffer {
|
||||
public:
|
||||
AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
|
||||
if (size > N) {
|
||||
buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
|
||||
capacity = size;
|
||||
}
|
||||
}
|
||||
~AutoBuffer() {
|
||||
if (buffer != stackBuffer)
|
||||
uprv_free(buffer);
|
||||
}
|
||||
|
||||
T* elems() {
|
||||
return buffer;
|
||||
}
|
||||
|
||||
const T& operator[] (size_t i) const {
|
||||
return buffer[i];
|
||||
}
|
||||
|
||||
T& operator[] (size_t i) {
|
||||
return buffer[i];
|
||||
}
|
||||
|
||||
// resize without copy
|
||||
void resize(size_t size) {
|
||||
if (size <= capacity)
|
||||
return;
|
||||
if (buffer != stackBuffer)
|
||||
uprv_free(buffer);
|
||||
buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
|
||||
capacity = size;
|
||||
}
|
||||
|
||||
private:
|
||||
T stackBuffer[N];
|
||||
T* buffer;
|
||||
AutoBuffer();
|
||||
size_t capacity;
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
int32_t
|
||||
CjkBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
if (rangeStart >= rangeEnd) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const size_t defaultInputLength = 80;
|
||||
size_t inputLength = rangeEnd - rangeStart;
|
||||
// TODO: Replace by UnicodeString.
|
||||
AutoBuffer<UChar, defaultInputLength> charString(inputLength);
|
||||
|
||||
// Normalize the input string and put it in normalizedText.
|
||||
// The map from the indices of the normalized input to the raw
|
||||
// input is kept in charPositions.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
UnicodeString inputString(charString.elems(), inputLength);
|
||||
UNormalizationMode norm_mode = UNORM_NFKC;
|
||||
UBool isNormalized =
|
||||
Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
|
||||
Normalizer::isNormalized(inputString, norm_mode, status);
|
||||
|
||||
// TODO: Replace by UVector32.
|
||||
AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
|
||||
int numChars = 0;
|
||||
UText normalizedText = UTEXT_INITIALIZER;
|
||||
// Needs to be declared here because normalizedText holds onto its buffer.
|
||||
UnicodeString normalizedString;
|
||||
if (isNormalized) {
|
||||
int32_t index = 0;
|
||||
charPositions[0] = 0;
|
||||
while(index < inputString.length()) {
|
||||
index = inputString.moveIndex32(index, 1);
|
||||
charPositions[++numChars] = index;
|
||||
}
|
||||
utext_openUnicodeString(&normalizedText, &inputString, &status);
|
||||
}
|
||||
else {
|
||||
Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
charPositions.resize(normalizedString.length() + 1);
|
||||
Normalizer normalizer(charString.elems(), inputLength, norm_mode);
|
||||
int32_t index = 0;
|
||||
charPositions[0] = 0;
|
||||
while(index < normalizer.endIndex()){
|
||||
UChar32 uc = normalizer.next();
|
||||
charPositions[++numChars] = index = normalizer.getIndex();
|
||||
}
|
||||
utext_openUnicodeString(&normalizedText, &normalizedString, &status);
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// From this point on, all the indices refer to the indices of
|
||||
// the normalized input string.
|
||||
|
||||
// bestSnlp[i] is the snlp of the best segmentation of the first i
|
||||
// characters in the range to be matched.
|
||||
// TODO: Replace by UVector32.
|
||||
AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
|
||||
bestSnlp[0] = 0;
|
||||
for(int i = 1; i <= numChars; i++) {
|
||||
bestSnlp[i] = kuint32max;
|
||||
}
|
||||
|
||||
// prev[i] is the index of the last CJK character in the previous word in
|
||||
// the best segmentation of the first i characters.
|
||||
// TODO: Replace by UVector32.
|
||||
AutoBuffer<int, defaultInputLength> prev(numChars + 1);
|
||||
for(int i = 0; i <= numChars; i++){
|
||||
prev[i] = -1;
|
||||
}
|
||||
|
||||
const size_t maxWordSize = 20;
|
||||
// TODO: Replace both with UVector32.
|
||||
AutoBuffer<int32_t, maxWordSize> values(numChars);
|
||||
AutoBuffer<int32_t, maxWordSize> lengths(numChars);
|
||||
|
||||
// Dynamic programming to find the best segmentation.
|
||||
bool is_prev_katakana = false;
|
||||
for (int i = 0; i < numChars; ++i) {
|
||||
//utext_setNativeIndex(text, rangeStart + i);
|
||||
utext_setNativeIndex(&normalizedText, i);
|
||||
if (bestSnlp[i] == kuint32max)
|
||||
continue;
|
||||
|
||||
int count;
|
||||
// limit maximum word length matched to size of current substring
|
||||
int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
|
||||
|
||||
fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
|
||||
|
||||
// if there are no single character matches found in the dictionary
|
||||
// starting with this charcter, treat character as a 1-character word
|
||||
// with the highest value possible, i.e. the least likely to occur.
|
||||
// Exclude Korean characters from this treatment, as they should be left
|
||||
// together by default.
|
||||
if((count == 0 || lengths[0] != 1) &&
|
||||
!fHangulWordSet.contains(utext_current32(&normalizedText))) {
|
||||
values[count] = maxSnlp;
|
||||
lengths[count++] = 1;
|
||||
}
|
||||
|
||||
for (int j = 0; j < count; j++) {
|
||||
uint32_t newSnlp = bestSnlp[i] + values[j];
|
||||
if (newSnlp < bestSnlp[lengths[j] + i]) {
|
||||
bestSnlp[lengths[j] + i] = newSnlp;
|
||||
prev[lengths[j] + i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// In Japanese,
|
||||
// Katakana word in single character is pretty rare. So we apply
|
||||
// the following heuristic to Katakana: any continuous run of Katakana
|
||||
// characters is considered a candidate word with a default cost
|
||||
// specified in the katakanaCost table according to its length.
|
||||
//utext_setNativeIndex(text, rangeStart + i);
|
||||
utext_setNativeIndex(&normalizedText, i);
|
||||
bool is_katakana = isKatakana(utext_current32(&normalizedText));
|
||||
if (!is_prev_katakana && is_katakana) {
|
||||
int j = i + 1;
|
||||
utext_next32(&normalizedText);
|
||||
// Find the end of the continuous run of Katakana characters
|
||||
while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
|
||||
isKatakana(utext_current32(&normalizedText))) {
|
||||
utext_next32(&normalizedText);
|
||||
++j;
|
||||
}
|
||||
if ((j - i) < kMaxKatakanaGroupLength) {
|
||||
uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
|
||||
if (newSnlp < bestSnlp[j]) {
|
||||
bestSnlp[j] = newSnlp;
|
||||
prev[j] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
is_prev_katakana = is_katakana;
|
||||
}
|
||||
|
||||
// Start pushing the optimal offset index into t_boundary (t for tentative).
|
||||
// prev[numChars] is guaranteed to be meaningful.
|
||||
// We'll first push in the reverse order, i.e.,
|
||||
// t_boundary[0] = numChars, and afterwards do a swap.
|
||||
// TODO: Replace by UVector32.
|
||||
AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
|
||||
|
||||
int numBreaks = 0;
|
||||
// No segmentation found, set boundary to end of range
|
||||
if (bestSnlp[numChars] == kuint32max) {
|
||||
t_boundary[numBreaks++] = numChars;
|
||||
} else {
|
||||
for (int i = numChars; i > 0; i = prev[i]) {
|
||||
t_boundary[numBreaks++] = i;
|
||||
}
|
||||
U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
|
||||
}
|
||||
|
||||
// Reverse offset index in t_boundary.
|
||||
// Don't add a break for the start of the dictionary range if there is one
|
||||
// there already.
|
||||
if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
|
||||
t_boundary[numBreaks++] = 0;
|
||||
}
|
||||
|
||||
// Now that we're done, convert positions in t_bdry[] (indices in
|
||||
// the normalized input string) back to indices in the raw input string
|
||||
// while reversing t_bdry and pushing values to foundBreaks.
|
||||
for (int i = numBreaks-1; i >= 0; i--) {
|
||||
foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
|
||||
}
|
||||
|
||||
utext_close(&normalizedText);
|
||||
return numBreaks;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006,2011, International Business Machines Corporation *
|
||||
* Copyright (C) 2006,2012, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -16,7 +16,7 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class TrieWordDictionary;
|
||||
class DictionaryMatcher;
|
||||
|
||||
/*******************************************************************
|
||||
* DictionaryBreakEngine
|
||||
|
@ -65,31 +65,31 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
*/
|
||||
virtual ~DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param breakType The type of text break which the caller wants to determine
|
||||
* @return TRUE if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param breakType The type of text break which the caller wants to determine
|
||||
* @return TRUE if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles( UChar32 c, int32_t breakType ) const;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left at the end of the run of characters which the engine
|
||||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks An allocated C array of the breaks found, if any
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The iterator is left at
|
||||
* the end of the run of characters which the engine is capable of handling
|
||||
* that starts from the first (or last) character in the range.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks An allocated C array of the breaks found, if any
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
|
@ -114,7 +114,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
// virtual void setBreakTypes( uint32_t breakTypes );
|
||||
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
|
@ -135,7 +135,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
|
||||
/**
|
||||
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
|
||||
* dictionary and heuristics to determine Thai-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a ThaiBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
|
@ -152,17 +152,17 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fSuffixSet;
|
||||
UnicodeSet fMarkSet;
|
||||
const TrieWordDictionary *fDictionary;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
|
||||
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
|
@ -171,7 +171,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
|
@ -186,6 +186,66 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* CjkBreakEngine
|
||||
*/
|
||||
|
||||
//indicates language/script that the CjkBreakEngine will handle
|
||||
enum LanguageType {
|
||||
kKorean,
|
||||
kChineseJapanese
|
||||
};
|
||||
|
||||
/**
|
||||
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* dictionary with costs associated with each word and
|
||||
* Viterbi decoding to determine CJK-specific breaks.</p>
|
||||
*/
|
||||
class CjkBreakEngine : public DictionaryBreakEngine {
|
||||
protected:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
UnicodeSet fHangulWordSet;
|
||||
UnicodeSet fHanWordSet;
|
||||
UnicodeSet fKatakanaWordSet;
|
||||
UnicodeSet fHiraganaWordSet;
|
||||
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
||||
* engine is deleted. The DictionaryMatcher must contain costs for each word
|
||||
* in order for the dictionary to work properly.
|
||||
*/
|
||||
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~CjkBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* KhmerBreakEngine
|
||||
|
@ -209,7 +269,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
const TrieWordDictionary *fDictionary;
|
||||
DictionaryMatcher *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -219,7 +279,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
*/
|
||||
KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
|
||||
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
|
|
218
icu4c/source/common/dictionarydata.cpp
Normal file
218
icu4c/source/common/dictionarydata.cpp
Normal file
|
@ -0,0 +1,218 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* dictionarydata.h
|
||||
*
|
||||
* created on: 2012may31
|
||||
* created by: Markus W. Scherer & Maxime Serrano
|
||||
*/
|
||||
|
||||
#include "dictionarydata.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
|
||||
udata_close(file);
|
||||
}
|
||||
|
||||
int32_t UCharsDictionaryMatcher::getType() const {
|
||||
return DictionaryData::TRIE_TYPE_UCHARS;
|
||||
}
|
||||
|
||||
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int *lengths, int &count, int limit, int32_t *values) const {
|
||||
UCharsTrie uct(characters);
|
||||
UChar32 c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
return 0;
|
||||
}
|
||||
UStringTrieResult result = uct.first(c);
|
||||
int32_t numChars = 1;
|
||||
count = 0;
|
||||
for (;;) {
|
||||
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||||
if (count < limit) {
|
||||
if (values != NULL) {
|
||||
values[count] = uct.getValue();
|
||||
}
|
||||
lengths[count++] = numChars;
|
||||
}
|
||||
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (result == USTRINGTRIE_NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: why do we have a text limit if the UText knows its length?
|
||||
if (numChars >= maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
++numChars;
|
||||
result = uct.next(c);
|
||||
}
|
||||
return numChars;
|
||||
}
|
||||
|
||||
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
|
||||
udata_close(file);
|
||||
}
|
||||
|
||||
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
|
||||
if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
|
||||
if (c == 0x200D) {
|
||||
return 0xFF;
|
||||
} else if (c == 0x200C) {
|
||||
return 0xFE;
|
||||
}
|
||||
int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
|
||||
if (delta < 0 || 0xFD < delta) {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
return (UChar32)delta;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
int32_t BytesDictionaryMatcher::getType() const {
|
||||
return DictionaryData::TRIE_TYPE_BYTES;
|
||||
}
|
||||
|
||||
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int *lengths, int &count, int limit, int32_t *values) const {
|
||||
BytesTrie bt(characters);
|
||||
UChar32 c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
return 0;
|
||||
}
|
||||
UStringTrieResult result = bt.first(transform(c));
|
||||
int32_t numChars = 1;
|
||||
count = 0;
|
||||
for (;;) {
|
||||
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||||
if (count < limit) {
|
||||
if (values != NULL) {
|
||||
values[count] = bt.getValue();
|
||||
}
|
||||
lengths[count++] = numChars;
|
||||
}
|
||||
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (result == USTRINGTRIE_NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: why do we have a text limit if the UText knows its length?
|
||||
if (numChars >= maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
++numChars;
|
||||
result = bt.next(transform(c));
|
||||
}
|
||||
return numChars;
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
|
||||
void *outData, UErrorCode *pErrorCode) {
|
||||
const UDataInfo *pInfo;
|
||||
int32_t headerSize;
|
||||
const uint8_t *inBytes;
|
||||
uint8_t *outBytes;
|
||||
const int32_t *inIndexes;
|
||||
int32_t indexes[DictionaryData::IX_COUNT];
|
||||
int32_t i, offset, size;
|
||||
|
||||
headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
|
||||
if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
|
||||
pInfo = (const UDataInfo *)((const char *)inData + 4);
|
||||
if (!(pInfo->dataFormat[0] == 0x44 &&
|
||||
pInfo->dataFormat[1] == 0x69 &&
|
||||
pInfo->dataFormat[2] == 0x63 &&
|
||||
pInfo->dataFormat[3] == 0x74 &&
|
||||
pInfo->formatVersion[0] == 1)) {
|
||||
udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
|
||||
pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
|
||||
*pErrorCode = U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
inBytes = (const uint8_t *)inData + headerSize;
|
||||
outBytes = (uint8_t *)outData + headerSize;
|
||||
|
||||
inIndexes = (const int32_t *)inBytes;
|
||||
if (length >= 0) {
|
||||
length -= headerSize;
|
||||
if (length < (int32_t)(sizeof(indexes))) {
|
||||
udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
|
||||
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < DictionaryData::IX_COUNT; i++) {
|
||||
indexes[i] = udata_readInt32(ds, inIndexes[i]);
|
||||
}
|
||||
|
||||
size = indexes[DictionaryData::IX_TOTAL_SIZE];
|
||||
|
||||
if (length >= 0) {
|
||||
if (length < size) {
|
||||
udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
|
||||
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (inBytes != outBytes) {
|
||||
uprv_memcpy(outBytes, inBytes, size);
|
||||
}
|
||||
|
||||
offset = 0;
|
||||
ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
|
||||
offset = (int32_t)sizeof(indexes);
|
||||
int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
|
||||
int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
|
||||
|
||||
if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
|
||||
ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
|
||||
} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
|
||||
// nothing to do
|
||||
} else {
|
||||
udata_printError(ds, "udict_swap(): unknown trie type!\n");
|
||||
*pErrorCode = U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// these next two sections are empty in the current format,
|
||||
// but may be used later.
|
||||
offset = nextOffset;
|
||||
nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
|
||||
offset = nextOffset;
|
||||
nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
|
||||
offset = nextOffset;
|
||||
}
|
||||
return headerSize + size;
|
||||
}
|
||||
|
160
icu4c/source/common/dictionarydata.h
Normal file
160
icu4c/source/common/dictionarydata.h
Normal file
|
@ -0,0 +1,160 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* dictionarydata.h
|
||||
*
|
||||
* created on: 2012may31
|
||||
* created by: Markus W. Scherer & Maxime Serrano
|
||||
*/
|
||||
|
||||
#ifndef __DICTIONARYDATA_H__
|
||||
#define __DICTIONARYDATA_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "udataswp.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ustringtrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UCharsTrie;
|
||||
class BytesTrie;
|
||||
|
||||
class U_COMMON_API DictionaryData : public UMemory {
|
||||
public:
|
||||
static const int32_t TRIE_TYPE_BYTES = 0;
|
||||
static const int32_t TRIE_TYPE_UCHARS = 1;
|
||||
static const int32_t TRIE_TYPE_MASK = 7;
|
||||
static const int32_t TRIE_HAS_VALUES = 8;
|
||||
|
||||
static const int32_t TRANSFORM_NONE = 0;
|
||||
static const int32_t TRANSFORM_TYPE_OFFSET = 0x1000000;
|
||||
static const int32_t TRANSFORM_TYPE_MASK = 0x7f000000;
|
||||
static const int32_t TRANSFORM_OFFSET_MASK = 0x1fffff;
|
||||
|
||||
enum {
|
||||
// Byte offsets from the start of the data, after the generic header.
|
||||
IX_STRING_TRIE_OFFSET,
|
||||
IX_RESERVED1_OFFSET,
|
||||
IX_RESERVED2_OFFSET,
|
||||
IX_TOTAL_SIZE,
|
||||
|
||||
// Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
|
||||
IX_TRIE_TYPE,
|
||||
// Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
|
||||
IX_TRANSFORM,
|
||||
|
||||
IX_RESERVED6,
|
||||
IX_RESERVED7,
|
||||
IX_COUNT
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Wrapper class around generic dictionaries, implementing matches().
|
||||
* getType() should return a TRIE_TYPE_??? constant from DictionaryData.
|
||||
*
|
||||
* All implementations of this interface must be threadsafe if they are to be used inside of the
|
||||
* dictionary-based break iteration code.
|
||||
*/
|
||||
class U_COMMON_API DictionaryMatcher {
|
||||
public:
|
||||
// this should emulate CompactTrieDictionary::matches()
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const = 0;
|
||||
/** @return DictionaryData::TRIE_TYPE_XYZ */
|
||||
virtual int32_t getType() const = 0;
|
||||
};
|
||||
|
||||
// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
|
||||
class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
|
||||
public:
|
||||
// constructs a new UCharsDictionaryMatcher.
|
||||
// The UDataMemory * will be closed on this object's destruction.
|
||||
UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
|
||||
~UCharsDictionaryMatcher();
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const;
|
||||
virtual int32_t getType() const;
|
||||
private:
|
||||
const UChar *characters;
|
||||
UDataMemory *file;
|
||||
};
|
||||
|
||||
// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
|
||||
class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
|
||||
public:
|
||||
// constructs a new BytesTrieDictionaryMatcher
|
||||
// the transform constant should be the constant read from the file, not a masked version!
|
||||
// the UDataMemory * fed in here will be closed on this object's destruction
|
||||
BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f) : characters(c), transformConstant(t), file(f) { }
|
||||
~BytesDictionaryMatcher();
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int &count, int limit, int32_t *values = NULL) const;
|
||||
virtual int32_t getType() const;
|
||||
private:
|
||||
UChar32 transform(UChar32 c) const;
|
||||
|
||||
const char *characters;
|
||||
int32_t transformConstant;
|
||||
UDataMemory *file;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Format of dictionary .dict data files.
|
||||
* Format version 1.0.
|
||||
*
|
||||
* A dictionary .dict data file contains a byte-serialized BytesTrie or
|
||||
* a UChars-serialized UCharsTrie.
|
||||
* Such files are used in dictionary-based break iteration (DBBI).
|
||||
*
|
||||
* For a BytesTrie, a transformation type is specified for
|
||||
* transforming Unicode strings into byte sequences.
|
||||
*
|
||||
* A .dict file begins with a standard ICU data file header
|
||||
* (DataHeader, see ucmndata.h and unicode/udata.h).
|
||||
* The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
|
||||
*
|
||||
* After the header, the file contains the following parts.
|
||||
* Constants are defined in the DictionaryData class.
|
||||
*
|
||||
* For the data structure of BytesTrie & UCharsTrie see
|
||||
* http://site.icu-project.org/design/struct/tries
|
||||
* and the bytestrie.h and ucharstrie.h header files.
|
||||
*
|
||||
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
|
||||
*
|
||||
* The first four indexes are byte offsets in ascending order.
|
||||
* Each byte offset marks the start of the next part in the data file,
|
||||
* and the end of the previous one.
|
||||
* When two consecutive byte offsets are the same, then the corresponding part is empty.
|
||||
* Byte offsets are offsets from after the header,
|
||||
* that is, from the beginning of the indexes[].
|
||||
* Each part starts at an offset with proper alignment for its data.
|
||||
* If necessary, the previous part may include padding bytes to achieve this alignment.
|
||||
*
|
||||
* trieType=indexes[IX_TRIE_TYPE] defines the trie type.
|
||||
* transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
|
||||
* If the transformation type is TRANSFORM_TYPE_OFFSET,
|
||||
* then the lower 21 bits contain the offset code point.
|
||||
* Each code point c is mapped to byte b = (c - offset).
|
||||
* Code points outside the range offset..(offset+0xff) cannot be mapped
|
||||
* and do not occur in the dictionary.
|
||||
*
|
||||
* stringTrie; -- a serialized BytesTrie or UCharsTrie
|
||||
*
|
||||
* The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
|
||||
* or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
|
||||
*/
|
||||
|
||||
#endif /* !UCONFIG_NO_BREAK_ITERATION */
|
||||
#endif /* __DICTIONARYDATA_H__ */
|
|
@ -1615,10 +1615,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
int32_t endPos,
|
||||
UBool reverse) {
|
||||
// Reset the old break cache first.
|
||||
uint32_t dictionaryCount = fDictionaryCharCount;
|
||||
reset();
|
||||
|
||||
if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
|
||||
// note: code segment below assumes that dictionary chars are in the
|
||||
// startPos-endPos range
|
||||
// value returned should be next character in sequence
|
||||
if ((endPos - startPos) <= 1) {
|
||||
return (reverse ? startPos : endPos);
|
||||
}
|
||||
|
||||
|
@ -1771,7 +1773,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
// proposed break by one of the breaks we found. Use following() and
|
||||
// preceding() to do the work. They should never recurse in this case.
|
||||
if (reverse) {
|
||||
return preceding(endPos - 1);
|
||||
return preceding(endPos);
|
||||
}
|
||||
else {
|
||||
return following(startPos);
|
||||
|
@ -1861,7 +1863,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
|
|||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
|
||||
// the characer c.
|
||||
// the character c.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
const LanguageBreakEngine *
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,346 +0,0 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef TRIEDICT_H
|
||||
#define TRIEDICT_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
struct UEnumeration;
|
||||
struct UDataSwapper;
|
||||
struct UDataMemory;
|
||||
|
||||
/**
|
||||
* <p>UDataSwapFn function for use in swapping a compact dictionary.</p>
|
||||
*
|
||||
* @param ds Pointer to UDataSwapper containing global data about the
|
||||
* transformation and function pointers for handling primitive
|
||||
* types.
|
||||
* @param inData Pointer to the input data to be transformed or examined.
|
||||
* @param length Length of the data, counting bytes. May be -1 for preflighting.
|
||||
* If length>=0, then transform the data.
|
||||
* If length==-1, then only determine the length of the data.
|
||||
* The length cannot be determined from the data itself for all
|
||||
* types of data (e.g., not for simple arrays of integers).
|
||||
* @param outData Pointer to the output data buffer.
|
||||
* If length>=0 (transformation), then the output buffer must
|
||||
* have a capacity of at least length.
|
||||
* If length==-1, then outData will not be used and can be NULL.
|
||||
* @param pErrorCode ICU UErrorCode parameter, must not be NULL and must
|
||||
* fulfill U_SUCCESS on input.
|
||||
* @return The actual length of the data.
|
||||
*
|
||||
* @see UDataSwapper
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
triedict_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class StringEnumeration;
|
||||
struct CompactTrieHeader;
|
||||
|
||||
/*******************************************************************
|
||||
* TrieWordDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>TrieWordDictionary is an abstract class that represents a word
|
||||
* dictionary based on a trie. The base protocol is read-only.
|
||||
* Subclasses may allow writing.</p>
|
||||
*/
|
||||
class U_COMMON_API TrieWordDictionary : public UMemory {
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
TrieWordDictionary();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~TrieWordDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param start The current position in text.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( UText *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* MutableTrieDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>MutableTrieDictionary is a TrieWordDictionary that allows words to be
|
||||
* added.</p>
|
||||
*/
|
||||
|
||||
struct TernaryNode; // Forwards declaration
|
||||
|
||||
class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
|
||||
private:
|
||||
/**
|
||||
* The root node of the trie
|
||||
* @internal
|
||||
*/
|
||||
|
||||
TernaryNode *fTrie;
|
||||
|
||||
/**
|
||||
* A UText for internal use
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UText *fIter;
|
||||
|
||||
friend class CompactTrieDictionary; // For fast conversion
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Constructor.</p>
|
||||
*
|
||||
* @param median A UChar around which to balance the trie. Ideally, it should
|
||||
* begin at least one word that is near the median of the set in the dictionary
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
MutableTrieDictionary( UChar median, UErrorCode &status );
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~MutableTrieDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( UText *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const;
|
||||
|
||||
/**
|
||||
* <p>Add one word to the dictionary.</p>
|
||||
*
|
||||
* @param word A UChar buffer containing the word.
|
||||
* @param length The length of the word.
|
||||
* @param status The resultant status
|
||||
*/
|
||||
virtual void addWord( const UChar *word,
|
||||
int32_t length,
|
||||
UErrorCode &status);
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* <p>Add all strings from a UEnumeration to the dictionary.</p>
|
||||
*
|
||||
* @param words A UEnumeration that will return the desired words.
|
||||
* @param status The resultant status
|
||||
*/
|
||||
virtual void addWords( UEnumeration *words, UErrorCode &status );
|
||||
#endif
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Search the dictionary for matches.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @param parent The parent of the current node
|
||||
* @param pMatched The returned parent node matched the input
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t search( UText *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit,
|
||||
TernaryNode *&parent,
|
||||
UBool &pMatched ) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* <p>Private constructor. The root node it not allocated.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
MutableTrieDictionary( UErrorCode &status );
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* CompactTrieDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
|
||||
* to save space.</p>
|
||||
*/
|
||||
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
|
||||
private:
|
||||
/**
|
||||
* The root node of the trie
|
||||
*/
|
||||
|
||||
const CompactTrieHeader *fData;
|
||||
|
||||
/**
|
||||
* A UBool indicating whether or not we own the fData.
|
||||
*/
|
||||
|
||||
UBool fOwnData;
|
||||
|
||||
UDataMemory *fUData;
|
||||
public:
|
||||
/**
|
||||
* <p>Construct a dictionary from a UDataMemory.</p>
|
||||
*
|
||||
* @param data A pointer to a UDataMemory, which is adopted
|
||||
* @param status A status code giving the result of the constructor
|
||||
*/
|
||||
CompactTrieDictionary(UDataMemory *dataObj, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Construct a dictionary from raw saved data.</p>
|
||||
*
|
||||
* @param data A pointer to the raw data, which is still owned by the caller
|
||||
* @param status A status code giving the result of the constructor
|
||||
*/
|
||||
CompactTrieDictionary(const void *dataObj, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Construct a dictionary from a MutableTrieDictionary.</p>
|
||||
*
|
||||
* @param dict The dictionary to use as input.
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
CompactTrieDictionary( const MutableTrieDictionary &dict, UErrorCode &status );
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~CompactTrieDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A UText representing the text. The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( UText *text,
|
||||
int32_t rangeEnd,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const;
|
||||
|
||||
/**
|
||||
* <p>Return the size of the compact data.</p>
|
||||
*
|
||||
* @return The size of the dictionary's compact data.
|
||||
*/
|
||||
virtual uint32_t dataSize() const;
|
||||
|
||||
/**
|
||||
* <p>Return a void * pointer to the compact data, platform-endian.</p>
|
||||
*
|
||||
* @return The data for the compact dictionary, suitable for passing to the
|
||||
* constructor.
|
||||
*/
|
||||
virtual const void *data() const;
|
||||
|
||||
/**
|
||||
* <p>Return a MutableTrieDictionary clone of this dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A MutableTrieDictionary with the same data as this dictionary
|
||||
*/
|
||||
virtual MutableTrieDictionary *cloneMutable( UErrorCode &status ) const;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* <p>Convert a MutableTrieDictionary into a compact data blob.</p>
|
||||
*
|
||||
* @param dict The dictionary to convert.
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A single data blob starting with a CompactTrieHeader.
|
||||
*/
|
||||
static CompactTrieHeader *compactMutableTrieDictionary( const MutableTrieDictionary &dict,
|
||||
UErrorCode &status );
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* TRIEDICT_H */
|
||||
#endif
|
4
icu4c/source/configure
vendored
4
icu4c/source/configure
vendored
|
@ -7498,7 +7498,7 @@ echo "CXXFLAGS=$CXXFLAGS"
|
|||
|
||||
|
||||
# output the Makefiles
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gendict/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
|
@ -8244,7 +8244,7 @@ do
|
|||
"tools/genccode/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genccode/Makefile" ;;
|
||||
"tools/gencmn/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencmn/Makefile" ;;
|
||||
"tools/gencnval/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencnval/Makefile" ;;
|
||||
"tools/genctd/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genctd/Makefile" ;;
|
||||
"tools/gendict/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gendict/Makefile" ;;
|
||||
"tools/gentest/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
|
||||
"tools/gennorm2/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm2/Makefile" ;;
|
||||
"tools/genbrk/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genbrk/Makefile" ;;
|
||||
|
|
|
@ -1229,7 +1229,7 @@ AC_CONFIG_FILES([icudefs.mk \
|
|||
tools/genccode/Makefile \
|
||||
tools/gencmn/Makefile \
|
||||
tools/gencnval/Makefile \
|
||||
tools/genctd/Makefile \
|
||||
tools/gendict/Makefile \
|
||||
tools/gentest/Makefile \
|
||||
tools/gennorm2/Makefile \
|
||||
tools/genbrk/Makefile \
|
||||
|
|
|
@ -250,10 +250,10 @@ BREAK_TREE=brkitr
|
|||
ALL_BRK_SOURCE= $(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
|
||||
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=$(BREAK_TREE)/%.brk)
|
||||
BRK_FILES=$(ALL_BRK_SOURCE:%.txt=$(BRKBLDDIR)/%.brk)
|
||||
ifdef BRK_CTD_SOURCE
|
||||
ALL_CTD_SOURCE=$(BRK_CTD_SOURCE) $(BRK_CTD_SOURCE_LOCAL)
|
||||
CTD_FILES_SHORT=$(ALL_CTD_SOURCE:%.txt=$(BREAK_TREE)/%.ctd)
|
||||
CTD_FILES=$(ALL_CTD_SOURCE:%.txt=$(BRKBLDDIR)/%.ctd)
|
||||
ifdef BRK_DICT_SOURCE
|
||||
ALL_DICT_SOURCE=$(BRK_DICT_SOURCE) $(BRK_DICT_SOURCE_LOCAL)
|
||||
DICT_FILES_SHORT=$(ALL_DICT_SOURCE:%.txt=$(BREAK_TREE)/%.dict)
|
||||
DICT_FILES=$(ALL_DICT_SOURCE:%.txt=$(BRKBLDDIR)/%.dict)
|
||||
endif
|
||||
ifdef BRK_RES_SOURCE
|
||||
BRS_SRC= root.txt $(BRK_RES_SOURCE) $(BRK_RES_SOURCE_LOCAL)
|
||||
|
@ -417,11 +417,11 @@ SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp)
|
|||
SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp)
|
||||
|
||||
## All generated files
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(CNV_FILES_SPECIAL) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(CNV_FILES_SPECIAL) $(BRK_FILES) $(DICT_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
|
||||
ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) $(CURR_INDEX_FILE) $(LANG_INDEX_FILE) $(REGION_INDEX_FILE) $(ZONE_INDEX_FILE) $(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE)
|
||||
# a list to use in the .lst files (package-relative)
|
||||
COLL_FILES_LIST=$(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT)
|
||||
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT)
|
||||
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(DICT_FILES_SHORT)
|
||||
LOCALE_FILES_LIST= $(RES_FILES_SHORT) $(LANG_FILES_SHORT) $(REGION_FILES_SHORT) $(ZONE_FILES_SHORT)
|
||||
MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CNV_FILES_SHORT_SPECIAL) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
|
||||
UNI_CORE_DATA=pnames.icu uprops.icu ucase.icu ubidi.icu
|
||||
|
@ -516,11 +516,20 @@ $(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BU
|
|||
$(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genbrk$(TOOLEXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(TOOLBINDIR)/genbrk -c -i $(BUILDDIR) -r $< -o $@
|
||||
|
||||
#################################################### CTD
|
||||
# CTD FILES
|
||||
#################################################### DICT
|
||||
# DICT FILES
|
||||
|
||||
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
|
||||
# .dict file generated regardless of whether dictionary file exists
|
||||
|
||||
$(BRKBLDDIR)/%.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(TOOLBINDIR)/gendict --uchars -c -i $(BUILDDIR) $(BRKSRCDIR)/$(*F).txt $@
|
||||
|
||||
$(BRKBLDDIR)/thaidict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e00 -c -i $(BUILDDIR) $(BRKSRCDIR)/thaidict.txt $(BRKBLDDIR)/thaidict.dict
|
||||
|
||||
# TODO: figure out why combining characters are here?
|
||||
$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
|
||||
|
||||
#################################################### CFU
|
||||
# CFU FILES
|
||||
|
|
|
@ -33,15 +33,14 @@ BRK_RES_SYNTHETIC_ALIAS =
|
|||
BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)
|
||||
|
||||
|
||||
# List of compact trie dictionary files (ctd).
|
||||
BRK_CTD_SOURCE = thaidict.txt khmerdict.txt
|
||||
|
||||
# List of dictionary files (dict).
|
||||
BRK_DICT_SOURCE = thaidict.txt khmerdict.txt cjdict.txt
|
||||
|
||||
# List of break iterator files (brk).
|
||||
BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt word_ja.txt line_ja.txt char.txt word.txt line.txt sent.txt title.txt
|
||||
BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt line_ja.txt char.txt word.txt line.txt sent.txt title.txt
|
||||
|
||||
|
||||
# Ordinary resources
|
||||
BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt\
|
||||
fi.txt ja.txt
|
||||
fi.txt
|
||||
|
||||
|
|
327135
icu4c/source/data/brkitr/cjdict.txt
Normal file
327135
icu4c/source/data/brkitr/cjdict.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
Copyright (c) 2011-2012 International Business Machines Corporation
|
||||
and others. All Rights Reserved.
|
||||
# Copyright (c) 2011-2012 International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
ក
|
||||
កក
|
||||
កកកុញ
|
||||
|
@ -23380,7 +23380,7 @@
|
|||
ថ្ងៃមានឫក្ស
|
||||
ថ្ងៃមិញ
|
||||
ថ្ងៃមុខ
|
||||
ថ្ងៃមុន
|
||||
ថ្ងៃមុន
|
||||
ថ្ងៃមួយ
|
||||
ថ្ងៃម្សិល
|
||||
ថ្ងៃម្សិលមិញ
|
||||
|
|
|
@ -16,7 +16,10 @@ root{
|
|||
word:process(dependency){"word.brk"}
|
||||
}
|
||||
dictionaries{
|
||||
Khmr:process(dependency){"khmerdict.ctd"}
|
||||
Thai:process(dependency){"thaidict.ctd"}
|
||||
Khmr:process(dependency){"khmerdict.dict"}
|
||||
Thai:process(dependency){"thaidict.dict"}
|
||||
Hani:process(dependency){"cjdict.dict"}
|
||||
Hira:process(dependency){"cjdict.dict"}
|
||||
Kata:process(dependency){"cjdict.dict"}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
Copyright (c) 2006 International Business Machines Corporation,
|
||||
Apple Computer, Inc., and others. All Rights Reserved.
|
||||
# Copyright (c) 2006-2012 International Business Machines Corporation,
|
||||
# Apple Computer, Inc., and others. All Rights Reserved.
|
||||
กก
|
||||
กกขนาก
|
||||
กกช้าง
|
||||
|
@ -5400,7 +5400,7 @@
|
|||
ดิ้นรน
|
||||
ดิ้ว
|
||||
ดี
|
||||
ดี.ซี.
|
||||
# ดี.ซี. -- TODO: why does this have full stop in it?
|
||||
ดีกรี
|
||||
ดีงู
|
||||
ดีฉัน
|
||||
|
@ -15972,8 +15972,8 @@
|
|||
วิ่งเปี้ยว
|
||||
วิ่น
|
||||
วี
|
||||
วี.ดี.
|
||||
วี.ไอ.พี.
|
||||
# วี.ดี. # TODO: why do these have full stops?
|
||||
# วี.ไอ.พี.
|
||||
วีค
|
||||
วีจิ
|
||||
วีชนี
|
||||
|
@ -16357,9 +16357,9 @@
|
|||
ษัษฐ
|
||||
ษัษฐี
|
||||
ษิโณทก
|
||||
ส.ธรนินทร์
|
||||
ส.ธรรมภักดี
|
||||
ส.นิยม
|
||||
# ส.ธรนินทร์ -- TODO: why do these have full stops?
|
||||
# ส.ธรรมภักดี
|
||||
# ส.นิยม
|
||||
สก
|
||||
สกฏ
|
||||
สกฏภาร
|
||||
|
@ -23311,7 +23311,7 @@
|
|||
เห่า
|
||||
เห้งเจีย
|
||||
เอ
|
||||
เอ.ยู.เอ.
|
||||
# เอ.ยู.เอ. -- TODO: why do we have a full stop?
|
||||
เอก
|
||||
เอกจิต
|
||||
เอกฉันท์
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2002-2011, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2012, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
|
@ -29,7 +29,9 @@ $LF = [\p{Word_Break = LF}];
|
|||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Han = [:Han:];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
|
@ -43,15 +45,22 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
||||
# include the dictionary characters.
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
|
@ -60,7 +69,6 @@ $MidNumEx = $MidNum ($Extend | $Format)*;
|
|||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
$Hiragana = [\p{script=Hiragana}];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
|
@ -78,13 +86,14 @@ $CR $LF;
|
|||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s).
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$KatakanaEx {300}; # note: these status values override those from rule 5
|
||||
$HiraganaEx {300}; # by virtual of being numerically larger.
|
||||
$HangulSyllable {200};
|
||||
$KatakanaEx {400}; # note: these status values override those from rule 5
|
||||
$HiraganaEx {400}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
#
|
||||
|
@ -113,20 +122,25 @@ $NumericEx $ALetterEx {200};
|
|||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx $KatakanaEx {300};
|
||||
# to be consistent with $KanaKanji $KanaKanhi, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$KatakanaEx $KatakanaEx {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$NumericEx $ExtendNumLetEx {100}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {400}; # (13a)
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
||||
|
||||
$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
||||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
|
||||
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
@ -139,13 +153,14 @@ $BackNumericEx = ($Format | $Extend)* $Numeric;
|
|||
$BackMidNumEx = ($Format | $Extend)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
||||
$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
|
||||
|
||||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend)* [^$CR $LF $Newline]?;
|
||||
($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;
|
||||
|
||||
# rule 5
|
||||
|
||||
|
@ -181,6 +196,10 @@ $BackKatakanaEx $BackKatakanaEx;
|
|||
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
||||
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable;
|
||||
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
|
|
@ -236,7 +236,7 @@ CNV_FILES_SPECIAL=$(UCM_SOURCE_SPECIAL:.ucm=.cnv)
|
|||
!IF EXISTS("$(ICUSRCDATA)\$(ICUBRK)\brklocal.mk")
|
||||
!INCLUDE "$(ICUSRCDATA)\$(ICUBRK)\brklocal.mk"
|
||||
BRK_SOURCE=$(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
|
||||
BRK_CTD_SOURCE=$(BRK_CTD_SOURCE) $(BRK_CTD_SOURCE_LOCAL)
|
||||
BRK_DICT_SOURCE=$(BRK_DICT_SOURCE) $(BRK_DICT_SOURCE_LOCAL)
|
||||
BRK_RES_SOURCE=$(BRK_RES_SOURCE) $(BRK_RES_SOURCE_LOCAL)
|
||||
!ELSE
|
||||
!MESSAGE Information: cannot find "brklocal.mk". Not building user-additional break iterator files.
|
||||
|
@ -252,10 +252,10 @@ BRK_FILES=$(ICUBRK)\$(BRK_SOURCE:.txt =.brk brkitr\)
|
|||
BRK_FILES=$(BRK_FILES:.txt=.brk)
|
||||
BRK_FILES=$(BRK_FILES:brkitr\ =brkitr\)
|
||||
|
||||
!IFDEF BRK_CTD_SOURCE
|
||||
BRK_CTD_FILES = $(ICUBRK)\$(BRK_CTD_SOURCE:.txt =.ctd brkitr\)
|
||||
BRK_CTD_FILES = $(BRK_CTD_FILES:.txt=.ctd)
|
||||
BRK_CTD_FILES = $(BRK_CTD_FILES:brkitr\ =)
|
||||
!IFDEF BRK_DICT_SOURCE
|
||||
BRK_DICT_FILES = $(ICUBRK)\$(BRK_DICT_SOURCE):.txt=.dict brkitr\)
|
||||
BRK_DICT_FILES = $(BRK_DICT_FILES:.txt=.dict)
|
||||
BRK_DICT_FILES = $(BRK_DICT_FILES:brkitr\ =)
|
||||
!ENDIF
|
||||
|
||||
!IFDEF BRK_RES_SOURCE
|
||||
|
@ -360,6 +360,9 @@ ZONE_SOURCE=$(ZONE_SOURCE) $(ZONE_SOURCE_LOCAL)
|
|||
!MESSAGE Warning: cannot find "zone\resfiles.mk"
|
||||
!ENDIF
|
||||
|
||||
BRK_DICT_FILES = $(ICUBRK)\$(BRK_DICT_SOURCE):.txt=.dict brkitr\)
|
||||
BRK_DICT_FILES = $(BRK_DICT_FILES:.txt=.dict)
|
||||
BRK_DICT_FILES = $(BRK_DICT_FILES:brkitr\ =)
|
||||
!IFDEF ZONE_SOURCE
|
||||
ZONE_FILES = zone\root.txt $(ZONE_ALIAS_SOURCE) $(ZONE_SOURCE)
|
||||
ZONE_RES_FILES = $(ZONE_FILES:.txt =.res zone\)
|
||||
|
@ -602,7 +605,7 @@ icu4j-data-install :
|
|||
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
|
||||
-@erase "$(ICUTMP)\$(ICUPKG).dat"
|
||||
!ELSE
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_DICT_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
@echo Building icu data
|
||||
cd "$(ICUBLD_PKG)"
|
||||
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
|
||||
|
@ -637,7 +640,7 @@ $(TRANSLIT_RES_FILES:.res =.res
|
|||
)
|
||||
$(BRK_FILES:.brk =.brk
|
||||
)
|
||||
$(BRK_CTD_FILES:.ctd =.ctd
|
||||
$(BRK_DICT_FILES:.dict=.dict
|
||||
)
|
||||
$(BRK_RES_FILES:.res =.res
|
||||
)
|
||||
|
@ -696,7 +699,6 @@ CLEAN : GODATA
|
|||
-@erase "zone\*.txt"
|
||||
@cd "$(ICUBLD_PKG)\$(ICUBRK)"
|
||||
-@erase "*.brk"
|
||||
-@erase "*.ctd"
|
||||
-@erase "*.res"
|
||||
-@erase "*.txt"
|
||||
@cd "$(ICUBLD_PKG)\$(ICUCOL)"
|
||||
|
@ -735,10 +737,10 @@ CLEAN : GODATA
|
|||
@echo Creating $@
|
||||
@"$(ICUTOOLS)\genbrk\$(CFG)\genbrk" -c -r $< -o $@ -d"$(ICUBLD_PKG)" -i "$(ICUBLD_PKG)"
|
||||
|
||||
# RBBI .ctd file generation.
|
||||
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)}.txt.ctd:
|
||||
@echo Creating $@
|
||||
@"$(ICUTOOLS)\genctd\$(CFG)\genctd" -c -o $@ -d"$(ICUBLD_PKG)" -i "$(ICUBLD_PKG)" $<
|
||||
#RBBI .dict file generation.
|
||||
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)}.txt.dict:
|
||||
@echo Creating $@
|
||||
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --uchars -i "$(ICUBLD_PKG)" $< $(ICUBLD_PKG)\$@
|
||||
|
||||
!IFNDEF ICUDATA_SOURCE_ARCHIVE
|
||||
# Rule for creating converters
|
||||
|
|
|
@ -209,7 +209,7 @@
|
|||
<Project>{8b41752b-5a52-41e4-b7e0-07921c0cc6bf}</Project>
|
||||
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>
|
||||
</ProjectReference>
|
||||
<ProjectReference Include="..\tools\genctd\genctd.vcxproj">
|
||||
<ProjectReference Include="..\tools\gendict\gendict.vcxproj">
|
||||
<Project>{9d4211f7-2c77-439c-82f0-30a4e43ba569}</Project>
|
||||
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>
|
||||
</ProjectReference>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Copyright (c) 2010-2011 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (c) 2010-2012 International Business Machines Corporation and others. All rights reserved.
|
||||
-->
|
||||
<!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
|
||||
[
|
||||
|
@ -24,8 +24,11 @@
|
|||
<icu:title icu:dependency="title.brk"/>
|
||||
</icu:boundaries>
|
||||
<icu:dictionaries>
|
||||
<icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
|
||||
<icu:dictionary type="Khmr" icu:dependency="khmerdict.ctd"/>
|
||||
<icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
|
||||
<icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
|
||||
<icu:dictionary type="Hani" icu:dependency="cjdict.dict"/>
|
||||
<icu:dictionary type="Hira" icu:dependency="cjdict.dict"/>
|
||||
<icu:dictionary type="Kata" icu:dependency="cjdict.dict"/>
|
||||
</icu:dictionaries>
|
||||
</icu:breakIteratorData>
|
||||
</special>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2011, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2012, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -768,7 +768,7 @@ typedef struct {
|
|||
|
||||
static const RBBITailoringTest tailoringTests[] = {
|
||||
{ "en", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
|
||||
{ "th", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
|
||||
{ "en_US_POSIX", UBRK_CHARACTER, thTest, thTestOffs_thFwd, thTestOffs_thRev, sizeof(thTestOffs_thFwd)/sizeof(thTestOffs_thFwd[0]) },
|
||||
{ "en", UBRK_LINE, heTest, heTestOffs_heFwd, heTestOffs_heRev, sizeof(heTestOffs_heFwd)/sizeof(heTestOffs_heFwd[0]) },
|
||||
{ "he", UBRK_LINE, heTest, heTestOffs_heFwd, heTestOffs_heRev, sizeof(heTestOffs_heFwd)/sizeof(heTestOffs_heFwd[0]) },
|
||||
{ "en", UBRK_LINE, fiTest, fiTestOffs_enFwd, fiTestOffs_enRev, sizeof(fiTestOffs_enFwd)/sizeof(fiTestOffs_enFwd[0]) },
|
||||
|
|
|
@ -2184,26 +2184,7 @@ static void TestResourceLevelAliasing(void) {
|
|||
} else if(seqLen != strLen || u_strncmp(sequence, string, seqLen) != 0) {
|
||||
log_err("Referencing alias didn't get the right string (3)\n");
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
|
||||
const UChar *got = NULL, *exp=NULL;
|
||||
int32_t gotLen = 0, expLen=0;
|
||||
ja = ures_getByKey(ja, "boundaries", ja, &status);
|
||||
exp = tres_getString(ja, -1, "word", &expLen, &status);
|
||||
|
||||
tb = ures_getByKey(aliasB, "boundaries", tb, &status);
|
||||
got = tres_getString(tb, -1, "word", &gotLen, &status);
|
||||
|
||||
if(U_FAILURE(status)) {
|
||||
log_err("%s trying to read str boundaries\n", u_errorName(status));
|
||||
} else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
|
||||
log_err("Referencing alias didn't get the right data\n");
|
||||
}
|
||||
ures_close(ja);
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
/* simple alias */
|
||||
testtypes = ures_open(testdatapath, "testtypes", &status);
|
||||
strcpy(buffer, "menu/file/open");
|
||||
|
|
|
@ -1236,11 +1236,9 @@ static const struct {
|
|||
}
|
||||
};
|
||||
|
||||
/* Unfortunately, trie dictionaries are in a C++ header */
|
||||
int32_t
|
||||
triedict_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
/* Unfortunately, dictionaries are in a C++ header */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
|
||||
|
||||
/* test cases for maximum data swapping code coverage */
|
||||
static const struct {
|
||||
|
@ -1305,7 +1303,7 @@ static const struct {
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
{"char", "brk", ubrk_swap},
|
||||
{"thaidict", "ctd", triedict_swap},
|
||||
{"thaidict", "dict",udict_swap},
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
|
@ -1658,7 +1656,7 @@ TestSwapData() {
|
|||
nm=swapCases[i].name+1;
|
||||
uprv_strcpy(name, "testdata");
|
||||
} else if (uprv_strcmp(swapCases[i].type, "brk")==0
|
||||
|| uprv_strcmp(swapCases[i].type, "ctd")==0) {
|
||||
|| uprv_strcmp(swapCases[i].type, "dict")==0) {
|
||||
pkg=U_ICUDATA_BRKITR;
|
||||
nm=swapCases[i].name;
|
||||
uprv_strcpy(name, U_ICUDATA_BRKITR);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/********************************************************************
|
||||
* Copyright (c) 1999-2011, International Business Machines
|
||||
* Copyright (c) 1999-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************
|
||||
* Date Name Description
|
||||
|
@ -157,10 +157,13 @@ void RBBIAPITest::TestBoilerPlate()
|
|||
if(*a!=*b){
|
||||
errln("Failed: boilerplate method operator!= does not return correct results");
|
||||
}
|
||||
BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
|
||||
if(a && c){
|
||||
if(*c==*a){
|
||||
errln("Failed: boilerplate method opertator== does not return correct results");
|
||||
// Japanese word break iterators are identical to root with
|
||||
// a dictionary-based break iterator
|
||||
BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
|
||||
BreakIterator* d = BreakIterator::createCharacterInstance(Locale("root"),status);
|
||||
if(c && d){
|
||||
if(*c!=*d){
|
||||
errln("Failed: boilerplate method operator== does not return correct results");
|
||||
}
|
||||
}else{
|
||||
errln("creation of break iterator failed");
|
||||
|
@ -168,6 +171,7 @@ void RBBIAPITest::TestBoilerPlate()
|
|||
delete a;
|
||||
delete b;
|
||||
delete c;
|
||||
delete d;
|
||||
}
|
||||
|
||||
void RBBIAPITest::TestgetRules()
|
||||
|
@ -636,21 +640,21 @@ void RBBIAPITest::TestQuoteGrouping() {
|
|||
//
|
||||
void RBBIAPITest::TestRuleStatus() {
|
||||
UChar str[30];
|
||||
u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
|
||||
// 012345678901234567 8 9 0 1 2 3 4 5 6
|
||||
// Ideographic Katakana Hiragana
|
||||
//no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
|
||||
// changed UBRK_WORD_KANA to UBRK_WORD_IDEO
|
||||
u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
|
||||
// 012345678901234567 8 9 0
|
||||
// Katakana
|
||||
str, 30);
|
||||
UnicodeString testString1(str);
|
||||
int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
|
||||
int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
|
||||
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
|
||||
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
|
||||
UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
|
||||
UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
|
||||
UBRK_WORD_IDEO, UBRK_WORD_NONE};
|
||||
|
||||
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
|
||||
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
|
||||
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
|
||||
UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
|
||||
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
|
@ -871,7 +875,6 @@ void RBBIAPITest::TestRegistration() {
|
|||
#if !UCONFIG_NO_SERVICE
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
|
||||
|
||||
// ok to not delete these if we exit because of error?
|
||||
BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
|
||||
BreakIterator* root_word = BreakIterator::createWordInstance("", status);
|
||||
|
@ -879,6 +882,7 @@ void RBBIAPITest::TestRegistration() {
|
|||
|
||||
if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) {
|
||||
dataerrln("Error creating instances of break interactors - %s", u_errorName(status));
|
||||
|
||||
delete ja_word;
|
||||
delete ja_char;
|
||||
delete root_word;
|
||||
|
@ -889,9 +893,11 @@ void RBBIAPITest::TestRegistration() {
|
|||
|
||||
URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
|
||||
{
|
||||
#if 0 // With a dictionary based word breaking, ja_word is identical to root.
|
||||
if (ja_word && *ja_word == *root_word) {
|
||||
errln("japan not different from root");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
|
|
|
@ -33,10 +33,11 @@
|
|||
#include <string.h>
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include "triedict.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/numfmt.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
#define TEST_ASSERT(x) {if (!(x)) { \
|
||||
errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
|
||||
|
@ -111,8 +112,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
#endif
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
|
||||
case 16: name = "TestMonkey";
|
||||
if(exec) TestMonkey(params); break;
|
||||
case 16:
|
||||
name = "TestMonkey"; if(exec) TestMonkey(params); break;
|
||||
#else
|
||||
case 16:
|
||||
name = "skip"; break;
|
||||
|
@ -130,8 +131,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
break;
|
||||
case 19: name = "TestDebug";
|
||||
if(exec) TestDebug(); break;
|
||||
case 20: name = "TestTrieDict";
|
||||
if(exec) TestTrieDict(); break;
|
||||
case 20: name = "skip";
|
||||
break;
|
||||
|
||||
#if !UCONFIG_NO_FILE_IO
|
||||
case 21: name = "TestBug5775";
|
||||
|
@ -428,227 +429,6 @@ void RBBITest::TestBug3818() {
|
|||
delete bi;
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestTrieDict() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
//
|
||||
// Open and read the test data file.
|
||||
//
|
||||
const char *testDataDirectory = IntlTest::getSourceTestData(status);
|
||||
char testFileName[1000];
|
||||
if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
|
||||
errln("Can't open test data. Path too long.");
|
||||
return;
|
||||
}
|
||||
strcpy(testFileName, testDataDirectory);
|
||||
strcat(testFileName, "riwords.txt");
|
||||
|
||||
// Items needing deleting at the end
|
||||
MutableTrieDictionary *mutableDict = NULL;
|
||||
CompactTrieDictionary *compactDict = NULL;
|
||||
UnicodeSet *breaks = NULL;
|
||||
UChar *testFile = NULL;
|
||||
StringEnumeration *enumer1 = NULL;
|
||||
StringEnumeration *enumer2 = NULL;
|
||||
MutableTrieDictionary *mutable2 = NULL;
|
||||
StringEnumeration *cloneEnum = NULL;
|
||||
CompactTrieDictionary *compact2 = NULL;
|
||||
|
||||
|
||||
const UnicodeString *originalWord = NULL;
|
||||
const UnicodeString *cloneWord = NULL;
|
||||
UChar *current;
|
||||
UChar *word;
|
||||
UChar uc;
|
||||
int32_t wordLen;
|
||||
int32_t wordCount;
|
||||
int32_t testCount;
|
||||
|
||||
int len;
|
||||
testFile = ReadAndConvertFile(testFileName, len, NULL, status);
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup; /* something went wrong, error already output */
|
||||
}
|
||||
|
||||
mutableDict = new MutableTrieDictionary(0x0E1C, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
breaks = new UnicodeSet;
|
||||
breaks->add(0x000A); // Line Feed
|
||||
breaks->add(0x000D); // Carriage Return
|
||||
breaks->add(0x2028); // Line Separator
|
||||
breaks->add(0x2029); // Paragraph Separator
|
||||
|
||||
// Now add each non-comment line of the file as a word.
|
||||
current = testFile;
|
||||
word = current;
|
||||
uc = *current++;
|
||||
wordLen = 0;
|
||||
wordCount = 0;
|
||||
|
||||
while (uc) {
|
||||
if (uc == 0x0023) { // #comment line, skip
|
||||
while (uc && !breaks->contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
}
|
||||
else while (uc && !breaks->contains(uc)) {
|
||||
++wordLen;
|
||||
uc = *current++;
|
||||
}
|
||||
if (wordLen > 0) {
|
||||
mutableDict->addWord(word, wordLen, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
wordCount += 1;
|
||||
}
|
||||
|
||||
// Find beginning of next line
|
||||
while (uc && breaks->contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
word = current-1;
|
||||
wordLen = 0;
|
||||
}
|
||||
|
||||
if (wordCount < 50) {
|
||||
errln("Word count (%d) unreasonably small\n", wordCount);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
enumer1 = mutableDict->openWords(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
testCount = 0;
|
||||
if (wordCount != (testCount = enumer1->count(status))) {
|
||||
errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
|
||||
testCount, wordCount, u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Now compact it
|
||||
compactDict = new CompactTrieDictionary(*mutableDict, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
enumer2 = compactDict->openWords(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (wordCount != (testCount = enumer2->count(status))) {
|
||||
errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
|
||||
testCount, wordCount, u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (typeid(*enumer1) == typeid(*enumer2)) {
|
||||
errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
|
||||
}
|
||||
delete enumer1;
|
||||
enumer1 = NULL;
|
||||
delete enumer2;
|
||||
enumer2 = NULL;
|
||||
|
||||
// Now un-compact it
|
||||
mutable2 = compactDict->cloneMutable(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cloneEnum = mutable2->openWords(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (wordCount != (testCount = cloneEnum->count(status))) {
|
||||
errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
|
||||
testCount, wordCount, u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Compact original dictionary to clone. Note that we can only compare the same kind of
|
||||
// dictionary as the order of the enumerators is not guaranteed to be the same between
|
||||
// different kinds
|
||||
enumer1 = mutableDict->openWords(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
originalWord = enumer1->snext(status);
|
||||
cloneWord = cloneEnum->snext(status);
|
||||
while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
|
||||
if (*originalWord != *cloneWord) {
|
||||
errln("Original and cloned MutableTrieDictionary word mismatch\n");
|
||||
goto cleanup;
|
||||
}
|
||||
originalWord = enumer1->snext(status);
|
||||
cloneWord = cloneEnum->snext(status);
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Enumeration failed: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (originalWord != cloneWord) {
|
||||
errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Test the data copying constructor for CompactTrieDict, and the data access APIs.
|
||||
compact2 = new CompactTrieDictionary(compactDict->data(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("CompactTrieDictionary(const void *,...) failed\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (compact2->dataSize() == 0) {
|
||||
errln("CompactTrieDictionary->dataSize() == 0\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Now count the words via the second dictionary
|
||||
delete enumer1;
|
||||
enumer1 = compact2->openWords(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (wordCount != (testCount = enumer1->count(status))) {
|
||||
errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
|
||||
testCount, wordCount, u_errorName(status));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
delete compactDict;
|
||||
delete mutableDict;
|
||||
delete breaks;
|
||||
delete[] testFile;
|
||||
delete enumer1;
|
||||
delete mutable2;
|
||||
delete cloneEnum;
|
||||
delete compact2;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// generalIteratorTest Given a break iterator and a set of test data,
|
||||
|
@ -2215,6 +1995,8 @@ private:
|
|||
UnicodeSet *fNewlineSet;
|
||||
UnicodeSet *fKatakanaSet;
|
||||
UnicodeSet *fALetterSet;
|
||||
// TODO(jungshik): Do we still need this change?
|
||||
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
|
||||
UnicodeSet *fMidNumLetSet;
|
||||
UnicodeSet *fMidLetterSet;
|
||||
UnicodeSet *fMidNumSet;
|
||||
|
@ -2223,6 +2005,7 @@ private:
|
|||
UnicodeSet *fOtherSet;
|
||||
UnicodeSet *fExtendSet;
|
||||
UnicodeSet *fExtendNumLetSet;
|
||||
UnicodeSet *fDictionaryCjkSet;
|
||||
|
||||
RegexMatcher *fMatcher;
|
||||
|
||||
|
@ -2239,11 +2022,25 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
|
||||
fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
|
||||
fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
|
||||
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
|
||||
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
|
||||
// Exclude Hangul syllables from ALetterSet during testing.
|
||||
// Leave CJK dictionary characters out from the monkey tests!
|
||||
#if 0
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
|
||||
"[\\p{Line_Break = Complex_Context}"
|
||||
"-\\p{Grapheme_Cluster_Break = Extend}"
|
||||
"-\\p{Grapheme_Cluster_Break = Control}"
|
||||
"]]",
|
||||
status);
|
||||
#endif
|
||||
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
|
||||
fALetterSet->removeAll(*fDictionaryCjkSet);
|
||||
fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
|
||||
fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
|
||||
fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
|
||||
fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
|
||||
// TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
|
||||
// we should figure out why
|
||||
fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
|
||||
fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
|
||||
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
|
||||
|
@ -2268,13 +2065,14 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fOtherSet->removeAll(*fFormatSet);
|
||||
fOtherSet->removeAll(*fExtendSet);
|
||||
// Inhibit dictionary characters from being tested at all.
|
||||
fOtherSet->removeAll(*fDictionaryCjkSet);
|
||||
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
|
||||
|
||||
fSets->addElement(fCRSet, status);
|
||||
fSets->addElement(fLFSet, status);
|
||||
fSets->addElement(fNewlineSet, status);
|
||||
fSets->addElement(fALetterSet, status);
|
||||
fSets->addElement(fKatakanaSet, status);
|
||||
//fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
|
||||
fSets->addElement(fMidLetterSet, status);
|
||||
fSets->addElement(fMidNumLetSet, status);
|
||||
fSets->addElement(fMidNumSet, status);
|
||||
|
@ -3547,6 +3345,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
|||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("happy break test previous() failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
|
@ -3580,23 +3379,25 @@ void RBBITest::TestWordBreaks(void)
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
// Replaced any C+J characters in a row with a random sequence of characters
|
||||
// of the same length to make our C+J segmentation not get in the way.
|
||||
static const char *strlist[] =
|
||||
{
|
||||
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
|
||||
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
|
||||
"\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
|
||||
"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
|
||||
"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
|
||||
"\\u90ca\\u3588\\u009c\\u0953\\u194b",
|
||||
"\\uac00\\u3588\\u009c\\u0953\\u194b",
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
|
||||
"\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
|
||||
"\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
|
||||
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
|
||||
"\\u2027\\U000e0067\\u0a47\\u00b7",
|
||||
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
|
||||
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
|
||||
"\\u0589\\U000e006e\\u0a42\\U000104a5",
|
||||
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
|
||||
"\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
|
||||
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
|
||||
"\\u0027\\u11af\\U000e0057\\u0602",
|
||||
"\\U0001d7f2\\U000e007\\u0004\\u0589",
|
||||
|
@ -3608,7 +3409,7 @@ void RBBITest::TestWordBreaks(void)
|
|||
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
|
||||
"\\u0233\\U000e0020\\u0a69\\u0d6a",
|
||||
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
|
||||
"\\u58f4\\U000e0049\\u20e7\\u2027",
|
||||
"\\u18f4\\U000e0049\\u20e7\\u2027",
|
||||
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
||||
"\\ua183\\u102d\\u0bec\\u003a",
|
||||
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
||||
|
@ -3618,7 +3419,7 @@ void RBBITest::TestWordBreaks(void)
|
|||
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
|
||||
"\\u003a\\u0664\\u00b7\\u1fba",
|
||||
"\\u003b\\u0027\\u00b7\\u47a3",
|
||||
"\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
|
||||
"\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
|
||||
"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
|
||||
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
|
||||
};
|
||||
|
@ -3673,12 +3474,12 @@ void RBBITest::TestWordBoundary(void)
|
|||
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
|
||||
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
|
||||
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
|
||||
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
|
||||
"\\U000e0065\\u302c\\u09ee\\U000e0068",
|
||||
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
|
||||
"\\u0233\\U000e0020\\u0a69\\u0d6a",
|
||||
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
|
||||
"\\u58f4\\U000e0049\\u20e7\\u2027",
|
||||
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
||||
"\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
||||
"\\ua183\\u102d\\u0bec\\u003a",
|
||||
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
||||
"\\u003a\\u0e57\\u0fad\\u002e",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Copyright (C) 2010-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: dicttrieperf.cpp
|
||||
|
@ -34,7 +34,6 @@
|
|||
#include "charstr.h"
|
||||
#include "package.h"
|
||||
#include "toolutil.h"
|
||||
#include "triedict.h"
|
||||
#include "ucbuf.h" // struct ULine
|
||||
#include "uoptions.h"
|
||||
#include "uvectr32.h"
|
||||
|
@ -337,56 +336,6 @@ protected:
|
|||
const DictionaryTriePerfTest &perf;
|
||||
};
|
||||
|
||||
class CompactTrieDictLookup : public DictLookup {
|
||||
public:
|
||||
CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest)
|
||||
: DictLookup(perfTest), ctd(NULL) {
|
||||
IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
|
||||
// U+0E1C is the median code unit, from
|
||||
// the UCharsTrie root node (split-branch node) for thaidict.txt.
|
||||
MutableTrieDictionary builder(0xe1c, errorCode);
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
builder.addWord(lines[i].name, lines[i].len, errorCode);
|
||||
}
|
||||
ctd=new CompactTrieDictionary(builder, errorCode);
|
||||
int32_t length=(int32_t)ctd->dataSize();
|
||||
printf("size of CompactTrieDict: %6ld bytes\n", (long)length);
|
||||
}
|
||||
|
||||
virtual ~CompactTrieDictLookup() {
|
||||
delete ctd;
|
||||
}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
UText text=UTEXT_INITIALIZER;
|
||||
int32_t lengths[20];
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
|
||||
int32_t count;
|
||||
ctd->matches(&text, lines[i].len,
|
||||
lengths, count, LENGTHOF(lengths));
|
||||
if(count==0 || lengths[count-1]!=lines[i].len) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
CompactTrieDictionary *ctd;
|
||||
};
|
||||
|
||||
// Closely imitate CompactTrieDictionary::matches().
|
||||
// Note: CompactTrieDictionary::matches() is part of its trie implementation,
|
||||
// and while it loops over the text, it knows the current state.
|
||||
|
@ -695,30 +644,24 @@ UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
|
|||
if(hasFile()) {
|
||||
switch(index) {
|
||||
case 0:
|
||||
name="compacttriematches";
|
||||
if(exec) {
|
||||
return new CompactTrieDictLookup(*this);
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
name="ucharstriematches";
|
||||
if(exec) {
|
||||
return new UCharsTrieDictMatches(*this);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 1:
|
||||
name="ucharstriecontains";
|
||||
if(exec) {
|
||||
return new UCharsTrieDictContains(*this);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
case 2:
|
||||
name="bytestriematches";
|
||||
if(exec) {
|
||||
return new BytesTrieDictMatches(*this);
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
case 3:
|
||||
name="bytestriecontains";
|
||||
if(exec) {
|
||||
return new BytesTrieDictContains(*this);
|
||||
|
|
49
icu4c/source/test/testdata/rbbitst.txt
vendored
49
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -170,7 +170,23 @@
|
|||
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
|
||||
|
||||
# Hiragana & Katakana stay together, but separates from each other and Latin.
|
||||
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
|
||||
# *** what to do about theoretical combos of chars? i.e. hiragana + accent
|
||||
#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
|
||||
|
||||
# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
|
||||
<data>•芽キャベツ<400>芽キャベツ<400></data>
|
||||
|
||||
# more Japanese tests
|
||||
# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
|
||||
# and the Katakana block are not treated correctly. Enable this later.
|
||||
#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
|
||||
<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
|
||||
|
||||
# Testing of word boundary for dictionary word containing both kanji and kana
|
||||
<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
|
||||
|
||||
# Testing of Chinese segmentation (taken from a Chinese news article)
|
||||
<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>
|
||||
|
||||
# Words with interior formatting characters
|
||||
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
|
||||
|
@ -178,6 +194,9 @@
|
|||
# to test for bug #4097779
|
||||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
|
||||
# <data>•ISN'T<200> •19<100>日<400></data>
|
||||
# why was this added with the dbbi stuff?
|
||||
|
||||
# to test for bug #4098467
|
||||
# What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
|
@ -187,9 +206,15 @@
|
|||
# precomposed syllables...
|
||||
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
|
||||
|
||||
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
|
||||
# more Korean tests (Jamo not tested here, not counted as dictionary characters)
|
||||
# Disable them now because we don't include a Korean dictionary.
|
||||
#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
|
||||
#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
|
||||
|
||||
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
|
||||
|
||||
<data>•\u06c9<200>\uc799<200>\ufffa•</data>
|
||||
|
||||
<data>•\u06c9\uc799\ufffa<200></data>
|
||||
|
||||
#
|
||||
# Try some words from other scripts.
|
||||
|
@ -506,8 +531,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
|||
<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
|
||||
|
||||
# conjoining jamo...
|
||||
# TODO: rules update needed
|
||||
#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
|
||||
<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
|
||||
|
||||
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
||||
<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
|
||||
|
@ -713,7 +737,7 @@ Bangkok)•</data>
|
|||
|
||||
<locale ja>
|
||||
<line>
|
||||
<data>•\u3041•\u3043•\u3045•\u31f1•</data>
|
||||
<data>•\u3041\u3043\u3045\u31f1•</data>
|
||||
<locale en>
|
||||
<line>
|
||||
<data>•\u3041\u3043\u3045\u31f1•</data>
|
||||
|
@ -721,19 +745,20 @@ Bangkok)•</data>
|
|||
# The following data was originally in RBBITest::TestJapaneseWordBreak()
|
||||
<locale ja>
|
||||
<word>
|
||||
<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
|
||||
<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>
|
||||
|
||||
# UBreakIteratorType UBRK_WORD, Locale "ja"
|
||||
# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
|
||||
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
|
||||
# modified to work with dbbi code - should verify
|
||||
|
||||
<locale ja>
|
||||
<word>
|
||||
<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
|
||||
<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュ<400>ー<400>タ<400>が<400>ある<400>。<0>奈々<400>は<400>ワ<400>ー<400>ド<400>で<400>ある<400>。•</data>
|
||||
|
||||
<locale root>
|
||||
<word>
|
||||
<data>•私<400>達<400>に<300>一<400>〇<400>〇<400>〇<400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
|
||||
<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュ<400>ー<400>タ<400>が<400>ある<400>。<0>奈々<400>は<400>ワ<400>ー<400>ド<400>で<400>ある<400>。•</data>
|
||||
|
||||
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
|
||||
# Add break after Greek question mark (cldrbug #2069).
|
||||
|
@ -778,12 +803,6 @@ Bangkok)•</data>
|
|||
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
|
||||
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
|
||||
|
||||
<locale root>
|
||||
<char>
|
||||
<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\
|
||||
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
|
||||
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
|
||||
|
||||
# Finnish line breaking
|
||||
#
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
|
|
4
icu4c/source/test/testdata/testaliases.txt
vendored
4
icu4c/source/test/testdata/testaliases.txt
vendored
|
@ -1,6 +1,6 @@
|
|||
//*******************************************************************************
|
||||
//*
|
||||
//* Copyright (C) 2002-2009, International Business Machines
|
||||
//* Copyright (C) 2002-2012, International Business Machines
|
||||
//* Corporation and others. All Rights Reserved.
|
||||
//*
|
||||
//*******************************************************************************
|
||||
|
@ -28,7 +28,7 @@ testaliases:table(nofallback)
|
|||
LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
|
||||
|
||||
// aliasing using position
|
||||
boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle
|
||||
boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle
|
||||
|
||||
// aliasing arrays
|
||||
zoneTests {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
## Makefile.in for ICU tools
|
||||
## Copyright (c) 1999-2011, International Business Machines Corporation and
|
||||
## Copyright (c) 1999-2012, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
|
@ -13,9 +13,9 @@ include $(top_builddir)/icudefs.mk
|
|||
## Build directory information
|
||||
subdir = tools
|
||||
|
||||
SUBDIRS = toolutil ctestfw makeconv genrb genbrk genctd \
|
||||
SUBDIRS = toolutil ctestfw makeconv genrb genbrk \
|
||||
gencnval gensprep icuinfo genccode gencmn icupkg pkgdata \
|
||||
gentest gennorm2 gencfu
|
||||
gentest gennorm2 gencfu gendict
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local all-recursive install install-local \
|
||||
|
|
|
@ -1,111 +0,0 @@
|
|||
.\" Hey, Emacs! This is -*-nroff-*- you know...
|
||||
.\"
|
||||
.\" genctd.1: manual page for the genctd utility
|
||||
.\"
|
||||
.\" Copyright (C) 2006-2007 International Business Machines Corporation and others
|
||||
.\"
|
||||
.TH GENCTD 1 "8 March 2006" "ICU MANPAGE" "ICU @VERSION@ Manual"
|
||||
.SH NAME
|
||||
.B genctd
|
||||
\- Compiles word list into ICU compact trie dictionary
|
||||
.SH SYNOPSIS
|
||||
.B genctd
|
||||
[
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
]
|
||||
[
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
]
|
||||
[
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
]
|
||||
[
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
]
|
||||
[
|
||||
.BI "\-d\fP, \fB\-\-destdir" " destination"
|
||||
]
|
||||
[
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
]
|
||||
.BI "\-o\fP, \fB\-\-out" " output\-file"
|
||||
.IR " dictionary\-file"
|
||||
.SH DESCRIPTION
|
||||
.B genctd
|
||||
reads the word list from
|
||||
.I dictionary-file
|
||||
and creates a compact trie dictionary file. Normally this data file has the
|
||||
.B .ctd
|
||||
extension.
|
||||
.PP
|
||||
Words begin at the beginning of a line and are terminated by the first whitespace.
|
||||
Lines that begin with whitespace are ignored.
|
||||
.SH OPTIONS
|
||||
.TP
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
Print help about usage and exit.
|
||||
.TP
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
Print the version of
|
||||
.B genctd
|
||||
and exit.
|
||||
.TP
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
Embeds the standard ICU copyright into the
|
||||
.IR output-file .
|
||||
.TP
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
Display extra informative messages during execution.
|
||||
.TP
|
||||
.BI "\-d\fP, \fB\-\-destdir" " destination"
|
||||
Set the destination directory of the
|
||||
.IR output-file
|
||||
to
|
||||
.IR destination .
|
||||
.TP
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
Look for any necessary ICU data files in
|
||||
.IR directory .
|
||||
For example, the file
|
||||
.B pnames.icu
|
||||
must be located when ICU's data is not built as a shared library.
|
||||
The default ICU data directory is specified by the environment variable
|
||||
.BR ICU_DATA .
|
||||
Most configurations of ICU do not require this argument.
|
||||
.TP
|
||||
.BI " dictionary\-file"
|
||||
The source file to read.
|
||||
.TP
|
||||
.BI "\-o\fP, \fB\-\-out" " output\-file"
|
||||
The output data file to write.
|
||||
.SH CAVEATS
|
||||
When the
|
||||
.IR dictionary-file
|
||||
contains a byte order mark (BOM) at the beginning of the file, which is the Unicode character
|
||||
.B U+FEFF,
|
||||
then the
|
||||
.IR dictionary-file
|
||||
is interpreted as Unicode. Without the BOM,
|
||||
the file is interpreted in the current operating system default codepage.
|
||||
In order to eliminate any ambiguity of the encoding for how the
|
||||
.IR rule-file
|
||||
was written, it is recommended that you write this file in UTF-8
|
||||
with the BOM.
|
||||
.SH ENVIRONMENT
|
||||
.TP 10
|
||||
.B ICU_DATA
|
||||
Specifies the directory containing ICU data. Defaults to
|
||||
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
|
||||
Some tools in ICU depend on the presence of the trailing slash. It is thus
|
||||
important to make sure that it is present if
|
||||
.B ICU_DATA
|
||||
is set.
|
||||
.SH AUTHORS
|
||||
Deborah Goldsmith
|
||||
.SH VERSION
|
||||
1.0
|
||||
.SH COPYRIGHT
|
||||
Copyright (C) 2006 International Business Machines Corporation and others
|
||||
.SH SEE ALSO
|
||||
.BR http://www.icu-project.org/userguide/boundaryAnalysis.html
|
||||
|
|
@ -1,396 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* File genctd.c
|
||||
*/
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//
|
||||
// Tool for generating CompactTrieDictionary data files (.ctd files).
|
||||
//
|
||||
// Usage: genctd [options] -o output-file.ctd input-file
|
||||
//
|
||||
// options: -v verbose
|
||||
// -? or -h help
|
||||
//
|
||||
// The input file is a plain text file containing words, one per line.
|
||||
// Words end at the first whitespace; lines beginning with whitespace
|
||||
// are ignored.
|
||||
// The file can be encoded as utf-8, or utf-16 (either endian), or
|
||||
// in the default code page (platform dependent.). utf encoded
|
||||
// files must include a BOM.
|
||||
//
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
|
||||
#include "uoptions.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucmndata.h"
|
||||
#include "rbbidata.h"
|
||||
#include "triedict.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
static char *progName;
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H, /* 0 */
|
||||
UOPTION_HELP_QUESTION_MARK, /* 1 */
|
||||
UOPTION_VERBOSE, /* 2 */
|
||||
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
|
||||
UOPTION_ICUDATADIR, /* 4 */
|
||||
UOPTION_DESTDIR, /* 5 */
|
||||
UOPTION_COPYRIGHT, /* 6 */
|
||||
};
|
||||
|
||||
void usageAndDie(int retCode) {
|
||||
printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
|
||||
printf("\tRead in word list and write out compact trie dictionary\n"
|
||||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-V or --version show a version message\n"
|
||||
"\t-c or --copyright include a copyright notice\n"
|
||||
"\t-v or --verbose turn on verbose output\n"
|
||||
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
|
||||
"\t followed by path, defaults to %s\n"
|
||||
"\t-d or --destdir destination directory, followed by the path\n",
|
||||
u_getDataDirectory());
|
||||
exit (retCode);
|
||||
}
|
||||
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
|
||||
|
||||
/* dummy UDataInfo cf. udata.h */
|
||||
static UDataInfo dummyDataInfo = {
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
||||
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
||||
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
//
|
||||
// Set up the ICU data header, defined in ucmndata.h
|
||||
//
|
||||
DataHeader dh ={
|
||||
{sizeof(DataHeader), // Struct MappedData
|
||||
0xda,
|
||||
0x27},
|
||||
|
||||
{ // struct UDataInfo
|
||||
sizeof(UDataInfo), // size
|
||||
0, // reserved
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0, // reserved
|
||||
|
||||
{ 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
|
||||
{ 1, 0, 0, 0 }, // 1.0.0.0
|
||||
{ 0, 0, 0, 0 }, // Irrelevant for this data type
|
||||
}};
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// main for genctd
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
int main(int argc, char **argv) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *wordFileName;
|
||||
const char *outFileName;
|
||||
const char *outDir = NULL;
|
||||
const char *copyright = NULL;
|
||||
|
||||
//
|
||||
// Pick up and check the command line arguments,
|
||||
// using the standard ICU tool utils option handling.
|
||||
//
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
progName = argv[0];
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
if(argc<0) {
|
||||
// Unrecognized option
|
||||
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
if(options[0].doesOccur || options[1].doesOccur) {
|
||||
// -? or -h for help.
|
||||
usageAndDie(0);
|
||||
}
|
||||
|
||||
if (!options[3].doesOccur || argc < 2) {
|
||||
fprintf(stderr, "input and output file must both be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
outFileName = options[3].value;
|
||||
wordFileName = argv[1];
|
||||
|
||||
if (options[4].doesOccur) {
|
||||
u_setDataDirectory(options[4].value);
|
||||
}
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
/* Combine the directory with the file name */
|
||||
if(options[5].doesOccur) {
|
||||
outDir = options[5].value;
|
||||
}
|
||||
if (options[6].doesOccur) {
|
||||
copyright = U_COPYRIGHT_STRING;
|
||||
}
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
|
||||
|
||||
UNewDataMemory *pData;
|
||||
char msg[1024];
|
||||
|
||||
/* write message with just the name */
|
||||
sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
|
||||
/* write the dummy data file */
|
||||
pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
|
||||
udata_writeBlock(pData, msg, strlen(msg));
|
||||
udata_finish(pData, &status);
|
||||
return (int)status;
|
||||
|
||||
#else
|
||||
/* Initialize ICU */
|
||||
u_init(&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
|
||||
argv[0], u_errorName(status));
|
||||
exit(1);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
//
|
||||
// Read in the dictionary source file
|
||||
//
|
||||
long result;
|
||||
long wordFileSize;
|
||||
FILE *file;
|
||||
char *wordBufferC;
|
||||
|
||||
file = fopen(wordFileName, "rb");
|
||||
if( file == 0 ) {
|
||||
fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
|
||||
exit(-1);
|
||||
}
|
||||
fseek(file, 0, SEEK_END);
|
||||
wordFileSize = ftell(file);
|
||||
fseek(file, 0, SEEK_SET);
|
||||
wordBufferC = new char[wordFileSize+10];
|
||||
|
||||
result = (long)fread(wordBufferC, 1, wordFileSize, file);
|
||||
if (result != wordFileSize) {
|
||||
fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
|
||||
exit (-1);
|
||||
}
|
||||
wordBufferC[wordFileSize]=0;
|
||||
fclose(file);
|
||||
|
||||
//
|
||||
// Look for a Unicode Signature (BOM) on the word file
|
||||
//
|
||||
int32_t signatureLength;
|
||||
const char * wordSourceC = wordBufferC;
|
||||
const char* encoding = ucnv_detectUnicodeSignature(
|
||||
wordSourceC, wordFileSize, &signatureLength, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
exit(status);
|
||||
}
|
||||
if(encoding!=NULL ){
|
||||
wordSourceC += signatureLength;
|
||||
wordFileSize -= signatureLength;
|
||||
}
|
||||
|
||||
//
|
||||
// Open a converter to take the rule file to UTF-16
|
||||
//
|
||||
UConverter* conv;
|
||||
conv = ucnv_open(encoding, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
//
|
||||
// Convert the words to UChar.
|
||||
// Preflight first to determine required buffer size.
|
||||
//
|
||||
uint32_t destCap = ucnv_toUChars(conv,
|
||||
NULL, // dest,
|
||||
0, // destCapacity,
|
||||
wordSourceC,
|
||||
wordFileSize,
|
||||
&status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *wordSourceU = new UChar[destCap+1];
|
||||
ucnv_toUChars(conv,
|
||||
wordSourceU, // dest,
|
||||
destCap+1,
|
||||
wordSourceC,
|
||||
wordFileSize,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
ucnv_close(conv);
|
||||
|
||||
// Get rid of the original file buffer
|
||||
delete[] wordBufferC;
|
||||
|
||||
// Create a MutableTrieDictionary, and loop through all the lines, inserting
|
||||
// words.
|
||||
|
||||
// First, pick a median character.
|
||||
UChar *current = wordSourceU + (destCap/2);
|
||||
UChar uc = *current++;
|
||||
UnicodeSet breaks;
|
||||
breaks.add(0x000A); // Line Feed
|
||||
breaks.add(0x000D); // Carriage Return
|
||||
breaks.add(0x2028); // Line Separator
|
||||
breaks.add(0x2029); // Paragraph Separator
|
||||
|
||||
do {
|
||||
// Look for line break
|
||||
while (uc && !breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
// Now skip to first non-line-break
|
||||
while (uc && breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
}
|
||||
while (uc && (breaks.contains(uc) || u_isspace(uc)));
|
||||
|
||||
MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
// Now add the words. Words are non-space characters at the beginning of
|
||||
// lines, and must be at least one UChar.
|
||||
current = wordSourceU;
|
||||
UChar *candidate = current;
|
||||
uc = *current++;
|
||||
int32_t length = 0;
|
||||
|
||||
while (uc) {
|
||||
while (uc && !u_isspace(uc)) {
|
||||
++length;
|
||||
uc = *current++;
|
||||
}
|
||||
if (length > 0) {
|
||||
mtd->addWord(candidate, length, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
|
||||
u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
}
|
||||
// Find beginning of next line
|
||||
while (uc && !breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
while (uc && breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
candidate = current-1;
|
||||
length = 0;
|
||||
}
|
||||
|
||||
// Get rid of the Unicode text buffer
|
||||
delete[] wordSourceU;
|
||||
|
||||
// Now, create a CompactTrieDictionary from the mutable dictionary
|
||||
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
// Get rid of the MutableTrieDictionary
|
||||
delete mtd;
|
||||
|
||||
//
|
||||
// Get the binary data from the dictionary.
|
||||
//
|
||||
uint32_t outDataSize = ctd->dataSize();
|
||||
const uint8_t *outData = (const uint8_t *)ctd->data();
|
||||
|
||||
//
|
||||
// Create the output file
|
||||
//
|
||||
size_t bytesWritten;
|
||||
UNewDataMemory *pData;
|
||||
pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
|
||||
outFileName, u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
|
||||
// Write the data itself.
|
||||
udata_writeBlock(pData, outData, outDataSize);
|
||||
// finish up
|
||||
bytesWritten = udata_finish(pData, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
if (bytesWritten != outDataSize) {
|
||||
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Get rid of the CompactTrieDictionary
|
||||
delete ctd;
|
||||
|
||||
u_cleanup();
|
||||
|
||||
printf("genctd: tool completed successfully.\n");
|
||||
return 0;
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
## Makefile.in for ICU - tools/genctd
|
||||
## Copyright (c) 2002-2011 International Business Machines Corporation and
|
||||
## Makefile.in for ICU - tools/gendict
|
||||
## Copyright (c) 2002-2012 International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
|
@ -11,9 +11,9 @@ top_builddir = ../..
|
|||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/genctd
|
||||
subdir = tools/gendict
|
||||
|
||||
TARGET_STUB_NAME = genctd
|
||||
TARGET_STUB_NAME = gendict
|
||||
|
||||
SECTION = 1
|
||||
|
||||
|
@ -29,7 +29,7 @@ TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
|
|||
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = genctd.o
|
||||
OBJECTS = gendict.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
131
icu4c/source/tools/gendict/gendict.1.in
Normal file
131
icu4c/source/tools/gendict/gendict.1.in
Normal file
|
@ -0,0 +1,131 @@
|
|||
.\" Hey, Emacs! This is -*-nroff-*- you know...
|
||||
.\"
|
||||
.\" gendict.1: manual page for the gendict utility
|
||||
.\"
|
||||
.\" Copyright (C) 2012 International Business Machines Corporation and others
|
||||
.\"
|
||||
.TH GENDICT 1 "1 June 2012" "ICU MANPAGE" "ICU @VERSION@ Manual"
|
||||
.SH NAME
|
||||
.B gendict
|
||||
\- Compiles word list into ICU string trie dictionary
|
||||
.SH SYNOPSIS
|
||||
.B gendict
|
||||
[
|
||||
.BR "\fB\-\-uchars"
|
||||
|
|
||||
.BR "\fB\-\-bytes"
|
||||
.BI "\fB\-\-transform" " transform"
|
||||
]
|
||||
[
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
]
|
||||
[
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
]
|
||||
[
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
]
|
||||
[
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
]
|
||||
[
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
]
|
||||
.IR " input-file"
|
||||
.IR " output\-file"
|
||||
.SH DESCRIPTION
|
||||
.B gendict
|
||||
reads the word list from
|
||||
.I dictionary-file
|
||||
and creates a string trie dictionary file. Normally this data file has the
|
||||
.B .dict
|
||||
extension.
|
||||
.PP
|
||||
Words begin at the beginning of a line and are terminated by the first whitespace.
|
||||
Lines that begin with whitespace are ignored.
|
||||
.SH OPTIONS
|
||||
.TP
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
Print help about usage and exit.
|
||||
.TP
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
Print the version of
|
||||
.B gendict
|
||||
and exit.
|
||||
.TP
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
Embeds the standard ICU copyright into the
|
||||
.IR output-file .
|
||||
.TP
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
Display extra informative messages during execution.
|
||||
.TP
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
Look for any necessary ICU data files in
|
||||
.IR directory .
|
||||
For example, the file
|
||||
.B pnames.icu
|
||||
must be located when ICU's data is not built as a shared library.
|
||||
The default ICU data directory is specified by the environment variable
|
||||
.BR ICU_DATA .
|
||||
Most configurations of ICU do not require this argument.
|
||||
.TP
|
||||
.BR "\fB\-\-uchars"
|
||||
Set the output trie type to UChar. Mutually exclusive with
|
||||
.BR --bytes.
|
||||
.TP
|
||||
.BR "\fB\-\-bytes"
|
||||
Set the output trie type to Bytes. Mutually exclusive with
|
||||
.BR --uchars.
|
||||
.TP
|
||||
.BR "\fB\-\-transform"
|
||||
Set the transform type. Should only be specified with
|
||||
.BR --bytes.
|
||||
Currently supported transforms are:
|
||||
.BR offset-<hex-number>,
|
||||
which specifies an offset to subtract from all input characters.
|
||||
It should be noted that the offset transform also maps U+200D
|
||||
to 0xFF and U+200C to 0xFE, in order to offer compatibility to
|
||||
languages that require these characters.
|
||||
A transform must be specified for a bytes trie, and when applied
|
||||
to the non-value characters in the
|
||||
.IR input-file
|
||||
must produce output between 0x00 and 0xFF.
|
||||
.TP
|
||||
.BI " input\-file"
|
||||
The source file to read.
|
||||
.TP
|
||||
.BI " output\-file"
|
||||
The file to write the output dictionary to.
|
||||
.SH CAVEATS
|
||||
The
|
||||
.IR input-file
|
||||
is assumed to be encoded in UTF-8.
|
||||
The integers in the
|
||||
.IR input-file
|
||||
that are used as values must be made up of ASCII digits. They
|
||||
may be specified either in hex, by using a 0x prefix, or in
|
||||
decimal.
|
||||
Either
|
||||
.BI --bytes
|
||||
or
|
||||
.BI --uchars
|
||||
must be specified.
|
||||
.SH ENVIRONMENT
|
||||
.TP 10
|
||||
.B ICU_DATA
|
||||
Specifies the directory containing ICU data. Defaults to
|
||||
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
|
||||
Some tools in ICU depend on the presence of the trailing slash. It is thus
|
||||
important to make sure that it is present if
|
||||
.B ICU_DATA
|
||||
is set.
|
||||
.SH AUTHORS
|
||||
Maxime Serrano
|
||||
.SH VERSION
|
||||
1.0
|
||||
.SH COPYRIGHT
|
||||
Copyright (C) 2012 International Business Machines Corporation and others
|
||||
.SH SEE ALSO
|
||||
.BR http://www.icu-project.org/userguide/boundaryAnalysis.html
|
||||
|
448
icu4c/source/tools/gendict/gendict.cpp
Normal file
448
icu4c/source/tools/gendict/gendict.cpp
Normal file
|
@ -0,0 +1,448 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* File gendict.cpp
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/ucharstriebuilder.h"
|
||||
#include "unicode/bytestriebuilder.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/ucnv.h"
|
||||
|
||||
#include "charstr.h"
|
||||
#include "dictionarydata.h"
|
||||
#include "uoptions.h"
|
||||
#include "unewdata.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "ucbuf.h"
|
||||
#include "toolutil.h"
|
||||
#include "cstring.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
static char *progName;
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H, /* 0 */
|
||||
UOPTION_HELP_QUESTION_MARK, /* 1 */
|
||||
UOPTION_VERBOSE, /* 2 */
|
||||
UOPTION_ICUDATADIR, /* 4 */
|
||||
UOPTION_COPYRIGHT, /* 5 */
|
||||
{ "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
|
||||
{ "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */
|
||||
{ "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
|
||||
};
|
||||
|
||||
enum arguments {
|
||||
ARG_HELP = 0,
|
||||
ARG_QMARK,
|
||||
ARG_VERBOSE,
|
||||
ARG_ICUDATADIR,
|
||||
ARG_COPYRIGHT,
|
||||
ARG_UCHARS,
|
||||
ARG_BYTES,
|
||||
ARG_TRANSFORM
|
||||
};
|
||||
|
||||
// prints out the standard usage method describing command line arguments,
|
||||
// then bails out with the desired exit code
|
||||
static void usageAndDie(UErrorCode retCode) {
|
||||
fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
|
||||
fprintf((U_SUCCESS(retCode) ? stdout : stderr),
|
||||
"\tRead in a word list and write out a string trie dictionary\n"
|
||||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-V or --version show a version message\n"
|
||||
"\t-c or --copyright include a copyright notice\n"
|
||||
"\t-v or --verbose turn on verbose output\n"
|
||||
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
|
||||
"\t followed by path, defaults to %s\n"
|
||||
"\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
|
||||
"\t--bytes output a BytesTrie (mutually exclusive with -u!)\n"
|
||||
"\t--transform the kind of transform to use (eg --transform offset-40A3,\n"
|
||||
"\t which specifies an offset transform with constant 0x40A3)\n",
|
||||
u_getDataDirectory());
|
||||
exit(retCode);
|
||||
}
|
||||
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
static UDataInfo dataInfo = {
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */
|
||||
{ 1, 0, 0, 0 }, /* format version */
|
||||
{ 0, 0, 0, 0 } /* data version */
|
||||
};
|
||||
|
||||
// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
|
||||
// may want to put this somewhere in ICU, as it could be useful outside
|
||||
// of this tool?
|
||||
class DataDict {
|
||||
private:
|
||||
BytesTrieBuilder *bt;
|
||||
UCharsTrieBuilder *ut;
|
||||
UChar32 transformConstant;
|
||||
int32_t transformType;
|
||||
public:
|
||||
// constructs a new data dictionary. if there is an error,
|
||||
// it will be returned in status
|
||||
// isBytesTrie != 0 will produce a BytesTrieBuilder,
|
||||
// isBytesTrie == 0 will produce a UCharsTrieBuilder
|
||||
DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL),
|
||||
transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
|
||||
if (isBytesTrie) {
|
||||
bt = new BytesTrieBuilder(status);
|
||||
} else {
|
||||
ut = new UCharsTrieBuilder(status);
|
||||
}
|
||||
}
|
||||
|
||||
~DataDict() {
|
||||
delete bt;
|
||||
delete ut;
|
||||
}
|
||||
|
||||
private:
|
||||
char transform(UChar32 c, UErrorCode &status) {
|
||||
if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
|
||||
if (c == 0x200D) { return (char)0xFF; }
|
||||
else if (c == 0x200C) { return (char)0xFE; }
|
||||
int32_t delta = c - transformConstant;
|
||||
if (delta < 0 || 0xFD < delta) {
|
||||
fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
|
||||
(long)c, (long)transformConstant);
|
||||
exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
|
||||
}
|
||||
return (char)delta;
|
||||
} else { // no such transform type
|
||||
status = U_INTERNAL_PROGRAM_ERROR;
|
||||
return (char)c; // it should be noted this transform type will not generally work
|
||||
}
|
||||
}
|
||||
|
||||
void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
|
||||
UChar32 c = 0;
|
||||
int32_t len = word.length();
|
||||
for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
|
||||
c = word.char32At(i);
|
||||
buf.append(transform(c, errorCode), errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
public:
    // Sets the desired transformation data.
    // Should be populated from a command line argument.
    // So far the only acceptable format is offset-<hex constant>;
    // eventually others (mask-<hex constant>?) may be enabled.
    // More complex functions may be more difficult.
    void setTransform(const char *t) {
        if (strncmp(t, "offset-", 7) != 0) {
            fprintf(stderr, "Invalid transform specified: %s\n", t);
            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        } else {
            char *stop;
            unsigned long parsed = uprv_strtoul(t + 7, &stop, 16);
            // Reject an empty/garbage hex constant, trailing characters, and
            // constants whose 0xFD-wide window would exceed the code point space.
            if (stop == (t + 7) || *stop != 0 || parsed > 0x10FF80) {
                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
            }
            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
            transformConstant = (UChar32)parsed;
        }
    }
|
||||
|
||||
// add a word to the trie
|
||||
void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
|
||||
if (bt) {
|
||||
CharString buf;
|
||||
transform(word, buf, status);
|
||||
bt->add(buf.toStringPiece(), value, status);
|
||||
}
|
||||
if (ut) { ut->add(word, value, status); }
|
||||
}
|
||||
|
||||
// if we are a bytestrie, give back the StringPiece representing the serialized version of us
|
||||
StringPiece serializeBytes(UErrorCode &status) {
|
||||
return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
|
||||
}
|
||||
|
||||
// if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
|
||||
void serializeUChars(UnicodeString &s, UErrorCode &status) {
|
||||
ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
|
||||
}
|
||||
|
||||
int32_t getTransform() {
|
||||
return (int32_t)(transformType | transformConstant);
|
||||
}
|
||||
};
|
||||
|
||||
static const UChar LINEFEED_CHARACTER = 0x000A;
|
||||
static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;
|
||||
|
||||
static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
|
||||
int32_t lineLength;
|
||||
const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
|
||||
if(line == NULL || errorCode.isFailure()) { return FALSE; }
|
||||
// Strip trailing CR/LF, comments, and spaces.
|
||||
const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
|
||||
if(comment != NULL) {
|
||||
lineLength = (int32_t)(comment - line);
|
||||
} else {
|
||||
while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; }
|
||||
}
|
||||
while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; }
|
||||
fileLine.setTo(FALSE, line, lineLength);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// main for gendict
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
int main(int argc, char **argv) {
|
||||
//
|
||||
// Pick up and check the command line arguments,
|
||||
// using the standard ICU tool utils option handling.
|
||||
//
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
progName = argv[0];
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
if(argc<0) {
|
||||
// Unrecognized option
|
||||
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
|
||||
// -? or -h for help.
|
||||
usageAndDie(U_ZERO_ERROR);
|
||||
}
|
||||
|
||||
UBool verbose = options[ARG_VERBOSE].doesOccur;
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "input and output file must both be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
const char *outFileName = argv[2];
|
||||
const char *wordFileName = argv[1];
|
||||
|
||||
if (options[ARG_ICUDATADIR].doesOccur) {
|
||||
u_setDataDirectory(options[ARG_ICUDATADIR].value);
|
||||
}
|
||||
|
||||
const char *copyright = NULL;
|
||||
if (options[ARG_COPYRIGHT].doesOccur) {
|
||||
copyright = U_COPYRIGHT_STRING;
|
||||
}
|
||||
|
||||
if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
|
||||
fprintf(stderr, "you must specify exactly one type of trie to output!\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
UBool isBytesTrie = options[ARG_BYTES].doesOccur;
|
||||
if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
|
||||
fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
IcuToolErrorCode status("gendict/main()");
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
|
||||
|
||||
UNewDataMemory *pData;
|
||||
char msg[1024];
|
||||
|
||||
/* write message with just the name */ // potential for a buffer overflow here...
|
||||
sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
|
||||
/* write the dummy data file */
|
||||
pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &status);
|
||||
udata_writeBlock(pData, msg, strlen(msg));
|
||||
udata_finish(pData, &status);
|
||||
return (int)status;
|
||||
|
||||
#else
|
||||
// Read in the dictionary source file
|
||||
if (verbose) { printf("Opening file %s...\n", wordFileName); }
|
||||
const char *codepage = "UTF-8";
|
||||
UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
|
||||
exit(status.reset());
|
||||
}
|
||||
if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
|
||||
DataDict dict(isBytesTrie, status);
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
|
||||
exit(status.reset());
|
||||
}
|
||||
if (options[ARG_TRANSFORM].doesOccur) {
|
||||
dict.setTransform(options[ARG_TRANSFORM].value);
|
||||
}
|
||||
|
||||
UnicodeString fileLine;
|
||||
if (verbose) { puts("Adding words to dictionary..."); }
|
||||
UBool hasValues = FALSE;
|
||||
UBool hasValuelessContents = FALSE;
|
||||
int lineCount = 0;
|
||||
UBool isOk = TRUE;
|
||||
while (readLine(f, fileLine, status)) {
|
||||
lineCount++;
|
||||
if (fileLine.isEmpty()) continue;
|
||||
|
||||
// Parse word [spaces value].
|
||||
int32_t keyLen;
|
||||
for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
|
||||
if (keyLen == 0) {
|
||||
fprintf(stderr, "Error: no word on line %i!\n", lineCount);
|
||||
isOk = FALSE;
|
||||
continue;
|
||||
}
|
||||
int32_t valueStart;
|
||||
for (valueStart = keyLen;
|
||||
valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
|
||||
++valueStart) {}
|
||||
|
||||
if (keyLen < valueStart) {
|
||||
int32_t valueLength = fileLine.length() - valueStart;
|
||||
if (valueLength > 15) {
|
||||
fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
|
||||
isOk = FALSE;
|
||||
continue;
|
||||
}
|
||||
char s[16];
|
||||
fileLine.extract(valueStart, valueLength, s, 16, US_INV);
|
||||
char *end;
|
||||
unsigned long value = uprv_strtoul(s, &end, 0);
|
||||
if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) {
|
||||
fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
|
||||
isOk = FALSE;
|
||||
continue;
|
||||
}
|
||||
dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
|
||||
hasValues = TRUE;
|
||||
} else {
|
||||
dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
|
||||
hasValuelessContents = FALSE;
|
||||
}
|
||||
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
|
||||
status.errorName(), lineCount);
|
||||
exit(status.reset());
|
||||
}
|
||||
}
|
||||
|
||||
if (!isOk && status.isSuccess()) {
|
||||
status.set(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
if (hasValues && hasValuelessContents) {
|
||||
fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
|
||||
}
|
||||
|
||||
if (verbose) { puts("Serializing data..."); }
|
||||
int32_t outDataSize;
|
||||
const void *outData;
|
||||
UnicodeString usp;
|
||||
if (isBytesTrie) {
|
||||
StringPiece sp = dict.serializeBytes(status);
|
||||
outDataSize = sp.size();
|
||||
outData = sp.data();
|
||||
} else {
|
||||
dict.serializeUChars(usp, status);
|
||||
outDataSize = usp.length() * U_SIZEOF_UCHAR;
|
||||
outData = usp.getBuffer();
|
||||
}
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "gendict: got failure of type %s while serializing\n", status.errorName());
|
||||
exit(status.reset());
|
||||
}
|
||||
if (verbose) { puts("Opening output file..."); }
|
||||
UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
|
||||
exit(status.reset());
|
||||
}
|
||||
|
||||
if (verbose) { puts("Writing to output file..."); }
|
||||
int32_t indexes[DictionaryData::IX_COUNT] = {
|
||||
DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
|
||||
indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
|
||||
indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
|
||||
indexes[DictionaryData::IX_TOTAL_SIZE] = size;
|
||||
|
||||
indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
|
||||
if (hasValues) {
|
||||
indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
|
||||
}
|
||||
|
||||
indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, outData, outDataSize);
|
||||
size_t bytesWritten = udata_finish(pData, status);
|
||||
if (status.isFailure()) {
|
||||
fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
|
||||
exit(status.reset());
|
||||
}
|
||||
|
||||
if (bytesWritten != (size_t)size) {
|
||||
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
|
||||
puts("gendict: tool completed successfully.");
|
||||
|
||||
#ifdef TEST_GENDICT
|
||||
if (isBytesTrie) {
|
||||
BytesTrie::Iterator it(outData, outDataSize, status);
|
||||
while (it.hasNext()) {
|
||||
it.next(status);
|
||||
const StringPiece s = it.getString();
|
||||
int32_t val = it.getValue();
|
||||
printf("%s -> %i\n", s.data(), val);
|
||||
}
|
||||
} else {
|
||||
UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
|
||||
while (it.hasNext()) {
|
||||
it.next(status);
|
||||
const UnicodeString s = it.getString();
|
||||
int32_t val = it.getValue();
|
||||
char tmp[1024];
|
||||
s.extract(0, s.length(), tmp, 1024);
|
||||
printf("%s -> %i\n", tmp, val);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
}
|
|
@ -84,7 +84,7 @@
|
|||
<Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs>
|
||||
</CustomBuildStep>
|
||||
<Midl>
|
||||
<TypeLibraryName>.\x86\Release/genctd.tlb</TypeLibraryName>
|
||||
<TypeLibraryName>.\x86\Release/gendict.tlb</TypeLibraryName>
|
||||
</Midl>
|
||||
<ClCompile>
|
||||
<AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
|
@ -94,7 +94,7 @@
|
|||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<DisableLanguageExtensions>true</DisableLanguageExtensions>
|
||||
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
|
||||
<PrecompiledHeaderOutputFile>.\x86\Release/genctd.pch</PrecompiledHeaderOutputFile>
|
||||
<PrecompiledHeaderOutputFile>.\x86\Release/gendict.pch</PrecompiledHeaderOutputFile>
|
||||
<AssemblerListingLocation>.\x86\Release/</AssemblerListingLocation>
|
||||
<ObjectFileName>.\x86\Release/</ObjectFileName>
|
||||
<ProgramDataBaseFileName>.\x86\Release/</ProgramDataBaseFileName>
|
||||
|
@ -107,9 +107,9 @@
|
|||
<Culture>0x0409</Culture>
|
||||
</ResourceCompile>
|
||||
<Link>
|
||||
<OutputFile>.\x86\Release/genctd.exe</OutputFile>
|
||||
<OutputFile>.\x86\Release/gendict.exe</OutputFile>
|
||||
<SuppressStartupBanner>true</SuppressStartupBanner>
|
||||
<ProgramDatabaseFile>.\x86\Release/genctd.pdb</ProgramDatabaseFile>
|
||||
<ProgramDatabaseFile>.\x86\Release/gendict.pdb</ProgramDatabaseFile>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<RandomizedBaseAddress>false</RandomizedBaseAddress>
|
||||
<DataExecutionPrevention>
|
||||
|
@ -123,7 +123,7 @@
|
|||
<Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs>
|
||||
</CustomBuildStep>
|
||||
<Midl>
|
||||
<TypeLibraryName>.\x86\Debug/genctd.tlb</TypeLibraryName>
|
||||
<TypeLibraryName>.\x86\Debug/gendict.tlb</TypeLibraryName>
|
||||
</Midl>
|
||||
<ClCompile>
|
||||
<Optimization>Disabled</Optimization>
|
||||
|
@ -134,7 +134,7 @@
|
|||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<DisableLanguageExtensions>true</DisableLanguageExtensions>
|
||||
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
|
||||
<PrecompiledHeaderOutputFile>.\x86\Debug/genctd.pch</PrecompiledHeaderOutputFile>
|
||||
<PrecompiledHeaderOutputFile>.\x86\Debug/gendict.pch</PrecompiledHeaderOutputFile>
|
||||
<AssemblerListingLocation>.\x86\Debug/</AssemblerListingLocation>
|
||||
<ObjectFileName>.\x86\Debug/</ObjectFileName>
|
||||
<ProgramDataBaseFileName>.\x86\Debug/</ProgramDataBaseFileName>
|
||||
|
@ -149,10 +149,10 @@
|
|||
<Culture>0x0409</Culture>
|
||||
</ResourceCompile>
|
||||
<Link>
|
||||
<OutputFile>.\x86\Debug/genctd.exe</OutputFile>
|
||||
<OutputFile>.\x86\Debug/gendict.exe</OutputFile>
|
||||
<SuppressStartupBanner>true</SuppressStartupBanner>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<ProgramDatabaseFile>.\x86\Debug/genctd.pdb</ProgramDatabaseFile>
|
||||
<ProgramDatabaseFile>.\x86\Debug/gendict.pdb</ProgramDatabaseFile>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<RandomizedBaseAddress>false</RandomizedBaseAddress>
|
||||
<DataExecutionPrevention>
|
||||
|
@ -167,7 +167,7 @@
|
|||
</CustomBuildStep>
|
||||
<Midl>
|
||||
<TargetEnvironment>X64</TargetEnvironment>
|
||||
<TypeLibraryName>.\x64\Release/genctd.tlb</TypeLibraryName>
|
||||
<TypeLibraryName>.\x64\Release/gendict.tlb</TypeLibraryName>
|
||||
</Midl>
|
||||
<ClCompile>
|
||||
<AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
|
@ -177,7 +177,7 @@
|
|||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<DisableLanguageExtensions>true</DisableLanguageExtensions>
|
||||
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
|
||||
<PrecompiledHeaderOutputFile>.\x64\Release/genctd.pch</PrecompiledHeaderOutputFile>
|
||||
<PrecompiledHeaderOutputFile>.\x64\Release/gendict.pch</PrecompiledHeaderOutputFile>
|
||||
<AssemblerListingLocation>.\x64\Release/</AssemblerListingLocation>
|
||||
<ObjectFileName>.\x64\Release/</ObjectFileName>
|
||||
<ProgramDataBaseFileName>.\x64\Release/</ProgramDataBaseFileName>
|
||||
|
@ -190,9 +190,9 @@
|
|||
<Culture>0x0409</Culture>
|
||||
</ResourceCompile>
|
||||
<Link>
|
||||
<OutputFile>.\x64\Release/genctd.exe</OutputFile>
|
||||
<OutputFile>.\x64\Release/gendict.exe</OutputFile>
|
||||
<SuppressStartupBanner>true</SuppressStartupBanner>
|
||||
<ProgramDatabaseFile>.\x64\Release/genctd.pdb</ProgramDatabaseFile>
|
||||
<ProgramDatabaseFile>.\x64\Release/gendict.pdb</ProgramDatabaseFile>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<TargetMachine>MachineX64</TargetMachine>
|
||||
</Link>
|
||||
|
@ -205,7 +205,7 @@
|
|||
</CustomBuildStep>
|
||||
<Midl>
|
||||
<TargetEnvironment>X64</TargetEnvironment>
|
||||
<TypeLibraryName>.\x64\Debug/genctd.tlb</TypeLibraryName>
|
||||
<TypeLibraryName>.\x64\Debug/gendict.tlb</TypeLibraryName>
|
||||
</Midl>
|
||||
<ClCompile>
|
||||
<Optimization>Disabled</Optimization>
|
||||
|
@ -216,7 +216,7 @@
|
|||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<DisableLanguageExtensions>true</DisableLanguageExtensions>
|
||||
<TreatWChar_tAsBuiltInType>true</TreatWChar_tAsBuiltInType>
|
||||
<PrecompiledHeaderOutputFile>.\x64\Debug/genctd.pch</PrecompiledHeaderOutputFile>
|
||||
<PrecompiledHeaderOutputFile>.\x64\Debug/gendict.pch</PrecompiledHeaderOutputFile>
|
||||
<AssemblerListingLocation>.\x64\Debug/</AssemblerListingLocation>
|
||||
<ObjectFileName>.\x64\Debug/</ObjectFileName>
|
||||
<ProgramDataBaseFileName>.\x64\Debug/</ProgramDataBaseFileName>
|
||||
|
@ -231,16 +231,16 @@
|
|||
<Culture>0x0409</Culture>
|
||||
</ResourceCompile>
|
||||
<Link>
|
||||
<OutputFile>.\x64\Debug/genctd.exe</OutputFile>
|
||||
<OutputFile>.\x64\Debug/gendict.exe</OutputFile>
|
||||
<SuppressStartupBanner>true</SuppressStartupBanner>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<ProgramDatabaseFile>.\x64\Debug/genctd.pdb</ProgramDatabaseFile>
|
||||
<ProgramDatabaseFile>.\x64\Debug/gendict.pdb</ProgramDatabaseFile>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<TargetMachine>MachineX64</TargetMachine>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="genctd.cpp" />
|
||||
<ClCompile Include="gendict.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\common\common.vcxproj">
|
|
@ -2,21 +2,21 @@
|
|||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{13ddeaaf-33bc-4f07-a772-cd365dd75257}</UniqueIdentifier>
|
||||
<UniqueIdentifier>{570fb8ae-ac18-467d-8502-470a241a60d4}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{259ce86d-ab79-4867-b42f-d114c3b8ed6e}</UniqueIdentifier>
|
||||
<UniqueIdentifier>{7b2185f2-4ff9-4419-b596-0a21e37414c9}</UniqueIdentifier>
|
||||
<Extensions>h;hpp;hxx;hm;inl</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{3b1a7423-5627-4cf4-a0d5-29ad34d9e5ac}</UniqueIdentifier>
|
||||
<UniqueIdentifier>{1dc5e7e3-4d1b-4031-a31f-c39b3a3e283a}</UniqueIdentifier>
|
||||
<Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="genctd.cpp">
|
||||
<ClCompile Include="gendict.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
|
@ -54,8 +54,8 @@
|
|||
#include "sprpimpl.h"
|
||||
#include "propname.h"
|
||||
#include "rbbidata.h"
|
||||
#include "triedict.h"
|
||||
#include "utrie2.h"
|
||||
#include "dictionarydata.h"
|
||||
|
||||
/* swapping implementations in i18n */
|
||||
|
||||
|
@ -734,7 +734,7 @@ static const struct {
|
|||
#endif
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
{ { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */
|
||||
{ { 0x54, 0x72, 0x44, 0x63 }, triedict_swap }, /* dataFormat="TrDc " */
|
||||
{ { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */
|
||||
#endif
|
||||
{ { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */
|
||||
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */
|
||||
|
|
Loading…
Add table
Reference in a new issue