ICU-9573 Removing Boyer-Moore string search implementation. CollData has moved from i18n package to intltest.

X-SVN-Rev: 32994
This commit is contained in:
Yoshito Umaoka 2012-12-19 05:12:25 +00:00
parent 49d85a9d59
commit 7586fcf7d4
18 changed files with 106 additions and 3403 deletions

View file

@ -1,5 +1,5 @@
Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
# Visual C++ Express 2010
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cal", "..\samples\cal\cal.vcxproj", "{F7659D77-09CF-4FE9-ACEE-927287AA9509}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cintltst", "..\test\cintltst\cintltst.vcxproj", "{3D1246AE-1B32-479B-BECA-AEFA97BE2321}"
@ -52,7 +52,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iotest", "..\test\iotest\io
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icupkg", "..\tools\icupkg\icupkg.vcxproj", "{62D4B15D-7A90-4ECB-BA19-5E021D6A21BC}"
EndProject
Project("{9D4211F7-2C77-439C-82F0-30A4E43BA569}") = "gendict", "..\tools\gendict\gendict.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gendict", "..\tools\gendict\gendict.vcxproj", "{9D4211F7-2C77-439C-82F0-30A4E43BA569}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "letest", "..\test\letest\letest.vcxproj", "{67351485-4D18-4245-BE39-A7EF0675ACD2}"
EndProject
@ -65,10 +65,6 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "testplug", "..\tools\icuinfo\testplug.vcxproj", "{659D0C08-D4ED-4BF3-B02B-2D8D4B5A7A7A}"
EndProject
Global
GlobalSection(SubversionScc) = preSolution
Svn-Managed = True
Manager = AnkhSVN - Subversion Support for Visual Studio
EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
@ -174,12 +170,10 @@ Global
{77C78066-746F-4EA6-B3FE-B8C8A4A97891}.Release|x64.Build.0 = Release|x64
{0178B127-6269-407D-B112-93877BB62776}.Debug|Win32.ActiveCfg = Debug|Win32
{0178B127-6269-407D-B112-93877BB62776}.Debug|Win32.Build.0 = Debug|Win32
{0178B127-6269-407D-B112-93877BB62776}.Debug|x64.ActiveCfg = Debug|x64
{0178B127-6269-407D-B112-93877BB62776}.Debug|x64.Build.0 = Debug|x64
{0178B127-6269-407D-B112-93877BB62776}.Debug|x64.ActiveCfg = Debug|Win32
{0178B127-6269-407D-B112-93877BB62776}.Release|Win32.ActiveCfg = Release|Win32
{0178B127-6269-407D-B112-93877BB62776}.Release|Win32.Build.0 = Release|Win32
{0178B127-6269-407D-B112-93877BB62776}.Release|x64.ActiveCfg = Release|x64
{0178B127-6269-407D-B112-93877BB62776}.Release|x64.Build.0 = Release|x64
{0178B127-6269-407D-B112-93877BB62776}.Release|x64.ActiveCfg = Release|Win32
{73632960-B3A6-464D-83A3-4B43365F19B8}.Debug|Win32.ActiveCfg = Debug|Win32
{73632960-B3A6-464D-83A3-4B43365F19B8}.Debug|Win32.Build.0 = Debug|Win32
{73632960-B3A6-464D-83A3-4B43365F19B8}.Debug|x64.ActiveCfg = Debug|x64
@ -336,4 +330,8 @@ Global
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(SubversionScc) = preSolution
Svn-Managed = True
Manager = AnkhSVN - Subversion Support for Visual Studio
EndGlobalSection
EndGlobal

View file

@ -82,7 +82,7 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o \
tmunit.o tmutamt.o tmutfmt.o currpinf.o \
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \

View file

@ -1,155 +0,0 @@
/*
* Copyright (C) 2008-2011, International Business Machines Corporation and Others.
* All rights reserved.
*/
#include "unicode/utypes.h"
#include "cmemory.h"
#include "unicode/bms.h"
#include "unicode/unistr.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
// Cast helper macros used by the C wrappers below.
// When USE_SAFE_CASTS is defined (it is commented out here) the macros expand to the
// C++ named casts; otherwise both expand to a plain C-style cast.
// Note: the C-style expansion is not wrapped in outer parentheses.
//#define USE_SAFE_CASTS
#ifdef USE_SAFE_CASTS
#define STATIC_CAST(type,value) static_cast<type>(value)
#define CONST_CAST(type,value) const_cast<type>(value)
#else
#define STATIC_CAST(type,value) (type) (value)
#define CONST_CAST(type,value) (type) (value)
#endif
U_NAMESPACE_USE
/**
 * C wrapper around CollData::open(): opens the collator data for <code>coll</code>
 * and returns it as an opaque UCD pointer.
 *
 * @param coll   the collator passed through to CollData::open()
 * @param status passed by reference to CollData::open()
 * @return whatever CollData::open() returned, cast to UCD *
 */
U_CAPI UCD * U_EXPORT2
ucd_open(UCollator *coll, UErrorCode *status)
{
    CollData *opened = CollData::open(coll, *status);

    return STATIC_CAST(UCD *, opened);
}
/**
 * C wrapper around CollData::close(). A NULL <code>ucd</code> is ignored.
 *
 * @param ucd the opaque object previously returned by ucd_open()
 */
U_CAPI void U_EXPORT2
ucd_close(UCD *ucd)
{
    if (ucd == NULL) {
        return;
    }

    CollData::close(STATIC_CAST(CollData *, ucd));
}
/**
 * Returns the collator held by the CollData behind <code>ucd</code>.
 * <code>ucd</code> is dereferenced without a NULL check.
 *
 * @param ucd the opaque object previously returned by ucd_open()
 * @return the result of CollData::getCollator()
 */
U_CAPI UCollator * U_EXPORT2
ucd_getCollator(UCD *ucd)
{
    // The extra parentheses matter: STATIC_CAST may expand to an unparenthesized C cast.
    return (STATIC_CAST(CollData *, ucd))->getCollator();
}
/**
 * C wrapper: delegates to CollData::freeCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_freeCache()
{
CollData::freeCollDataCache();
}
/**
 * C wrapper: delegates to CollData::flushCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_flushCache()
{
CollData::flushCollDataCache();
}
/**
 * State behind the opaque C handle returned by bms_open().
 * Both members are deleted by bms_close(), so the struct owns them.
 */
struct BMS
{
// The C++ searcher; NULL if bms_open() could not allocate the target string.
BoyerMooreSearch *bms;
// Copy of the caller's target text, or NULL when no target has been set.
const UnicodeString *targetString;
};
/**
 * Allocates a BMS handle and builds a BoyerMooreSearch for the given pattern.
 *
 * @param ucd           opaque collator data, used as a CollData *
 * @param pattern       pattern text
 * @param patternLength length of <code>pattern</code>
 * @param target        target text, or NULL to leave the target unset
 * @param targetLength  length of <code>target</code>
 * @param status        set to U_MEMORY_ALLOCATION_ERROR when an allocation fails
 * @return NULL if the handle itself could not be allocated; otherwise the handle,
 *         which is returned even when a later allocation failed
 */
U_CAPI BMS * U_EXPORT2
bms_open(UCD *ucd,
         const UChar *pattern, int32_t patternLength,
         const UChar *target, int32_t targetLength,
         UErrorCode *status)
{
    BMS *handle = STATIC_CAST(BMS *, uprv_malloc(sizeof(BMS)));

    if (handle == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    CollData *collData = (CollData *) ucd;
    UnicodeString patternString(pattern, patternLength);

    // Start with "no target" and replace it only when the caller supplied text.
    handle->targetString = NULL;

    if (target != NULL) {
        handle->targetString = new UnicodeString(target, targetLength);

        if (handle->targetString == NULL) {
            handle->bms = NULL;
            *status = U_MEMORY_ALLOCATION_ERROR;
            return handle;
        }
    }

    handle->bms = new BoyerMooreSearch(collData, patternString, handle->targetString, *status);

    if (handle->bms == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }

    return handle;
}
/**
 * Releases a handle created by bms_open(): deletes the searcher and the target
 * string copy, then frees the handle itself.
 *
 * A NULL handle is ignored. bms_open() returns NULL when the handle cannot be
 * allocated, so closing its result unconditionally must not crash.
 *
 * @param bms the handle to release, may be NULL
 */
U_CAPI void U_EXPORT2
bms_close(BMS *bms)
{
    if (bms == NULL) {
        return;
    }

    delete bms->bms;
    delete bms->targetString;
    uprv_free(bms);
}
/**
 * Forwards to BoyerMooreSearch::empty() on the wrapped searcher.
 * Neither <code>bms</code> nor its searcher is checked for NULL.
 *
 * @param bms the handle returned by bms_open()
 * @return the result of BoyerMooreSearch::empty()
 */
U_CAPI UBool U_EXPORT2
bms_empty(BMS *bms)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->empty();
}
/**
 * Returns the collator data used by the wrapped searcher as an opaque UCD pointer.
 * Neither <code>bms</code> nor its searcher is checked for NULL.
 *
 * @param bms the handle returned by bms_open()
 * @return the result of BoyerMooreSearch::getData(), cast to UCD *
 */
U_CAPI UCD * U_EXPORT2
bms_getData(BMS *bms)
{
    CollData *collData = bms->bms->getData();

    return STATIC_CAST(UCD *, collData);
}
/**
 * Forwards to BoyerMooreSearch::search() on the wrapped searcher.
 * None of the pointers is checked for NULL.
 *
 * @param bms    the handle returned by bms_open()
 * @param offset offset passed through to the search
 * @param start  receives the match start as written by the search
 * @param end    receives the match end as written by the search
 * @return the result of BoyerMooreSearch::search()
 */
U_CAPI UBool U_EXPORT2
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->search(offset, *start, *end);
}
/**
 * Replaces the target text of a search handle.
 *
 * The replacement copy is allocated before the old one is released. If that
 * allocation fails, <code>status</code> is set to U_MEMORY_ALLOCATION_ERROR and the
 * handle keeps its previous target, rather than silently ending up with no target.
 *
 * @param bms          the handle returned by bms_open()
 * @param target       new target text, or NULL to clear the target
 * @param targetLength length of <code>target</code>
 * @param status       nothing is done if this already indicates a failure
 */
U_CAPI void U_EXPORT2
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    const UnicodeString *replacement = NULL;

    if (target != NULL) {
        replacement = new UnicodeString(target, targetLength);

        if (replacement == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    // Deleting a NULL pointer is a no-op, so no guard is needed.
    delete bms->targetString;
    bms->targetString = replacement;

    bms->bms->setTargetString(bms->targetString, *status);
}
#endif

View file

@ -1,827 +0,0 @@
/*
******************************************************************************
* Copyright (C) 1996-2012, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/usearch.h"
#include "cmemory.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/ucoleitr.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "unicode/uniset.h"
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "uhash.h"
#include "ucol_imp.h"
#include "normalizer2impl.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
U_NAMESPACE_BEGIN
// Element count of a true array (not a pointer).
#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
// Allocates uninitialized storage for `count` objects of `type` via uprv_malloc; no constructors run.
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
// Releases storage obtained with NEW_ARRAY via uprv_free; no destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
// One buffered collation element together with the iterator offsets around it,
// as filled in by Target::nextCE() and Target::prevCE().
struct CEI
{
// Collation element order after strength masking (or UCOL_NULLORDER at the end).
uint32_t order;
// Iterator offset on the low side of the element.
int32_t lowOffset;
// Iterator offset on the high side of the element.
int32_t highOffset;
};
// Wraps the text being searched: a collation element iterator over it, a small
// buffer of fetched elements (ceb), and a character break iterator.
// The member declaration order below is also the initialization order used by the constructor.
class Target : public UMemory
{
public:
Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
~Target();
// Switches to a new target string; the string is not copied.
void setTargetString(const UnicodeString *target);
// Return the buffered element at `offset`, fetching one more element from the
// iterator when `offset` is the next free slot; NULL if that is not possible.
const CEI *nextCE(int32_t offset);
const CEI *prevCE(int32_t offset);
int32_t stringLength();
UChar charAt(int32_t offset);
UBool isBreakBoundary(int32_t offset);
int32_t nextBreakBoundary(int32_t offset);
int32_t nextSafeBoundary(int32_t offset);
UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
// Both reset the element buffer and reposition the collation element iterator.
void setOffset(int32_t offset);
void setLast(int32_t last);
int32_t getOffset();
private:
// Buffer of fetched collation elements; valid entries are [bufferMin, bufferMax).
CEI *ceb;
int32_t bufferSize;
int32_t bufferMin;
int32_t bufferMax;
// Mask applied to every fetched order, derived from the collator strength.
uint32_t strengthMask;
UCollationStrength strength;
uint32_t variableTop;
// TRUE when the collator's alternate handling is UCOL_SHIFTED.
UBool toShift;
UCollator *coll;
const Normalizer2 &nfd;
// Not owned: only the pointer is stored.
const UnicodeString *targetString;
const UChar *targetBuffer;
int32_t targetLength;
// Opened in setTargetString(), closed there and in the destructor.
UCollationElements *elements;
UBreakIterator *charBreakIterator;
};
/**
 * Captures the collator settings needed for matching, sizes the collation element
 * buffer from the pattern length and the collator's largest expansion, and opens
 * the iterators when a target string is supplied.
 *
 * @param theCollator   collator used for the search
 * @param target        text to search, or NULL to set it later
 * @param patternLength number of pattern collation elements, used to size the buffer
 * @param status        set to U_MEMORY_ALLOCATION_ERROR if the buffer cannot be allocated
 */
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      nfd(*Normalizer2Factory::getNFDInstance(status)),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength    = ucol_getStrength(coll);
    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // Scan the zero-terminated expansion size list for its largest entry.
    uint8_t widestExpansion = 0;
    const uint8_t *sizes = coll->expansionCESize;

    while (*sizes != 0) {
        if (widestExpansion < *sizes) {
            widestExpansion = *sizes;
        }

        sizes += 1;
    }

    // Room for an extra character on each end, plus 4 for safety.
    bufferSize = patternLength + (2 * widestExpansion) + 4;
    ceb = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // Primary bits always count; secondary and tertiary bits are added for
    // every strength above primary / secondary respectively.
    strengthMask |= UCOL_PRIMARYORDERMASK;

    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;

        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
// Closes both iterators and frees the element buffer. The close calls are made
// unconditionally, i.e. also when the handles are still NULL.
Target::~Target()
{
ubrk_close(charBreakIterator);
ucol_closeElements(elements);
DELETE_ARRAY(ceb);
}
/**
 * Points this object at a new target string and reopens the iterators over it.
 *
 * The previous iterators are always closed and their handles reset to NULL.
 * Without the reset, clearing the target (target == NULL) would leave closed
 * handles behind, which the destructor or the next call would close again.
 * Closing unconditionally also releases the element iterator when an earlier
 * call failed to open the break iterator.
 *
 * @param target the text to search, not copied; NULL clears the target
 */
void Target::setTargetString(const UnicodeString *target)
{
    // Both close functions are already called on NULL handles by the destructor.
    ubrk_close(charBreakIterator);
    charBreakIterator = NULL;
    ucol_closeElements(elements);
    elements = NULL;

    targetString = target;

    if (targetString == NULL) {
        targetBuffer = NULL;
        targetLength = 0;
        return;
    }

    UErrorCode status = U_ZERO_ERROR;

    targetBuffer = targetString->getBuffer();
    targetLength = targetString->length();

    elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
    ucol_forceHanImplicit(elements, &status);

    charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                  targetBuffer, targetLength, &status);
}
/**
 * Returns the buffered collation element at <code>offset</code>, reading forward
 * from the iterator when <code>offset</code> is the next unfilled slot.
 *
 * Ignorable elements are skipped. With shifted alternate handling, elements below
 * the variable top are reduced to their primary bits at quaternary strength or
 * above, and treated as ignorable otherwise.
 *
 * @param offset index into the element buffer
 * @return the buffer entry, or NULL if the buffer is full or <code>offset</code>
 *         is neither buffered nor the next slot
 */
const CEI *Target::nextCE(int32_t offset)
{
    if (bufferMin <= offset && offset < bufferMax) {
        return &ceb[offset];
    }

    if (offset != bufferMax || bufferMax >= bufferSize) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lowSide = -1, highSide = -1;
    uint32_t order;
    UBool isCont = FALSE;

    for (;;) {
        lowSide  = ucol_getOffset(elements);
        order    = ucol_next(elements, &status);
        highSide = ucol_getOffset(elements);

        if (order == (uint32_t)UCOL_NULLORDER) {
            break;
        }

        isCont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength < UCOL_QUATERNARY) {
                order = UCOL_IGNORABLE;
            } else {
                order &= UCOL_PRIMARYORDERMASK;
            }
        }

        if (order != UCOL_IGNORABLE) {
            break;
        }
    }

    if (isCont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = order;
    slot->lowOffset  = lowSide;
    slot->highOffset = highSide;

    bufferMax += 1;

    return slot;
}
/**
 * Returns the buffered collation element at <code>offset</code>, reading backward
 * from the iterator when <code>offset</code> is the next unfilled slot.
 *
 * Ignorable elements are skipped. With shifted alternate handling, elements below
 * the variable top are reduced to their primary bits at quaternary strength or
 * above, and treated as ignorable otherwise.
 *
 * @param offset index into the element buffer
 * @return the buffer entry, or NULL if the buffer is full or <code>offset</code>
 *         is neither buffered nor the next slot
 */
const CEI *Target::prevCE(int32_t offset)
{
    if (bufferMin <= offset && offset < bufferMax) {
        return &ceb[offset];
    }

    if (offset != bufferMax || bufferMax >= bufferSize) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lowSide = -1, highSide = -1;
    uint32_t order;
    UBool isCont = FALSE;

    for (;;) {
        // Moving backward: the offset before the call is the high side.
        highSide = ucol_getOffset(elements);
        order    = ucol_previous(elements, &status);
        lowSide  = ucol_getOffset(elements);

        if (order == (uint32_t)UCOL_NULLORDER) {
            break;
        }

        isCont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength < UCOL_QUATERNARY) {
                order = UCOL_IGNORABLE;
            } else {
                order &= UCOL_PRIMARYORDERMASK;
            }
        }

        if (order != UCOL_IGNORABLE) {
            break;
        }
    }

    bufferMax += 1;

    if (isCont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = order;
    slot->lowOffset  = lowSide;
    slot->highOffset = highSide;

    return slot;
}
/**
 * @return the length of the current target string, or 0 when none is set
 */
int32_t Target::stringLength()
{
    if (targetString == NULL) {
        return 0;
    }

    return targetLength;
}
/**
 * @param offset index into the target buffer; not range-checked
 * @return the code unit at <code>offset</code>, or 0x0000 when no target string is set
 */
UChar Target::charAt(int32_t offset)
{
    if (targetString == NULL) {
        return 0x0000;
    }

    return targetBuffer[offset];
}
/**
 * Empties the element buffer and moves the collation element iterator to
 * <code>offset</code>. The status returned by ucol_setOffset() is discarded.
 */
void Target::setOffset(int32_t offset)
{
    UErrorCode status = U_ZERO_ERROR;

    bufferMin = bufferMax = 0;

    ucol_setOffset(elements, offset, &status);
}
/**
 * Resets the element buffer to a single UCOL_NULLORDER entry located at
 * <code>last</code> and moves the collation element iterator there.
 * The status returned by ucol_setOffset() is discarded.
 */
void Target::setLast(int32_t last)
{
    UErrorCode status = U_ZERO_ERROR;
    CEI &sentinel = ceb[0];

    sentinel.order      = (uint32_t)UCOL_NULLORDER;
    sentinel.lowOffset  = last;
    sentinel.highOffset = last;

    bufferMin = 0;
    bufferMax = 1;

    ucol_setOffset(elements, last, &status);
}
// Returns the current offset of the collation element iterator.
int32_t Target::getOffset()
{
return ucol_getOffset(elements);
}
// Asks the character break iterator whether `offset` is a boundary.
UBool Target::isBreakBoundary(int32_t offset)
{
return ubrk_isBoundary(charBreakIterator, offset);
}
// Returns the character break iterator's boundary following `offset`.
int32_t Target::nextBreakBoundary(int32_t offset)
{
return ubrk_following(charBreakIterator, offset);
}
/**
 * Scans forward from <code>offset</code> for the first code unit that is either a
 * lead surrogate or not flagged as unsafe by the collator.
 *
 * @param offset index at which the scan starts
 * @return the index found, or the target length if the scan runs off the end
 */
int32_t Target::nextSafeBoundary(int32_t offset)
{
    for (int32_t pos = offset; pos < targetLength; pos += 1) {
        UChar ch = targetBuffer[pos];

        if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
            return pos;
        }
    }

    return targetLength;
}
/**
 * Identical-strength check: compares the NFD form of the target range
 * [start, end) with the NFD form of the pattern.
 *
 * @param pattern the pattern text
 * @param start   start of the candidate match in the target buffer
 * @param end     limit of the candidate match in the target buffer
 * @return TRUE below identical strength; otherwise TRUE only if both
 *         normalizations succeed and the results are equal
 */
UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
    if (strength < UCOL_IDENTICAL) {
        return TRUE;
    }

    // Note: Normalizer::compare() or similar would also work, but for short strings
    // which may not be in FCD it might be faster to just NFD them.
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString normalizedTarget, normalizedPattern;
    const UnicodeString targetSlice(FALSE, targetBuffer + start, end - start);

    nfd.normalize(targetSlice, normalizedTarget, status);
    nfd.normalize(pattern, normalizedPattern, status);

    // A failed normalization counts as "not identical".
    if (U_FAILURE(status)) {
        return FALSE;
    }

    return normalizedTarget == normalizedPattern;
}
// Number of buckets in the bad-character table; CEs are bucketed by primary order modulo this size.
#define HASH_TABLE_SIZE 257
// Per-bucket shift distances for a mismatching target CE, plus a cache of
// minimum lengths (in chars) indexed by pattern CE position.
class BadCharacterTable : public UMemory
{
public:
BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
~BadCharacterTable();
// Shift stored for the bucket that `ce` hashes to.
int32_t operator[](uint32_t ce) const;
int32_t getMaxSkip() const;
// Cached value for pattern position `index`; `index` is not range-checked.
int32_t minLengthInChars(int32_t index);
private:
static int32_t hash(uint32_t ce);
int32_t maxSkip;
int32_t badCharacterTable[HASH_TABLE_SIZE];
// Allocated by the constructor with plen + 1 entries; stays NULL if construction bails out early.
int32_t *minLengthCache;
};
/**
 * Builds the bad-character shift table for a pattern.
 *
 * maxSkip and every table bucket are initialized to 0 up front, so the object has
 * defined contents even when construction stops early (failed status, empty
 * pattern, or allocation failure). BoyerMooreSearch::search() reads getMaxSkip()
 * before it checks for an empty pattern.
 *
 * @param patternCEs collation elements of the pattern
 * @param data       collator data used to compute minimum lengths in chars
 * @param status     set to U_MEMORY_ALLOCATION_ERROR if an allocation fails;
 *                   nothing further is built if it already indicates a failure
 */
BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : maxSkip(0), minLengthCache(NULL)
{
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = 0;
    }

    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen == 0) {
        return;
    }

    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    // Default shift for any CE that does not occur in the pattern.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for (int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    // Later pattern positions overwrite earlier ones that share a bucket.
    for (int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}
// Frees the minimum-length cache allocated by the constructor.
BadCharacterTable::~BadCharacterTable()
{
DELETE_ARRAY(minLengthCache);
}
// Returns the shift stored in the bucket that `ce` hashes to.
int32_t BadCharacterTable::operator[](uint32_t ce) const
{
return badCharacterTable[hash(ce)];
}
// Returns maxSkip, which the constructor sets to the minimum length in chars of the whole pattern.
int32_t BadCharacterTable::getMaxSkip() const
{
return maxSkip;
}
// Returns the cached minimum length for pattern position `index`.
// No range check, and no check that the cache was allocated.
int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
return minLengthCache[index];
}
// Maps a CE to a bucket index: its primary order modulo HASH_TABLE_SIZE.
int32_t BadCharacterTable::hash(uint32_t ce)
{
return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
}
// Shift distances indexed by pattern CE position, built from the pattern's
// matching suffixes by the constructor.
class GoodSuffixTable : public UMemory
{
public:
GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
~GoodSuffixTable();
// Shift for pattern position `offset`; not range-checked.
int32_t operator[](int32_t offset) const;
private:
// One entry per pattern CE; stays NULL if construction bails out early.
int32_t *goodSuffixTable;
};
// Builds the good-suffix shift table.
// Step 1 fills suff[i] with the number of CEs ending at i that match the tail of the pattern.
// Step 2 converts those into shifts, expressed through badCharacterTable.minLengthInChars();
// entries that are never set keep the default maxSkip.
// If status already failed or the pattern is empty, the table stays NULL.
// status is set to U_MEMORY_ALLOCATION_ERROR if either array cannot be allocated.
GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
: goodSuffixTable(NULL)
{
int32_t patlen = patternCEs.size();
// **** need a better way to deal with this ****
if (U_FAILURE(status) || patlen <= 0) {
return;
}
int32_t *suff = NEW_ARRAY(int32_t, patlen);
int32_t start = patlen - 1, end = - 1;
int32_t maxSkip = badCharacterTable.getMaxSkip();
if (suff == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize suff
suff[patlen - 1] = patlen;
for (int32_t i = patlen - 2; i >= 0; i -= 1) {
// (i > start) means we're inside the last suffix match we found
// ((patlen - 1) - end) is how far the end of that match is from end of pattern
// (i - start) is how far we are from start of that match
// (i + (patlen - 1) - end) is index of same character at end of pattern
// so if any suffix match at that character doesn't extend beyond the last match,
// it's the suffix for this character as well
if (i > start && suff[i + patlen - 1 - end] < i - start) {
suff[i] = suff[i + patlen - 1 - end];
} else {
start = end = i;
int32_t s = patlen;
// walk backward while the CEs ending at i keep matching the pattern's tail
while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
start -= 1;
}
suff[i] = end - start;
}
}
// now build goodSuffixTable
goodSuffixTable = NEW_ARRAY(int32_t, patlen);
if (goodSuffixTable == NULL) {
DELETE_ARRAY(suff);
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize entries to minLengthInChars of the pattern
for (int32_t i = 0; i < patlen; i += 1) {
goodSuffixTable[i] = maxSkip;
}
int32_t prefix = 0;
for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
if (suff[i] == i + 1) {
// this matching suffix is a prefix of the pattern
int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
// for any mis-match before this suffix, we should skip
// so that the front of the pattern (i.e. the prefix)
// lines up with the front of the suffix.
// (patlen - 1 - i) is the start of the suffix
while (prefix < patlen - 1 - i) {
// value of maxSkip means never set...
if (goodSuffixTable[prefix] == maxSkip) {
goodSuffixTable[prefix] = prefixSkip;
}
prefix += 1;
}
}
}
// entries for positions where a suffix match ends overwrite the defaults set above
for (int32_t i = 0; i < patlen - 1; i += 1) {
goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
}
DELETE_ARRAY(suff);
}
// Frees the shift table allocated by the constructor.
GoodSuffixTable::~GoodSuffixTable()
{
DELETE_ARRAY(goodSuffixTable);
}
// Returns the shift for pattern position `offset`.
// No range check, and no check that the table was allocated.
int32_t GoodSuffixTable::operator[](int32_t offset) const
{
return goodSuffixTable[offset];
}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
// TRUE when the pattern produced no collation elements.
// patCEs is dereferenced without a NULL check.
UBool BoyerMooreSearch::empty()
{
return patCEs->size() <= 0;
}
// Returns the CollData pointer passed to the constructor.
CollData *BoyerMooreSearch::getData()
{
return data;
}
// Returns the pattern's CE list; NULL if the constructor did not get that far.
CEList *BoyerMooreSearch::getPatternCEs()
{
return patCEs;
}
// Returns the bad-character table; NULL if the constructor did not get that far.
BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
return badCharacterTable;
}
// Returns the good-suffix table; NULL if the constructor did not get that far.
GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
return goodSuffixTable;
}
/**
 * Builds the pattern's CE list and both shift tables, and a Target when a target
 * string is supplied.
 *
 * Every allocation is checked: a NULL result from <code>new</code> sets
 * <code>status</code> to U_MEMORY_ALLOCATION_ERROR, so a partially built object is
 * never left behind with a success status. Construction stops at the first failure.
 *
 * @param theData       collator data; only the pointer is stored
 * @param patternString the pattern to search for
 * @param targetString  the text to search, or NULL to set it later
 * @param status        nothing is built if this already indicates a failure
 */
BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    if (patCEs == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (goodSuffixTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);

        if (target == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Deletes everything the constructor allocated, in reverse order of creation.
// `data` is not deleted here.
BoyerMooreSearch::~BoyerMooreSearch()
{
delete target;
delete goodSuffixTable;
delete badCharacterTable;
delete patCEs;
}
void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
if (target == NULL) {
target = new Target(data->getCollator(), targetString, patCEs->size(), status);
} else {
target->setTargetString(targetString);
}
}
// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
/*
* TODO:
* * deal with trailing (and leading?) ignorables.
* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
*/
/**
 * Searches the target for the pattern, starting at <code>offset</code>.
 *
 * The empty-pattern and missing-target cases are rejected before the shift tables
 * or the target are touched. A searcher may legitimately have no target yet
 * (constructed with a NULL target string), and that used to be dereferenced.
 *
 * @param offset offset in the target at which to start searching
 * @param start  receives the start of the match, or -1 if there is none
 * @param end    receives the limit of the match, or -1 if there is none
 * @return TRUE if a match was found
 */
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    // Result deliberately unused; the call is retained from the original code.
    /*UCollator *coll =*/ data->getCollator();

    int32_t plen = patCEs->size();

    if (plen <= 0 || target == NULL) {
        // Searching for a zero length pattern, or without a target, always fails.
        start = end = -1;
        return FALSE;
    }

    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    int32_t tOffset = offset + maxSkip;

    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            // NOTE(review): prevCE() can return NULL when the element buffer is
            // full; the results below are dereferenced unchecked, as before.
            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);

            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.
                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all.  Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI  = *target->prevCE(lIndex);
            int32_t mStart   = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;

            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
                    mLimit = minLimit;
                } else {
                    int32_t nbb = target->nextBreakBoundary(minLimit);

                    if (nbb >= lastCEI.highOffset) {
                        mLimit = nbb;
                    }
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end   = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }

        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
    start = -1;
    end = -1;

    return FALSE;
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

View file

@ -245,12 +245,9 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="alphaindex.cpp" />
<ClCompile Include="bms.cpp" />
<ClCompile Include="bmsearch.cpp" />
<ClCompile Include="bocsu.cpp" />
<ClCompile Include="coleitr.cpp" />
<ClCompile Include="coll.cpp" />
<ClCompile Include="colldata.cpp" />
<ClCompile Include="search.cpp" />
<ClCompile Include="sortkey.cpp" />
<ClCompile Include="stsearch.cpp" />
@ -436,34 +433,6 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<CustomBuild Include="unicode\bms.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<CustomBuild Include="unicode\bmsearch.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
@ -493,20 +462,6 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<CustomBuild Include="unicode\colldata.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
@ -1604,4 +1559,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -24,21 +24,12 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="bms.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="bmsearch.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="coleitr.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="coll.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="colldata.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="search.cpp">
<Filter>collation</Filter>
</ClCompile>
@ -796,21 +787,12 @@
</ResourceCompile>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="unicode\bms.h">
<Filter>collation</Filter>
</CustomBuild>
<CustomBuild Include="unicode\bmsearch.h">
<Filter>collation</Filter>
</CustomBuild>
<CustomBuild Include="unicode\coleitr.h">
<Filter>collation</Filter>
</CustomBuild>
<CustomBuild Include="unicode\coll.h">
<Filter>collation</Filter>
</CustomBuild>
<CustomBuild Include="unicode\colldata.h">
<Filter>collation</Filter>
</CustomBuild>
<CustomBuild Include="unicode\search.h">
<Filter>collation</Filter>
</CustomBuild>
@ -1026,4 +1008,4 @@
<Filter>formatting</Filter>
</CustomBuild>
</ItemGroup>
</Project>
</Project>

View file

@ -48,7 +48,6 @@ typedef enum ECleanupI18NType {
UCLN_I18N_UCOL_RES,
UCLN_I18N_UCOL_BLD,
UCLN_I18N_CSDET,
UCLN_I18N_COLL_DATA,
UCLN_I18N_INDEX_CHARACTERS,
UCLN_I18N_GENDERINFO,
UCLN_I18N_CDFINFO,

View file

@ -1,280 +0,0 @@
/*
* Copyright (C) 1996-2012, International Business Machines Corporation and Others.
* All rights reserved.
*/
/**
* \file
* \brief C API: Boyer-Moore StringSearch prototype.
* \internal
*/
#ifndef _BMS_H
#define _BMS_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ucol.h"
#ifndef U_HIDE_INTERNAL_API
/**
* A <code>UCD</code> object holds the Collator-specific data needed to
* compute the length of the shortest string that can
* generate a particular list of CEs.
*
* <code>UCD</code> objects are quite expensive to compute. Because
* of this, they are cached. When you call <code>ucd_open</code> it
* returns a reference counted cached object. When you call <code>ucd_close</code>
* the reference count on the object is decremented but the object is not deleted.
*
* If you do not need to reuse any unreferenced objects in the cache, you can call
* <code>ucd_flushCache</code>. If you no longer need any <code>UCD</code>
* objects, you can call <code>ucd_freeCache</code>.
*
* @internal ICU 4.0.1 technology preview
*/
typedef void UCD;
/**
* Open a <code>UCD</code> object.
*
* @param coll - the collator
* @param status - will be set if any errors occur.
*
* @return the <code>UCD</code> object. You must call
* <code>ucd_close</code> when you are done using the object.
*
* Note: if on return status is set to an error, the only safe
* thing to do with the returned object is to call <code>ucd_close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL UCD * U_EXPORT2
ucd_open(UCollator *coll, UErrorCode *status);
/**
* Release a <code>UCD</code> object.
*
* @param ucd - the object
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL void U_EXPORT2
ucd_close(UCD *ucd);
/**
* Get the <code>UCollator</code> object used to create a <code>UCD</code> object.
* The <code>UCollator</code> object returned may not be the exact
* object that was used to create this object, but it will have the
* same behavior.
*
* @param ucd - the <code>UCD</code> object
*
* @return the <code>UCollator</code> used to create the given
* <code>UCD</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL UCollator * U_EXPORT2
ucd_getCollator(UCD *ucd);
/**
* <code>UCD</code> objects are expensive to compute, and so
* may be cached. This routine will free the cached objects and delete
* the cache.
*
* WARNING: Don't call this until you have called <code>ucd_close</code>
* for each <code>UCD</code> object that you have used. Also,
* DO NOT call this if another thread may be calling <code>ucd_flushCache</code>
* at the same time.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL void U_EXPORT2
ucd_freeCache();
/**
* <code>UCD</code> objects are expensive to compute, and so
* may be cached. This routine will remove any unused <code>UCD</code>
* objects from the cache.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL void U_EXPORT2
ucd_flushCache();
/**
* BMS
*
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
* and a reference to the text being searched.
*
* To do a search, you first need to get a <code>UCD</code> object by calling <code>ucd_open</code>.
* Then you construct a <code>BMS</code> object from the <code>UCD</code> object, the pattern
* string and the target string. Then you call the <code>search</code> method. Here's a code sample:
*
* <pre>
* void boyerMooreExample(UCollator *collator, UChar *pattern, int32_t patternLength, UChar *target, int32_t targetLength)
* {
* UErrorCode status = U_ZERO_ERROR;
* int32_t offset = 0, start = -1, end = -1;
* UCD *ucd = NULL;
* BMS *bms = NULL;
*
* ucd = ucd_open(collator, &status);
* if (U_FAILURE(status)) {
* // could not create a UCD object
* return;
* }
*
* bms = bms_open(ucd, pattern, patternLength, target, targetLength, &status);
* if (U_FAILURE(status)) {
* // could not create a BMS object
* ucd_close(ucd);
* return;
* }
*
*
* // Find all matches
* while (bms_search(bms, offset, &start, &end)) {
* // process the match between start and end
* ...
*
* // advance past the match
* offset = end;
* }
*
* // at this point, if offset == 0, there were no matches
* if (offset == 0) {
* // handle the case of no matches
* }
*
* bms_close(bms);
* ucd_close(ucd);
*
* // UCD objects are cached, so the call to
* // ucd_close doesn't delete the object.
* // Call this if you don't need the object any more.
* ucd_flushCache();
* }
* </pre>
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* Known limitations:
* 1) Backwards searching has not been implemented.
*
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
* to be equal to Han characters with the same pronunciation. Because this code ignores
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
*
* 3) In some cases, searching for a pattern that needs to be normalized and ends
* in a discontiguous contraction may fail. The only known cases of this are with
* the Tibetan script. For example searching for the pattern
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
* been unable to find a practical, real-world example of this failure.)
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* @internal ICU 4.0.1 technology preview
*/
struct BMS;
typedef struct BMS BMS; /**< @see BMS */
/**
* Construct a <code>BMS</code> object.
*
* @param ucd - A <code>UCD</code> object holding the Collator-sensitive data
* @param pattern - the string for which to search
* @param patternLength - the length of the string for which to search
* @param target - the string in which to search
* @param targetLength - the length of the string in which to search
* @param status - will be set if any errors occur.
*
* @return the <code>BMS</code> object.
*
* Note: if on return status is set to an error, the only safe
* thing to do with the returned object is to call
* <code>bms_close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL BMS * U_EXPORT2
bms_open(UCD *ucd,
const UChar *pattern, int32_t patternLength,
const UChar *target, int32_t targetLength,
UErrorCode *status);
/**
* Close a <code>BMS</code> object and release all the
* storage associated with it.
*
* @param bms - the <code>BMS</code> object to close.
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL void U_EXPORT2
bms_close(BMS *bms);
/**
* Test the pattern to see if it generates any CEs.
*
* @param bms - the <code>BMS</code> object
* @return <code>TRUE</code> if the pattern string did not generate any CEs
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL UBool U_EXPORT2
bms_empty(BMS *bms);
/**
* Get the <code>UCD</code> object used to create
* a given <code>BMS</code> object.
*
* @param bms - the <code>BMS</code> object
*
* @return - the <code>UCD</code> object used to create
* the given <code>BMS</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL UCD * U_EXPORT2
bms_getData(BMS *bms);
/**
* Search for the pattern string in the target string.
*
* @param bms - the <code>BMS</code> object
* @param offset - the offset in the target string at which to begin the search
* @param start - will be set to the starting offset of the match, or -1 if there's no match
* @param end - will be set to the ending offset of the match, or -1 if there's no match
*
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL UBool U_EXPORT2
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end);
/**
* Set the target string for the match.
*
* @param bms - the <code>BMS</code> object
* @param target - the new target string
* @param targetLength - the length of the new target string
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
U_INTERNAL void U_EXPORT2
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status);
#endif /* U_HIDE_INTERNAL_API */
#endif
#endif /* _BMS_H */

View file

@ -1,228 +0,0 @@
/*
******************************************************************************
* Copyright (C) 1996-2011, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
/**
* \file
* \brief C++ API: Boyer-Moore StringSearch technology preview
* \internal ICU 4.0.1 technology preview
*/
#ifndef B_M_SEARCH_H
#define B_M_SEARCH_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uobject.h"
#include "unicode/ucol.h"
#include "unicode/colldata.h"
U_NAMESPACE_BEGIN
class BadCharacterTable;
class GoodSuffixTable;
class Target;
#ifndef U_HIDE_INTERNAL_API
/**
* BoyerMooreSearch
*
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
* and a reference to the text being searched.
*
* To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
* Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern
* string and the target string. Then you call the <code>search</code> method. Here's a code sample:
*
* <pre>
* void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
* {
* UErrorCode status = U_ZERO_ERROR;
* CollData *collData = CollData::open(collator, status);
*
* if (U_FAILURE(status)) {
* // could not create a CollData object
* return;
* }
*
* BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
*
* if (U_FAILURE(status)) {
* // could not create a BoyerMooreSearch object
* CollData::close(collData);
* return;
* }
*
* int32_t offset = 0, start = -1, end = -1;
*
* // Find all matches
* while (search->search(offset, start, end)) {
* // process the match between start and end
* ...
* // advance past the match
* offset = end;
* }
*
* // at this point, if offset == 0, there were no matches
* if (offset == 0) {
* // handle the case of no matches
* }
*
* delete search;
* CollData::close(collData);
*
* // CollData objects are cached, so the call to
* // CollData::close doesn't delete the object.
* // Call this if you don't need the object any more.
* CollData::flushCollDataCache();
* }
* </pre>
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* Known limitations:
* 1) Backwards searching has not been implemented.
*
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
* to be equal to Han characters with the same pronunciation. Because this code ignores
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
*
* 3) In some cases, searching for a pattern that needs to be normalized and ends
* in a discontiguous contraction may fail. The only known cases of this are with
* the Tibetan script. For example searching for the pattern
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
* been unable to find a practical, real-world example of this failure.)
*
* @internal ICU 4.0.1 technology preview
*
* @see CollData
*/
class U_I18N_API BoyerMooreSearch : public UObject
{
public:
/**
* Construct a <code>BoyerMooreSearch</code> object.
*
* @param theData - A <code>CollData</code> object holding the Collator-sensitive data
* @param patternString - the string for which to search
* @param targetString - the string in which to search or <code>NULL</code> if you will
* set it later by calling <code>setTargetString</code>.
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status);
/**
* The destructor.
*
* @internal ICU 4.0.1 technology preview
*/
~BoyerMooreSearch();
/**
* Test the pattern to see if it generates any CEs.
*
* @return <code>TRUE</code> if the pattern string did not generate any CEs
*
* @internal ICU 4.0.1 technology preview
*/
UBool empty();
/**
* Search for the pattern string in the target string.
*
* @param offset - the offset in the target string at which to begin the search
* @param start - will be set to the starting offset of the match, or -1 if there's no match
* @param end - will be set to the ending offset of the match, or -1 if there's no match
*
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
UBool search(int32_t offset, int32_t &start, int32_t &end);
/**
* Set the target string for the match.
*
* @param targetString - the new target string
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void setTargetString(const UnicodeString *targetString, UErrorCode &status);
// **** no longer need these? ****
// NOTE(review): the accessors below return raw pointers to internal state;
// whether any caller still depends on them is not visible from this header.
/**
* Return the <code>CollData</code> object used for searching
*
* @return the <code>CollData</code> object used for searching
*
* @internal ICU 4.0.1 technology preview
*/
CollData *getData();
/**
* Return the CEs generated by the pattern string.
*
* @return a <code>CEList</code> object holding the CEs generated by the pattern string.
*
* @internal ICU 4.0.1 technology preview
*/
CEList *getPatternCEs();
/**
* Return the <code>BadCharacterTable</code> object computed for the pattern string.
*
* @return the <code>BadCharacterTable</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
BadCharacterTable *getBadCharacterTable();
/**
* Return the <code>GoodSuffixTable</code> object computed for the pattern string.
*
* @return the <code>GoodSuffixTable</code> object computed for the pattern string.
*
* @internal ICU 4.0.1 technology preview
*/
GoodSuffixTable *getGoodSuffixTable();
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
virtual UClassID getDynamicClassID() const;
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
static UClassID getStaticClassID();
private:
// NOTE(review): which of these pointers the object owns is decided in the
// implementation file, not here — confirm before changing lifetimes.
CollData *data;                       // collation data, as returned by getData()
CEList *patCEs;                       // pattern CEs, as returned by getPatternCEs()
BadCharacterTable *badCharacterTable; // as returned by getBadCharacterTable()
GoodSuffixTable *goodSuffixTable;     // as returned by getGoodSuffixTable()
UnicodeString pattern;                // presumably a copy of the pattern string passed to the constructor — confirm
Target *target;                       // presumably wraps the current target string — confirm
};
#endif /* U_HIDE_INTERNAL_API */
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION
#endif // #ifndef B_M_SEARCH_H

View file

@ -37,7 +37,7 @@ DEFS += -D'U_TOPSRCDIR="$(top_srcdir)/"' -D'U_TOPBUILDDIR="$(BUILDDIR)"'
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M) $(LIB_THREAD)
OBJECTS = aliastst.o allcoll.o apicoll.o astrotst.o callimts.o calregts.o caltest.o \
caltztst.o canittst.o citrtest.o cntabcol.o convtest.o currcoll.o \
caltztst.o canittst.o citrtest.o cntabcol.o colldata.o convtest.o currcoll.o \
fldset.o dadrfmt.o dadrcal.o dadrcoll.o dcfmapts.o decoll.o dtfmapts.o dtfmrgts.o dtfmtrtts.o dtfmttst.o \
dtptngts.o encoll.o escoll.o ficoll.o frcoll.o g7coll.o intltest.o \
itercoll.o itformat.o itmajor.o itutil.o jacoll.o lcukocol.o \

View file

@ -10,7 +10,6 @@
#if !UCONFIG_NO_COLLATION
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/usearch.h"
#include "cmemory.h"
@ -26,27 +25,16 @@
#include "unicode/ustring.h"
#include "hash.h"
#include "uhash.h"
#include "ucln_in.h"
#include "ucol_imp.h"
#include "umutex.h"
#include "uassert.h"
#include "unicode/colldata.h"
U_NAMESPACE_BEGIN
#include "colldata.h"
#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
#ifdef INSTRUMENT_CELIST
int32_t CEList::_active = 0;
int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
#endif
CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
: ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
{
@ -78,11 +66,6 @@ CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
strengthMask |= UCOL_PRIMARYORDERMASK;
}
#ifdef INSTRUMENT_CELIST
_active += 1;
_histogram[0] += 1;
#endif
ces = ceBuffer;
while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
@ -114,10 +97,6 @@ CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
CEList::~CEList()
{
#ifdef INSTRUMENT_CELIST
_active -= 1;
#endif
if (ces != ceBuffer) {
DELETE_ARRAY(ces);
}
@ -131,11 +110,6 @@ void CEList::add(uint32_t ce, UErrorCode &status)
if (listSize >= listMax) {
int32_t newMax = listMax + CELIST_BUFFER_SIZE;
#ifdef INSTRUMENT_CELIST
_histogram[listSize / CELIST_BUFFER_SIZE] += 1;
#endif
uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
if (newCEs == NULL) {
@ -190,14 +164,6 @@ int32_t CEList::size() const
return listSize;
}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
#ifdef INSTRUMENT_STRING_LIST
int32_t StringList::_lists = 0;
int32_t StringList::_strings = 0;
int32_t StringList::_histogram[101] = {0};
#endif
StringList::StringList(UErrorCode &status)
: strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
{
@ -211,11 +177,6 @@ StringList::StringList(UErrorCode &status)
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
#ifdef INSTRUMENT_STRING_LIST
_lists += 1;
_histogram[0] += 1;
#endif
}
StringList::~StringList()
@ -228,11 +189,6 @@ void StringList::add(const UnicodeString *string, UErrorCode &status)
if (U_FAILURE(status)) {
return;
}
#ifdef INSTRUMENT_STRING_LIST
_strings += 1;
#endif
if (listSize >= listMax) {
int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
UnicodeString *newStrings = new UnicodeString[newMax];
@ -243,17 +199,6 @@ void StringList::add(const UnicodeString *string, UErrorCode &status)
for (int32_t i=0; i<listSize; ++i) {
newStrings[i] = strings[i];
}
#ifdef INSTRUMENT_STRING_LIST
int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
if (_h > 100) {
_h = 100;
}
_histogram[_h] += 1;
#endif
delete[] strings;
strings = newStrings;
listMax = newMax;
@ -295,38 +240,11 @@ deleteStringList(void *obj)
delete strings;
}
// Hash table deleter callback: destroys a CEList handed over as an untyped pointer.
static void U_CALLCONV
deleteCEList(void *obj)
{
    delete static_cast<CEList *>(obj);
}
// Hash table deleter callback: destroys a UnicodeString handed over as an untyped pointer.
static void U_CALLCONV
deleteUnicodeStringKey(void *obj)
{
    delete static_cast<UnicodeString *>(obj);
}
// Hash table key deleter that intentionally does nothing: the parameter is
// unnamed and the body contains no statements.
static void U_CALLCONV
deleteChars(void * /*obj*/)
{
// char *chars = (char *) obj;
// All the key strings are owned by the
// CollData objects and don't need to
// be freed here.
//DELETE_ARRAY(chars);
}
U_CDECL_END
class CEToStringsMap : public UMemory
class CEToStringsMap
{
public:
CEToStringsMap(UErrorCode &status);
~CEToStringsMap();
@ -334,7 +252,6 @@ public:
StringList *getStringList(uint32_t ce) const;
private:
void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
UHashtable *map;
};
@ -390,260 +307,10 @@ void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCo
uhash_iput(map, ce, (void *) stringList, &status);
}
// Maps a UnicodeString (key) to the CEList (value) stored for it, backed by a UHashtable.
class StringToCEsMap : public UMemory
{
public:
// Opens the underlying hash table; on failure, status is set and map may stay NULL.
StringToCEsMap(UErrorCode &status);
// Closes the underlying hash table.
~StringToCEsMap();
// Stores ces under string in the hash table.
void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
// Returns the CEList stored for string, as returned by uhash_get.
const CEList *get(const UnicodeString *string);
// NOTE(review): no definition of free() is visible alongside the other members — confirm it is defined or unused.
void free(const CEList *list);
private:
UHashtable *map; // the backing hash table; NULL until successfully opened
};
// Opens the backing hash table (UnicodeString keys) and installs the deleters
// for values (CEList) and keys (UnicodeString). Does nothing when status
// already signals failure; leaves the deleters unset when uhash_open fails.
StringToCEsMap::StringToCEsMap(UErrorCode &status)
    : map(NULL)
{
    if (!U_FAILURE(status)) {
        map = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, uhash_compareLong, &status);

        if (!U_FAILURE(status)) {
            uhash_setValueDeleter(map, deleteCEList);
            uhash_setKeyDeleter(map, deleteUnicodeStringKey);
        }
    }
}
// Closes the backing hash table.
// NOTE(review): map can still be NULL if the constructor bailed out early;
// presumably uhash_close tolerates NULL — confirm.
StringToCEsMap::~StringToCEsMap()
{
uhash_close(map);
}
void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
{
uhash_put(map, (void *) string, (void *) ces, &status);
}
// Looks up the CEList stored for string; returns whatever uhash_get yields.
const CEList *StringToCEsMap::get(const UnicodeString *string)
{
    void *found = uhash_get(map, string);

    return static_cast<const CEList *>(found);
}
// One cache slot: a CollData pointer plus a reference count.
// The entry owns its CollData: the destructor deletes data.
class CollDataCacheEntry : public UMemory
{
public:
CollDataCacheEntry(CollData *theData);
~CollDataCacheEntry();
CollData *data;   // deleted by the destructor
int32_t refCount; // starts at 1
};
// Wraps theData with an initial reference count of 1.
CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
: data(theData), refCount(1)
{
// nothing else to do
}
// Deletes the wrapped CollData regardless of refCount; code that deletes an
// entry must not delete its data again.
CollDataCacheEntry::~CollDataCacheEntry()
{
// check refCount?
delete data;
}
// Cache of CollData objects stored in a UHashtable whose values are
// CollDataCacheEntry objects.
class CollDataCache : public UMemory
{
public:
// Opens the hash table; sets status on failure.
CollDataCache(UErrorCode &status);
// Closes the hash table.
~CollDataCache();
// Returns the CollData for collator, creating and caching one if none is cached.
CollData *get(UCollator *collator, UErrorCode &status);
// Decrements the reference count of the entry holding collData, if there is one.
void unref(CollData *collData);
// Removes every entry whose reference count is zero or less.
void flush();
private:
// Builds the cache key for collator; may return a heap buffer instead of keyBuffer.
static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
// Frees a key buffer.
static void deleteKey(char *key);
UHashtable *cache; // the backing hash table
};
static UMutex lock = U_MUTEX_INITIALIZER;
U_CDECL_BEGIN
// Hash table deleter callback: destroys a CollDataCacheEntry handed over as an untyped pointer.
static void U_CALLCONV
deleteCollDataCacheEntry(void *obj)
{
    delete static_cast<CollDataCacheEntry *>(obj);
}
U_CDECL_END
// Opens the backing hash table (C-string keys) and installs the value and key
// deleters. Skips all work when status already signals failure; leaves the
// deleters unset when uhash_open fails.
CollDataCache::CollDataCache(UErrorCode &status)
    : cache(NULL)
{
    if (!U_FAILURE(status)) {
        cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
    }

    if (!U_FAILURE(status)) {
        uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
        uhash_setKeyDeleter(cache, deleteChars);
    }
}
// Closes the hash table and clears the pointer while holding the file-level lock.
CollDataCache::~CollDataCache()
{
umtx_lock(&lock);
uhash_close(cache);
cache = NULL;
umtx_unlock(&lock);
}
/**
 * Return the cached CollData for a collator, building and caching a new one
 * when none exists yet.
 *
 * The CollData is constructed with the lock released, so two threads may build
 * the same data concurrently; the loser discards its copy and takes a
 * reference on the cached entry.
 *
 * Ownership: a CollDataCacheEntry deletes its CollData in its destructor, so
 * this function only ever deletes the entry, never the data a second time.
 *
 * @param collator - the collator whose data is wanted
 * @param status - set on failure; an error reported while constructing the
 *                 CollData is passed through unchanged, and
 *                 U_MEMORY_ALLOCATION_ERROR is set when an allocation fails.
 *
 * @return the CollData, or NULL on failure.
 */
CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
{
    char keyBuffer[KEY_BUFFER_SIZE];
    int32_t keyLength = KEY_BUFFER_SIZE;
    char *key = getKey(collator, keyBuffer, &keyLength);
    CollData *result = NULL;
    CollDataCacheEntry *entry = NULL, *newEntry = NULL;

    if (key == NULL) {
        // defensive: no key, nothing to look up
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    umtx_lock(&lock);
    entry = (CollDataCacheEntry *) uhash_get(cache, key);

    if (entry == NULL) {
        umtx_unlock(&lock);

        CollData *newData = new CollData(collator, key, keyLength, status);

        if (newData == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            newEntry = new CollDataCacheEntry(newData);

            if (newEntry == NULL) {
                // no entry took ownership, so the data must be freed here
                delete newData;
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }

        if (U_FAILURE(status)) {
            delete newEntry; // also deletes newData

            if (key != keyBuffer) {
                deleteKey(key);
            }

            return NULL;
        }

        umtx_lock(&lock);
        entry = (CollDataCacheEntry *) uhash_get(cache, key);

        if (entry == NULL) {
            // The table stores newData's own copy of the key, so the lookup
            // key can be released afterwards.
            uhash_put(cache, newData->key, newEntry, &status);
            umtx_unlock(&lock);

            if (key != keyBuffer) {
                deleteKey(key);
            }

            if (U_FAILURE(status)) {
                // uhash_put adopts the value once a value deleter is set and
                // disposes of it even when it fails, so newEntry (and with it
                // newData) must not be deleted again here.
                return NULL;
            }

            return newData;
        }
    }

    result = entry->data;
    entry->refCount += 1;
    umtx_unlock(&lock);

    if (key != keyBuffer) {
        deleteKey(key);
    }

    // Non-NULL only when another thread cached an entry while ours was being
    // built; deleting the entry also deletes the CollData it owns.
    delete newEntry;

    return result;
}
// Drops one reference from the cache entry registered under collData's key.
// The entry is not removed here; entries with a count of zero or less are
// removed by flush().
void CollDataCache::unref(CollData *collData)
{
    umtx_lock(&lock);

    CollDataCacheEntry *cached = (CollDataCacheEntry *) uhash_get(cache, collData->key);

    if (cached != NULL) {
        cached->refCount -= 1;
    }

    umtx_unlock(&lock);
}
/**
 * Build the cache key for a collator from its short definition string.
 *
 * @param collator - the collator to describe
 * @param keyBuffer - caller-supplied buffer to try first
 * @param keyBufferLength - in: capacity of keyBuffer; out: capacity of the
 *                          buffer actually returned
 *
 * @return keyBuffer if the string fit, otherwise a heap buffer allocated with
 *         NEW_ARRAY that the caller must release with deleteKey(); NULL if
 *         that allocation failed (in which case *keyBufferLength is unchanged).
 */
char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);

    if (len >= *keyBufferLength) {
        // round to even length, leaving room for terminating null
        int32_t newLength = (len + 2) & ~1;
        char *newBuffer = NEW_ARRAY(char, newLength);

        if (newBuffer == NULL) {
            // Report the failure rather than writing through a null pointer.
            return NULL;
        }

        keyBuffer = newBuffer;
        *keyBufferLength = newLength;

        status = U_ZERO_ERROR;
        len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    }

    // Keep the terminator inside the buffer even if the reported length is
    // unexpectedly negative or too large.
    if (len < 0) {
        len = 0;
    } else if (len >= *keyBufferLength) {
        len = *keyBufferLength - 1;
    }

    keyBuffer[len] = '\0';

    return keyBuffer;
}
void CollDataCache::flush()
{
const UHashElement *element;
int32_t pos = -1;
umtx_lock(&lock);
while ((element = uhash_nextElement(cache, &pos)) != NULL) {
CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
if (entry->refCount <= 0) {
uhash_removeElement(cache, element);
}
}
umtx_unlock(&lock);
}
// Releases a key buffer with DELETE_ARRAY. Callers in this file only pass
// buffers that differ from their stack buffer.
void CollDataCache::deleteKey(char *key)
{
DELETE_ARRAY(key);
}
U_CDECL_BEGIN
// Cleanup callback registered via ucln_i18n_registerCleanup: frees the
// CollData cache and always reports TRUE.
static UBool coll_data_cleanup(void) {
CollData::freeCollDataCache();
return TRUE;
}
U_CDECL_END
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
// Default constructor: performs no initialization at all.
// NOTE(review): no member is set here, while the destructor releases coll and
// ceToCharsStartingWith; presumably this constructor is never used to create
// a live object — confirm.
CollData::CollData()
{
// nothing
}
#define CLONE_COLLATOR
//#define CACHE_CELISTS
CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
: coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
CollData::CollData(UCollator *collator, UErrorCode &status)
: coll(NULL), ceToCharsStartingWith(NULL)
{
// [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
// i.e. other, control, private use, format, surrogate
@ -665,35 +332,12 @@ CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength,
USet *contractions = uset_openEmpty();
int32_t itemCount;
#ifdef CACHE_CELISTS
charsToCEList = new StringToCEsMap(status);
if (U_FAILURE(status)) {
goto bail;
}
#else
charsToCEList = NULL;
#endif
ceToCharsStartingWith = new CEToStringsMap(status);
if (U_FAILURE(status)) {
goto bail;
}
if (cacheKeyLength > KEY_BUFFER_SIZE) {
key = NEW_ARRAY(char, cacheKeyLength);
if (key == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
goto bail;
}
} else {
key = keyBuffer;
}
ARRAY_COPY(key, cacheKey, cacheKeyLength);
#ifdef CLONE_COLLATOR
coll = ucol_safeClone(collator, NULL, NULL, &status);
@ -730,12 +374,8 @@ CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength,
ceToCharsStartingWith->put(ceList->get(0), st, status);
#ifdef CACHE_CELISTS
charsToCEList->put(st, ceList, status);
#else
delete ceList;
delete st;
#endif
}
} else if (len > 0) {
UnicodeString *st = new UnicodeString(buffer, len);
@ -749,12 +389,8 @@ CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength,
ceToCharsStartingWith->put(ceList->get(0), st, status);
#ifdef CACHE_CELISTS
charsToCEList->put(st, ceList, status);
#else
delete ceList;
delete st;
#endif
} else {
// shouldn't happen...
}
@ -821,15 +457,7 @@ CollData::~CollData()
ucol_close(coll);
#endif
if (key != keyBuffer) {
DELETE_ARRAY(key);
}
delete ceToCharsStartingWith;
#ifdef CACHE_CELISTS
delete charsToCEList;
#endif
}
UCollator *CollData::getCollator() const
@ -844,9 +472,6 @@ const StringList *CollData::getStringList(int32_t ce) const
const CEList *CollData::getCEList(const UnicodeString *string) const
{
#ifdef CACHE_CELISTS
return charsToCEList->get(string);
#else
UErrorCode status = U_ZERO_ERROR;
const CEList *list = new CEList(coll, *string, status);
@ -856,14 +481,11 @@ const CEList *CollData::getCEList(const UnicodeString *string) const
}
return list;
#endif
}
void CollData::freeCEList(const CEList *list)
{
#ifndef CACHE_CELISTS
delete list;
#endif
}
int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
@ -885,9 +507,6 @@ int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
for (int32_t s = 0; s < stringCount; s += 1) {
const UnicodeString *string = strings->get(s);
#ifdef CACHE_CELISTS
const CEList *ceList2 = charsToCEList->get(string);
#else
UErrorCode status = U_ZERO_ERROR;
const CEList *ceList2 = new CEList(coll, *string, status);
@ -895,7 +514,6 @@ int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
delete ceList2;
ceList2 = NULL;
}
#endif
if (ceList->matchesAt(offset, ceList2)) {
U_ASSERT(ceList2 != NULL);
@ -909,9 +527,8 @@ int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
if (rlength <= 0) {
// delete before continue to avoid memory leak.
#ifndef CACHE_CELISTS
delete ceList2;
#endif
// ignore any dead ends
continue;
}
@ -922,9 +539,7 @@ int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
}
}
#ifndef CACHE_CELISTS
delete ceList2;
#endif
}
}
@ -1020,89 +635,4 @@ int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
return minLength;
}
/**
 * Obtain the (cached) CollData for a collator.
 *
 * @param collator - the collator
 * @param status - set if any error occurs; U_MEMORY_ALLOCATION_ERROR when the
 *                 cache itself could not be created.
 *
 * @return the CollData, or NULL on failure.
 */
CollData *CollData::open(UCollator *collator, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    CollDataCache *cache = getCollDataCache();

    if (cache == NULL) {
        // getCollDataCache() returns NULL when the cache cannot be created;
        // report that instead of dereferencing a null pointer.
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    return cache->get(collator, status);
}
/**
 * Release a reference obtained from CollData::open.
 *
 * Safe to call with NULL, which is what open() returns on failure, and when
 * the cache is unavailable; both cases are no-ops.
 *
 * @param collData - the object to release, may be NULL
 */
void CollData::close(CollData *collData)
{
    if (collData == NULL) {
        return;
    }

    CollDataCache *cache = getCollDataCache();

    if (cache != NULL) {
        cache->unref(collData);
    }
}
CollDataCache *CollData::collDataCache = NULL;
/**
 * Return the process-wide CollDataCache, creating it on first use.
 *
 * A new cache is built outside the global mutex and published under it. If
 * another thread published first, the local cache is discarded and the
 * published one is returned. The shared pointer is only read under the mutex
 * (or through UMTX_CHECK), never after the mutex is released.
 *
 * @return the cache, or NULL if it could not be allocated or opened.
 */
CollDataCache *CollData::getCollDataCache()
{
    UErrorCode status = U_ZERO_ERROR;
    CollDataCache *cache = NULL;

    UMTX_CHECK(NULL, collDataCache, cache);

    if (cache == NULL) {
        CollDataCache *newCache = new CollDataCache(status);

        // A NULL from new is a failure too; do not publish it or register
        // cleanup for it.
        if (newCache == NULL || U_FAILURE(status)) {
            delete newCache;
            return NULL;
        }

        umtx_lock(NULL);
        if (collDataCache == NULL) {
            collDataCache = newCache;
            ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
        }
        // Snapshot the winner while still holding the lock.
        cache = collDataCache;
        umtx_unlock(NULL);

        if (cache != newCache) {
            delete newCache;
        }
    }

    return cache;
}
void CollData::freeCollDataCache()
{
CollDataCache *cache = NULL;
UMTX_CHECK(NULL, collDataCache, cache);
if (cache != NULL) {
umtx_lock(NULL);
if (collDataCache != NULL) {
collDataCache = NULL;
} else {
cache = NULL;
}
umtx_unlock(NULL);
delete cache;
}
}
void CollData::flushCollDataCache()
{
CollDataCache *cache = NULL;
UMTX_CHECK(NULL, collDataCache, cache);
// **** this will fail if the another ****
// **** thread deletes the cache here ****
if (cache != NULL) {
cache->flush();
}
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION

View file

@ -7,10 +7,17 @@
/**
* \file
* \brief C++ API: Collation data used to compute minLengthInChars.
* \brief Originally added as a C++ API for Collation data used to compute minLengthInChars.
* \internal
*/
/*
* Note: This module was included in ICU 4.0.1 as an @internal technology preview for supporting
* the Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporarily
* moved the module from the i18n directory to intltest, because we have no plan to publish this
* as public API. (2012-12-18 yoshito)
*/
#ifndef COLL_DATA_H
#define COLL_DATA_H
@ -18,21 +25,10 @@
#if !UCONFIG_NO_COLLATION
#include "unicode/uobject.h"
#include "unicode/ucol.h"
U_NAMESPACE_BEGIN
#ifndef U_HIDE_INTERNAL_API
/**
* The size of the internal buffer for the Collator's short description string.
* @internal ICU 4.0.1 technology preview
*/
#define KEY_BUFFER_SIZE 64
/**
* The size of the internal CE buffer in a <code>CEList</code> object
* @internal ICU 4.0.1 technology preview
*/
#define CELIST_BUFFER_SIZE 4
@ -40,31 +36,19 @@ U_NAMESPACE_BEGIN
* \def INSTRUMENT_CELIST
* Define this to enable the <code>CEList</code> objects to collect
* statistics.
* @internal ICU 4.0.1 technology preview
*/
//#define INSTRUMENT_CELIST
/**
* The size of the initial list in a <code>StringList</code> object.
* @internal ICU 4.0.1 technology preview
*/
#define STRING_LIST_BUFFER_SIZE 16
/**
* \def INSTRUMENT_STRING_LIST
* Define this to enable the <code>StringList</code> objects to
* collect statistics.
* @internal ICU 4.0.1 technology preview
*/
//#define INSTRUMENT_STRING_LIST
/**
* This object holds a list of CEs generated from a particular
* <code>UnicodeString</code>
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API CEList : public UObject
class CEList
{
public:
/**
@ -77,14 +61,11 @@ public:
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
/**
* The destructor.
* @internal ICU 4.0.1 technology preview
*/
~CEList();
@ -92,8 +73,6 @@ public:
* Return the number of CEs in the list.
*
* @return the number of CEs in the list.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t size() const;
@ -103,8 +82,6 @@ public:
* @param index - the index of the CE to return
*
* @return the CE, or <code>0</code> if <code>index</code> is out of range
*
* @internal ICU 4.0.1 technology preview
*/
uint32_t get(int32_t index) const;
@ -116,8 +93,6 @@ public:
* @param other - the other <code>CEList</code>
*
* @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
UBool matchesAt(int32_t offset, const CEList *other) const;
@ -127,22 +102,9 @@ public:
* @param index - the index
*
* @return a reference to the given CE in the list
*
* @internal ICU 4.0.1 technology preview
*/
uint32_t &operator[](int32_t index) const;
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
virtual UClassID getDynamicClassID() const;
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
static UClassID getStaticClassID();
private:
void add(uint32_t ce, UErrorCode &status);
@ -150,21 +112,14 @@ private:
uint32_t *ces;
int32_t listMax;
int32_t listSize;
#ifdef INSTRUMENT_CELIST
static int32_t _active;
static int32_t _histogram[10];
#endif
};
/**
* StringList
*
* This object holds a list of <code>UnicodeString</code> objects.
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API StringList : public UObject
class StringList
{
public:
/**
@ -175,15 +130,11 @@ public:
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
StringList(UErrorCode &status);
/**
* The destructor.
*
* @internal ICU 4.0.1 technology preview
*/
~StringList();
@ -192,8 +143,6 @@ public:
*
* @param string - the string to add
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void add(const UnicodeString *string, UErrorCode &status);
@ -203,8 +152,6 @@ public:
* @param chars - the address of the array of code points
* @param count - the number of code points in the array
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void add(const UChar *chars, int32_t count, UErrorCode &status);
@ -215,8 +162,6 @@ public:
*
* @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
* if <code>index</code> is out of bounds.
*
* @internal ICU 4.0.1 technology preview
*/
const UnicodeString *get(int32_t index) const;
@ -224,43 +169,22 @@ public:
* Get the number of strings in the list.
*
* @return the number of strings in the list.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t size() const;
/**
* the UObject glue...
* @internal ICU 4.0.1 technology preview
*/
virtual UClassID getDynamicClassID() const;
/**
* the UObject glue...
* @internal ICU 4.0.1 technology preview
*/
static UClassID getStaticClassID();
private:
UnicodeString *strings;
int32_t listMax;
int32_t listSize;
#ifdef INSTRUMENT_STRING_LIST
static int32_t _lists;
static int32_t _strings;
static int32_t _histogram[101];
#endif
};
#endif /* U_HIDE_INTERNAL_API */
/*
* Forward references to internal classes.
*/
class StringToCEsMap;
class CEToStringsMap;
class CollDataCache;
#ifndef U_HIDE_INTERNAL_API
/**
* CollData
*
@ -276,10 +200,8 @@ class CollDataCache;
* If you do not need to reuse any unreferenced objects in the cache, you can call
* <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
* objects, you can call <code>CollData::freeCollDataCache</code>
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API CollData : public UObject
class CollData
{
public:
/**
@ -287,32 +209,18 @@ public:
*
* @param collator - the collator
* @param status - will be set if any errors occur.
*
* @return the <code>CollData</code> object. You must call
* <code>close</code> when you are done using the object.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* <code>CollData::close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
static CollData *open(UCollator *collator, UErrorCode &status);
CollData(UCollator *collator, UErrorCode &status);
/**
* Release a <code>CollData</code> object.
*
* @param collData - the object
*
* @internal ICU 4.0.1 technology preview
* The destructor.
*/
static void close(CollData *collData);
~CollData();
/**
* Get the <code>UCollator</code> object used to create this object.
* The object returned may not be the exact object that was used to
* create this object, but it will have the same behavior.
* @internal ICU 4.0.1 technology preview
*/
UCollator *getCollator() const;
@ -325,8 +233,6 @@ public:
* return a <code>StringList</code> object containing all
* the strings, or <code>NULL</code> if there are
* no such strings.
*
* @internal ICU 4.0.1 technology preview.
*/
const StringList *getStringList(int32_t ce) const;
@ -338,8 +244,6 @@ public:
* @return a <code>CEList</code> object containing the CEs. You
* must call <code>freeCEList</code> when you are finished
* using the <code>CEList</code>.
*
* @internal ICU 4.0.1 technology preview.
*/
const CEList *getCEList(const UnicodeString *string) const;
@ -347,8 +251,6 @@ public:
* Release a <code>CEList</code> returned by <code>getCEList</code>.
*
* @param list - the <code>CEList</code> to free.
*
* @internal ICU 4.0.1 technology preview
*/
void freeCEList(const CEList *list);
@ -360,8 +262,6 @@ public:
* @param offset - the offset of the first CE in the list to use.
*
* @return the length of the shortest string.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
@ -382,75 +282,18 @@ public:
* the number of CEs in the <code>CEList</code>
*
* @return the length of the shortest string.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
virtual UClassID getDynamicClassID() const;
/**
* UObject glue...
* @internal ICU 4.0.1 technology preview
*/
static UClassID getStaticClassID();
/**
* <code>CollData</code> objects are expensive to compute, and so
* may be cached. This routine will free the cached objects and delete
* the cache.
*
* WARNING: Don't call this until you have called <code>close</code>
* for each <code>CollData</code> object that you have used. Also,
* DO NOT call this if another thread may be calling <code>flushCollDataCache</code>
* at the same time.
*
* @internal 4.0.1 technology preview
*/
static void freeCollDataCache();
/**
* <code>CollData</code> objects are expensive to compute, and so
* may be cached. This routine will remove any unused <code>CollData</code>
* objects from the cache.
*
* @internal 4.0.1 technology preview
*/
static void flushCollDataCache();
private:
friend class CollDataCache;
friend class CollDataCacheEntry;
CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status);
~CollData();
CollData();
static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength);
static CollDataCache *getCollDataCache();
UCollator *coll;
StringToCEsMap *charsToCEList;
CEToStringsMap *ceToCharsStartingWith;
char keyBuffer[KEY_BUFFER_SIZE];
char *key;
static CollDataCache *collDataCache;
uint32_t minHan;
uint32_t maxHan;
uint32_t jamoLimits[4];
};
#endif /* U_HIDE_INTERNAL_API */
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION
#endif // #ifndef COLL_DATA_H

View file

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@ -224,6 +224,7 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="bytestrietest.cpp" />
<ClCompile Include="colldata.cpp" />
<ClCompile Include="ucharstrietest.cpp" />
<ClCompile Include="itrbbi.cpp" />
<ClCompile Include="rbbiapts.cpp" />
@ -392,6 +393,7 @@
<ClCompile Include="listformattertest.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="colldata.h" />
<ClInclude Include="itrbbi.h" />
<ClInclude Include="rbbiapts.h" />
<ClInclude Include="rbbitst.h" />
@ -537,4 +539,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -450,6 +450,10 @@
<ClCompile Include="alphaindextst.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="listformattertest.cpp" />
<ClCompile Include="colldata.cpp">
<Filter>collation</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="itrbbi.h">
@ -818,5 +822,9 @@
<ClInclude Include="alphaindextst.h">
<Filter>collation</Filter>
</ClInclude>
<ClInclude Include="listformattertest.h" />
<ClInclude Include="colldata.h">
<Filter>collation</Filter>
</ClInclude>
</ItemGroup>
</Project>
</Project>

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2009, International Business Machines
* Copyright (C) 2005-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -11,7 +11,6 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/ucol.h"
#include "unicode/bmsearch.h"
#include "intltest.h"
@ -34,30 +33,17 @@ public:
virtual void searchTest();
virtual void offsetTest();
virtual void monkeyTest(char *params);
virtual void bmMonkeyTest(char *params);
virtual void boyerMooreTest();
virtual void sharpSTest();
virtual void goodSuffixTest();
virtual void searchTime();
virtual void bmsTest();
virtual void bmSearchTest();
virtual void udhrTest();
virtual void stringListTest();
private:
virtual const char *getPath(char buffer[2048], const char *filename);
virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
const char *name, const char *strength, uint32_t seed);
virtual int32_t bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
BoyerMooreSearch *bms, BoyerMooreSearch *abms,
const char *name, const char *strength, uint32_t seed);
#endif
};
#endif
#endif
#endif

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
* Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved.
*
********************************************************************/
/**
@ -14,13 +14,7 @@
StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
:UPerfTest(argc,argv,status){
int32_t start, end;
#ifdef TEST_BOYER_MOORE_SEARCH
bms = NULL;
#else
srch = NULL;
#endif
pttrn = NULL;
if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){
fprintf(stderr,gUsageString, "strsrchperf");
@ -65,17 +59,8 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
pttrn = temp; /* store word in pttrn */
#endif
#ifdef TEST_BOYER_MOORE_SEARCH
UnicodeString patternString(pttrn, pttrnLen);
UCollator *coll = ucol_open(locale, &status);
CollData *data = CollData::open(coll, status);
targetString = new UnicodeString(src, srcLen);
bms = new BoyerMooreSearch(data, patternString, targetString, status);
#else
/* Create the StringSearch object to be used in the performance test. */
srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status);
#endif
if(U_FAILURE(status)){
fprintf(stderr, "FAILED to create UPerfTest object. Error: %s\n", u_errorName(status));
@ -85,23 +70,12 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
}
/**
 * Destructor. Releases what this test object holds: the CollData and
 * collator obtained through bms, the bms and targetString objects, the
 * pttrn buffer, and (only when TEST_BOYER_MOORE_SEARCH is not defined)
 * the srch handle.
 */
StringSearchPerformanceTest::~StringSearchPerformanceTest() {
// Fetch the CollData and its collator before bms is deleted; both are closed after the deletes below.
// NOTE(review): bms is dereferenced without a NULL check, unlike pttrn and srch further down - confirm it is always set.
CollData *data = bms->getData();
UCollator *coll = data->getCollator();
delete bms;
delete targetString;
CollData::close(data);
ucol_close(coll);
if (pttrn != NULL) {
free(pttrn);
}
#ifndef TEST_BOYER_MOORE_SEARCH
if (srch != NULL) {
usearch_close(srch);
}
#endif
}
UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
@ -117,20 +91,12 @@ UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool
}
/**
 * Creates the performance function that runs ICUForwardSearch over
 * src/srcLen with the pattern pttrn/pttrnLen.
 * When TEST_BOYER_MOORE_SEARCH is defined the function is given the
 * BoyerMooreSearch object (bms); otherwise it is given the
 * UStringSearch handle (srch).
 *
 * @return a newly allocated StringSearchPerfFunction.
 */
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Forward_Search(){
#ifdef TEST_BOYER_MOORE_SEARCH
StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUForwardSearch, bms, src, srcLen, pttrn, pttrnLen);
#else
StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUForwardSearch, srch, src, srcLen, pttrn, pttrnLen);
#endif
return func;
}
/**
 * Creates the performance function that runs ICUBackwardSearch over
 * src/srcLen with the pattern pttrn/pttrnLen.
 * When TEST_BOYER_MOORE_SEARCH is defined the function is given the
 * BoyerMooreSearch object (bms); otherwise it is given the
 * UStringSearch handle (srch).
 *
 * @return a newly allocated StringSearchPerfFunction.
 */
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Backward_Search(){
#ifdef TEST_BOYER_MOORE_SEARCH
StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUBackwardSearch, bms, src, srcLen, pttrn, pttrnLen);
#else
StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUBackwardSearch, srch, src, srcLen, pttrn, pttrnLen);
#endif
return func;
}

View file

@ -1,26 +1,17 @@
/********************************************************************
* COPYRIGHT:
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
* Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved.
*
********************************************************************/
#ifndef _STRSRCHPERF_H
#define _STRSRCHPERF_H
#include "unicode/ubrk.h"
#include "unicode/usearch.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
#include "unicode/uperf.h"
#include <stdlib.h>
#include <stdio.h>
#define TEST_BOYER_MOORE_SEARCH
#ifdef TEST_BOYER_MOORE_SEARCH
typedef void (*StrSrchFn) (BoyerMooreSearch * bms, const UChar *src, int32_t srcLen, const UChar *pttrn, int32_t pttrnLen, UErrorCode *status);
#else
typedef void (*StrSrchFn)(UStringSearch* srch, const UChar* src,int32_t srcLen, const UChar* pttrn, int32_t pttrnLen, UErrorCode* status);
#endif
class StringSearchPerfFunction : public UPerfFunction {
private:
@ -29,39 +20,17 @@ private:
int32_t srcLen;
const UChar* pttrn;
int32_t pttrnLen;
#ifdef TEST_BOYER_MOORE_SEARCH
BoyerMooreSearch *bms;
#else
UStringSearch* srch;
#endif
public:
/**
 * Invokes the stored search function fn with the search object, the
 * source text, and the pattern held by this object.
 * The search object is bms when TEST_BOYER_MOORE_SEARCH is defined and
 * srch otherwise.
 *
 * @param status passed through to fn unchanged.
 */
virtual void call(UErrorCode* status) {
#ifdef TEST_BOYER_MOORE_SEARCH
(*fn)(bms, src, srcLen, pttrn, pttrnLen, status);
#else
(*fn)(srch, src, srcLen, pttrn, pttrnLen, status);
#endif
}
/**
 * Reports the number of operations counted for one iteration.
 * The srcLen/pttrnLen variant is compiled out by the "#if 0", so the
 * value returned is the source length.
 *
 * @return srcLen converted to long.
 */
virtual long getOperationsPerIteration() {
#if 0
return (long)(srcLen/pttrnLen);
#else
return (long) srcLen;
#endif
}
#ifdef TEST_BOYER_MOORE_SEARCH
/**
 * Constructor for the BoyerMooreSearch variant.
 * Stores the given function pointer, search object, text pointers, and
 * lengths in the corresponding members; only the pointers are stored,
 * the text itself is not copied.
 *
 * @param func       the search function that call() will invoke
 * @param search     the BoyerMooreSearch object handed to func
 * @param source     the text to search
 * @param sourceLen  the length of source
 * @param pattern    the pattern to search for
 * @param patternLen the length of pattern
 */
StringSearchPerfFunction(StrSrchFn func, BoyerMooreSearch *search, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen) {
fn = func;
src = source;
srcLen = sourceLen;
pttrn = pattern;
pttrnLen = patternLen;
bms = search;
}
#else
StringSearchPerfFunction(StrSrchFn func, UStringSearch* search, const UChar* source,int32_t sourceLen, const UChar* pattern, int32_t patternLen) {
fn = func;
src = source;
@ -70,7 +39,6 @@ public:
pttrnLen = patternLen;
srch = search;
}
#endif
};
class StringSearchPerformanceTest : public UPerfTest {
@ -79,42 +47,17 @@ private:
int32_t srcLen;
UChar* pttrn;
int32_t pttrnLen;
#ifdef TEST_BOYER_MOORE_SEARCH
UnicodeString *targetString;
BoyerMooreSearch *bms;
#else
UStringSearch* srch;
#endif
public:
StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status);
~StringSearchPerformanceTest();
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = NULL);
UPerfFunction* Test_ICU_Forward_Search();
UPerfFunction* Test_ICU_Backward_Search();
};
#ifdef TEST_BOYER_MOORE_SEARCH
/**
 * Forward-search driver for the BoyerMooreSearch variant.
 * Calls bms->search() repeatedly until it returns false, feeding the
 * end value from each call back in as the next offset.
 * The source, sourceLen, pattern, patternLen, and status arguments are
 * not used by this body.
 */
void ICUForwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
int32_t offset = 0, start = -1, end = -1;
while (bms->search(offset, start, end)) {
// Continue from the end value produced by this call (presumably the end of the match - confirm against BoyerMooreSearch::search).
offset = end;
}
}
/**
 * Backward-search driver for the BoyerMooreSearch variant.
 * As the note in the body says, there is no backward Boyer-Moore search,
 * so this runs the same loop as the forward driver: it calls
 * bms->search() until it returns false, feeding each end value back in
 * as the next offset.
 * The source, sourceLen, pattern, patternLen, and status arguments are
 * not used by this body.
 */
void ICUBackwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
int32_t offset = 0, start = -1, end = -1;
/* NOTE: No Boyer-Moore backward search yet... */
while (bms->search(offset, start, end)) {
offset = end;
}
}
#else
void ICUForwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceLen, const UChar* pattern, int32_t patternLen, UErrorCode* status) {
int32_t match;
@ -132,6 +75,5 @@ void ICUBackwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceL
match = usearch_previous(srch, status);
}
}
#endif
#endif /* _STRSRCHPERF_H */