mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-1004 New sample codes for Boyer Moore search.
X-SVN-Rev: 5030
This commit is contained in:
parent
e9f3387660
commit
1973c85169
7 changed files with 2135 additions and 0 deletions
170
icu4c/source/samples/search/search.cpp
Normal file
170
icu4c/source/samples/search/search.cpp
Normal file
|
@ -0,0 +1,170 @@
|
|||
/**************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
***************************************************************************
|
||||
* file name: search.cpp
|
||||
*
|
||||
* created on: 2001June8
|
||||
* created by: Helena Shih
|
||||
*
|
||||
* Sample code for the ICU Search C++ routines.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/locid.h"
|
||||
|
||||
#include "strsrch.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString target("A quick fox jumped over the lazy dog.", "");
|
||||
UnicodeString easyPatterns[] = {"FoX", "CAT", "jump", "under" };
|
||||
int exactOffsets[] = { -1, -1, 12, -1 };
|
||||
int tertiaryOffsets[] = { 8, -1, 12, -1 };
|
||||
uint32_t patternIndex[] = { 3, 9, 13, 17 };
|
||||
UnicodeString monkeyTarget("abcdefgh");
|
||||
UnicodeString monkeyTarget2("ijklmnop");
|
||||
|
||||
int i, j;
|
||||
int pos = 0;
|
||||
StringSearch *searchIter = new StringSearch(easyPatterns[0], target, status);
|
||||
fprintf(stdout, "\n");
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to create a StringSearch object for the default locale.\n");
|
||||
}
|
||||
fprintf(stdout, "Try with default normalization mode and strength.\n");
|
||||
for (i = 0; TRUE; i++)
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->reset();
|
||||
pos = searchIter->next();
|
||||
if ( pos != exactOffsets[i] )
|
||||
fprintf(stdout, "Exact match failed at the index %d pattern.\n", i);
|
||||
|
||||
if (i + 1 == 4) {
|
||||
break;
|
||||
}
|
||||
|
||||
searchIter->setPattern(easyPatterns[i+1], status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for %d element.\n", i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "Try now with strength == primary.\n");
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->setStrength(Collator::PRIMARY, status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set strength of the string search object.\n");
|
||||
}
|
||||
searchIter->reset();
|
||||
searchIter->setPattern(easyPatterns[0], status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the first element.\n");
|
||||
}
|
||||
pos = searchIter->first();
|
||||
if (pos != tertiaryOffsets[0])
|
||||
fprintf(stdout, "Tertiary match failed at the first pattern.\n");
|
||||
for (i = 1; i < 4; i++)
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->setPattern(easyPatterns[i], status);
|
||||
searchIter->reset();
|
||||
pos = searchIter->next();
|
||||
if (pos != tertiaryOffsets[i])
|
||||
fprintf(stdout, "Tertiary match failed at index %d pattern.\n", i);
|
||||
}
|
||||
// Going backwards
|
||||
searchIter->reset();
|
||||
searchIter->setPattern(easyPatterns[--i], status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the last element.\n");
|
||||
}
|
||||
pos = searchIter->last();
|
||||
if (pos != tertiaryOffsets[i])
|
||||
fprintf(stdout, "Tertiary match failed at the last pattern.\n");
|
||||
for (; i >= 1 ; --i)
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->setPattern(easyPatterns[i-1], status);
|
||||
searchIter->reset();
|
||||
pos = searchIter->previous();
|
||||
if (pos != tertiaryOffsets[i-1])
|
||||
fprintf(stdout, "Walking backwards: tertiary match failed at index %d pattern.\n", i);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->setTarget(monkeyTarget);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the monkey target.\n");
|
||||
goto cleanup;
|
||||
}
|
||||
searchIter->setStrength(Collator::TERTIARY, status);
|
||||
// change direction again
|
||||
searchIter->reset();
|
||||
searchIter->setPattern(monkeyTarget, status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern as monkey test itself.\n");
|
||||
}
|
||||
pos = searchIter->first();
|
||||
if (pos == -1)
|
||||
fprintf(stdout, "Matching monkey test itself failed.\n");
|
||||
for (i = 0; i < monkeyTarget.length() - 1; i++)
|
||||
{
|
||||
// will always find its substring
|
||||
for (j = i+1; j < monkeyTarget.length(); j++)
|
||||
{
|
||||
UnicodeString temp;
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->reset();
|
||||
monkeyTarget.extract(i, j, temp);
|
||||
searchIter->setPattern(temp, status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the %d -th monkey pattern of length %d.\n", i, j);
|
||||
continue;
|
||||
}
|
||||
pos = searchIter->next();
|
||||
if (pos == -1)
|
||||
fprintf(stdout, "Monkey match failed at index %d in monkey pattern of length %d.\n", i, j);
|
||||
}
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
searchIter->setTarget(monkeyTarget2);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the monkey target2.\n");
|
||||
goto cleanup;
|
||||
}
|
||||
for (i = 0; i < monkeyTarget.length() - 1; i++)
|
||||
{
|
||||
// will never find the match
|
||||
UnicodeString temp;
|
||||
status = U_ZERO_ERROR;
|
||||
monkeyTarget.extract(i, monkeyTarget.length(), temp);
|
||||
searchIter->reset();
|
||||
searchIter->setPattern(temp, status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
fprintf(stderr, "Failed to set a pattern for the monkey pattern at offset index %d.\n", i);
|
||||
continue;
|
||||
}
|
||||
pos = searchIter->next();
|
||||
if (pos != -1)
|
||||
fprintf(stdout, "Monkey mismatch failed at index %d in monkey pattern.\n", i);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
delete searchIter;
|
||||
return 0;
|
||||
}
|
118
icu4c/source/samples/search/search.dsp
Normal file
118
icu4c/source/samples/search/search.dsp
Normal file
|
@ -0,0 +1,118 @@
|
|||
# Microsoft Developer Studio Project File - Name="search" - Package Owner=<4>
|
||||
# Microsoft Developer Studio Generated Build File, Format Version 6.00
|
||||
# ** DO NOT EDIT **
|
||||
|
||||
# TARGTYPE "Win32 (x86) Console Application" 0x0103
|
||||
|
||||
CFG=search - Win32 Debug
|
||||
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
|
||||
!MESSAGE use the Export Makefile command and run
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "search.mak".
|
||||
!MESSAGE
|
||||
!MESSAGE You can specify a configuration when running NMAKE
|
||||
!MESSAGE by defining the macro CFG on the command line. For example:
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "search.mak" CFG="search - Win32 Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE Possible choices for configuration are:
|
||||
!MESSAGE
|
||||
!MESSAGE "search - Win32 Release" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE "search - Win32 Debug" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE
|
||||
|
||||
# Begin Project
|
||||
# PROP AllowPerConfigDependencies 0
|
||||
# PROP Scc_ProjName ""
|
||||
# PROP Scc_LocalPath ""
|
||||
CPP=cl.exe
|
||||
RSC=rc.exe
|
||||
|
||||
!IF "$(CFG)" == "search - Win32 Release"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 0
|
||||
# PROP BASE Output_Dir "Release"
|
||||
# PROP BASE Intermediate_Dir "Release"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD CPP /nologo /MT /W3 /GX /O2 /I "..\..\..\include" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
||||
# ADD LINK32 ..\..\..\lib\icuuc.lib ..\..\..\lib\icuin.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /libpath:"..\..\..\lib"
|
||||
|
||||
!ELSEIF "$(CFG)" == "search - Win32 Debug"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 1
|
||||
# PROP BASE Output_Dir "Debug"
|
||||
# PROP BASE Intermediate_Dir "Debug"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 1
|
||||
# PROP Output_Dir "Debug"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /I "..\..\..\include" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
||||
# ADD LINK32 ..\..\..\lib\icuucd.lib ..\..\..\lib\icuind.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\..\..\lib"
|
||||
|
||||
!ENDIF
|
||||
|
||||
# Begin Target
|
||||
|
||||
# Name "search - Win32 Release"
|
||||
# Name "search - Win32 Debug"
|
||||
# Begin Group "Source Files"
|
||||
|
||||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\search.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\srchiter.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\strsrch.cpp
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\srchiter.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\strsrch.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
# End Group
|
||||
# End Target
|
||||
# End Project
|
29
icu4c/source/samples/search/search.dsw
Normal file
29
icu4c/source/samples/search/search.dsw
Normal file
|
@ -0,0 +1,29 @@
|
|||
Microsoft Developer Studio Workspace File, Format Version 6.00
|
||||
# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
|
||||
|
||||
###############################################################################
|
||||
|
||||
Project: "search"=.\search.dsp - Package Owner=<4>
|
||||
|
||||
Package=<5>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
Package=<4>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
||||
Global:
|
||||
|
||||
Package=<5>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
Package=<3>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
279
icu4c/source/samples/search/srchiter.cpp
Normal file
279
icu4c/source/samples/search/srchiter.cpp
Normal file
|
@ -0,0 +1,279 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 03/22/2000 helena Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "srchiter.h"
|
||||
|
||||
int32_t const SearchIterator::DONE = -1;
|
||||
int32_t const SearchIterator::BEFORE = -2;
|
||||
|
||||
/**
 * Default constructor: no target text, zero match length, forward
 * direction, overlapping matches allowed.  A character BreakIterator for
 * the default locale is requested as the breaker.
 */
SearchIterator::SearchIterator(void) :
    index(0),
    length(0),
    target(NULL),
    backward(FALSE),   /* going forward */
    breaker(NULL),
    overlap(TRUE)
{
    // The factory requires a status out-parameter; a constructor has no
    // way to hand it back, so the outcome is not inspected further.
    UErrorCode errorCode = U_ZERO_ERROR;
    breaker = BreakIterator::createCharacterInstance(Locale::getDefault(), errorCode);
}
|
||||
|
||||
/**
 * Constructor for use by subclasses.
 *
 * @param target  Iterator over the text to be searched.  It is handed to
 *                the breaker via adoptText() when a breaker is supplied.
 *                Must not be NULL: its startIndex() is read here.
 * @param breaker BreakIterator restricting where matches may start and
 *                end, or NULL for no break detection (isBreakUnit() then
 *                accepts every match).
 */
SearchIterator::SearchIterator(CharacterIterator* target,
                               BreakIterator* breaker) :
    index(0),
    length(0),
    target(0),
    backward(FALSE), /* going forward */
    breaker(NULL),
    overlap(TRUE)
{
    this->target = target;
    this->breaker = breaker;

    // A NULL breaker is a documented, legal argument; only hand the text
    // over when there is a breaker to receive it.
    if (this->breaker != NULL) {
        this->breaker->adoptText(this->target);
    }

    index = this->target->startIndex();
    length = 0;
}
|
||||
|
||||
/**
 * Copy constructor.
 *
 * Copies the match length, direction and overlap setting of other, and
 * clones its target iterator and its breaker.  The cloned target is handed
 * to the cloned breaker via adoptText().  The current index is set to the
 * start index of other's target, not to other's current position.
 *
 * @param other The iterator to copy; its target must not be NULL.
 */
SearchIterator::SearchIterator(const SearchIterator& other) :
    length(other.length),
    target(0),
    backward(other.backward),
    breaker(NULL),
    overlap(other.overlap)
{
    index = other.target->startIndex();
    this->target = other.target->clone();

    // other.breaker is a pointer: clone the object it points at.  Casting
    // the pointer variable itself to a BreakIterator reference would treat
    // the pointer's storage as an object.
    if (other.breaker != NULL) {
        this->breaker = other.breaker->clone();
        this->breaker->adoptText(this->target);
    }
}
|
||||
|
||||
/**
 * Destructor.  Only the breaker is deleted here; the target is released
 * through it (deletion of breaker will delete target).
 */
SearchIterator::~SearchIterator()
{
    // delete on a NULL pointer is a no-op, so no guard is needed.
    delete breaker;
    breaker = NULL;
}
|
||||
|
||||
/**
 * Equality operator.
 *
 * @param that The iterator to compare with.
 * @return TRUE if that is this very object, or if breaker, target,
 *         direction, index, match length and overlap setting all compare
 *         equal (checked in that order); FALSE otherwise.
 */
bool_t SearchIterator::operator == (const SearchIterator& that) const
{
    if (this == &that) {
        return TRUE;
    }
    // Short-circuit evaluation keeps the comparisons in their original
    // order and stops at the first difference.
    return !(*that.breaker != *breaker)
        && !(*that.target != *target)
        && that.backward == backward
        && that.index == index
        && that.length == length
        && that.overlap == overlap;
}
|
||||
|
||||
/**
 * Positions the iterator before the text and searches forward.
 *
 * @return Whatever next() returns from that position.
 */
int32_t SearchIterator::first(void)
{
    setIndex(SearchIterator::BEFORE);
    const int32_t matchStart = next();
    return matchStart;
}
|
||||
|
||||
/**
 * Moves the iterator to pos and searches forward from there.
 *
 * @param pos The index to continue from.
 * @return Whatever next() returns from that position.
 */
int32_t SearchIterator::following(int32_t pos)
{
    setIndex(pos);
    const int32_t matchStart = next();
    return matchStart;
}
|
||||
|
||||
/**
 * Marks the iterator as DONE, which previous() treats as "start from the
 * end of the text", and searches backward.
 *
 * @return Whatever previous() returns from that position.
 */
int32_t SearchIterator::last(void)
{
    setIndex(SearchIterator::DONE);
    const int32_t matchStart = previous();
    return matchStart;
}
|
||||
|
||||
/**
 * Moves the iterator to pos and searches backward from there.
 *
 * @param pos The index to continue from.
 * @return Whatever previous() returns from that position.
 */
int32_t SearchIterator::preceding(int32_t pos)
{
    setIndex(pos);
    const int32_t matchStart = previous();
    return matchStart;
}
|
||||
|
||||
/**
 * Finds the next match, searching forward from the current position.
 *
 * Matches reported by handleNext() whose start or end fails isBreakUnit()
 * are skipped and the search resumes one position further on.
 *
 * @return The index of the match, or DONE if the iterator was already
 *         DONE, handleNext() reported an error, or nothing was found.
 */
int32_t SearchIterator::next(void)
{
    if (index == SearchIterator::DONE) {
        return SearchIterator::DONE;
    }

    if (index == SearchIterator::BEFORE) {
        // Starting at the beginning of the text
        index = target->startIndex();
    } else if (length > 0) {
        // Continuing after an earlier match: step one position when
        // overlapping matches are allowed, otherwise skip the whole match.
        index += overlap ? 1 : length;
    }

    // Every round below searches from index + 1, so back up by one first.
    index -= 1;
    backward = FALSE;

    for (;;) {
        UErrorCode status = U_ZERO_ERROR;
        length = 0;
        index = handleNext(index + 1, status);
        if (U_FAILURE(status)) {
            return SearchIterator::DONE;
        }
        if (index == SearchIterator::DONE || isBreakUnit(index, index + length)) {
            break;
        }
    }

    return index;
}
|
||||
|
||||
/**
 * Finds the previous match, searching backward from the current position.
 *
 * Matches reported by handlePrev() whose start or end fails isBreakUnit()
 * are skipped and the search resumes one position further back.  When
 * nothing is found the iterator is left in the BEFORE state.
 *
 * @return The index of the match, or DONE if the iterator was already
 *         BEFORE, handlePrev() reported an error, or nothing was found.
 */
int32_t SearchIterator::previous(void)
{
    if (index == SearchIterator::BEFORE) {
        return SearchIterator::DONE;
    }

    if (index == SearchIterator::DONE) {
        // Starting at the end of the text
        index = target->endIndex();
    } else if (length > 0) {
        // Continuing before an earlier match: with overlap allowed, resume
        // from the last position of that match, otherwise from its start.
        index = overlap ? index + length - 1 : index;
    }

    // Every round below searches from index - 1, so step forward by one first.
    index += 1;
    backward = TRUE;

    for (;;) {
        UErrorCode status = U_ZERO_ERROR;
        length = 0;
        index = handlePrev(index - 1, status);
        if (U_FAILURE(status)) {
            return SearchIterator::DONE;
        }
        if (index == SearchIterator::DONE || isBreakUnit(index, index + length)) {
            break;
        }
    }

    if (index == SearchIterator::DONE) {
        index = SearchIterator::BEFORE;
    }
    return getIndex();
}
|
||||
|
||||
|
||||
/**
 * Returns the current index.  The internal BEFORE marker is not exposed:
 * it is reported as DONE.
 */
int32_t SearchIterator::getIndex() const
{
    if (index == SearchIterator::BEFORE) {
        return SearchIterator::DONE;
    }
    return index;
}
|
||||
|
||||
/**
 * Sets whether a following match may begin inside the previous one.
 *
 * @param allowOverlap TRUE to allow overlapping matches.
 */
void SearchIterator::setOverlapping(bool_t allowOverlap)
{
    this->overlap = allowOverlap;
}
|
||||
|
||||
/**
 * Returns the current overlap setting.
 */
bool_t SearchIterator::isOverlapping(void) const
{
    return this->overlap;
}
|
||||
|
||||
/**
 * Returns the stored match length (0 when no match is current).
 */
int32_t SearchIterator::getMatchLength(void) const
{
    return this->length;
}
|
||||
|
||||
void SearchIterator::reset(void)
|
||||
{
|
||||
length = 0;
|
||||
if (backward == FALSE) {
|
||||
index = 0;
|
||||
target->setToStart();
|
||||
breaker->first();
|
||||
} else {
|
||||
index = SearchIterator::DONE;
|
||||
target->setToEnd();
|
||||
breaker->last();
|
||||
}
|
||||
overlap = TRUE;
|
||||
}
|
||||
|
||||
void SearchIterator::setBreakIterator(const BreakIterator* iterator)
|
||||
{
|
||||
CharacterIterator *buffer = target->clone();
|
||||
delete breaker;
|
||||
breaker = iterator->clone();
|
||||
breaker->adoptText(buffer);
|
||||
}
|
||||
|
||||
/**
 * Returns a reference to the current breaker.  The pointer is dereferenced
 * without a check.
 */
const BreakIterator& SearchIterator::getBreakIterator(void) const
{
    const BreakIterator& current = *breaker;
    return current;
}
|
||||
|
||||
void SearchIterator::setTarget(const UnicodeString& newText)
|
||||
{
|
||||
if (target != NULL && target->getDynamicClassID()
|
||||
== StringCharacterIterator::getStaticClassID()) {
|
||||
((StringCharacterIterator*)target)->setText(newText);
|
||||
}
|
||||
else {
|
||||
delete target;
|
||||
target = new StringCharacterIterator(newText);
|
||||
target->first();
|
||||
breaker->adoptText(target);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Takes iterator as the new target, hands it to the breaker and moves the
 * search position before the start of the text.
 *
 * @param iterator The iterator over the new text to search.
 */
void SearchIterator::adoptTarget(CharacterIterator* iterator)
{
    this->target = iterator;
    this->breaker->adoptText(this->target);
    setIndex(SearchIterator::BEFORE);
}
|
||||
|
||||
/**
 * Returns the iterator over the text being searched.
 *
 * A default-constructed SearchIterator has no target yet; in that case an
 * iterator over an empty string is created and stored on first use (which
 * is why constness is cast away), so that there is something to return.
 */
const CharacterIterator& SearchIterator::getTarget(void) const
{
    SearchIterator* self = (SearchIterator*)this;

    if (self->target != NULL) {
        return *self->target;
    }

    // No text has been set so far: substitute an empty one.
    self->target = new StringCharacterIterator("");
    return *self->target;
}
|
||||
|
||||
/**
 * Copies the text of the current match into result.
 *
 * @param result Receives the matched text; it is emptied first and stays
 *               empty when the match length is not positive.
 *
 * The target iterator is repositioned while the characters are read.
 */
void SearchIterator::getMatchedText(UnicodeString& result)
{
    result.remove();
    if (length <= 0) {
        return;
    }

    int copied = 0;
    UChar ch = target->setIndex(index);
    while (copied < length) {
        result += ch;
        ch = target->next();
        ++copied;
    }
}
|
||||
|
||||
|
||||
/**
 * Stores the length of the current match.
 *
 * @param length The match length to record.
 */
void SearchIterator::setMatchLength(int32_t length)
{
    // The parameter shadows the member of the same name.
    SearchIterator::length = length;
}
|
||||
|
||||
/**
 * Moves the iterator to pos and clears the current match length.
 *
 * @param pos The new index; may also be the BEFORE or DONE marker.
 */
void SearchIterator::setIndex(int32_t pos)
{
    length = 0;
    index = pos;
}
|
||||
|
||||
/**
 * Checks whether the range [start, end) begins and ends on boundaries of
 * the breaker.
 *
 * @param start Index of the first position of the range.
 * @param end   Index just past the range; the end index of the target is
 *              always accepted as an end boundary.
 * @return TRUE if both ends are acceptable, or if there is no breaker.
 */
bool_t SearchIterator::isBreakUnit(int32_t start,
                                   int32_t end)
{
    if (breaker != NULL) {
        // The start is queried first and unconditionally, as before.
        const bool_t startsOnBoundary = breaker->isBoundary(start);
        const bool_t endsOnBoundary =
            (end == target->endIndex()) || breaker->isBoundary(end);
        return startsOnBoundary && endsOnBoundary;
    }
    return TRUE;
}
|
||||
|
||||
|
388
icu4c/source/samples/search/srchiter.h
Normal file
388
icu4c/source/samples/search/srchiter.h
Normal file
|
@ -0,0 +1,388 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 03/22/2000 helena Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef SRCHITER_H
|
||||
#define SRCHITER_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/brkiter.h"
|
||||
|
||||
/**
|
||||
* <code>SearchIterator</code> is an abstract base class that provides methods
|
||||
* to search for a pattern within a text string. Instances of
|
||||
* <code>SearchIterator</code> maintain a current position and scan over
|
||||
* the target text, returning the indices the pattern is matched
|
||||
* and the length of each match.
|
||||
* <p>
|
||||
* <code>SearchIterator</code> is an abstract base class that defines a
|
||||
* protocol for text searching. Subclasses provide concrete implementations of
|
||||
* various search algorithms. For example, {@link StringSearch}
|
||||
* implements language-sensitive pattern matching based on the comparison rules
|
||||
* defined in a {@link RuleBasedCollator} object.
|
||||
* <p>
|
||||
* Internally, <code>SearchIterator</code> scans text using a
|
||||
* {@link CharacterIterator}, and is thus able to scan text held
|
||||
* by any object implementing that protocol. A <code>StringCharacterIterator</code>
|
||||
* is used to scan <code>UnicodeString</code> objects passed to <code>setTarget</code>.
|
||||
* <p>
|
||||
* <code>SearchIterator</code> provides an API that is similar to that of
|
||||
* other text iteration classes such as <code>BreakIterator</code>. Using this
|
||||
* class, it is easy to scan through text looking for all occurrences of a
|
||||
* given pattern. The following example uses a <code>StringSearch</code> object to
|
||||
* find all instances of "fox" in the target string. Any other subclass of
|
||||
* <code>SearchIterator</code> can be used in an identical manner.
|
||||
* <pre><code>
|
||||
* UnicodeString target("The quick brown fox jumped over the lazy fox");
|
||||
* UnicodeString pattern("fox");
|
||||
*
|
||||
* SearchIterator *iter = new StringSearch(pattern, target);
|
||||
*
|
||||
* for (int pos = iter->first(); pos != SearchIterator::DONE; pos = iter->next()) {
|
||||
* printf("Found match at %d pos, length is %d\n", pos, iter->getMatchLength());
|
||||
* }
|
||||
* </code></pre>
|
||||
*
|
||||
* @see StringSearch
|
||||
*/
|
||||
|
||||
class SearchIterator {
|
||||
public:
|
||||
/**
|
||||
* DONE is returned by previous() and next() after all valid
|
||||
* matches have been returned, and by first() and last() if
|
||||
* there are no matches at all.
|
||||
*/
|
||||
static const int32_t DONE;
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~SearchIterator();
|
||||
|
||||
/** copy constructor */
|
||||
SearchIterator(const SearchIterator& other);
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both SearchIterators are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
virtual bool_t operator==(const SearchIterator& that) const;
|
||||
|
||||
/**
|
||||
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
||||
* and vice versa.
|
||||
*/
|
||||
bool_t operator!=(const SearchIterator& that) const;
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed SearchIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
virtual SearchIterator* clone(void) const = 0;
|
||||
|
||||
/**
|
||||
* Return a polymorphic class ID for this object. Different subclasses
|
||||
* will return distinct unequal values.
|
||||
* @stable
|
||||
*/
|
||||
virtual UClassID getDynamicClassID(void) const = 0;
|
||||
|
||||
/**
|
||||
* Return the first index at which the target text matches the search
|
||||
* pattern. The iterator is adjusted so that its current index
|
||||
* (as returned by {@link #getIndex}) is the match position if one was found
|
||||
* and <code>DONE</code> if one was not.
|
||||
*
|
||||
* @return The character index of the first match, or <code>DONE</code> if there
|
||||
* are no matches.
|
||||
*/
|
||||
int32_t first(void);
|
||||
|
||||
/**
|
||||
* Return the first index greater than <tt>pos</tt> at which the target
|
||||
* text matches the search pattern. The iterator is adjusted so that its current index
|
||||
* (as returned by {@link #getIndex}) is the match position if one was found
|
||||
* and <code>DONE</code> if one was not.
|
||||
*
|
||||
* @return The character index of the first match following <code>pos</code>,
|
||||
* or <tt>DONE</tt> if there are no matches.
|
||||
*/
|
||||
int32_t following(int32_t pos);
|
||||
|
||||
/**
|
||||
* Return the last index in the target text at which it matches
|
||||
* the search pattern and adjusts the iteration to point to that position.
|
||||
*
|
||||
* @return The index of the last match, or <tt>DONE</tt> if there
|
||||
* are no matches.
|
||||
*/
|
||||
int32_t last(void);
|
||||
|
||||
/**
|
||||
* Return the first index less than <code>pos</code> at which the target
|
||||
* text matches the search pattern. The iterator is adjusted so that its current index
|
||||
* (as returned by {@link #getIndex}) is the match position if one was found
|
||||
* and <tt>DONE</tt> if one was not.
|
||||
*
|
||||
* @return The character index of the first match preceding <code>pos</code>,
|
||||
* or <code>DONE</code> if there are no matches.
|
||||
*/
|
||||
int32_t preceding(int32_t pos);
|
||||
|
||||
/**
|
||||
* Return the index of the next point at which the text matches the
|
||||
* search pattern, starting from the current position
|
||||
* <p>
|
||||
* @return The index of the next match after the current position,
|
||||
* or <code>DONE</code> if there are no more matches.
|
||||
*
|
||||
* @see #first
|
||||
*/
|
||||
int32_t next(void);
|
||||
|
||||
/**
|
||||
* Return the index of the previous point at which the text matches
|
||||
* the search pattern, starting at the current position
|
||||
*
|
||||
* @return The index of the previous match before the current position,
|
||||
* or <code>DONE</code> if there are no more matches.
|
||||
*/
|
||||
int32_t previous(void);
|
||||
|
||||
/**
|
||||
* Return the current index in the text being searched.
|
||||
* If the iteration has gone past the end of the text
|
||||
* (or past the beginning for a backwards search),
|
||||
* {@link #DONE} is returned.
|
||||
*/
|
||||
int32_t getIndex(void) const;
|
||||
/**
|
||||
* Determines whether overlapping matches are returned. If this
|
||||
* property is <code>true</code>, matches that begin within the
|
||||
* boundary of the previous match are considered valid and will
|
||||
* be returned. For example, when searching for "abab" in the
|
||||
* target text "ababab", both offsets 0 and 2 will be returned
|
||||
* as valid matches if this property is <code>true</code>.
|
||||
* <p>
|
||||
* The default setting of this property is <tt>true</tt>
|
||||
*/
|
||||
void setOverlapping(bool_t allowOverlap);
|
||||
|
||||
/**
|
||||
* Determines whether overlapping matches are returned.
|
||||
*
|
||||
* @see #setOverlapping
|
||||
*/
|
||||
bool_t isOverlapping(void) const;
|
||||
|
||||
/**
|
||||
* Returns the length of text in the target which matches the search
|
||||
* pattern. This call returns a valid result only after a successful
|
||||
* call to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
|
||||
* Just after construction, or after a searching method returns
|
||||
* <tt>DONE</tt>, this method will return 0.
|
||||
*
|
||||
* @return The length of the match in the target text, or 0 if there
|
||||
* is no match currently.
|
||||
*/
|
||||
int32_t getMatchLength(void) const;
|
||||
|
||||
/**
|
||||
* Set the BreakIterator that will be used to restrict the points
|
||||
* at which matches are detected.
|
||||
*
|
||||
* @param breaker A {@link java.text.BreakIterator BreakIterator}
|
||||
* that will be used to restrict the points
|
||||
* at which matches are detected. If a match is found, but the match's start
|
||||
* or end index is not a boundary as determined by
|
||||
* the <tt>BreakIterator</tt>, the match will be rejected and
|
||||
* another will be searched for.
|
||||
*
|
||||
* If this parameter is <tt>null</tt>, no break
|
||||
* detection is attempted.
|
||||
*
|
||||
* @see #getBreakIterator
|
||||
*/
|
||||
/* HSYS : Check, aliasing or owning */
|
||||
void setBreakIterator(const BreakIterator* iterator);
|
||||
|
||||
/**
|
||||
* Returns the BreakIterator that is used to restrict the points
|
||||
* at which matches are detected. This will be the same object
|
||||
* that was passed to the constructor or to <code>setBreakIterator</code>.
|
||||
* Note that <tt>null</tt> is a legal value; it means that break
|
||||
* detection should not be attempted.
|
||||
*
|
||||
* @see #setBreakIterator
|
||||
*/
|
||||
const BreakIterator& getBreakIterator(void) const;
|
||||
|
||||
/**
|
||||
* Set the target text which should be searched and resets the
|
||||
* iterator's position to point before the start of the target text.
|
||||
* This method is useful if you want to re-use an iterator to
|
||||
* search for the same pattern within a different body of text.
|
||||
*
|
||||
* @see #getTarget
|
||||
*/
|
||||
virtual void setTarget(const UnicodeString& newText);
|
||||
|
||||
/**
|
||||
* Set the target text which should be searched and resets the
|
||||
* iterator's position to point before the start of the target text.
|
||||
* This method is useful if you want to re-use an iterator to
|
||||
* search for the same pattern within a different body of text.
|
||||
*
|
||||
* @see #getTarget
|
||||
*/
|
||||
virtual void adoptTarget(CharacterIterator* iterator);
|
||||
/**
|
||||
* Return the target text which is being searched
|
||||
*
|
||||
* @see #setTarget
|
||||
*/
|
||||
const CharacterIterator& getTarget(void) const;
|
||||
|
||||
/** Reset the iteration.
|
||||
*/
|
||||
virtual void reset(void);
|
||||
|
||||
/**
|
||||
* Returns the text that was matched by the most recent call to
|
||||
* {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
|
||||
* If the iterator is not pointing at a valid match (e.g. just after
|
||||
* construction or after <tt>DONE</tt> has been returned), returns
|
||||
* an empty string.
|
||||
*/
|
||||
void getMatchedText(UnicodeString& result);
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
// Protected interface for subclasses
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
protected:
|
||||
SearchIterator();
|
||||
|
||||
/**
|
||||
* Constructor for use by subclasses
|
||||
* <p>
|
||||
* @param target The target text to be searched. This is for internal
|
||||
* use by this class. Subclasses need to maintain their
|
||||
* own reference to or iterator over the target text
|
||||
* for use by their {@link #handleNext handleNext} and
|
||||
* {@link #handlePrev handlePrev} methods. The target will
|
||||
* be adopted and owned by the SearchIterator object.
|
||||
*
|
||||
* @param breaker A {@link BreakIterator} that is used to restrict the points
|
||||
* at which matches are detected. If <tt>handleNext</tt> or
|
||||
* <tt>handlePrev</tt> finds a match, but the match's start
|
||||
* or end index is not a boundary as determined by
|
||||
* the <tt>BreakIterator</tt>, the match is rejected and
|
||||
* <tt>handleNext</tt> or <tt>handlePrev</tt> is called again.
|
||||
* If this parameter is <tt>null</tt>, no break
|
||||
* detection is attempted.
|
||||
*
|
||||
*/
|
||||
SearchIterator(CharacterIterator* target,
|
||||
BreakIterator* breaker);
|
||||
/**
|
||||
* Abstract method which subclasses override to provide the mechanism
|
||||
* for finding the next match in the target text. This allows different
|
||||
* subclasses to provide different search algorithms.
|
||||
* <p>
|
||||
* If a match is found, the implementation should return the index at
|
||||
* which the match starts and should call {@link #setMatchLength setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method
|
||||
* should return DONE and should not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param startAt The index in the target text at which the search
|
||||
* should start.
|
||||
*
|
||||
* @see #setMatchLength
|
||||
*/
|
||||
virtual int32_t handleNext(int32_t startAt, UErrorCode& status) = 0;
|
||||
|
||||
/**
|
||||
* Abstract method which subclasses override to provide the mechanism
|
||||
* for finding the previous match in the target text. This allows different
|
||||
* subclasses to provide different search algorithms.
|
||||
* <p>
|
||||
* If a match is found, the implementation should return the index at
|
||||
* which the match starts and should call {@link #setMatchLength setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method
|
||||
* should return DONE and should not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param startAt The index in the target text at which the search
|
||||
* should start.
|
||||
*
|
||||
* @see #setMatchLength
|
||||
*/
|
||||
virtual int32_t handlePrev(int32_t startAt, UErrorCode& status) = 0;
|
||||
|
||||
/**
|
||||
* Sets the length of the currently matched string in the target text.
|
||||
* Subclasses' <code>handleNext</code> and <code>handlePrev</code>
|
||||
* methods should call this when they find a match in the target text.
|
||||
*/
|
||||
void setMatchLength(int32_t length);
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
// Privates
|
||||
//
|
||||
private:
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
static char fgClassID;
|
||||
private:
|
||||
/**
|
||||
* Private value indicating that the iterator is pointing
|
||||
* before the beginning of the target text.
|
||||
*/
|
||||
static const int32_t BEFORE;
|
||||
|
||||
/**
|
||||
* Internal method used by preceding and following. Sets the index
|
||||
* to point to the given position, and clears any state that's
|
||||
* affected.
|
||||
*/
|
||||
void setIndex(int32_t pos);
|
||||
|
||||
/**
|
||||
* Determine whether the target text bounded by <code>start</code> and
|
||||
* <code>end</code> is one or more whole units of text as determined by
|
||||
* the current <code>BreakIterator</code>.
|
||||
*/
|
||||
bool_t isBreakUnit(int32_t start, int32_t end);
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Private data...
|
||||
//-------------------------------------------------------------------------
|
||||
int32_t index; // Current position in the target text
|
||||
int32_t length; // Length of matched text, or 0
|
||||
bool_t overlap; // Return overlapping matches?
|
||||
CharacterIterator* target; // Target text to be searched
|
||||
BreakIterator* breaker; // Break iterator to constrain matches
|
||||
bool_t backward;
|
||||
};
|
||||
|
||||
inline bool_t SearchIterator::operator!=(const SearchIterator& that) const
{
    // Inequality is simply the negation of operator==.
    return !(*this == that);
}
|
||||
|
||||
#endif
|
||||
|
758
icu4c/source/samples/search/strsrch.cpp
Normal file
758
icu4c/source/samples/search/strsrch.cpp
Normal file
|
@ -0,0 +1,758 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 03/22/2000 helena Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include <memory.h>
|
||||
#include "unicode/coleitr.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "strsrch.h"
|
||||
/**
|
||||
* <code>StringSearch</code> is a <code>SearchIterator</code> that provides
|
||||
* language-sensitive text searching based on the comparison rules defined
|
||||
* in a {@link RuleBasedCollator} object.
|
||||
* Instances of <code>StringSearch</code> function as iterators that
|
||||
* maintain a current position and scan over text returning the index of
|
||||
* characters where the pattern occurs and the length of each match.
|
||||
* <p>
|
||||
* <code>StringSearch</code> uses a version of the fast Boyer-Moore search
|
||||
* algorithm that has been adapted to work with the large character set of
|
||||
* Unicode. See "Efficient Text Searching in Java", to be published in
|
||||
* <i>Java Report</i> in February, 1999, for further information on the algorithm.
|
||||
* <p>
|
||||
* Consult the <code>SearchIterator</code> documentation for information on
|
||||
* and examples of how to use instances of this class to implement text
|
||||
* searching. <code>SearchIterator</code> provides all of the necessary
|
||||
* API; this class only provides constructors and internal implementation
|
||||
* methods.
|
||||
*
|
||||
* @see SearchIterator
|
||||
* @see RuleBasedCollator
|
||||
*
|
||||
* @author Laura Werner
|
||||
* @version 1.0
|
||||
*/
|
||||
|
||||
char StringSearch::fgClassID = 0; // Value is irrelevant // class id
|
||||
/* to be removed */
|
||||
void StringSearch::dumpTables() {
|
||||
int i;
|
||||
for (i = 0; i < 256; i++) {
|
||||
if (shiftTable[i] != minLen) {
|
||||
// debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 256; i++) {
|
||||
if (backShiftTable[i] != minLen) {
|
||||
// debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Construct a <code>StringSearch</code> object using a specific collator
 * and a break iterator.
 *
 * @param pat     The text for which this object will search.
 * @param target  The text in which to search; handed to the SearchIterator
 *                base-class constructor.
 * @param coll    Collator defining the comparison rules.  It is cloned; the
 *                caller keeps ownership of the object passed in.
 * @param breaker Break iterator handed to the SearchIterator base class.
 * @param status  Set to U_ILLEGAL_ARGUMENT_ERROR if coll or target is NULL,
 *                or U_MEMORY_ALLOCATION_ERROR if the collator cannot be cloned.
 */
StringSearch::StringSearch(const UnicodeString& pat,
                           CharacterIterator* target,
                           RuleBasedCollator* coll,
                           BreakIterator* breaker,
                           UErrorCode& status) :
    SearchIterator(target, breaker),
    // coll is validated in the body; do not dereference NULL here.
    strength(coll != NULL ? coll->getStrength() : Collator::TERTIARY),
    pattern(pat),
    valueList(NULL),
    valueListLen(0),
    normLen(0),     // num. of collation elements in pattern.
    minLen(0),      // Min of composed, decomposed versions
    maxLen(0),      // Max
    // Every owned pointer starts out NULL so that the destructor is safe
    // even when construction bails out early.
    collator(NULL),
    iter(NULL),
    it(NULL)
{
    if (U_FAILURE(status)) return;
    if (coll == NULL || target == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    collator = (RuleBasedCollator*)(coll->clone());
    if (collator == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    iter = collator->createCollationElementIterator(*target);
    it = collator->createCollationElementIterator(pat);

    initialize(status);                  // Initialize the Boyer-Moore tables
}
|
||||
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using a specific collator.
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param collator A <code>RuleBasedCollator</code> object which defines the
|
||||
* language-sensitive comparison rules used to determine
|
||||
* whether text in the pattern and target matches.
|
||||
*/
|
||||
StringSearch::StringSearch(const UnicodeString& pat,
                           CharacterIterator* target,
                           RuleBasedCollator* collator,
                           UErrorCode& status) :
    SearchIterator(),
    // The parameter is validated in the body; do not dereference NULL here.
    strength(collator != NULL ? collator->getStrength() : Collator::TERTIARY),
    pattern(pat),
    valueList(NULL),
    valueListLen(0),
    normLen(0),     // num. of collation elements in pattern.
    minLen(0),      // Min of composed, decomposed versions
    maxLen(0),      // Max
    // Owned pointers start out NULL so the destructor is safe on early exit.
    collator(NULL),
    iter(NULL),
    it(NULL)
{
    if (U_FAILURE(status)) return;
    if (collator == NULL || target == NULL) {
        // Nothing has been adopted at this point.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Note: the parameter "collator" shadows the member; the member is
    // always spelled this->collator below.
    this->collator = (RuleBasedCollator*)(collator->clone());
    if (this->collator == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Use the base-class version explicitly: StringSearch::adoptTarget
    // dereferences iter, which does not exist yet.
    SearchIterator::adoptTarget(target);

    // Build both iterators from the clone we own rather than from the
    // caller's collator, whose lifetime we do not control.
    this->iter = this->collator->createCollationElementIterator(*target);
    this->it = this->collator->createCollationElementIterator(pat);
    initialize(status);
}
|
||||
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using the collator and
|
||||
* character boundary detection rules for a given locale
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param loc The locale whose collation and break-detection rules
|
||||
* should be used.
|
||||
*
|
||||
* @exception ClassCastException thrown if the collator for the specified
|
||||
* locale is not a RuleBasedCollator.
|
||||
*/
|
||||
/**
 * Copy constructor.  The new object gets its own clone of the collator, its
 * own copy of the pattern's collation-element list and shift tables, and
 * freshly created collation element iterators.
 */
StringSearch::StringSearch(const StringSearch& that) :
    SearchIterator(that),
    strength(that.strength),
    pattern(that.pattern),
    valueList(NULL),
    valueListLen(that.valueListLen),
    normLen(that.normLen),  // num. of collation elements in pattern.
    minLen(that.minLen),    // Min of composed, decomposed versions
    maxLen(that.maxLen),
    collator(NULL),
    iter(NULL),
    it(NULL)
{
    // The shift tables are derived from the pattern; copy them so the new
    // object can search without re-running initialize().
    int32_t i;
    for (i = 0; i < 256; i++) {
        shiftTable[i] = that.shiftTable[i];
        backShiftTable[i] = that.backShiftTable[i];
    }

    // initialize() allocates valueList with normLen entries, so normLen
    // (not valueListLen) is the number of elements to duplicate.
    if (that.valueList != NULL) {
        valueList = new int32_t[normLen];
        memcpy(valueList, that.valueList, normLen*sizeof(int32_t));
    }

    // Each object deletes its collator in the destructor, so the pointer
    // must not be shared between the two objects.
    if (that.collator != NULL) {
        collator = (RuleBasedCollator*)(that.collator->clone());
        iter = collator->createCollationElementIterator(that.getTarget());
        it = collator->createCollationElementIterator(that.pattern);
    }
}
|
||||
|
||||
/**
 * Construct a <code>StringSearch</code> object using the collator of a
 * given locale.
 *
 * @param pat    The text for which this object will search.
 * @param target The text in which to search; adopted by this object once
 *               the collator has been created successfully.
 * @param loc    The locale whose collation rules should be used.
 * @param status Set to a failure code if the arguments are invalid or the
 *               collator cannot be created.
 */
StringSearch::StringSearch(const UnicodeString& pat,
                           CharacterIterator* target,
                           const Locale& loc,
                           UErrorCode& status) :
    SearchIterator(),
    strength(Collator::TERTIARY),   // placeholder; replaced below
    pattern(pat),
    valueList(NULL),
    valueListLen(0),
    normLen(0),     // num. of collation elements in pattern.
    minLen(0),      // Min of composed, decomposed versions
    maxLen(0),      // Max
    // Owned pointers start out NULL so the destructor is safe on early exit.
    collator(NULL),
    iter(NULL),
    it(NULL)
{
    if (U_FAILURE(status)) return;
    if (target == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    Collator *created = Collator::createInstance(loc, status);
    if (U_FAILURE(status) || created == NULL) {
        delete created;
        if (!U_FAILURE(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    // NOTE(review): unchecked downcast, as in the original code; this
    // assumes the locale's collator is a RuleBasedCollator -- confirm.
    collator = (RuleBasedCollator*)created;
    strength = collator->getStrength();

    // Use the base-class version explicitly: StringSearch::adoptTarget
    // dereferences iter, which does not exist yet.
    SearchIterator::adoptTarget(target);
    iter = collator->createCollationElementIterator(*target);
    it = collator->createCollationElementIterator(pat);

    initialize(status);
}
|
||||
|
||||
bool_t
StringSearch::operator==(const SearchIterator& that) const
{
    // A different dynamic class or differing base-class state means the
    // objects cannot be equal.
    if (that.getDynamicClassID() != getDynamicClassID() ||
        !SearchIterator::operator==(that)) {
        return FALSE;
    }

    // Same class, so the downcast is safe; compare the StringSearch state
    // field by field.
    const StringSearch& other = (const StringSearch&)that;

    if (*other.iter != *iter)               return FALSE;
    if (*other.collator != *collator)       return FALSE;
    if (other.strength != strength)         return FALSE;
    if (other.valueListLen != valueListLen) return FALSE;
    if (memcmp(other.valueList, valueList, valueListLen*sizeof(int32_t)) != 0) return FALSE;
    if (other.pattern != pattern)           return FALSE;
    if (other.normLen != normLen)           return FALSE;
    if (other.minLen != minLen)             return FALSE;
    if (other.maxLen != maxLen)             return FALSE;

    return TRUE;
}
|
||||
|
||||
SearchIterator*
StringSearch::clone(void) const
{
    // Heap-allocate a copy via the copy constructor and return it.
    StringSearch *copy = new StringSearch(*this);
    return copy;
}
|
||||
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using the collator for the default
|
||||
* locale
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param collator A <code>RuleBasedCollator</code> object which defines the
|
||||
* language-sensitive comparison rules used to determine
|
||||
* whether text in the pattern and target matches.
|
||||
*/
|
||||
StringSearch::StringSearch(const UnicodeString& pat,
                           const UnicodeString& newText,
                           UErrorCode& status) :
    SearchIterator(),
    strength(Collator::TERTIARY),   // placeholder; replaced below
    pattern(pat),
    valueList(NULL),
    valueListLen(0),
    normLen(0),     // num. of collation elements in pattern.
    minLen(0),      // Min of composed, decomposed versions
    maxLen(0),      // Max
    // Owned pointers start out NULL so the destructor is safe on early exit.
    collator(NULL),
    iter(NULL),
    it(NULL)
{
    if (U_FAILURE(status)) return;

    Collator *created = Collator::createInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || created == NULL) {
        delete created;
        if (!U_FAILURE(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    // NOTE(review): unchecked downcast, as in the original code; this
    // assumes the default locale's collator is a RuleBasedCollator -- confirm.
    collator = (RuleBasedCollator*)created;
    strength = collator->getStrength();
    iter = collator->createCollationElementIterator(newText);
    it = collator->createCollationElementIterator(pat);

    // Allocate the target iterator only after everything that can fail has
    // succeeded, so it cannot leak.  iter exists by now, which is what
    // StringSearch::adoptTarget requires.
    StringCharacterIterator *s = new StringCharacterIterator(newText);
    if (s == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    this->adoptTarget(s);
    initialize(status);
}
|
||||
|
||||
StringSearch::~StringSearch(void)
{
    // Release everything this object owns.  Deleting a null pointer is a
    // no-op, so no guards are needed.
    delete [] valueList;
    valueList = 0;

    delete iter;
    iter = 0;

    delete collator;
    collator = 0;

    delete it;
    it = 0;
}
|
||||
//-------------------------------------------------------------------
|
||||
// Getters and Setters
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets this object's strength property. The strength determines the
|
||||
* minimum level of difference considered significant during a
|
||||
* search. Generally, {@link Collator#TERTIARY} and
|
||||
* {@link Collator#IDENTICAL} indicate that all differences are
|
||||
* considered significant, {@link Collator#SECONDARY} indicates
|
||||
* that upper/lower case distinctions should be ignored, and
|
||||
* {@link Collator#PRIMARY} indicates that both case and accents
|
||||
* should be ignored. However, the exact meanings of these constants
|
||||
* are determined by individual Collator objects.
|
||||
* <p>
|
||||
* @see Collator#PRIMARY
|
||||
* @see Collator#SECONDARY
|
||||
* @see Collator#TERTIARY
|
||||
* @see Collator#IDENTICAL
|
||||
*/
|
||||
void StringSearch::setStrength(Collator::ECollationStrength newStrength, UErrorCode& status) {
    if (U_FAILURE(status)) return;

    strength = newStrength;

    // The collator's strength must follow ours: the collation element
    // iterator masks out the parts of each element that are not relevant
    // at the collator's current strength (possibly a bug in
    // CollationElementIterator).  As a consequence a Collator cannot be
    // shared among several StringSearch objects whose strengths are
    // adjusted independently.
    collator->setStrength(newStrength);

    // Rebuild the pattern's element list and the shift tables.
    initialize(status);
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns this object's strength property, which indicates what level
|
||||
* of differences are considered significant during a search.
|
||||
* <p>
|
||||
* @see #setStrength
|
||||
*/
|
||||
Collator::ECollationStrength StringSearch::getStrength() const
{
    // Plain accessor for the cached strength value.
    return this->strength;
}
|
||||
|
||||
/**
|
||||
* Set the collator to be used for this string search. Also changes
|
||||
* the search strength to match that of the new collator.
|
||||
* <p>
|
||||
* This method causes internal data such as Boyer-Moore shift tables
|
||||
* to be recalculated, but the iterator's position is unchanged.
|
||||
* <p>
|
||||
* @see #getCollator
|
||||
*/
|
||||
void StringSearch::setCollator(const RuleBasedCollator *coll, UErrorCode& status)
|
||||
{
|
||||
delete iter;
|
||||
delete collator;
|
||||
collator = (RuleBasedCollator*)coll->clone();
|
||||
strength = collator->getStrength();
|
||||
// Also need to recompute the pattern and get a new target iterator
|
||||
iter = collator->createCollationElementIterator(getTarget());
|
||||
initialize(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the RuleBasedCollator being used for this string search.
|
||||
*/
|
||||
const RuleBasedCollator& StringSearch::getCollator(void) const
{
    // Expose the owned collator by reference; ownership stays here.
    const RuleBasedCollator& owned = *collator;
    return owned;
}
|
||||
|
||||
/**
|
||||
* Set the pattern for which to search.
|
||||
* This method causes internal data such as Boyer-Moore shift tables
|
||||
* to be recalculated, but the iterator's position is unchanged.
|
||||
*/
|
||||
void StringSearch::setPattern(const UnicodeString& pat, UErrorCode& status)
|
||||
{
|
||||
pattern = pat;
|
||||
initialize(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the pattern for which this object is searching.
|
||||
*/
|
||||
const UnicodeString& StringSearch::getPattern() const
{
    // Plain accessor; returns a reference to the stored pattern.
    return this->pattern;
}
|
||||
|
||||
/**
|
||||
* Set the target text which should be searched and resets the
|
||||
* iterator's position to point before the start of the new text.
|
||||
* This method is useful if you want to re-use an iterator to
|
||||
* search for the same pattern within a different body of text.
|
||||
*/
|
||||
void StringSearch::adoptTarget(CharacterIterator* target)
{
    // Let the base class take over the iterator first.
    SearchIterator::adoptTarget(target);

    // Point the cached collation element iterator at the new text.
    // fix me: the error code from setText is not reported to the caller.
    UErrorCode ignored = U_ZERO_ERROR;
    iter->setText(*target, ignored);
}
|
||||
void StringSearch::setTarget(const UnicodeString& newText)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
SearchIterator::setTarget(newText);
|
||||
// Since we're caching a CollationElementIterator, recreate it
|
||||
iter->setText(newText, status);
|
||||
}
|
||||
|
||||
void StringSearch::reset(void)
|
||||
{
|
||||
SearchIterator::reset();
|
||||
iter->reset();
|
||||
}//-------------------------------------------------------------------
|
||||
// Privates
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Search forward for matching text, starting at a given location.
|
||||
* Clients should not call this method directly; instead they should call
|
||||
* {@link SearchIterator#next}.
|
||||
* <p>
|
||||
* If a match is found, this method returns the index at which the match
|
||||
* starts and calls {@link SearchIterator#setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method returns
|
||||
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param start The index in the target text at which the search starts.
|
||||
*
|
||||
* @return The index at which the matched text in the target starts, or DONE
|
||||
* if no match was found.
|
||||
* <p>
|
||||
* @see SearchIterator#next
|
||||
* @see SearchIterator#DONE
|
||||
*/
|
||||
int32_t StringSearch::handleNext(int32_t start, UErrorCode& status)
{
    if (U_FAILURE(status))
    {
        return SearchIterator::DONE;
    }
    const CharacterIterator& target = getTarget();

    // Only the parts of a collation element that matter at the current
    // strength take part in the comparison.
    const int32_t mask = getMask(strength);

    // "index" is the candidate END of a match; the pattern is compared
    // right to left from there.
    int32_t index = start + minLen;
    int32_t matchEnd = 0;

    while (index <= target.endIndex())
    {
        int32_t patIndex = normLen;
        int32_t tval = 0, pval = 0;
        bool_t getP = TRUE;     // fetch the next pattern element?

        iter->setOffset(index, status);
        if (U_FAILURE(status))
        {
            return SearchIterator::DONE;
        }
        matchEnd = index;

        while ((patIndex > 0 || !getP) && iter->getOffset() > start)
        {
            // Get the previous character in both the pattern and the target
            tval = iter->previous(status) & mask;
            if (U_FAILURE(status))
            {
                return SearchIterator::DONE;
            }

            if (getP) pval = valueList[--patIndex];
            getP = TRUE;

            if (tval == 0) {            // skip tval, use same pval
                getP = FALSE;
            }
            else if (pval != tval) {    // Mismatch, skip ahead
                index += getShift(tval, patIndex);
                break;
            }
            else if (patIndex == 0) {
                // The values matched, and we're at the beginning of the pattern,
                // which means we matched the whole thing.
                start = iter->getOffset();
                setMatchLength(matchEnd - start);
                return start;
            }
        }

        if (iter->getOffset() <= start) {
            // We hit the beginning of the text being searched, which is
            // possible if it contains lots of ignorable characters.
            // Advance one character and try again.
            index++;
        }
    }
    return SearchIterator::DONE;
}
|
||||
|
||||
/**
|
||||
* Search backward for matching text, starting at a given location.
|
||||
* Clients should not call this method directly; instead they should call
|
||||
* <code>SearchIterator.previous()</code>, which this method overrides.
|
||||
* <p>
|
||||
* If a match is found, this method returns the index at which the match
|
||||
* starts and calls {@link SearchIterator#setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method returns
|
||||
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param start The index in the target text at which the search starts.
|
||||
*
|
||||
* @return The index at which the matched text in the target starts, or DONE
|
||||
* if no match was found.
|
||||
* <p>
|
||||
* @see SearchIterator#previous
|
||||
* @see SearchIterator#DONE
|
||||
*/
|
||||
int32_t StringSearch::handlePrev(int32_t start, UErrorCode& status)
{
    if (U_FAILURE(status))
    {
        return SearchIterator::DONE;
    }
    const int32_t patLen = normLen;

    // "index" is the candidate START of a match; the pattern is compared
    // left to right from there.
    int32_t index = start - minLen;

    // Only the parts of a collation element that matter at the current
    // strength take part in the comparison; the end-of-text marker has to
    // be masked the same way to stay recognisable.
    const int32_t mask = getMask(strength);
    const int32_t done = CollationElementIterator::NULLORDER & mask;

    while (index >= 0) {
        int32_t patIndex = 0;
        int32_t tval = 0, pval = 0;
        bool_t getP = TRUE;     // fetch the next pattern element?

        iter->setOffset(index, status);
        if (U_FAILURE(status))
        {
            return SearchIterator::DONE;
        }

        while ((patIndex < patLen || !getP) && iter->getOffset() < start)
        {
            tval = iter->next(status) & mask;
            if (U_FAILURE(status))
            {
                return SearchIterator::DONE;
            }
            if (getP) pval = valueList[patIndex++];
            getP = TRUE;

            if (tval == done) {
                // End of target; no match.
                return SearchIterator::DONE;
            }
            else if (tval == 0) {
                // tval is ignorable: skip it and keep the same pval.
                getP = FALSE;
            }
            else if (pval != tval) {
                // We didn't match this pattern.  Skip ahead
                int32_t shift = getBackShift(tval, patIndex);
                index -= shift;
                break;
            }
            else if (patIndex == patLen) {
                // The elements matched and we're at the end of the pattern,
                // which means we matched the whole thing.
                setMatchLength(iter->getOffset() - index);
                return index;
            }
        }

        if (iter->getOffset() >= start) {
            // We hit the end of the text being searched, which is
            // possible if it contains lots of ignorable characters.
            // Back up one character and try again.
            index--;
        }
    }
    return SearchIterator::DONE;
}
|
||||
|
||||
/**
|
||||
* Return a bitmask that will select only the portions of a collation
|
||||
* element that are significant at the given strength level.
|
||||
*/
|
||||
int32_t StringSearch::getMask(Collator::ECollationStrength strength)
{
    // PRIMARY keeps the top 16 bits, SECONDARY the top 24 bits, and every
    // other strength keeps all 32 bits.
    if (strength == Collator::PRIMARY) {
        return 0xFFFF0000;
    }
    if (strength == Collator::SECONDARY) {
        return 0xFFFFFF00;
    }
    return 0xFFFFFFFF;
}
|
||||
|
||||
|
||||
/**
 * (Re)compute everything derived from the pattern at the current strength:
 * the list of non-ignorable collation elements (valueList), the length
 * bounds normLen / minLen / maxLen, and the two shift tables.
 *
 * The object's state is only modified once the whole pattern has been
 * processed successfully; on failure the previous state is left intact and
 * status carries the error.
 */
void StringSearch::initialize(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (it == NULL) {
        // The pattern iterator could not be created earlier on.
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    it->setText(pattern, status);
    if (U_FAILURE(status)) {
        // Keep "it": the destructor owns it and will delete it exactly once.
        return;
    }

    const int32_t mask = getMask(strength);

    // Pass 1: see how many non-ignorable collation keys are in the text
    int32_t count = 0;
    int32_t elem;
    while ((elem = it->next(status)) != CollationElementIterator::NULLORDER)
    {
        if (U_FAILURE(status)) {
            return;
        }
        if ((elem & mask) != 0) {
            count++;
        }
    }
    if (U_FAILURE(status)) {
        return;
    }

    // Pass 2: save them all.  Ignorable elements are skipped without
    // consuming a slot, so every slot ends up initialized.
    int32_t *values = new int32_t[count];
    if (values == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    int32_t expandLen = 0;
    int32_t stored = 0;
    it->reset();
    while (stored < count &&
           (elem = it->next(status)) != CollationElementIterator::NULLORDER)
    {
        if (U_FAILURE(status)) {
            break;
        }
        if ((elem & mask) == 0) {
            continue;
        }
        values[stored++] = elem & mask;

        // Keep track of whether there are any expanding-character
        // sequences that can result in one of the characters that's in
        // the pattern.  If there are, we have to reduce the shift
        // distances calculated below to account for it.
        expandLen += it->getMaxExpansion(elem) - 1;
    }
    if (U_FAILURE(status)) {
        delete [] values;
        return;
    }

    // Commit: replace the previous list instead of leaking it.
    delete [] valueList;
    valueList = values;
    valueListLen = stored;
    normLen = stored;

    //
    // We need to remember the size of the composed and decomposed
    // versions of the string.  Standard Boyer-Moore shift calculations
    // can be wrong by an amount up to that difference, since a small
    // number of characters in the pattern can map to a larger
    // number in the text being searched, or vice-versa.
    //
    int32_t uniLen = pattern.length();
    maxLen = uprv_max(normLen, uniLen);
    minLen = uprv_min(normLen, uniLen) - expandLen;

    // Now initialize the shift tables
    //
    // NOTE: This is the most conservative way to build them.  If we had a way
    // of knowing that there were no expanding/contracting chars in the rules,
    // we could get rid of the "- 1" in the shiftTable calculations.
    // But all of the default collators have at least one expansion or
    // contraction, so it probably doesn't matter anyway.
    //
    // minLen can be zero or negative when expandLen is large; a shift must
    // always make progress, so the default entry is clamped to 1.
    const int32_t defaultShift = uprv_max(minLen, 1);
    int32_t i;
    for (i = 0; i < 256; i++) {
        shiftTable[i] = backShiftTable[i] = defaultShift;
    }

    if (normLen == 0) {
        // No non-ignorable elements: there is nothing to index below.
        return;
    }

    for (i = 0; i < normLen-1; i++) {
        shiftTable[hash(valueList[i])] = uprv_max(minLen - i - 1, 1);
    }
    shiftTable[hash(valueList[normLen-1])] = 1;

    for (i = normLen - 1; i > 0; i--) {
        backShiftTable[hash(valueList[i])] = i;
    }
    backShiftTable[hash(valueList[0])] = 1;

    /* dumpTables(); */
}
|
||||
|
||||
/**
|
||||
* Method used by StringSearch to determine how far to the right to
|
||||
* shift the pattern during a Boyer-Moore search.
|
||||
*
|
||||
* @param curValue The current value in the target text
|
||||
* @param curIndex The index in the pattern at which we failed to match
|
||||
* curValue in the target text.
|
||||
*/
|
||||
int32_t StringSearch::getShift( int32_t curValue, int32_t curIndex ) const
{
    const int32_t tableShift = shiftTable[hash(curValue)];

    // When both length bounds agree, the table value is used unchanged.
    if (minLen == maxLen) {
        return tableShift;
    }

    // Otherwise shorten the shift by the number of pattern elements already
    // consumed, but only while the result stays greater than one.
    const int32_t consumed = normLen - curIndex;
    return (tableShift > consumed + 1) ? (tableShift - consumed) : tableShift;
}
|
||||
|
||||
/**
|
||||
* Method used by StringSearch to determine how far to the left to
|
||||
* shift the pattern during a reverse Boyer-Moore search.
|
||||
*
|
||||
* @param curValue The current value in the target text
|
||||
* @param curIndex The index in the pattern at which we failed to match
|
||||
* curValue in the target text.
|
||||
*/
|
||||
int32_t StringSearch::getBackShift( int32_t curValue, int32_t curIndex ) const
{
    const int32_t tableShift = backShiftTable[hash(curValue)];

    // When both length bounds agree, the table value is used unchanged.
    if (minLen == maxLen) {
        return tableShift;
    }

    // Otherwise shorten the shift by the computed slack, but only while
    // the result stays greater than one.
    const int32_t slack = normLen - (minLen - curIndex);
    return (tableShift > slack + 1) ? (tableShift - slack) : tableShift;
}
|
||||
|
||||
/**
|
||||
* Hash a collation element from its full size (32 bits) down into a
|
||||
* value that can be used as an index into the shift tables. Right
|
||||
* now we do a modulus by the size of the hash table.
|
||||
*
|
||||
* TODO: At some point I should experiment to see whether a slightly
|
||||
* more complicated hash function gives us a better distribution
|
||||
* on multilingual text. I doubt it will have much effect on
|
||||
* performance, though.
|
||||
*/
|
||||
int32_t StringSearch::hash(int32_t order)
{
    // Mask rather than take a signed remainder: "% 256" yields a negative
    // result for a negative operand, which would index in front of the
    // 256-entry shift tables.  For non-negative values both forms agree.
    // NOTE(review): this assumes primaryOrder() can return a negative value
    // for elements with the top bit set -- confirm against coleitr.h.
    return CollationElementIterator::primaryOrder(order) & 0xFF;
}
|
||||
|
393
icu4c/source/samples/search/strsrch.h
Normal file
393
icu4c/source/samples/search/strsrch.h
Normal file
|
@ -0,0 +1,393 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 03/22/2000 helena Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef STRSRCH_H
|
||||
#define STRSRCH_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/tblcoll.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "srchiter.h"
|
||||
|
||||
class SearchIterator;
|
||||
/**
|
||||
* <code>StringSearch</code> is a <code>SearchIterator</code> that provides
|
||||
* language-sensitive text searching based on the comparison rules defined
|
||||
* in a {@link RuleBasedCollator} object.
|
||||
* Instances of <code>StringSearch</code> function as iterators
|
||||
* maintain a current position and scan over text returning the index of
|
||||
* characters where the pattern occurs and the length of each match.
|
||||
* <p>
|
||||
* <code>StringSearch</code> uses a version of the fast Boyer-Moore search
|
||||
* algorithm that has been adapted to work with the large character set of
|
||||
* Unicode. See "Efficient Text Searching in Java", to be published in
|
||||
* <i>Java Report</i> in February, 1999, for further information on the algorithm.
|
||||
* <p>
|
||||
* Consult the <code>SearchIterator</code> documentation for information on
|
||||
* and examples of how to use instances of this class to implement text
|
||||
* searching. <code>SearchIterator</code> provides all of the necessary
|
||||
* API; this class only provides constructors and internal implementation
|
||||
* methods.
|
||||
*
|
||||
* @see SearchIterator
|
||||
* @see RuleBasedCollator
|
||||
*
|
||||
* @author Laura Werner
|
||||
* @version 1.0
|
||||
*/
|
||||
|
||||
class StringSearch : public SearchIterator
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using a specific collator and set
|
||||
* of boundary-detection rules.
|
||||
* <p>
|
||||
* @param pat The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param coll A <code>RuleBasedCollator</code> object which defines the
|
||||
* language-sensitive comparison rules used to determine
|
||||
* whether text in the pattern and target matches.
|
||||
*
|
||||
* @param breaker A <code>BreakIterator</code> object used to constrain the matches
|
||||
* that are found. Matches whose start and end indices
|
||||
* in the target text are not boundaries as determined
|
||||
* by the <code>BreakIterator</code> are ignored. If this behavior
|
||||
* is not desired, <code>null</code> can be passed in instead.
|
||||
*/
|
||||
StringSearch(const UnicodeString& pat,
|
||||
CharacterIterator* target,
|
||||
RuleBasedCollator* coll,
|
||||
BreakIterator* breaker,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using a specific collator.
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param collator A <code>RuleBasedCollator</code> object which defines the
|
||||
* language-sensitive comparison rules used to determine
|
||||
* whether text in the pattern and target matches.
|
||||
*/
|
||||
StringSearch(const UnicodeString& pattern,
|
||||
CharacterIterator* target,
|
||||
RuleBasedCollator* collator,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* copy constructor
|
||||
*/
|
||||
StringSearch(const StringSearch& that);
|
||||
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using the collator and
|
||||
* character boundary detection rules for a given locale
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param loc The locale whose collation and break-detection rules
|
||||
* should be used.
|
||||
*
|
||||
* @exception ClassCastException thrown if the collator for the specified
|
||||
* locale is not a RuleBasedCollator.
|
||||
*/
|
||||
StringSearch(const UnicodeString& pattern,
|
||||
CharacterIterator* target,
|
||||
const Locale& loc,
|
||||
UErrorCode& status);
|
||||
/**
|
||||
* Construct a <code>StringSearch</code> object using the collator for the default
|
||||
* locale
|
||||
* <p>
|
||||
* @param pattern The text for which this object will search.
|
||||
*
|
||||
* @param target The text in which to search for the pattern.
|
||||
*
|
||||
* @param collator A <code>RuleBasedCollator</code> object which defines the
|
||||
* language-sensitive comparison rules used to determine
|
||||
* whether text in the pattern and target matches.
|
||||
*/
|
||||
StringSearch(const UnicodeString& pattern,
|
||||
const UnicodeString& target,
|
||||
UErrorCode& status);
|
||||
|
||||
virtual ~StringSearch(void);
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
StringSearch& operator=(const StringSearch& that);
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both BreakIterators are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
virtual bool_t operator==(const SearchIterator& that) const;
|
||||
|
||||
/**
|
||||
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
||||
* and vice versa.
|
||||
*/
|
||||
bool_t operator!=(const SearchIterator& that) const;
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
virtual SearchIterator* clone(void) const;
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
// Getters and Setters
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets this object's strength property. The strength determines the
|
||||
* minimum level of difference considered significant during a
|
||||
* search. Generally, {@link Collator#TERTIARY} and
|
||||
* {@link Collator#IDENTICAL} indicate that all differences are
|
||||
* considered significant, {@link Collator#SECONDARY} indicates
|
||||
* that upper/lower case distinctions should be ignored, and
|
||||
* {@link Collator#PRIMARY} indicates that both case and accents
|
||||
* should be ignored. However, the exact meanings of these constants
|
||||
* are determined by individual Collator objects.
|
||||
* <p>
|
||||
* @see Collator#PRIMARY
|
||||
* @see Collator#SECONDARY
|
||||
* @see Collator#TERTIARY
|
||||
* @see Collator#IDENTICAL
|
||||
*/
|
||||
void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns this object's strength property, which indicates what level
|
||||
* of differences are considered significant during a search.
|
||||
* <p>
|
||||
* @see #setStrength
|
||||
*/
|
||||
Collator::ECollationStrength getStrength(void) const;
|
||||
|
||||
/**
|
||||
* Set the collator to be used for this string search. Also changes
|
||||
* the search strength to match that of the new collator.
|
||||
* <p>
|
||||
* This method causes internal data such as Boyer-Moore shift tables
|
||||
* to be recalculated, but the iterator's position is unchanged.
|
||||
* <p>
|
||||
* @see #getCollator
|
||||
*/
|
||||
void setCollator(const RuleBasedCollator* coll, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Return the RuleBasedCollator being used for this string search.
|
||||
*/
|
||||
const RuleBasedCollator& getCollator() const;
|
||||
|
||||
/**
|
||||
* Set the pattern for which to search.
|
||||
* This method causes internal data such as Boyer-Moore shift tables
|
||||
* to be recalculated, but the iterator's position is unchanged.
|
||||
*/
|
||||
void setPattern(const UnicodeString& pat, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Returns the pattern for which this object is searching.
|
||||
*/
|
||||
const UnicodeString& getPattern() const;
|
||||
|
||||
/**
|
||||
* Set the target text which should be searched and resets the
|
||||
* iterator's position to point before the start of the new text.
|
||||
* This method is useful if you want to re-use an iterator to
|
||||
* search for the same pattern within a different body of text.
|
||||
*/
|
||||
virtual void setTarget(const UnicodeString& newText);
|
||||
|
||||
/**
|
||||
* Set the target text which should be searched and resets the
|
||||
* iterator's position to point before the start of the target text.
|
||||
* This method is useful if you want to re-use an iterator to
|
||||
* search for the same pattern within a different body of text.
|
||||
*
|
||||
* @see #getTarget
|
||||
*/
|
||||
virtual void adoptTarget(CharacterIterator* iterator);
|
||||
|
||||
/** Reset iterator
|
||||
*/
|
||||
virtual void reset(void);
|
||||
/**
|
||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||
* This method is to implement a simple version of RTTI, since not all
|
||||
* C++ compilers support genuine RTTI. Polymorphic operator==() and
|
||||
* clone() methods call this method.
|
||||
*
|
||||
* @return The class ID for this object. All objects of a
|
||||
* given class have the same class ID. Objects of
|
||||
* other classes have different class IDs.
|
||||
*/
|
||||
inline virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
/**
|
||||
* Returns the class ID for this class. This is useful only for
|
||||
* comparing to a return value from getDynamicClassID(). For example:
|
||||
*
|
||||
* Base* polymorphic_pointer = createPolymorphicObject();
|
||||
* if (polymorphic_pointer->getDynamicClassID() ==
|
||||
* Derived::getStaticClassID()) ...
|
||||
*
|
||||
* @return The class ID for all objects of this class.
|
||||
*/
|
||||
inline static UClassID getStaticClassID(void);
|
||||
|
||||
protected:
|
||||
//-------------------------------------------------------------------
|
||||
// Privates
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Search forward for matching text, starting at a given location.
|
||||
* Clients should not call this method directly; instead they should call
|
||||
* {@link SearchIterator#next}.
|
||||
* <p>
|
||||
* If a match is found, this method returns the index at which the match
|
||||
* starts and calls {@link SearchIterator#setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method returns
|
||||
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param start The index in the target text at which the search starts.
|
||||
*
|
||||
* @return The index at which the matched text in the target starts, or DONE
|
||||
* if no match was found.
|
||||
* <p>
|
||||
* @see SearchIterator#next
|
||||
* @see SearchIterator#DONE
|
||||
*/
|
||||
virtual int32_t handleNext(int32_t start, UErrorCode& status);
|
||||
/**
|
||||
* Search backward for matching text ,starting at a given location.
|
||||
* Clients should not call this method directly; instead they should call
|
||||
* <code>SearchIterator.previous()</code>, which this method overrides.
|
||||
* <p>
|
||||
* If a match is found, this method returns the index at which the match
|
||||
* starts and calls {@link SearchIterator#setMatchLength}
|
||||
* with the number of characters in the target
|
||||
* text that make up the match. If no match is found, the method returns
|
||||
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
|
||||
* <p>
|
||||
* @param start The index in the target text at which the search starts.
|
||||
*
|
||||
* @return The index at which the matched text in the target starts, or DONE
|
||||
* if no match was found.
|
||||
* <p>
|
||||
* @see SearchIterator#previous
|
||||
* @see SearchIterator#DONE
|
||||
*/
|
||||
virtual int32_t handlePrev(int32_t start, UErrorCode& status);
|
||||
private:
|
||||
/**
|
||||
* Return a bitmask that will select only the portions of a collation
|
||||
* element that are significant at the given strength level.
|
||||
*/
|
||||
static int32_t getMask(Collator::ECollationStrength strength);
|
||||
|
||||
|
||||
void initialize(UErrorCode& status);
|
||||
/**
|
||||
* Method used by StringSearch to determine how far to the right to
|
||||
* shift the pattern during a Boyer-Moore search.
|
||||
*
|
||||
* @param curValue The current value in the target text
|
||||
* @param curIndex The index in the pattern at which we failed to match
|
||||
* curValue in the target text.
|
||||
*/
|
||||
int32_t getShift( int32_t curValue, int32_t curIndex ) const;
|
||||
|
||||
/**
|
||||
* Method used by StringSearch to determine how far to the left to
|
||||
* shift the pattern during a reverse Boyer-Moore search.
|
||||
*
|
||||
* @param curValue The current value in the target text
|
||||
* @param curIndex The index in the pattern at which we failed to match
|
||||
* curValue in the target text.
|
||||
*/
|
||||
int32_t getBackShift( int32_t curValue, int32_t curIndex ) const;
|
||||
|
||||
/**
|
||||
* Hash a collation element from its full size (32 bits) down into a
|
||||
* value that can be used as an index into the shift tables. Right
|
||||
* now we do a modulus by the size of the hash table.
|
||||
*
|
||||
* TODO: At some point I should experiment to see whether a slightly
|
||||
* more complicated hash function gives us a better distribution
|
||||
* on multilingual text. I doubt it will have much effect on
|
||||
* performance, though.
|
||||
*/
|
||||
static int32_t hash(int32_t order);
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Private Data
|
||||
//
|
||||
CollationElementIterator *iter;
|
||||
RuleBasedCollator *collator;
|
||||
/* HSYS ? Why? Changes to this will not affect collator. no changes to the comparsion result */
|
||||
Collator::ECollationStrength strength;
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Everything from here on down is the data used to represent the
|
||||
// Boyer-Moore shift tables and the code that generates and manipulates
|
||||
// them.
|
||||
//
|
||||
int32_t *valueList;
|
||||
int32_t valueListLen;
|
||||
int32_t shiftTable[256];
|
||||
int32_t backShiftTable[256];
|
||||
|
||||
UnicodeString pattern; // The pattern string
|
||||
int32_t normLen; // num. of collation elements in pattern.
|
||||
int32_t minLen; // Min of composed, decomposed versions
|
||||
int32_t maxLen; // Max
|
||||
CollationElementIterator *it; // to be removed
|
||||
|
||||
private:
|
||||
/* to be removed */
|
||||
void dumpTables();
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
static char fgClassID;
|
||||
};
|
||||
|
||||
// Not-equal operator: the exact logical negation of operator==.
inline bool_t StringSearch::operator!=(const SearchIterator& that) const
{
    const bool_t same = this->operator==(that);
    return !same;
}
|
||||
|
||||
// Polymorphic class ID: reports the same value as the static class ID of
// StringSearch.
inline UClassID StringSearch::getDynamicClassID(void) const
{
    const UClassID classId = StringSearch::getStaticClassID();
    return classId;
}
|
||||
|
||||
// Class ID for StringSearch: the address of the static member fgClassID,
// converted to UClassID.
inline UClassID StringSearch::getStaticClassID(void)
{
    char *idAddress = &fgClassID;
    return (UClassID)idAddress;
}
|
||||
|
||||
|
||||
#endif
|
||||
|
Loading…
Add table
Reference in a new issue