diff --git a/icu4c/source/samples/search/search.cpp b/icu4c/source/samples/search/search.cpp new file mode 100644 index 00000000000..9fa31a2bfc7 --- /dev/null +++ b/icu4c/source/samples/search/search.cpp @@ -0,0 +1,170 @@ +/************************************************************************** +* +* Copyright (C) 2000, International Business Machines +* Corporation and others. All Rights Reserved. +* +*************************************************************************** +* file name: colex.cpp +* +* created on: 2001June8 +* created by: Helena Shih +* +* Sample code for the ICU Search C++ routines. +*/ +#include +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/locid.h" + +#include "strsrch.h" + +int main() +{ + UErrorCode status = U_ZERO_ERROR; + UnicodeString target("A quick fox jumped over the lazy dog.", ""); + UnicodeString easyPatterns[] = {"FoX", "CAT", "jump", "under" }; + int exactOffsets[] = { -1, -1, 12, -1 }; + int tertiaryOffsets[] = { 8, -1, 12, -1 }; + uint32_t patternIndex[] = { 3, 9, 13, 17 }; + UnicodeString monkeyTarget("abcdefgh"); + UnicodeString monkeyTarget2("ijklmnop"); + + int i, j; + int pos = 0; + StringSearch *searchIter = new StringSearch(easyPatterns[0], target, status); + fprintf(stdout, "\n"); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to create a StringSearch object for the default locale.\n"); + } + fprintf(stdout, "Try with default normalization mode and strength.\n"); + for (i = 0; TRUE; i++) + { + status = U_ZERO_ERROR; + searchIter->reset(); + pos = searchIter->next(); + if ( pos != exactOffsets[i] ) + fprintf(stdout, "Exact match failed at the index %d pattern.\n", i); + + if (i + 1 == 4) { + break; + } + + searchIter->setPattern(easyPatterns[i+1], status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for %d element.\n", i); + continue; + } + } + fprintf(stdout, "Try now with strength == primary.\n"); + status = U_ZERO_ERROR; + 
searchIter->setStrength(Collator::PRIMARY, status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set strength of the string search object.\n"); + } + searchIter->reset(); + searchIter->setPattern(easyPatterns[0], status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the first element.\n"); + } + pos = searchIter->first(); + if (pos != tertiaryOffsets[0]) + fprintf(stdout, "Tertiary match failed at the first pattern.\n"); + for (i = 1; i < 4; i++) + { + status = U_ZERO_ERROR; + searchIter->setPattern(easyPatterns[i], status); + searchIter->reset(); + pos = searchIter->next(); + if (pos != tertiaryOffsets[i]) + fprintf(stdout, "Tertiary match failed at index %d pattern.\n", i); + } + // Going backwards + searchIter->reset(); + searchIter->setPattern(easyPatterns[--i], status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the last element.\n"); + } + pos = searchIter->last(); + if (pos != tertiaryOffsets[i]) + fprintf(stdout, "Tertiary match failed at the last pattern.\n"); + for (; i >= 1 ; --i) + { + status = U_ZERO_ERROR; + searchIter->setPattern(easyPatterns[i-1], status); + searchIter->reset(); + pos = searchIter->previous(); + if (pos != tertiaryOffsets[i-1]) + fprintf(stdout, "Walking backwards: tertiary match failed at index %d pattern.\n", i); + } + status = U_ZERO_ERROR; + searchIter->setTarget(monkeyTarget); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the monkey target.\n"); + goto cleanup; + } + searchIter->setStrength(Collator::TERTIARY, status); + // change direction again + searchIter->reset(); + searchIter->setPattern(monkeyTarget, status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern as monkey test itself.\n"); + } + pos = searchIter->first(); + if (pos == -1) + fprintf(stdout, "Matching monkey test itself failed.\n"); + for (i = 0; i < monkeyTarget.length() - 1; i++) + { + // will always find its substring + for (j = 
i+1; j < monkeyTarget.length(); j++) + { + UnicodeString temp; + status = U_ZERO_ERROR; + searchIter->reset(); + monkeyTarget.extract(i, j, temp); + searchIter->setPattern(temp, status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the %d -th monkey pattern of length %d.\n", i, j); + continue; + } + pos = searchIter->next(); + if (pos == -1) + fprintf(stdout, "Monkey match failed at index %d in monkey pattern of length %d.\n", i, j); + } + } + status = U_ZERO_ERROR; + searchIter->setTarget(monkeyTarget2); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the monkey target2.\n"); + goto cleanup; + } + for (i = 0; i < monkeyTarget.length() - 1; i++) + { + // will never find the match + UnicodeString temp; + status = U_ZERO_ERROR; + monkeyTarget.extract(i, monkeyTarget.length(), temp); + searchIter->reset(); + searchIter->setPattern(temp, status); + if (U_FAILURE(status)) + { + fprintf(stderr, "Failed to set a pattern for the monkey pattern at offset index %d.\n", i); + continue; + } + pos = searchIter->next(); + if (pos != -1) + fprintf(stdout, "Monkey mismatch failed at index %d in monkey pattern.\n", i); + } + +cleanup: + delete searchIter; + return 0; +} \ No newline at end of file diff --git a/icu4c/source/samples/search/search.dsp b/icu4c/source/samples/search/search.dsp new file mode 100644 index 00000000000..81b995bfb38 --- /dev/null +++ b/icu4c/source/samples/search/search.dsp @@ -0,0 +1,118 @@ +# Microsoft Developer Studio Project File - Name="search" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=search - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "search.mak". 
+!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f "search.mak" CFG="search - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "search - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "search - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "search - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD CPP /nologo /MT /W3 /GX /O2 /I "..\..\..\include" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 ..\..\..\lib\icuuc.lib ..\..\..\lib\icuin.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo 
/subsystem:console /machine:I386 /libpath:"..\..\..\lib" + +!ELSEIF "$(CFG)" == "search - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /I "..\..\..\include" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD LINK32 ..\..\..\lib\icuucd.lib ..\..\..\lib\icuind.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\..\..\lib" + +!ENDIF + +# Begin Target + +# Name "search - Win32 Release" +# Name "search - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" +# Begin Source File + +SOURCE=.\search.cpp +# End Source File +# Begin Source File + +SOURCE=.\srchiter.cpp +# End Source File +# Begin Source File + +SOURCE=.\strsrch.cpp +# End Source File +# End Group +# Begin 
Group "Header Files" + +# PROP Default_Filter "h;hpp;hxx;hm;inl" +# Begin Source File + +SOURCE=.\srchiter.h +# End Source File +# Begin Source File + +SOURCE=.\strsrch.h +# End Source File +# End Group +# Begin Group "Resource Files" + +# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe" +# End Group +# End Target +# End Project diff --git a/icu4c/source/samples/search/search.dsw b/icu4c/source/samples/search/search.dsw new file mode 100644 index 00000000000..bb71dfb9ffe --- /dev/null +++ b/icu4c/source/samples/search/search.dsw @@ -0,0 +1,29 @@ +Microsoft Developer Studio Workspace File, Format Version 6.00 +# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE! + +############################################################################### + +Project: "search"=.\search.dsp - Package Owner=<4> + +Package=<5> +{{{ +}}} + +Package=<4> +{{{ +}}} + +############################################################################### + +Global: + +Package=<5> +{{{ +}}} + +Package=<3> +{{{ +}}} + +############################################################################### + diff --git a/icu4c/source/samples/search/srchiter.cpp b/icu4c/source/samples/search/srchiter.cpp new file mode 100644 index 00000000000..e8b87254773 --- /dev/null +++ b/icu4c/source/samples/search/srchiter.cpp @@ -0,0 +1,279 @@ +/* +********************************************************************** +* Copyright (C) 1999-2000 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. 
+**********************************************************************
+*/
+
+#include "unicode/brkiter.h"
+#include "unicode/schriter.h"
+#include "srchiter.h"
+
+int32_t const SearchIterator::DONE = -1;
+int32_t const SearchIterator::BEFORE = -2;
+
+/**
+ * Default constructor for subclasses: no target text yet, searching forward,
+ * overlapping matches allowed. A character-boundary BreakIterator for the
+ * default locale is requested to constrain matches.
+ */
+SearchIterator::SearchIterator(void) :
+    // Initializers follow the member declaration order of the class.
+    index(0),
+    length(0),
+    overlap(TRUE),
+    target(NULL),
+    breaker(NULL),
+    backward(FALSE) /* going forward */
+{
+    UErrorCode status = U_ZERO_ERROR;
+    this->breaker = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
+    // Nothing else to set up; a failed status is not reported to the caller.
+    if (U_FAILURE(status)) return;
+}
+
+/**
+ * Constructor for use by subclasses.
+ *
+ * @param target  Text to search. It is stored in this object and, when a
+ *                breaker is supplied, handed to it through adoptText().
+ * @param breaker Restricts where a match may start and end. May be NULL, in
+ *                which case isBreakUnit() accepts every match.
+ */
+SearchIterator::SearchIterator(CharacterIterator* target,
+                               BreakIterator* breaker) :
+    index(0),
+    length(0),
+    overlap(TRUE),
+    target(target),
+    breaker(breaker),
+    backward(FALSE) /* going forward */
+{
+    // A NULL breaker is a documented, legal argument: do not dereference it.
+    if (this->breaker != NULL && this->target != NULL) {
+        this->breaker->adoptText(this->target);
+    }
+    if (this->target != NULL) {
+        index = this->target->startIndex();
+    }
+}
+
+/**
+ * Copy constructor. Clones the other iterator's target text and breaker and
+ * starts at the beginning of the text; match length, direction and overlap
+ * setting are copied.
+ */
+SearchIterator::SearchIterator(const SearchIterator& other) :
+    index(0),
+    length(other.length),
+    overlap(other.overlap),
+    target(NULL),
+    breaker(NULL),
+    backward(other.backward)
+{
+    if (other.target != NULL) {
+        index = other.target->startIndex();
+        this->target = other.target->clone();
+    }
+
+    // other.breaker is a pointer: clone the object it points to. Casting the
+    // pointer itself to BreakIterator& would treat the pointer's own storage
+    // as a BreakIterator object.
+    if (other.breaker != NULL) {
+        this->breaker = other.breaker->clone();
+        if (this->breaker != NULL && this->target != NULL) {
+            this->breaker->adoptText(this->target);
+        }
+    }
+}
+
+/** Destructor: releases the breaker and the target text. */
+SearchIterator::~SearchIterator()
+{
+    if (breaker != NULL) {
+        // deletion of breaker will delete target
+        delete breaker;
+        breaker = NULL;
+    } else {
+        // Without a breaker nothing else owns the target text; release it here.
+        delete target;
+    }
+    target = NULL;
+}
+
+/**
+ * Equality: same breaker, same target text, same direction, position, match
+ * length and overlap setting. A missing breaker or target only equals
+ * another missing one.
+ */
+bool_t SearchIterator::operator == (const SearchIterator& that) const
+{
+    if (this == &that) return TRUE;
+    if ((breaker == NULL) != (that.breaker == NULL)) return FALSE;
+    if ((target == NULL) != (that.target == NULL)) return FALSE;
+    if (breaker != NULL && *that.breaker != *breaker) return FALSE;
+    if (target != NULL && *that.target != *target) return FALSE;
+    return that.backward == backward
+        && that.index == index
+        && that.length == length
+        && that.overlap == overlap;
+}
+
+/** Position before the start of the text, then return the next match. */
+int32_t SearchIterator::first(void)
+{
+    setIndex(SearchIterator::BEFORE);
+    return next();
+}
+
+int32_t
SearchIterator::following(int32_t pos)
+{
+    // Move the current position to pos, then search forward.
+    setIndex(pos);
+    return next();
+}
+
+/** Position at the DONE marker, then search backwards for a match. */
+int32_t SearchIterator::last(void)
+{
+    setIndex(SearchIterator::DONE);
+    return previous();
+}
+
+/** Move the current position to pos, then search backwards. */
+int32_t SearchIterator::preceding(int32_t pos)
+{
+    setIndex(pos);
+    return previous();
+}
+
+/**
+ * Search forward from the current position.
+ *
+ * @return Start index of the next match accepted by isBreakUnit(), or DONE
+ *         when handleNext() finds nothing or reports a failure.
+ */
+int32_t SearchIterator::next(void)
+{
+    // An exhausted iterator stays exhausted.
+    if (index == SearchIterator::DONE) {
+        return SearchIterator::DONE;
+    }
+
+    int32_t searchFrom = index;
+    if (index == SearchIterator::BEFORE) {
+        // Starting at the beginning of the text
+        searchFrom = target->startIndex();
+    } else if (length > 0) {
+        // Finding the next match after a previous one
+        searchFrom = index + (overlap ? 1 : length);
+    }
+
+    // The loop always asks handleNext() for index + 1, so park one before.
+    index = searchFrom - 1;
+    backward = FALSE;
+
+    for (;;) {
+        UErrorCode status = U_ZERO_ERROR;
+        length = 0;
+        index = handleNext(index + 1, status);
+        if (U_FAILURE(status)) {
+            return SearchIterator::DONE;
+        }
+        // Stop when nothing was found or the match lies on break boundaries.
+        if (index == SearchIterator::DONE || isBreakUnit(index, index + length)) {
+            break;
+        }
+    }
+
+    return index;
+}
+
+/**
+ * Search backwards from the current position.
+ *
+ * @return Start index of the previous match accepted by isBreakUnit(), or
+ *         DONE when handlePrev() finds nothing or reports a failure.
+ */
+int32_t SearchIterator::previous(void)
+{
+    // Already in front of the text: nothing earlier to find.
+    if (index == SearchIterator::BEFORE) {
+        return SearchIterator::DONE;
+    }
+
+    int32_t searchFrom = index;
+    if (index == SearchIterator::DONE) {
+        searchFrom = target->endIndex();
+    } else if (length > 0 && overlap) {
+        // Finding the previous match before a following one
+        searchFrom = index + length - 1;
+    }
+
+    // The loop always asks handlePrev() for index - 1, so park one after.
+    index = searchFrom + 1;
+    backward = TRUE;
+
+    for (;;) {
+        UErrorCode status = U_ZERO_ERROR;
+        length = 0;
+        index = handlePrev(index - 1, status);
+        if (U_FAILURE(status)) {
+            return SearchIterator::DONE;
+        }
+        if (index == SearchIterator::DONE || isBreakUnit(index, index + length)) {
+            break;
+        }
+    }
+
+    // Running off the front is remembered as BEFORE; getIndex() reports it as DONE.
+    if (index == SearchIterator::DONE) {
+        index = SearchIterator::BEFORE;
+    }
+    return getIndex();
+}
+
+
+/** Current position in the target text; the internal BEFORE marker is reported as DONE. */
+int32_t SearchIterator::getIndex() const
+{
+    return index == SearchIterator::BEFORE ?
SearchIterator::DONE : index;
+}
+
+/** Choose whether a match may begin inside the previously returned match. */
+void SearchIterator::setOverlapping(bool_t allowOverlap)
+{
+    overlap = allowOverlap;
+}
+
+/** @return TRUE when overlapping matches are returned. */
+bool_t SearchIterator::isOverlapping(void) const
+{
+    return overlap;
+}
+
+/** @return Length of the current match, or 0 when there is none. */
+int32_t SearchIterator::getMatchLength(void) const
+{
+    return length;
+}
+
+/**
+ * Reset the iteration: clears the match length, re-enables overlapping
+ * matches and rewinds to the start of the text (or, after a backward
+ * search, to the end).
+ */
+void SearchIterator::reset(void)
+{
+    length = 0;
+    if (backward == FALSE) {
+        index = 0;
+        if (target != NULL) target->setToStart();
+        if (breaker != NULL) breaker->first();
+    } else {
+        index = SearchIterator::DONE;
+        if (target != NULL) target->setToEnd();
+        if (breaker != NULL) breaker->last();
+    }
+    overlap = TRUE;
+}
+
+/**
+ * Replace the break iterator with a clone of the given one.
+ *
+ * @param iterator Break iterator to clone; NULL turns break detection off.
+ */
+void SearchIterator::setBreakIterator(const BreakIterator* iterator)
+{
+    // Copy the text first: destroying the old breaker destroys the text it adopted.
+    CharacterIterator *buffer = (target != NULL) ? target->clone() : NULL;
+    if (breaker != NULL) {
+        delete breaker;
+    } else {
+        // No breaker owned the text, so release it directly.
+        delete target;
+    }
+    // Keep `target` pointing at live text instead of the destroyed original.
+    target = buffer;
+
+    breaker = (iterator != NULL) ? iterator->clone() : NULL;
+    if (breaker != NULL && target != NULL) {
+        breaker->adoptText(target);
+    }
+}
+
+/**
+ * @return The break iterator in use. Must not be called while no break
+ *         iterator is set: the pointer is dereferenced unconditionally.
+ */
+const BreakIterator& SearchIterator::getBreakIterator(void) const
+{
+    return *breaker;
+}
+
+/**
+ * Set the text to be searched.
+ *
+ * An existing StringCharacterIterator target is reused in place; otherwise
+ * a new one is created and handed to the breaker.
+ */
+void SearchIterator::setTarget(const UnicodeString& newText)
+{
+    if (target != NULL && target->getDynamicClassID()
+            == StringCharacterIterator::getStaticClassID()) {
+        ((StringCharacterIterator*)target)->setText(newText);
+        return;
+    }
+
+    CharacterIterator *previous = target;
+    target = new StringCharacterIterator(newText);
+    target->first();
+    if (breaker != NULL) {
+        // The breaker owns the previous text. Deleting `previous` here and
+        // then calling adoptText() would release it twice.
+        // NOTE(review): relies on BreakIterator::adoptText() disposing of the
+        // text it held before - confirm against the ICU version in use.
+        breaker->adoptText(target);
+    } else {
+        delete previous;
+    }
+}
+
+/**
+ * Take ownership of a new target iterator and position before its start.
+ */
+void SearchIterator::adoptTarget(CharacterIterator* iterator) {
+    CharacterIterator *previous = target;
+    target = iterator;
+    if (breaker != NULL) {
+        breaker->adoptText(target);
+    } else if (previous != iterator) {
+        // No breaker to dispose of the old text for us.
+        delete previous;
+    }
+    setIndex(SearchIterator::BEFORE);
+}
+
+/** @return The text being searched (an empty placeholder if none was set). */
+const CharacterIterator& SearchIterator::getTarget(void) const
+{
+    SearchIterator* nonConstThis = (SearchIterator*)this;
+
+    // The iterator is initialized pointing to no text at all, so if this
+    // function is called while we're in that state, we have to fudge
+    // an iterator to return.
+ if (nonConstThis->target == NULL) + nonConstThis->target = new StringCharacterIterator(""); + return *nonConstThis->target; +} + +void SearchIterator::getMatchedText(UnicodeString& result) +{ + result.remove(); + if (length > 0) { + int i = 0; + for (UChar c = target->setIndex(index); i < length; c = target->next(), i++) + { + result += c; + } + } +} + + +void SearchIterator::setMatchLength(int32_t length) +{ + this->length = length; +} + +void SearchIterator::setIndex(int32_t pos) { + index = pos; + length = 0; +} + +bool_t SearchIterator::isBreakUnit(int32_t start, + int32_t end) +{ + if (breaker == NULL) { + return TRUE; + } + bool_t startBound = breaker->isBoundary(start); + bool_t endBound = (end == target->endIndex()) || breaker->isBoundary(end); + + return startBound && endBound; +} + + diff --git a/icu4c/source/samples/search/srchiter.h b/icu4c/source/samples/search/srchiter.h new file mode 100644 index 00000000000..c631df70357 --- /dev/null +++ b/icu4c/source/samples/search/srchiter.h @@ -0,0 +1,388 @@ +/* +********************************************************************** +* Copyright (C) 1999-2000 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ +#ifndef SRCHITER_H +#define SRCHITER_H + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/brkiter.h" + +/** + * SearchIterator is an abstract base class that provides methods + * to search for a pattern within a text string. Instances of + * SearchIterator maintain a current position and scan over + * the target text, returning the indices the pattern is matched + * and the length of each match. + *

+ * SearchIterator is an abstract base class that defines a + * protocol for text searching. Subclasses provide concrete implementations of + * various search algorithms. For example, {@link StringSearch} + * implements language-sensitive pattern matching based on the comparison rules + * defined in a {@link RuleBasedCollator} object. + *

+ * Internally, SearchIterator scans text using a + * {@link CharacterIterator}, and is thus able to scan text held + * by any object implementing that protocol. A StringCharacterIterator + * is used to scan UnicodeString objects passed to setTarget. + *

+ * SearchIterator provides an API that is similar to that of + * other text iteration classes such as BreakIterator. Using this + * class, it is easy to scan through text looking for all occurrences of a + * given pattern. The following example uses a StringSearch object to + * find all instances of "fox" in the target string. Any other subclass of + * SearchIterator can be used in an identical manner. + *


+ * UnicodeString target("The quick brown fox jumped over the lazy fox");
+ * UnicodeString pattern("fox");
+ *
+ * SearchIterator *iter = new StringSearch(pattern, target);
+ *
+ * for (int pos = iter->first(); pos != SearchIterator::DONE; pos = iter->next()) {
+ *     printf("Found match at %d pos, length is %d\n", pos, iter->getMatchLength());
+ * }
+ * 
+ * + * @see StringSearch + */ + +class SearchIterator { +public: + /** + * DONE is returned by previous() and next() after all valid + * matches have been returned, and by first() and last() if + * there are no matches at all. + */ + static const int32_t DONE; + + //======================================================================= + // boilerplate + //======================================================================= + + /** + * Destructor + */ + virtual ~SearchIterator(); + + /** copy constructor */ + SearchIterator(const SearchIterator& other); + + /** + * Equality operator. Returns TRUE if both BreakIterators are of the + * same class, have the same behavior, and iterate over the same text. + */ + virtual bool_t operator==(const SearchIterator& that) const; + + /** + * Not-equal operator. If operator== returns TRUE, this returns FALSE, + * and vice versa. + */ + bool_t operator!=(const SearchIterator& that) const; + + /** + * Returns a newly-constructed RuleBasedBreakIterator with the same + * behavior, and iterating over the same text, as this one. + */ + virtual SearchIterator* clone(void) const = 0; + + /** + * Return a polymorphic class ID for this object. Different subclasses + * will return distinct unequal values. + * @stable + */ + virtual UClassID getDynamicClassID(void) const = 0; + + /** + * Return the first index at which the target text matches the search + * pattern. The iterator is adjusted so that its current index + * (as returned by {@link #getIndex}) is the match posisition if one was found + * and DONE if one was not. + * + * @return The character index of the first match, or DONE if there + * are no matches. + */ + int32_t first(void); + + /** + * Return the first index greater than pos at which the target + * text matches the search pattern. The iterator is adjusted so that its current index + * (as returned by {@link #getIndex}) is the match posisition if one was found + * and DONE if one was not. 
+ * + * @return The character index of the first match following pos, + * or DONE if there are no matches. + */ + int32_t following(int32_t pos); + + /** + * Return the last index in the target text at which it matches + * the search pattern and adjusts the iteration to point to that position. + * + * @return The index of the first match, or DONE if there + * are no matches. + */ + int32_t last(void); + + /** + * Return the first index less than pos at which the target + * text matches the search pattern. The iterator is adjusted so that its current index + * (as returned by {@link #getIndex}) is the match posisition if one was found + * and DONE if one was not. + * + * @return The character index of the first match preceding pos, + * or DONE if there are no matches. + */ + int32_t preceding(int32_t pos); + + /** + * Return the index of the next point at which the text matches the + * search pattern, starting from the current position + *

+ * @return The index of the next match after the current position, + * or DONE if there are no more matches. + * + * @see #first + */ + int32_t next(void); + + /** + * Return the index of the previous point at which the text matches + * the search pattern, starting at the current position + * + * @return The index of the previous match before the current position, + * or DONE if there are no more matches. + */ + int32_t previous(void); + + /** + * Return the current index in the text being searched. + * If the iteration has gone past the end of the text + * (or past the beginning for a backwards search), + * {@link #DONE} is returned. + */ + int32_t getIndex(void) const; + /** + * Determines whether overlapping matches are returned. If this + * property is true, matches that begin within the + * boundry of the previous match are considered valid and will + * be returned. For example, when searching for "abab" in the + * target text "ababab", both offsets 0 and 2 will be returned + * as valid matches if this property is true. + *

+ * The default setting of this property is true + */ + void setOverlapping(bool_t allowOverlap); + + /** + * Determines whether overlapping matches are returned. + * + * @see #setOverlapping + */ + bool_t isOverlapping(void) const; + + /** + * Returns the length of text in the target which matches the search + * pattern. This call returns a valid result only after a successful + * call to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * DONE, this method will return 0. + * + * @return The length of the match in the target text, or 0 if there + * is no match currently. + */ + int32_t getMatchLength(void) const; + + /** + * Set the BreakIterator that will be used to restrict the points + * at which matches are detected. + * + * @param breaker A {@link java.text.BreakIterator BreakIterator} + * that will be used to restrict the points + * at which matches are detected. If a match is found, but the match's start + * or end index is not a boundary as determined by + * the BreakIterator, the match will be rejected and + * another will be searched for. + * + * If this parameter is null, no break + * detection is attempted. + * + * @see #getBreakIterator + */ + /* HSYS : Check, aliasing or owning */ + void setBreakIterator(const BreakIterator* iterator); + + /** + * Returns the BreakIterator that is used to restrict the points + * at which matches are detected. This will be the same object + * that was passed to the constructor or to setBreakIterator. + * Note that null is a legal value; it means that break + * detection should not be attempted. + * + * @see #setBreakIterator + */ + const BreakIterator& getBreakIterator(void) const; + + /** + * Set the target text which should be searched and resets the + * iterator's position to point before the start of the target text. 
+ * This method is useful if you want to re-use an iterator to + * search for the same pattern within a different body of text. + * + * @see #getTarget + */ + virtual void setTarget(const UnicodeString& newText); + + /** + * Set the target text which should be searched and resets the + * iterator's position to point before the start of the target text. + * This method is useful if you want to re-use an iterator to + * search for the same pattern within a different body of text. + * + * @see #getTarget + */ + virtual void adoptTarget(CharacterIterator* iterator); + /** + * Return the target text which is being searched + * + * @see #setTarget + */ + const CharacterIterator& getTarget(void) const; + + /** Reset the iteration. + */ + virtual void reset(void); + + /** + * Returns the text that was matched by the most recent call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * If the iterator is not pointing at a valid match (e.g. just after + * construction or after DONE has been returned, returns + * an empty string. + */ + void getMatchedText(UnicodeString& result); + + //------------------------------------------------------------------- + // Protected interface for subclasses + //------------------------------------------------------------------- + +protected: + SearchIterator(); + + /** + * Constructor for use by subclasses + *

+ * @param target The target text to be searched. This is for internal + * use by this class. Subclasses need to maintain their + * own reference to or iterator over the target text + * for use by their {@link #handleNext handleNext} and + * {@link #handlePrev handlePrev} methods. The target will + * be adopted and owned by the SearchIterator object. + * + * @param breaker A {@link BreakIterator} that is used to restrict the points + * at which matches are detected. If handleNext or + * handlePrev finds a match, but the match's start + * or end index is not a boundary as determined by + * the BreakIterator, the match is rejected and + * handleNext or handlePrev is called again. + * If this parameter is null, no break + * detection is attempted. + * + */ + SearchIterator(CharacterIterator* target, + BreakIterator* breaker); +/** + * Abstract method which subclasses override to provide the mechanism + * for finding the next match in the target text. This allows different + * subclasses to provide different search algorithms. + *

+ * If a match is found, the implementation should return the index at + * which the match starts and should call {@link #setMatchLength setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method + * should return DONE and should not call setMatchLength. + *

+ * @param startAt The index in the target text at which the search + * should start. + * + * @see #setMatchLength + */ + virtual int32_t handleNext(int32_t startAt, UErrorCode& status) = 0; + + /** + * Abstract method which subclasses override to provide the mechanism + * for finding the previous match in the target text. This allows different + * subclasses to provide different search algorithms. + *

+ * If a match is found, the implementation should return the index at + * which the match starts and should call {@link #setMatchLength setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method + * should return DONE and should not call setMatchLength. + *

+ * @param startAt The index in the target text at which the search + * should start. + * + * @see #setMatchLength + */ + virtual int32_t handlePrev(int32_t startAt, UErrorCode& status) = 0; + + /** + * Sets the length of the currently matched string in the target text. + * Subclasses' handleNext and handlePrev + * methods should call this when they find a match in the target text. + */ + void setMatchLength(int32_t length); + + //------------------------------------------------------------------- + // Privates + // +private: + /** + * Class ID + */ + static char fgClassID; +private: + /** + * Private value indicating that the iterator is pointing + * before the beginning of the target text. + */ + static const int32_t BEFORE; + + /** + * Internal method used by preceding and following. Sets the index + * to point to the given position, and clears any state that's + * affected. + */ + void setIndex(int32_t pos); + + /** + * Determine whether the target text bounded by start and + * end is one or more whole units of text as determined by + * the current BreakIterator. + */ + bool_t isBreakUnit(int32_t start, int32_t end); + + //------------------------------------------------------------------------- + // Private data... + //------------------------------------------------------------------------- + int32_t index; // Current position in the target text + int32_t length; // Length of matched text, or 0 + bool_t overlap; // Return overlapping matches? 
+ CharacterIterator* target; // Target text to be searched + BreakIterator* breaker; // Break iterator to constrain matches + bool_t backward; +}; + +inline bool_t SearchIterator::operator!=(const SearchIterator& that) const +{ + return !operator==(that); +} + +#endif + diff --git a/icu4c/source/samples/search/strsrch.cpp b/icu4c/source/samples/search/strsrch.cpp new file mode 100644 index 00000000000..1c3f0e202c4 --- /dev/null +++ b/icu4c/source/samples/search/strsrch.cpp @@ -0,0 +1,758 @@ +/* +********************************************************************** +* Copyright (C) 1999-2000 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#include +#include "unicode/coleitr.h" +#include "unicode/schriter.h" +#include "strsrch.h" +/** + * StringSearch is a SearchIterator that provides + * language-sensitive text searching based on the comparison rules defined + * in a {@link RuleBasedCollator} object. + * Instances of StringSearch function as iterators + * maintain a current position and scan over text returning the index of + * characters where the pattern occurs and the length of each match. + *

+ * StringSearch uses a version of the fast Boyer-Moore search + * algorithm that has been adapted to work with the large character set of + * Unicode. See "Efficient Text Searching in Java", to be published in + * Java Report in February, 1999, for further information on the algorithm. + *

+ * Consult the SearchIterator documentation for information on + * and examples of how to use instances of this class to implement text + * searching. SearchIterator provides all of the necessary + * API; this class only provides constructors and internal implementation + * methods. + * + * @see SearchIterator + * @see RuleBasedCollator + * + * @author Laura Werner + * @version 1.0 + */ + +char StringSearch::fgClassID = 0; // Value is irrelevant // class id +/* to be removed */ +void StringSearch::dumpTables() { + int i; + for (i = 0; i < 256; i++) { + if (shiftTable[i] != minLen) { +// debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]); + } + } + for (i = 0; i < 256; i++) { + if (backShiftTable[i] != minLen) { +// debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]); + } + } +} + +StringSearch::StringSearch(const UnicodeString& pat, + CharacterIterator* target, + RuleBasedCollator* coll, + BreakIterator* breaker, + UErrorCode& status) : + SearchIterator(target, breaker), + strength(coll->getStrength()), + pattern(pat), + valueList(NULL), + valueListLen(0), + normLen(0), // num. of collation elements in pattern. + minLen(0), // Min of composed, decomposed versions + maxLen(0), // Max + it(NULL) + +{ + if (U_FAILURE(status)) return; + collator = (RuleBasedCollator*)(coll->clone()); + iter = collator->createCollationElementIterator(*target); + it = collator->createCollationElementIterator(pat); + + initialize(status); // Initialize the Boyer-Moore tables +} + +/** + * Construct a StringSearch object using a specific collator. + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param collator A RuleBasedCollator object which defines the + * language-sensitive comparison rules used to determine + * whether text in the pattern and target matches. + */ +StringSearch::StringSearch(const UnicodeString& pat, + CharacterIterator* target, + RuleBasedCollator* collator, + UErrorCode& status) : + SearchIterator(), + strength(collator->getStrength()), + pattern(pat), + valueList(NULL), + valueListLen(0), + normLen(0), // num. of collation elements in pattern. + minLen(0), // Min of composed, decomposed versions + maxLen(0), // Max + it(NULL) +{ + if (U_FAILURE(status)) return; + this->adoptTarget(target); + this->collator = (RuleBasedCollator*)(collator->clone()); + this->iter = collator->createCollationElementIterator(*target); + this->it = collator->createCollationElementIterator(pat); + initialize(status); +} + +/** + * Construct a StringSearch object using the collator and + * character boundary detection rules for a given locale + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param loc The locale whose collation and break-detection rules + * should be used. + * + * @exception ClassCastException thrown if the collator for the specified + * locale is not a RuleBasedCollator. + */ +StringSearch::StringSearch(const StringSearch& that) : + SearchIterator(that), + valueList(NULL), + valueListLen(that.valueListLen), + normLen(that.normLen), // num. of collation elements in pattern. + minLen(that.minLen), // Min of composed, decomposed versions + maxLen(that.maxLen), + collator(that.collator), + strength(that.strength), + iter(NULL), + it(NULL) +{ + valueList = new int32_t[valueListLen]; + memcpy(valueList, that.valueList, valueListLen*sizeof(int32_t)); + iter = that.collator->createCollationElementIterator(that.getTarget()); + it = that.collator->createCollationElementIterator(that.pattern); +} + +StringSearch::StringSearch(const UnicodeString& pat, + CharacterIterator* target, + const Locale& loc, + UErrorCode& status) : + SearchIterator(), + pattern(pat), + valueList(NULL), + valueListLen(0), + normLen(0), // num. of collation elements in pattern. 
+ minLen(0), // Min of composed, decomposed versions + maxLen(0) // Max +{ + if (U_FAILURE(status)) return; + this->adoptTarget(target); + collator = (RuleBasedCollator*)Collator::createInstance(loc, status); + iter = collator->createCollationElementIterator(*target); + it = collator->createCollationElementIterator(pat); + + strength = collator->getStrength(); + + initialize(status); +} + +bool_t +StringSearch::operator==(const SearchIterator& that) const +{ + if (that.getDynamicClassID() != getDynamicClassID()) + return FALSE; + if (!SearchIterator::operator==(that)) + return FALSE; + const StringSearch& that2 = (const StringSearch&)that; + if (*that2.iter != *iter) return FALSE; + else if (*that2.collator != *collator) return FALSE; + else if (that2.strength != strength) return FALSE; + else if (that2.valueListLen != valueListLen) return FALSE; + else if (memcmp(that2.valueList, valueList, valueListLen*sizeof(int32_t)) != 0) return FALSE; + else if (that2.pattern != pattern) return FALSE; + else if (that2.normLen != normLen) return FALSE; + else if (that2.minLen != minLen) return FALSE; + else if (that2.maxLen != maxLen) return FALSE; + else return TRUE; +} + +SearchIterator* +StringSearch::clone(void) const +{ + return new StringSearch(*this); +} + +/** + * Construct a StringSearch object using the collator for the default + * locale + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param collator A RuleBasedCollator object which defines the + * language-sensitive comparison rules used to determine + * whether text in the pattern and target matches. + */ +StringSearch::StringSearch(const UnicodeString& pat, + const UnicodeString& newText, + UErrorCode& status) : + SearchIterator(), + pattern(pat), + valueList(NULL), + valueListLen(0), + normLen(0), // num. of collation elements in pattern. + minLen(0), // Min of composed, decomposed versions + maxLen(0) // Max +{ + StringCharacterIterator *s = new StringCharacterIterator(newText); + collator = (RuleBasedCollator*)Collator::createInstance(Locale::getDefault(), status); + strength = collator->getStrength(); + iter = collator->createCollationElementIterator(newText); + it = collator->createCollationElementIterator(pat); + this->adoptTarget(s); + initialize(status); +} + +StringSearch::~StringSearch(void) +{ + if (valueList != NULL) { + delete [] valueList; + valueList = 0; + } + if (iter != NULL) { + delete iter; + iter = 0; + } + if (collator != NULL) { + delete collator; + collator = 0; + } + if (it != NULL) { + delete it; + it = 0; + } +} +//------------------------------------------------------------------- +// Getters and Setters +//------------------------------------------------------------------- + +/** + * Sets this object's strength property. The strength determines the + * minimum level of difference considered significant during a + * search. Generally, {@link Collator#TERTIARY} and + * {@link Collator#IDENTICAL} indicate that all differences are + * considered significant, {@link Collator#SECONDARY} indicates + * that upper/lower case distinctions should be ignored, and + * {@link Collator#PRIMARY} indicates that both case and accents + * should be ignored. 
However, the exact meanings of these constants + * are determined by individual Collator objects. + *

+ * @see Collator#PRIMARY + * @see Collator#SECONDARY + * @see Collator#TERTIARY + * @see Collator#IDENTICAL + */ +void StringSearch::setStrength(Collator::ECollationStrength newStrength, UErrorCode& status) { + if (U_FAILURE(status)) + { + return; + } + strength = newStrength; + + // Due to a bug (?) in CollationElementIterator, we must set the + // collator's strength as well, since the iterator is going to + // mask out the portions of the collation element that are not + // relevant for the collator's current strength setting + // Note that this makes it impossible to share a Collator among + // multiple StringSearch objects if you adjust Strength settings. + collator->setStrength(strength); + initialize(status); +} + + +/** + * Returns this object's strength property, which indicates what level + * of differences are considered significant during a search. + *

+ * @see #setStrength + */ +Collator::ECollationStrength StringSearch::getStrength() const +{ + return strength; +} + +/** + * Set the collator to be used for this string search. Also changes + * the search strength to match that of the new collator. + *

+ * This method causes internal data such as Boyer-Moore shift tables + * to be recalculated, but the iterator's position is unchanged. + *

+ * @see #getCollator + */ +void StringSearch::setCollator(const RuleBasedCollator *coll, UErrorCode& status) +{ + delete iter; + delete collator; + collator = (RuleBasedCollator*)coll->clone(); + strength = collator->getStrength(); + // Also need to recompute the pattern and get a new target iterator + iter = collator->createCollationElementIterator(getTarget()); + initialize(status); +} + +/** + * Return the RuleBasedCollator being used for this string search. + */ +const RuleBasedCollator& StringSearch::getCollator(void) const +{ + return *collator; +} + +/** + * Set the pattern for which to search. + * This method causes internal data such as Boyer-Moore shift tables + * to be recalculated, but the iterator's position is unchanged. + */ +void StringSearch::setPattern(const UnicodeString& pat, UErrorCode& status) +{ + pattern = pat; + initialize(status); +} + +/** + * Returns the pattern for which this object is searching. + */ +const UnicodeString& StringSearch::getPattern() const +{ + return pattern; +} + +/** + * Set the target text which should be searched and resets the + * iterator's position to point before the start of the new text. + * This method is useful if you want to re-use an iterator to + * search for the same pattern within a different body of text. 
+ */ +void StringSearch::adoptTarget(CharacterIterator* target) +{ + UErrorCode status = U_ZERO_ERROR; + SearchIterator::adoptTarget(target); + + // fix me: Skipped the error code + // Since we're caching a CollationElementIterator, recreate it + iter->setText(*target, status); +} +void StringSearch::setTarget(const UnicodeString& newText) +{ + UErrorCode status = U_ZERO_ERROR; + SearchIterator::setTarget(newText); + // Since we're caching a CollationElementIterator, recreate it + iter->setText(newText, status); +} + +void StringSearch::reset(void) +{ + SearchIterator::reset(); + iter->reset(); +}//------------------------------------------------------------------- +// Privates +//------------------------------------------------------------------- + +/** + * Search forward for matching text, starting at a given location. + * Clients should not call this method directly; instead they should call + * {@link SearchIterator#next}. + *

+ * If a match is found, this method returns the index at which the match + * starts and calls {@link SearchIterator#setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method returns + * DONE and does not call setMatchLength. + *

+ * @param start The index in the target text at which the search starts. + * + * @return The index at which the matched text in the target starts, or DONE + * if no match was found. + *

+ * @see SearchIterator#next + * @see SearchIterator#DONE + */ +int32_t StringSearch::handleNext(int32_t start, UErrorCode& status) +{ + if (U_FAILURE(status)) + { + return SearchIterator::DONE; + } + const CharacterIterator& target = getTarget(); + + int mask = getMask(strength); + int done = CollationElementIterator::NULLORDER & mask; +#if 0 + if (DEBUG) { + debug("-------------------------handleNext-----------------------------------"); + debug(""); + debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) + + ", done=" + Integer.toString(done,16)); + debug("decomp=" + collator.getDecomposition()); + + debug("target.begin=" + getTarget().getBeginIndex()); + debug("target.end=" + getTarget().getEndIndex()); + debug("start = " + start); + } +#endif + int32_t index = start + minLen; + int32_t matchEnd = 0; + + while (index <= target.endIndex()) + { + int32_t patIndex = normLen; + int32_t tval = 0, pval = 0; + bool_t getP = TRUE; + + iter->setOffset(index, status); + matchEnd = index; + + //if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); + + while ((patIndex > 0 || getP == false) && iter->getOffset() > start) + { +#if 0 + if (DEBUG) { + debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); + debug(" getP=" + getP); + } +#endif + + // Get the previous character in both the pattern and the target + tval = iter->previous(status) & mask; + if (U_FAILURE(status)) + { + return SearchIterator::DONE; + } + + if (getP) pval = valueList[--patIndex]; + getP = TRUE; + + // (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); + + if (tval == 0) { // skip tval, use same pval + // (DEBUG) debug(" tval is ignorable"); + getP = FALSE; + } + else if (pval != tval) { // Mismatch, skip ahead + // (DEBUG) debug(" mismatch: skippping " + getShift(tval, patIndex)); + + index += getShift(tval, patIndex); + break; + } + else if (patIndex == 0) { + // The values matched, and we're at the 
beginning of the pattern, + // which means we matched the whole thing. + start = iter->getOffset(); + setMatchLength(matchEnd - start); + // if (DEBUG) debug("Found match at index "+ start ); + return start; + } + } +#if 0 + if (DEBUG) debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); + if (DEBUG) debug(" getP=" + getP); +#endif + if (iter->getOffset() <= start) { + // We hit the beginning of the text being searched, which is + // possible if it contains lots of ignorable characters. + // Advance one character and try again. + // if (DEBUG) debug("hit beginning of target; advance by one"); + index++; + } + } + // if (DEBUG) debug("Fell off end of outer loop; returning DONE"); + return SearchIterator::DONE; +} + +/** + * Search backward for matching text ,starting at a given location. + * Clients should not call this method directly; instead they should call + * SearchIterator.previous(), which this method overrides. + *

+ * If a match is found, this method returns the index at which the match + * starts and calls {@link SearchIterator#setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method returns + * DONE and does not call setMatchLength. + *

+ * @param start The index in the target text at which the search starts. + * + * @return The index at which the matched text in the target starts, or DONE + * if no match was found. + *

+ * @see SearchIterator#previous + * @see SearchIterator#DONE + */ +int32_t StringSearch::handlePrev(int32_t start, UErrorCode& status) +{ + if (U_FAILURE(status)) + { + return SearchIterator::DONE; + } + int patLen = normLen; + int index = start - minLen; + + int mask = getMask(strength); + int done = CollationElementIterator.NULLORDER & mask; +#if 0 + if (DEBUG) { + debug("-------------------------handlePrev-----------------------------------"); + debug(""); + debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) + + ", done=" + Integer.toString(done,16)); + debug("decomp=" + collator.getDecomposition()); + + debug("target.begin=" + getTarget().getBeginIndex()); + debug("target.end=" + getTarget().getEndIndex()); + } +#endif + + while (index >= 0) { + int patIndex = 0; + int tval = 0, pval = 0; + bool_t getP = TRUE; + + iter->setOffset(index, status); + if (U_FAILURE(status)) + { + return SearchIterator::DONE; + } + + + // if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); + + while ((patIndex < patLen || !getP) && iter->getOffset() < start) + { + /* if (DEBUG) { + debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); + } + */ + tval = iter->next(status) & mask; + if (U_FAILURE(status)) + { + return SearchIterator::DONE; + } + if (getP) pval = valueList[patIndex++]; + getP = TRUE; + + //if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); + + if (tval == done) { + // if (DEBUG) debug(" end of target; no match"); + return DONE; + } + else if (tval == 0) { + // if (DEBUG) debug(" tval is ignorable"); + getP = false; + } + else if (pval != tval) { + // We didn't match this pattern. 
Skip ahead + // if (DEBUG) debug(" mismatch: skippping " + getBackShift(tval, patIndex)); + + int shift = getBackShift(tval, patIndex); + index -= shift; + break; + } + else if (patIndex == patLen) { + // The elements matched and we're at the end of the pattern, + // which means we matched the whole thing. + setMatchLength(iter->getOffset() - index); + return index; + } + } + if (iter->getOffset() >= start) { + // We hit the end of the text being searched, which is + // possible if it contains lots of ignorable characters. + // Back up one character and try again. + // if (DEBUG) debug("hit end of target; back by one"); + index--; + } + } + return SearchIterator::DONE; +} + +/** + * Return a bitmask that will select only the portions of a collation + * element that are significant at the given strength level. + */ +int32_t StringSearch::getMask(Collator::ECollationStrength strength) +{ + switch (strength) { + case Collator::PRIMARY: + return 0xFFFF0000; + case Collator::SECONDARY: + return 0xFFFFFF00; + default: + return 0xFFFFFFFF; + } +} + + +void StringSearch::initialize(UErrorCode& status) { + /* + if (DEBUG) { + debug("-------------------------initialize-----------------------------------"); + debug("pattern=" + pattern); + } + */ + it->setText(pattern, status); + if (U_FAILURE(status)) { + delete it; + return; + } + + int mask = getMask(strength); + + // See how many non-ignorable collation keys are in the text + normLen = 0; + int32_t elem; + while ((elem = it->next(status)) != CollationElementIterator::NULLORDER) + { + if (U_FAILURE(status)) { + return; + } + if ((elem & mask) != 0) { + normLen++; + } + } + + // Save them all + valueList = new int32_t[normLen]; + int expandLen = 0; + it->reset(); + + for (int32_t i = 0; i < normLen; i++) + { + elem = it->next(status); + if (U_FAILURE(status)) { + return; + } + + if ((elem & mask) != 0) { + valueList[i] = elem & mask; + + } + // Keep track of whether there are any expanding-character + // sequences that can 
result in one of the characters that's in + // the pattern. If there are, we have to reduce the shift + // distances calculated below to account for it. + expandLen += it->getMaxExpansion(elem) - 1; + } + + // + // We need to remember the size of the composed and decomposed + // versions of the string. Standard Boyer-Moore shift calculations + // can be wrong by an amount up to that difference, since a small + // small number of characters in the pattern can map to a larger + // number in the text being searched, or vice-versa. + // + int uniLen = pattern.length(); + maxLen = uprv_max(normLen, uniLen); + minLen = uprv_min(normLen, uniLen) - expandLen; + + + /* + if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen + + ", maxLen=" + maxLen + ", minLen=" + minLen); + */ + // Now initialize the shift tables + // + // NOTE: This is the most conservative way to build them. If we had a way + // of knowing that there were no expanding/contracting chars in the rules, + // we could get rid of the "- 1" in the shiftTable calculations. + // But all of the default collators have at least one expansion or + // contraction, so it probably doesn't matter anyway. + // + for (i = 0; i < 256; i++) { + shiftTable[i] = backShiftTable[i] = minLen; + } + + for (i = 0; i < normLen-1; i++) { + shiftTable[hash(valueList[i])] = uprv_max(minLen - i - 1, 1); + } + shiftTable[hash(valueList[normLen-1])] = 1; + + for (i = normLen - 1; i > 0; i--) { + backShiftTable[hash(valueList[i])] = i; + } + backShiftTable[hash(valueList[0])] = 1; + + /* dumpTables(); */ +} + +/** + * Method used by StringSearch to determine how far to the right to + * shift the pattern during a Boyer-Moore search. + * + * @param curValue The current value in the target text + * @param curIndex The index in the pattern at which we failed to match + * curValue in the target text. 
+ */ +int32_t StringSearch::getShift( int32_t curValue, int32_t curIndex ) const +{ + int32_t shiftAmt = shiftTable[hash(curValue)]; + + if (minLen != maxLen) { + int adjust = normLen - curIndex; + if (shiftAmt > adjust + 1) { +// if (DEBUG) debug("getShift: adjusting by " + adjust); + shiftAmt -= adjust; + } + } + return shiftAmt; +} + +/** + * Method used by StringSearch to determine how far to the left to + * shift the pattern during a reverse Boyer-Moore search. + * + * @param curValue The current value in the target text + * @param curIndex The index in the pattern at which we failed to match + * curValue in the target text. + */ +int32_t StringSearch::getBackShift( int32_t curValue, int32_t curIndex ) const +{ + int shiftAmt = backShiftTable[hash(curValue)]; + + if (minLen != maxLen) { + int adjust = normLen - (minLen - curIndex); + if (shiftAmt > adjust + 1) { + // if (DEBUG) debug("getBackShift: adjusting by " + adjust); + shiftAmt -= adjust; + } + } + return shiftAmt; +} + +/** + * Hash a collation element from its full size (32 bits) down into a + * value that can be used as an index into the shift tables. Right + * now we do a modulus by the size of the hash table. + * + * TODO: At some point I should experiment to see whether a slightly + * more complicated hash function gives us a better distribution + * on multilingual text. I doubt it will have much effect on + * performance, though. + */ +int32_t StringSearch::hash(int32_t order) +{ + return CollationElementIterator::primaryOrder(order) % 256; +} + diff --git a/icu4c/source/samples/search/strsrch.h b/icu4c/source/samples/search/strsrch.h new file mode 100644 index 00000000000..4909cdc0ce6 --- /dev/null +++ b/icu4c/source/samples/search/strsrch.h @@ -0,0 +1,393 @@ +/* +********************************************************************** +* Copyright (C) 1999-2000 IBM and others. All rights reserved. 
+********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ +#ifndef STRSRCH_H +#define STRSRCH_H + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/tblcoll.h" +#include "unicode/brkiter.h" +#include "srchiter.h" + +class SearchIterator; +/** + * StringSearch is a SearchIterator that provides + * language-sensitive text searching based on the comparison rules defined + * in a {@link RuleBasedCollator} object. + * Instances of StringSearch function as iterators + * maintain a current position and scan over text returning the index of + * characters where the pattern occurs and the length of each match. + *

+ * StringSearch uses a version of the fast Boyer-Moore search + * algorithm that has been adapted to work with the large character set of + * Unicode. See "Efficient Text Searching in Java", to be published in + * Java Report in February, 1999, for further information on the algorithm. + *

+ * Consult the SearchIterator documentation for information on + * and examples of how to use instances of this class to implement text + * searching. SearchIterator provides all of the necessary + * API; this class only provides constructors and internal implementation + * methods. + * + * @see SearchIterator + * @see RuleBasedCollator + * + * @author Laura Werner + * @version 1.0 + */ + +class StringSearch : public SearchIterator +{ +public: + /** + * Construct a StringSearch object using a specific collator and set + * of boundary-detection rules. + *

+ * @param pat The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param coll A RuleBasedCollator object which defines the + * language-sensitive comparison rules used to determine + * whether text in the pattern and target matches. + * + * @param breaker A BreakIterator object used to constrain the matches + * that are found. Matches whose start and end indices + * in the target text are not boundaries as determined + * by the BreakIterator are ignored. If this behavior + * is not desired, null can be passed in instead. + */ + StringSearch(const UnicodeString& pat, + CharacterIterator* target, + RuleBasedCollator* coll, + BreakIterator* breaker, + UErrorCode& status); + + /** + * Construct a StringSearch object using a specific collator. + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param collator A RuleBasedCollator object which defines the + * language-sensitive comparison rules used to determine + * whether text in the pattern and target matches. + */ + StringSearch(const UnicodeString& pattern, + CharacterIterator* target, + RuleBasedCollator* collator, + UErrorCode& status); + + /** + * copy constructor + */ + StringSearch(const StringSearch& that); + + /** + * Construct a StringSearch object using the collator and + * character boundary detection rules for a given locale + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param loc The locale whose collation and break-detection rules + * should be used. + * + * @exception ClassCastException thrown if the collator for the specified + * locale is not a RuleBasedCollator. + */ + StringSearch(const UnicodeString& pattern, + CharacterIterator* target, + const Locale& loc, + UErrorCode& status); + /** + * Construct a StringSearch object using the collator for the default + * locale + *

+ * @param pattern The text for which this object will search. + * + * @param target The text in which to search for the pattern. + * + * @param collator A RuleBasedCollator object which defines the + * language-sensitive comparison rules used to determine + * whether text in the pattern and target matches. + */ + StringSearch(const UnicodeString& pattern, + const UnicodeString& target, + UErrorCode& status); + + virtual ~StringSearch(void); + /** + * Assignment operator. Sets this iterator to have the same behavior, + * and iterate over the same text, as the one passed in. + */ + StringSearch& operator=(const StringSearch& that); + + /** + * Equality operator. Returns TRUE if both BreakIterators are of the + * same class, have the same behavior, and iterate over the same text. + */ + virtual bool_t operator==(const SearchIterator& that) const; + + /** + * Not-equal operator. If operator== returns TRUE, this returns FALSE, + * and vice versa. + */ + bool_t operator!=(const SearchIterator& that) const; + + /** + * Returns a newly-constructed RuleBasedBreakIterator with the same + * behavior, and iterating over the same text, as this one. + */ + virtual SearchIterator* clone(void) const; + + //------------------------------------------------------------------- + // Getters and Setters + //------------------------------------------------------------------- + + /** + * Sets this object's strength property. The strength determines the + * minimum level of difference considered significant during a + * search. Generally, {@link Collator#TERTIARY} and + * {@link Collator#IDENTICAL} indicate that all differences are + * considered significant, {@link Collator#SECONDARY} indicates + * that upper/lower case distinctions should be ignored, and + * {@link Collator#PRIMARY} indicates that both case and accents + * should be ignored. However, the exact meanings of these constants + * are determined by individual Collator objects. + *

+ * @see Collator#PRIMARY + * @see Collator#SECONDARY + * @see Collator#TERTIARY + * @see Collator#IDENTICAL + */ + void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status); + + + /** + * Returns this object's strength property, which indicates what level + * of differences are considered significant during a search. + *

+ * @see #setStrength + */ + Collator::ECollationStrength getStrength(void) const; + + /** + * Set the collator to be used for this string search. Also changes + * the search strength to match that of the new collator. + *

+ * This method causes internal data such as Boyer-Moore shift tables + * to be recalculated, but the iterator's position is unchanged. + *

+ * @see #getCollator + */ + void setCollator(const RuleBasedCollator* coll, UErrorCode& status); + + /** + * Return the RuleBasedCollator being used for this string search. + */ + const RuleBasedCollator& getCollator() const; + + /** + * Set the pattern for which to search. + * This method causes internal data such as Boyer-Moore shift tables + * to be recalculated, but the iterator's position is unchanged. + */ + void setPattern(const UnicodeString& pat, UErrorCode& status); + + /** + * Returns the pattern for which this object is searching. + */ + const UnicodeString& getPattern() const; + + /** + * Set the target text which should be searched and resets the + * iterator's position to point before the start of the new text. + * This method is useful if you want to re-use an iterator to + * search for the same pattern within a different body of text. + */ + virtual void setTarget(const UnicodeString& newText); + + /** + * Set the target text which should be searched and resets the + * iterator's position to point before the start of the target text. + * This method is useful if you want to re-use an iterator to + * search for the same pattern within a different body of text. + * + * @see #getTarget + */ + virtual void adoptTarget(CharacterIterator* iterator); + + /** Reset iterator + */ + virtual void reset(void); + /** + * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. + * This method is to implement a simple version of RTTI, since not all + * C++ compilers support genuine RTTI. Polymorphic operator==() and + * clone() methods call this method. + * + * @return The class ID for this object. All objects of a + * given class have the same class ID. Objects of + * other classes have different class IDs. + */ + inline virtual UClassID getDynamicClassID(void) const; + + /** + * Returns the class ID for this class. This is useful only for + * comparing to a return value from getDynamicClassID(). 
For example: + * + * Base* polymorphic_pointer = createPolymorphicObject(); + * if (polymorphic_pointer->getDynamicClassID() == + * Derived::getStaticClassID()) ... + * + * @return The class ID for all objects of this class. + */ + inline static UClassID getStaticClassID(void); + +protected: + //------------------------------------------------------------------- + // Privates + //------------------------------------------------------------------- + + /** + * Search forward for matching text, starting at a given location. + * Clients should not call this method directly; instead they should call + * {@link SearchIterator#next}. + *

+ * If a match is found, this method returns the index at which the match + * starts and calls {@link SearchIterator#setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method returns + * DONE and does not call setMatchLength. + *

+ * @param start The index in the target text at which the search starts. + * + * @return The index at which the matched text in the target starts, or DONE + * if no match was found. + *

+ * @see SearchIterator#next + * @see SearchIterator#DONE + */ + virtual int32_t handleNext(int32_t start, UErrorCode& status); + /** + * Search backward for matching text ,starting at a given location. + * Clients should not call this method directly; instead they should call + * SearchIterator.previous(), which this method overrides. + *

+ * If a match is found, this method returns the index at which the match + * starts and calls {@link SearchIterator#setMatchLength} + * with the number of characters in the target + * text that make up the match. If no match is found, the method returns + * DONE and does not call setMatchLength. + *

+ * @param start The index in the target text at which the search starts. + * + * @return The index at which the matched text in the target starts, or DONE + * if no match was found. + *

+ * @see SearchIterator#previous + * @see SearchIterator#DONE + */ + virtual int32_t handlePrev(int32_t start, UErrorCode& status); +private: + /** + * Return a bitmask that will select only the portions of a collation + * element that are significant at the given strength level. + * + * @param strength The collation strength level to build the mask for. + * @return The bitmask for that strength level. + */ + static int32_t getMask(Collator::ECollationStrength strength); + + /** + * NOTE(review): undocumented in this header. Judging by the name it + * performs set-up work for this object (possibly building the + * Boyer-Moore shift tables); confirm against the implementation. + * + * @param status Error code, passed by reference. + */ + void initialize(UErrorCode& status); + /** + * Method used by StringSearch to determine how far to the right to + * shift the pattern during a Boyer-Moore search. + * + * @param curValue The current value in the target text + * @param curIndex The index in the pattern at which we failed to match + * curValue in the target text. + * @return How far to the right the pattern should be shifted. + */ + int32_t getShift( int32_t curValue, int32_t curIndex ) const; + + /** + * Method used by StringSearch to determine how far to the left to + * shift the pattern during a reverse Boyer-Moore search. + * + * @param curValue The current value in the target text + * @param curIndex The index in the pattern at which we failed to match + * curValue in the target text. + * @return How far to the left the pattern should be shifted. + */ + int32_t getBackShift( int32_t curValue, int32_t curIndex ) const; + + /** + * Hash a collation element from its full size (32 bits) down into a + * value that can be used as an index into the shift tables. Right + * now we do a modulus by the size of the hash table. 
+ * + * TODO: At some point I should experiment to see whether a slightly + * more complicated hash function gives us a better distribution + * on multilingual text. I doubt it will have much effect on + * performance, though. + * + * @param order The collation element to hash. + * @return A value usable as an index into the shift tables. + */ + static int32_t hash(int32_t order); + + //------------------------------------------------------------------------ + // Private Data + // + CollationElementIterator *iter; + RuleBasedCollator *collator; + /* HSYS ? Why? Changes to this will not affect collator. 
no changes to the comparsion result */ + Collator::ECollationStrength strength; + + //------------------------------------------------------------------------ + // Everything from here on down is the data used to represent the + // Boyer-Moore shift tables and the code that generates and manipulates + // them. + // + int32_t *valueList; + int32_t valueListLen; + int32_t shiftTable[256]; + int32_t backShiftTable[256]; + + UnicodeString pattern; // The pattern string + int32_t normLen; // num. of collation elements in pattern. + int32_t minLen; // Min of composed, decomposed versions + int32_t maxLen; // Max of composed, decomposed versions + CollationElementIterator *it; // to be removed + +private: + /* to be removed */ + void dumpTables(); + /** + * Class ID. The address of this static member, cast to UClassID, + * is the value returned by getStaticClassID(). + */ + static char fgClassID; +}; + +/** + * Inequality operator: returns the logical negation of operator==() + * applied to the same argument. + * + * @param that The SearchIterator to compare this object with. + * @return The negated result of operator==(that). + */ +inline bool_t StringSearch::operator!=(const SearchIterator& that) const +{ + return !operator==(that); +} + +/** + * Returns the class ID of this object by forwarding to + * StringSearch::getStaticClassID(). + * + * @return The class ID of StringSearch. + */ +inline UClassID StringSearch::getDynamicClassID(void) const +{ + return StringSearch::getStaticClassID(); +} + +/** + * Returns the class ID for StringSearch: the address of the static + * member fgClassID, cast to UClassID. + * + * @return The class ID of StringSearch. + */ +inline UClassID StringSearch::getStaticClassID(void) +{ + return (UClassID)(&fgClassID); +} + + +#endif +