diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index ce88e8f15a0..3802981bbbe 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -72,7 +72,8 @@ unifltlg.o unirange.o uniset.o unitohex.o unum.o \ dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \ remtrans.o utrans.o \ titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \ -unifilt.o quant.o strmatch.o transreg.o +unifilt.o quant.o strmatch.o transreg.o usearch.o search.o stsearch.o + STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index c9e0f50b726..bc75ac84192 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -234,6 +234,10 @@ SOURCE=.\remtrans.cpp # End Source File # Begin Source File +SOURCE=.\search.cpp +# End Source File +# Begin Source File + SOURCE=.\simpletz.cpp # End Source File # Begin Source File @@ -250,6 +254,10 @@ SOURCE=.\strmatch.cpp # End Source File # Begin Source File +SOURCE=.\stsearch.cpp +# End Source File +# Begin Source File + SOURCE=.\tblcoll.cpp # End Source File # Begin Source File @@ -350,6 +358,10 @@ SOURCE=.\unum.cpp # End Source File # Begin Source File +SOURCE=.\usearch.cpp +# End Source File +# Begin Source File + SOURCE=.\utrans.cpp # End Source File # Begin Source File @@ -1119,6 +1131,25 @@ SOURCE=.\unicode\remtrans.h # End Source File # Begin Source File +SOURCE=.\unicode\search.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\search.h + +"..\..\include\unicode\search.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\search.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\simpletz.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -1204,6 +1235,25 @@ SOURCE=.\strmatch.h # End Source File # Begin Source File 
+SOURCE=.\unicode\stsearch.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\stsearch.h + +"..\..\include\unicode\stsearch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\stsearch.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\tblcoll.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -1676,6 +1726,29 @@ InputPath=.\unicode\unum.h # End Source File # Begin Source File +SOURCE=.\unicode\usearch.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\usearch.h + +"..\..\include\unicode\usearch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\usearch.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=.\usrchimp.h +# End Source File +# Begin Source File + SOURCE=.\unicode\utrans.h !IF "$(CFG)" == "i18n - Win32 Release" diff --git a/icu4c/source/i18n/search.cpp b/icu4c/source/i18n/search.cpp new file mode 100644 index 00000000000..879d704b77a --- /dev/null +++ b/icu4c/source/i18n/search.cpp @@ -0,0 +1,357 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. 
+********************************************************************** +*/ + +#include "unicode/brkiter.h" +#include "unicode/schriter.h" +#include "unicode/search.h" +#include "usrchimp.h" +#include "cmemory.h" + +// public constructors and destructors ----------------------------------- + +SearchIterator::SearchIterator(const SearchIterator &other) +{ + if (other != *this) { + m_breakiterator_ = other.m_breakiterator_; + m_text_ = other.m_text_; + m_search_ = (USearch *)uprv_malloc(sizeof(USearch)); + m_search_->breakIter = other.m_search_->breakIter; + m_search_->isCanonicalMatch = other.m_search_->isCanonicalMatch; + m_search_->isOverlap = other.m_search_->isOverlap; + m_search_->matchedIndex = other.m_search_->matchedIndex; + m_search_->matchedLength = other.m_search_->matchedLength; + m_search_->text = other.m_search_->text; + m_search_->textLength = other.m_search_->textLength; + } +} + +SearchIterator::~SearchIterator() +{ + if (m_search_ != NULL) { + uprv_free(m_search_); + } +} + +// public get and set methods ---------------------------------------- + +void SearchIterator::setAttribute(USearchAttribute attribute, + USearchAttributeValue value, + UErrorCode &status) +{ + if (U_SUCCESS(status)) { + switch (attribute) + { + case USEARCH_OVERLAP : + m_search_->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); + break; + case USEARCH_CANONICAL_MATCH : + m_search_->isCanonicalMatch = (value == USEARCH_ON ? TRUE : FALSE); + break; + default: + status = U_ILLEGAL_ARGUMENT_ERROR; + } + } + if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } +} + +USearchAttributeValue SearchIterator::getAttribute( + USearchAttribute attribute) const +{ + switch (attribute) { + case USEARCH_ATTRIBUTE_COUNT : + return USEARCH_DEFAULT; + case USEARCH_OVERLAP : + return (m_search_->isOverlap == TRUE ? USEARCH_ON : USEARCH_OFF); + case USEARCH_CANONICAL_MATCH : + return (m_search_->isCanonicalMatch == TRUE ? 
USEARCH_ON : + USEARCH_OFF); + } + return USEARCH_DEFAULT; +} + +UTextOffset SearchIterator::getMatchedStart() const +{ + return m_search_->matchedIndex; +} + +int32_t SearchIterator::getMatchedLength() const +{ + return m_search_->matchedLength; +} + +void SearchIterator::getMatchedText(UnicodeString &result) const +{ + UTextOffset matchedindex = m_search_->matchedIndex; + int32_t matchedlength = m_search_->matchedLength; + if (matchedindex != USEARCH_DONE && matchedlength != 0) { + result.setTo(m_search_->text + matchedindex, matchedlength); + } + else { + result.remove(); + } +} + +void SearchIterator::setBreakIterator(BreakIterator *breakiter, + UErrorCode &status) +{ + if (U_SUCCESS(status)) { + m_search_->breakIter = NULL; + // the c++ breakiterator may not make use of ubreakiterator. + // so we'll have to keep track of it ourselves. + m_breakiterator_ = breakiter; + } +} + +const BreakIterator * SearchIterator::getBreakIterator(void) const +{ + return m_breakiterator_; +} + +void SearchIterator::setText(const UnicodeString &text, UErrorCode &status) +{ + if (U_SUCCESS(status)) { + if (text.length() == 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + else { + m_text_ = text; + m_search_->text = m_text_.fArray; + } + } +} + +void SearchIterator::setText(CharacterIterator &text, UErrorCode &status) +{ + if (U_SUCCESS(status)) { + text.getText(m_text_); + setText(m_text_, status); + } +} + +const UnicodeString & SearchIterator::getText(void) const +{ + return m_text_; +} + +// operator overloading ---------------------------------------------- + +UBool SearchIterator::operator==(const SearchIterator &that) const +{ + if (this == &that) { + return TRUE; + } + return (m_breakiterator_ == that.m_breakiterator_ && + m_search_->isCanonicalMatch == that.m_search_->isCanonicalMatch && + m_search_->isOverlap == that.m_search_->isOverlap && + m_search_->matchedIndex == that.m_search_->matchedIndex && + m_search_->matchedLength == that.m_search_->matchedLength && + 
m_search_->textLength == that.m_search_->textLength && + getOffset() == that.getOffset() && + (uprv_memcmp(m_search_->text, that.m_search_->text, + m_search_->textLength * sizeof(UChar)) == 0)); +} + +// public methods ---------------------------------------------------- + +UTextOffset SearchIterator::first(UErrorCode &status) +{ + setOffset(0, status); + return handleNext(0, status); +} + +UTextOffset SearchIterator::following(UTextOffset position, + UErrorCode &status) +{ + setOffset(position, status); + return handleNext(position, status); +} + +UTextOffset SearchIterator::last(UErrorCode &status) +{ + setOffset(m_search_->textLength, status); + return handlePrev(m_search_->textLength, status); +} + +UTextOffset SearchIterator::preceding(UTextOffset position, + UErrorCode &status) +{ + setOffset(position, status); + return handlePrev(position, status); +} + +UTextOffset SearchIterator::next(UErrorCode &status) +{ + if (U_SUCCESS(status)) { + UTextOffset offset = getOffset(); + UTextOffset matchindex = m_search_->matchedIndex; + int32_t matchlength = m_search_->matchedLength; + m_search_->reset = FALSE; + if (m_search_->isForwardSearching == TRUE) { + int32_t textlength = m_search_->textLength; + if (offset == textlength || matchindex == textlength || + (matchindex != USEARCH_DONE && + matchindex + matchlength >= textlength)) { + // not enough characters to match + setMatchNotFound(); + return USEARCH_DONE; + } + } + else { + // switching direction. + // if matchedIndex == USEARCH_DONE, it means that either a + // setOffset has been called or that previous ran off the text + // string. the iterator would have been set to offset 0 if a + // match is not found. + m_search_->isForwardSearching = TRUE; + if (m_search_->matchedIndex != USEARCH_DONE) { + // there's no need to set the collation element iterator + // the next call to next will set the offset. 
+ return matchindex; + } + } + + if (matchindex != USEARCH_DONE) { + return handleNext(matchindex + matchlength, status); + } + return handleNext(offset, status); + } + return USEARCH_DONE; +} + +UTextOffset SearchIterator::previous(UErrorCode &status) +{ + if (U_SUCCESS(status)) { + UTextOffset offset; + if (m_search_->reset) { + offset = m_search_->textLength; + m_search_->isForwardSearching = FALSE; + m_search_->reset = FALSE; + } + else { + offset = getOffset(); + } + + UTextOffset matchindex = m_search_->matchedIndex; + if (m_search_->isForwardSearching == TRUE) { + // switching direction. + // if matchedIndex == USEARCH_DONE, it means that either a + // setOffset has been called or that next ran off the text + // string. the iterator would have been set to offset textLength if + // a match is not found. + m_search_->isForwardSearching = FALSE; + if (matchindex != USEARCH_DONE) { + return matchindex; + } + } + else { + if (offset == 0 || matchindex == 0) { + // not enough characters to match + setMatchNotFound(); + return USEARCH_DONE; + } + } + + if (matchindex != USEARCH_DONE) { + return handlePrev(matchindex, status); + } + return handlePrev(offset, status); + } + return USEARCH_DONE; +} + +void SearchIterator::reset() +{ + setMatchNotFound(); + m_search_->isOverlap = FALSE; + m_search_->isCanonicalMatch = FALSE; + m_search_->isForwardSearching = TRUE; + m_search_->reset = TRUE; +} + +// protected constructors and destructors ----------------------------- + +SearchIterator::SearchIterator() : m_breakiterator_(NULL) +{ + m_search_ = (USearch *)uprv_malloc(sizeof(USearch)); + m_search_->breakIter = NULL; + m_search_->isOverlap = FALSE; + m_search_->isCanonicalMatch = FALSE; + m_search_->isForwardSearching = TRUE; + m_search_->reset = TRUE; + m_search_->matchedIndex = USEARCH_DONE; + m_search_->matchedLength = 0; + m_search_->text = NULL; + m_search_->textLength = 0; +} + +SearchIterator::SearchIterator(const UnicodeString &text, + BreakIterator *breakiter) : 
+ m_breakiterator_(breakiter), + m_text_(text) +{ + m_search_ = (USearch *)uprv_malloc(sizeof(USearch)); + m_search_->breakIter = NULL; + m_search_->isOverlap = FALSE; + m_search_->isCanonicalMatch = FALSE; + m_search_->isForwardSearching = TRUE; + m_search_->reset = TRUE; + m_search_->matchedIndex = USEARCH_DONE; + m_search_->matchedLength = 0; + m_search_->text = m_text_.fArray; + m_search_->textLength = text.length(); +} + +SearchIterator::SearchIterator(CharacterIterator &text, + BreakIterator *breakiter) : + m_breakiterator_(breakiter) +{ + m_search_ = (USearch *)uprv_malloc(sizeof(USearch)); + m_search_->breakIter = NULL; + m_search_->isOverlap = FALSE; + m_search_->isCanonicalMatch = FALSE; + m_search_->isForwardSearching = TRUE; + m_search_->reset = TRUE; + m_search_->matchedIndex = USEARCH_DONE; + m_search_->matchedLength = 0; + text.getText(m_text_); + m_search_->text = m_text_.fArray; + m_search_->textLength = m_text_.length(); + m_breakiterator_ = breakiter; +} + +// protected methods ------------------------------------------------------ + +void SearchIterator::setMatchLength(int32_t length) +{ + m_search_->matchedLength = length; +} + +void SearchIterator::setMatchStart(UTextOffset position) +{ + m_search_->matchedIndex = position; +} + +void SearchIterator::setMatchNotFound() +{ + setMatchStart(USEARCH_DONE); + setMatchLength(0); + UErrorCode status = U_ZERO_ERROR; + // by default no errors should be returned here since offsets are within + // range. + if (m_search_->isForwardSearching) { + setOffset(m_search_->textLength, status); + } + else { + setOffset(0, status); + } +} + + diff --git a/icu4c/source/i18n/stsearch.cpp b/icu4c/source/i18n/stsearch.cpp new file mode 100644 index 00000000000..06d056b5bd8 --- /dev/null +++ b/icu4c/source/i18n/stsearch.cpp @@ -0,0 +1,387 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. 
+********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#include "unicode/stsearch.h" +#include "cmemory.h" +#include "usrchimp.h" + +// public constructors and destructors ----------------------------------- + +StringSearch::StringSearch(const UnicodeString &pattern, + const UnicodeString &text, + const Locale &locale, + BreakIterator *breakiter, + UErrorCode &status) : + SearchIterator(text, breakiter), + m_collator_(), + m_pattern_(pattern) +{ + m_strsrch_ = usearch_open(m_pattern_.fArray, m_pattern_.fLength, + m_text_.fArray, m_text_.fLength, + locale.getName(), NULL, &status); + uprv_free(m_search_); + m_search_ = NULL; + + if (U_SUCCESS(status)) { + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + // m_search_ has been created by the base SearchIterator class + m_search_ = m_strsrch_->search; + } +} + +StringSearch::StringSearch(const UnicodeString &pattern, + const UnicodeString &text, + RuleBasedCollator *coll, + BreakIterator *breakiter, + UErrorCode &status) : + SearchIterator(text, breakiter), + m_collator_(), + m_pattern_(pattern) +{ + if (coll == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + m_strsrch_ = NULL; + return; + } + m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray, + m_pattern_.fLength, m_text_.fArray, + m_text_.fLength, coll->ucollator, + NULL, &status); + uprv_free(m_search_); + m_search_ = NULL; + + if (U_SUCCESS(status)) { + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + // m_search_ has been created by the base SearchIterator class + m_search_ = 
m_strsrch_->search; + } +} + +StringSearch::StringSearch(const UnicodeString &pattern, + CharacterIterator &text, + const Locale &locale, + BreakIterator *breakiter, + UErrorCode &status) : + SearchIterator(text, breakiter), + m_collator_(), + m_pattern_(pattern) +{ + m_strsrch_ = usearch_open(m_pattern_.fArray, m_pattern_.fLength, + m_text_.fArray, m_text_.fLength, + locale.getName(), NULL, &status); + uprv_free(m_search_); + m_search_ = NULL; + + if (U_SUCCESS(status)) { + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + // m_search_ has been created by the base SearchIterator class + m_search_ = m_strsrch_->search; + } +} + +StringSearch::StringSearch(const UnicodeString &pattern, + CharacterIterator &text, + RuleBasedCollator *coll, + BreakIterator *breakiter, + UErrorCode &status) : + SearchIterator(text, breakiter), + m_collator_(), + m_pattern_(pattern) +{ + if (coll == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + m_strsrch_ = NULL; + return; + } + m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray, + m_pattern_.fLength, m_text_.fArray, + m_text_.fLength, coll->ucollator, + NULL, &status); + uprv_free(m_search_); + m_search_ = NULL; + + if (U_SUCCESS(status)) { + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + // m_search_ has been created by the base SearchIterator class + m_search_ = m_strsrch_->search; + } +} + +StringSearch::StringSearch(const StringSearch &that) : + SearchIterator(that.m_text_, that.m_breakiterator_), + m_collator_(), + m_pattern_(that.m_pattern_) +{ + UErrorCode status = U_ZERO_ERROR; + if (that.m_strsrch_ == NULL) { + m_strsrch_ = NULL; + status = U_ILLEGAL_ARGUMENT_ERROR; + } + else { + m_strsrch_ 
= usearch_openFromCollator(m_pattern_.fArray, + m_pattern_.fLength, + m_text_.fArray, m_text_.fLength, + that.m_strsrch_->collator, + NULL, &status); + } + uprv_free(m_search_); + m_search_ = NULL; + + if (U_SUCCESS(status)) { + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + // m_search_ has been created by the base SearchIterator class + m_search_ = m_strsrch_->search; + m_breakiterator_ = that.m_breakiterator_; + } +} + +StringSearch::~StringSearch() +{ + usearch_close(m_strsrch_); + m_search_ = NULL; +} + +// operator overloading --------------------------------------------- +StringSearch & StringSearch::operator=(const StringSearch &that) +{ + if ((*this) != that) { + UErrorCode status = U_ZERO_ERROR; + m_text_ = that.m_text_; + m_breakiterator_ = that.m_breakiterator_; + m_pattern_ = that.m_pattern_; + // all m_search_ in the parent class is linked up with m_strsrch_ + usearch_close(m_strsrch_); + m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray, + m_pattern_.fLength, + m_text_.fArray, + m_text_.fLength, + that.m_strsrch_->collator, + NULL, &status); + int32_t length; + const UChar *rules = ucol_getRules(m_strsrch_->collator, &length); + m_collation_rules_.setTo(rules, length); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); + m_search_ = m_strsrch_->search; + } + return *this; +} + +UBool StringSearch::operator==(const SearchIterator &that) const +{ + if (this == &that) { + return TRUE; + } + if (SearchIterator::operator ==(that)) { + StringSearch &thatsrch = (StringSearch &)that; + return (this->m_pattern_ == thatsrch.m_pattern_ && + this->m_strsrch_->collator == thatsrch.m_strsrch_->collator); + } + return FALSE; +} + +// public get and set methods ---------------------------------------- + +void StringSearch::setOffset(UTextOffset position, 
UErrorCode &status) +{ + usearch_setOffset(m_strsrch_, position, &status); +} + +UTextOffset StringSearch::getOffset(void) const +{ + return usearch_getOffset(m_strsrch_); +} + +void StringSearch::setText(const UnicodeString &text, UErrorCode &status) +{ + m_text_ = text; + usearch_setText(m_strsrch_, text.fArray, text.fLength, &status); +} + +void StringSearch::setText(CharacterIterator &text, UErrorCode &status) +{ + text.getText(m_text_); + usearch_setText(m_strsrch_, m_text_.fArray, m_text_.fLength, &status); +} + +RuleBasedCollator * StringSearch::getCollator() const +{ + return (RuleBasedCollator *)&m_collator_; +} + +void StringSearch::setCollator(RuleBasedCollator *coll, UErrorCode &status) +{ + usearch_setCollator(m_strsrch_, coll->getUCollator(), &status); + m_collation_rules_.setTo(coll->getRules()); + m_collator_.setUCollator((UCollator *)m_strsrch_->collator, + &m_collation_rules_); +} + +void StringSearch::setPattern(const UnicodeString &pattern, + UErrorCode &status) +{ + m_pattern_ = pattern; + usearch_setPattern(m_strsrch_, m_pattern_.fArray, m_pattern_.fLength, + &status); +} + +const UnicodeString & StringSearch::getPattern() const +{ + return m_pattern_; +} + +// public methods ---------------------------------------------------- + +void StringSearch::reset() +{ + usearch_reset(m_strsrch_); +} + +SearchIterator * StringSearch::safeClone(void) const +{ + UErrorCode status = U_ZERO_ERROR; + StringSearch *result = new StringSearch(m_pattern_, m_text_, + (RuleBasedCollator *)&m_collator_, + m_breakiterator_, + status); + result->setOffset(getOffset(), status); + result->setMatchStart(m_strsrch_->search->matchedIndex); + result->setMatchLength(m_strsrch_->search->matchedLength); + if (U_FAILURE(status)) { + return NULL; + } + return result; +} + +// protected method ------------------------------------------------- + +UTextOffset StringSearch::handleNext(int32_t position, UErrorCode &status) +{ + // values passed here are already in the pre-shift 
position + if (U_SUCCESS(status)) { + if (m_strsrch_->pattern.CELength == 0) { + m_search_->matchedIndex = + m_search_->matchedIndex == USEARCH_DONE ? + getOffset() : m_search_->matchedIndex + 1; + m_search_->matchedLength = 0; + ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex, + &status); + if (m_search_->matchedIndex == m_search_->textLength) { + m_search_->matchedIndex = USEARCH_DONE; + } + } + else { + // looking at usearch.cpp, this part is shifted out to + // StringSearch instead of SearchIterator because m_strsrch_ is + // not accessible in SearchIterator + if (!m_search_->isOverlap && + position + m_strsrch_->pattern.defaultShiftSize > + m_search_->textLength) { + setMatchNotFound(); + return USEARCH_DONE; + } + while (TRUE) { + if (m_search_->isCanonicalMatch) { + // can't use exact here since extra accents are allowed. + usearch_handleNextCanonical(m_strsrch_, &status); + } + else { + usearch_handleNextExact(m_strsrch_, &status); + } + if (U_FAILURE(status)) { + return USEARCH_DONE; + } + if (m_breakiterator_ == NULL || + m_search_->matchedIndex == USEARCH_DONE || + (m_breakiterator_->isBoundary(m_search_->matchedIndex) && + m_breakiterator_->isBoundary(m_search_->matchedIndex + + m_search_->matchedLength))) { + return m_search_->matchedIndex; + } + } + } + } + return USEARCH_DONE; +} + +UTextOffset StringSearch::handlePrev(int32_t position, UErrorCode &status) +{ + // values passed here are already in the pre-shift position + if (U_SUCCESS(status)) { + if (m_strsrch_->pattern.CELength == 0) { + m_search_->matchedIndex = + (m_search_->matchedIndex == USEARCH_DONE ? 
getOffset() : + m_search_->matchedIndex); + if (m_search_->matchedIndex == 0) { + setMatchNotFound(); + } + else { + m_search_->matchedIndex --; + ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex, + &status); + m_search_->matchedLength = 0; + } + } + else { + // looking at usearch.cpp, this part is shifted out to + // StringSearch instead of SearchIterator because m_strsrch_ is + // not accessible in SearchIterator + if (!m_search_->isOverlap && + position - m_strsrch_->pattern.defaultShiftSize < 0) { + setMatchNotFound(); + return USEARCH_DONE; + } + while (TRUE) { + if (m_search_->isCanonicalMatch) { + // can't use exact here since extra accents are allowed. + usearch_handlePreviousCanonical(m_strsrch_, &status); + } + else { + usearch_handlePreviousExact(m_strsrch_, &status); + } + if (U_FAILURE(status)) { + return USEARCH_DONE; + } + if (m_breakiterator_ == NULL || + m_search_->matchedIndex == USEARCH_DONE || + (m_breakiterator_->isBoundary(m_search_->matchedIndex) && + m_breakiterator_->isBoundary(m_search_->matchedIndex + + m_search_->matchedLength))) { + return m_search_->matchedIndex; + } + } + } + + return m_search_->matchedIndex; + } + return USEARCH_DONE; +} + + + diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index 52761d66430..2b9eec0120e 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -326,18 +326,12 @@ Collator* RuleBasedCollator::clone() const return new RuleBasedCollator(*this); } -/** -* Create a CollationElementIterator object that will iterator over the -* elements in a string, using the collation rules defined in this -* RuleBasedCollator -*/ CollationElementIterator* RuleBasedCollator::createCollationElementIterator (const UnicodeString& source) const { UErrorCode status = U_ZERO_ERROR; CollationElementIterator *result = new CollationElementIterator(source, this, status); - if (U_FAILURE(status)) return NULL; diff --git a/icu4c/source/i18n/ucol_imp.h 
b/icu4c/source/i18n/ucol_imp.h index 370e7e51c2f..2c23b048cb3 100644 --- a/icu4c/source/i18n/ucol_imp.h +++ b/icu4c/source/i18n/ucol_imp.h @@ -368,6 +368,13 @@ ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status); #define getExpansionCount(CE) ((CE)&0xF) #define isCEIgnorable(CE) (((CE) & 0xFFFFFFBF) == 0) +/* StringSearch internal use */ +#define inNormBuf(coleiter) ((coleiter)->iteratordata_.flags & UCOL_ITER_INNORMBUF) +#define isFCDPointerNull(coleiter) ((coleiter)->iteratordata_.fcdPosition == NULL) +#define getExpansionPrefix(coleiter) ((coleiter)->iteratordata_.toReturn - (coleiter)->iteratordata_.CEs) +#define setExpansionPrefix(coleiter, offset) ((coleiter)->iteratordata_.CEs + offset) +#define getExpansionSuffix(coleiter) ((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn) +#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - leftoverces) #define UCA_DATA_TYPE "dat" #define UCA_DATA_NAME "ucadata" diff --git a/icu4c/source/i18n/ucoleitr.cpp b/icu4c/source/i18n/ucoleitr.cpp index 95b4221917e..70072db2341 100644 --- a/icu4c/source/i18n/ucoleitr.cpp +++ b/icu4c/source/i18n/ucoleitr.cpp @@ -223,6 +223,9 @@ ucol_setOffset(UCollationElements *elems, collIterate *ci = &(elems->iteratordata_); ci->pos = ci->string + offset; ci->CEpos = ci->toReturn = ci->CEs; + if (ci->flags & UCOL_ITER_INNORMBUF) { + ci->flags = ci->origFlags; + } if ((ci->flags & UCOL_ITER_HASLEN) == 0) { ci->endp = ci->string + u_strlen(ci->string); } diff --git a/icu4c/source/i18n/unicode/search.h b/icu4c/source/i18n/unicode/search.h new file mode 100644 index 00000000000..7f3d38eaac7 --- /dev/null +++ b/icu4c/source/i18n/unicode/search.h @@ -0,0 +1,466 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. 
+********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#ifndef SEARCH_H +#define SEARCH_H + +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/brkiter.h" +#include "unicode/usearch.h" + +/** + * SearchIterator is an abstract base class that provides + * methods to search for a pattern within a text string. Instances of + * SearchIterator maintain a current position and scans over the + * target text, returning the indices the pattern is matched and the length + * of each match. + *
+ * SearchIterator defines a protocol for text searching. + * Subclasses provide concrete implementations of various search algorithms. + * For example, {@link StringSearch} implements language-sensitive pattern + * matching based on the comparison rules defined in a + * {@link RuleBasedCollator} object. + *
+ * Other options for searching include using a BreakIterator to restrict + * the points at which matches are detected. + *
+ * SearchIterator provides an API that is similar to that of + * other text iteration classes such as BreakIterator. Using + * this class, it is easy to scan through text looking for all occurrences of + * a given pattern. The following example uses a StringSearch + * object to find all instances of "fox" in the target string. Any other + * subclass of SearchIterator can be used in an identical + * manner. + *
+ * UnicodeString target("The quick brown fox jumped over the lazy fox");
+ * UnicodeString pattern("fox");
+ * UErrorCode status = U_ZERO_ERROR;
+ * SearchIterator *iter = new StringSearch(pattern, target,
+ *     Locale::getDefault(), NULL, status);
+ * for (int pos = iter->first(status); pos != USEARCH_DONE;
+ *     pos = iter->next(status)) {
+ * printf("Found match at %d pos, length is %d\n", pos,
+ * iter->getMatchedLength());
+ * }
+ *
+ *
+ * @see StringSearch
+ */
+
+struct USearch;
+typedef struct USearch USearch;
+
+/**
+* Data structure for searching
+*/
+class U_I18N_API SearchIterator {
+
+public:
+
+ // public constructors and destructors -------------------------------
+
+ /**
+ * Copy constructor that creates a SearchIterator instance with the same
+ * behavior, and iterating over the same text.
+ * @param other the SearchIterator instance to be copied.
+ */
+ SearchIterator(const SearchIterator &other);
+
+ /**
+ * Destructor. Cleans up the search iterator data struct.
+ */
+ virtual ~SearchIterator();
+
+ // public get and set methods ----------------------------------------
+
+ /**
+ * Sets the index to point to the given position, and clears any state
+ * that's affected.
+ * + * This method takes the argument index and sets the position in the text + * string accordingly without checking if the index is pointing to a + * valid starting point to begin searching. + * @param position within the text to be set + * @param status for errors if it occurs + */ + virtual void setOffset(UTextOffset position, UErrorCode &status) = 0; + + /** + * Return the current index in the text being searched. + * If the iteration has gone past the end of the text + * (or past the beginning for a backwards search), {@link #USEARCH_DONE} + * is returned. + * @return current index in the text being searched. + */ + virtual UTextOffset getOffset(void) const = 0; + + /** + * Sets the text searching attributes located in the enum + * USearchAttribute with values from the enum USearchAttributeValue. + * USEARCH_DEFAULT can be used for all attributes for resetting. + * @param attribute text attribute (enum USearchAttribute) to be set + * @param value text attribute value + * @param status for errors if it occurs + */ + void setAttribute(USearchAttribute attribute, + USearchAttributeValue value, + UErrorCode &status); + + /** + * Gets the text searching attributes + * @param attribute text attribute (enum USearchAttribute) to be retrieve + * @return text attribute value + */ + USearchAttributeValue getAttribute(USearchAttribute attribute) const; + + /** + * Returns the index to the match in the text string that was searched. + * This call returns a valid result only after a successful call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * USEARCH_DONE, this method will return USEARCH_DONE. + *
+ * Use getMatchedLength to get the matched string length. + * @return index of a substring within the text string that is being + * searched. + */ + UTextOffset getMatchedStart(void) const; + + /** + * Returns the length of text in the string which matches the search + * pattern. This call returns a valid result only after a successful call + * to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * USEARCH_DONE, this method will return 0. + * @return The length of the match in the target text, or 0 if there + * is no match currently. + */ + int32_t getMatchedLength(void) const; + + /** + * Returns the text that was matched by the most recent call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * If the iterator is not pointing at a valid match (e.g. just after + * construction or after USEARCH_DONE has been returned, + * returns an empty string. + * @param result stores the matched string or an empty string if a match + * is not found. + */ + void getMatchedText(UnicodeString &result) const; + + /** + * Set the BreakIterator that will be used to restrict the points + * at which matches are detected. The user is responsible for deleting + * the breakiterator. + * @param breakiter A BreakIterator that will be used to restrict the + * points at which matches are detected. If a match is + * found, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match will be rejected and another will be searched + * for. If this parameter is NULL, no break + * detection is attempted. + * @param status for errors if it occurs + */ + void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); + + /** + * Returns the BreakIterator that is used to restrict the points at + * which matches are detected. This will be the same object that was + * passed to the constructor or to setBreakIterator. 
+ * Note that NULL is a legal value; it means that break + * detection should not be attempted. + * @return BreakIterator used to restrict matchings. + */ + const BreakIterator * getBreakIterator(void) const; + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. The user is responsible for deleting the text. + * @param text string to be searched. + * @param status for errors if it occurs + */ + virtual void setText(const UnicodeString &text, UErrorCode &status); + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. + *
+ * Note: No parsing of the text within the CharacterIterator + * will be done during searching for this version. The block of text + * in CharacterIterator will be used as it is. + * The user is responsible for deleting the text. + * @param text string iterator to be searched. + * @param status for errors if it occurs + */ + virtual void setText(CharacterIterator &text, UErrorCode &status); + + /** + * Return the string text to be searched. + * @return text string to be searched. + */ + const UnicodeString & getText(void) const; + + // operator overloading ---------------------------------------------- + + /** + * Equality operator. + * @param that SearchIterator instance to be compared. + * @return TRUE if both BreakIterators are of the same class, have the + * same behavior, terates over the same text and have the same + * attributes. FALSE otherwise. + */ + virtual UBool operator==(const SearchIterator &that) const; + + /** + * Not-equal operator. + * @param that SearchIterator instance to be compared. + * @return FALSE if operator== returns TRUE, and vice versa. + */ + UBool operator!=(const SearchIterator &that) const; + + // public methods ---------------------------------------------------- + + /** + * Returns a copy of SearchIterator with the same behavior, and + * iterating over the same text, as this one. Note that all data will be + * replicated, except for the text string to be searched. + * @return cloned object + */ + virtual SearchIterator* safeClone(void) const = 0; + + /** + * Returns the first index at which the string text matches the search + * pattern. The iterator is adjusted so that its current index (as + * returned by {@link #usearch_getOffset}) is the match position if one + * was found. 
+ * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE + * @param status for errors if it occurs + * @return The character index of the first match, or + * USEARCH_DONE if there are no matches. + */ + UTextOffset first(UErrorCode &status); + + /** + * Returns the first index greater than position at which the + * string text matches the search pattern. The iterator is adjusted so + * that its current index (as returned by {@link #getOffset}) is the + * match position if one was found. If a match is not found, + * USEARCH_DONE will be returned and the iterator will be + * adjusted to the index USEARCH_DONE + * @param position where search if to start from + * @param status for errors if it occurs + * @return The character index of the first match following + * position, or USEARCH_DONE if there are no + * matches. + */ + UTextOffset following(UTextOffset position, UErrorCode &status); + + /** + * Returns the last index in the target text at which it matches the + * search pattern. The iterator is adjusted so that its current index + * (as returned by {@link #getOffset}) is the match position if one was + * found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE. + * @param status for errors if it occurs + * @return The index of the first match, or USEARCH_DONE if + * there are no matches. + */ + UTextOffset last(UErrorCode &status); + + /** + * Returns the first index less than position at which the string + * text matches the search pattern. The iterator is adjusted so that its + * current index (as returned by {@link #getOffset}) is the match + * position if one was found. 
If a match is not found, + * USEARCH_DONE will be returned and the iterator will be + * adjusted to the index USEARCH_DONE + * @param position where search is to start from + * @param status for errors if it occurs + * @return The character index of the first match preceding + * position, or USEARCH_DONE if there are + * no matches. + */ + UTextOffset preceding(UTextOffset position, UErrorCode &status); + + /** + * Returns the index of the next point at which the text matches the + * search pattern, starting from the current position + * The iterator is adjusted so that its current index (as returned by + * {@link #getIndex}) is the match position if one was found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to a position after the end of the text + * string. + * @param status for errors if it occurs + * @return The index of the next match after the current position, + * or USEARCH_DONE if there are no more matches. + */ + UTextOffset next(UErrorCode &status); + + /** + * Returns the index of the previous point at which the string text + * matches the search pattern, starting at the current position. + * The iterator is adjusted so that its current index (as returned by + * {@link #getOffset}) is the match position if one was found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE + * @param status for errors if it occurs + * @return The index of the previous match before the current position, + * or USEARCH_DONE if there are no more matches. + */ + UTextOffset previous(UErrorCode &status); + + /** + * Resets the iteration. + * Search will begin at the start of the text string if a forward + * iteration is initiated before a backwards iteration. Otherwise if a + * backwards iteration is initiated before a forwards iteration, the + * search will begin at the end of the text string. 
+ */ + virtual void reset(); + +protected: + // protected data members --------------------------------------------- + + /** + * C search data struct + */ + USearch *m_search_; + + /** + * Break iterator. + * Currently the C++ breakiterator does not have getRules etc to reproduce + * another in C. Hence we keep the original around and do the verification + * at the end of the match. The user is responsible for deleting this + * break iterator. + */ + BreakIterator *m_breakiterator_; + + /** + * Unicode string version of the search text + */ + UnicodeString m_text_; + + // protected constructors and destructors ----------------------------- + + /** + * Default constructor. + * Initializes data to the default values. + */ + SearchIterator(); + + /** + * Constructor for use by subclasses. + * @param text The target text to be searched. + * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * handleNext or handlePrev finds a + * match, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match is rejected and handleNext or + * handlePrev is called again. If this parameter + * is NULL, no break detection is attempted. + * @param status error status + */ + SearchIterator(const UnicodeString &text, + BreakIterator *breakiter = NULL); + + /** + * Constructor for use by subclasses. + *
+ * Note: No parsing of the text within the CharacterIterator + * will be done during searching for this version. The block of text + * in CharacterIterator will be used as it is. + * @param text The target text to be searched. + * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * handleNext or handlePrev finds a + * match, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match is rejected and handleNext or + * handlePrev is called again. If this parameter + * is NULL, no break detection is attempted. + */ + SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); + + // protected methods -------------------------------------------------- + + /** + * Abstract method which subclasses override to provide the mechanism + * for finding the next match in the target text. This allows different + * subclasses to provide different search algorithms. + *
+ * If a match is found, the implementation should return the index at + * which the match starts and should call + * {@link #setMatchLength setMatchLength} with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + *
+ * @param position The index in the target text at which the search + * should start. + * @param status for error codes if it occurs. + */ + virtual UTextOffset handleNext(UTextOffset position, UErrorCode &status) + = 0; + + /** + * Abstract method which subclasses override to provide the mechanism for + * finding the previous match in the target text. This allows different + * subclasses to provide different search algorithms. + *
+ * If a match is found, the implementation should return the index at + * which the match starts and should call + * {@link #setMatchLength setMatchLength} with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + *
+ * @param position The index in the target text at which the search + * should start. + * @param status for error codes if it occurs. + */ + virtual UTextOffset handlePrev(UTextOffset position, UErrorCode &status) + = 0; + + /** + * Sets the length of the currently matched string in the text string to + * be searched. + * Subclasses' handleNext and handlePrev + * methods should call this when they find a match in the target text. + * @param length length of the matched text. + */ + virtual void setMatchLength(int32_t length); + + /** + * Sets the offset of the currently matched string in the text string to + * be searched. + * Subclasses' handleNext and handlePrev + * methods should call this when they find a match in the target text. + * @param position start offset of the matched text. + */ + virtual void setMatchStart(UTextOffset position); + + /** + * sets match not found + */ + void setMatchNotFound(); +}; + +inline UBool SearchIterator::operator!=(const SearchIterator &that) const +{ + return !operator==(that); +} + +#endif + diff --git a/icu4c/source/i18n/unicode/stsearch.h b/icu4c/source/i18n/unicode/stsearch.h new file mode 100644 index 00000000000..69f222d9ae4 --- /dev/null +++ b/icu4c/source/i18n/unicode/stsearch.h @@ -0,0 +1,433 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#ifndef STSEARCH_H +#define STSEARCH_H + +#include "unicode/tblcoll.h" +#include "unicode/coleitr.h" +#include "unicode/search.h" + +/** + * StringSearch is a SearchIterator that provides + * language-sensitive text searching based on the comparison rules defined + * in a {@link RuleBasedCollator} object. + * StringSearch ensures that language eccentricity can be + * handled, e.g. 
for the German collator, characters ß and SS will be matched + * if case is chosen to be ignored. + * See the + * "ICU Collation Design Document" for more information. + *
+ * The algorithm implemented is a modified form of the Boyer Moore's search. + * For more information see + * + * "Efficient Text Searching in Java", published in Java Report + * in February, 1999, for further information on the algorithm. + *
+ * There are 2 match options for selection:
+ * This search has APIs similar to that of other text iteration mechanisms
+ * such as the break iterators in BreakIterator. Using these
+ * APIs, it is easy to scan through text looking for all occurrences of
+ * a given pattern. This search iterator allows changing of direction by
+ * calling a reset followed by a next or previous.
+ * Though a direction change can occur without calling reset first,
+ * this operation comes with some speed penalty.
+ * Matches found in the forward direction are the same as the matches found
+ * in the backward direction, but in the reverse order.
+ *
+ * SearchIterator provides APIs to specify the starting position
+ * within the text string to be searched, e.g. setOffset,
+ * preceding and following. Since the
+ * starting position will be set exactly as specified, please take note that
+ * there are some danger points at which the search may produce incorrect
+ * results:
+ *
+ * A breakiterator can be used if only matches at logical breaks are desired.
+ *
+ * Options are provided to handle overlapping matches.
+ * E.g. In English, overlapping matches produce the results 0 and 2
+ * for the pattern "abab" in the text "ababab", whereas mutually
+ * exclusive matches only produce the result 0.
+ *
+ * Though collator attributes will be taken into consideration while
+ * performing matches, there are no APIs here for setting and getting the
+ * attributes. These attributes can be set by getting the collator
+ * from getCollator and using the APIs in coll.h.
+ *
+ * Restriction:
+ * Consult the SearchIterator documentation for information on
+ * and examples of how to use instances of this class to implement text
+ * searching.
+ *
+ * Note: No parsing of the text within the CharacterIterator
+ * will be done during searching for this version. The block of text
+ * in CharacterIterator will be used as it is.
+ * @param pattern The text for which this object will search.
+ * @param text The text iterator in which to search for the pattern.
+ * @param locale A locale which defines the language-sensitive
+ * comparison rules used to determine whether text in the
+ * pattern and target matches. User is responsible for
+ * the clearing of this object.
+ * @param breakiter A BreakIterator object used to constrain
+ * the matches that are found. Matches whose start and end
+ * indices in the target text are not boundaries as
+ * determined by the BreakIterator are
+ * ignored. If this behavior is not desired,
+ * NULL can be passed in instead.
+ * @param status for errors if any
+ */
+ StringSearch(const UnicodeString &pattern, CharacterIterator &text,
+ const Locale &locale,
+ BreakIterator *breakiter,
+ UErrorCode &status);
+
+ /**
+ * Creating a StringSearch instance using the argument collator
+ * language rule set. Note that the user retains ownership of this collator;
+ * it does not get destroyed during this instance's destruction.
+ *
+ * Note: No parsing of the text within the CharacterIterator
+ * will be done during searching for this version. The block of text
+ * in CharacterIterator will be used as it is.
+ * @param pattern The text for which this object will search.
+ * @param text The text in which to search for the pattern.
+ * @param coll A RuleBasedCollator object which defines
+ * the language-sensitive comparison rules used to
+ * determine whether text in the pattern and target
+ * matches. User is responsible for the clearing of this
+ * object.
+ * @param breakiter A BreakIterator object used to constrain
+ * the matches that are found. Matches whose start and end
+ * indices in the target text are not boundaries as
+ * determined by the BreakIterator are
+ * ignored. If this behavior is not desired,
+ * NULL can be passed in instead.
+ * @param status for errors if any
+ */
+ StringSearch(const UnicodeString &pattern, CharacterIterator &text,
+ RuleBasedCollator *coll,
+ BreakIterator *breakiter,
+ UErrorCode &status);
+
+ /**
+ * Copy constructor that creates a StringSearch instance with the same
+ * behavior, and iterating over the same text.
+ * @param that StringSearch instance to be copied.
+ */
+ StringSearch(const StringSearch &that);
+
+ /**
+ * Destructor. Cleans up the search iterator data struct.
+ * If a collator is created in the constructor, it will be destroyed here.
+ */
+ virtual ~StringSearch(void);
+
+ // operator overloading ---------------------------------------------
+
+ /**
+ * Assignment operator. Sets this iterator to have the same behavior,
+ * and iterate over the same text, as the one passed in.
+ * @param that instance to be copied.
+ */
+ virtual StringSearch & operator=(const StringSearch &that);
+
+ /**
+ * Equality operator.
+ * @param that instance to be compared.
+ * @return TRUE if both instances have the same attributes,
+ * breakiterators, collators and iterate over the same text
+ * while looking for the same pattern.
+ */
+ virtual UBool operator==(const SearchIterator &that) const;
+
+ // public get and set methods ----------------------------------------
+
+ /**
+ * Sets the index to point to the given position, and clears any state
+ * that's affected.
+ *
+ * This method takes the argument index and sets the position in the text
+ * string accordingly without checking if the index is pointing to a
+ * valid starting point to begin searching.
+ * @param position within the text to be set
+ * @param status for errors if it occurs
+ */
+ virtual void setOffset(UTextOffset position, UErrorCode &status);
+
+ /**
+ * Return the current index in the text being searched.
+ * If the iteration has gone past the end of the text
+ * (or past the beginning for a backwards search), {@link #USEARCH_DONE}
+ * is returned.
+ * @return current index in the text being searched.
+ */
+ virtual UTextOffset getOffset(void) const;
+
+ /**
+ * Set the target text to be searched.
+ * Text iteration will hence begin at the start of the text string.
+ * This method is
+ * useful if you want to re-use an iterator to search for the same
+ * pattern within a different body of text.
+ * @param text text string to be searched
+ * @param status for errors if any
+ */
+ virtual void setText(const UnicodeString &text, UErrorCode &status);
+
+ /**
+ * Set the target text to be searched.
+ * Text iteration will hence begin at the start of the text string.
+ * This method is
+ * useful if you want to re-use an iterator to search for the same
+ * pattern within a different body of text.
+ * Note: No parsing of the text within the CharacterIterator
+ * will be done during searching for this version. The block of text
+ * in CharacterIterator will be used as it is.
+ * @param text text string to be searched
+ * @param status for errors if any
+ */
+ virtual void setText(CharacterIterator &text, UErrorCode &status);
+
+ /**
+ * Gets the collator used for the language rules.
+ *
+ * Deleting the returned RuleBasedCollator before calling
+ * the destructor would cause the string search to fail.
+ * The destructor will delete the collator if this instance owns it
+ * @return collator used for string search
+ */
+ RuleBasedCollator * getCollator() const;
+
+ /**
+ * Sets the collator used for the language rules. User retains the
+ * ownership of this collator, thus the responsibility of deletion lies
+ * with the user. This method causes internal data such as Boyer-Moore
+ * shift tables to be recalculated, but the iterator's position is
+ * unchanged.
+ * @param coll collator
+ * @param status for errors if any
+ */
+ void setCollator(RuleBasedCollator *coll, UErrorCode &status);
+
+ /**
+ * Sets the pattern used for matching.
+ * Internal data like the Boyer Moore table will be recalculated, but
+ * the iterator's position is unchanged.
+ * @param pattern search pattern to be found
+ * @param status for errors if any
+ */
+ void setPattern(const UnicodeString &pattern, UErrorCode &status);
+
+ /**
+ * Gets the search pattern.
+ * @return pattern used for matching
+ */
+ const UnicodeString & getPattern() const;
+
+ // public methods ----------------------------------------------------
+
+ /**
+ * Reset the iteration.
+ * Search will begin at the start of the text string if a forward
+ * iteration is initiated before a backwards iteration. Otherwise if
+ * a backwards iteration is initiated before a forwards iteration, the
+ * search will begin at the end of the text string.
+ */
+ virtual void reset();
+
+ /**
+ * Returns a copy of StringSearch with the same behavior, and
+ * iterating over the same text, as this one. Note that all data will be
+ * replicated, except for the user-specified collator and the
+ * breakiterator.
+ * @return cloned object
+ */
+ virtual SearchIterator * safeClone(void) const;
+
+protected:
+
+ // protected method -------------------------------------------------
+
+ /**
+ * Search forward for matching text, starting at a given location.
+ * Clients should not call this method directly; instead they should
+ * call {@link SearchIterator#next}.
+ *
+ * If a match is found, this method returns the index at which the match
+ * starts and calls {@link SearchIterator#setMatchLength} with the number
+ * of characters in the target text that make up the match. If no match
+ * is found, the method returns USEARCH_DONE.
+ *
+ * The StringSearch is adjusted so that its current index
+ * (as returned by {@link #getOffset}) is the match position if one was
+ * found.
+ * If a match is not found, USEARCH_DONE will be returned and
+ * the StringSearch will be adjusted to the index USEARCH_DONE.
+ * @param position The index in the target text at which the search
+ * starts
+ * @param status for errors if any occurs
+ * @return The index at which the matched text in the target starts, or
+ * USEARCH_DONE if no match was found.
+ */
+ virtual UTextOffset handleNext(UTextOffset position, UErrorCode &status);
+
+ /**
+ * Search backward for matching text, starting at a given location.
+ * Clients should not call this method directly; instead they should call
+ * SearchIterator.previous(), which in turn calls this method.
+ *
+ * If a match is found, this method returns the index at which the match
+ * starts and calls {@link SearchIterator#setMatchLength} with the number
+ * of characters in the target text that make up the match. If no match
+ * is found, the method returns USEARCH_DONE.
+ *
+ * The StringSearch is adjusted so that its current index
+ * (as returned by {@link #getOffset}) is the match position if one was
+ * found.
+ * If a match is not found, USEARCH_DONE will be returned and
+ * the StringSearch will be adjusted to the index USEARCH_DONE.
+ * @param position The index in the target text at which the search
+ * starts.
+ * @param status for errors if any occurs
+ * @return The index at which the matched text in the target starts, or
+ * USEARCH_DONE if no match was found.
+ */
+ virtual UTextOffset handlePrev(UTextOffset position, UErrorCode &status);
+
+private :
+
+ // private data members ----------------------------------------------
+
+ /**
+ * RuleBasedCollator, contains exactly the same UCollator * in m_strsrch_
+ */
+ RuleBasedCollator m_collator_;
+ /**
+ * Pattern text
+ */
+ UnicodeString m_pattern_;
+ /**
+ * Corresponding collation rules
+ */
+ UnicodeString m_collation_rules_;
+ /**
+ * String search struct data
+ */
+ UStringSearch *m_strsrch_;
+};
+
+#endif
+
diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h
index 44e69fff139..3c9af0dc836 100644
--- a/icu4c/source/i18n/unicode/tblcoll.h
+++ b/icu4c/source/i18n/unicode/tblcoll.h
@@ -849,6 +849,11 @@ private:
*/
friend class Collator;
+ /**
+ * Searching over collation elements in a character source
+ */
+ friend class StringSearch;
+
// private constructors --------------------------------------------------
/**
@@ -893,10 +898,24 @@ private:
/**
* Creates the c struct for ucollator
* @param collator new ucollator data
- * @param status error status
*/
void setUCollator(UCollator *collator);
+ /**
+ * Creates the c struct for ucollator. This is used internally by StringSearch.
+ * Hence the responsibility of cleaning up the ucollator is not done by
+ * this RuleBasedCollator. The dataIsOwned flag is set to FALSE.
+ * @param collator new ucollator data
+ * @param rules corresponding collation rules
+ */
+ void setUCollator(UCollator *collator, UnicodeString *rules);
+
+ /**
+ * Get UCollator data struct. Used only by StringSearch.
+ * @return UCollator data struct
+ */
+ const UCollator * getUCollator();
+
/**
* Converts C's UCollationResult to EComparisonResult
* @param result member of the enum UComparisonResult
@@ -947,11 +966,29 @@ inline void RuleBasedCollator::setUCollator(const Locale &locale,
inline void RuleBasedCollator::setUCollator(UCollator *collator)
{
- if (ucollator && dataIsOwned)
+ if (ucollator && dataIsOwned) {
ucol_close(ucollator);
+ }
ucollator = collator;
}
+inline void RuleBasedCollator::setUCollator(UCollator *collator,
+ UnicodeString *rules)
+{
+ if (ucollator && dataIsOwned) { // free the previous collator and rule string only when this object owns them
+ ucol_close(ucollator);
+ delete urulestring;
+ }
+ ucollator = collator;
+ urulestring = rules;
+ dataIsOwned = FALSE; // mark the new collator/rules as borrowed so this setter will not free them later
+}
+
+inline const UCollator * RuleBasedCollator::getUCollator()
+{
+ return ucollator; // hands out the internal UCollator handle directly, not a copy
+}
+
inline Collator::EComparisonResult RuleBasedCollator::getEComparisonResult(
const UCollationResult &result) const
{
diff --git a/icu4c/source/i18n/unicode/usearch.h b/icu4c/source/i18n/unicode/usearch.h
new file mode 100644
index 00000000000..230aee417c7
--- /dev/null
+++ b/icu4c/source/i18n/unicode/usearch.h
@@ -0,0 +1,546 @@
+/*
+**********************************************************************
+* Copyright (C) 2001 IBM and others. All rights reserved.
+**********************************************************************
+* Date Name Description
+* 06/28/2001 synwee Creation.
+**********************************************************************
+*/
+#ifndef USEARCH_H
+#define USEARCH_H
+
+#include "unicode/utypes.h"
+#include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
+#include "unicode/ubrk.h"
+
+/**
+ * C Apis for an engine that provides language-sensitive text searching based
+ * on the comparison rules defined in a UCollator data struct,
+ * see ucol.h. This ensures that language eccentricity can be
+ * handled, e.g. for the German collator, characters ß and SS will be matched
+ * if case is chosen to be ignored.
+ * See the
+ * "ICU Collation Design Document" for more information.
+ *
+ * The algorithm implemented is a modified form of the Boyer Moore's search.
+ * For more information see
+ *
+ * "Efficient Text Searching in Java", published in Java Report
+ * in February, 1999, for further information on the algorithm.
+ *
+ * There are 2 match options for selection:
+ * This search has APIs similar to that of other text iteration mechanisms
+ * such as the break iterators in ubrk.h. Using these
+ * APIs, it is easy to scan through text looking for all occurrences of
+ * a given pattern. This search iterator allows changing of direction by
+ * calling a reset followed by a next or previous.
+ * Though a direction change can occur without calling reset first,
+ * this operation comes with some speed penalty.
+ * Generally, match results in the forward direction will match the result
+ * matches in the backwards direction in the reverse order
+ *
+ * usearch.h provides APIs to specify the starting position
+ * within the text string to be searched, e.g. usearch_setOffset,
+ * usearch_preceding and usearch_following. Since the
+ * starting position will be set exactly as specified, please take note that
+ * there are some dangerous positions at which the search may produce incorrect
+ * results:
+ *
+ * A breakiterator can be used if only matches at logical breaks are desired.
+ *
+ * Options are provided to handle overlapping matches.
+ * E.g. In English, overlapping matches produce the results 0 and 2
+ * for the pattern "abab" in the text "ababab", whereas mutually
+ * exclusive matches only produce the result 0.
+ *
+ * Though collator attributes will be taken into consideration while
+ * performing matches, there are no APIs here for setting and getting the
+ * attributes. These attributes can be set by getting the collator
+ * from usearch_getCollator and using the APIs in ucol.h.
+ *
+ * Restriction:
+ * Example of use:
+* Use usearch_getMatchedLength to get the matched string length.
+* @param strsrch search iterator data struct
+* @return index to a substring within the text string that is being
+* searched.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_getMatchedStart(
+ const UStringSearch *strsrch);
+
+/**
+* Returns the length of text in the string which matches the search pattern.
+* This call returns a valid result only after a successful call to
+* {@link #usearch_first}, {@link #usearch_next},
+* {@link #usearch_previous}, or {@link #usearch_last}.
+* Just after construction, or after a searching method returns
+* USEARCH_DONE, this method will return 0.
+* @param strsrch search iterator data struct
+* @return The length of the match in the string text, or 0 if there is no
+* match currently.
+*/
+U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
+ const UStringSearch *strsrch);
+
+/**
+* Returns the text that was matched by the most recent call to
+* {@link #usearch_first}, {@link #usearch_next},
+* {@link #usearch_previous}, or {@link #usearch_last}.
+* If the iterator is not pointing at a valid match (e.g. just after
+* construction or after USEARCH_DONE has been returned), returns
+* an empty string. If result is not large enough to store the matched text,
+* result will be filled with the partial text and a U_BUFFER_OVERFLOW_ERROR
+* will be returned in status. result will be null-terminated whenever
+* possible. If the buffer fits the matched text exactly, null-termination
+* is not possible and U_STRING_NOT_TERMINATED_ERROR is set in status.
+* Pre-flighting can be done either with resultCapacity = 0 or with the API
+* usearch_getMatchedLength().
+* @param strsrch search iterator data struct
+* @param result UChar buffer to store the matched string
+* @param resultCapacity length of the result buffer
+* @param status error returned if result is not large enough
+* @return exact length of the matched text, not counting the null-termination
+*/
+U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
+ UChar *result,
+ int32_t resultCapacity,
+ UErrorCode *status);
+
+/**
+* Set the BreakIterator that will be used to restrict the points at which
+* matches are detected.
+* @param strsrch search iterator data struct
+* @param breakiter A BreakIterator that will be used to restrict the points
+* at which matches are detected. If a match is found, but
+* the match's start or end index is not a boundary as
+* determined by the BreakIterator, the match will
+* be rejected and another will be searched for.
+* If this parameter is NULL, no break detection is
+* attempted.
+* @param status for errors if it occurs
+* @see #usearch_getBreakIterator
+*/
+U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
+ UBreakIterator *breakiter,
+ UErrorCode *status);
+
+/**
+* Returns the BreakIterator that is used to restrict the points at which
+* matches are detected. This will be the same object that was passed to the
+* constructor or to usearch_setBreakIterator. Note that
+* NULL
+* is a legal value; it means that break detection should not be attempted.
+* @param strsrch search iterator data struct
+* @return break iterator used
+* @see #usearch_setBreakIterator
+*/
+U_CAPI const UBreakIterator * U_EXPORT2 usearch_getBreakIterator(
+ const UStringSearch *strsrch);
+
+/**
+* Set the string text to be searched. Text iteration will hence begin at the
+* start of the text string. This method is useful if you want to re-use an
+* iterator to search for the same pattern within a different body of text.
+* @param strsrch search iterator data struct
+* @param text new string to look for match
+* @param textlength length of the new string, -1 for null-termination
+* @param status for errors if it occurs
+* @see #usearch_getText
+*/
+U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
+ const UChar *text,
+ int32_t textlength,
+ UErrorCode *status);
+
+/**
+* Return the string text to be searched.
+* @param strsrch search iterator data struct
+* @param length returned string text length
+* @return string text
+* @see #usearch_setText
+*/
+U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
+ int32_t *length);
+
+/**
+* Gets the collator used for the language rules.
+*
+* Deleting the returned UCollator before calling
+* usearch_close would cause the string search to fail.
+* usearch_close will delete the collator if this search owns it.
+* @param strsrch search iterator data struct
+* @return collator
+*/
+U_CAPI UCollator * U_EXPORT2 usearch_getCollator(
+ const UStringSearch *strsrch);
+
+/**
+* Sets the collator used for the language rules. User retains the ownership
+* of this collator, thus the responsibility of deletion lies with the user.
+* This method causes internal data such as Boyer-Moore shift tables to
+* be recalculated, but the iterator's position is unchanged.
+* @param strsrch search iterator data struct
+* @param collator to be used
+* @param status for errors if it occurs
+*/
+U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
+ const UCollator *collator,
+ UErrorCode *status);
+
+/**
+* Sets the pattern used for matching.
+* Internal data like the Boyer Moore table will be recalculated, but the
+* iterator's position is unchanged.
+* @param strsrch search iterator data struct
+* @param pattern string
+* @param patternlength pattern length, -1 for null-terminated string
+* @param status for errors if it occurs
+*/
+U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
+ const UChar *pattern,
+ int32_t patternlength,
+ UErrorCode *status);
+
+/**
+* Gets the search pattern
+* @param strsrch search iterator data struct
+* @param length return length of the pattern, -1 indicates that the pattern
+* is null-terminated
+* @return pattern string
+*/
+U_CAPI const UChar * U_EXPORT2 usearch_getPattern(
+ const UStringSearch *strsrch,
+ int32_t *length);
+
+/* methods ------------------------------------------------------------- */
+
+/**
+* Returns the first index at which the string text matches the search
+* pattern.
+* The iterator is adjusted so that its current index (as returned by
+* {@link #usearch_getOffset}) is the match position if one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE.
+* @param strsrch search iterator data struct
+* @param status for errors if it occurs
+* @return The character index of the first match, or
+* USEARCH_DONE if there are no matches.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_first(UStringSearch *strsrch,
+ UErrorCode *status);
+
+/**
+* Returns the first index greater than position at which the string
+* text
+* matches the search pattern. The iterator is adjusted so that its current
+* index (as returned by {@link #usearch_getOffset}) is the match position if
+* one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE
+*
+* Search positions that may render incorrect results are highlighted in the
+* header comments.
+* @param strsrch search iterator data struct
+* @param position to start the search at
+* @param status for errors if it occurs
+* @return The character index of the first match following position,
+* or USEARCH_DONE if there are no matches.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_following(UStringSearch *strsrch,
+ UTextOffset position,
+ UErrorCode *status);
+
+/**
+* Returns the last index in the target text at which it matches the search
+* pattern. The iterator is adjusted so that its current
+* index (as returned by {@link #usearch_getOffset}) is the match position if
+* one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE.
+* @param strsrch search iterator data struct
+* @param status for errors if it occurs
+* @return The index of the last match, or USEARCH_DONE if there
+* are no matches.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_last(UStringSearch *strsrch,
+ UErrorCode *status);
+
+/**
+* Returns the first index less than position at which the string text
+* matches the search pattern. The iterator is adjusted so that its current
+* index (as returned by {@link #usearch_getOffset}) is the match position if
+* one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE
+*
+* Search positions that may render incorrect results are highlighted in the
+* header comments.
+* @param strsrch search iterator data struct
+* @param position index position the search is to begin at
+* @param status for errors if it occurs
+* @return The character index of the first match preceding position,
+* or USEARCH_DONE if there are no matches.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
+ UTextOffset position,
+ UErrorCode *status);
+
+/**
+* Returns the index of the next point at which the string text matches the
+* search pattern, starting from the current position.
+* The iterator is adjusted so that its current
+* index (as returned by {@link #usearch_getOffset}) is the match position if
+* one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE
+* @param strsrch search iterator data struct
+* @param status for errors if it occurs
+* @return The index of the next match after the current position, or
+* USEARCH_DONE if there are no more matches.
+* @see #usearch_first
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_next(UStringSearch *strsrch,
+ UErrorCode *status);
+
+/**
+* Returns the index of the previous point at which the string text matches
+* the search pattern, starting at the current position.
+* The iterator is adjusted so that its current
+* index (as returned by {@link #usearch_getOffset}) is the match position if
+* one was found.
+* If a match is not found, USEARCH_DONE will be returned and
+* the iterator will be adjusted to the index USEARCH_DONE
+* @param strsrch search iterator data struct
+* @param status for errors if it occurs
+* @return The index of the previous match before the current position,
+* or USEARCH_DONE if there are no more matches.
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_previous(UStringSearch *strsrch,
+ UErrorCode *status);
+
+/**
+* Reset the iteration.
+* Search will begin at the start of the text string if a forward iteration
+* is initiated before a backwards iteration. Otherwise if a backwards
+* iteration is initiated before a forwards iteration, the search will begin
+* at the end of the text string.
+* @param strsrch search iterator data struct
+*/
+U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
+
+#endif
+
+
diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp
new file mode 100644
index 00000000000..67df97d165b
--- /dev/null
+++ b/icu4c/source/i18n/usearch.cpp
@@ -0,0 +1,3159 @@
+/*
+**********************************************************************
+* Copyright (C) 2001 IBM and others. All rights reserved.
+**********************************************************************
+* Date Name Description
+* 07/02/2001 synwee Creation.
+**********************************************************************
+*/
+
+#include "unicode/usearch.h"
+#include "unicode/ustring.h"
+#include "unormimp.h"
+#include "unicode/uchar.h"
+#include "cmemory.h"
+#include "ucol_imp.h"
+#include "usrchimp.h"
+
+// internal definition: constants and data used to pick apart FCD values -----
+
+#define LAST_BYTE_MASK_ 0xFF // keeps the low byte of an FCD value; used below for the suffix-accent tests
+#define SECOND_LAST_BYTE_SHIFT_ 8 // shifts the high byte of an FCD value down; used below for the prefix-accent tests
+#define SUPPLEMENTARY_MIN_VALUE_ 0x10000 // first code point above the BMP
+
+static const uint16_t *FCD_ = NULL; // FCD data, fetched on first use by initializeFCD()
+
+// internal methods -------------------------------------------------
+
+/**
+* Builds the collation element mask that keeps only the weight levels taking
+* part in a comparison at the argument strength.
+* @param strength collation strength
+* @return the primary mask for UCOL_PRIMARY, the primary and secondary masks
+*         for UCOL_SECONDARY, and the primary, secondary and tertiary masks
+*         for every other strength
+*/
+inline uint32_t getMask(UCollationStrength strength)
+{
+    if (strength == UCOL_PRIMARY) {
+        return UCOL_PRIMARYORDERMASK;
+    }
+    if (strength == UCOL_SECONDARY) {
+        return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
+    }
+    // every remaining strength compares all three weight levels
+    return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
+           UCOL_PRIMARYORDERMASK;
+}
+
+/**
+* Collapses a collation element into an index for the shift tables.
+* @param ce collation element
+* @return primary order of the collation element divided by MAX_TABLE_SIZE_
+*/
+inline int hash(uint32_t ce)
+{
+    // Division is used instead of the older
+    // UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_: with the new collation most
+    // latin 1 characters have the form xx000xxx, so the remainder was 0 most
+    // of the time. The hash algorithm is still to be discussed.
+    const int bucket = UCOL_PRIMARYORDER(ce) / MAX_TABLE_SIZE_;
+    return bucket;
+}
+
+/**
+* Fetches the fcd data into FCD_ the first time it is needed; later calls
+* leave the stored pointer alone.
+* @param status error status if any
+*/
+inline void initializeFCD(UErrorCode *status)
+{
+    if (FCD_ != NULL) {
+        // already fetched
+        return;
+    }
+    FCD_ = unorm_getFCDTrie(status);
+}
+
+/**
+* Retrieves the fcd value of the character starting at the argument offset,
+* treating a surrogate pair as a single character.
+* @param str UTF16 string holding the character
+* @param offset in: position of the character whose fcd is wanted;
+*               out: position of the following character, stepping over both
+*               halves of a surrogate pair
+* @param strlength length of the argument string
+* @return fcd value, 0 for a first surrogate that is not followed by a
+*         second surrogate
+*/
+inline uint16_t getFCD(const UChar *str, UTextOffset *offset,
+                       int32_t strlength)
+{
+    UTextOffset position = *offset;
+    UChar lead = str[position];
+    uint16_t fcd = unorm_getFCD16(FCD_, lead);
+    position ++;
+
+    if (fcd != 0 && position != strlength && UTF_IS_FIRST_SURROGATE(lead)) {
+        UChar trail = str[position];
+        if (!UTF_IS_SECOND_SURROGATE(trail)) {
+            // unpaired first surrogate
+            fcd = 0;
+        }
+        else {
+            fcd = unorm_getFCD16FromSurrogatePair(FCD_, fcd, trail);
+            position ++;
+        }
+    }
+
+    *offset = position;
+    return fcd;
+}
+
+/**
+* Reduces a raw collation element to the form used for matching, applying
+* the strength mask and the alternate (shifted) handling of the search.
+* @param strsrch string search data
+* @param sourcece raw collation element
+* @return the modified collation element
+*/
+inline uint32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
+{
+    // The collator's own tertiary mask is not used here: it is preprocessed
+    // to take case options into account, which exact matching does not need.
+    uint32_t masked = sourcece & strsrch->ceMask;
+
+    // Alternate handling: only the 16 most significant bits matter, so the
+    // comparison with variable top can be made without further masking.
+    if (!strsrch->toShift || !(strsrch->variableTop > masked)) {
+        return masked;
+    }
+
+    // The ce is a variable. Only its primary weight is kept; no shifting to
+    // the quaternary level is needed since every primary below variable top
+    // has to be masked off anyway.
+    if (strsrch->strength == UCOL_QUATERNARY) {
+        return masked & UCOL_PRIMARYORDERMASK;
+    }
+    return UCOL_IGNORABLE;
+}
+
+/**
+* Allocates a block of memory, flagging a failed allocation in status.
+* @param size number of bytes to allocate
+* @param status set to U_MEMORY_ALLOCATION_ERROR if the allocation fails
+* @return newly allocated block, NULL if the allocation failed
+*/
+inline void * allocateMemory(uint32_t size, UErrorCode *status)
+{
+    void *block = uprv_malloc(size);
+    if (block == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return block;
+}
+
+/**
+* Stores a uint32_t value at an offset of a destination array.
+* When the write would use up the last slot, a larger array is allocated and
+* the first offset entries are copied over. The old array is not freed here;
+* the caller has to deallocate both the old and the new array as needed.
+* @param destination target array
+* @param offset destination offset to store the value at
+* @param destinationlength target array size, updated with the new size if
+*                          the array was reallocated
+* @param value to be added
+* @param increments number of slots to grow by
+* @param status error status if any
+* @return the array holding the value: destination itself if no allocation
+*         took place, the new array otherwise; NULL if status already showed
+*         a failure or the allocation failed
+*/
+inline uint32_t * addTouint32_tArray(uint32_t *destination,
+                                     uint32_t offset,
+                                     uint32_t *destinationlength,
+                                     uint32_t value,
+                                     uint32_t increments,
+                                     UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    uint32_t *target = destination;
+    const uint32_t capacity = *destinationlength;
+    if (offset + 1 == capacity) {
+        const uint32_t grown = capacity + increments;
+        uint32_t *bigger = (uint32_t *)allocateMemory(
+                                          sizeof(uint32_t) * grown, status);
+        if (bigger == NULL) {
+            return NULL;
+        }
+        uprv_memcpy(bigger, destination, sizeof(uint32_t) * offset);
+        *destinationlength = grown;
+        target = bigger;
+    }
+
+    target[offset] = value;
+    return target;
+}
+
+/**
+* Initializing the ce table for a pattern.
+* Stores the non-ignorable collation elements of the pattern, as modified by
+* getCE, followed by a terminating 0.
+* The table starts out in the pattern's own CEBuffer and is moved to the heap
+* when it fills up. The growth increment has 1 added to ensure that the table
+* size definitely increases.
+* If adding an element fails, the iterator and any heap table built so far
+* are released and the pattern is left with an empty table in CEBuffer, so
+* that pattern->CE never refers to freed memory.
+* @param strsrch string search data
+* @param status error status if any
+* @return sum over all pattern ces of (maximum expansion size - 1), 0 on
+*         failure
+*/
+inline uint32_t initializePatternCETable(UStringSearch *strsrch,
+                                         UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+
+    UPattern *pattern = &(strsrch->pattern);
+    uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
+    uint32_t *cetable = pattern->CEBuffer;
+    uint32_t patternlength = pattern->textLength;
+
+    UCollationElements *coleiter = ucol_openElements(strsrch->collator,
+                                       pattern->text, patternlength, status);
+
+    // release the heap table of an earlier initialization
+    if (pattern->CE != NULL && pattern->CE != cetable) {
+        uprv_free(pattern->CE);
+    }
+
+    uint32_t offset = 0;
+    uint32_t result = 0;
+    uint32_t ce;
+
+    while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER) {
+        uint32_t newce = getCE(strsrch, ce);
+        if (newce) {
+            uint32_t *temp = addTouint32_tArray(cetable, offset,
+                                &cetablesize, newce,
+                                patternlength - ucol_getOffset(coleiter) + 1,
+                                status);
+            if (U_FAILURE(*status)) {
+                // addTouint32_tArray does not free the table it was given,
+                // so a heap table built so far is still ours to release
+                if (cetable != pattern->CEBuffer) {
+                    uprv_free(cetable);
+                }
+                if (coleiter != NULL) {
+                    ucol_closeElements(coleiter);
+                }
+                // the previous pattern->CE may have been freed above; point
+                // it at valid storage holding an empty table
+                pattern->CEBuffer[0] = 0;
+                pattern->CE = pattern->CEBuffer;
+                pattern->CELength = 0;
+                return 0;
+            }
+            offset ++;
+            // the table moved: drop the old copy unless it was CEBuffer
+            if (cetable != temp && cetable != pattern->CEBuffer) {
+                uprv_free(cetable);
+            }
+            cetable = temp;
+        }
+        result += ucol_getMaxExpansion(coleiter, ce) - 1;
+    }
+
+    cetable[offset] = 0;
+    pattern->CE = cetable;
+    pattern->CELength = offset;
+
+    if (coleiter != NULL) {
+        ucol_closeElements(coleiter);
+    }
+
+    return result;
+}
+
+/**
+* Initializes the pattern struct: records whether the pattern starts or ends
+* with accents and builds the pattern ce table.
+* @param strsrch UStringSearch data storage
+* @param status for errors if it occurs
+* @return expansionsize the total expansion size of the pattern
+*/
+inline int32_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
+{
+    UPattern *pattern = &(strsrch->pattern);
+    const UChar *text = pattern->text;
+    const int32_t length = pattern->textLength;
+
+    // prefix accents: high byte of the fcd of the first character
+    UTextOffset position = 0;
+    pattern->hasPrefixAccents = getFCD(text, &position, length) >>
+                                SECOND_LAST_BYTE_SHIFT_;
+
+    // suffix accents: low byte of the fcd of the last character
+    UChar32 lastchar;
+    position = length;
+    UTF_PREV_CHAR(text, 0, position, lastchar);
+    pattern->hasSuffixAccents = getFCD(text, &position, length) &
+                                LAST_BYTE_MASK_;
+
+    return initializePatternCETable(strsrch, status);
+}
+
+/**
+* Fills the forward and backward shift tables.
+* Every slot first receives the default value; the slots hit by the pattern
+* ces are then overwritten, and the slots of the outermost pattern ce and of
+* the ignorable ce always end up as 1.
+* @param shift table for forwards shift
+* @param backshift table for backwards shift
+* @param cetable table containing pattern ce
+* @param cesize size of the pattern ces
+* @param expansionsize total size of the expansions
+* @param defaultforward the default forward value
+* @param defaultbackward the default backward value
+*/
+inline void setShiftTable(int32_t shift[], int32_t backshift[],
+                          uint32_t *cetable, int32_t cesize,
+                          int32_t expansionsize,
+                          int32_t defaultforward,
+                          int32_t defaultbackward)
+{
+    // The shift values are estimates of the smallest number of characters
+    // that can produce the relevant ces, ie approximately the number of ces
+    // minus their expansion, since expansions can come from one character.
+    int32_t index;
+
+    // forwards table
+    for (index = 0; index < MAX_TABLE_SIZE_; index ++) {
+        shift[index] = defaultforward;
+    }
+    for (index = 0; index < cesize - 1; index ++) {
+        // number of ces from the right end of the array to index
+        int distance = defaultforward - index - 1;
+        if (distance < 1) {
+            distance = 1;
+        }
+        shift[hash(cetable[index])] = distance;
+    }
+    shift[hash(cetable[cesize - 1])] = 1;
+    // for ignorables we just shift by one. see test examples.
+    shift[hash(0)] = 1;
+
+    // backwards table
+    for (index = 0; index < MAX_TABLE_SIZE_; index ++) {
+        backshift[index] = defaultbackward;
+    }
+    for (index = cesize - 1; index > 0; index --) {
+        // the plain value index does not seem to work
+        if (index > expansionsize) {
+            backshift[hash(cetable[index])] = index - expansionsize;
+        }
+        else {
+            backshift[hash(cetable[index])] = 1;
+        }
+    }
+    backshift[hash(cetable[0])] = 1;
+    backshift[hash(0)] = 1;
+}
+
+/**
+* Builds the pattern collation element list and the Boyer-Moore shift tables.
+* The Boyer-Moore shift moves by a number of characters in the text and then
+* tries to match the pattern from that offset, so the shift value must not be
+* so large that characters are missed. The default shift is therefore the
+* number of pattern ces minus the total expansion size, and never less than
+* 1: we err on the smaller shift size.
+* Canonical matching is meant to split the pattern into prefix accents (PA),
+* the middle string bounded by the first and last base character (MS) and
+* ending accents (EA), matching MS first and only then checking PA and EA.
+* If initialization fails or the pattern has no non-ignorable ce, the default
+* shift size is set to 0 and the shift tables are left alone.
+* @param strsrch UStringSearch data storage
+* @param status for errors if it occurs
+*/
+inline void initialize(UStringSearch *strsrch, UErrorCode *status)
+{
+    int32_t expandlength = initializePattern(strsrch, status);
+    UPattern *pattern = &strsrch->pattern;
+
+    if (U_FAILURE(*status) || pattern->CELength <= 0) {
+        pattern->defaultShiftSize = 0;
+        return;
+    }
+
+    int32_t cesize = pattern->CELength;
+    int32_t minlength = 1;
+    if (cesize > expandlength) {
+        minlength = cesize - expandlength;
+    }
+    pattern->defaultShiftSize = minlength;
+    setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
+                  cesize, expandlength, minlength, minlength);
+}
+
+/**
+* Determines whether the target text bounded by the offsets start and end
+* consists of one or more whole units of text, as defined by the break
+* iterator of the string search.
+* @param strsrch string search data
+* @param start target text start offset
+* @param end target text end offset
+* @return TRUE if no break iterator is set or if both offsets are boundaries,
+*         FALSE otherwise
+*/
+inline UBool isBreakUnit(const UStringSearch *strsrch, UTextOffset start,
+                         UTextOffset end)
+{
+    UBreakIterator *breakiterator = strsrch->search->breakIter;
+    if (breakiterator == NULL) {
+        // no break detection requested
+        return TRUE;
+    }
+
+    UTextOffset startindex = ubrk_first(breakiterator);
+    UTextOffset endindex = ubrk_last(breakiterator);
+
+    // out-of-range indexes are never boundary positions
+    if (start < startindex || start > endindex ||
+        end < startindex || end > endindex) {
+        return FALSE;
+    }
+
+    // an offset is a boundary if it is the first/last index, or if
+    // following() on the position before it returns the offset itself
+    if (start != startindex &&
+        ubrk_following(breakiterator, start - 1) != start) {
+        return FALSE;
+    }
+    if (end != endindex &&
+        ubrk_following(breakiterator, end - 1) != end) {
+        return FALSE;
+    }
+    return TRUE;
+}
+
+/**
+* Getting the next base character offset if the character at the current
+* offset is an accent, or the current offset if the current character
+* contains a base character.
+* "Accent" here means that the high byte of the character's fcd value is not
+* zero.
+* @param text string
+* @param textoffset current offset
+* @param textlength length of text string
+* @return the current offset if the current character contains a base
+*         character, otherwise the offset of the next character that does;
+*         textlength if textoffset is at or past the end of the text or if
+*         only accents remain up to the end of the text
+*/
+inline UTextOffset getNextBaseOffset(const UChar *text,
+                                     UTextOffset textoffset,
+                                     int32_t textlength)
+{
+    if (textoffset >= textlength) {
+        return textlength;
+    }
+
+    UTextOffset temp = textoffset;
+    if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
+        // the current character already contains a base character
+        return textoffset;
+    }
+
+    while (temp < textlength) {
+        // getFCD advances temp, so remember where this character starts
+        UTextOffset result = temp;
+        if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_)
+                                                                    == 0) {
+            return result;
+        }
+    }
+    // Only accents up to the end of the text: there is no further base
+    // character. Returning the offset of the last accent here would leave
+    // the caller in the middle of the accent sequence.
+    return textlength;
+}
+
+/**
+* Gets the next base character offset depending on the string search pattern
+* data.
+* The adjustment is only attempted when the pattern ends with accents and
+* the character in front of textoffset lies inside the text; that character
+* is never read when textoffset is 0 or beyond the end of the text.
+* @param strsrch string search data
+* @param textoffset current offset, one offset away from the last character
+*                   to search for.
+* @return start index of the next base character if the preceding character
+*         ends with an accent, otherwise the current offset; offsets past the
+*         end of the text are clamped to the text length.
+*/
+inline UTextOffset getNextUStringSearchBaseOffset(UStringSearch *strsrch,
+                                                  UTextOffset textoffset)
+{
+    int32_t textlength = strsrch->search->textLength;
+    // shiftForward can hand in an offset beyond the text, so the bounds have
+    // to be checked before looking at text[textoffset - 1]
+    if (strsrch->pattern.hasSuffixAccents &&
+        textoffset > 0 && textoffset <= textlength) {
+        UChar32 codepoint;
+        UTextOffset temp = textoffset;
+        const UChar *text = strsrch->search->text;
+        UTF_PREV_CHAR(text, 0, temp, codepoint);
+        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
+            return getNextBaseOffset(text, textoffset, textlength);
+        }
+    }
+    if (textoffset > textlength) {
+        return textlength;
+    }
+    return textoffset;
+}
+
+/**
+* Moves the text collation element iterator forward to get ready for the
+* next match attempt.
+* When overlapping matches are wanted and the offset is inside the text the
+* shift is a single position; otherwise the shift comes from the forward
+* shift table, reduced when the mismatch happened in the middle of the
+* pattern. The resulting offset is then moved to the next base character if
+* required.
+* @param strsrch string search data
+* @param textoffset start text position to do search
+* @param ce the text ce which failed the match.
+* @param patternceindex index of the ce within the pattern ce buffer which
+*                       failed the match
+* @param status error if any
+* @return final offset, or the unchanged textoffset if status already showed
+*         a failure
+*/
+inline UTextOffset shiftForward(UStringSearch *strsrch,
+                                UTextOffset textoffset,
+                                uint32_t ce,
+                                int32_t patternceindex,
+                                UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return textoffset;
+    }
+
+    int32_t advance = 1;
+    if (!(textoffset < strsrch->search->textLength &&
+          strsrch->search->isOverlap)) {
+        advance = strsrch->pattern.shift[hash(ce)];
+        // adjust for a match that failed on a ce in the middle of the
+        // pattern
+        int32_t adjust = strsrch->pattern.CELength - patternceindex;
+        if (adjust > 1 && advance >= adjust) {
+            advance -= adjust - 1;
+        }
+    }
+
+    // unsafe characters:
+    // * start or middle of a contraction: to be done after an initial match
+    //   is found
+    // * thai or lao base consonant character: similar to contraction
+    // * high surrogate character: similar to contraction
+    // * next character is an accent: shift to the next base character
+    UTextOffset target = getNextUStringSearchBaseOffset(strsrch,
+                                                   textoffset + advance);
+    ucol_setOffset(strsrch->textIter, target, status);
+    return target;
+}
+
+/**
+* Records that no match was found: the matched index becomes USEARCH_DONE,
+* the matched length 0, and the text iterator is moved to the end of the
+* text for a forward search or to offset 0 otherwise.
+* @param strsrch string search data
+* @param status error status if any
+*/
+inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode *status)
+{
+    strsrch->search->matchedLength = 0;
+    strsrch->search->matchedIndex = USEARCH_DONE;
+
+    UTextOffset restingoffset = 0;
+    if (strsrch->search->isForwardSearching) {
+        restingoffset = strsrch->search->textLength;
+    }
+    ucol_setOffset(strsrch->textIter, restingoffset, status);
+}
+
+/**
+* Gets the offset of the next safe point in text, ie. not the middle of a
+* contraction, swappable characters or supplementary characters.
+* @param collator collation data
+* @param text string to work with
+* @param textoffset offset in string
+* @param textlength length of text string
+* @return offset of the first character at or after textoffset that the
+*         collator does not flag as unsafe, textlength if there is none
+*/
+inline UTextOffset getNextSafeOffset(const UCollator *collator,
+                                     const UChar *text,
+                                     UTextOffset textoffset,
+                                     int32_t textlength)
+{
+    UTextOffset position;
+    // step over every unsafe code unit, starting with the one at textoffset
+    for (position = textoffset;
+         position != textlength && ucol_unsafeCP(text[position], collator);
+         position ++) {
+    }
+    return position;
+}
+
+/**
+* This checks for accents in the potential match started with a
+* composite character.
+* This is really painful... we have to check that composite character do not
+* have any extra accents. We have to normalize the potential match and find
+* the immediate decomposed character before the match.
+* The first composite character would have been taken care of by the fcd
+* checks in checkForwardExactMatch.
+* This is the slow path after the fcd of the first character and
+* the last character has been checked by checkForwardExactMatch and we
+* determine that the potential match has extra non-ignorable preceding
+* ces.
+* Only done when the pattern has prefix accents and the first character of
+* the potential match is not in NFD form. The text up to just past the next
+* safe offset is normalized to NFD, on the stack if it fits and on the heap
+* otherwise, and its ces are scanned up to the first pattern ce.
+* @param strsrch string search data
+* @param start index of the potential unfriendly composite character
+* @param end index of the potential unfriendly composite character
+* @param status error status if any
+* @return TRUE if there is non-ignorable accents before at the beginning
+*         of the match or if memory could not be allocated, FALSE otherwise.
+*/
+UBool checkExtraMatchAccents(const UStringSearch *strsrch, UTextOffset start,
+                             UTextOffset end,
+                             UErrorCode *status)
+{
+    UBool result = FALSE;
+    if (strsrch->pattern.hasPrefixAccents) {
+        UTextOffset length = end - start;
+        UChar32 codepoint;
+        UTextOffset offset = 0;
+        const UChar *text = strsrch->search->text + start;
+
+        UTF_NEXT_CHAR(text, offset, length, codepoint);
+        // we are only concerned with the first composite character
+        if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
+            UTextOffset safeoffset = getNextSafeOffset(
+                                    strsrch->collator, text, 0, length);
+            if (safeoffset != length) {
+                safeoffset ++;
+            }
+            UChar *norm = NULL;
+            UChar buffer[INITIAL_ARRAY_SIZE_];
+            // remember whether status was clean before the first attempt, so
+            // that only the overflow reported by that attempt is cleared
+            UBool cleanstatus = U_SUCCESS(*status);
+            int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
+                                           buffer, INITIAL_ARRAY_SIZE_,
+                                           status);
+            if (size >= INITIAL_ARRAY_SIZE_) {
+                norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
+                                               status);
+                if (norm == NULL) {
+                    return TRUE;
+                }
+                // the stack buffer was too small, which leaves a failure in
+                // status; it has to be cleared or the retry would do nothing
+                if (cleanstatus) {
+                    *status = U_ZERO_ERROR;
+                }
+                // pass the full allocated capacity so that the result can
+                // be null-terminated
+                size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
+                                       size + 1, status);
+            }
+            else {
+                norm = buffer;
+            }
+
+            // TODO: keeping pattern iterator and setting text here
+            UCollationElements *coleiter =
+                  ucol_openElements(strsrch->collator, norm, size, status);
+            uint32_t firstce = strsrch->pattern.CE[0];
+            UBool ignorable = TRUE;
+            uint32_t ce = UCOL_IGNORABLE;
+            // stop at the end of the ces as well: without that check the
+            // loop would never end if the first pattern ce does not show up
+            while (U_SUCCESS(*status) && ce != firstce &&
+                   ce != UCOL_NULLORDER) {
+                offset = ucol_getOffset(coleiter);
+                if (ce != firstce && ce != UCOL_IGNORABLE) {
+                    ignorable = FALSE;
+                }
+                ce = ucol_next(coleiter, status);
+            }
+
+            ucol_closeElements(coleiter);
+            // character just before the position where the scan stopped
+            UTF_PREV_CHAR(norm, 0, offset, codepoint);
+            result = !ignorable && (u_getCombiningClass(codepoint) != 0);
+
+            if (norm != buffer) {
+                uprv_free(norm);
+            }
+        }
+    }
+
+    return result;
+}
+
+/**
+* Used by exact matches, checks if there are accents before the match.
+* This is really painful... we have to check that composite characters at
+* the start of the matches have to not have any extra accents.
+* In order to determine that we have to normalize the first composite
+* character and find the immediate decomposed character before the match to
+* see if it is an non-ignorable accent.
+* Now normalizing the first composite character is enough because we ensure
+* that when the match is passed in here with extra beginning ces, the
+* first or last ce that match has to occur within the first character.
+* Nothing is checked unless the pattern has prefix accents. The text
+* iterator of the search is repositioned by this check.
+* NOTE(review): the ce scan runs until the first pattern ce is seen and only
+* stops early on an error status; it appears to rely on the caller passing
+* in a range that really produces that ce - to be confirmed.
+* @param strsrch string search data
+* @param start offset of the potential match
+* @param end offset of the potential match
+* @return TRUE if there are extra accents in front of the match or if an
+* error occurred while iterating, FALSE otherwise
+*/
+UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, UTextOffset start,
+ UTextOffset end)
+{
+ // TODO: Add example
+ if (strsrch->pattern.hasPrefixAccents) {
+ UCollationElements *coleiter = strsrch->textIter;
+ UErrorCode status = U_ZERO_ERROR;
+ // we have been iterating forwards previously
+ uint32_t ignorable = TRUE;
+ uint32_t firstce = strsrch->pattern.CE[0];
+
+ ucol_setOffset(coleiter, start, &status);
+ uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
+ while (ce != firstce) {
+ if (ce != UCOL_IGNORABLE) {
+ ignorable = FALSE;
+ }
+ ce = getCE(strsrch, ucol_next(coleiter, &status));
+ if (U_FAILURE(status)) {
+ return TRUE;
+ }
+ }
+ if (!ignorable && inNormBuf(coleiter)) {
+ // within normalization buffer, discontiguous handled here
+ return TRUE;
+ }
+
+ // within text: look at the fcd of the first character of the match
+ UTextOffset temp = start;
+ UBool accent = (getFCD(strsrch->search->text, &temp,
+ strsrch->search->textLength) >>
+ SECOND_LAST_BYTE_SHIFT_);
+ if (!accent) {
+ return checkExtraMatchAccents(strsrch, start, end, &status);
+ }
+ if (!ignorable) {
+ return TRUE;
+ }
+ if (start > 0) {
+ temp = start;
+ UChar32 previous;
+ UTF_PREV_CHAR(strsrch->search->text, 0, temp, previous);
+ if (getFCD(strsrch->search->text, &temp,
+ strsrch->search->textLength) & LAST_BYTE_MASK_) {
+ ucol_setOffset(coleiter, start, &status);
+ ce = ucol_previous(coleiter, &status);
+ if (U_FAILURE(status) ||
+ (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
+ return TRUE;
+ }
+ }
+ }
+ }
+
+ return FALSE;
+}
+
+/**
+* Used by exact matches, checks if there are accents after the match.
+* Note this is the initial boundary check. If the potential match
+* starts or ends with composite characters, the accents in those
+* characters will be determined later.
+* Not doing backwards iteration here, since discontiguous contraction for
+* backwards collation element iterator, use up too many characters.
+* Nothing is checked unless the pattern has suffix accents and the low byte
+* of the fcd of the last character of the match is not zero. The text
+* iterator of the search is repositioned by this check.
+* NOTE(review): the first scan runs until the first pattern ce is seen and
+* only stops early on an error status; it appears to rely on the caller
+* passing in a range that really produces that ce - to be confirmed.
+* @param strsrch string search data
+* @param start offset of match
+* @param end end offset of the match
+* @return TRUE if there are accents following the match or if an error
+* occurred while iterating, FALSE otherwise
+*/
+UBool hasAccentsAfterMatch(const UStringSearch *strsrch, UTextOffset start,
+ UTextOffset end)
+{
+ if (strsrch->pattern.hasSuffixAccents) {
+ const UChar *text = strsrch->search->text;
+ UChar32 lastchar = 0;
+ UTextOffset temp = end;
+ int32_t textlength = strsrch->search->textLength;
+ UTF_PREV_CHAR(text, 0, temp, lastchar);
+ if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
+ uint32_t firstce = strsrch->pattern.CE[0];
+ UCollationElements *coleiter = strsrch->textIter;
+ UErrorCode status = U_ZERO_ERROR;
+ ucol_setOffset(coleiter, start, &status);
+ while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
+ if (U_FAILURE(status)) {
+ return TRUE;
+ }
+ }
+ int32_t count = 1;
+ while (count < strsrch->pattern.CELength) {
+ ucol_next(coleiter, &status);
+ if (U_FAILURE(status)) {
+ return TRUE;
+ }
+ count ++;
+ }
+ uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
+ if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
+ if (ucol_getOffset(coleiter) == end) {
+ return TRUE;
+ }
+ if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
+ return TRUE;
+ }
+ }
+ }
+ }
+ return FALSE;
+}
+
+/**
+* Tests whether an offset lies outside of the text string.
+* An offset equal to textlength is considered to be in bounds.
+* @param textlength length of the text string
+* @param offset offset to be tested
+* @return TRUE if offset is negative or greater than textlength, FALSE
+*         otherwise
+*/
+inline UBool isOutOfBounds(int32_t textlength, UTextOffset offset)
+{
+    return offset < 0 || offset > textlength;
+}
+
+/**
+* Verifies a candidate match when the search strength is UCOL_IDENTICAL.
+* For any other strength the candidate is accepted without comparison.
+* For identical strength the candidate must have the same length as the
+* pattern text and compare equal to it with uprv_memcmp.
+* @param strsrch string search data
+* @param start start offset of the possible match
+* @param end end offset of the possible match
+* @return TRUE if the candidate passes the identical check, FALSE otherwise
+*/
+inline UBool checkIdentical(const UStringSearch *strsrch, UTextOffset start,
+                            UTextOffset end)
+{
+    // only identical strength requires the raw text comparison
+    if (strsrch->strength != UCOL_IDENTICAL) {
+        return TRUE;
+    }
+
+    int32_t matchlength = end - start;
+    if (matchlength != strsrch->pattern.textLength) {
+        return FALSE;
+    }
+
+    const UChar *candidate = strsrch->search->text + start;
+    return (uprv_memcmp(strsrch->pattern.text, candidate,
+                        matchlength * sizeof(UChar)) == 0);
+}
+
+/**
+* Decides whether a new candidate match repeats the previous match.
+* With overlapping enabled only the start offsets are compared against the
+* previous match index; with overlapping disabled the candidate is compared
+* against the extent of the previous match (forwards) or its start index
+* (backwards).
+* @param strsrch string search data
+* @param start new match start index
+* @param end new match end index
+* @return TRUE if the match is repeated, FALSE otherwise. Always FALSE when
+*         there is no previous match (matchedIndex is USEARCH_DONE).
+*/
+inline UBool checkRepeatedMatch(UStringSearch *strsrch,
+                                UTextOffset start,
+                                UTextOffset end)
+{
+    UTextOffset previous = strsrch->search->matchedIndex;
+    if (previous == USEARCH_DONE) {
+        return FALSE;
+    }
+
+    // TODO: example for overlapping case
+    if (strsrch->search->isOverlap) {
+        return strsrch->search->isForwardSearching ? (start <= previous)
+                                                   : (start >= previous);
+    }
+
+    // overlapping matches are not allowed
+    if (strsrch->search->isForwardSearching) {
+        return start < previous + strsrch->search->matchedLength;
+    }
+    return end > previous;
+}
+
+/**
+* Gets the collation element iterator's current offset.
+* When moving backwards, the offset reported by ucol_getOffset is
+* incremented by one if the iterator is in its normalization buffer and its
+* FCD pointer is not null.
+* @param coleiter collation element iterator
+* @param forwards flag TRUE if we are moving in the forwards direction
+* @return current offset
+*/
+inline UTextOffset getColElemIterOffset(const UCollationElements *coleiter,
+                                        UBool forwards)
+{
+    UTextOffset offset = ucol_getOffset(coleiter);
+    if (forwards) {
+        return offset;
+    }
+    // intricacies of the backwards collation element iterator
+    if (inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
+        return offset + 1;
+    }
+    return offset;
+}
+
+/**
+* Checks and sets the match information if found.
+* Checks
+*
+ * Let S' be the sub-string of a text string S between the offsets start and
+ * end
+ * A pattern string P matches a text string S at the offsets
+ * option 1. Some canonical equivalent of P matches some canonical equivalent
+ * of S'
+ * option 2. P matches S' and if P starts or ends with a combining mark,
+ * there exists no non-ignorable combining mark before or after S'
+ * in S respectively.
+ *
+ * Option 2. will be the default.
+ *
+ *
+ * Currently there are no composite characters that consists of a
+ * character with combining class > 0 before a character with combining
+ * class == 0. However, if such a character exists in the future,
+ * StringSearch does not guarantee the results for option 1.
+ *
+ * @see SearchIterator
+ * @see RuleBasedCollator
+ */
+
+class U_I18N_API StringSearch : public SearchIterator
+{
+public:
+
+ // public constructors and destructors --------------------------------
+
+ /**
+ * Creating a StringSearch instance using the argument locale
+ * language rule set. A collator will be created in the process, which
+ * will be owned by this instance and will be deleted in during
+ * destruction
+ * @param pattern The text for which this object will search.
+ * @param text The text in which to search for the pattern.
+ * @param locale A locale which defines the language-sensitive
+ * comparison rules used to determine whether text in the
+ * pattern and target matches.
+ * @param breakiter A BreakIterator object used to constrain
+ * the matches that are found. Matches whose start and end
+ * indices in the target text are not boundaries as
+ * determined by the BreakIterator are
+ * ignored. If this behavior is not desired,
+ * NULL can be passed in instead.
+ * @param status for errors if any
+ */
+ StringSearch(const UnicodeString &pattern, const UnicodeString &text,
+ const Locale &locale,
+ BreakIterator *breakiter,
+ UErrorCode &status);
+
+ /**
+ * Creating a StringSearch instance using the argument collator
+ * language rule set. Note, user retains the ownership of this collator,
+ * it does not get destroyed during this instance's destruction.
+ * @param pattern The text for which this object will search.
+ * @param text The text in which to search for the pattern.
+ * @param coll A RuleBasedCollator object which defines
+ * the language-sensitive comparison rules used to
+ * determine whether text in the pattern and target
+ * matches. User is responsible for the clearing of this
+ * object.
+ * @param breakiter A BreakIterator object used to constrain
+ * the matches that are found. Matches whose start and end
+ * indices in the target text are not boundaries as
+ * determined by the BreakIterator are
+ * ignored. If this behavior is not desired,
+ * NULL can be passed in instead.
+ * @param status for errors if any
+ */
+ StringSearch(const UnicodeString &pattern,
+ const UnicodeString &text,
+ RuleBasedCollator *coll,
+ BreakIterator *breakiter,
+ UErrorCode &status);
+
+ /**
+ * Creating a StringSearch instance using the argument locale
+ * language rule set. A collator will be created in the process, which
+ * will be owned by this instance and will be deleted in during
+ * destruction
+ *
+ * UnicodeString target("The quick brown fox jumped over the lazy fox");
+ * UnicodeString pattern("fox");
+ *
+ * SearchIterator *iter = new StringSearch(pattern, target);
+ *
+ * for (int pos = iter->first(); pos != USEARCH_DONE;
+ * pos = iter->next()) {
+ * printf("Found match at %d pos, length is %d\n", pos,
+ * iter->getMatchLength());
+ * }
+ *
+ * Let S' be the sub-string of a text string S between the offsets start and
+ * end
+ * A pattern string P matches a text string S at the offsets
+ * option 1. Some canonical equivalent of P matches some canonical equivalent
+ * of S'
+ * option 2. P matches S' and if P starts or ends with a combining mark,
+ * there exists no non-ignorable combining mark before or after S’
+ * in S respectively.
+ *
+ * Option 2. will be the default.
+ *
+ *
+ * Currently there are no composite characters that consists of a
+ * character with combining class > 0 before a character with combining
+ * class == 0. However, if such a character exists in the future, the
+ * search mechanism does not guarantee the results for option 1.
+ *
+ *
+ *
+ */
+
+/**
+* Sentinel value meaning that there is no (further) match.
+* DONE is returned by previous() and next() after all valid matches have
+* been returned, and by first() and last() if there are no matches at all.
+*/
+#define USEARCH_DONE -1
+
+/**
+* Data structure for searching.
+* Only a forward declaration is given here; the functions below take and
+* return pointers to it.
+*/
+struct UStringSearch;
+typedef struct UStringSearch UStringSearch;
+
+/**
+* Attributes of a string search that can be set with usearch_setAttribute
+* and queried with usearch_getAttribute.
+*/
+typedef enum {
+ /** Option for overlapping matches */
+ USEARCH_OVERLAP,
+ /**
+ Option for canonical matches. option 1 in header documentation.
+ The default value will be USEARCH_OFF
+ */
+ USEARCH_CANONICAL_MATCH,
+ /** Last enumerator; its value equals the number of attributes above */
+ USEARCH_ATTRIBUTE_COUNT
+} USearchAttribute;
+
+/**
+* Values that can be assigned to a USearchAttribute.
+*/
+typedef enum {
+ /** default value for any USearchAttribute */
+ USEARCH_DEFAULT = -1,
+ /** value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */
+ USEARCH_OFF,
+ /** value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */
+ USEARCH_ON,
+ /** Last enumerator, following USEARCH_ON; not an attribute value itself */
+ USEARCH_ATTRIBUTE_VALUE_COUNT
+} USearchAttributeValue;
+
+/* open and close ------------------------------------------------------ */
+
+/**
+* Creates a search iterator data struct using the argument locale language
+* rule set. A collator will be created in the process, which will be owned by
+* this search and will be deleted in usearch_close.
+* @param pattern for matching
+* @param patternlength length of the pattern, -1 for null-termination
+* @param text text string
+* @param textlength length of the text string, -1 for null-termination
+* @param locale name of locale for the rules to be used
+* @param breakiter A BreakIterator that will be used to restrict the points
+* at which matches are detected. If a match is found, but
+* the match's start or end index is not a boundary as
+* determined by the BreakIterator, the match will
+* be rejected and another will be searched for.
+* If this parameter is NULL, no break detection is
+* attempted.
+* @param status for errors if any occur
+* @return search iterator data structure, to be cleaned up with usearch_close
+*/
+U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
+ int32_t patternlength,
+ const UChar *text,
+ int32_t textlength,
+ const char *locale,
+ UBreakIterator *breakiter,
+ UErrorCode *status);
+
+/**
+* Creates a search iterator data struct using the argument collator language
+* rule set. Note, user retains the ownership of this collator, thus the
+* responsibility of deletion lies with the user.
+* @param pattern for matching
+* @param patternlength length of the pattern, -1 for null-termination
+* @param text text string
+* @param textlength length of the text string, -1 for null-termination
+* @param collator used for the language rules
+* @param breakiter A BreakIterator that will be used to restrict the points
+* at which matches are detected. If a match is found, but
+* the match's start or end index is not a boundary as
+* determined by the BreakIterator, the match will
+* be rejected and another will be searched for.
+* If this parameter is NULL, no break detection is
+* attempted.
+* @param status for errors if any occur
+* @return search iterator data structure, to be cleaned up with usearch_close
+*/
+U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
+ const UChar *pattern,
+ int32_t patternlength,
+ const UChar *text,
+ int32_t textlength,
+ const UCollator *collator,
+ UBreakIterator *breakiter,
+ UErrorCode *status);
+
+/**
+* Destroys and cleans up the search iterator data struct.
+* If a collator was created in usearch_open, it will be destroyed here.
+* @param strsrch data struct to clean up
+*/
+U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch);
+
+/* get and set methods -------------------------------------------------- */
+
+/**
+* Sets the current position in the text string which the next search will
+* start from. Clears previous states.
+* This method takes the argument index and sets the position in the text
+* string accordingly without checking if the index is pointing to a
+* valid starting point to begin searching.
+* Search positions that may render incorrect results are highlighted in the
+* header comments.
+* @param strsrch search iterator data struct
+* @param position position to start next search from.
+* @param status error status if any.
+*/
+U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
+ UTextOffset position,
+ UErrorCode *status);
+
+/**
+* Returns the current index in the string text being searched.
+* If the iteration has gone past the end of the text (or past the beginning
+* for a backwards search), {@link #USEARCH_DONE} is returned.
+* @param strsrch search iterator data struct
+* @return current index in the text being searched, or USEARCH_DONE
+*/
+U_CAPI UTextOffset U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch);
+
+/**
+* Sets the text searching attributes located in the enum USearchAttribute
+* with values from the enum USearchAttributeValue.
+* USEARCH_DEFAULT can be used for all attributes for resetting.
+* @param strsrch search iterator data struct
+* @param attribute text attribute to be set
+* @param value text attribute value
+* @param status for errors if any occur
+* @see #usearch_getAttribute
+*/
+U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
+ USearchAttribute attribute,
+ USearchAttributeValue value,
+ UErrorCode *status);
+
+/**
+* Gets the text searching attributes.
+* @param strsrch search iterator data struct
+* @param attribute text attribute to be retrieved
+* @return text attribute value
+* @see #usearch_setAttribute
+*/
+U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
+ const UStringSearch *strsrch,
+ USearchAttribute attribute);
+
+/**
+* Returns the index to the match in the text string that was searched.
+* This call returns a valid result only after a successful call to
+* {@link #usearch_first}, {@link #usearch_next},
+* {@link #usearch_previous}, or {@link #usearch_last}.
+* Just after construction, or after a searching method returns
+* USEARCH_DONE, this method will return USEARCH_DONE.
+*
+ * char *tgtstr = "The quick brown fox jumped over the lazy fox";
+ * char *patstr = "fox";
+ * UChar target[64];
+ * UChar pattern[16];
+ * UErrorCode status = U_ZERO_ERROR;
+ * u_uastrcpy(target, tgtstr);
+ * u_uastrcpy(pattern, patstr);
+ *
+ * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US",
+ * NULL, &status);
+ * if (U_SUCCESS(status)) {
+ * for (int pos = usearch_first(search);
+ * pos != USEARCH_DONE;
+ * pos = usearch_next(search)) {
+ * printf("Found match at %d pos, length is %d\n", pos,
+ * usearch_getMatchLength(search));
+ * }
+ * }
+ *
+*
+*
+*
+*