diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index ce88e8f15a0..3802981bbbe 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -72,7 +72,8 @@ unifltlg.o unirange.o uniset.o unitohex.o unum.o \ dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \ remtrans.o utrans.o \ titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \ -unifilt.o quant.o strmatch.o transreg.o +unifilt.o quant.o strmatch.o transreg.o usearch.o search.o stsearch.o + STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index c9e0f50b726..bc75ac84192 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -234,6 +234,10 @@ SOURCE=.\remtrans.cpp # End Source File # Begin Source File +SOURCE=.\search.cpp +# End Source File +# Begin Source File + SOURCE=.\simpletz.cpp # End Source File # Begin Source File @@ -250,6 +254,10 @@ SOURCE=.\strmatch.cpp # End Source File # Begin Source File +SOURCE=.\stsearch.cpp +# End Source File +# Begin Source File + SOURCE=.\tblcoll.cpp # End Source File # Begin Source File @@ -350,6 +358,10 @@ SOURCE=.\unum.cpp # End Source File # Begin Source File +SOURCE=.\usearch.cpp +# End Source File +# Begin Source File + SOURCE=.\utrans.cpp # End Source File # Begin Source File @@ -1119,6 +1131,25 @@ SOURCE=.\unicode\remtrans.h # End Source File # Begin Source File +SOURCE=.\unicode\search.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\search.h + +"..\..\include\unicode\search.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\search.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\simpletz.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -1204,6 +1235,25 @@ SOURCE=.\strmatch.h # End Source File # Begin Source File 
+SOURCE=.\unicode\stsearch.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\stsearch.h + +"..\..\include\unicode\stsearch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\stsearch.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\tblcoll.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -1676,6 +1726,29 @@ InputPath=.\unicode\unum.h # End Source File # Begin Source File +SOURCE=.\unicode\usearch.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\usearch.h + +"..\..\include\unicode\usearch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\usearch.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=.\usrchimp.h +# End Source File +# Begin Source File + SOURCE=.\unicode\utrans.h !IF "$(CFG)" == "i18n - Win32 Release" diff --git a/icu4c/source/i18n/search.cpp b/icu4c/source/i18n/search.cpp new file mode 100644 index 00000000000..879d704b77a --- /dev/null +++ b/icu4c/source/i18n/search.cpp @@ -0,0 +1,357 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. 
+**********************************************************************
+*/
+
+#include "unicode/brkiter.h"
+#include "unicode/schriter.h"
+#include "unicode/search.h"
+#include "usrchimp.h"
+#include "cmemory.h"
+
+// file-local helpers ----------------------------------------------------
+
+/**
+* Allocates a USearch block and fills in the default search state: no
+* break iterator, overlap and canonical matching switched off, forward
+* searching, reset pending and no current match.
+* @param text start of the text to be searched, may be NULL
+* @param textlength number of UChars in text
+* @return the new block, or NULL if the allocation failed
+*/
+static USearch * createSearchData(UChar *text, int32_t textlength)
+{
+    USearch *result = (USearch *)uprv_malloc(sizeof(USearch));
+    if (result != NULL) {
+        result->breakIter          = NULL;
+        result->isOverlap          = FALSE;
+        result->isCanonicalMatch   = FALSE;
+        result->isForwardSearching = TRUE;
+        result->reset              = TRUE;
+        result->matchedIndex       = USEARCH_DONE;
+        result->matchedLength      = 0;
+        result->text               = text;
+        result->textLength         = textlength;
+    }
+    return result;
+}
+
+// public constructors and destructors -----------------------------------
+
+/**
+* Copy constructor. Duplicates the text, the break iterator pointer and the
+* complete search state of the argument.
+* The object under construction is never compared with the argument:
+* m_search_ is not set yet at that point and operator== would both
+* dereference it and call the pure virtual getOffset().
+* @param other iterator to be copied
+*/
+SearchIterator::SearchIterator(const SearchIterator &other) :
+    m_breakiterator_(other.m_breakiterator_),
+    m_text_(other.m_text_)
+{
+    // point at our own copy of the text so that this object does not depend
+    // on the lifetime of the buffer held by other
+    m_search_ = createSearchData(m_text_.fArray,
+                                 other.m_search_->textLength);
+    if (m_search_ != NULL) {
+        m_search_->breakIter          = other.m_search_->breakIter;
+        m_search_->isCanonicalMatch   = other.m_search_->isCanonicalMatch;
+        m_search_->isOverlap          = other.m_search_->isOverlap;
+        // direction and reset flags are read by next() and previous(), they
+        // have to be copied too
+        m_search_->isForwardSearching = other.m_search_->isForwardSearching;
+        m_search_->reset              = other.m_search_->reset;
+        m_search_->matchedIndex       = other.m_search_->matchedIndex;
+        m_search_->matchedLength      = other.m_search_->matchedLength;
+    }
+}
+
+/**
+* Destructor. Releases the search data block if this object still holds one.
+*/
+SearchIterator::~SearchIterator()
+{
+    if (m_search_ != NULL) {
+        uprv_free(m_search_);
+    }
+}
+
+// public get and set methods ----------------------------------------
+
+/**
+* Switches the overlap or canonical match attribute on or off.
+* Any value other than USEARCH_ON switches the attribute off.
+* @param attribute attribute to be changed
+* @param value new value of the attribute
+* @param status set to U_ILLEGAL_ARGUMENT_ERROR for an unknown attribute or
+*        for the value USEARCH_ATTRIBUTE_VALUE_COUNT
+*/
+void SearchIterator::setAttribute(USearchAttribute       attribute,
+                                  USearchAttributeValue  value,
+                                  UErrorCode            &status)
+{
+    if (U_SUCCESS(status)) {
+        const UBool enabled = (value == USEARCH_ON ? TRUE : FALSE);
+        switch (attribute)
+        {
+        case USEARCH_OVERLAP :
+            m_search_->isOverlap = enabled;
+            break;
+        case USEARCH_CANONICAL_MATCH :
+            m_search_->isCanonicalMatch = enabled;
+            break;
+        default:
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+    }
+    // NOTE(review): this check runs after the attribute has been written
+    // and also replaces an error code that was already set on entry;
+    // kept as it is since callers may rely on it - confirm.
+    if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+}
+
+/**
+* Gets the current value of the overlap or canonical match attribute.
+* @param attribute attribute to be queried
+* @return USEARCH_ON or USEARCH_OFF, USEARCH_DEFAULT for any other
+*         attribute
+*/
+USearchAttributeValue SearchIterator::getAttribute(
+                                          USearchAttribute attribute) const
+{
+    switch (attribute) {
+    case USEARCH_ATTRIBUTE_COUNT :
+        return USEARCH_DEFAULT;
+    case USEARCH_OVERLAP :
+        return (m_search_->isOverlap == TRUE ? USEARCH_ON : USEARCH_OFF);
+    case USEARCH_CANONICAL_MATCH :
+        return (m_search_->isCanonicalMatch == TRUE ? USEARCH_ON :
+                                                      USEARCH_OFF);
+    }
+    return USEARCH_DEFAULT;
+}
+
+/**
+* @return start index of the current match, USEARCH_DONE if there is none
+*/
+UTextOffset SearchIterator::getMatchedStart() const
+{
+    return m_search_->matchedIndex;
+}
+
+/**
+* @return length of the current match, 0 if there is none
+*/
+int32_t SearchIterator::getMatchedLength() const
+{
+    return m_search_->matchedLength;
+}
+
+/**
+* Copies the currently matched text into result.
+* @param result receives the matched text, emptied if there is no match or
+*        the match has length 0
+*/
+void SearchIterator::getMatchedText(UnicodeString &result) const
+{
+    UTextOffset matchedindex  = m_search_->matchedIndex;
+    int32_t     matchedlength = m_search_->matchedLength;
+    if (matchedindex != USEARCH_DONE && matchedlength != 0) {
+        result.setTo(m_search_->text + matchedindex, matchedlength);
+    }
+    else {
+        result.remove();
+    }
+}
+
+/**
+* Sets the break iterator used to restrict match boundaries. The pointer
+* is stored, not copied.
+* @param breakiter break iterator, may be NULL
+* @param status nothing is changed if it already holds a failure
+*/
+void SearchIterator::setBreakIterator(BreakIterator *breakiter,
+                                      UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        m_search_->breakIter = NULL;
+        // the c++ breakiterator may not make use of ubreakiterator.
+        // so we'll have to keep track of it ourselves.
+        m_breakiterator_ = breakiter;
+    }
+}
+
+/**
+* @return break iterator passed to the constructor or setBreakIterator
+*/
+const BreakIterator * SearchIterator::getBreakIterator(void) const
+{
+    return m_breakiterator_;
+}
+
+/**
+* Replaces the text to be searched with a copy of the argument.
+* @param text new text, must not be empty
+* @param status set to U_ILLEGAL_ARGUMENT_ERROR for an empty text
+*/
+void SearchIterator::setText(const UnicodeString &text, UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        if (text.length() == 0) {
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+        else {
+            m_text_ = text;
+            m_search_->text = m_text_.fArray;
+            // the length has to follow the buffer, last(), next() and
+            // operator== all read it
+            m_search_->textLength = m_text_.length();
+        }
+    }
+}
+
+/**
+* Replaces the text to be searched with the text of a character iterator.
+* @param text iterator whose text is extracted
+* @param status for errors, see setText(const UnicodeString &, UErrorCode &)
+*/
+void SearchIterator::setText(CharacterIterator &text, UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        text.getText(m_text_);
+        setText(m_text_, status);
+    }
+}
+
+/**
+* @return text that is being searched
+*/
+const UnicodeString & SearchIterator::getText(void) const
+{
+    return m_text_;
+}
+
+// operator overloading ----------------------------------------------
+
+/**
+* Equality operator. Compares the break iterator pointer, the attributes,
+* the current match, the offset and the contents of the text.
+* @param that iterator to compare with
+* @return TRUE if all of the above are the same
+*/
+UBool SearchIterator::operator==(const SearchIterator &that) const
+{
+    if (this == &that) {
+        return TRUE;
+    }
+    return (m_breakiterator_            == that.m_breakiterator_ &&
+            m_search_->isCanonicalMatch == that.m_search_->isCanonicalMatch &&
+            m_search_->isOverlap        == that.m_search_->isOverlap &&
+            m_search_->matchedIndex     == that.m_search_->matchedIndex &&
+            m_search_->matchedLength    == that.m_search_->matchedLength &&
+            m_search_->textLength       == that.m_search_->textLength &&
+            getOffset() == that.getOffset() &&
+            (uprv_memcmp(m_search_->text, that.m_search_->text,
+                         m_search_->textLength * sizeof(UChar)) == 0));
+}
+
+// public methods ----------------------------------------------------
+
+/**
+* Searches forwards from the start of the text.
+* @param status for errors
+* @return result of handleNext(0, status)
+*/
+UTextOffset SearchIterator::first(UErrorCode &status)
+{
+    setOffset(0, status);
+    return handleNext(0, status);
+}
+
+/**
+* Searches forwards from the argument position.
+* @param position offset to start at
+* @param status for errors
+* @return result of handleNext(position, status)
+*/
+UTextOffset SearchIterator::following(UTextOffset position,
+                                      UErrorCode &status)
+{
+    setOffset(position, status);
+    return handleNext(position, status);
+}
+
+/**
+* Searches backwards from the end of the text.
+* @param status for errors
+* @return result of handlePrev(text length, status)
+*/
+UTextOffset SearchIterator::last(UErrorCode &status)
+{
+    setOffset(m_search_->textLength, status);
+    return handlePrev(m_search_->textLength, status);
+}
+
+/**
+* Searches backwards from the argument position.
+* @param position offset to start at
+* @param status for errors
+* @return result of handlePrev(position, status)
+*/
+UTextOffset SearchIterator::preceding(UTextOffset position,
+                                      UErrorCode &status)
+{
+    setOffset(position, status);
+    return handlePrev(position, status);
+}
+
+/**
+* Moves to the next match in the forward direction.
+* @param status for errors, USEARCH_DONE is returned if it holds a failure
+* @return index of the match, USEARCH_DONE if there is none
+*/
+UTextOffset SearchIterator::next(UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return USEARCH_DONE;
+    }
+
+    UTextOffset offset      = getOffset();
+    UTextOffset matchindex  = m_search_->matchedIndex;
+    int32_t     matchlength = m_search_->matchedLength;
+    m_search_->reset = FALSE;
+    if (m_search_->isForwardSearching == TRUE) {
+        int32_t textlength = m_search_->textLength;
+        if (offset == textlength || matchindex == textlength ||
+            (matchindex != USEARCH_DONE &&
+             matchindex + matchlength >= textlength)) {
+            // not enough characters to match
+            setMatchNotFound();
+            return USEARCH_DONE;
+        }
+    }
+    else {
+        // switching direction.
+        // if matchedIndex == USEARCH_DONE, it means that either a
+        // setOffset has been called or that previous ran off the text
+        // string. the iterator would have been set to offset 0 if a
+        // match is not found.
+        m_search_->isForwardSearching = TRUE;
+        if (m_search_->matchedIndex != USEARCH_DONE) {
+            // there's no need to set the collation element iterator
+            // the next call to next will set the offset.
+            return matchindex;
+        }
+    }
+
+    if (matchindex != USEARCH_DONE) {
+        return handleNext(matchindex + matchlength, status);
+    }
+    return handleNext(offset, status);
+}
+
+/**
+* Moves to the next match in the backward direction. Directly after a
+* reset the search starts from the end of the text.
+* @param status for errors, USEARCH_DONE is returned if it holds a failure
+* @return index of the match, USEARCH_DONE if there is none
+*/
+UTextOffset SearchIterator::previous(UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return USEARCH_DONE;
+    }
+
+    UTextOffset offset;
+    if (m_search_->reset) {
+        offset                        = m_search_->textLength;
+        m_search_->isForwardSearching = FALSE;
+        m_search_->reset              = FALSE;
+    }
+    else {
+        offset = getOffset();
+    }
+
+    UTextOffset matchindex = m_search_->matchedIndex;
+    if (m_search_->isForwardSearching == TRUE) {
+        // switching direction.
+        // if matchedIndex == USEARCH_DONE, it means that either a
+        // setOffset has been called or that next ran off the text
+        // string. the iterator would have been set to offset textLength if
+        // a match is not found.
+        m_search_->isForwardSearching = FALSE;
+        if (matchindex != USEARCH_DONE) {
+            return matchindex;
+        }
+    }
+    else {
+        if (offset == 0 || matchindex == 0) {
+            // not enough characters to match
+            setMatchNotFound();
+            return USEARCH_DONE;
+        }
+    }
+
+    if (matchindex != USEARCH_DONE) {
+        return handlePrev(matchindex, status);
+    }
+    return handlePrev(offset, status);
+}
+
+/**
+* Discards the current match and restores the default attributes and
+* search direction.
+*/
+void SearchIterator::reset()
+{
+    // has to come first, it reads the direction that is still in effect
+    setMatchNotFound();
+    m_search_->isOverlap          = FALSE;
+    m_search_->isCanonicalMatch   = FALSE;
+    m_search_->isForwardSearching = TRUE;
+    m_search_->reset              = TRUE;
+}
+
+// protected constructors and destructors -----------------------------
+
+/**
+* Default constructor. No text and no break iterator.
+*/
+SearchIterator::SearchIterator() : m_breakiterator_(NULL)
+{
+    m_search_ = createSearchData(NULL, 0);
+}
+
+/**
+* Constructor that searches a copy of the argument text.
+* @param text text to be searched
+* @param breakiter break iterator restricting the matches, may be NULL
+*/
+SearchIterator::SearchIterator(const UnicodeString &text,
+                                     BreakIterator *breakiter) :
+                                     m_breakiterator_(breakiter),
+                                     m_text_(text)
+{
+    m_search_ = createSearchData(m_text_.fArray, m_text_.length());
+}
+
+/**
+* Constructor that searches the text extracted from a character iterator.
+* @param text iterator providing the text to be searched
+* @param breakiter break iterator restricting the matches, may be NULL
+*/
+SearchIterator::SearchIterator(CharacterIterator &text,
+                               BreakIterator     *breakiter) :
+                               m_breakiterator_(breakiter)
+{
+    text.getText(m_text_);
+    m_search_ = createSearchData(m_text_.fArray, m_text_.length());
+}
+
+// protected methods ------------------------------------------------------
+
+/**
+* Sets the length of the current match.
+* @param length new match length
+*/
+void SearchIterator::setMatchLength(int32_t length)
+{
+    m_search_->matchedLength = length;
+}
+
+/**
+* Sets the start index of the current match.
+* @param position new match start
+*/
+void SearchIterator::setMatchStart(UTextOffset position)
+{
+    m_search_->matchedIndex = position;
+}
+
+/**
+* Marks the iterator as having no match and moves the offset to the end of
+* the text when searching forwards, to the start when searching backwards.
+*/
+void SearchIterator::setMatchNotFound()
+{
+    setMatchStart(USEARCH_DONE);
+    setMatchLength(0);
+    UErrorCode status = U_ZERO_ERROR;
+    // by default no errors should be returned here since offsets are within
+    // range.
+    if (m_search_->isForwardSearching) {
+        setOffset(m_search_->textLength, status);
+    }
+    else {
+        setOffset(0, status);
+    }
+}
+
+
diff --git a/icu4c/source/i18n/stsearch.cpp b/icu4c/source/i18n/stsearch.cpp
new file mode 100644
index 00000000000..06d056b5bd8
--- /dev/null
+++ b/icu4c/source/i18n/stsearch.cpp
@@ -0,0 +1,387 @@
+/*
+**********************************************************************
+* Copyright (C) 2001 IBM and others. All rights reserved.
+**********************************************************************
+* Date Name Description
+* 03/22/2000 helena Creation.
+**********************************************************************
+*/
+
+#include "unicode/stsearch.h"
+#include "cmemory.h"
+#include "usrchimp.h"
+
+// public constructors and destructors -----------------------------------
+
+/**
+* Creates a search for pattern in text using the collator of a locale.
+* The USearch block allocated by the base class is released; on success
+* m_search_ points at the block owned by the underlying UStringSearch.
+* @param pattern pattern to look for
+* @param text text to be searched
+* @param locale locale whose collator is used
+* @param breakiter break iterator restricting the matches, may be NULL
+* @param status for errors; m_search_ stays NULL on failure
+*/
+StringSearch::StringSearch(const UnicodeString &pattern,
+                           const UnicodeString &text,
+                           const Locale        &locale,
+                                 BreakIterator *breakiter,
+                                 UErrorCode    &status) :
+                           SearchIterator(text, breakiter),
+                           m_collator_(),
+                           m_pattern_(pattern)
+{
+    m_strsrch_ = usearch_open(m_pattern_.fArray, m_pattern_.fLength,
+                              m_text_.fArray, m_text_.fLength,
+                              locale.getName(), NULL, &status);
+    uprv_free(m_search_);
+    m_search_ = NULL;
+
+    if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+        int32_t length;
+        const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+        m_collation_rules_.setTo(rules, length);
+        m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                 &m_collation_rules_);
+        // share the search data of the UStringSearch with the base class
+        m_search_ = m_strsrch_->search;
+    }
+}
+
+/**
+* Creates a search for pattern in text using the argument collator.
+* @param pattern pattern to look for
+* @param text text to be searched
+* @param coll collator to be used, U_ILLEGAL_ARGUMENT_ERROR if NULL
+* @param breakiter break iterator restricting the matches, may be NULL
+* @param status for errors; m_search_ stays NULL on failure
+*/
+StringSearch::StringSearch(const UnicodeString     &pattern,
+                           const UnicodeString     &text,
+                                 RuleBasedCollator *coll,
+                                 BreakIterator     *breakiter,
+                                 UErrorCode        &status) :
+                           SearchIterator(text, breakiter),
+                           m_collator_(),
+                           m_pattern_(pattern)
+{
+    // the block of the base class is never used by this class. release it
+    // before any early return, the destructor of this class resets
+    // m_search_ and the base class would not free it anymore.
+    uprv_free(m_search_);
+    m_search_ = NULL;
+
+    if (coll == NULL) {
+        status     = U_ILLEGAL_ARGUMENT_ERROR;
+        m_strsrch_ = NULL;
+        return;
+    }
+    m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray,
+                                          m_pattern_.fLength, m_text_.fArray,
+                                          m_text_.fLength, coll->ucollator,
+                                          NULL, &status);
+
+    if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+        int32_t length;
+        const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+        m_collation_rules_.setTo(rules, length);
+        m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                 &m_collation_rules_);
+        // share the search data of the UStringSearch with the base class
+        m_search_ = m_strsrch_->search;
+    }
+}
+
+/**
+* Creates a search for pattern in the text of a character iterator using
+* the collator of a locale.
+* @param pattern pattern to look for
+* @param text iterator providing the text to be searched
+* @param locale locale whose collator is used
+* @param breakiter break iterator restricting the matches, may be NULL
+* @param status for errors; m_search_ stays NULL on failure
+*/
+StringSearch::StringSearch(const UnicodeString     &pattern,
+                                 CharacterIterator &text,
+                           const Locale            &locale,
+                                 BreakIterator     *breakiter,
+                                 UErrorCode        &status) :
+                           SearchIterator(text, breakiter),
+                           m_collator_(),
+                           m_pattern_(pattern)
+{
+    m_strsrch_ = usearch_open(m_pattern_.fArray, m_pattern_.fLength,
+                              m_text_.fArray, m_text_.fLength,
+                              locale.getName(), NULL, &status);
+    uprv_free(m_search_);
+    m_search_ = NULL;
+
+    if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+        int32_t length;
+        const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+        m_collation_rules_.setTo(rules, length);
+        m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                 &m_collation_rules_);
+        // share the search data of the UStringSearch with the base class
+        m_search_ = m_strsrch_->search;
+    }
+}
+
+/**
+* Creates a search for pattern in the text of a character iterator using
+* the argument collator.
+* @param pattern pattern to look for
+* @param text iterator providing the text to be searched
+* @param coll collator to be used, U_ILLEGAL_ARGUMENT_ERROR if NULL
+* @param breakiter break iterator restricting the matches, may be NULL
+* @param status for errors; m_search_ stays NULL on failure
+*/
+StringSearch::StringSearch(const UnicodeString     &pattern,
+                                 CharacterIterator &text,
+                                 RuleBasedCollator *coll,
+                                 BreakIterator     *breakiter,
+                                 UErrorCode        &status) :
+                           SearchIterator(text, breakiter),
+                           m_collator_(),
+                           m_pattern_(pattern)
+{
+    // see the UnicodeString version: release the base class block before
+    // any early return so that it cannot leak
+    uprv_free(m_search_);
+    m_search_ = NULL;
+
+    if (coll == NULL) {
+        status     = U_ILLEGAL_ARGUMENT_ERROR;
+        m_strsrch_ = NULL;
+        return;
+    }
+    m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray,
+                                          m_pattern_.fLength, m_text_.fArray,
+                                          m_text_.fLength, coll->ucollator,
+                                          NULL, &status);
+
+    if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+        int32_t length;
+        const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+        m_collation_rules_.setTo(rules, length);
+        m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                 &m_collation_rules_);
+        // share the search data of the UStringSearch with the base class
+        m_search_ = m_strsrch_->search;
+    }
+}
+
+/**
+* Copy constructor. Opens a new UStringSearch over copies of the pattern
+* and text of the argument, using the collator of the argument. Attributes,
+* offset and match state of the argument are not transferred.
+* @param that search to be copied
+*/
+StringSearch::StringSearch(const StringSearch &that) :
+                       SearchIterator(that.m_text_, that.m_breakiterator_),
+                       m_collator_(),
+                       m_pattern_(that.m_pattern_)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    if (that.m_strsrch_ == NULL) {
+        m_strsrch_ = NULL;
+        status     = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    else {
+        m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray,
+                                              m_pattern_.fLength,
+                                              m_text_.fArray, m_text_.fLength,
+                                              that.m_strsrch_->collator,
+                                              NULL, &status);
+    }
+    uprv_free(m_search_);
+    m_search_ = NULL;
+
+    if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+        int32_t length;
+        const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+        m_collation_rules_.setTo(rules, length);
+        m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                 &m_collation_rules_);
+        // share the search data of the UStringSearch with the base class
+        m_search_        = m_strsrch_->search;
+        m_breakiterator_ = that.m_breakiterator_;
+    }
+}
+
+/**
+* Destructor. Closes the UStringSearch and clears m_search_ so that the
+* base class destructor does not free it.
+*/
+StringSearch::~StringSearch()
+{
+    usearch_close(m_strsrch_);
+    m_search_ = NULL;
+}
+
+// operator overloading ---------------------------------------------
+
+/**
+* Assignment operator. Nothing is done if both objects already compare
+* equal. Otherwise text, pattern and break iterator are copied and the
+* UStringSearch is reopened with the collator of the argument.
+* If the argument holds no UStringSearch, or reopening fails, m_search_ is
+* left NULL.
+* @param that search to be copied
+* @return this object
+*/
+StringSearch & StringSearch::operator=(const StringSearch &that)
+{
+    if (this == &that) {
+        return *this;
+    }
+    // operator== dereferences the search data of both sides, only use it
+    // when both are present
+    if (m_strsrch_ != NULL && that.m_strsrch_ != NULL && (*this) == that) {
+        return *this;
+    }
+
+    m_text_          = that.m_text_;
+    m_breakiterator_ = that.m_breakiterator_;
+    m_pattern_       = that.m_pattern_;
+    // all m_search_ in the parent class is linked up with m_strsrch_
+    usearch_close(m_strsrch_);
+    m_strsrch_ = NULL;
+    m_search_  = NULL;
+
+    if (that.m_strsrch_ != NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+        m_strsrch_ = usearch_openFromCollator(m_pattern_.fArray,
+                                              m_pattern_.fLength,
+                                              m_text_.fArray,
+                                              m_text_.fLength,
+                                              that.m_strsrch_->collator,
+                                              NULL, &status);
+        if (U_SUCCESS(status) && m_strsrch_ != NULL) {
+            int32_t length;
+            const UChar *rules = ucol_getRules(m_strsrch_->collator,
+                                               &length);
+            m_collation_rules_.setTo(rules, length);
+            m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                                     &m_collation_rules_);
+            m_search_ = m_strsrch_->search;
+        }
+    }
+    return *this;
+}
+
+/**
+* Equality operator. In addition to the base class comparison the pattern
+* and the collator pointer have to be the same.
+* NOTE(review): the argument is cast to StringSearch without a type check,
+* presumably callers only compare objects of this class - confirm.
+* @param that iterator to compare with
+* @return TRUE if both objects are equal
+*/
+UBool StringSearch::operator==(const SearchIterator &that) const
+{
+    if (this == &that) {
+        return TRUE;
+    }
+    if (SearchIterator::operator ==(that)) {
+        StringSearch &thatsrch = (StringSearch &)that;
+        return (this->m_pattern_ == thatsrch.m_pattern_ &&
+                this->m_strsrch_->collator == thatsrch.m_strsrch_->collator);
+    }
+    return FALSE;
+}
+
+// public get and set methods ----------------------------------------
+
+/**
+* Sets the offset of the underlying UStringSearch.
+* @param position new offset in the text
+* @param status for errors
+*/
+void StringSearch::setOffset(UTextOffset position, UErrorCode &status)
+{
+    usearch_setOffset(m_strsrch_, position, &status);
+}
+
+/**
+* @return current offset of the underlying UStringSearch
+*/
+UTextOffset StringSearch::getOffset(void) const
+{
+    return usearch_getOffset(m_strsrch_);
+}
+
+/**
+* Replaces the text to be searched with a copy of the argument.
+* @param text new text
+* @param status for errors, nothing is changed if it already holds a
+*        failure
+*/
+void StringSearch::setText(const UnicodeString &text, UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        m_text_ = text;
+        // hand over the buffer of our own copy, as the constructors and
+        // the CharacterIterator version do, not the buffer of the caller
+        usearch_setText(m_strsrch_, m_text_.fArray, m_text_.fLength,
+                        &status);
+    }
+}
+
+/**
+* Replaces the text to be searched with the text of a character iterator.
+* @param text iterator providing the new text
+* @param status for errors, nothing is changed if it already holds a
+*        failure
+*/
+void StringSearch::setText(CharacterIterator &text, UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        text.getText(m_text_);
+        usearch_setText(m_strsrch_, m_text_.fArray, m_text_.fLength,
+                        &status);
+    }
+}
+
+/**
+* @return collator wrapper kept by this object, the object stays owned by
+*         this search
+*/
+RuleBasedCollator * StringSearch::getCollator() const
+{
+    return (RuleBasedCollator *)&m_collator_;
+}
+
+/**
+* Changes the collator used for searching.
+* @param coll new collator, U_ILLEGAL_ARGUMENT_ERROR if NULL
+* @param status for errors, nothing is changed if it already holds a
+*        failure
+*/
+void StringSearch::setCollator(RuleBasedCollator *coll, UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (coll == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    usearch_setCollator(m_strsrch_, coll->getUCollator(), &status);
+    // bring the wrapper in line with the collator the search holds now,
+    // in the same way as the constructors do
+    int32_t length;
+    const UChar *rules = ucol_getRules(m_strsrch_->collator, &length);
+    m_collation_rules_.setTo(rules, length);
+    m_collator_.setUCollator((UCollator *)m_strsrch_->collator,
+                             &m_collation_rules_);
+}
+
+/**
+* Replaces the pattern to look for with a copy of the argument.
+* @param pattern new pattern
+* @param status for errors, nothing is changed if it already holds a
+*        failure
+*/
+void StringSearch::setPattern(const UnicodeString &pattern,
+                                    UErrorCode    &status)
+{
+    if (U_SUCCESS(status)) {
+        m_pattern_ = pattern;
+        usearch_setPattern(m_strsrch_, m_pattern_.fArray, m_pattern_.fLength,
+                           &status);
+    }
+}
+
+/**
+* @return pattern that is searched for
+*/
+const UnicodeString & StringSearch::getPattern() const
+{
+    return m_pattern_;
+}
+
+// public methods ----------------------------------------------------
+
+/**
+* Resets the underlying UStringSearch.
+*/
+void StringSearch::reset()
+{
+    usearch_reset(m_strsrch_);
+}
+
+/**
+* Creates a new StringSearch with the same pattern, text, collator and
+* break iterator, positioned at the same offset and match.
+* @return the clone, owned by the caller, or NULL if it could not be
+*         created
+*/
+SearchIterator * StringSearch::safeClone(void) const
+{
+    if (m_strsrch_ == NULL) {
+        // nothing to clone from
+        return NULL;
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    StringSearch *result = new StringSearch(m_pattern_, m_text_,
+                                            (RuleBasedCollator *)&m_collator_,
+                                            m_breakiterator_,
+                                            status);
+    if (result == NULL) {
+        return NULL;
+    }
+    // a clone that failed to open has no search data, it must not be
+    // touched
+    if (U_SUCCESS(status)) {
+        result->setOffset(getOffset(), status);
+        result->setMatchStart(m_strsrch_->search->matchedIndex);
+        result->setMatchLength(m_strsrch_->search->matchedLength);
+    }
+    if (U_FAILURE(status)) {
+        delete result;
+        return NULL;
+    }
+    return result;
+}
+
+// protected method -------------------------------------------------
+
+/**
+* Searches forwards for the next match. Matches whose start or end is not
+* a boundary of the break iterator are skipped.
+* NOTE(review): for a pattern without collation elements the match index
+* is updated but USEARCH_DONE is returned, handlePrev returns the index in
+* that case - confirm which one is intended.
+* @param position offset to start searching at
+* @param status for errors
+* @return index of the match, USEARCH_DONE if there is none
+*/
+UTextOffset StringSearch::handleNext(int32_t position, UErrorCode &status)
+{
+    // values passed here are already in the pre-shift position
+    if (U_SUCCESS(status)) {
+        if (m_strsrch_->pattern.CELength == 0) {
+            m_search_->matchedIndex =
+                                    m_search_->matchedIndex == USEARCH_DONE ?
+                                    getOffset() : m_search_->matchedIndex + 1;
+            m_search_->matchedLength = 0;
+            ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex,
+                           &status);
+            if (m_search_->matchedIndex == m_search_->textLength) {
+                m_search_->matchedIndex = USEARCH_DONE;
+            }
+        }
+        else {
+            // looking at usearch.cpp, this part is shifted out to
+            // StringSearch instead of SearchIterator because m_strsrch_ is
+            // not accessible in SearchIterator
+            if (!m_search_->isOverlap &&
+                position + m_strsrch_->pattern.defaultShiftSize >
+                m_search_->textLength) {
+                setMatchNotFound();
+                return USEARCH_DONE;
+            }
+            while (TRUE) {
+                if (m_search_->isCanonicalMatch) {
+                    // can't use exact here since extra accents are allowed.
+                    usearch_handleNextCanonical(m_strsrch_, &status);
+                }
+                else {
+                    usearch_handleNextExact(m_strsrch_, &status);
+                }
+                if (U_FAILURE(status)) {
+                    return USEARCH_DONE;
+                }
+                if (m_breakiterator_ == NULL ||
+                    m_search_->matchedIndex == USEARCH_DONE ||
+                    (m_breakiterator_->isBoundary(m_search_->matchedIndex) &&
+                     m_breakiterator_->isBoundary(m_search_->matchedIndex +
+                                                  m_search_->matchedLength))) {
+                    return m_search_->matchedIndex;
+                }
+            }
+        }
+    }
+    return USEARCH_DONE;
+}
+
+/**
+* Searches backwards for the previous match. Matches whose start or end is
+* not a boundary of the break iterator are skipped.
+* @param position offset to start searching at
+* @param status for errors
+* @return index of the match, USEARCH_DONE if there is none
+*/
+UTextOffset StringSearch::handlePrev(int32_t position, UErrorCode &status)
+{
+    // values passed here are already in the pre-shift position
+    if (U_SUCCESS(status)) {
+        if (m_strsrch_->pattern.CELength == 0) {
+            m_search_->matchedIndex =
+                  (m_search_->matchedIndex == USEARCH_DONE ? getOffset() :
+                   m_search_->matchedIndex);
+            if (m_search_->matchedIndex == 0) {
+                setMatchNotFound();
+            }
+            else {
+                m_search_->matchedIndex --;
+                ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex,
+                               &status);
+                m_search_->matchedLength = 0;
+            }
+        }
+        else {
+            // looking at usearch.cpp, this part is shifted out to
+            // StringSearch instead of SearchIterator because m_strsrch_ is
+            // not accessible in SearchIterator
+            if (!m_search_->isOverlap &&
+                position - m_strsrch_->pattern.defaultShiftSize < 0) {
+                setMatchNotFound();
+                return USEARCH_DONE;
+            }
+            while (TRUE) {
+                if (m_search_->isCanonicalMatch) {
+                    // can't use exact here since extra accents are allowed.
+                    usearch_handlePreviousCanonical(m_strsrch_, &status);
+                }
+                else {
+                    usearch_handlePreviousExact(m_strsrch_, &status);
+                }
+                if (U_FAILURE(status)) {
+                    return USEARCH_DONE;
+                }
+                if (m_breakiterator_ == NULL ||
+                    m_search_->matchedIndex == USEARCH_DONE ||
+                    (m_breakiterator_->isBoundary(m_search_->matchedIndex) &&
+                     m_breakiterator_->isBoundary(m_search_->matchedIndex +
+                                                  m_search_->matchedLength))) {
+                    return m_search_->matchedIndex;
+                }
+            }
+        }
+
+        return m_search_->matchedIndex;
+    }
+    return USEARCH_DONE;
+}
+
+
+
diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp
index 52761d66430..2b9eec0120e 100644
--- a/icu4c/source/i18n/tblcoll.cpp
+++ b/icu4c/source/i18n/tblcoll.cpp
@@ -326,18 +326,12 @@ Collator* RuleBasedCollator::clone() const
     return new RuleBasedCollator(*this);
 }
 
-/**
-* Create a CollationElementIterator object that will iterator over the
-* elements in a string, using the collation rules defined in this
-* RuleBasedCollator
-*/
 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
                                            (const UnicodeString& source) const
 {
     UErrorCode status = U_ZERO_ERROR;
     CollationElementIterator *result = new CollationElementIterator(source, this,
                                                                   status);
-
     if (U_FAILURE(status))
         return NULL;
 
diff --git a/icu4c/source/i18n/ucol_imp.h
b/icu4c/source/i18n/ucol_imp.h
index 370e7e51c2f..2c23b048cb3 100644
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@@ -368,6 +368,13 @@ ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status);
 #define getExpansionCount(CE) ((CE)&0xF)
 #define isCEIgnorable(CE) (((CE) & 0xFFFFFFBF) == 0)
 
+/* StringSearch internal use: flag tests and CE buffer position accessors of a collation element iterator; the set macros move toReturn by (offset) relative to CEs resp. CEpos */
+#define inNormBuf(coleiter) ((coleiter)->iteratordata_.flags & UCOL_ITER_INNORMBUF)
+#define isFCDPointerNull(coleiter) ((coleiter)->iteratordata_.fcdPosition == NULL)
+#define getExpansionPrefix(coleiter) ((coleiter)->iteratordata_.toReturn - (coleiter)->iteratordata_.CEs)
+#define setExpansionPrefix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEs + (offset))
+#define getExpansionSuffix(coleiter) ((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn)
+#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - (offset))
 #define UCA_DATA_TYPE "dat"
 #define UCA_DATA_NAME "ucadata"
 
diff --git a/icu4c/source/i18n/ucoleitr.cpp b/icu4c/source/i18n/ucoleitr.cpp
index 95b4221917e..70072db2341 100644
--- a/icu4c/source/i18n/ucoleitr.cpp
+++ b/icu4c/source/i18n/ucoleitr.cpp
@@ -223,6 +223,9 @@ ucol_setOffset(UCollationElements *elems,
     collIterate *ci = &(elems->iteratordata_);
     ci->pos         = ci->string + offset;
     ci->CEpos       = ci->toReturn = ci->CEs;
+    if (ci->flags & UCOL_ITER_INNORMBUF) {
+        ci->flags = ci->origFlags;
+    }
     if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
         ci->endp  = ci->string + u_strlen(ci->string);
     }
diff --git a/icu4c/source/i18n/unicode/search.h b/icu4c/source/i18n/unicode/search.h
new file mode 100644
index 00000000000..7f3d38eaac7
--- /dev/null
+++ b/icu4c/source/i18n/unicode/search.h
@@ -0,0 +1,466 @@
+/*
+**********************************************************************
+* Copyright (C) 2001 IBM and others. All rights reserved.
+********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#ifndef SEARCH_H +#define SEARCH_H + +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/brkiter.h" +#include "unicode/usearch.h" + +/** + * SearchIterator is an abstract base class that provides + * methods to search for a pattern within a text string. Instances of + * SearchIterator maintain a current position and scans over the + * target text, returning the indices the pattern is matched and the length + * of each match. + *

+ * SearchIterator defines a protocol for text searching. + * Subclasses provide concrete implementations of various search algorithms. + * For example, {@link StringSearch} implements language-sensitive pattern + * matching based on the comparison rules defined in a + * {@link RuleBasedCollator} object. + *

+ * Other options for searching include using a BreakIterator to restrict + * the points at which matches are detected. + *

+ * SearchIterator provides an API that is similar to that of + * other text iteration classes such as BreakIterator. Using + * this class, it is easy to scan through text looking for all occurrences of + * a given pattern. The following example uses a StringSearch + * object to find all instances of "fox" in the target string. Any other + * subclass of SearchIterator can be used in an identical + * manner. + *


+ * UnicodeString target("The quick brown fox jumped over the lazy fox");
+ * UnicodeString pattern("fox");
+ *
+ * UErrorCode status = U_ZERO_ERROR;
+ * SearchIterator *iter = new StringSearch(pattern, target,
+ *                                         Locale::getDefault(), NULL, status);
+ *
+ * for (int pos = iter->first(status); pos != USEARCH_DONE; 
+ *                                     pos = iter->next(status)) {
+ *     printf("Found match at %d pos, length is %d\n", pos, 
+ *                                             iter->getMatchedLength());
+ * }
+ * 
+ * + * @see StringSearch + */ + +struct USearch; +typedef struct USearch USearch; + +/** +* Data structure for searching +*/ +class U_I18N_API SearchIterator { + +public: + + // public constructors and destructors ------------------------------- + + /** + * Copy constructor that creates a SearchIterator instance with the same + * behavior, and iterating over the same text. + * @param other the SearchIterator instance to be copied. + */ + SearchIterator(const SearchIterator &other); + + /** + * Destructor. Cleans up the search iterator data struct. + */ + virtual ~SearchIterator(); + + // public get and set methods ---------------------------------------- + + /** + * Sets the index to point to the given position, and clears any state + * that's affected. + *

+ * This method takes the argument index and sets the position in the text + * string accordingly without checking if the index is pointing to a + * valid starting point to begin searching. + * @param position within the text to be set + * @param status for errors if it occurs + */ + virtual void setOffset(UTextOffset position, UErrorCode &status) = 0; + + /** + * Return the current index in the text being searched. + * If the iteration has gone past the end of the text + * (or past the beginning for a backwards search), {@link #USEARCH_DONE} + * is returned. + * @return current index in the text being searched. + */ + virtual UTextOffset getOffset(void) const = 0; + + /** + * Sets the text searching attributes located in the enum + * USearchAttribute with values from the enum USearchAttributeValue. + * USEARCH_DEFAULT can be used for all attributes for resetting. + * @param attribute text attribute (enum USearchAttribute) to be set + * @param value text attribute value + * @param status for errors if it occurs + */ + void setAttribute(USearchAttribute attribute, + USearchAttributeValue value, + UErrorCode &status); + + /** + * Gets the text searching attributes + * @param attribute text attribute (enum USearchAttribute) to be retrieve + * @return text attribute value + */ + USearchAttributeValue getAttribute(USearchAttribute attribute) const; + + /** + * Returns the index to the match in the text string that was searched. + * This call returns a valid result only after a successful call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * USEARCH_DONE, this method will return USEARCH_DONE. + *

+ * Use getMatchedLength to get the matched string length. + * @return index of a substring within the text string that is being + * searched. + */ + UTextOffset getMatchedStart(void) const; + + /** + * Returns the length of text in the string which matches the search + * pattern. This call returns a valid result only after a successful call + * to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * USEARCH_DONE, this method will return 0. + * @return The length of the match in the target text, or 0 if there + * is no match currently. + */ + int32_t getMatchedLength(void) const; + + /** + * Returns the text that was matched by the most recent call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * If the iterator is not pointing at a valid match (e.g. just after + * construction or after USEARCH_DONE has been returned), + * returns an empty string. + * @param result stores the matched string or an empty string if a match + * is not found. + */ + void getMatchedText(UnicodeString &result) const; + + /** + * Set the BreakIterator that will be used to restrict the points + * at which matches are detected. The user is responsible for deleting + * the BreakIterator. + * @param breakiter A BreakIterator that will be used to restrict the + * points at which matches are detected. If a match is + * found, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match will be rejected and another will be searched + * for. If this parameter is NULL, no break + * detection is attempted. + * @param status for errors if it occurs + */ + void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); + + /** + * Returns the BreakIterator that is used to restrict the points at + * which matches are detected. This will be the same object that was + * passed to the constructor or to setBreakIterator. 
+ * Note that NULL is a legal value; it means that break + * detection should not be attempted. + * @return BreakIterator used to restrict matches. + */ + const BreakIterator * getBreakIterator(void) const; + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. The user is responsible for deleting the text. + * @param text string to be searched. + * @param status for errors if it occurs + */ + virtual void setText(const UnicodeString &text, UErrorCode &status); + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. + *

+ * Note: No parsing of the text within the CharacterIterator + * will be done during searching for this version. The block of text + * in CharacterIterator will be used as it is. + * The user is responsible for deleting the text. + * @param text string iterator to be searched. + * @param status for errors if it occurs + */ + virtual void setText(CharacterIterator &text, UErrorCode &status); + + /** + * Return the string text to be searched. + * @return text string to be searched. + */ + const UnicodeString & getText(void) const; + + // operator overloading ---------------------------------------------- + + /** + * Equality operator. + * @param that SearchIterator instance to be compared. + * @return TRUE if both SearchIterators are of the same class, have the + * same behavior, iterate over the same text and have the same + * attributes. FALSE otherwise. + */ + virtual UBool operator==(const SearchIterator &that) const; + + /** + * Not-equal operator. + * @param that SearchIterator instance to be compared. + * @return FALSE if operator== returns TRUE, and vice versa. + */ + UBool operator!=(const SearchIterator &that) const; + + // public methods ---------------------------------------------------- + + /** + * Returns a copy of SearchIterator with the same behavior, and + * iterating over the same text, as this one. Note that all data will be + * replicated, except for the text string to be searched. + * @return cloned object + */ + virtual SearchIterator* safeClone(void) const = 0; + + /** + * Returns the first index at which the string text matches the search + * pattern. The iterator is adjusted so that its current index (as + * returned by {@link #getOffset}) is the match position if one + * was found. 
+ * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE + * @param status for errors if it occurs + * @return The character index of the first match, or + * USEARCH_DONE if there are no matches. + */ + UTextOffset first(UErrorCode &status); + + /** + * Returns the first index greater than position at which the + * string text matches the search pattern. The iterator is adjusted so + * that its current index (as returned by {@link #getOffset}) is the + * match position if one was found. If a match is not found, + * USEARCH_DONE will be returned and the iterator will be + * adjusted to the index USEARCH_DONE. + * @param position where search is to start from + * @param status for errors if it occurs + * @return The character index of the first match following + * position, or USEARCH_DONE if there are no + * matches. + */ + UTextOffset following(UTextOffset position, UErrorCode &status); + + /** + * Returns the last index in the target text at which it matches the + * search pattern. The iterator is adjusted so that its current index + * (as returned by {@link #getOffset}) is the match position if one was + * found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE. + * @param status for errors if it occurs + * @return The index of the last match, or USEARCH_DONE if + * there are no matches. + */ + UTextOffset last(UErrorCode &status); + + /** + * Returns the first index less than position at which the string + * text matches the search pattern. The iterator is adjusted so that its + * current index (as returned by {@link #getOffset}) is the match + * position if one was found. 
If a match is not found, + * USEARCH_DONE will be returned and the iterator will be + * adjusted to the index USEARCH_DONE + * @param position where search is to start from + * @param status for errors if it occurs + * @return The character index of the first match preceding + * position, or USEARCH_DONE if there are + * no matches. + */ + UTextOffset preceding(UTextOffset position, UErrorCode &status); + + /** + * Returns the index of the next point at which the text matches the + * search pattern, starting from the current position. + * The iterator is adjusted so that its current index (as returned by + * {@link #getOffset}) is the match position if one was found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to a position after the end of the text + * string. + * @param status for errors if it occurs + * @return The index of the next match after the current position, + * or USEARCH_DONE if there are no more matches. + */ + UTextOffset next(UErrorCode &status); + + /** + * Returns the index of the previous point at which the string text + * matches the search pattern, starting at the current position. + * The iterator is adjusted so that its current index (as returned by + * {@link #getOffset}) is the match position if one was found. + * If a match is not found, USEARCH_DONE will be returned and + * the iterator will be adjusted to the index USEARCH_DONE. + * @param status for errors if it occurs + * @return The index of the previous match before the current position, + * or USEARCH_DONE if there are no more matches. + */ + UTextOffset previous(UErrorCode &status); + + /** + * Resets the iteration. + * Search will begin at the start of the text string if a forward + * iteration is initiated before a backwards iteration. Otherwise if a + * backwards iteration is initiated before a forwards iteration, the + * search will begin at the end of the text string. 
+ */ + virtual void reset(); + +protected: + // protected data members --------------------------------------------- + + /** + * Pointer to the C search data struct (USearch). + */ + USearch *m_search_; + + /** + * Break iterator. + * Currently the C++ breakiterator does not have getRules etc to reproduce + * another in C. Hence we keep the original around and do the verification + * at the end of the match. The user is responsible for deleting this + * break iterator. + */ + BreakIterator *m_breakiterator_; + + /** + * Unicode string version of the search text + */ + UnicodeString m_text_; + + // protected constructors and destructors ----------------------------- + + /** + * Default constructor. + * Initializes data to the default values. + */ + SearchIterator(); + + /** + * Constructor for use by subclasses. + * @param text The target text to be searched. + * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * handleNext or handlePrev finds a + * match, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match is rejected and handleNext or + * handlePrev is called again. + * If this parameter is NULL, no break detection is + * attempted. + */ + SearchIterator(const UnicodeString &text, + BreakIterator *breakiter = NULL); + + /** + * Constructor for use by subclasses. + *

+ * Note: No parsing of the text within the CharacterIterator + * will be done during searching for this version. The block of text + * in CharacterIterator will be used as it is. + * @param text The target text to be searched. + * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * handleNext or handlePrev finds a + * match, but the match's start or end index is not a + * boundary as determined by the BreakIterator, + * the match is rejected and handleNext or + * handlePrev is called again. If this parameter + * is NULL, no break detection is attempted. + */ + SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); + + // protected methods -------------------------------------------------- + + /** + * Abstract method which subclasses override to provide the mechanism + * for finding the next match in the target text. This allows different + * subclasses to provide different search algorithms. + *

+ * If a match is found, the implementation should return the index at + * which the match starts and should call + * {@link #setMatchLength setMatchLength} with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + *

+ * @param position index in the target text at which the search should start. + * @param status receives an error code if an error occurs. + * @return start index of the match, or USEARCH_DONE if no match is found. + */ + virtual UTextOffset handleNext(UTextOffset position, UErrorCode &status) + = 0; + + /** + * Abstract method which subclasses override to provide the mechanism for + * finding the previous match in the target text. This allows different + * subclasses to provide different search algorithms. + *

+ * If a match is found, the implementation should return the index at + * which the match starts and should call + * {@link #setMatchLength setMatchLength} with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + *

+ * @param position index in the target text at which the search should start. + * @param status receives an error code if an error occurs. + * @return start index of the match, or USEARCH_DONE if no match is found. + */ + virtual UTextOffset handlePrev(UTextOffset position, UErrorCode &status) + = 0; + + /** + * Sets the length of the currently matched string in the text string to + * be searched. + * Subclasses' handleNext and handlePrev + * methods should call this when they find a match in the target text. + * @param length length of the matched text. + */ + virtual void setMatchLength(int32_t length); + + /** + * Sets the offset of the currently matched string in the text string to + * be searched. + * Subclasses' handleNext and handlePrev + * methods should call this when they find a match in the target text. + * @param position start offset of the matched text. + */ + virtual void setMatchStart(UTextOffset position); + + /** + * Sets the match state to "match not found". + */ + void setMatchNotFound(); +}; + +inline UBool SearchIterator::operator!=(const SearchIterator &that) const +{ + return !operator==(that); +} + +#endif + diff --git a/icu4c/source/i18n/unicode/stsearch.h b/icu4c/source/i18n/unicode/stsearch.h new file mode 100644 index 00000000000..69f222d9ae4 --- /dev/null +++ b/icu4c/source/i18n/unicode/stsearch.h @@ -0,0 +1,433 @@ +/* +********************************************************************** +* Copyright (C) 2001 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#ifndef STSEARCH_H +#define STSEARCH_H + +#include "unicode/tblcoll.h" +#include "unicode/coleitr.h" +#include "unicode/search.h" + +/** + * StringSearch is a SearchIterator that provides + * language-sensitive text searching based on the comparison rules defined + * in a {@link RuleBasedCollator} object. + * StringSearch ensures that language eccentricity can be + * handled, e.g. 
for the German collator, characters ß and SS will be matched + * if case is chosen to be ignored. + * See the + * "ICU Collation Design Document" for more information. + *

+ * The algorithm implemented is a modified form of the Boyer-Moore search. + * For more information on the algorithm, see + * + * "Efficient Text Searching in Java", published in Java Report + * in February, 1999. + *

+ * There are 2 match options for selection:
+ * Let S' be the sub-string of a text string S between the offsets start and + * end, [start, end].
+ * A pattern string P matches a text string S at the offsets [start, end] + * if

 
+ * option 1. Some canonical equivalent of P matches some canonical equivalent 
+ *           of S'
+ * option 2. P matches S' and if P starts or ends with a combining mark, 
+ *           there exists no non-ignorable combining mark before or after S' 
+ *           in S respectively. 
+ * 
+ * Option 2 will be the default. + *

+ * This search has APIs similar to those of other text iteration mechanisms + * such as the break iterators in BreakIterator. Using these + * APIs, it is easy to scan through text looking for all occurrences of + * a given pattern. This search iterator allows changing of direction by + * calling a reset followed by a next or previous. + * Though a direction change can occur without calling reset first, + * this operation comes with some speed penalty. + * Match results in the forward direction will match the result matches in + * the backwards direction in the reverse order. + *

+ * SearchIterator provides APIs to specify the starting position + * within the text string to be searched, e.g. setOffset, + * preceding and following. Since the + * starting position will be set as it is specified, please take note that + * there are some danger points at which the search may return incorrect + * results: + *