From ae7f45d5c12781d393b5e64af4b2700ba20ecadd Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Sun, 13 Sep 2015 07:43:51 +0000 Subject: [PATCH] ICU-11750 For Indic search: Allow match end at normalization boundary in middle of grapheme cluster X-SVN-Rev: 37949 --- icu4c/source/i18n/usearch.cpp | 83 +++++++++++++++++++++------ icu4c/source/test/cintltst/usrchdat.c | 12 +++- icu4c/source/test/cintltst/usrchtst.c | 21 ++++++- 3 files changed, 95 insertions(+), 21 deletions(-) diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp index a2d83ed2f8f..8b5972764de 100644 --- a/icu4c/source/i18n/usearch.cpp +++ b/icu4c/source/i18n/usearch.cpp @@ -4002,6 +4002,25 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, found = FALSE; } + // Allow matches to end in the middle of a grapheme cluster if the following + // conditions are met; this is needed to make prefix search work properly in + // Indic, see #11750 + // * the default breakIter is being used + // * the next collation element belonging to this combining sequence + // - has non-zero primary weight + // - corresponds to a separate character following the one at end of the current match + // * the match end is a normalization boundary + UChar32 nextChar = 0; + if (maxLimit < strsrch->search->textLength) { U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar); } /* U16_GET needs maxLimit < textLength; when the match ends at end of text, skip the read and leave nextChar as 0 */ + UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL && + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && + strsrch->nfd->hasBoundaryBefore(nextChar)); + // If those conditions are met, then: + // * do NOT advance the match position to a break boundary + // * do NOT require that end of the combining sequence not extend beyond the match in CE space + // * do NOT require that match end position be on a breakIter boundary + // Advance the match end position to the first acceptable match boundary. // This advances the index over any combining charcters. 
mLimit = maxLimit; @@ -4016,7 +4035,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, mLimit = minLimit; } else { int32_t nba = nextBoundaryAfter(strsrch, minLimit); - if (nba >= lastCEI->highIndex) { + // Note that we can have nba < maxLimit, in which case we want + // to set mLimit to nba regardless of allowMidclusterMatch + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { mLimit = nba; } } @@ -4028,14 +4049,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, } #endif - // If advancing to the end of a combining sequence in character indexing space - // advanced us beyond the end of the match in CE space, reject this match. - if (mLimit > maxLimit) { - found = FALSE; - } + if (!allowMidclusterMatch) { + // If advancing to the end of a combining sequence in character indexing space + // advanced us beyond the end of the match in CE space, reject this match. + if (mLimit > maxLimit) { + found = FALSE; + } - if (!isBreakBoundary(strsrch, mLimit)) { - found = FALSE; + if (!isBreakBoundary(strsrch, mLimit)) { + found = FALSE; + } } if (! 
checkIdentical(strsrch, mStart, mLimit)) { @@ -4252,25 +4275,47 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, mLimit = maxLimit = nextCEI->lowIndex; + // Allow matches to end in the middle of a grapheme cluster if the following + // conditions are met; this is needed to make prefix search work properly in + // Indic, see #11750 + // * the default breakIter is being used + // * the next collation element belonging to this combining sequence + // - has non-zero primary weight + // - corresponds to a separate character following the one at end of the current match + // * the match end is a normalization boundary + UChar32 nextChar = 0; + if (maxLimit < strsrch->search->textLength) { U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar); } /* U16_GET needs maxLimit < textLength; when the match ends at end of text, skip the read and leave nextChar as 0 */ + UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL && + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && + strsrch->nfd->hasBoundaryBefore(nextChar)); + // If those conditions are met, then: + // * do NOT advance the match position to a break boundary + // * do NOT require that end of the combining sequence not extend beyond the match in CE space + // * do NOT require that match end position be on a breakIter boundary + // Advance the match end position to the first acceptable match boundary. - // This advances the index over any combining charcters. + // This advances the index over any combining characters. if (minLimit < maxLimit) { int32_t nba = nextBoundaryAfter(strsrch, minLimit); - - if (nba >= lastCEI->highIndex) { + // Note that we can have nba < maxLimit, in which case we want + // to set mLimit to nba regardless of allowMidclusterMatch + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { mLimit = nba; } } - // If advancing to the end of a combining sequence in character indexing space - // advanced us beyond the end of the match in CE space, reject this match. 
- if (mLimit > maxLimit) { - found = FALSE; - } + if (!allowMidclusterMatch) { + // If advancing to the end of a combining sequence in character indexing space + // advanced us beyond the end of the match in CE space, reject this match. + if (mLimit > maxLimit) { + found = FALSE; + } - // Make sure the end of the match is on a break boundary - if (!isBreakBoundary(strsrch, mLimit)) { - found = FALSE; + // Make sure the end of the match is on a break boundary + if (!isBreakBoundary(strsrch, mLimit)) { + found = FALSE; + } } } else { diff --git a/icu4c/source/test/cintltst/usrchdat.c b/icu4c/source/test/cintltst/usrchdat.c index de4caf64636..433584daf44 100644 --- a/icu4c/source/test/cintltst/usrchdat.c +++ b/icu4c/source/test/cintltst/usrchdat.c @@ -1,5 +1,5 @@ /******************************************************************** - * Copyright (c) 2001-2011 International Business Machines + * Copyright (c) 2001-2011,2015 International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************** * File USRCHDAT.H @@ -754,6 +754,16 @@ static const SearchData DIACRITICMATCH[] = { {NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}} }; +static const SearchData INDICPREFIXMATCH[] = { // Indic prefix-search cases for ICU-11750 (all UCOL_PRIMARY); a match may end inside a grapheme cluster. The NULL-text row terminates the table. + {"\\u0915\\u0020\\u0915\\u0901\\u0020\\u0915\\u0902\\u0020\\u0915\\u0903\\u0020\\u0915\\u0940\\u0020\\u0915\\u093F\\u0020\\u0915\\u0943\\u0020\\u0915\\u093C\\u0020\\u0958", + "\\u0915", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 2, 5, 8, 11, 14, 17, 20, 23,-1}, {1, 2, 2, 2, 1, 1, 1, 2, 1}}, + {"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947", + "\\u0915\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 3, 7, 11, -1}, {2, 2, 2, 2}}, + {"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947", + "\\u0915\\u0943\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {15, 19, -1}, {3, 3}}, + {NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}} +}; + #endif /* #if !UCONFIG_NO_COLLATION */ #endif diff --git a/icu4c/source/test/cintltst/usrchtst.c b/icu4c/source/test/cintltst/usrchtst.c index 8c88fe2fbbf..acd9b057c4f 100644 --- a/icu4c/source/test/cintltst/usrchtst.c +++ b/icu4c/source/test/cintltst/usrchtst.c @@ -1,5 +1,5 @@ /******************************************************************** - * Copyright (c) 2001-2011 International Business Machines + * Copyright (c) 2001-2011,2015 International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************** * File usrchtst.c @@ -2987,6 +2987,24 @@ static void TestMatchFollowedByIgnorables(void) { ucol_close(coll); } +static void TestIndicPrefixMatch(void) // Runs each INDICPREFIXMATCH row (up to the NULL-text terminator) through assertEqual and logs the index of any failing row. +{ + int count = 0; + UErrorCode status = U_ZERO_ERROR; + open(&status); + if (U_FAILURE(status)) { + log_err_status(status, "Unable to open static collators %s\n", u_errorName(status)); + return; + } + while (INDICPREFIXMATCH[count].text != NULL) { + if (!assertEqual(INDICPREFIXMATCH[count])) { + log_err("Error at test number %d\n", count); + } + count ++; + } + close(); +} + /** * addSearchTest */ @@ -3049,6 +3067,7 @@ void addSearchTest(TestNode** root) addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df"); addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff"); addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables"); + addTest(root, &TestIndicPrefixMatch, "tscoll/usrchtst/TestIndicPrefixMatch"); } #endif /* #if !UCONFIG_NO_COLLATION */