From ae7f45d5c12781d393b5e64af4b2700ba20ecadd Mon Sep 17 00:00:00 2001
From: Peter Edberg <pedberg@unicode.org>
Date: Sun, 13 Sep 2015 07:43:51 +0000
Subject: [PATCH] ICU-11750 For Indic search: Allow match end at normalization
 boundary in middle of grapheme cluster

X-SVN-Rev: 37949
---
 icu4c/source/i18n/usearch.cpp         | 83 +++++++++++++++++++++------
 icu4c/source/test/cintltst/usrchdat.c | 12 +++-
 icu4c/source/test/cintltst/usrchtst.c | 21 ++++++-
 3 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp
index a2d83ed2f8f..8b5972764de 100644
--- a/icu4c/source/i18n/usearch.cpp
+++ b/icu4c/source/i18n/usearch.cpp
@@ -4002,6 +4002,25 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
             found = FALSE;
         }
 
+        // Allow matches to end in the middle of a grapheme cluster if the following
+        // conditions are met; this is needed to make prefix search work properly in
+        // Indic, see #11750
+        // * the default breakIter is being used
+        // * the next collation element belonging to this combining sequence
+        //   - has non-zero primary weight
+        //   - corresponds to a separate character following the one at end of the current match
+        // * the match end is a normalization boundary
+        UChar32 nextChar = 0;
+        U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
+        UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
+                    nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
+                    maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
+                    strsrch->nfd->hasBoundaryBefore(nextChar));
+        // If those conditions are met, then:
+        // * do NOT advance the match position to a break boundary
+        // * do NOT require that end of the combining sequence not extend beyond the match in CE space
+        // * do NOT require that match end position be on a breakIter boundary
+
         //  Advance the match end position to the first acceptable match boundary.
         //    This advances the index over any combining charcters.
         mLimit = maxLimit;
@@ -4016,7 +4035,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
                 mLimit = minLimit;
             } else {
                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
-                if (nba >= lastCEI->highIndex) {
+                // Note that we can have nba < maxLimit, in which case we want
+                // to set mLimit to nba regardless of allowMidclusterMatch
+                if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
                     mLimit = nba;
                 }
             }
@@ -4028,14 +4049,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
         }
     #endif
 
-        // If advancing to the end of a combining sequence in character indexing space
-        //   advanced us beyond the end of the match in CE space, reject this match.
-        if (mLimit > maxLimit) {
-            found = FALSE;
-        }
+        if (!allowMidclusterMatch) {
+            // If advancing to the end of a combining sequence in character indexing space
+            //   advanced us beyond the end of the match in CE space, reject this match.
+            if (mLimit > maxLimit) {
+                found = FALSE;
+            }
 
-        if (!isBreakBoundary(strsrch, mLimit)) {
-            found = FALSE;
+            if (!isBreakBoundary(strsrch, mLimit)) {
+                found = FALSE;
+            }
         }
 
         if (! checkIdentical(strsrch, mStart, mLimit)) {
@@ -4252,25 +4275,47 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
 
             mLimit = maxLimit = nextCEI->lowIndex;
 
+            // Allow matches to end in the middle of a grapheme cluster if the following
+            // conditions are met; this is needed to make prefix search work properly in
+            // Indic, see #11750
+            // * the default breakIter is being used
+            // * the next collation element belonging to this combining sequence
+            //   - has non-zero primary weight
+            //   - corresponds to a separate character following the one at end of the current match
+            // * the match end is a normalization boundary
+            UChar32 nextChar = 0;
+            U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
+            UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
+                        nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
+                        maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
+                        strsrch->nfd->hasBoundaryBefore(nextChar));
+            // If those conditions are met, then:
+            // * do NOT advance the match position to a break boundary
+            // * do NOT require that end of the combining sequence not extend beyond the match in CE space
+            // * do NOT require that match end position be on a breakIter boundary
+
             //  Advance the match end position to the first acceptable match boundary.
-            //    This advances the index over any combining charcters.
+            //    This advances the index over any combining characters.
             if (minLimit < maxLimit) {
                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
-
-                if (nba >= lastCEI->highIndex) {
+                // Note that we can have nba < maxLimit, in which case we want
+                // to set mLimit to nba regardless of allowMidclusterMatch
+                if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
                     mLimit = nba;
                 }
             }
 
-            // If advancing to the end of a combining sequence in character indexing space
-            //   advanced us beyond the end of the match in CE space, reject this match.
-            if (mLimit > maxLimit) {
-                found = FALSE;
-            }
+            if (!allowMidclusterMatch) {
+                // If advancing to the end of a combining sequence in character indexing space
+                //   advanced us beyond the end of the match in CE space, reject this match.
+                if (mLimit > maxLimit) {
+                    found = FALSE;
+                }
 
-            // Make sure the end of the match is on a break boundary
-            if (!isBreakBoundary(strsrch, mLimit)) {
-                found = FALSE;
+                // Make sure the end of the match is on a break boundary
+                if (!isBreakBoundary(strsrch, mLimit)) {
+                    found = FALSE;
+                }
             }
 
         } else {
diff --git a/icu4c/source/test/cintltst/usrchdat.c b/icu4c/source/test/cintltst/usrchdat.c
index de4caf64636..433584daf44 100644
--- a/icu4c/source/test/cintltst/usrchdat.c
+++ b/icu4c/source/test/cintltst/usrchdat.c
@@ -1,5 +1,5 @@
 /********************************************************************
- * Copyright (c) 2001-2011 International Business Machines 
+ * Copyright (c) 2001-2011,2015 International Business Machines 
  * Corporation and others. All Rights Reserved.
  ********************************************************************
  * File USRCHDAT.H
@@ -754,6 +754,16 @@ static const SearchData DIACRITICMATCH[] = {
     {NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}}
 };
 
+static const SearchData INDICPREFIXMATCH[] = { // <rdar://problem/18063262> Devanagari prefix-match cases iterated by TestIndicPrefixMatch. NOTE(review): SearchData is declared elsewhere; the two trailing brace lists look like expected match offsets (-1 ends the list) and match lengths in UTF-16 units -- confirm against the struct.
+    {"\\u0915\\u0020\\u0915\\u0901\\u0020\\u0915\\u0902\\u0020\\u0915\\u0903\\u0020\\u0915\\u0940\\u0020\\u0915\\u093F\\u0020\\u0915\\u0943\\u0020\\u0915\\u093C\\u0020\\u0958", // text: nine space-separated words; the first eight begin with U+0915, the ninth is U+0958 alone
+     "\\u0915",               NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 2, 5, 8, 11, 14, 17, 20, 23,-1}, {1, 2, 2, 2, 1, 1, 1, 2, 1}}, // pattern U+0915: the listed offsets are the starts of all nine words; listed lengths are 1 or 2
+    {"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947", // text: six space-separated words, each beginning with U+0915
+     "\\u0915\\u0924",        NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 3, 7, 11, -1}, {2, 2, 2, 2}}, // pattern U+0915 U+0924: the listed offsets are the starts of the first four words
+    {"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947", // same text as the previous entry
+     "\\u0915\\u0943\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {15, 19, -1}, {3, 3}}, // pattern U+0915 U+0943 U+0924: the listed offsets are the starts of the last two words
+    {NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}} // sentinel: the NULL text is what stops the test loop
+};
+
 #endif /* #if !UCONFIG_NO_COLLATION */
 
 #endif
diff --git a/icu4c/source/test/cintltst/usrchtst.c b/icu4c/source/test/cintltst/usrchtst.c
index 8c88fe2fbbf..acd9b057c4f 100644
--- a/icu4c/source/test/cintltst/usrchtst.c
+++ b/icu4c/source/test/cintltst/usrchtst.c
@@ -1,5 +1,5 @@
 /********************************************************************
- * Copyright (c) 2001-2011 International Business Machines 
+ * Copyright (c) 2001-2011,2015 International Business Machines 
  * Corporation and others. All Rights Reserved.
  ********************************************************************
  * File usrchtst.c
@@ -2987,6 +2987,24 @@ static void TestMatchFollowedByIgnorables(void) {
     ucol_close(coll);
 }
 
+static void TestIndicPrefixMatch(void) // <rdar://problem/18063262>
+{
+    /* Checks each INDICPREFIXMATCH entry in order; the table ends at the first NULL text. */
+    UErrorCode status = U_ZERO_ERROR;
+    int caseIndex;
+    open(&status);
+    if (U_FAILURE(status)) {
+        log_err_status(status, "Unable to open static collators %s\n", u_errorName(status));
+        return;
+    }
+    for (caseIndex = 0; INDICPREFIXMATCH[caseIndex].text != NULL; caseIndex++) {
+        if (!assertEqual(INDICPREFIXMATCH[caseIndex])) {
+            log_err("Error at test number %d\n", caseIndex);
+        }
+    }
+    close();
+}
+
 /**
 * addSearchTest
 */
@@ -3049,6 +3067,7 @@ void addSearchTest(TestNode** root)
     addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
     addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
     addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
+    addTest(root, &TestIndicPrefixMatch, "tscoll/usrchtst/TestIndicPrefixMatch");
 }
 
 #endif /* #if !UCONFIG_NO_COLLATION */