ICU-7167 reimplement checkCEValidity() for actual byte value ranges

X-SVN-Rev: 28193
2025-04-15 01:42:37 +00:00 · 2010-06-15 05:28:53 +00:00 · 2010-06-15 05:28:53 +00:00 · 2ef6586467
commit 2ef6586467
parent 3e24713df1
1 changed files with 131 additions and 131 deletions
--- a/icu4c/source/test/cintltst/citertst.c
+++ b/icu4c/source/test/cintltst/citertst.c
@@ -1496,147 +1496,147 @@ static void TestCEBufferOverflow()
 }

 /**
-* Byte bounds checks. Checks if each byte in data is between upper and lower
-* inclusive.
-*/
-static UBool checkByteBounds(uint32_t data, char upper, char lower)
-{
-    int count = 4;
-    while (count > 0) {
-        char b = (char)(data & 0xFF);
-        if (b > upper || b < lower) {
-            return FALSE;
-        }
-        data = data >> 8;
-        count --;
-    }
-    return TRUE;
-}
-
-/**
-* Determines case of the string of codepoints.
-* If it is a multiple codepoints it has to treated as a contraction.
-*/
-#if 0
-static uint8_t getCase(const UChar *s, uint32_t len) {
-    UBool       lower = FALSE;
-    UBool       upper = FALSE;
-    UBool       title = FALSE;
-    UErrorCode  status = U_ZERO_ERROR;
-    UChar       str[256];
-    const UChar      *ps = s;
-
-    if (len == 0) {
-        return UCOL_LOWER_CASE;
-    }
-
-    while (len > 0) {
-        UChar c = *ps ++;
-
-        if (u_islower(c)) {
-            lower = TRUE;
-        }
-        if (u_isupper(c)) {
-            upper = TRUE;
-        }
-        if (u_istitle(c)) {
-            title = TRUE;
-        }
-
-        len --;
-    }
-    if ((lower && !upper && !title) || (!lower && !upper && !title)){
-        return UCOL_LOWER_CASE;
-    }
-    if (upper && !lower && !title) {
-        return UCOL_UPPER_CASE;
-    }
-    /* mix of cases here */
-    /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status);
-    if (U_FAILURE(status)) {
-        log_err("Error normalizing data string\n");
-        return UCOL_LOWER_CASE;
-    }*/
-
-    if ((title && len >= 2) || (lower && upper)) {
-        return UCOL_MIXED_CASE;
-    }
-    if (u_isupper(s[0])) {
-        return UCOL_UPPER_CASE;
-    }
-    return UCOL_LOWER_CASE;
-}
-#endif
-
-/**
-* Checking collation element validity given the boundary arguments.
+* Checking collation element validity.
 */
 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
-                             int length, uint32_t primarymax,
-                             uint32_t secondarymax)
+                             int length)
 {
    UErrorCode          status = U_ZERO_ERROR;
    UCollationElements *iter   = ucol_openElements(coll, codepoints, length,
                                                  &status);
-    uint32_t            ce;
-    UBool               first  = TRUE;
-/*
-    UBool               upper  = FALSE;
-    UBool               lower  = FALSE;
-*/
+    UBool result = FALSE;
+    UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;

    if (U_FAILURE(status)) {
        log_err("Error creating iterator for testing validity\n");
+        return FALSE;
    }

-    ce = ucol_next(iter, &status);
+    for (;;) {
+        uint32_t ce = ucol_next(iter, &status);
+        uint32_t primary, p1, p2, secondary, tertiary;
+        if (ce == UCOL_NULLORDER) {
+            result = TRUE;
+            break;
+        }
+        if (ce == 0) {
+            continue;
+        }
+        primary   = UCOL_PRIMARYORDER(ce);
+        p1 = primary >> 8;
+        p2 = primary & 0xFF;
+        secondary = UCOL_SECONDARYORDER(ce);
+        tertiary  = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION;

-    while (ce != UCOL_NULLORDER) {
-       if (ce != 0) {
-           uint32_t primary   = UCOL_PRIMARYORDER(ce);
-           uint32_t secondary = UCOL_SECONDARYORDER(ce);
-           uint32_t tertiary  = UCOL_TERTIARYORDER(ce);
-/*           uint32_t scasebits = tertiary & 0xC0;*/
-
-           if ((tertiary == 0 && secondary != 0) ||
-               (tertiary < 0xC0 && secondary == 0 && primary != 0)) {
-               /* n-1th level is not zero when the nth level is
-                  except for continuations, this is wrong */
-               log_err("Lower level weight not 0 when high level weight is 0\n");
-               goto fail;
-           }
-           else {
-               /* checks if any byte is illegal ie = 01 02 03. */
-               if (checkByteBounds(ce, 0x3, 0x1)) {
-                   log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n");
-                   goto fail;
-               }
-           }
-           if ((primary != 0 && primary < primarymax) 
-               || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF) 
-               || ((primary & 0xFF) && ((primary & 0xFF) <= 2)) 
-               || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 2)
-               || (primary >= 0xFE00 && !isContinuation(ce))) {
-               log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n", 
-                   primary, codepoints[0]);
-               goto fail;
-           }
-           /* case matching not done since data generated by ken */
-           if (first) {
-               if (secondary >= 6 && secondary <= secondarymax) {
-                   log_err("Secondary weight out of range\n");
-                   goto fail;
-               }
-               first = FALSE;
-           }
-       }
-       ce   = ucol_next(iter, &status);
+        if (!isContinuation(ce)) {
+            if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
+                log_err("Empty CE %08lX except for case bits\n", (long)ce);
+                break;
+            }
+            if (p1 == 0) {
+                if (p2 != 0) {
+                    log_err("Primary 00 xx in %08lX\n", (long)ce);
+                    break;
+                }
+                primaryDone = TRUE;
+            } else {
+                if (p1 <= 2 || p1 >= 0xF0) {
+                    /* Primary first bytes F0..FF are specials. */
+                    log_err("Primary first byte of %08lX out of range\n", (long)ce);
+                    break;
+                }
+                if (p2 == 0) {
+                    primaryDone = TRUE;
+                } else {
+                    if (p2 <= 3 || p2 >= 0xFF) {
+                        /* Primary second bytes 03 and FF are sort key compression terminators. */
+                        log_err("Primary second byte of %08lX out of range\n", (long)ce);
+                        break;
+                    }
+                    primaryDone = FALSE;
+                }
+            }
+            if (secondary == 0) {
+                if (primary != 0) {
+                    log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce);
+                    break;
+                }
+                secondaryDone = TRUE;
+            } else {
+                if (secondary <= 2 ||
+                    (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80))
+                ) {
+                    /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
+                    log_err("Secondary byte of %08lX out of range\n", (long)ce);
+                    break;
+                }
+                secondaryDone = FALSE;
+            }
+            if (tertiary == 0) {
+                /* We know that ce != 0. */
+                log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce);
+                break;
+            }
+            if (tertiary <= 2) {
+                log_err("Tertiary byte of %08lX out of range\n", (long)ce);
+                break;
+            }
+            tertiaryDone = FALSE;
+        } else {
+            if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
+                log_err("Empty continuation %08lX\n", (long)ce);
+                break;
+            }
+            if (primaryDone && primary != 0) {
+                log_err("Primary was done but continues in %08lX\n", (long)ce);
+                break;
+            }
+            if (p1 == 0) {
+                if (p2 != 0) {
+                    log_err("Primary 00 xx in %08lX\n", (long)ce);
+                    break;
+                }
+                primaryDone = TRUE;
+            } else {
+                if (p1 <= 2) {
+                    log_err("Primary first byte of %08lX out of range\n", (long)ce);
+                    break;
+                }
+                if (p2 == 0) {
+                    primaryDone = TRUE;
+                } else {
+                    if (p2 <= 3) {
+                        log_err("Primary second byte of %08lX out of range\n", (long)ce);
+                        break;
+                    }
+                }
+            }
+            if (secondaryDone && secondary != 0) {
+                log_err("Secondary was done but continues in %08lX\n", (long)ce);
+                break;
+            }
+            if (secondary == 0) {
+                secondaryDone = TRUE;
+            } else {
+                if (secondary <= 2) {
+                    log_err("Secondary byte of %08lX out of range\n", (long)ce);
+                    break;
+                }
+            }
+            if (tertiaryDone && tertiary != 0) {
+                log_err("Tertiary was done but continues in %08lX\n", (long)ce);
+                break;
+            }
+            if (tertiary == 0) {
+                tertiaryDone = TRUE;
+            } else if (tertiary <= 2) {
+                log_err("Tertiary byte of %08lX out of range\n", (long)ce);
+                break;
+            }
+        }
   }
   ucol_closeElements(iter);
-   return TRUE;
-fail :
-   ucol_closeElements(iter);
-   return FALSE;
+   return result;
 }

 static void TestCEValidity()
@@ -1674,21 +1674,21 @@ static void TestCEValidity()
        }

        getCodePoints(line, codepoints, contextCPs);
-        checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86);
+        checkCEValidity(coll, codepoints, u_strlen(codepoints));
    }

    log_verbose("Testing UCA elements for the whole range of unicode characters\n");
    for (c = 0; c <= 0xffff; ++c) {
        if (u_isdefined(c)) {
            codepoints[0] = (UChar)c;
-            checkCEValidity(coll, codepoints, 1, 5, 86);
+            checkCEValidity(coll, codepoints, 1);
        }
    }
    for (; c <= 0x10ffff; ++c) {
        if (u_isdefined(c)) {
            int32_t i = 0;
            U16_APPEND_UNSAFE(codepoints, i, c);
-            checkCEValidity(coll, codepoints, i, 5, 86);
+            checkCEValidity(coll, codepoints, i);
        }
    }

@@ -1765,7 +1765,7 @@ static void TestCEValidity()
                uprv_memcpy(codepoints, src.source + chOffset,
                                                       chLen * sizeof(UChar));
                codepoints[chLen] = 0;
-                checkCEValidity(coll, codepoints, chLen, 4, 85);
+                checkCEValidity(coll, codepoints, chLen);
            }
            free(rulesCopy);
        }