ICU-96 correct handling of level separator for quad level, sortkeytostring private function, some tweaks for CE generation, rule parser factored out

X-SVN-Rev: 4189
2025-04-07 22:44:49 +00:00 · 2001-03-20 00:56:37 +00:00 · 2001-03-20 00:56:37 +00:00 · cda9dc782f
commit cda9dc782f
parent a6265c42b8
6 changed files with 868 additions and 248 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -1431,7 +1431,7 @@ ucol_calcSortKey(const    UCollator    *coll,
    uint8_t *frenchEndPtr = NULL;
    uint32_t caseShift = 0;

-    sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + (compareQuad?0:1) + (compareIdent?1:0));
+    sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + (qShifted?1:0)/*(compareQuad?0:1)*/ + (compareIdent?1:0));

    collIterate s;
    init_collIterate(coll, (UChar *)source, len, &s, FALSE);
@ -1780,7 +1780,7 @@ ucol_calcSortKey(const    UCollator    *coll,
        if(sortKeySize <= resultLength) {
          uprv_memcpy(primaries, terStart, tersize);
          primaries += tersize;
-          if(compareQuad == 0) {
+          if(/*compareQuad == 0*/qShifted == TRUE) {
              if(count4 > 0) {
                while (count4 >= UCOL_BOT_COUNT4) {
                  *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
@ -2198,6 +2198,65 @@ ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
    return sortKeySize;
 }

+/* Private debug helper: prints sortkey into buffer as "[" hex bytes, " . " between levels, "]". */
+/* coll's strength, caseLevel and alternateHandling decide which levels are expected. Returns     */
+/* buffer, which is NOT bounds-checked and must be large enough; len is currently not used.       */
+U_CAPI char U_EXPORT2 *ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
+  uint32_t strength = UCOL_PRIMARY;
+  UBool doneCase = FALSE;
+  UBool sawTerminator = FALSE; /* set once the closing 00 byte has been printed */
+
+  char *current = buffer;
+  const uint8_t *currentSk = sortkey;
+
+  sprintf(current, "[");
+  current++;
+
+  while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
+    if(strength > UCOL_PRIMARY) {
+      sprintf(current, " . ");
+      current += 3;
+    }
+    while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
+      sprintf(current, "%02X ", *currentSk++);
+      current+=3;
+    }
+    if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
+        doneCase = TRUE; /* stay on UCOL_SECONDARY for one more pass so one extra level is printed */
+    } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
+      strength ++;
+    } 
+    sawTerminator = (UBool)(*currentSk == 0x00);
+    sprintf(current, "%02X", *(currentSk++)); /* '01' after a level, '00' at the end of the key */
+    current +=2;
+    if(sawTerminator == TRUE || (strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE)) {
+      break; /* never read past the terminator; UCOL_NON_IGNORABLE stops before the quaternary pass */
+    }
+  }
+
+  if(coll->strength == UCOL_IDENTICAL && sawTerminator == FALSE) {
+    sprintf(current, " . ");
+    current += 3;
+    while(*currentSk != 0) {
+      if(*currentSk == 0x01) { /* separator inside the identical level */
+        sprintf(current, "%02X", *(currentSk++));
+        current +=2;
+      } else if(*(currentSk+1) == 0) { /* lone last byte: print it alone, do not step over the terminator */
+        sprintf(current, "%02X ", *(currentSk++));
+        current += 3;
+      } else { /* regular case: the identical level is printed two bytes at a time */
+        sprintf(current, "%02X%02X ", *currentSk, *(currentSk+1));
+        current +=5;
+        currentSk+=2;
+      }
+    }
+    sprintf(current, "%02X", *(currentSk++)); /* This should print '00' */
+    current += 2;
+  }
+  sprintf(current, "]"); /* sprintf also writes the terminating NUL of the text */
+  return buffer;
+}
+
 /* This is a trick string compare function that goes in and uses sortkeys to compare */
 /* It is used when compare gets in trouble and needs to bail out                     */
 UCollationResult ucol_compareUsingSortKeys(const    UCollator    *coll,
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@ -309,6 +309,10 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
    }
  } 

+  if(low == 0) {
+    low = 0x01000000;
+  }
+
  if(strength == UCOL_SECONDARY) { /* similar as simple */
    if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
      low = UCOL_COMMON_TOP2<<24;
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -549,6 +549,7 @@ uint32_t ucol_getIncrementalUCA(UChar ch, incrementalContext *collationSource, U
 int32_t ucol_getIncrementalSpecialCE(const UCollator *coll, uint32_t CE, incrementalContext *ctx, UErrorCode *status);
 void ucol_updateInternalState(UCollator *coll);
 uint32_t ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status);
+U_CAPI char U_EXPORT2 *ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len); /* debug helper: writes a hex dump of sortkey into buffer and returns buffer */

 #endif

--- a/icu4c/source/i18n/ucol_tok.c
+++ b/icu4c/source/i18n/ucol_tok.c
@ -312,6 +312,254 @@ UBool ucol_uprv_tok_readAndSetOption(UCATableHeader *image, const UChar* start,
 #define UCOL_TOK_UNSET 0xFFFFFFFF
 #define UCOL_TOK_RESET 0xDEADBEEF

+const UChar *ucol_tok_parseNextToken(UColTokenParser *src, 
+                        uint32_t *strength, 
+                        uint32_t *chOffset, uint32_t *chLen, 
+                        uint32_t *exOffset, uint32_t *exLen,
+                        UBool *varT, UBool *top_,
+                        UBool startOfRules,
+                        UErrorCode *status) { 
+/* Parses one rule token starting at src->current. Returns the position where parsing stopped, or NULL on */
+/* error or when no strength operator was found; the output parameters are written only on success.     */
+  UBool variableTop = FALSE;
+  UBool top = FALSE;
+  UBool inChars = TRUE;       /* TRUE while reading the chars part, FALSE once '/' switched to the extension */
+  UBool inQuote = FALSE;
+  UBool wasInQuote = FALSE;   /* set by the first quote of this token and kept until the token ends */
+  UChar *optionEnd = NULL;
+
+  uint32_t newCharsLen = 0, newExtensionLen = 0;
+  uint32_t charsOffset = 0, extensionOffset = 0;
+  uint32_t newStrength = UCOL_TOK_UNSET; 
+
+  while (src->current < src->end) {
+      UChar ch = *(src->current);
+
+    if (inQuote) { /* between quotes every char except the closing quote is counted as text */
+      if (ch == 0x0027/*'\''*/) {
+          inQuote = FALSE;
+      } else {
+        if ((newCharsLen == 0) || inChars) {
+          if(newCharsLen == 0) {
+            charsOffset = src->extraCurrent - src->source; /* offset is taken from the extra buffer position */
+          }
+          newCharsLen++;
+        } else {
+          if(newExtensionLen == 0) {
+            extensionOffset = src->extraCurrent - src->source;
+          }
+          newExtensionLen++;
+        }
+      }
+    } else {
+      /* Sets the strength for this entry */
+      switch (ch) {
+        case 0x003D/*'='*/ : 
+          if (newStrength != UCOL_TOK_UNSET) {
+            goto EndOfLoop; /* a strength is already set: this operator belongs to the next token */
+          }
+
+          /* if we start with strength, we'll reset to top */
+          if(startOfRules == TRUE) {
+            top = TRUE;
+            newStrength = UCOL_TOK_RESET;
+            goto EndOfLoop; /* the operator itself is not consumed */
+          }
+          newStrength = UCOL_IDENTICAL;
+          break;
+
+        case 0x002C/*','*/:  
+          if (newStrength != UCOL_TOK_UNSET) {
+            goto EndOfLoop;
+          }
+
+          /* if we start with strength, we'll reset to top */
+          if(startOfRules == TRUE) {
+            top = TRUE;
+            newStrength = UCOL_TOK_RESET;
+            goto EndOfLoop;
+          }
+          newStrength = UCOL_TERTIARY;
+          break;
+
+        case  0x003B/*';'*/:
+          if (newStrength != UCOL_TOK_UNSET) {
+            goto EndOfLoop;
+          }
+
+          /* if we start with strength, we'll reset to top */
+          if(startOfRules == TRUE) {
+            top = TRUE;
+            newStrength = UCOL_TOK_RESET;
+            goto EndOfLoop;
+          }
+          newStrength = UCOL_SECONDARY;
+          break;
+
+        case 0x003C/*'<'*/:  
+          if (newStrength != UCOL_TOK_UNSET) {
+            goto EndOfLoop;
+          }
+
+          /* if we start with strength, we'll reset to top */
+          if(startOfRules == TRUE) {
+            top = TRUE;
+            newStrength = UCOL_TOK_RESET;
+            goto EndOfLoop;
+          }
+          /* before this, do a scan to verify whether this is */
+          /* another strength */
+          if(*(src->current+1) == 0x003C) { /* NOTE(review): lookahead is not checked against src->end - confirm the rules are NUL-terminated */
+            src->current++;
+            if(*(src->current+1) == 0x003C) {
+              src->current++; /* three in a row! */
+              newStrength = UCOL_TERTIARY;
+            } else { /* two in a row */
+              newStrength = UCOL_SECONDARY;
+            }
+          } else { /* just one */
+            newStrength = UCOL_PRIMARY;
+          }
+          break;
+
+        case 0x0026/*'&'*/:  
+          if (newStrength != UCOL_TOK_UNSET) {
+            goto EndOfLoop;
+          }
+
+          newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
+          break;
+
+        case 0x005b/*'['*/:
+          /* options - read an option, analyze it */
+          if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
+            ucol_uprv_tok_readAndSetOption(src->image, src->current, optionEnd, &variableTop, &top, status);
+            src->current = optionEnd; /* continue at the closing ']' */
+            if(top == TRUE) { /* top is accepted only while the strength is a reset */
+              if(newStrength == UCOL_TOK_RESET) { 
+                src->current++;
+                goto EndOfLoop;
+              } else {
+                *status = U_INVALID_FORMAT_ERROR;
+              }
+            }
+            if(U_FAILURE(*status)) {
+              return NULL;
+            }
+          }
+          break;
+
+        /* Ignore the white spaces */
+        case 0x0009/*'\t'*/:
+        case 0x000C/*'\f'*/:
+        case 0x000D/*'\r'*/:
+        case 0x000A/*'\n'*/:
+        case 0x0020/*' '*/:  
+          break; /* skip whitespace TODO use Unicode */
+
+        case 0x002F/*'/'*/:
+                /* This entry has an extension. */
+          inChars = FALSE;
+          break;
+
+        /* found a quote, we're gonna start copying */
+        case 0x0027/*'\''*/:
+          inQuote = TRUE;
+          wasInQuote = TRUE;
+
+          if (newCharsLen == 0) {
+            charsOffset = src->extraCurrent - src->source;
+            newCharsLen++;
+          } else if (inChars) { /* we're reading some chars */
+            charsOffset = src->extraCurrent - src->source;
+            if(newCharsLen != 0) {
+              uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar)); /* copy the chars counted so far into the extra buffer */
+              src->extraCurrent += newCharsLen;
+            }
+            newCharsLen++;
+          } else {
+            if(newExtensionLen != 0) {
+              uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
+              src->extraCurrent += newExtensionLen;
+            }
+            newExtensionLen++;
+          }
+
+          ch = *(++(src->current)); /*pattern[++index]; */
+          break;
+
+        /* '@' is french only if the strength is not currently set */
+        /* if it is, it's just a regular character in collation rules */
+        case 0x0040/*'@'*/:
+          if (newStrength == UCOL_TOK_UNSET) {
+            src->image->frenchCollation = UCOL_ON;
+            break;
+          }
+          /* otherwise fall through: '@' is handled like any other character */
+        default:
+          if (newStrength == UCOL_TOK_UNSET) { /* text before any strength operator is an error */
+            *status = U_INVALID_FORMAT_ERROR;
+            return NULL;
+          }
+
+          if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
+            *status = U_INVALID_FORMAT_ERROR;
+            return NULL;
+          }
+
+
+
+          if (inChars) {
+            if(newCharsLen == 0) {
+              charsOffset = src->current - src->source; /* unquoted text is addressed in place */
+            }
+            newCharsLen++;
+          } else {
+            if(newExtensionLen == 0) {
+              extensionOffset = src->current - src->source;
+            }
+            newExtensionLen++;
+          }
+
+          break;
+        }
+    }
+
+    if(wasInQuote) { /* after a quote was seen, chars are appended to the extra buffer */
+      if(ch != 0x27 || newCharsLen == 1) { /* a quote char itself is stored only when newCharsLen is 1 */
+        *src->extraCurrent++ = ch;
+      }
+      if(src->extraCurrent == src->extraEnd) {
+        /* reallocate -- NOTE(review): not implemented, a full extra buffer is not handled here */
+      }
+    }
+
+      src->current++;
+    }
+
+ EndOfLoop:
+    wasInQuote = FALSE;
+  if (newStrength == UCOL_TOK_UNSET) { /* the input ended without any strength operator */
+    return NULL;
+  }
+
+  if (newCharsLen == 0 && top == FALSE) { /* a token without characters is valid only for a reset to top */
+    *status = U_INVALID_FORMAT_ERROR;
+    return NULL;
+  }
+
+  *strength = newStrength; 
+
+  *chOffset = charsOffset;
+  *chLen = newCharsLen;
+  *exOffset = extensionOffset;
+  *exLen = newExtensionLen;
+  *varT = variableTop;
+  *top_ = top;
+
+  return src->current;
+}
+
 /*
 Processing Description
  1 Build a ListList. Each list has a header, which contains two lists (positive 
@ -323,14 +571,15 @@ Processing Description

 uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) {
  UColToken *lastToken = NULL;
-  uint32_t newCharsLen = 0, newExtensionsLen = 0;
-  uint32_t charsOffset = 0, extensionOffset = 0;
+  const UChar *parseEnd = NULL;
  uint32_t expandNext = 0;
  UBool variableTop = FALSE;
  UBool top = FALSE;

  UColTokListHeader *ListList = NULL;

+  uint32_t newCharsLen = 0, newExtensionsLen = 0;
+  uint32_t charsOffset = 0, extensionOffset = 0;
  uint32_t newStrength = UCOL_TOK_UNSET; 

  ucol_tok_initTokenList(src, status);
@ -340,235 +589,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
  src->image->variableTopValue = 0;

  while(src->current < src->end) {
-    { /* parsing part */
+  
+  parseEnd = ucol_tok_parseNextToken(src, 
+                      &newStrength, 
+                      &charsOffset, &newCharsLen, 
+                      &extensionOffset, &newExtensionsLen,
+                      &variableTop, &top,
+                      (UBool)(lastToken == NULL),
+                      status);

-      UBool inChars = TRUE;
-      UBool inQuote = FALSE;
-      UBool wasInQuote = FALSE;
-      UChar *optionEnd = NULL;
-
-      newStrength = UCOL_TOK_UNSET; 
-      newCharsLen = 0; newExtensionsLen = 0;
-      charsOffset = 0; extensionOffset = 0;
-
-      while (src->current < src->end) {
-          UChar ch = *(src->current);
-
-        if (inQuote) {
-          if (ch == 0x0027/*'\''*/) {
-              inQuote = FALSE;
-          } else {
-            if ((newCharsLen == 0) || inChars) {
-              if(newCharsLen == 0) {
-                charsOffset = src->extraCurrent - src->source;
-              }
-              newCharsLen++;
-            } else {
-              if(newExtensionsLen == 0) {
-                extensionOffset = src->extraCurrent - src->source;
-              }
-              newExtensionsLen++;
-            }
-          }
-        } else {
-          /* Sets the strength for this entry */
-          switch (ch) {
-            case 0x003D/*'='*/ : 
-              if (newStrength != UCOL_TOK_UNSET) {
-                goto EndOfLoop;
-              }
-
-              /* if we start with strength, we'll reset to top */
-              if(lastToken == NULL) {
-                top = TRUE;
-                newStrength = UCOL_TOK_RESET;
-                goto EndOfLoop;
-              }
-              newStrength = UCOL_IDENTICAL;
-              break;
-
-            case 0x002C/*','*/:  
-              if (newStrength != UCOL_TOK_UNSET) {
-                goto EndOfLoop;
-              }
-
-              /* if we start with strength, we'll reset to top */
-              if(lastToken == NULL) {
-                top = TRUE;
-                newStrength = UCOL_TOK_RESET;
-                goto EndOfLoop;
-              }
-              newStrength = UCOL_TERTIARY;
-              break;
-
-            case  0x003B/*';'*/:
-              if (newStrength != UCOL_TOK_UNSET) {
-                goto EndOfLoop;
-              }
-
-              /* if we start with strength, we'll reset to top */
-              if(lastToken == NULL) {
-                top = TRUE;
-                newStrength = UCOL_TOK_RESET;
-                goto EndOfLoop;
-              }
-              newStrength = UCOL_SECONDARY;
-              break;
-
-            case 0x003C/*'<'*/:  
-              if (newStrength != UCOL_TOK_UNSET) {
-                goto EndOfLoop;
-              }
-
-              /* if we start with strength, we'll reset to top */
-              if(lastToken == NULL) {
-                top = TRUE;
-                newStrength = UCOL_TOK_RESET;
-                goto EndOfLoop;
-              }
-              /* before this, do a scan to verify whether this is */
-              /* another strength */
-              if(*(src->current+1) == 0x003C) {
-                src->current++;
-                if(*(src->current+1) == 0x003C) {
-                  src->current++; /* three in a row! */
-                  newStrength = UCOL_TERTIARY;
-                } else { /* two in a row */
-                  newStrength = UCOL_SECONDARY;
-                }
-              } else { /* just one */
-                newStrength = UCOL_PRIMARY;
-              }
-              break;
-
-            case 0x0026/*'&'*/:  
-              if (newStrength != UCOL_TOK_UNSET) {
-                goto EndOfLoop;
-              }
-
-              newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
-              break;
-
-            case 0x005b/*'['*/:
-              /* options - read an option, analyze it */
-              if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
-                ucol_uprv_tok_readAndSetOption(src->image, src->current, optionEnd, &variableTop, &top, status);
-                src->current = optionEnd;
-                if(top == TRUE) {
-                  if(newStrength == UCOL_TOK_RESET) { 
-                    src->current++;
-                    goto EndOfLoop;
-                  } else {
-                    *status = U_INVALID_FORMAT_ERROR;
-                  }
-                }
-                if(U_FAILURE(*status)) {
-                  return 0;
-                }
-              }
-              break;
-
-            /* Ignore the white spaces */
-            case 0x0009/*'\t'*/:
-            case 0x000C/*'\f'*/:
-            case 0x000D/*'\r'*/:
-            case 0x000A/*'\n'*/:
-            case 0x0020/*' '*/:  
-              break; /* skip whitespace TODO use Unicode */
-
-            case 0x002F/*'/'*/:
-                    /* This entry has an extension. */
-              inChars = FALSE;
-              break;
-
-            /* found a quote, we're gonna start copying */
-            case 0x0027/*'\''*/:
-              inQuote = TRUE;
-              wasInQuote = TRUE;
-
-              if (newCharsLen == 0) {
-                charsOffset = src->extraCurrent - src->source;
-                newCharsLen++;
-              } else if (inChars) { /* we're reading some chars */
-                charsOffset = src->extraCurrent - src->source;
-                if(newCharsLen != 0) {
-                  uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
-                  src->extraCurrent += newCharsLen;
-                }
-                newCharsLen++;
-              } else {
-                if(newExtensionsLen != 0) {
-                  uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar));
-                  src->extraCurrent += newExtensionsLen;
-                }
-                newExtensionsLen++;
-              }
-
-              ch = *(++(src->current)); /*pattern[++index]; */
-              break;
-
-            /* '@' is french only if the strength is not currently set */
-            /* if it is, it's just a regular character in collation rules */
-            case 0x0040/*'@'*/:
-              if (newStrength == UCOL_TOK_UNSET) {
-                src->image->frenchCollation = UCOL_ON;
-                break;
-              }
-
-            default:
-              if (newStrength == UCOL_TOK_UNSET) {
-                *status = U_INVALID_FORMAT_ERROR;
-                return 0;
-              }
-
-              if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
-                *status = U_INVALID_FORMAT_ERROR;
-                return 0;
-              }
-
-
-
-              if (inChars) {
-                if(newCharsLen == 0) {
-                  charsOffset = src->current - src->source;
-                }
-                newCharsLen++;
-              } else {
-                if(newExtensionsLen == 0) {
-                  extensionOffset = src->current - src->source;
-                }
-                newExtensionsLen++;
-              }
-
-              break;
-            }
-        }
-
-        if(wasInQuote) {
-          if(ch != 0x27) {
-            *src->extraCurrent++ = ch;
-          }
-          if(src->extraCurrent == src->extraEnd) {
-            /* reallocate */
-          }
-        }
-
-          src->current++;
-        }
-
-     EndOfLoop:
-        wasInQuote = FALSE;
-      if (newStrength == UCOL_TOK_UNSET) {
-        return 0;
-      }
-
-      if (newCharsLen == 0 && top == FALSE) {
-        *status = U_INVALID_FORMAT_ERROR;
-        return 0;
-      }
-    }
-
-    {
+    if(U_SUCCESS(*status) && parseEnd != NULL) {
      UColToken *sourceToken = NULL;
      UColToken key;

@ -789,7 +819,9 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
      }
      /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */  
      lastToken = sourceToken;
-    }  
+    } else {
+      return 0;
+    }
  }

  return src->resultLen;
--- a/icu4c/source/i18n/ucol_tok.h
+++ b/icu4c/source/i18n/ucol_tok.h
@ -117,6 +117,13 @@ int32_t uhash_hashTokens(const void *k);
 UBool uhash_compareTokens(const void *key1, const void *key2);
 void ucol_tok_initTokenList(UColTokenParser *src, UErrorCode *status);
 uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status);
+U_CAPI const UChar U_EXPORT2 *ucol_tok_parseNextToken(UColTokenParser *src, 
+                        uint32_t *strength, 
+                        uint32_t *chOffset, uint32_t *chLen, 
+                        uint32_t *exOffset, uint32_t *exLen,
+                        UBool *varT, UBool *top_,
+                        UBool startOfRules,
+                        UErrorCode *status); /* returns where parsing stopped; NULL on error or if no strength operator was found */

 #endif

--- a/icu4c/source/test/cintltst/cmsccoll.c
+++ b/icu4c/source/test/cintltst/cmsccoll.c
@ -25,6 +25,7 @@
 #include "callcoll.h"
 #include "unicode/ustring.h"
 #include "string.h"
+#include "ucol_imp.h"

 static UCollator *myCollation;
 const static UChar rules[MAX_TOKEN_LEN] =
@ -435,24 +436,55 @@ static void FunkyATest( )
    ucol_close(myCollation);
 }

+UColAttributeValue caseFirst[] = { /* UCOL_CASE_FIRST values iterated by PrintMarkDavis */
+    UCOL_OFF,
+    UCOL_LOWER_FIRST,
+    UCOL_UPPER_FIRST
+};
+
+
 UColAttributeValue alternateHandling[] = {
-  UCOL_NON_IGNORABLE,
+    UCOL_NON_IGNORABLE,
    UCOL_SHIFTED
 };

 UColAttributeValue caseLevel[] = {
-  UCOL_OFF,
+    UCOL_OFF,
    UCOL_ON
 };

 UColAttributeValue strengths[] = {
-  UCOL_PRIMARY,
+    UCOL_PRIMARY,
    UCOL_SECONDARY,
    UCOL_TERTIARY,
    UCOL_QUATERNARY,
    UCOL_IDENTICAL
 };

+char * caseFirstC[] = { /* printable names, same order as caseFirst[] */
+    "UCOL_OFF",
+    "UCOL_LOWER_FIRST",
+    "UCOL_UPPER_FIRST"
+};
+
+
+char * alternateHandlingC[] = { /* printable names, same order as alternateHandling[] */
+    "UCOL_NON_IGNORABLE",
+    "UCOL_SHIFTED"
+};
+
+char * caseLevelC[] = { /* printable names, same order as caseLevel[] */
+    "UCOL_OFF",
+    "UCOL_ON"
+};
+
+char * strengthsC[] = { /* printable names, same order as strengths[] */
+    "UCOL_PRIMARY",
+    "UCOL_SECONDARY",
+    "UCOL_TERTIARY",
+    "UCOL_QUATERNARY",
+    "UCOL_IDENTICAL"
+};


 static void PrintMarkDavis( )
@ -461,8 +493,10 @@ static void PrintMarkDavis( )
  UChar m[256];
  uint8_t sortkey[256];
  UCollator *coll = ucol_open(NULL, &status);
-  uint32_t i,j,k,l, sortkeysize;
+  uint32_t h,i,j,k, sortkeysize;
  uint32_t sizem = 0;
+  char buffer[512];
+  uint32_t len = 512;

  u_uastrcpy(m, "Mark Davis");
  sizem = u_strlen(m);
@ -475,20 +509,29 @@ static void PrintMarkDavis( )
  }
  fprintf(stderr, "\n");

-  for(i = 0; i<sizeof(alternateHandling)/sizeof(alternateHandling[0]); i++) {
-    ucol_setAttribute(coll, UCOL_ALTERNATE_HANDLING, alternateHandling[i], &status);
-    for(j = 0; j<sizeof(caseLevel)/sizeof(caseLevel[0]); j++) {
-      ucol_setAttribute(coll, UCOL_CASE_LEVEL, caseLevel[j], &status);
-      for(k = 0; k<sizeof(strengths)/sizeof(strengths[0]); k++) {
-        ucol_setAttribute(coll, UCOL_STRENGTH, strengths[k], &status);
-        sortkeysize = ucol_getSortKey(coll, m, sizem, sortkey, 256);
-        fprintf(stderr, "aH: %i, case: %i, st: %i\nSortkey: ", alternateHandling[i], caseLevel[j], strengths[k]);
-        for(l = 0; l<sortkeysize; l++) {
-          fprintf(stderr, "%02X", sortkey[l]);
+  for(h = 0; h<sizeof(caseFirst)/sizeof(caseFirst[0]); h++) { /* one sort key dump per attribute combination */
+    ucol_setAttribute(coll, UCOL_CASE_FIRST, caseFirst[h], &status); /* index with h; i is not initialized at this point */
+    fprintf(stderr, "caseFirst: %s\n", caseFirstC[h]);
+
+    for(i = 0; i<sizeof(alternateHandling)/sizeof(alternateHandling[0]); i++) {
+      ucol_setAttribute(coll, UCOL_ALTERNATE_HANDLING, alternateHandling[i], &status);
+      fprintf(stderr, "  AltHandling: %s\n", alternateHandlingC[i]);
+
+      for(j = 0; j<sizeof(caseLevel)/sizeof(caseLevel[0]); j++) {
+        ucol_setAttribute(coll, UCOL_CASE_LEVEL, caseLevel[j], &status);
+        fprintf(stderr, "    caseLevel: %s\n", caseLevelC[j]);
+
+        for(k = 0; k<sizeof(strengths)/sizeof(strengths[0]); k++) {
+          ucol_setAttribute(coll, UCOL_STRENGTH, strengths[k], &status);
+          sortkeysize = ucol_getSortKey(coll, m, sizem, sortkey, 256);
+          fprintf(stderr, "      strength: %s\n      Sortkey: ", strengthsC[k]);
+          fprintf(stderr, "%s\n", ucol_sortKeyToString(coll, sortkey, buffer, &len));
         }
-        fprintf(stderr, "\n");
+
       }
+
     }
+
   }
 }

@ -502,3 +545,477 @@ void addMiscCollTest(TestNode** root)
    /*addTest(root, &PrintMarkDavis, "tscoll/cmsccoll/PrintMarkDavis");*/
 }

+#if 0
+
+/* Ram's rule test */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "unicode\ucol.h"
+#include "unicode\ustdio.h"
+#include "unicode\ustring.h"
+#include "ucol_tok.h"
+#define AMP    '&' /* single characters that the helpers below treat as rule delimiters */
+#define GREAT  '<'
+#define EQUAL  '='
+#define COMA   ','
+#define SEMIC  ';'
+#define BRACKET '['
+#define ACCENT '@'
+#define AMP_STR    "&" /* the same delimiters as char strings */
+#define GREAT_STR  "<"
+#define EQUAL_STR  "="
+#define COMA_STR   ","
+#define SEMIC_STR  ";"
+#define DG_STR     "<<"
+#define TG_STR     "<<<"
+
+static FILE* file; /* stream the test* helpers below write their failure reports to */
+
+
+/* Re-encodes UTF-16 text from src as UTF-8 in dest and returns the number of bytes written. */
+int32_t transformUTF16ToUTF8(uint8_t *dest, int32_t destCapacity,
+                             const UChar *src, int32_t srcLength) {
+    int32_t in = 0;   /* read position in src */
+    int32_t out = 0;  /* write position in dest */
+    UChar32 codePoint;
+
+    while(in < srcLength && out < destCapacity) {
+        UTF_NEXT_CHAR(src, in, srcLength, codePoint);              /* decode one code point */
+        UTF8_APPEND_CHAR_SAFE(dest, out, destCapacity, codePoint); /* append it as UTF-8 */
+    }
+
+    return out;
+}
+/* Zero-fills the first len UChars of the buffer that *src points to; *src itself is not changed. */
+void resetBuf(UChar** src,int len){
+    UChar* cursor = *src;
+    int remaining;
+
+    for(remaining = len; remaining > 0; remaining--){
+        *cursor++ = '\0';
+    }
+}
+
+/* Returns a pointer to the first delimiter ('&', '=', ',', ';' or '<') within the */
+/* first srcLen UChars of source, or NULL when there is none.                      */
+UChar* findDelimiter(UChar* source,int srcLen){
+    int pos;
+
+    for(pos = 0; pos < srcLen; pos++){
+        UChar ch = source[pos];
+
+        if(ch == AMP   ||
+           ch == EQUAL ||
+           ch == COMA  ||
+           ch == SEMIC ||
+           ch == GREAT){
+            return source + pos;
+        }
+    }
+
+    return NULL;
+}
+/* Converts NUL-terminated unichars to US-ASCII in buf (len bytes, NUL included) using the escape callback; returns buf. */
+char *aescstrdup(const UChar* unichars, char* buf,int len){
+    char *target = buf;
+    char *targetLimit = buf+len-1; /* keep one byte free so the terminating NUL stays inside buf */
+    UConverterFromUCallback cb;
+    void *p;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    UConverter* conv = ucnv_open("US-ASCII",&errorCode);
+    if(U_SUCCESS(errorCode)){ /* without a converter the result is just an empty string */
+        ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, &cb, &p, &errorCode);
+        ucnv_fromUnicode(conv,&target,targetLimit, &unichars, (UChar*)(unichars+u_strlen(unichars)),NULL,TRUE,&errorCode);
+        ucnv_close(conv); /* release the converter opened above so it does not leak on every call */
+    }
+    *target = '\0';
+    return buf;
+}
+/* Expects p < q under col, also after prefixing p with U+00E0 and q with U+0061; failures are written to file. */
+void testPrimary(UCollator* col, const UChar* p,const UChar* q){
+    UChar source[256] = { '\0'};
+    UChar target[256] = { '\0'};
+    char utfSource[256] = {'\0'}; /* plain char: aescstrdup takes char* and the text is printed with %s */
+    char utfTarget[256] = {'\0'};
+    UCollationResult result = ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));
+
+    if(result!=UCOL_LESS){
+       aescstrdup(p,utfSource,256);
+       aescstrdup(q,utfTarget,256);
+       fprintf(file,"Primary failed  source: %s target: %s \n", utfSource,utfTarget);
+    }
+    source[0] = 0x00E0; /* the arrays are zero-filled, so u_strcat appends right after this prefix */
+    u_strcat(source,p);
+    target[0] = 0x0061;
+    u_strcat(target,q);
+    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));
+    if(result!=UCOL_LESS){
+       aescstrdup(source,utfSource,256);
+       aescstrdup(target,utfTarget,256);
+       fprintf(file,"Primary swamps 2nd failed  source: %s target: %s \n", utfSource,utfTarget);
+    }
+}
+   
+/* Expects p < q under col, also with U+0041/U+0061 prefixes, and p+"b" > q+"a"; failures are written to file. */
+void testSecondary(UCollator* col, const UChar* p,const UChar* q){
+    UChar source[256] = { '\0'};
+    UChar target[256] = { '\0'};
+    UChar lowerA[2] = { 0x0061, 0 }, lowerB[2] = { 0x0062, 0 }; /* real UChar strings; a cast char literal is not one */
+    char utfSource[256] = {'\0'};
+    char utfTarget[256] = {'\0'};
+    UCollationResult result= ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));
+
+    if(result!=UCOL_LESS){
+       aescstrdup(p,utfSource,256);
+       aescstrdup(q,utfTarget,256);
+       fprintf(file,"secondary failed  source: %s target: %s \n", utfSource,utfTarget);
+    }
+    source[0] = 0x0041;
+    u_strcat(source,p);
+    target[0]= 0x0061;
+    u_strcat(target,q);
+    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));
+    if(result!=UCOL_LESS){
+       aescstrdup(source,utfSource,256);
+       aescstrdup(target,utfTarget,256);
+       fprintf(file,"secondary swamps 3rd failed  source: %s target: %s \n",utfSource,utfTarget);
+    }
+    source[0] = '\0'; /* start over: source becomes p followed by "b" */
+    u_strcat(source,p);
+    u_strcat(source,lowerB);
+    target[0] = '\0'; /* target becomes q followed by "a" */
+    u_strcat(target,q);
+    u_strcat(target,lowerA);
+    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));
+    if(result!=UCOL_GREATER){
+       aescstrdup(source,utfSource,256);
+       aescstrdup(target,utfTarget,256);
+       fprintf(file,"secondary is swamped by 1  failed  source: %s target: %s \n",utfSource,utfTarget);
+    }
+}
+
+/* Expects p < q under col, also with U+0020/U+002D prefixes, and p+U+00E0 > q+"a"; failures are written to file. */
+void testTertiary(UCollator* col, const UChar* p,const UChar* q){
+    UChar source[256] = { '\0'};
+    UChar target[256] = { '\0'};
+    UChar temp[2] = {'\0'}; /* one-character UChar string used for the suffixes below */
+    char utfSource[256] = {'\0'};
+    char utfTarget[256] = {'\0'};
+    UCollationResult result= ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));
+    if(result!=UCOL_LESS){
+       aescstrdup(p,utfSource,256);
+       aescstrdup(q,utfTarget,256);
+       fprintf(file,"Tertiary failed  source: %s target: %s \n",utfSource,utfTarget);
+    }
+    source[0] = 0x0020;
+    u_strcat(source,p);
+    target[0]= 0x002D;
+    u_strcat(target,q);
+    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));
+    if(result!=UCOL_LESS){
+       aescstrdup(source,utfSource,256);
+       aescstrdup(target,utfTarget,256);
+       fprintf(file,"Tertiary swamps 4th failed  source: %s target: %s \n", utfSource,utfTarget);
+    }
+    source[0] = '\0'; /* start over: source becomes p followed by U+00E0 */
+    u_strcat(source,p);
+    *temp = 0x00E0; 
+    u_strcat(source,temp);
+    target[0] = '\0'; /* target becomes q followed by "a" */
+    u_strcat(target,q);
+    *temp = 0x0061; /* a cast char literal is not a NUL-terminated UChar string, so reuse temp */
+    u_strcat(target,temp);
+    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));
+    if(result!=UCOL_GREATER){
+       aescstrdup(source,utfSource,256);
+       aescstrdup(target,utfTarget,256);
+       fprintf(file,"Tertiary is swamped by 3rd failed  source: %s target: %s \n",utfSource,utfTarget);
+    }
+}
+/* Expects p and q to compare as UCOL_EQUAL under col; any other result is reported to file. */
+/* The report names the equality check so that it cannot be mistaken for the report of      */
+/* testPrimary.                                                                              */
+void testEquality(UCollator* col, const UChar* p,const UChar* q){
+    char utfSource[256] = {'\0'};
+    char utfTarget[256] = {'\0'};
+    UCollationResult result = ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));
+
+    if(result!=UCOL_EQUAL){
+       aescstrdup(p,utfSource,256);
+       aescstrdup(q,utfTarget,256);
+       fprintf(file,"Equality failed  source: %s target: %s \n", utfSource,utfTarget);
+    }
+}
+
+/* Runs the check that matches one rule relation between p and q.                        */
+/* strength selects it: 0 testEquality, 1 testPrimary, 2 testSecondary, 3 testTertiary;  */
+/* other values do nothing. (Presumably the codes set by consumeDelimiter - confirm.)    */
+/* delimiter is not used.                                                                */
+void testCollator(UCollator* col, const UChar* p,const UChar* q, UChar* delimiter,int strength){
+    (void)delimiter;
+
+    switch(strength){
+    case 0:
+        testEquality(col,p,q);
+        break;
+
+    case 1:
+        testPrimary(col,p,q);
+        break;
+    case 2:
+       testSecondary(col,p,q);
+       break;
+    case 3:
+       testTertiary(col,p,q);
+       break;
+    default:
+        break;
+    }
+}
+/*ar  bg ca cs da el en_BE en_US_POSIX es et fi fr hi hr hu is iw ja ko lt lv mk mt nb nn nn_NO pl ro ru sh sk sl sq sr sv th tr uk vi zh zh_TW*/
+UChar* consumeDelimiter(UChar** source, int srcLen,int* strength, UChar** delimiter){ /*
+      * Searches forward from *source for the next relation/reset character.
+      * The scan runs while the counter i stays below srcLen.
+      *
+      * Outputs: the matching case overwrites *strength (AMP 1, EQUAL 0,
+      * COMA 3, SEMIC 2, GREAT 1, 2 or 3) and *delimiter.  Several cases
+      * assign them before the apostrophe test, so they can change even
+      * when no delimiter ends up being reported.
+      *
+      * A character preceded by 0x0027 (apostrophe) is not reported as a
+      * delimiter; COMA and SEMIC are accepted without that test.
+      *
+      * Returns a pointer to the delimiter when it lies beyond *source.
+      * Returns NULL when the delimiter sits exactly at *source (then
+      * *source is advanced past it) or when the scan finds nothing.
+      */
+     UChar* local = *source;
+     UBool foundDelimiter = FALSE;
+     int i=0;
+     while(i<srcLen){
+        switch(*local){
+        case AMP:
+            *strength=1;
+            *delimiter = (UChar*)AMP_STR ;
+            if(*(local+1) == BRACKET ||*(local+2) == BRACKET  ){ /* NOTE(review): looks up to two UChars ahead without consulting srcLen */
+                local++; /* skipped without i++, so i no longer equals the offset from the start */
+                continue;
+            }
+            if(*(local-1)!= 0x0027) /* NOTE(review): local-1 lies before the buffer when local is its first element */
+                foundDelimiter = TRUE;
+            break;
+        case BRACKET:
+            {
+             if(*(local-1)!= 0x0027){
+                UChar* limit;
+                limit = findDelimiter(local,srcLen-i);
+                *source=local=limit; /* NOTE(review): result is used unchecked; parseAndPrintRules tests findDelimiter's result for NULL, so NULL may be possible here - confirm */
+                continue;
+             }
+            }
+            break;
+        case EQUAL :
+            *strength=0;
+            if(*(local-1)!= 0x0027){
+                *delimiter = (UChar*)EQUAL_STR;
+                foundDelimiter = TRUE;
+            }
+            break;
+        case COMA  : 
+            *strength = 3;
+            *delimiter =(UChar*)COMA_STR ;
+            foundDelimiter = TRUE;
+            break;
+        case SEMIC :
+            *delimiter = (UChar*)SEMIC_STR;
+            *strength = 2;
+            foundDelimiter = TRUE;
+            break;
+        case GREAT :
+             if(*(local+1)== GREAT){
+                 local++; /* now on the second GREAT of the run */
+                 if(*(local+2)==GREAT){ /* NOTE(review): inspects the UChar two past the second GREAT, not the one right after it - confirm */
+                     *delimiter = (UChar*)DG_STR; /* NOTE(review): if DG_STR/TG_STR stand for a double/triple sign, these two branches look swapped - confirm */
+                     *strength = 2;
+                     local++;
+                 }
+                 else{
+                      *delimiter = (UChar*)TG_STR;
+                      *strength =3;
+                 }
+            }
+            else{
+               *delimiter = (UChar*)GREAT_STR ;
+               *strength =1;
+            }
+            if(*(local-1)!= 0x0027) /* after the increments above local-1 is itself a GREAT, so this only filters the single-sign case */
+                foundDelimiter =TRUE;
+            break;
+        default:
+            break;
+        }
+        if(foundDelimiter){
+            if(local ==*source){ /* delimiter is the very first char: step over it and report "no token before it" */
+                *source = ++local;
+                return NULL;
+            }
+            else{
+                return local; /* *source is left alone; the caller gets the position reached in the delimiter */
+            }
+        }
+        local++;
+        i++;
+     }
+     return NULL; /* no delimiter within the scanned range */
+}
+/*
+ * Filtering copy: reads n UChars from src and appends to dst every one that
+ * is not a space (0x0020), a NUL or an apostrophe (0x0027).
+ *
+ * No terminator is written and dst is not bounds-checked; the destination
+ * must be large enough and already zero-filled if a terminated string is
+ * wanted.  Returns dst.
+ */
+UChar* istrncpy(UChar* dst,const UChar* src,int32_t n){
+    UChar *out = dst;
+    int32_t idx;
+
+    for(idx = 0; idx < n; idx++){
+        const UChar ch = src[idx];
+        if(ch == 0x0020 || ch == 0 || ch == 0x0027){
+            continue;   /* filtered character: consumed but not stored */
+        }
+        *out++ = ch;
+    }
+
+    return dst;
+}
+
+
+/*
+ * Splits the collation rules of one locale into tokens and, for every pair
+ * of neighbouring tokens, runs the comparison test that matches the relation
+ * found between them (see testCollator).  The test helpers log failures to
+ * the global `file`, which is opened here as "<loc>TestCases.txt" and closed
+ * again before returning.
+ *
+ * col     collator under test
+ * loc     locale name, used as the file-name prefix; may be NULL
+ * rules   rule text to walk
+ * length  number of UChars available in rules
+ */
+void parseAndPrintRules(UCollator* col,const char* loc, const UChar* rules, int length){
+    static const char suffix[] = "TestCases.txt";
+    /* A real UChar string: a narrow " " literal cast to UChar* has no UChar terminator. */
+    static const UChar blank[] = { 0x0020, 0 };
+    UChar *local = (UChar*)rules;
+    UChar current[20]={'\0'};
+    UChar previous[20]= {'\0'};
+    UChar *first =current, *second = previous;
+    UChar* delimiter = (UChar*)blank;
+    /* One slot stays free for the terminator: istrncpy() neither bounds nor terminates. */
+    const int maxToken = (int)(sizeof(current)/sizeof(current[0])) - 1;
+    /* -1 matches no case in testCollator, so a strength that was never set is harmless. */
+    int i = 0, strength = -1;
+    char fileName[64] = {'\0'};
+    UBool gotBoth = FALSE;
+
+    if(loc){
+        /* Locale ids such as "en_US_POSIX" plus the suffix do not fit in 20 bytes. */
+        if(strlen(loc) + sizeof(suffix) > sizeof(fileName)){
+            fprintf(stderr,"locale name too long for output file name: %s\n",loc);
+            return;
+        }
+        strcpy(fileName,loc);
+    }
+    strcat(fileName,suffix);
+    file = fopen(fileName,"wb");
+    if(file==NULL){
+        fprintf(stderr,"cannot open %s for writing\n",fileName);
+        return;
+    }
+
+    /* i only caps the number of iterations; positions are tracked through local. */
+    while((local-rules < length) && i<300){
+        /* Pass what is really left of the buffer, not length minus the iteration count. */
+        UChar* limit =consumeDelimiter(&local,length-(int)(local-rules),&strength,&delimiter);
+        if(limit==NULL ){
+            if(u_strcmp(delimiter ,(UChar*) AMP_STR)==0){
+                /* a reset starts a new chain: forget the previous token */
+                resetBuf(&first,20);
+            }
+            limit =findDelimiter(local,length-(local-rules));
+            if(limit==NULL){
+                /* no further delimiter: the token runs to the end of the rules */
+                limit= (UChar*)rules+length;
+            }
+        }
+
+        if(limit){
+            int tokenLen = (int)(limit-local);
+            if(tokenLen > maxToken){
+                /* overlong token: truncate instead of overrunning the 20-element buffers */
+                tokenLen = maxToken;
+            }
+            if(*first=='\0'){
+                istrncpy(first,local,tokenLen);
+                local=limit;
+            }
+            else{
+                if((local-rules) < length){
+                    istrncpy(second,local,tokenLen);
+                }
+                local=limit;
+                gotBoth=TRUE;
+            }
+        }
+        if(gotBoth){
+            /* tempFirst/tempSecond only feed the debug fprintf kept below */
+            unsigned char tempFirst[20] = {'\0'};
+            unsigned char tempSecond[20] = {'\0'};
+            aescstrdup(first,tempFirst,20);
+            aescstrdup(second,tempSecond,20);
+            //fprintf(file,"first:%s second: %s delimiter: %s strength:%i \n ",tempFirst,tempSecond,delimiter,strength);
+
+            testCollator(col,first,second,delimiter,strength);
+
+            /* the second token becomes the left-hand side of the next pair */
+            resetBuf(&first,20);
+            u_strcpy(first,second);
+            resetBuf(&second,20);
+            gotBoth=FALSE;
+        }
+        i++;
+    }
+
+    /* The stream was never closed before, leaking one FILE per locale. */
+    fclose(file);
+    file = NULL;
+}
+
+/*
+ * Second entry point for the rule walker; this is the one processRules()
+ * calls.
+ *
+ * The previous body had lost its loop header: it used `limit` without
+ * declaring or computing it and closed one brace more than it opened, so it
+ * could not compile.  What was left matched the loop body of
+ * parseAndPrintRules() line for line, hence the work is delegated there.
+ * NOTE(review): confirm that no behaviour specific to this variant was
+ * intended.
+ *
+ * col     collator under test
+ * loc     locale name, used as the output file-name prefix; may be NULL
+ * rules   rule text to walk
+ * length  number of UChars available in rules
+ */
+void parseAndPrintRules2(UCollator* col,const char* loc, const UChar* rules, int length){
+    parseAndPrintRules(col,loc,rules,length);
+}
+
+/*
+ * Opens the collator for `loc`, switches it to quaternary strength and runs
+ * the rule-driven comparison tests over that collator's rule string.
+ *
+ * loc  locale name handed to ucol_open(); when NULL no rules are fetched and
+ *      the parser is given an empty rule string.
+ */
+void processRules(const char* loc){
+    UErrorCode status = U_ZERO_ERROR;
+    UCollator* col = ucol_open(loc,&status);
+    int length=0;
+    /* Was left uninitialized (and still passed on) whenever loc was NULL. */
+    const UChar* rules = NULL;
+
+    if(U_FAILURE(status) || col==NULL){
+        fprintf(stderr,"could not open collator for %s (status %d)\n",
+                loc ? loc : "(null)", (int)status);
+        if(col){
+            ucol_close(col);
+        }
+        return;
+    }
+
+    if(loc){
+        rules = ucol_getRules(col,&length);
+    }
+    ucol_setAttribute(col,UCOL_STRENGTH,UCOL_QUATERNARY,&status);
+    if(U_FAILURE(status)){
+        fprintf(stderr,"could not set quaternary strength for %s (status %d)\n",
+                loc ? loc : "(null)", (int)status);
+    }
+    else{
+        parseAndPrintRules2(col,loc,rules,length);
+    }
+
+    /* The collator was never released before; main() opens one per argument. */
+    ucol_close(col);
+}
+
+
+/*
+ * Entry point: every command-line argument is passed to processRules(),
+ * which hands it to ucol_open() as a locale name.  Exits with status 1 and
+ * a usage line when no argument is given.
+ */
+extern int
+main(int argc, const char *argv[]) {
+    if(argc<2) {
+        /* The arguments are locale names, not the map files the old text mentioned. */
+        fprintf(stderr,
+               "usage: %s { locale }+\n",
+                argv[0]);
+        exit(1);
+    }
+
+    while(--argc>0) {
+        processRules(*++argv);
+    }
+
+    return 0;
+}
+
+#endif