ICU-7264 UCA 6.0 data, test data, and bug fixes; from branches/markus/uca60 -r 28826:28857

X-SVN-Rev: 28875
2025-04-08 23:10:40 +00:00 · 2010-10-19 21:48:04 +00:00 · 2010-10-19 21:48:04 +00:00 · d29bfdf854
commit d29bfdf854
parent 1c7566e3db
21 changed files with 77360 additions and 44460 deletions
--- a/icu4c/source/data/in/coll/invuca.icu
+++ b/icu4c/source/data/in/coll/invuca.icu
--- a/icu4c/source/data/in/coll/ucadata.icu
+++ b/icu4c/source/data/in/coll/ucadata.icu
--- a/icu4c/source/data/unidata/FractionalUCA.txt
+++ b/icu4c/source/data/unidata/FractionalUCA.txt
--- a/icu4c/source/data/unidata/UCARules.txt
+++ b/icu4c/source/data/unidata/UCARules.txt
--- a/icu4c/source/data/unidata/changes.txt
+++ b/icu4c/source/data/unidata/changes.txt
@ -204,7 +204,8 @@ Unicode 6.0 update
 - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt
 - update Han-implicit ranges for new CJK extensions:
  swapCJK() in ucol.cpp & ImplicitCEGenerator.java
- genuca: allow bytes 02 for U+FFFE, new merge-sort character
+- genuca: allow bytes 02 for U+FFFE, new merge-sort character;
+  do not add it into invuca so that tailoring primary-after an ignorable works
 - genuca: permit space between [variable top] bytes
 - ucol.cpp: treat noncharacters like unassigned rather than ignorable
 - run makeuca.sh:
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -63,7 +63,7 @@ static UChar32 fcdHighStart = 0;
 // implicit generation and supressing sort key compression
 // they should regularly be in the UCA, but if one
 // is running without UCA, it could be a problem
-static const int32_t maxRegularPrimary  = 0xA0;
+static const int32_t maxRegularPrimary  = 0x7A;
 static const int32_t minImplicitPrimary = 0xE0;
 static const int32_t maxImplicitPrimary = 0xE4;

@ -295,6 +295,29 @@ ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
 }

+void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {  // Appends one offset to offsetBuffer, growing the buffer when it is full; sets errorCode if the allocation fails.
+    if(U_FAILURE(errorCode)) {  // do nothing if errorCode already indicates a failure
+        return;
+    }
+    int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);  // number of offsets stored so far (0 while offsetStore is NULL)
+    if(length >= offsetBufferSize) {  // no room for another entry: move to a larger buffer
+        int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;  // twice the old capacity plus a fixed increment
+        int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));  // 4 == sizeof(int32_t)
+        if(newBuffer == NULL) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;  // nothing is appended; the old buffer and its contents are left untouched
+            return;
+        }
+        if(length > 0) {
+            uprv_memcpy(newBuffer, offsetBuffer, length * 4);  // carry over the offsets stored so far
+        }
+        uprv_free(offsetBuffer);
+        offsetBuffer = newBuffer;
+        offsetStore = offsetBuffer + length;  // write position keeps the same index in the new buffer
+        offsetBufferSize = newCapacity;
+    }
+    *offsetStore++ = offset;  // NOTE(review): assumes offsetStore is non-NULL whenever offsetBufferSize > 0 -- confirm against callers
+}
+
 /*
 * collIter_eos()
 *     Checks for a collIterate being positioned at the end of
@ -965,35 +988,63 @@ static int32_t
    min4Boundary = 0;

 static const UChar32
+    // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+    // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    CJK_BASE = 0x4E00,
-    CJK_LIMIT = 0x9FFF+1,
+    CJK_LIMIT = 0x9FCB+1,
+    // Unified CJK ideographs in the compatibility ideographs block.
    CJK_COMPAT_USED_BASE = 0xFA0E,
    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
+    // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+    // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    CJK_A_BASE = 0x3400,
-    CJK_A_LIMIT = 0x4DBF+1,
+    CJK_A_LIMIT = 0x4DB5+1,
+    // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
+    // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
    CJK_B_BASE = 0x20000,
-    CJK_B_LIMIT = 0x2A6DF+1;
+    CJK_B_LIMIT = 0x2A6D6+1,
+    // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
+    // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
+    CJK_C_BASE = 0x2A700,
+    CJK_C_LIMIT = 0x2B734+1,
+    // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
+    // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
+    CJK_D_BASE = 0x2B740,
+    CJK_D_LIMIT = 0x2B81D+1;
+    // when adding to this list, look for all occurrences (in project)
+    // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!

 static UChar32 swapCJK(UChar32 i) {
-
-    if (i >= CJK_BASE) {
-        if (i < CJK_LIMIT)              return i - CJK_BASE;
-
-        if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
-
-        if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
-                                                + (CJK_LIMIT - CJK_BASE);
-        if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
-
-        if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
-
-        return i + NON_CJK_OFFSET;  // non-CJK
+    if (i < CJK_A_BASE) {
+        // non-CJK
+    } else if (i < CJK_A_LIMIT) {
+        // Extension A has lower code points than the original Unihan+compat
+        // but sorts higher.
+        return i - CJK_A_BASE
+                + (CJK_LIMIT - CJK_BASE)
+                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
+    } else if (i < CJK_BASE) {
+        // non-CJK
+    } else if (i < CJK_LIMIT) {
+        return i - CJK_BASE;
+    } else if (i < CJK_COMPAT_USED_BASE) {
+        // non-CJK
+    } else if (i < CJK_COMPAT_USED_LIMIT) {
+        return i - CJK_COMPAT_USED_BASE
+                + (CJK_LIMIT - CJK_BASE);
+    } else if (i < CJK_B_BASE) {
+        // non-CJK
+    } else if (i < CJK_B_LIMIT) {
+        return i; // non-BMP-CJK
+    } else if (i < CJK_C_BASE) {
+        // non-CJK
+    } else if (i < CJK_C_LIMIT) {
+        return i; // non-BMP-CJK
+    } else if (i < CJK_D_BASE) {
+        // non-CJK
+    } else if (i < CJK_D_LIMIT) {
+        return i; // non-BMP-CJK
    }
-    if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
-
-    if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
-                                                + (CJK_LIMIT - CJK_BASE)
-                                                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
    return i + NON_CJK_OFFSET; // non-CJK
 }

@ -1599,23 +1650,6 @@ void collPrevIterNormalize(collIterate *data)
    */
    data->writableBuffer.insert(0, (UChar)0);

-    if (data->offsetBuffer == NULL) {
-        int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
-
-        data->offsetBufferSize = len;
-        data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
-        data->offsetStore = data->offsetBuffer;
-    } else if(data->offsetBufferSize < normLen) {
-        int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer);
-        int32_t *tob    = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
-
-        if (tob != NULL) {
-            data->offsetBuffer = tob;
-            data->offsetStore = &data->offsetBuffer[storeIX];
-            data->offsetBufferSize = normLen + 1;
-        }
-    }
-
    /*
     * The usual case at this point is that we've got a base
     * character followed by marks that were normalized. If
@ -1660,13 +1694,13 @@ void collPrevIterNormalize(collIterate *data)
            }
        }

-        *(data->offsetStore++) = baseOffset;
+        data->appendOffset(baseOffset, status);
    }

-    *(data->offsetStore++) = firstMarkOffset;
+    data->appendOffset(firstMarkOffset, status);

    for (int32_t i = 0; i < trailCount; i += 1) {
-        *(data->offsetStore++) = trailOffset;
+        data->appendOffset(trailOffset, status);
    }

    data->offsetRepeatValue = trailOffset;
@ -1748,26 +1782,92 @@ inline UBool collPrevIterFCD(collIterate *data)
    return result;
 }

-/** gets a character from the string at a given offset
+/** gets a code unit from the string at a given offset
 *  Handles both normal and iterative cases.
 *  No error checking - caller beware!
 */
-inline static
-UChar peekCharacter(collIterate *source, int32_t offset) {
+static inline
+UChar peekCodeUnit(collIterate *source, int32_t offset) {
    if(source->pos != NULL) {
        return *(source->pos + offset);
    } else if(source->iterator != NULL) {
+        UChar32 c;
        if(offset != 0) {
            source->iterator->move(source->iterator, offset, UITER_CURRENT);
-            UChar toReturn = (UChar)source->iterator->next(source->iterator);
+            c = source->iterator->next(source->iterator);
            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
-            return toReturn;
        } else {
-            return (UChar)source->iterator->current(source->iterator);
+            c = source->iterator->current(source->iterator);
+        }
+        return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
+    } else {
+        return 0xfffd;
+    }
+}
+
+// Code point version. Treats the offset as a _code point_ delta.
+// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
+// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
+static inline
+UChar32 peekCodePoint(collIterate *source, int32_t offset) {
+    UChar32 c;
+    if(source->pos != NULL) {
+        const UChar *p = source->pos;
+        if(offset >= 0) {
+            // Skip forward over offset code points.
+            while(--offset >= 0) {
+                if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
+                    ++p;
+                }
+            }
+            // Read the code point there.
+            c = *p++;
+            UChar trail;
+            if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
+                c = U16_GET_SUPPLEMENTARY(c, trail);
+            }
+        } else /* offset<0 */ {
+            // Skip backward over (-offset-1) code points.
+            while(++offset < 0) {
+                if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
+                    --p;
+                }
+            }
+            // Read the code point before that.
+            c = *--p;
+            UChar lead;
+            if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
+                c = U16_GET_SUPPLEMENTARY(lead, c);
+            }
+        }
+    } else if(source->iterator != NULL) {
+        if(offset >= 0) {
+            // Skip forward over offset code points.
+            int32_t fwd = offset;
+            while(fwd-- > 0) {
+                uiter_next32(source->iterator);
+            }
+            // Read the code point there.
+            c = uiter_current32(source->iterator);
+            // Return to the starting point, skipping backward over offset code points.
+            while(offset-- > 0) {
+                uiter_previous32(source->iterator);
+            }
+        } else /* offset<0 */ {
+            // Read backward, reading offset code points, remember only the last-read one.
+            int32_t back = offset;
+            do {
+                c = uiter_previous32(source->iterator);
+            } while(++back < 0);
+            // Return to the starting position, skipping forward over offset code points.
+            do {
+                uiter_next32(source->iterator);
+            } while(++offset < 0);
        }
    } else {
-        return (UChar)U_SENTINEL;
+        c = U_SENTINEL;
    }
+    return c;
 }

 /**
@ -1830,7 +1930,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
-				data->offsetStore  = data->offsetBuffer;
+                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
@ -2304,7 +2404,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,

          backupState(source, &discState);

-    buffer.setTo(peekCharacter(source, -1));
+    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
@ -2312,7 +2412,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
        uint32_t  result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
-            || (peekCharacter(source, 0) == 0  &&
+            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
@ -2322,7 +2422,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguos change */
-                 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
+                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
@ -2350,8 +2450,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
        }
        else {
            if (u_getCombiningClass(schar) ==
-                u_getCombiningClass(peekCharacter(source, -2))) {
-                //u_getCombiningClass(*(source->pos - 2))) {
+                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
@ -2390,17 +2489,9 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
 }

-static
-inline UBool isNonChar(UChar32 cp) {
-    return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
-}
-
 /* now uses Mark's getImplicitPrimary code */
 static
 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
-    if(isNonChar(cp)) {
-        return 0;
-    }
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
@ -3128,7 +3219,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
            /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
            /* two things can happen here: next code point can be a trailing surrogate - we will use it */
            /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
-            /* we return 0 (completely ignorable - per UCA specification */
+            /* we treat it like an unassigned code point. */
            {
                UChar trail;
                collIterateState state;
@ -3137,7 +3228,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
                    // we chould have stepped one char forward and it might have turned that it
                    // was not a trail surrogate. In that case, we have to backup.
                    loadState(source, &state, TRUE);
-                    return 0;
+                    return UCOL_NOT_FOUND;
                } else {
                    /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
                    CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
@ -3158,19 +3249,16 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
                    source->iterator->next(source->iterator);
                    return getImplicit(cp, source);
-                } else {
-                    return 0;
                }
            } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
-                U_IS_TRAIL((nextChar=*source->pos))) {
-                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
-                    source->pos++;
-                    return getImplicit(cp, source);
-            } else {
-                return 0; /* completely ignorable */
+                      U_IS_TRAIL((nextChar=*source->pos))) {
+                cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
+                source->pos++;
+                return getImplicit(cp, source);
            }
+            return UCOL_NOT_FOUND;
        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
-            return 0; /* broken surrogate sequence */
+            return UCOL_NOT_FOUND; /* broken surrogate sequence */
        case CHARSET_TAG:
            /* not yet implemented */
            /* probably after 1.8 */
@ -3189,36 +3277,27 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
 /* now uses Mark's getImplicitPrimary code */
 static
 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
-    if(isNonChar(cp)) {
-        return 0;
-    }
-
    uint32_t r = uprv_uca_getImplicitPrimary(cp);

    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
    collationSource->toReturn = collationSource->CEpos;

-	if (collationSource->offsetBuffer == NULL) {
-		collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-		collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-		collationSource->offsetStore = collationSource->offsetBuffer;
-	}
+    // **** doesn't work if using iterator ****
+    if (collationSource->flags & UCOL_ITER_INNORMBUF) {
+        collationSource->offsetRepeatCount = 1;
+    } else {
+        int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);

-	// **** doesn't work if using iterator ****
-	if (collationSource->flags & UCOL_ITER_INNORMBUF) {
-	  collationSource->offsetRepeatCount = 1;
-	} else {
-	  int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        collationSource->appendOffset(firstOffset, errorCode);
+        collationSource->appendOffset(firstOffset + 1, errorCode);

-	  *(collationSource->offsetStore++) = firstOffset;
-	  *(collationSource->offsetStore++) = firstOffset + 1;
-
-		collationSource->offsetReturn = collationSource->offsetStore - 1;
-		*(collationSource->offsetBuffer) = firstOffset;
-		if (collationSource->offsetReturn == collationSource->offsetBuffer) {
-			collationSource->offsetStore = collationSource->offsetBuffer;
-		}
-	}
+        collationSource->offsetReturn = collationSource->offsetStore - 1;
+        *(collationSource->offsetBuffer) = firstOffset;
+        if (collationSource->offsetReturn == collationSource->offsetBuffer) {
+            collationSource->offsetStore = collationSource->offsetBuffer;
+        }
+    }

    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
 }
@ -3297,7 +3376,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                        // it's easy for BMP code points
                        if(isZeroCE == 0) {
                            continue;
-                        } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
+                        } else if(U16_IS_SURROGATE(schar)) {
                            // for supplementary code points, we have to check the next one
                            // situations where we are going to ignore
                            // 1. beginning of the string: schar is a lone surrogate
@ -3306,9 +3385,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                            //    that is explicitly set to zero.
                            if (!collIter_bos(source)) {
                                UChar lead;
-                                if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
+                                if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
                                    isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
-                                    if(getCETag(isZeroCE) == SURROGATE_TAG) {
+                                    if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
                                        uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
                                        if(finalCE == 0) {
                                            // this is a real, assigned completely ignorable code point
@ -3317,12 +3396,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                                        }
                                    }
                                } else {
-                                    // lone surrogate, completely ignorable
-                                    continue;
+                                    // lone surrogate, treat like unassigned
+                                    return UCOL_NOT_FOUND;
                                }
                            } else {
-                                // lone surrogate at the beggining, completely ignorable
-                                continue;
+                                // lone surrogate at the beginning, treat like unassigned
+                                return UCOL_NOT_FOUND;
                            }
                        }
                        // Source string char was not in the table.
@ -3350,7 +3429,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
            the forward iteration. this will ensure that the obstinate problem of
            overlapping contractions will not occur.
            */
-            schar = peekCharacter(source, 0);
+            schar = peekCodeUnit(source, 0);
            constart = (UChar *)coll->image + getContractOffset(CE);
            if (isAtStartPrevIterate(source)
                /* commented away contraction end checks after adding the checks
@ -3441,17 +3520,11 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
            }

-            if (source->offsetBuffer == NULL) {
-                source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-                source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-                source->offsetStore = source->offsetBuffer;
-            }
-
            while (CE != UCOL_NO_MORE_CES) {
                *(source->CEpos ++) = CE;

                if (offsetBias >= 0) {
-                    *(source->offsetStore ++) = rawOffset + offsetBias;
+                    source->appendOffset(rawOffset + offsetBias, *status);
                }

                CECount++;
@ -3462,38 +3535,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                    this bail*/
                    if (!increaseCEsCapacity(source)) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
-                        if (strbuffer != buffer) {
-                            uprv_free(strbuffer);
-                        }
-
-                        return (uint32_t)UCOL_NULLORDER;
+                        break;
                    }

                    endCEBuffer = source->extendCEs + source->extendCEsSize;
                }

-                if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
-                    int32_t  storeIX = (int32_t)(source->offsetStore - source->offsetBuffer);
-                    int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
-                        sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
-
-                    if (tob != NULL) {
-                        source->offsetBuffer = tob;
-                        source->offsetStore = &source->offsetBuffer[storeIX];
-                        source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
-                    } else {
-                        // memory error...
-                        *status = U_MEMORY_ALLOCATION_ERROR;
-                        source->CEpos = source->CEs;
-
-                        if (strbuffer != buffer) {
-                            uprv_free(strbuffer);
-                        }
-
-                        return (uint32_t) UCOL_NULLORDER;
-                    }
-                }
-
                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
                    rawOffset = (int32_t)(temp.fcdPosition - temp.string);
                } else {
@ -3503,6 +3550,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                CE = ucol_IGetNextCE(coll, &temp, status);
            }

+            if (strbuffer != buffer) {
+                uprv_free(strbuffer);
+            }
+            if (U_FAILURE(*status)) {
+                return (uint32_t)UCOL_NULLORDER;
+            }
+
 			if (source->offsetRepeatValue != 0) {
                if (CECount > noChars) {
 				    source->offsetRepeatCount += temp.offsetRepeatCount;
@ -3512,10 +3566,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                }
 			}

-            if (strbuffer != buffer) {
-                uprv_free(strbuffer);
-            }
-
            if (offsetBias >= 0) {
                source->offsetReturn = source->offsetStore - 1;
                if (source->offsetReturn == source->offsetBuffer) {
@ -3536,26 +3586,20 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
                source->toReturn = source->CEpos - 1;

-				if (source->offsetBuffer == NULL) {
-					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-					source->offsetStore = source->offsetBuffer;
-				}
-
-				if (source->flags & UCOL_ITER_INNORMBUF) {
+                if (source->flags & UCOL_ITER_INNORMBUF) {
                    source->offsetRepeatCount = 1;
-				} else {
-				  int32_t firstOffset = (int32_t)(source->pos - source->string);
+                } else {
+                    int32_t firstOffset = (int32_t)(source->pos - source->string);

-				  *(source->offsetStore++) = firstOffset;
-				  *(source->offsetStore++) = firstOffset + 1;
+                    source->appendOffset(firstOffset, *status);
+                    source->appendOffset(firstOffset + 1, *status);

-					source->offsetReturn = source->offsetStore - 1;
-					*(source->offsetBuffer) = firstOffset;
-					if (source->offsetReturn == source->offsetBuffer) {
-						source->offsetStore = source->offsetBuffer;
-					}
-				}
+                    source->offsetReturn = source->offsetStore - 1;
+                    *(source->offsetBuffer) = firstOffset;
+                    if (source->offsetReturn == source->offsetBuffer) {
+                        source->offsetStore = source->offsetBuffer;
+                    }
+                }


                return *(source->toReturn);
@ -3579,12 +3623,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                }
            }

-            if (source->offsetBuffer == NULL) {
-                source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-                source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-                source->offsetStore = source->offsetBuffer;
-            }
-
            /* find the offset to expansion table */
            CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
            size     = getExpansionCount(CE);
@ -3598,7 +3636,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                    *(source->CEpos ++) = *CEOffset++;

                    if (firstOffset >= 0) {
-                        *(source->offsetStore ++) = firstOffset + 1;
+                        source->appendOffset(firstOffset + 1, *status);
                    }
                }
            } else {
@ -3607,7 +3645,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                    *(source->CEpos ++) = *CEOffset ++;

                    if (firstOffset >= 0) {
-                        *(source->offsetStore ++) = firstOffset + 1;
+                        source->appendOffset(firstOffset + 1, *status);
                    }
                }
            }
@ -3891,15 +3929,8 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                V += VBase;
                T += TBase;

-				if (source->offsetBuffer == NULL) {
-					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-					source->offsetStore = source->offsetBuffer;
-				}
-
-			  int32_t firstOffset = (int32_t)(source->pos - source->string);
-
-			  *(source->offsetStore++) = firstOffset;
+                int32_t firstOffset = (int32_t)(source->pos - source->string);
+                source->appendOffset(firstOffset, *status);

                /*
                 * return the first CE, but first put the rest into the expansion buffer
@ -3907,21 +3938,21 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                if (!source->coll->image->jamoSpecial) {
                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
-					*(source->offsetStore++) = firstOffset + 1;
+                    source->appendOffset(firstOffset + 1, *status);

-					if (T != TBase) {
+                    if (T != TBase) {
                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
-					    *(source->offsetStore++) = firstOffset + 1;
-					}
+                        source->appendOffset(firstOffset + 1, *status);
+                    }

                    source->toReturn = source->CEpos - 1;

-					source->offsetReturn = source->offsetStore - 1;
-					if (source->offsetReturn == source->offsetBuffer) {
-						source->offsetStore = source->offsetBuffer;
-					}
-					
-					return *(source->toReturn);
+                    source->offsetReturn = source->offsetStore - 1;
+                    if (source->offsetReturn == source->offsetBuffer) {
+                        source->offsetStore = source->offsetBuffer;
+                    }
+
+                    return *(source->toReturn);
                } else {
                    // Since Hanguls pass the FCD check, it is
                    // guaranteed that we won't be in
@ -3971,13 +4002,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
            return getPrevImplicit(ch, source);

        case SURROGATE_TAG:  /* This is a surrogate pair */
-            /* essentialy an engaged lead surrogate. */
+            /* essentially an engaged lead surrogate. */
            /* if you have encountered it here, it means that a */
            /* broken sequence was encountered and this is an error */
-            return 0;
+            return UCOL_NOT_FOUND;

        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
-            return 0; /* broken surrogate sequence */
+            return UCOL_NOT_FOUND; /* broken surrogate sequence */

        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
            {
@ -3986,7 +4017,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                const UChar *prev;
                if (isAtStartPrevIterate(source)) {
                    /* we are at the start of the string, wrong place to be at */
-                    return 0;
+                    return UCOL_NOT_FOUND;
                }
                if (source->pos != source->writableBuffer.getBuffer()) {
                    prev     = source->pos - 1;
@ -4000,7 +4031,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                    cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
                    source->pos = prev;
                } else {
-                    return 0; /* completely ignorable */
+                    return UCOL_NOT_FOUND; /* like unassigned */
                }

                return getPrevImplicit(cp, source);
@ -4241,6 +4272,14 @@ ucol_getSortKeyWithAllocation(const UCollator *coll,

 #define UCOL_FSEC_BUF_SIZE 256

+// Is this primary weight lead byte compressible in a sort key?
+// TRUE only if UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 <= maxRegularPrimary; FALSE for multi-lead-byte scripts (digits, Latin, Han, implicit).
+// TODO: This should use per-lead-byte flags from FractionalUCA.txt; until then the coll argument is unused.
+static inline UBool
+isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
+    return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
+}
+
 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
@ -4330,7 +4369,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
        } else {
            wasShifted = FALSE;
            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
-            /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
+            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will be zero with non-zero primary1. */
            /* calculate sortkey size */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
@ -4344,19 +4383,13 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
                            /* one byter, not compressed */
                            currentSize++;
                            leadPrimary = 0;
-                        }
-                        else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
-                            //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
-                            //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
-                            (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
-                        {
-                            /* not compressible */
-                            leadPrimary = 0;
-                            currentSize+=2;
-                        }
-                        else { /* compress */
+                        } else if(isCompressible(coll, primary1)) {
+                            /* compress */
                            leadPrimary = primary1;
                            currentSize+=2;
+                        } else {
+                            leadPrimary = 0;
+                            currentSize+=2;
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
@ -4678,9 +4711,7 @@ ucol_calcSortKey(const    UCollator    *coll,
    if(U_FAILURE(*status)) {
        return 0;
    }
-    if(source == normSource.getBuffer()) {
-        s.flags &= ~UCOL_ITER_NORM;
-    }
+    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.

    if(resultLength == 0 || primaries == NULL) {
        return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
@ -4781,7 +4812,7 @@ ucol_calcSortKey(const    UCollator    *coll,
            } else {
                wasShifted = FALSE;
                /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
-                /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
+                /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will be zero with non-zero primary1. */
                /* regular and simple sortkey calc */
                if(primary1 != UCOL_IGNORABLE) {
                    if(notIsContinuation) {
@ -4795,20 +4826,18 @@ ucol_calcSortKey(const    UCollator    *coll,
                                /* one byter, not compressed */
                                *primaries++ = primary1;
                                leadPrimary = 0;
-                            } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
-                                //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
-                                (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
-                                    /* not compressible */
-                                    leadPrimary = 0;
-                                    *primaries++ = primary1;
-                                    if(primaries <= primarySafeEnd) {
-                                        *primaries++ = primary2;
-                                    }
-                            } else { /* compress */
+                            } else if(isCompressible(coll, primary1)) {
+                                /* compress */
                                *primaries++ = leadPrimary = primary1;
                                if(primaries <= primarySafeEnd) {
                                    *primaries++ = primary2;
                                }
+                            } else {
+                                leadPrimary = 0;
+                                *primaries++ = primary1;
+                                if(primaries <= primarySafeEnd) {
+                                    *primaries++ = primary2;
+                                }
                            }
                        }
                    } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
@ -4957,9 +4986,7 @@ ucol_calcSortKey(const    UCollator    *coll,
                        finished = TRUE;
                        break;
                    }
-                    if(source == normSource.getBuffer()) {
-                        s.flags &= ~UCOL_ITER_NORM;
-                    }
+                    s.flags &= ~UCOL_ITER_NORM;
                    sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
                    *status = U_BUFFER_OVERFLOW_ERROR;
                    finished = TRUE;
@ -5283,9 +5310,7 @@ ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
    if(U_FAILURE(*status)) {
        return 0;
    }
-    if(source == normSource.getBuffer()) {
-        s.flags &= ~UCOL_ITER_NORM;
-    }
+    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.

    if(resultLength == 0 || primaries == NULL) {
        return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
@ -5346,7 +5371,7 @@ ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
            primary1 = (uint8_t)(order >> 8);

            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
-            /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
+            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
            /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
@ -5361,17 +5386,14 @@ ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
                            /* one byter, not compressed */
                            *primaries++ = primary1;
                            leadPrimary = 0;
-                        } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
-                            //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
-                            //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
-                            (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
-                                /* not compressible */
-                                leadPrimary = 0;
-                                *primaries++ = primary1;
-                                *primaries++ = primary2;
-                        } else { /* compress */
+                        } else if(isCompressible(coll, primary1)) {
+                            /* compress */
                            *primaries++ = leadPrimary = primary1;
                            *primaries++ = primary2;
+                        } else {
+                            leadPrimary = 0;
+                            *primaries++ = primary1;
+                            *primaries++ = primary2;
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
@ -5450,9 +5472,7 @@ ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
                        finished = TRUE;
                        break;
                    }
-                    if(source == normSource.getBuffer()) {
-                        s.flags &= ~UCOL_ITER_NORM;
-                    }
+                    s.flags &= ~UCOL_ITER_NORM;
                    sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
                    *status = U_BUFFER_OVERFLOW_ERROR;
                    finished = TRUE;
@ -7583,7 +7603,7 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
            tCE = tCEs.pos-2;
            for(;;) {
                while (secS == 0 && sCE >= sCEs.buf) {
-                    if(sCESave == 0) {
+                    if(sCESave == NULL) {
                        secS = *(sCE--);
                        if(isContinuation(secS)) {
                            while(isContinuation(secS = *(sCE--)))
@ -7597,7 +7617,8 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
                        secS = *(sCE++);
                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
                            sCE = sCESave;            /* reset the pointer to before continuation */
-                            sCESave = 0;
+                            sCESave = NULL;
+                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
@ -7605,7 +7626,7 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
                }

                while(secT == 0 && tCE >= tCEs.buf) {
-                    if(tCESave == 0) {
+                    if(tCESave == NULL) {
                        secT = *(tCE--);
                        if(isContinuation(secT)) {
                            while(isContinuation(secT = *(tCE--)))
@ -7619,7 +7640,8 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
                        secT = *(tCE++);
                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
                            tCE = tCESave;          /* reset the pointer to before continuation */
-                            tCESave = 0;
+                            tCESave = NULL;
+                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@ -512,8 +512,10 @@ static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t
        }
    }

-    if(low == 0) {
-        low = 0x01000000;
+    if(low < 0x02000000) {
+        // We must not use CE weight byte 02, so we set it as the minimum lower bound.
+        // See http://site.icu-project.org/design/collation/bytes
+        low = 0x02000000;
    }

    if(strength == UCOL_SECONDARY) { /* similar as simple */
@ -761,7 +763,7 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    }
-    tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
+    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->strength);
@ -769,7 +771,7 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
    } while(tok != NULL);
    fprintf(stderr, "\n");

-    tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
+    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->toInsert);
--- a/icu4c/source/i18n/ucol_elm.cpp
+++ b/icu4c/source/i18n/ucol_elm.cpp
@ -1402,12 +1402,13 @@ U_CDECL_END
 #ifdef UCOL_DEBUG
 // This is a debug function to print the contents of a trie.
 // It is used in conjuction with the code around utrie_unserialize call
-void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
+UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    if(start<0x10000) {
        fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
    } else {
        fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value);
    }
+    return TRUE;
 }

 int32_t 
@ -1541,7 +1542,7 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
        if(U_SUCCESS(*status)) {
            utrie_enum(&UCAt, NULL, enumRange, NULL);
        }
-        trieWord = UTRIE_GET32_FROM_LEAD(UCAt, 0xDC01) 
+        trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01);
    }
 #endif
    tableOffset += paddedsize(mappingSize);
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -300,6 +300,10 @@ typedef struct collIterate : public U_NAMESPACE_QUALIFIER UMemory {

  UCharIterator *iterator;
  /*int32_t iteratorIndex;*/
+
+  // The offsetBuffer should probably be a UVector32, but helper functions
+  // are an improvement over duplicated code.
+  void appendOffset(int32_t offset, UErrorCode &errorCode);
 } collIterate;

 #else
@ -630,7 +634,8 @@ ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *val
 #define getExpansionSuffix(coleiter) ((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn)
 #define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - leftoverces)

-/* This is an enum that lists magic special byte values from the fractional UCA */
+/* This is an enum that lists magic special byte values from the fractional UCA.
+ * See also http://site.icu-project.org/design/collation/bytes */
 /* TODO: all the #defines that refer to special byte values from the UCA should be changed to point here */

 enum {
@ -642,9 +647,9 @@ enum {
    UCOL_BYTE_FIRST_TAILORED = 0x04,
    UCOL_BYTE_COMMON = 0x05,
    UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON,
-    UCOL_CODAN_PLACEHOLDER = 0x27,
-    UCOL_BYTE_LAST_LATIN_PRIMARY = 0x4C,
-    UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x4D,
+    /* TODO: Make the following values dynamic since they change with almost every UCA version. */
+    UCOL_CODAN_PLACEHOLDER = 0x12,
+    UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x5B,
    UCOL_BYTE_UNSHIFTED_MAX = 0xFF
 }; 

--- a/icu4c/source/i18n/ucol_res.cpp
+++ b/icu4c/source/i18n/ucol_res.cpp
@ -397,9 +397,9 @@ ucol_openRules( const UChar        *rules,
        /* so something might be done here... or on lower level */
 #ifdef UCOL_DEBUG
        if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
-            fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
+            fprintf(stderr, "bad option starting at offset %i\n", (int)(src.current-src.source));
        } else {
-            fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
+            fprintf(stderr, "invalid rule just before offset %i\n", (int)(src.current-src.source));
        }
 #endif
        goto cleanup;
--- a/icu4c/source/i18n/ucol_wgt.cpp
+++ b/icu4c/source/i18n/ucol_wgt.cpp
@ -508,7 +508,7 @@ ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
    }
 }

-#ifdef UCOL_DEBUG
+#if 0 // #ifdef UCOL_DEBUG

 static void
 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
--- a/icu4c/source/test/cintltst/capitst.c
+++ b/icu4c/source/test/cintltst/capitst.c
@ -1947,7 +1947,7 @@ static void TestShortString(void)
        uint32_t   expectedIdentifier;
    } testCases[] = {
        /*
-         * The following expectedOutput contains a collation weight (2D00 from UCA 5.2)
+         * The following expectedOutput contains a collation weight (2700 from UCA 6.0)
         * which is the primary weight for the T character (U+0041) in the input.
         * When that character gets a different weight in FractionalUCA.txt,
         * the expectedOutput needs to be adjusted.
@ -1955,7 +1955,7 @@ static void TestShortString(void)
         * in such a way that the absolute weight for 'A' changes,
         * we will get a test failure here and need to adjust the test case.
         */
-        {"LDE_RDE_KPHONEBOOK_T0041_ZLATN","B2D00_KPHONEBOOK_LDE", "de@collation=phonebook", U_USING_FALLBACK_WARNING, 0, 0 },
+        {"LDE_RDE_KPHONEBOOK_T0041_ZLATN","B2700_KPHONEBOOK_LDE", "de@collation=phonebook", U_USING_FALLBACK_WARNING, 0, 0 },

        {"LEN_RUS_NO_AS_S4","AS_LROOT_NO_S4", NULL, U_USING_DEFAULT_WARNING, 0, 0 },
        {"LDE_VPHONEBOOK_EO_SI","EO_KPHONEBOOK_LDE_SI", "de@collation=phonebook", U_ZERO_ERROR, 0, 0 },
--- a/icu4c/source/test/cintltst/citertst.c
+++ b/icu4c/source/test/cintltst/citertst.c
@ -35,6 +35,7 @@
 #include "cstring.h"
 #include "ucol_imp.h"
 #include "ucol_tok.h"
+#include "uparse.h"
 #include <stdio.h>

 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
@ -1028,41 +1029,40 @@ static int32_t hex2num(char hex) {
 * @param codepoints array for storage, assuming size > 5
 * @return position at the end of the codepoint section
 */
-static char * getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
-    char *pStartCP = str;
-    char *pEndCP   = str + 4;
-
-    *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
-                          (hex2num(*(pStartCP + 1)) << 8) |
-                          (hex2num(*(pStartCP + 2)) << 4) |
-                          (hex2num(*(pStartCP + 3))));
-    if (*pEndCP == '|' || *(pEndCP+1) == '|') {
-        /* pre-context rule */
-        pStartCP = pEndCP;
-        while (*pStartCP==' ' || *pStartCP== '|' ) {
-            pStartCP++;
-        }
-        pEndCP = pStartCP+4;
-        *contextCPs = *codepoints;
-        *(++codepoints) = (UChar)((hex2num(*pStartCP) << 12) |
-                                  (hex2num(*(pStartCP + 1)) << 8) |
-                                  (hex2num(*(pStartCP + 2)) << 4) |
-                                  (hex2num(*(pStartCP + 3))));
-        contextCPs++;
-    }
-    *contextCPs = 0;
-    codepoints ++;
-    while (*pEndCP != ';') {
-        pStartCP = pEndCP + 1;
-        *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
-                          (hex2num(*(pStartCP + 1)) << 8) |
-                          (hex2num(*(pStartCP + 2)) << 4) |
-                          (hex2num(*(pStartCP + 3))));
-        codepoints ++;
-        pEndCP = pStartCP + 4;
-    }
+static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    char *semi = uprv_strchr(str, ';');
+    char *pipe = uprv_strchr(str, '|');
+    char *s;
    *codepoints = 0;
-    return pEndCP + 1;
+    *contextCPs = 0;
+    if(semi == NULL) {
+        log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
+        return str;
+    }
+    if(pipe != NULL) {
+        int32_t contextLength;
+        *pipe = 0;
+        contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode);
+        *pipe = '|';
+        if(U_FAILURE(errorCode)) {
+            log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
+            return str;
+        }
+        /* prepend the precontext string to the codepoints */
+        u_memcpy(codepoints, contextCPs, contextLength);
+        codepoints += contextLength;
+        /* start of the code point string */
+        s = pipe + 1;
+    } else {
+        s = str;
+    }
+    u_parseString(s, codepoints, 99, NULL, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
+        return str;
+    }
+    return semi + 1;
 }

 /**
@ -1262,7 +1262,7 @@ static FileStream * getFractionalUCA(void)
 */
 static void TestCEs() {
    FileStream *file = NULL;
-    char        line[1024];
+    char        line[2048];
    char       *str;
    UChar       codepoints[10];
    uint32_t    ces[20];
@ -1525,7 +1525,6 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
    UBool result = FALSE;
    UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;
    const char * collLocale;
-    char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];

    if (U_FAILURE(status)) {
        log_err("Error creating iterator for testing validity\n");
@ -1547,6 +1546,22 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
        if (ce == 0) {
            continue;
        }
+        if (ce == 0x02000202) {
+            /* special CE for merge-sort character */
+            if (*codepoints == 0xFFFE /* && length == 1 */) {
+                /*
+                 * Note: We should check for length==1 but the token parser appears
+                 * to give us trailing NUL characters.
+                 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
+                 *                     rather than the internal collation rule parser
+                 */
+                continue;
+            } else {
+                log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
+                        (int)*codepoints, (int)length);
+                break;
+            }
+        }
        primary   = UCOL_PRIMARYORDER(ce);
        p1 = primary >> 8;
        p2 = primary & 0xFF;
@ -1603,8 +1618,7 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
                break;
            }
            if (tertiary <= 2) {
-                showCodepoints(codepoints, length, codepointText);
-                log_err("Tertiary byte of %08lX out of range: locale %s, codepoints %s\n", (long)ce, collLocale, codepointText);
+                log_err("Tertiary byte of %08lX out of range\n", (long)ce);
                break;
            }
            tertiaryDone = FALSE;
@ -1656,14 +1670,18 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
            if (tertiary == 0) {
                tertiaryDone = TRUE;
            } else if (tertiary <= 2) {
-                showCodepoints(codepoints, length, codepointText);
-                log_err("Tertiary byte of %08lX out of range: locale %s, codepoints %s\n", (long)ce, collLocale, codepointText);
+                log_err("Tertiary byte of %08lX out of range\n", (long)ce);
                break;
            }
        }
-   }
-   ucol_closeElements(iter);
-   return result;
+    }
+    if (!result) {
+        char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
+        showCodepoints(codepoints, length, codepointText);
+        log_err("Locale: %s  Code point string: %s\n", collLocale, codepointText);
+    }
+    ucol_closeElements(iter);
+    return result;
 }

 static void TestCEValidity()
@ -1676,7 +1694,7 @@ static void TestCEValidity()
    char        locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
    const char *loc;
    FileStream *file = NULL;
-    char        line[1024];
+    char        line[2048];
    UChar       codepoints[10];
    int         count = 0;
    int         maxCount = 0;
@ -1883,7 +1901,7 @@ static void TestSortKeyValidity(void)
    /* tailored locales */
    char        locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
    FileStream *file = NULL;
-    char        line[1024];
+    char        line[2048];
    UChar       codepoints[10];
    int         count = 0;
    UChar       contextCPs[5];
@ -1906,6 +1924,10 @@ static void TestSortKeyValidity(void)
        }

        getCodePoints(line, codepoints, contextCPs);
+        if(codepoints[0] == 0xFFFE) {
+            /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
+            continue;
+        }
        checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
    }

@ -1976,6 +1998,10 @@ static void TestSortKeyValidity(void)
                uprv_memcpy(codepoints, src.source + chOffset,
                                                       chLen * sizeof(UChar));
                codepoints[chLen] = 0;
+                if(codepoints[0] == 0xFFFE) {
+                    /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
+                    continue;
+                }
                checkSortKeyValidity(coll, codepoints, chLen);
            }
            free(rulesCopy);
--- a/icu4c/source/test/cintltst/cmsccoll.c
+++ b/icu4c/source/test/cintltst/cmsccoll.c
@ -3102,8 +3102,15 @@ static void TestVariableTopSetting(void) {
        varTop1 = ucol_setVariableTop(coll, conts, 3, &status);
      }
      if(U_FAILURE(status)) {
-        log_err("Couldn't set variable top to a contraction %04X %04X %04X\n",
-          *conts, *(conts+1), *(conts+2));
+        if(status == U_PRIMARY_TOO_LONG_ERROR) {
+          /* ucol_setVariableTop() is documented to not accept 3-byte primaries,
+           * therefore it is not an error when it complains about them. */
+          log_verbose("Couldn't set variable top to a contraction %04X %04X %04X - U_PRIMARY_TOO_LONG_ERROR\n",
+                      *conts, *(conts+1), *(conts+2));
+        } else {
+          log_err("Couldn't set variable top to a contraction %04X %04X %04X - %s\n",
+                  *conts, *(conts+1), *(conts+2), u_errorName(status));
+        }
        status = U_ZERO_ERROR;
      }
      conts+=3;
@ -3153,10 +3160,11 @@ static void TestVariableTopSetting(void) {

 static void TestNonChars(void) {
  static const char *test[] = {
-    "\\u0000",
-    "\\uFFFE", "\\uFFFF",
-      "\\U0001FFFE", "\\U0001FFFF",
-      "\\U0002FFFE", "\\U0002FFFF",
+      "\\u0000",  /* ignorable */
+      "\\uFFFE",  /* special merge-sort character with minimum non-ignorable weights */
+      "\\uFDD0", "\\uFDEF",
+      "\\U0001FFFE", "\\U0001FFFF",  /* UCA 6.0: noncharacters are treated like unassigned, */
+      "\\U0002FFFE", "\\U0002FFFF",  /* not like ignorable. */
      "\\U0003FFFE", "\\U0003FFFF",
      "\\U0004FFFE", "\\U0004FFFF",
      "\\U0005FFFE", "\\U0005FFFF",
@ -3170,7 +3178,8 @@ static void TestNonChars(void) {
      "\\U000DFFFE", "\\U000DFFFF",
      "\\U000EFFFE", "\\U000EFFFF",
      "\\U000FFFFE", "\\U000FFFFF",
-      "\\U0010FFFE", "\\U0010FFFF"
+      "\\U0010FFFE", "\\U0010FFFF",
+      "\\uFFFF"  /* special character with maximum primary weight */
  };
  UErrorCode status = U_ZERO_ERROR;
  UCollator *coll = ucol_open("en_US", &status);
@ -3178,7 +3187,7 @@ static void TestNonChars(void) {
  log_verbose("Test non characters\n");

  if(U_SUCCESS(status)) {
-    genericOrderingTestWithResult(coll, test, 35, UCOL_EQUAL);
+    genericOrderingTestWithResult(coll, test, 35, UCOL_LESS);
  } else {
    log_err_status(status, "Unable to open collator\n");
  }
@ -3634,13 +3643,31 @@ static void TestRuleOptions(void) {
        {  "c", "b", "\\u0009", "a", "\\u000a" }, 5
    },

+    /*
+     * These strings contain the last character before [variable top]
+     * and the first and second characters (by primary weights) after it.
+     * See FractionalUCA.txt. For example:
+        [last variable [0C FE, 05, 05]] # U+10A7F OLD SOUTH ARABIAN NUMERIC INDICATOR
+        [variable top = 0C FE]
+        [first regular [0D 0A, 05, 05]] # U+0060 GRAVE ACCENT
+       and
+        00B4; [0D 0C, 05, 05]
+     *
+     * Note: Starting with UCA 6.0, the [variable top] collation element
+     * is not the weight of any character or string,
+     * which means that LAST_VARIABLE_CHAR_STRING sorts before [last variable].
+     */
+#define LAST_VARIABLE_CHAR_STRING "\\U00010A7F"
+#define FIRST_REGULAR_CHAR_STRING "\\u0060"
+#define SECOND_REGULAR_CHAR_STRING "\\u00B4"
+
    { "&[last variable]<a &[before 3][last variable]<<<c<<<b ",
-        {  "c", "b", "\\uD834\\uDF71", "a", "\\u02d0" }, 5
+        { LAST_VARIABLE_CHAR_STRING, "c", "b", /* [last variable] */ "a", FIRST_REGULAR_CHAR_STRING }, 5
    },

    { "&[first regular]<a"
      "&[before 1][first regular]<b",
-      { "b", "\\u02d0", "a", "\\u02d1"}, 4
+      { "b", FIRST_REGULAR_CHAR_STRING, "a", SECOND_REGULAR_CHAR_STRING }, 4
    },

    /*
@ -3648,11 +3675,17 @@ static void TestRuleOptions(void) {
     * has to match the character that has the [last regular] weight
     * which changes with each UCA version.
     * See the bottom of FractionalUCA.txt which says something like
-     *   [last regular [CE 27, 05, 05]] # U+1342E EGYPTIAN HIEROGLYPH AA032
+        [last regular [7A FE, 05, 05]] # U+1342E EGYPTIAN HIEROGLYPH AA032
+     *
+     * Note: Starting with UCA 6.0, the [last regular] collation element
+     * is not the weight of any character or string,
+     * which means that LAST_REGULAR_CHAR_STRING sorts before [last regular].
     */
+#define LAST_REGULAR_CHAR_STRING "\\U0001342E"
+
    { "&[before 1][last regular]<b"
      "&[last regular]<a",
-        { "b", "\\U0001342E", "a", "\\u4e00" }, 4
+        { LAST_REGULAR_CHAR_STRING, "b", /* [last regular] */ "a", "\\u4e00" }, 4
    },

    { "&[before 1][first implicit]<b"
@ -3670,7 +3703,7 @@ static void TestRuleOptions(void) {
      "&[last secondary ignorable]<<y"
      "&[last tertiary ignorable]<<<w"
      "&[top]<u",
-      {"\\ufffb",  "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7
+      {"\\ufffb",  "w", "y", "\\u20e3", "x", LAST_VARIABLE_CHAR_STRING, "z", "u"}, 7
    }

  };
--- a/icu4c/source/test/intltest/apicoll.cpp
+++ b/icu4c/source/test/intltest/apicoll.cpp
@ -1284,23 +1284,23 @@ void CollationAPITest::TestMaxExpansion()

        size = coll.getMaxExpansion(order);
        if (U_FAILURE(status) || size < count) {
-            errln("Failure at codepoint %d, maximum expansion count < %d\n",
-                  ch, count);
+            errln("Failure at codepoint U+%04X, maximum expansion count %d < %d",
+                  ch, size, count);
        }
    }

    /* testing for exact max expansion */
+    int32_t size;
    ch = 0;
    while (ch < 0x61) {
        uint32_t order;
-        int32_t  size;
        str.setCharAt(0, ch);
        iter->setText(str, status);
        order = iter->previous(status);
        size  = coll.getMaxExpansion(order);
        if (U_FAILURE(status) || size != 1) {
-            errln("Failure at codepoint %d, maximum expansion count < %d\n",
-                ch, 1);
+            errln("Failure at codepoint U+%04X, maximum expansion count %d != %d",
+                  ch, size, 1);  /* operator mirrors the size != 1 check */
        }
        ch ++;
    }
@ -1309,29 +1309,29 @@ void CollationAPITest::TestMaxExpansion()
    str.setTo(ch);
    iter->setText(str, status);
    temporder = iter->previous(status);
-
-    if (U_FAILURE(status) || coll.getMaxExpansion(temporder) != 3) {
-        errln("Failure at codepoint %d, maximum expansion count != %d\n",
-              ch, 3);
+    size = coll.getMaxExpansion(temporder);
+    if (U_FAILURE(status) || size != 3) {
+        errln("Failure at codepoint U+%04X, CE %08x, maximum expansion count %d != %d",
+              ch, temporder, size, 3);
    }

    ch = 0x64;
    str.setTo(ch);
    iter->setText(str, status);
    temporder = iter->previous(status);
-
-    if (U_FAILURE(status) || coll.getMaxExpansion(temporder) != 1) {
-        errln("Failure at codepoint %d, maximum expansion count != %d\n",
-                ch, 3);
+    size = coll.getMaxExpansion(temporder);
+    if (U_FAILURE(status) || size != 1) {
+        errln("Failure at codepoint U+%04X, CE %08x, maximum expansion count %d != %d",
+              ch, temporder, size, 1);
    }

    str.setTo(unassigned);
    iter->setText(str, status);
    sorder = iter->previous(status);
-
-    if (U_FAILURE(status) || coll.getMaxExpansion(sorder) != 2) {
-        errln("Failure at supplementary codepoints, maximum expansion count < %d\n",
-              2);
+    size = coll.getMaxExpansion(sorder);
+    if (U_FAILURE(status) || size != 2) {
+        errln("Failure at supplementary codepoints, maximum expansion count %d != %d",
+              size, 2);  /* operator mirrors the size != 2 check */
    }

    /* testing jamo */
@ -1339,9 +1339,10 @@ void CollationAPITest::TestMaxExpansion()
    str.setTo(ch);
    iter->setText(str, status);
    temporder = iter->previous(status);
-    if (U_FAILURE(status) || coll.getMaxExpansion(temporder) > 3) {
-        errln("Failure at codepoint %d, maximum expansion count > %d\n",
-              ch, 3);
+    size = coll.getMaxExpansion(temporder);
+    if (U_FAILURE(status) || size > 3) {
+        errln("Failure at codepoint U+%04X, maximum expansion count %d > %d",
+              ch, size, 3);
    }

    delete iter;
@ -1352,9 +1353,10 @@ void CollationAPITest::TestMaxExpansion()
    RuleBasedCollator jamocoll(rule, status);
    iter = jamocoll.createCollationElementIterator(str);
    temporder = iter->previous(status);
-    if (U_FAILURE(status) || iter->getMaxExpansion(temporder) != 6) {
-        errln("Failure at codepoint %d, maximum expansion count > %d\n",
-              ch, 5);
+    size = iter->getMaxExpansion(temporder);
+    if (U_FAILURE(status) || size != 6) {
+        errln("Failure at codepoint U+%04X, maximum expansion count %d != %d",
+              ch, size, 6);  /* report the expected value actually tested (6) */
    }

    delete iter;
--- a/icu4c/source/test/intltest/dadrcoll.cpp
+++ b/icu4c/source/test/intltest/dadrcoll.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 1997-2009, International Business Machines Corporation and
+ * Copyright (c) 1997-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -196,11 +196,12 @@ DataDrivenCollatorTest::processTest(TestData *testData) {
        if(U_SUCCESS(status)) {
          logln("Testing collator for rules "+testSetting);
        } else {
-          errln("Unable to instantiate collator for rules "+testSetting);
+          errln("Unable to instantiate collator for rules "+testSetting+" - "+u_errorName(status));
          return;
        }
      } else {
        errln("No collator definition!");
+        return;
      }
    }
    
--- a/icu4c/source/test/intltest/ssearch.cpp
+++ b/icu4c/source/test/intltest/ssearch.cpp
@ -74,6 +74,7 @@ SSearchTest::~SSearchTest()

 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
 {
+    static const UVersionInfo icu47 = { 4, 7, 0, 0 };
    if (exec) logln("TestSuite SSearchTest: ");
    switch (index) {
 #if !UCONFIG_NO_BREAK_ITERATION
@ -82,7 +83,7 @@ void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
            break;

        case 1: name = "offsetTest";
-            if (exec) offsetTest();
+            if (exec && isICUVersionAtLeast(icu47)) offsetTest();
            break;

        case 2: name = "monkeyTest";
@ -90,7 +91,7 @@ void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
            break;

        case 3: name = "bmMonkeyTest";
-            if (exec) bmMonkeyTest(params);
+            if (exec && isICUVersionAtLeast(icu47)) bmMonkeyTest(params);
            break;

        case 4: name = "boyerMooreTest";
--- a/icu4c/source/test/intltest/tsmthred.cpp
+++ b/icu4c/source/test/intltest/tsmthred.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 1999-2009, International Business Machines Corporation and
+ * Copyright (c) 1999-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -860,34 +860,34 @@ public:
    virtual void run() {
        //sleep(10000);
        int32_t line = 0;
-        
+
        uint8_t sk1[1024], sk2[1024];
        uint8_t *oldSk = NULL, *newSk = sk1;
        int32_t resLen = 0, oldLen = 0;
        int32_t i = 0;
-        
+
        for(i = 0; i < noLines; i++) {
            resLen = ucol_getSortKey(coll, lines[i].buff, lines[i].buflen, newSk, 1024);
-            
+
            int32_t res = 0, cmpres = 0, cmpres2 = 0;
-            
+
            if(oldSk != NULL) {
                res = strcmp((char *)oldSk, (char *)newSk);
                cmpres = ucol_strcoll(coll, lines[i-1].buff, lines[i-1].buflen, lines[i].buff, lines[i].buflen);
                cmpres2 = ucol_strcoll(coll, lines[i].buff, lines[i].buflen, lines[i-1].buff, lines[i-1].buflen);
                //cmpres = res;
                //cmpres2 = -cmpres;
-                
+
                if(cmpres != -cmpres2) {
                    error("Compare result not symmetrical on line "+ line);
                    break;
                }
-                
+
                if(((res&0x80000000) != (cmpres&0x80000000)) || (res == 0 && cmpres != 0) || (res != 0 && cmpres == 0)) {
                    error(UnicodeString("Difference between ucol_strcoll and sortkey compare on line ")+ UnicodeString(line));
                    break;
                }
-                
+
                if(res > 0) {
                    error(UnicodeString("Line %i is not greater or equal than previous line ")+ UnicodeString(i));
                    break;
@ -896,20 +896,24 @@ public:
                    if (res == 0) {
                        error(UnicodeString("Probable error in test file on line %i (comparing identical strings)")+ UnicodeString(i));
                        break;
-                    } else if (res > 0) {
-                        error(UnicodeString("Sortkeys are identical, but code point comapare gives >0 on line ")+ UnicodeString(i));
+                    }
+                    /*
+                     * UCA 6.0 test files can have lines that compare == if they are
+                     * different strings but canonically equivalent; check disabled:
+                    else if (res > 0) {
+                        error(UnicodeString("Sortkeys are identical, but code point compare gives >0 on line ")+ UnicodeString(i));
                        break;
                    }
+                     */
                }
            }
-            
+
            oldSk = newSk;
            oldLen = resLen;
-            
+
            newSk = (newSk == sk1)?sk2:sk1;
        }
    }
-    
 };

 void MultithreadTest::TestCollators()
--- a/icu4c/source/test/intltest/ucaconf.cpp
+++ b/icu4c/source/test/intltest/ucaconf.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2002-2009, International Business Machines Corporation and
+ * Copyright (c) 2002-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -223,11 +223,16 @@ void UCAConformanceTest::testConformance(UCollator *coll)
                if (res == 0) {
                    errln("Probable error in test file on line %i (comparing identical strings)", line);
                    errln("  Data line %s", lineB);
-                } else if (res > 0) {
-                    errln("Sortkeys are identical, but code point comapare gives >0 on line %i", line);
+                }
+                /*
+                 * UCA 6.0 test files can have lines that compare == if they are
+                 * different strings but canonically equivalent; check disabled:
+                else if (res > 0) {
+                    errln("Sortkeys are identical, but code point compare gives >0 on line %i", line);
                    errln("  Previous data line %s", oldLineB);
                    errln("  Current data line  %s", lineB);
                }
+                 */
            }
        }

--- a/icu4c/source/test/testdata/CollationTest_NON_IGNORABLE_SHORT.txt
+++ b/icu4c/source/test/testdata/CollationTest_NON_IGNORABLE_SHORT.txt
--- a/icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt
+++ b/icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt