ICU-11449 reorder single scripts not groups, scripts/groups can start on top-16-bit boundaries, data formatVersion 5 for new scripts data and optional reorderRanges appended to reorderCodes

X-SVN-Rev: 36925
This commit is contained in:
Markus Scherer 2015-01-07 03:49:20 +00:00
parent a9d7c3e4bd
commit e65a679a26
16 changed files with 684 additions and 377 deletions

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* Collation.java, ported from collation.h/.cpp
@ -587,9 +587,5 @@ public final class Collation {
return makeCE(unassignedPrimaryFromCodePoint(c));
}
/**
 * Applies a lead-byte permutation to a 32-bit primary weight.
 *
 * @param reorderTable table indexed by the primary's lead byte (bits 31..24);
 *                     each entry is read as an unsigned byte and becomes the new lead byte
 * @param primary      primary weight in the low 32 bits of the long
 * @return the new lead byte in bits 31..24 combined with the unchanged lower 24 bits
 */
static long reorder(byte[] reorderTable, long primary) {
    // The cast binds tighter than the shift: narrow to int first, then shift
    // unsigned so that the index is always in 0..255.
    int leadByte = (int)primary >>> 24;
    long reorderedLead = reorderTable[leadByte] & 0xffL;
    long lowerBytes = primary & 0xffffff;
    return (reorderedLead << 24) | lowerBytes;
}
// private Collation() // No instantiation.
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationCompare.java, ported from collationcompare.h/.cpp
@ -79,10 +79,9 @@ public final class CollationCompare /* all static */ {
if (leftPrimary != rightPrimary) {
// Return the primary difference, with script reordering.
byte[] reorderTable = settings.reorderTable;
if (reorderTable != null) {
leftPrimary = Collation.reorder(reorderTable, leftPrimary);
rightPrimary = Collation.reorder(reorderTable, rightPrimary);
if (settings.hasReordering()) {
leftPrimary = settings.reorder(leftPrimary);
rightPrimary = settings.reorder(rightPrimary);
}
return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
}
@ -335,10 +334,9 @@ public final class CollationCompare /* all static */ {
if (leftQuaternary != rightQuaternary) {
// Return the difference, with script reordering.
byte[] reorderTable = settings.reorderTable;
if (reorderTable != null) {
leftQuaternary = Collation.reorder(reorderTable, leftQuaternary);
rightQuaternary = Collation.reorder(reorderTable, rightQuaternary);
if (settings.hasReordering()) {
leftQuaternary = settings.reorder(leftQuaternary);
rightQuaternary = settings.reorder(rightQuaternary);
}
return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationData.java, ported from collationdata.h/.cpp
@ -16,6 +16,7 @@ import com.ibm.icu.impl.Trie2_32;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUException;
/**
* Collation data container.
@ -25,6 +26,14 @@ import com.ibm.icu.text.UnicodeSet;
* Includes data for the collation base (root/default), aliased if this is not the base.
*/
public final class CollationData {
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
// parallel with the ranges, and resetting ranges that are indexed.
// The reordering builder code could clone the resulting template array.
static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
CollationData(Normalizer2Impl nfc) {
nfcImpl = nfc;
}
@ -182,12 +191,8 @@ public final class CollationData {
* or 0 if the script is unknown
*/
long getFirstPrimaryForGroup(int script) {
int index = findScript(script);
if(index < 0) {
return 0;
}
long head = scripts[index];
return (head & 0xff00) << 16;
int index = getScriptIndex(script);
return index == 0 ? 0 : (long)scriptStarts[index] << 16;
}
/**
@ -197,13 +202,12 @@ public final class CollationData {
* or 0 if the script is unknown
*/
public long getLastPrimaryForGroup(int script) {
int index = findScript(script);
if(index < 0) {
int index = getScriptIndex(script);
if(index == 0) {
return 0;
}
int head = scripts[index];
long lastByte = head & 0xff;
return ((lastByte + 1) << 24) - 1;
long limit = scriptStarts[index + 1];
return (limit << 16) - 1;
}
/**
@ -211,108 +215,154 @@ public final class CollationData {
* @return the first script of the group, or -1 if the weight is beyond the last group
*/
public int getGroupForPrimary(long p) {
p >>= 24; // Reordering groups are distinguished by primary lead bytes.
for(int i = 0; i < scripts.length; i = i + 2 + scripts[i + 1]) {
int lastByte = scripts[i] & 0xff;
if(p <= lastByte) {
return scripts[i + 2];
p >>= 16;
if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
return -1;
}
int index = 1;
while(p >= scriptStarts[index + 1]) { ++index; }
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
return i;
}
}
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
if(scriptsIndex[numScripts + i] == index) {
return Collator.ReorderCodes.FIRST + i;
}
}
return -1;
}
private int findScript(int script) {
if(script < 0 || 0xffff < script) { return -1; }
for(int i = 0; i < scripts.length;) {
int limit = i + 2 + scripts[i + 1];
for(int j = i + 2; j < limit; ++j) {
if(script == scripts[j]) { return i; }
private int getScriptIndex(int script) {
if(script < 0) {
return 0;
} else if(script < numScripts) {
return scriptsIndex[script];
} else if(script < Collator.ReorderCodes.FIRST) {
return 0;
} else {
script -= Collator.ReorderCodes.FIRST;
if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
return scriptsIndex[numScripts + script];
} else {
return 0;
}
i = limit;
}
return -1;
}
public int[] getEquivalentScripts(int script) {
int i = findScript(script);
if(i < 0) { return EMPTY_INT_ARRAY; }
int length = scripts[i + 1];
assert(length != 0);
int dest[] = new int[length];
i += 2;
dest[0] = scripts[i++];
for(int j = 1; j < length; ++j) {
script = scripts[i++];
// Sorted insertion.
for(int k = j;; --k) {
// Invariant: dest[k] is free to receive either script or dest[k - 1].
if(k > 0 && script < dest[k - 1]) {
dest[k] = dest[k - 1];
} else {
dest[k] = script;
break;
}
int index = getScriptIndex(script);
if(index == 0) { return EMPTY_INT_ARRAY; }
if(script >= Collator.ReorderCodes.FIRST) {
// Special groups have no aliases.
return new int[] { script };
}
int length = 0;
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
++length;
}
}
int[] dest = new int[length];
if(length == 1) {
dest[0] = script;
return dest;
}
length = 0;
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
dest[length++] = i;
}
}
return dest;
}
/**
* Writes the permutation table for the given reordering of scripts and groups,
* mapping from default-order primary-weight lead bytes to reordered lead bytes.
* Writes the permutation of primary-weight ranges
* for the given reordering of scripts and groups.
* The caller checks for illegal arguments and
* takes care of [DEFAULT] and memory allocation.
*
* <p>Each list element will be a (limit, offset) pair as described
* for the CollationSettings.reorderRanges.
* The list will be empty if no ranges are reordered.
*/
public void makeReorderTable(int[] reorder, byte[] table) {
void makeReorderRanges(int[] reorder, UVector32 ranges) {
// Delegates to the private three-argument overload,
// passing false for its latinMustMove flag.
makeReorderRanges(reorder, false, ranges);
}
private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
ranges.removeAllElements();
int length = reorder.length;
// Initialize the table.
if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
return;
}
// Maps each script-or-group range to a new lead byte.
short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[]
{
// Set "don't care" values for reserved ranges.
int index = scriptsIndex[
numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
if(index != 0) {
table[index] = 0xff;
}
index = scriptsIndex[
numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
if(index != 0) {
table[index] = 0xff;
}
}
// Never reorder special low and high primary lead bytes.
int lowByte;
for(lowByte = 0; lowByte <= Collation.MERGE_SEPARATOR_BYTE; ++lowByte) {
table[lowByte] = (byte)lowByte;
}
// lowByte == 03
int highByte;
for(highByte = 0xff; highByte >= Collation.TRAIL_WEIGHT_BYTE; --highByte) {
table[highByte] = (byte)highByte;
}
// highByte == FE
// Set intermediate bytes to 0 to indicate that they have not been set yet.
for(int i = lowByte; i <= highByte; ++i) {
table[i] = 0;
}
assert(scriptStarts.length >= 2);
assert(scriptStarts[0] == 0);
int lowStart = scriptStarts[1];
assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
int highLimit = scriptStarts[scriptStarts.length - 1];
assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
// Get the set of special reorder codes in the input list.
// This supports up to 32 special reorder codes;
// This supports a fixed number of special reorder codes;
// it works for data with codes beyond Collator.ReorderCodes.LIMIT.
int specials = 0;
for(int i = 0; i < length; ++i) {
int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
if(0 <= reorderCode && reorderCode <= 31) {
if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
specials |= 1 << reorderCode;
}
}
// Start the reordering with the special low reorder codes that do not occur in the input.
for(int i = 0;; i += 3) {
if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
int reorderCode = scripts[i + 2] - Collator.ReorderCodes.FIRST;
if(reorderCode < 0) { break; } // Went beyond special reorder codes.
if((specials & (1 << reorderCode)) == 0) {
int head = scripts[i];
int firstByte = head >> 8;
int lastByte = head & 0xff;
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
int index = scriptsIndex[numScripts + i];
if(index != 0 && (specials & (1 << i)) == 0) {
lowStart = addLowScriptRange(table, index, lowStart);
}
}
// Reorder according to the input scripts, continuing from the bottom of the bytes range.
// Skip the reserved range before Latin if Latin is the first script,
// so that we do not move it unnecessarily.
int skippedReserved = 0;
if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
int index = scriptsIndex[UScript.LATIN];
assert(index != 0);
int start = scriptStarts[index];
assert(lowStart <= start);
skippedReserved = start - lowStart;
lowStart = start;
}
// Reorder according to the input scripts, continuing from the bottom of the primary range.
boolean hasReorderToEnd = false;
for(int i = 0; i < length;) {
int script = reorder[i++];
if(script == UScript.UNKNOWN) {
// Put the remaining scripts at the top.
hasReorderToEnd = true;
while(i < length) {
script = reorder[--length];
if(script == UScript.UNKNOWN) { // Must occur at most once.
@ -323,17 +373,14 @@ public final class CollationData {
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = findScript(script);
if(index < 0) { continue; }
int head = scripts[index];
int firstByte = head >> 8;
int lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
int index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
do { table[lastByte--] = (byte)highByte--; } while(firstByte <= lastByte);
highLimit = addHighScriptRange(table, index, highLimit);
}
break;
}
@ -343,25 +390,82 @@ public final class CollationData {
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = findScript(script);
if(index < 0) { continue; }
int head = scripts[index];
int firstByte = head >> 8;
int lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
int index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
lowStart = addLowScriptRange(table, index, lowStart);
}
// Put all remaining scripts into the middle.
// Avoid table[0] which must remain 0.
for(int i = 1; i <= 0xff; ++i) {
if(table[i] == 0) { table[i] = (byte)lowByte++; }
for(int i = 1; i < scriptStarts.length - 1; ++i) {
int leadByte = table[i];
if(leadByte != 0) { continue; }
int start = scriptStarts[i];
if(!hasReorderToEnd && start > lowStart) {
// No need to move this script.
lowStart = start;
}
lowStart = addLowScriptRange(table, i, lowStart);
}
assert(lowByte == highByte + 1);
if(lowStart > highLimit) {
if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
// Try not skipping the before-Latin reserved range.
makeReorderRanges(reorder, true, ranges);
return;
}
// We need more primary lead bytes than available, despite the reserved ranges.
throw new ICUException(
"setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
}
// Turn lead bytes into a list of (limit, offset) pairs.
// Encode each pair in one list element:
// Upper 16 bits = limit, lower 16 = signed lead byte offset.
int offset = 0;
for(int i = 1;; ++i) {
int nextOffset = offset;
while(i < scriptStarts.length - 1) {
int newLeadByte = table[i];
if(newLeadByte == 0xff) {
// "Don't care" lead byte for reserved range, continue with current offset.
} else {
nextOffset = newLeadByte - (scriptStarts[i] >> 8);
if(nextOffset != offset) { break; }
}
++i;
}
if(offset != 0 || i < scriptStarts.length - 1) {
ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
}
if(i == scriptStarts.length - 1) { break; }
offset = nextOffset;
}
}
/**
 * Assigns the range at scriptStarts[index] a new lead byte taken from lowStart
 * and returns the lowStart value that follows the moved range.
 *
 * @param table    receives the new lead byte (lowStart >> 8) at table[index]
 * @param index    index into scriptStarts; the range is [scriptStarts[index], scriptStarts[index + 1])
 * @param lowStart 16-bit position (lead byte in bits 15..8, second byte in bits 7..0)
 *                 at which the range is to be placed
 * @return the updated lowStart
 */
private int addLowScriptRange(short[] table, int index, int lowStart) {
    final int start = scriptStarts[index];
    // If the range's original second byte is below lowStart's second byte,
    // move up to the next lead byte.
    if((lowStart & 0xff) > (start & 0xff)) {
        lowStart += 0x100;
    }
    table[index] = (short)(lowStart >> 8);
    final int limit = scriptStarts[index + 1];
    // Advance by the number of lead bytes the range spans and
    // take over the second byte of the range's original limit.
    int leadSpan = (limit & 0xff00) - (start & 0xff00);
    return ((lowStart & 0xff00) + leadSpan) | (limit & 0xff);
}
/**
 * Assigns the range at scriptStarts[index] a new lead byte counted down from highLimit
 * and returns the highLimit value that precedes the moved range.
 *
 * @param table     receives the new lead byte (updated highLimit >> 8) at table[index]
 * @param index     index into scriptStarts; the range is [scriptStarts[index], scriptStarts[index + 1])
 * @param highLimit 16-bit position (lead byte in bits 15..8, second byte in bits 7..0)
 *                  below which the range is to be placed
 * @return the updated highLimit
 */
private int addHighScriptRange(short[] table, int index, int highLimit) {
    final int limit = scriptStarts[index + 1];
    // If the range's original limit has a larger second byte than highLimit,
    // step down one lead byte first.
    boolean stepDown = (highLimit & 0xff) < (limit & 0xff);
    if(stepDown) {
        highLimit -= 0x100;
    }
    final int start = scriptStarts[index];
    // Go down by the number of lead bytes the range spans and
    // take over the second byte of the range's original start.
    int leadSpan = (limit & 0xff00) - (start & 0xff00);
    int newLimit = ((highLimit & 0xff00) - leadSpan) | (start & 0xff);
    table[index] = (short)(newLimit >> 8);
    return newLimit;
}
private static String scriptCodeString(int script) {
@ -423,21 +527,25 @@ public final class CollationData {
* Data for scripts and reordering groups.
* Uses include building a reordering permutation table and
* providing script boundaries to AlphabeticIndex.
*
* This data is a sorted list of primary-weight lead byte ranges (reordering groups),
* each with a list of pairs sorted in base collation order;
* each pair contains a script/reorder code and the lowest primary weight for that script.
*
* Data structure:
* - Each reordering group is encoded in n+2 16-bit integers.
* - First integer:
* Bits 15..8: First byte of the reordering group's range.
* Bits 7..0: Last byte of the reordering group's range.
* - Second integer:
* Length n of the list of script/reordering codes.
* - Each further integer is a script or reordering code.
*/
char[] scripts;
int numScripts;
/**
* The length of scriptsIndex is numScripts+16.
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
* 16 special reorder codes (not all used) are mapped starting at numScripts.
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
* There are special codes at the end for reorder-reserved primary ranges.
*
* <p>Multiple scripts may share a range and index, for example Hira &amp; Kana.
*/
char[] scriptsIndex;
/**
* Start primary weight (top 16 bits only) for a group/script/reserved range
* indexed by scriptsIndex.
* The first range (separators &amp; terminators) and the last range (trailing weights)
* are not reorderable, and no scriptsIndex entry points to them.
*/
char[] scriptStarts;
/**
* Collation elements in the root collator.

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
@ -310,7 +310,9 @@ final class CollationDataBuilder { // not final in C++
if(base != null) {
data.numericPrimary = base.numericPrimary;
data.compressibleBytes = base.compressibleBytes;
data.scripts = base.scripts;
data.numScripts = base.numScripts;
data.scriptsIndex = base.scriptsIndex;
data.scriptStarts = base.scriptStarts;
}
buildFastLatinTable(data);
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationDataReader.java, ported from collationdatareader.h/.cpp
@ -13,6 +13,7 @@ package com.ibm.icu.impl.coll;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.Arrays;
import com.ibm.icu.impl.ICUBinary;
@ -143,6 +144,7 @@ final class CollationDataReader /* all static */ {
CollationData baseData = base == null ? null : base.data;
int[] reorderCodes;
int reorderCodesLength;
index = IX_REORDER_CODES_OFFSET;
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
@ -152,13 +154,27 @@ final class CollationDataReader /* all static */ {
// the base data does not have a reordering.
throw new ICUException("Collation base data must not reorder scripts");
}
reorderCodes = new int[length / 4];
for(int i = 0; i < length / 4; ++i) {
reorderCodesLength = length / 4;
reorderCodes = new int[reorderCodesLength];
for(int i = 0; i < reorderCodesLength; ++i) {
reorderCodes[i] = inBytes.getInt();
}
length &= 3;
// The reorderRanges (if any) are the trailing reorderCodes entries.
// Split the array at the boundary.
// Script or reorder codes do not exceed 16-bit values.
// Range limits are stored in the upper 16 bits, and are never 0.
int reorderRangesLength = 0;
while(reorderRangesLength < reorderCodesLength &&
(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
++reorderRangesLength;
}
assert(reorderRangesLength < reorderCodesLength);
reorderCodesLength -= reorderRangesLength;
} else {
reorderCodes = new int[0];
reorderCodesLength = 0;
}
ICUBinary.skipBytes(inBytes, length);
@ -170,7 +186,7 @@ final class CollationDataReader /* all static */ {
offset = inIndexes[index];
length = inIndexes[index + 1] - offset;
if(length >= 256) {
if(reorderCodes.length == 0) {
if(reorderCodesLength == 0) {
throw new ICUException("Reordering table without reordering codes");
}
reorderTable = new byte[256];
@ -410,15 +426,28 @@ final class CollationDataReader /* all static */ {
if(data == null) {
throw new ICUException("Script order data but no mappings");
}
data.scripts = new char[length / 2];
for(int i = 0; i < length / 2; ++i) {
data.scripts[i] = inBytes.getChar();
int scriptsLength = length / 2;
CharBuffer inChars = inBytes.asCharBuffer();
data.numScripts = inChars.get();
// There must be enough entries for both arrays, including more than two range starts.
int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
if(scriptStartsLength <= 2) {
throw new ICUException("Script order data too short");
}
inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
inChars.get(data.scriptStarts = new char[scriptStartsLength]);
if(!(data.scriptStarts[0] == 0 &&
data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
data.scriptStarts[scriptStartsLength - 1] ==
(Collation.TRAIL_WEIGHT_BYTE << 8))) {
throw new ICUException("Script order data not valid");
}
length &= 1;
} else if(data == null) {
// Nothing to do.
} else if(baseData != null) {
data.scripts = baseData.scripts;
data.numScripts = baseData.numScripts;
data.scriptsIndex = baseData.scriptsIndex;
data.scriptStarts = baseData.scriptStarts;
}
ICUBinary.skipBytes(inBytes, length);
@ -470,12 +499,8 @@ final class CollationDataReader /* all static */ {
throw new ICUException("The maxVariable could not be mapped to a variableTop");
}
if(reorderCodes.length == 0 || reorderTable != null) {
settings.setReordering(reorderCodes, reorderTable);
} else {
byte[] table = new byte[256];
baseData.makeReorderTable(reorderCodes, table);
settings.setReordering(reorderCodes, table);
if(reorderCodesLength != 0) {
settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
}
settings.fastLatinOptions = CollationFastLatin.getOptions(
@ -486,7 +511,7 @@ final class CollationDataReader /* all static */ {
private static final class IsAcceptable implements ICUBinary.Authenticate {
// @Override when we switch to Java 6
public boolean isDataVersionAcceptable(byte version[]) {
return version[0] == 4;
return version[0] == 5;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationFastLatin.java, ported from collationfastlatin.h/.cpp
@ -23,7 +23,7 @@ public final class CollationFastLatin /* all static */ {
* When the major version number of the main data format changes,
* we can reset this fast Latin version to 1.
*/
public static final int VERSION = 1;
public static final int VERSION = 2;
public static final int LATIN_MAX = 0x17f;
public static final int LATIN_LIMIT = LATIN_MAX + 1;
@ -211,33 +211,50 @@ public final class CollationFastLatin /* all static */ {
// lowest long mini primary.
miniVarTop = MIN_LONG - 1;
} else {
int v1 = (int)(settings.variableTop >> 24);
int headerLength = header[0] & 0xff;
int i = headerLength - 1;
if(i <= 0 || v1 > (header[i] & 0x7f)) {
int i = 1 + settings.getMaxVariable();
if(i >= headerLength) {
return -1; // variableTop >= digits, should not occur
}
while(i > 1 && v1 <= (header[i - 1] & 0x7f)) { --i; }
// In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
// Shift right to make it comparable with long mini primaries in bits 15..3.
miniVarTop = (header[i] & 0xff80) >> 4;
miniVarTop = header[i];
}
byte[] reorderTable = settings.reorderTable;
if(reorderTable != null) {
char[] scripts = data.scripts;
int length = data.scripts.length;
int prevLastByte = 0;
for(int i = 0; i < length;) {
// reordered last byte of the group
int lastByte = reorderTable[scripts[i] & 0xff] & 0xff;
if(lastByte < prevLastByte) {
// The permutation affects the groups up to Latin.
return -1;
boolean digitsAreReordered = false;
if(settings.hasReordering()) {
long prevStart = 0;
long beforeDigitStart = 0;
long digitStart = 0;
long afterDigitStart = 0;
for(int group = Collator.ReorderCodes.FIRST;
group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES;
++group) {
long start = data.getFirstPrimaryForGroup(group);
start = settings.reorder(start);
if(group == Collator.ReorderCodes.DIGIT) {
beforeDigitStart = prevStart;
digitStart = start;
} else if(start != 0) {
if(start < prevStart) {
// The permutation affects the groups up to Latin.
return -1;
}
// In the future, there might be a special group between digits & Latin.
if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
afterDigitStart = start;
}
prevStart = start;
}
if(scripts[i + 2] == UScript.LATIN) { break; }
i = i + 2 + scripts[i + 1];
prevLastByte = lastByte;
}
long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN);
latinStart = settings.reorder(latinStart);
if(latinStart < prevStart) {
return -1;
}
if(afterDigitStart == 0) {
afterDigitStart = latinStart;
}
if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
digitsAreReordered = true;
}
}
@ -253,7 +270,7 @@ public final class CollationFastLatin /* all static */ {
}
primaries[c] = (char)p;
}
if((settings.options & CollationSettings.NUMERIC) != 0) {
if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) {
// Bail out for digits.
for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
@ -127,38 +127,26 @@ final class CollationFastLatinBuilder {
}
private boolean loadGroups(CollationData data) {
result.append(0); // reserved for version & headerLength
headerLength = 1 + NUM_SPECIAL_GROUPS;
int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
result.append((char)r0);
// The first few reordering groups should be special groups
// (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
for(int i = 0;;) {
if(i >= data.scripts.length) {
throw new AssertionError("no Latn script");
for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i);
if(lastSpecialPrimaries[i] == 0) {
// missing data
return false;
}
int head = data.scripts[i];
int lastByte = head & 0xff; // last primary byte in the group
int group = data.scripts[i + 2];
if(group == Collator.ReorderCodes.DIGIT) {
firstDigitPrimary = (long)(head & 0xff00) << 16;
headerLength = result.length();
int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
result.setCharAt(0, (char)r0);
} else if(group == UScript.LATIN) {
if(firstDigitPrimary == 0) {
throw new AssertionError("no digit group");
}
firstLatinPrimary = (long)(head & 0xff00) << 16;
lastLatinPrimary = ((long)lastByte << 24) | 0xffffff;
break;
} else if(firstDigitPrimary == 0) {
// a group below digits
if(lastByte > 0x7f) {
// We only use 7 bits for the last byte of a below-digits group.
// This does not warrant an errorCode, but we do not build a fast Latin table.
return false;
}
result.append((char)lastByte);
}
i = i + 2 + data.scripts[i + 1];
result.append(0); // reserve a slot for this group
}
firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);
if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
// missing data
return false;
}
return true;
}
@ -173,23 +161,21 @@ final class CollationFastLatinBuilder {
}
// Both or neither must be potentially-variable,
// so that we can test only one and determine if both are variable.
if(p >= firstDigitPrimary) {
return q >= firstDigitPrimary;
} else if(q >= firstDigitPrimary) {
long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
if(p > lastVariablePrimary) {
return q > lastVariablePrimary;
} else if(q > lastVariablePrimary) {
return false;
}
// Both will be encoded with long mini primaries.
// They must be in the same special reordering group,
// so that we can test only one and determine if both are variable.
p >>= 24; // first primary byte
q >>= 24;
assert(p != 0 && q != 0);
assert(p <= result.charAt(headerLength - 1)); // the loop will terminate
for(int i = 1;; ++i) {
long lastByte = result.charAt(i);
if(p <= lastByte) {
return q <= lastByte;
} else if(q <= lastByte) {
for(int i = 0;; ++i) { // will terminate
long lastPrimary = lastSpecialPrimaries[i];
if(p <= lastPrimary) {
return q <= lastPrimary;
} else if(q <= lastPrimary) {
return false;
}
}
@ -416,8 +402,8 @@ final class CollationFastLatinBuilder {
private void encodeUniqueCEs() {
miniCEs = new char[uniqueCEs.size()];
int group = 1;
long lastGroupByte = result.charAt(group);
int group = 0;
long lastGroupPrimary = lastSpecialPrimaries[group];
// The lowest unique CE must be at least a secondary CE.
assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
long prevPrimary = 0;
@ -431,16 +417,15 @@ final class CollationFastLatinBuilder {
// (uniqueCEs does not store case bits.)
long p = ce >>> 32;
if(p != prevPrimary) {
int p1 = (int)(p >> 24);
while(p1 > lastGroupByte) {
while(p > lastGroupPrimary) {
assert(pri <= CollationFastLatin.MAX_LONG);
// Add the last "long primary" in or before the group
// into the upper 9 bits of the group entry.
result.setCharAt(group, (char)((pri << 4) | lastGroupByte));
if(++group < headerLength) { // group is 1-based
lastGroupByte = result.charAt(group);
// Set the group's header entry to the
// last "long primary" in or before the group.
result.setCharAt(1 + group, (char)pri);
if(++group < NUM_SPECIAL_GROUPS) {
lastGroupPrimary = lastSpecialPrimaries[group];
} else {
lastGroupByte = 0xff;
lastGroupPrimary = 0xffffffffL;
break;
}
}
@ -686,6 +671,10 @@ final class CollationFastLatinBuilder {
return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE;
}
// space, punct, symbol, currency (not digit)
private static final int NUM_SPECIAL_GROUPS =
Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1;
private static final long CONTRACTION_FLAG = 0x80000000L;
// temporary "buffer"
@ -699,7 +688,8 @@ final class CollationFastLatinBuilder {
/** One 16-bit mini CE per unique CE. */
private char[] miniCEs;
// These are constant for a given list of CollationData.scripts.
// These are constant for a given root collator.
long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS];
private long firstDigitPrimary;
private long firstLatinPrimary;
private long lastLatinPrimary;

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationKeys.java, ported from collationkeys.h/.cpp
@ -348,7 +348,6 @@ public final class CollationKeys /* all methods are static */ {
// +1 so that we can use "<" and primary ignorables test out early.
variableTop = settings.variableTop + 1;
}
byte[] reorderTable = settings.reorderTable;
int tertiaryMask = CollationSettings.getTertiaryMask(options);
@ -358,7 +357,7 @@ public final class CollationKeys /* all methods are static */ {
SortKeyLevel tertiaries = getSortKeyLevel(levels, Collation.TERTIARY_LEVEL_FLAG);
SortKeyLevel quaternaries = getSortKeyLevel(levels, Collation.QUATERNARY_LEVEL_FLAG);
int compressedP1 = 0; // 0==no compression; otherwise reordered compressible lead byte
long prevReorderedPrimary = 0; // 0==no compression
int commonCases = 0;
int commonSecondaries = 0;
int commonTertiaries = 0;
@ -387,16 +386,15 @@ public final class CollationKeys /* all methods are static */ {
}
do {
if ((levels & Collation.QUATERNARY_LEVEL_FLAG) != 0) {
int p1 = (int) p >>> 24;
if (reorderTable != null) {
p1 = reorderTable[p1] & 0xff;
if (settings.hasReordering()) {
p = settings.reorder(p);
}
if (p1 >= QUAT_SHIFTED_LIMIT_BYTE) {
if (((int) p >>> 24) >= QUAT_SHIFTED_LIMIT_BYTE) {
// Prevent shifted primary lead bytes from
// overlapping with the common compression range.
quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE);
}
quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff));
quaternaries.appendWeight32(p);
}
do {
ce = iter.nextCE();
@ -409,13 +407,15 @@ public final class CollationKeys /* all methods are static */ {
// If ce==NO_CE, then write nothing for the primary level but
// terminate compression on all levels and then exit the loop.
if (p > Collation.NO_CE_PRIMARY && (levels & Collation.PRIMARY_LEVEL_FLAG) != 0) {
int p1 = (int) p >>> 24;
if (reorderTable != null) {
p1 = reorderTable[p1] & 0xff;
// Test the un-reordered primary for compressibility.
boolean isCompressible = compressibleBytes[(int) p >>> 24];
if(settings.hasReordering()) {
p = settings.reorder(p);
}
if (p1 != compressedP1) {
if (compressedP1 != 0) {
if (p1 < compressedP1) {
int p1 = (int) p >>> 24;
if (!isCompressible || p1 != ((int) prevReorderedPrimary >>> 24)) {
if (prevReorderedPrimary != 0) {
if (p < prevReorderedPrimary) {
// No primary compression terminator
// at the end of the level or merged segment.
if (p1 > Collation.MERGE_SEPARATOR_BYTE) {
@ -426,12 +426,10 @@ public final class CollationKeys /* all methods are static */ {
}
}
sink.Append(p1);
// Test the un-reordered lead byte for compressibility but
// remember the reordered lead byte.
if (compressibleBytes[(int) p >>> 24]) {
compressedP1 = p1;
if(isCompressible) {
prevReorderedPrimary = p;
} else {
compressedP1 = 0;
prevReorderedPrimary = 0;
}
}
byte p2 = (byte) (p >>> 16);

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationRuleParser.java, ported from collationruleparser.h/.cpp
@ -718,17 +718,14 @@ public final class CollationRuleParser {
reorderCodes.add(code);
i = limit;
}
int length = reorderCodes.size();
if(length == 1 && reorderCodes.get(0) == Collator.ReorderCodes.NONE) {
if(reorderCodes.isEmpty()) {
settings.resetReordering();
return;
} else {
int[] codes = new int[reorderCodes.size()];
int j = 0;
for(Integer code : reorderCodes) { codes[j++] = code; }
settings.setReordering(baseData, codes);
}
int[] codes = new int[reorderCodes.size()];
int j = 0;
for(Integer code : reorderCodes) { codes[j++] = code; }
byte[] table = new byte[256];
baseData.makeReorderTable(codes, table);
settings.setReordering(codes, table);
}
private static final String[] gSpecialReorderCodes = {

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationSettings.java, ported from collationsettings.h/.cpp
@ -93,7 +93,7 @@ public final class CollationSettings extends SharedObject {
@Override
public CollationSettings clone() {
CollationSettings newSettings = (CollationSettings)super.clone();
// Note: The reorderTable and reorderCodes need not be cloned
// Note: The reorderTable, reorderRanges, and reorderCodes need not be cloned
// because, in Java, they only get replaced but not modified.
newSettings.fastLatinPrimaries = fastLatinPrimaries.clone();
return newSettings;
@ -125,16 +125,180 @@ public final class CollationSettings extends SharedObject {
// When we turn off reordering, we want to set a null permutation
// rather than a no-op permutation.
reorderTable = null;
minHighNoReorder = 0;
reorderRanges = null;
reorderCodes = EMPTY_INT_ARRAY;
}
// No aliasReordering() in Java. Use setReordering(). See comments near reorderCodes.
public void setReordering(int[] codes, byte[] table) {
/**
* Installs reordering data that was supplied together in one array, reusing it when it is consistent.
*
* <p>The first codesLength entries of codesAndRanges are the reorder codes;
* any remaining entries are (limit, offset) range pairs.
* The supplied table and ranges are adopted only if the table is non-null and either
* there are no ranges and the table has no split bytes, or
* there are at least two ranges, the first with offset 0 and the last with a non-zero offset.
* Otherwise the reordering data is regenerated via setReordering(data, codes).
*
* @param data passed through to setReordering(data, codes) when the data must be regenerated
* @param codesAndRanges reorder codes, optionally followed by range pairs
* @param codesLength number of leading entries in codesAndRanges that are reorder codes
* @param table lead byte permutation table, or null if it must be regenerated
*/
void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) {
int[] codes;
if(codesLength == codesAndRanges.length) {
// No ranges appended: the array is used as the codes array without copying.
codes = codesAndRanges;
} else {
// TODO: Java 6: Arrays.copyOf(codesAndRanges, codesLength);
codes = new int[codesLength];
System.arraycopy(codesAndRanges, 0, codes, 0, codesLength);
}
int rangesStart = codesLength;
int rangesLimit = codesAndRanges.length;
int rangesLength = rangesLimit - rangesStart;
if(table != null &&
(rangesLength == 0 ?
!reorderTableHasSplitBytes(table) :
rangesLength >= 2 &&
// The first offset must be 0. The last offset must not be 0.
(codesAndRanges[rangesStart] & 0xffff) == 0 &&
(codesAndRanges[rangesLimit - 1] & 0xffff) != 0)) {
reorderTable = table;
reorderCodes = codes;
// Drop ranges before the first split byte. They are reordered by the table.
// This then speeds up reordering of the remaining ranges.
int firstSplitByteRangeIndex = rangesStart;
while(firstSplitByteRangeIndex < rangesLimit &&
(codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
// The second byte of the primary limit is 0.
++firstSplitByteRangeIndex;
}
if(firstSplitByteRangeIndex == rangesLimit) {
// No range limit falls inside a lead byte: the table alone suffices.
assert(!reorderTableHasSplitBytes(table));
minHighNoReorder = 0;
reorderRanges = null;
} else {
// The lead byte of the first split range limit must be marked with 0 in the table.
assert(table[codesAndRanges[firstSplitByteRangeIndex] >>> 24] == 0);
// Upper 16 bits of the last range's limit, as an unsigned value in a long.
minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
setReorderRanges(codesAndRanges, firstSplitByteRangeIndex,
rangesLimit - firstSplitByteRangeIndex);
}
return;
}
// Regenerate missing data.
setReordering(data, codes);
}
/**
* Computes and installs the reordering data for the given reorder codes.
*
* <p>An empty codes array, a sole {@code Collator.ReorderCodes.NONE} code,
* or an empty ranges list from data.makeReorderRanges() resets the reordering.
* Otherwise this builds the 256-entry lead byte permutation table from the
* (limit, offset) pairs and stores the codes, the table, and those ranges
* that start at the first split lead byte.
*
* <p>NOTE(review): minHighNoReorder is assigned before the split-byte scan and is not
* reset to 0 when no split byte is found, whereas aliasReordering() stores 0 in that case.
* Confirm whether the difference is intended.
*
* @param data provides makeReorderRanges(codes, rangesList)
* @param codes the reorder codes; stored by reference, not copied
*/
public void setReordering(CollationData data, int[] codes) {
if(codes.length == 0 || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE)) {
resetReordering();
return;
}
UVector32 rangesList = new UVector32();
data.makeReorderRanges(codes, rangesList);
int rangesLength = rangesList.size();
if(rangesLength == 0) {
resetReordering();
return;
}
int[] ranges = rangesList.getBuffer();
// ranges[] contains at least two (limit, offset) pairs.
// The first offset must be 0. The last offset must not be 0.
// Separators (at the low end) and trailing weights (at the high end)
// are never reordered.
assert(rangesLength >= 2);
assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
// Upper 16 bits of the last range's limit, as an unsigned value in a long.
minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;
// Write the lead byte permutation table.
// Set a 0 for each lead byte that has a range boundary in the middle.
byte[] table = new byte[256];
int b = 0;
int firstSplitByteRangeIndex = -1;
for(int i = 0; i < rangesLength; ++i) {
int pair = ranges[i];
int limit1 = pair >>> 24;
while(b < limit1) {
// The byte cast keeps only the low 8 bits of (lead byte + offset).
table[b] = (byte)(b + pair);
++b;
}
// Check the second byte of the limit.
if((pair & 0xff0000) != 0) {
// The limit falls inside lead byte limit1: mark it as split and skip past it.
table[limit1] = 0;
b = limit1 + 1;
if(firstSplitByteRangeIndex < 0) {
firstSplitByteRangeIndex = i;
}
}
}
// Lead bytes at and above the last limit map to themselves.
while(b <= 0xff) {
table[b] = (byte)b;
++b;
}
int rangesStart;
if(firstSplitByteRangeIndex < 0) {
// The lead byte permutation table alone suffices for reordering.
rangesStart = rangesLength = 0;
} else {
// Remove the ranges below the first split byte.
rangesStart = firstSplitByteRangeIndex;
rangesLength -= firstSplitByteRangeIndex;
}
setReorderArrays(codes, ranges, rangesStart, rangesLength, table);
}
/**
 * Stores the reorder codes, the lead byte table, and the selected ranges.
 * Very different from C++. See the comments after the reorderCodes declaration.
 *
 * @param codes the reorder codes; null is treated like an empty array
 * @param ranges source array of (limit, offset) pairs
 * @param rangesStart index of the first pair to keep
 * @param rangesLength number of pairs to keep; 0 means no ranges are stored
 * @param table the lead byte permutation table; must be null exactly when there are no codes
 */
private void setReorderArrays(int[] codes,
        int[] ranges, int rangesStart, int rangesLength, byte[] table) {
    int[] effectiveCodes = (codes != null) ? codes : EMPTY_INT_ARRAY;
    assert (effectiveCodes.length == 0) == (table == null);
    reorderTable = table;
    reorderCodes = effectiveCodes;
    setReorderRanges(ranges, rangesStart, rangesLength);
}
/**
 * Copies rangesLength entries of ranges, starting at rangesStart, into a new
 * reorderRanges array, widening each int to an unsigned 32-bit value in a long.
 * Stores null instead of an empty array when rangesLength is 0.
 */
private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) {
    if(rangesLength == 0) {
        reorderRanges = null;
        return;
    }
    reorderRanges = new long[rangesLength];
    for(int dest = 0, src = rangesStart; dest < rangesLength; ++dest, ++src) {
        // Mask so that a negative int does not sign-extend into the upper 32 bits.
        reorderRanges[dest] = ranges[src] & 0xffffffffL;
    }
}
/**
 * Makes this object's reordering state the same as that of the other settings.
 * The arrays are shared by reference, not copied.
 * If the other settings have no reordering, then this object's reordering is reset.
 *
 * @param other the settings to take the reordering state from
 */
public void copyReorderingFrom(CollationSettings other) {
    if(other.hasReordering()) {
        minHighNoReorder = other.minHighNoReorder;
        reorderTable = other.reorderTable;
        reorderRanges = other.reorderRanges;
        reorderCodes = other.reorderCodes;
    } else {
        resetReordering();
    }
}
/**
 * @return true if a lead byte permutation table is set (reorderTable is not null)
 */
public boolean hasReordering() {
    return reorderTable != null;
}
/**
 * Tests whether any entry of the table at indexes 1..255 is 0.
 *
 * @param table lead byte permutation table; entry 0 is expected to be 0
 * @return true if some entry at a non-zero index is 0
 */
private static boolean reorderTableHasSplitBytes(byte[] table) {
    assert(table[0] == 0);
    int leadByte = 1;
    // Stop at the first 0 entry, or run off the end of the 256 lead bytes.
    while(leadByte < 256 && table[leadByte] != 0) {
        ++leadByte;
    }
    return leadByte < 256;
}
/**
 * Reorders a primary weight.
 * Looks up the lead byte in reorderTable; a 0 entry for a primary above
 * Collation.NO_CE_PRIMARY is delegated to reorderEx(p).
 * Otherwise the lead byte is replaced by its table entry and the lower 24 bits are kept.
 *
 * @param p the primary weight
 * @return the reordered primary weight
 */
public long reorder(long p) {
    byte b = reorderTable[(int)p >>> 24];
    if(b == 0 && p > Collation.NO_CE_PRIMARY) {
        return reorderEx(p);
    }
    return ((b & 0xffL) << 24) | (p & 0xffffff);
}
/**
 * Reorders a primary weight via the reorderRanges list.
 * Primaries at or above minHighNoReorder are returned unchanged.
 *
 * @param p the primary weight
 * @return p with the offset of the first range whose entry exceeds it added to the lead byte
 */
private long reorderEx(long p) {
    assert minHighNoReorder > 0;
    if(p >= minHighNoReorder) { return p; }
    // Fill the lower 16 bits of p so that they are >= any offset bits;
    // the result can then be compared directly with (limit, offset) pairs.
    long q = p | 0xffff;
    int i = 0;
    long r = reorderRanges[0];
    while(q >= r) {
        r = reorderRanges[++i];
    }
    // The lower 16 bits of the pair are the signed lead byte offset.
    return p + ((long)(short)r << 24);
}
// In C++, we use enums for attributes and their values, with a special value for the default.
@ -276,11 +440,39 @@ public final class CollationSettings extends SharedObject {
(MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
/** Variable-top primary weight. */
public long variableTop;
/** 256-byte table for reordering permutation of primary lead bytes; null if no reordering. */
/**
* 256-byte table for reordering permutation of primary lead bytes; null if no reordering.
* A 0 entry at a non-zero index means that the primary lead byte is "split"
* (there are different offsets for primaries that share that lead byte)
* and the reordering offset must be determined via the reorderRanges.
*/
public byte[] reorderTable;
/** Limit of last reordered range. 0 if no reordering; aliasReordering() also stores 0 when there are no split bytes. */
long minHighNoReorder;
/**
* Primary-weight ranges for script reordering,
* to be used by reorder(p) for split-reordered primary lead bytes.
*
* <p>Each entry is a (limit, offset) pair.
* The upper 16 bits of the entry are the upper 16 bits of the
* exclusive primary limit of a range.
* Primaries between the previous limit and this one have their lead bytes
* modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
*
* <p>CollationData.makeReorderRanges() writes a full list where the first range
* (at least for terminators and separators) has a 0 offset.
* The last range has a non-zero offset.
* minHighNoReorder is set to the limit of that last range.
*
* <p>In the settings object, the initial ranges before the first split lead byte
* are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
* If there are no split-reordered lead bytes, then no ranges are needed.
*/
long[] reorderRanges;
/** Array of reorder codes; ignored if length == 0. */
public int[] reorderCodes = EMPTY_INT_ARRAY;
// Note: In C++, we keep a memory block around for the reorder codes and the permutation table,
// Note: In C++, we keep a memory block around for the reorder codes,
// the ranges, and the permutation table,
// and modify them for new codes.
// In Java, we simply copy references and then never modify the array contents.
// The caller must abandon the arrays.

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationTailoring.java, ported from collationtailoring.h/.cpp
@ -33,6 +33,7 @@ public final class CollationTailoring {
if(baseSettings != null) {
assert(baseSettings.readOnly().reorderCodes.length == 0);
assert(baseSettings.readOnly().reorderTable == null);
assert(baseSettings.readOnly().minHighNoReorder == 0);
settings = baseSettings.clone();
} else {
settings = new SharedObject.Reference<CollationSettings>(new CollationSettings());

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -399,27 +399,35 @@ public abstract class Collator implements Comparator<Object>, Freezable<Collator
/**
* Sets the reordering codes for this collator.
* <p>Collation reordering allows scripts and some other defined blocks of characters
* to be moved relative to each other as a block. This reordering is done on top of
* Collation reordering allows scripts and some other groups of characters
* to be moved relative to each other. This reordering is done on top of
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
* at the start and/or the end of the collation order. These groups are specified using
* UScript codes and UColReorderCode entries.
* UScript codes and {@link Collator.ReorderCodes} entries.
*
* <p>By default, reordering codes specified for the start of the order are placed in the
* order given after a group of "special" non-script blocks. These special groups of characters
* order given after several special non-script blocks. These special groups of characters
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
* UColReorderCode entries. Script groups can be intermingled with
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
* <p>The special code OTHERS stands for any script that is not explicitly
* {@link Collator.ReorderCodes} entries. Script groups can be intermingled with
* these special non-script groups if those special groups are explicitly specified in the reordering.
*
* <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS}
* stands for any script that is not explicitly
* mentioned in the list of reordering codes given. Anything that is after OTHERS
* will go at the very end of the reordering in the order given.
* <p>The special reorder code DEFAULT will reset the reordering for this collator
*
* <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT}
* will reset the reordering for this collator
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
* was specified when this collator was created from resource data or from rules. The
* DEFAULT code <b>must</b> be the sole code supplied when it used. If not
* that will result in an U_ILLEGAL_ARGUMENT_ERROR being set.
* <p>The special reorder code NONE will remove any reordering for this collator.
* DEFAULT code <b>must</b> be the sole code supplied when it is used.
* If not, then an {@link IllegalArgumentException} will be thrown.
*
* <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE}
* will remove any reordering for this collator.
* The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
* NONE code <b>must</b> be the sole code supplied when it used.
* NONE code <b>must</b> be the sole code supplied when it is used.
*
* @param order the reordering codes to apply to this collator; if this is null or an empty array
* then this clears any existing reordering
* @see #getReorderCodes
@ -1401,7 +1409,9 @@ public abstract class Collator implements Comparator<Object>, Freezable<Collator
/**
* Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
* codes are grouped and must reorder together.
*
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
* for example Hiragana and Katakana.
*
* @param reorderCode The reorder code to determine equivalence for.
* @return the set of all reorder codes in the same group as the given reorder code.
* @see #setReorderCodes

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -34,6 +34,7 @@ import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
import com.ibm.icu.impl.coll.SharedObject;
import com.ibm.icu.impl.coll.TailoredSet;
import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
@ -909,35 +910,18 @@ public final class RuleBasedCollator extends Collator {
setFastLatinOptions(ownedSettings);
}
/**
* Sets the reordering codes for this collator.
* Collation reordering allows scripts and some other defined blocks of characters
* to be moved relative to each other as a block. This reordering is done on top of
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
* at the start and/or the end of the collation order.
* <p>By default, reordering codes specified for the start of the order are placed in the
* order given after a group of special non-script blocks. These special groups of characters
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
* {@link Collator.ReorderCodes}. Script groups can be intermingled with
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
* <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly
* mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS}
* will go at the very end of the reordering in the order given.
* <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
* was specified when this collator was created from resource data or from rules. The
* {@link Collator.ReorderCodes#DEFAULT DEFAULT} code <b>must</b> be the sole code supplied when it used. If not
* that will result in an {@link IllegalArgumentException} being thrown.
* <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator.
* The result of setting no reordering will be to have the DUCET/CLDR reordering used. The
* {@link Collator.ReorderCodes#NONE NONE} code <b>must</b> be the sole code supplied when it used.
/**
* {@inheritDoc}
*
* @param order the reordering codes to apply to this collator; if this is null or an empty array
* then this clears any existing reordering
* @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts)
* @see #getReorderCodes
* @see Collator#getEquivalentReorderCodes
* @see Collator.ReorderCodes
* @see UScript
* @stable ICU 4.8
*/
*/
@Override
public void setReorderCodes(int... order) {
checkNotFrozen();
@ -954,8 +938,7 @@ public final class RuleBasedCollator extends Collator {
if(length == 1 && order[0] == Collator.ReorderCodes.DEFAULT) {
if(settings.readOnly() != defaultSettings) {
CollationSettings ownedSettings = getOwnedSettings();
ownedSettings.setReordering(defaultSettings.reorderCodes,
defaultSettings.reorderTable);
ownedSettings.copyReorderingFrom(defaultSettings);
setFastLatinOptions(ownedSettings);
}
return;
@ -964,9 +947,7 @@ public final class RuleBasedCollator extends Collator {
if(length == 0) {
ownedSettings.resetReordering();
} else {
byte[] reorderTable = new byte[256];
data.makeReorderTable(order, reorderTable);
ownedSettings.setReordering(order.clone(), reorderTable);
ownedSettings.setReordering(data, order.clone());
}
setFastLatinOptions(ownedSettings);
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d7bf72e445a207052fe2e2de0d70a989b69bc55da3df272f8e3096d6d9cb2ad0
size 11801973
oid sha256:49983175d1f04593f311dab35e6db8ad4b802d8c5de99a03d0e7333bd6ffcfc0
size 11802910

View file

@ -1,4 +1,4 @@
# Copyright (c) 2012-2014 International Business Machines
# Copyright (c) 2012-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# This file should be in UTF-8 with a signature byte sequence ("BOM").
@ -2526,3 +2526,15 @@
<3 あ
<3 ァ
<1 い
** test: reorder single scripts not groups, ICU ticket 11449
@ root
% reorder Goth Latn
* compare
<1 4
<1 𐌰 # Gothic
<1 L
<1 Ω
# Before ICU 55, the following reordered together with Gothic.
<1 𐌈 # Old Italic
<1 𐑐 # Shavian

View file

@ -1,7 +1,6 @@
/*
/*
*******************************************************************************
* Copyright (C) 2002-2014, International Business Machines Corporation and
* Copyright (C) 2002-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -3153,7 +3152,7 @@ public class CollationMiscTest extends TestFmwk {
{
Collator myCollation;
int[] reorderCodes = {UScript.GREEK, UScript.HAN, ReorderCodes.PUNCTUATION};
int[] duplicateReorderCodes = {UScript.CUNEIFORM, UScript.GREEK, ReorderCodes.CURRENCY, UScript.EGYPTIAN_HIEROGLYPHS};
int[] duplicateReorderCodes = {UScript.HIRAGANA, UScript.GREEK, ReorderCodes.CURRENCY, UScript.KATAKANA};
int[] reorderCodesStartingWithDefault = {ReorderCodes.DEFAULT, UScript.GREEK, UScript.HAN, ReorderCodes.PUNCTUATION};
int[] retrievedReorderCodes;
String greekString = "\u03b1";
@ -3283,47 +3282,7 @@ public class CollationMiscTest extends TestFmwk {
errln("ERROR: retrieved reorder codes do not match set reorder codes.");
}
}
public void TestSameLeadBytScriptReorder(){
String[] testSourceCases = {
"\ud800\udf31", // Gothic
"\ud801\udc50", // Shavian
};
String[] testTargetCases = {
"\u0100", // Latin Extended-A
"\u2c74", // Latin Extended-C
};
int[] results = {
-1,
-1,
};
Collator myCollation;
String rules = "[reorder Goth Latn]";
try {
myCollation = new RuleBasedCollator(rules);
} catch (Exception e) {
warnln("ERROR: in creation of rule based collator");
return;
}
myCollation.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
myCollation.setStrength(Collator.TERTIARY);
for (int i = 0; i < testSourceCases.length ; i++)
{
CollationTest.doTest(this, (RuleBasedCollator)myCollation,
testSourceCases[i], testTargetCases[i],
results[i]);
}
// ensure that the non-reordered and reordered collation is the same
Collator nonReorderdCollator = RuleBasedCollator.getInstance();
int nonReorderedResults = nonReorderdCollator.compare(testSourceCases[0], testSourceCases[1]);
CollationTest.doTest(this, (RuleBasedCollator)myCollation,
testSourceCases[0], testSourceCases[1], nonReorderedResults);
}
static boolean containsExpectedScript(int[] scripts, int expectedScript) {
for (int i = 0; i < scripts.length; ++i) {
if (expectedScript == scripts[i]) { return true; }
@ -3332,66 +3291,87 @@ public class CollationMiscTest extends TestFmwk {
}
public void TestEquivalentReorderingScripts() {
// Beginning with ICU 55, collation reordering moves single scripts
// rather than groups of scripts,
// except where scripts share a range and sort primary-equal.
final int[] expectedScripts = {
UScript.BOPOMOFO, //Bopo
UScript.LISU, //Lisu
UScript.LYCIAN, //Lyci
UScript.CARIAN, //Cari
UScript.LYDIAN, //Lydi
UScript.YI, //Yiii
UScript.OLD_ITALIC, //Ital
UScript.GOTHIC, //Goth
UScript.DESERET, //Dsrt
UScript.SHAVIAN, //Shaw
UScript.OSMANYA, //Osma
UScript.LINEAR_B, //Linb
UScript.CYPRIOT, //Cprt
UScript.OLD_SOUTH_ARABIAN, //Sarb
UScript.AVESTAN, //Avst
UScript.IMPERIAL_ARAMAIC, //Armi
UScript.INSCRIPTIONAL_PARTHIAN, //Prti
UScript.INSCRIPTIONAL_PAHLAVI, //Phli
UScript.UGARITIC, //Ugar
UScript.OLD_PERSIAN, //Xpeo
UScript.CUNEIFORM, //Xsux
UScript.EGYPTIAN_HIEROGLYPHS, //Egyp
UScript.PHONETIC_POLLARD, //Plrd
UScript.SORA_SOMPENG, //Sora
UScript.MEROITIC_CURSIVE, //Merc
UScript.MEROITIC_HIEROGLYPHS //Mero
UScript.HIRAGANA,
UScript.KATAKANA,
UScript.KATAKANA_OR_HIRAGANA
};
int[] equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.GOTHIC);
if (equivalentScripts.length < expectedScripts.length) {
errln(String.format("ERROR/Gothic: retrieved equivalent script length wrong: " +
"expected at least %d, was = %d",
if (equivalentScripts.length != 1 || equivalentScripts[0] != UScript.GOTHIC) {
errln(String.format("ERROR/Gothic: retrieved equivalent scripts wrong: " +
"length expected 1, was = %d; expected [%d] was [%d]",
equivalentScripts.length, UScript.GOTHIC, equivalentScripts[0]));
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.HIRAGANA);
if (equivalentScripts.length != expectedScripts.length) {
errln(String.format("ERROR/Hiragana: retrieved equivalent script length wrong: " +
"expected %d, was = %d",
expectedScripts.length, equivalentScripts.length));
}
int prevScript = -1;
for (int i = 0; i < equivalentScripts.length; ++i) {
int script = equivalentScripts[i];
if (script <= prevScript) {
errln("ERROR/Gothic: equivalent scripts out of order at index " + i);
errln("ERROR/Hiragana: equivalent scripts out of order at index " + i);
}
prevScript = script;
}
for (int code : expectedScripts) {
if (!containsExpectedScript(equivalentScripts, code)) {
errln("ERROR/Gothic: equivalent scripts do not contain " + code);
errln("ERROR/Hiragana: equivalent scripts do not contain " + code);
}
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.SHAVIAN);
if (equivalentScripts.length < expectedScripts.length) {
errln(String.format("ERROR/Shavian: retrieved equivalent script length wrong: " +
"expected at least %d, was = %d",
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.KATAKANA);
if (equivalentScripts.length != expectedScripts.length) {
errln(String.format("ERROR/Katakana: retrieved equivalent script length wrong: " +
"expected %d, was = %d",
expectedScripts.length, equivalentScripts.length));
}
for (int code : expectedScripts) {
if (!containsExpectedScript(equivalentScripts, code)) {
errln("ERROR/Shavian: equivalent scripts do not contain " + code);
errln("ERROR/Katakana: equivalent scripts do not contain " + code);
}
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.KATAKANA_OR_HIRAGANA);
if (equivalentScripts.length != expectedScripts.length) {
errln(String.format("ERROR/Hrkt: retrieved equivalent script length wrong: " +
"expected %d, was = %d",
expectedScripts.length, equivalentScripts.length));
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.HAN);
if (equivalentScripts.length != 3) {
errln("ERROR/Hani: retrieved equivalent script length wrong: " +
"expected 3, was = " + equivalentScripts.length);
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.SIMPLIFIED_HAN);
if (equivalentScripts.length != 3) {
errln("ERROR/Hans: retrieved equivalent script length wrong: " +
"expected 3, was = " + equivalentScripts.length);
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.TRADITIONAL_HAN);
if (equivalentScripts.length != 3) {
errln("ERROR/Hant: retrieved equivalent script length wrong: " +
"expected 3, was = " + equivalentScripts.length);
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.MEROITIC_CURSIVE);
if (equivalentScripts.length != 2) {
errln("ERROR/Merc: retrieved equivalent script length wrong: " +
"expected 2, was = " + equivalentScripts.length);
}
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.MEROITIC_HIEROGLYPHS);
if (equivalentScripts.length != 2) {
errln("ERROR/Mero: retrieved equivalent script length wrong: " +
"expected 2, was = " + equivalentScripts.length);
}
}
public void TestGreekFirstReorderCloning() {