mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-11449: reorder single scripts rather than groups; scripts/groups can start on top-16-bit boundaries; data formatVersion 5 for the new scripts data, with optional reorderRanges appended to reorderCodes
X-SVN-Rev: 36925
This commit is contained in:
parent
a9d7c3e4bd
commit
e65a679a26
16 changed files with 684 additions and 377 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2014, International Business Machines
|
||||
* Copyright (C) 2010-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* Collation.java, ported from collation.h/.cpp
|
||||
|
@ -587,9 +587,5 @@ public final class Collation {
|
|||
return makeCE(unassignedPrimaryFromCodePoint(c));
|
||||
}
|
||||
|
||||
/**
 * Maps a primary weight through a lead-byte permutation table.
 * The lead byte (bits 31..24) of the primary is replaced with the table entry
 * for that byte; the lower 24 bits are passed through unchanged.
 *
 * NOTE(review): the hunk header for this span (-587,9 +587,5) and the callers'
 * switch to settings.reorder() elsewhere in this diff suggest that this method
 * is being removed by the commit -- confirm against the repository.
 *
 * @param reorderTable permutation table indexed by the primary lead byte
 * @param primary the primary weight to map
 * @return the primary weight with its lead byte replaced by the table entry
 */
static long reorder(byte[] reorderTable, long primary) {
|
||||
// The cast to int binds before the unsigned shift, so the index is always 0..255.
// "& 0xffL" removes the sign extension of the byte-typed table entry.
return ((reorderTable[(int)primary >>> 24] & 0xffL) << 24) | (primary & 0xffffff);
|
||||
}
|
||||
|
||||
// private Collation() // No instantiation.
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationCompare.java, ported from collationcompare.h/.cpp
|
||||
|
@ -79,10 +79,9 @@ public final class CollationCompare /* all static */ {
|
|||
|
||||
if (leftPrimary != rightPrimary) {
|
||||
// Return the primary difference, with script reordering.
|
||||
byte[] reorderTable = settings.reorderTable;
|
||||
if (reorderTable != null) {
|
||||
leftPrimary = Collation.reorder(reorderTable, leftPrimary);
|
||||
rightPrimary = Collation.reorder(reorderTable, rightPrimary);
|
||||
if (settings.hasReordering()) {
|
||||
leftPrimary = settings.reorder(leftPrimary);
|
||||
rightPrimary = settings.reorder(rightPrimary);
|
||||
}
|
||||
return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
|
||||
}
|
||||
|
@ -335,10 +334,9 @@ public final class CollationCompare /* all static */ {
|
|||
|
||||
if (leftQuaternary != rightQuaternary) {
|
||||
// Return the difference, with script reordering.
|
||||
byte[] reorderTable = settings.reorderTable;
|
||||
if (reorderTable != null) {
|
||||
leftQuaternary = Collation.reorder(reorderTable, leftQuaternary);
|
||||
rightQuaternary = Collation.reorder(reorderTable, rightQuaternary);
|
||||
if (settings.hasReordering()) {
|
||||
leftQuaternary = settings.reorder(leftQuaternary);
|
||||
rightQuaternary = settings.reorder(rightQuaternary);
|
||||
}
|
||||
return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2014, International Business Machines
|
||||
* Copyright (C) 2010-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationData.java, ported from collationdata.h/.cpp
|
||||
|
@ -16,6 +16,7 @@ import com.ibm.icu.impl.Trie2_32;
|
|||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ICUException;
|
||||
|
||||
/**
|
||||
* Collation data container.
|
||||
|
@ -25,6 +26,14 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
* Includes data for the collation base (root/default), aliased if this is not the base.
|
||||
*/
|
||||
public final class CollationData {
|
||||
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
|
||||
// parallel with the ranges, and resetting ranges that are indexed.
|
||||
// The reordering builder code could clone the resulting template array.
|
||||
static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
|
||||
static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
|
||||
|
||||
static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
|
||||
|
||||
CollationData(Normalizer2Impl nfc) {
|
||||
nfcImpl = nfc;
|
||||
}
|
||||
|
@ -182,12 +191,8 @@ public final class CollationData {
|
|||
* or 0 if the script is unknown
|
||||
*/
|
||||
long getFirstPrimaryForGroup(int script) {
|
||||
// NOTE(review): this span is a rendered diff without +/- markers.
// The findScript()/scripts[] statements look like the pre-change body and the
// getScriptIndex()/scriptStarts[] statements like its replacement -- confirm.
int index = findScript(script);
|
||||
if(index < 0) {
|
||||
return 0;
|
||||
}
|
||||
long head = scripts[index];
|
||||
// Takes bits 15..8 of the group head and moves them up into bits 31..24.
return (head & 0xff00) << 16;
|
||||
int index = getScriptIndex(script);
|
||||
// Index 0 yields 0; otherwise the 16-bit scriptStarts entry becomes
// the top 16 bits of the returned primary.
return index == 0 ? 0 : (long)scriptStarts[index] << 16;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -197,13 +202,12 @@ public final class CollationData {
|
|||
* or 0 if the script is unknown
|
||||
*/
|
||||
public long getLastPrimaryForGroup(int script) {
|
||||
// NOTE(review): this span is a rendered diff without +/- markers.
// The findScript()/scripts[] statements look like the pre-change body and the
// getScriptIndex()/scriptStarts[] statements like its replacement -- confirm.
int index = findScript(script);
|
||||
if(index < 0) {
|
||||
int index = getScriptIndex(script);
|
||||
if(index == 0) {
|
||||
return 0;
|
||||
}
|
||||
int head = scripts[index];
|
||||
long lastByte = head & 0xff;
|
||||
// One less than the first primary whose lead byte is lastByte + 1.
return ((lastByte + 1) << 24) - 1;
|
||||
// The next scriptStarts entry is used as the exclusive limit of this range.
long limit = scriptStarts[index + 1];
|
||||
// One less than the primary whose top 16 bits equal the limit.
return (limit << 16) - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -211,108 +215,154 @@ public final class CollationData {
|
|||
* @return the first script of the group, or -1 if the weight is beyond the last group
|
||||
*/
|
||||
public int getGroupForPrimary(long p) {
|
||||
p >>= 24; // Reordering groups are distinguished by primary lead bytes.
|
||||
for(int i = 0; i < scripts.length; i = i + 2 + scripts[i + 1]) {
|
||||
int lastByte = scripts[i] & 0xff;
|
||||
if(p <= lastByte) {
|
||||
return scripts[i + 2];
|
||||
p >>= 16;
|
||||
if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
|
||||
return -1;
|
||||
}
|
||||
int index = 1;
|
||||
while(p >= scriptStarts[index + 1]) { ++index; }
|
||||
for(int i = 0; i < numScripts; ++i) {
|
||||
if(scriptsIndex[i] == index) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
|
||||
if(scriptsIndex[numScripts + i] == index) {
|
||||
return Collator.ReorderCodes.FIRST + i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private int findScript(int script) {
|
||||
if(script < 0 || 0xffff < script) { return -1; }
|
||||
for(int i = 0; i < scripts.length;) {
|
||||
int limit = i + 2 + scripts[i + 1];
|
||||
for(int j = i + 2; j < limit; ++j) {
|
||||
if(script == scripts[j]) { return i; }
|
||||
private int getScriptIndex(int script) {
|
||||
if(script < 0) {
|
||||
return 0;
|
||||
} else if(script < numScripts) {
|
||||
return scriptsIndex[script];
|
||||
} else if(script < Collator.ReorderCodes.FIRST) {
|
||||
return 0;
|
||||
} else {
|
||||
script -= Collator.ReorderCodes.FIRST;
|
||||
if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
|
||||
return scriptsIndex[numScripts + script];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
i = limit;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public int[] getEquivalentScripts(int script) {
|
||||
int i = findScript(script);
|
||||
if(i < 0) { return EMPTY_INT_ARRAY; }
|
||||
int length = scripts[i + 1];
|
||||
assert(length != 0);
|
||||
int dest[] = new int[length];
|
||||
i += 2;
|
||||
dest[0] = scripts[i++];
|
||||
for(int j = 1; j < length; ++j) {
|
||||
script = scripts[i++];
|
||||
// Sorted insertion.
|
||||
for(int k = j;; --k) {
|
||||
// Invariant: dest[k] is free to receive either script or dest[k - 1].
|
||||
if(k > 0 && script < dest[k - 1]) {
|
||||
dest[k] = dest[k - 1];
|
||||
} else {
|
||||
dest[k] = script;
|
||||
break;
|
||||
}
|
||||
int index = getScriptIndex(script);
|
||||
if(index == 0) { return EMPTY_INT_ARRAY; }
|
||||
if(script >= Collator.ReorderCodes.FIRST) {
|
||||
// Special groups have no aliases.
|
||||
return new int[] { script };
|
||||
}
|
||||
|
||||
int length = 0;
|
||||
for(int i = 0; i < numScripts; ++i) {
|
||||
if(scriptsIndex[i] == index) {
|
||||
++length;
|
||||
}
|
||||
}
|
||||
int[] dest = new int[length];
|
||||
if(length == 1) {
|
||||
dest[0] = script;
|
||||
return dest;
|
||||
}
|
||||
length = 0;
|
||||
for(int i = 0; i < numScripts; ++i) {
|
||||
if(scriptsIndex[i] == index) {
|
||||
dest[length++] = i;
|
||||
}
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the permutation table for the given reordering of scripts and groups,
|
||||
* mapping from default-order primary-weight lead bytes to reordered lead bytes.
|
||||
* Writes the permutation of primary-weight ranges
|
||||
* for the given reordering of scripts and groups.
|
||||
* The caller checks for illegal arguments and
|
||||
* takes care of [DEFAULT] and memory allocation.
|
||||
*
|
||||
* <p>Each list element will be a (limit, offset) pair as described
|
||||
* for the CollationSettings.reorderRanges.
|
||||
* The list will be empty if no ranges are reordered.
|
||||
*/
|
||||
public void makeReorderTable(int[] reorder, byte[] table) {
|
||||
void makeReorderRanges(int[] reorder, UVector32 ranges) {
|
||||
makeReorderRanges(reorder, false, ranges);
|
||||
}
|
||||
|
||||
private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
|
||||
ranges.removeAllElements();
|
||||
int length = reorder.length;
|
||||
// Initialize the table.
|
||||
if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Maps each script-or-group range to a new lead byte.
|
||||
short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[]
|
||||
|
||||
{
|
||||
// Set "don't care" values for reserved ranges.
|
||||
int index = scriptsIndex[
|
||||
numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
|
||||
if(index != 0) {
|
||||
table[index] = 0xff;
|
||||
}
|
||||
index = scriptsIndex[
|
||||
numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
|
||||
if(index != 0) {
|
||||
table[index] = 0xff;
|
||||
}
|
||||
}
|
||||
|
||||
// Never reorder special low and high primary lead bytes.
|
||||
int lowByte;
|
||||
for(lowByte = 0; lowByte <= Collation.MERGE_SEPARATOR_BYTE; ++lowByte) {
|
||||
table[lowByte] = (byte)lowByte;
|
||||
}
|
||||
// lowByte == 03
|
||||
|
||||
int highByte;
|
||||
for(highByte = 0xff; highByte >= Collation.TRAIL_WEIGHT_BYTE; --highByte) {
|
||||
table[highByte] = (byte)highByte;
|
||||
}
|
||||
// highByte == FE
|
||||
|
||||
// Set intermediate bytes to 0 to indicate that they have not been set yet.
|
||||
for(int i = lowByte; i <= highByte; ++i) {
|
||||
table[i] = 0;
|
||||
}
|
||||
assert(scriptStarts.length >= 2);
|
||||
assert(scriptStarts[0] == 0);
|
||||
int lowStart = scriptStarts[1];
|
||||
assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
|
||||
int highLimit = scriptStarts[scriptStarts.length - 1];
|
||||
assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
|
||||
|
||||
// Get the set of special reorder codes in the input list.
|
||||
// This supports up to 32 special reorder codes;
|
||||
// This supports a fixed number of special reorder codes;
|
||||
// it works for data with codes beyond Collator.ReorderCodes.LIMIT.
|
||||
int specials = 0;
|
||||
for(int i = 0; i < length; ++i) {
|
||||
int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
|
||||
if(0 <= reorderCode && reorderCode <= 31) {
|
||||
if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
|
||||
specials |= 1 << reorderCode;
|
||||
}
|
||||
}
|
||||
|
||||
// Start the reordering with the special low reorder codes that do not occur in the input.
|
||||
for(int i = 0;; i += 3) {
|
||||
if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
|
||||
int reorderCode = scripts[i + 2] - Collator.ReorderCodes.FIRST;
|
||||
if(reorderCode < 0) { break; } // Went beyond special reorder codes.
|
||||
if((specials & (1 << reorderCode)) == 0) {
|
||||
int head = scripts[i];
|
||||
int firstByte = head >> 8;
|
||||
int lastByte = head & 0xff;
|
||||
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
|
||||
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
|
||||
int index = scriptsIndex[numScripts + i];
|
||||
if(index != 0 && (specials & (1 << i)) == 0) {
|
||||
lowStart = addLowScriptRange(table, index, lowStart);
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder according to the input scripts, continuing from the bottom of the bytes range.
|
||||
// Skip the reserved range before Latin if Latin is the first script,
|
||||
// so that we do not move it unnecessarily.
|
||||
int skippedReserved = 0;
|
||||
if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
|
||||
int index = scriptsIndex[UScript.LATIN];
|
||||
assert(index != 0);
|
||||
int start = scriptStarts[index];
|
||||
assert(lowStart <= start);
|
||||
skippedReserved = start - lowStart;
|
||||
lowStart = start;
|
||||
}
|
||||
|
||||
// Reorder according to the input scripts, continuing from the bottom of the primary range.
|
||||
boolean hasReorderToEnd = false;
|
||||
for(int i = 0; i < length;) {
|
||||
int script = reorder[i++];
|
||||
if(script == UScript.UNKNOWN) {
|
||||
// Put the remaining scripts at the top.
|
||||
hasReorderToEnd = true;
|
||||
while(i < length) {
|
||||
script = reorder[--length];
|
||||
if(script == UScript.UNKNOWN) { // Must occur at most once.
|
||||
|
@ -323,17 +373,14 @@ public final class CollationData {
|
|||
throw new IllegalArgumentException(
|
||||
"setReorderCodes(): UScript.DEFAULT together with other scripts");
|
||||
}
|
||||
int index = findScript(script);
|
||||
if(index < 0) { continue; }
|
||||
int head = scripts[index];
|
||||
int firstByte = head >> 8;
|
||||
int lastByte = head & 0xff;
|
||||
if(table[firstByte] != 0) { // Duplicate or equivalent script.
|
||||
int index = getScriptIndex(script);
|
||||
if(index == 0) { continue; }
|
||||
if(table[index] != 0) { // Duplicate or equivalent script.
|
||||
throw new IllegalArgumentException(
|
||||
"setReorderCodes(): duplicate or equivalent script " +
|
||||
scriptCodeString(script));
|
||||
}
|
||||
do { table[lastByte--] = (byte)highByte--; } while(firstByte <= lastByte);
|
||||
highLimit = addHighScriptRange(table, index, highLimit);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -343,25 +390,82 @@ public final class CollationData {
|
|||
throw new IllegalArgumentException(
|
||||
"setReorderCodes(): UScript.DEFAULT together with other scripts");
|
||||
}
|
||||
int index = findScript(script);
|
||||
if(index < 0) { continue; }
|
||||
int head = scripts[index];
|
||||
int firstByte = head >> 8;
|
||||
int lastByte = head & 0xff;
|
||||
if(table[firstByte] != 0) { // Duplicate or equivalent script.
|
||||
int index = getScriptIndex(script);
|
||||
if(index == 0) { continue; }
|
||||
if(table[index] != 0) { // Duplicate or equivalent script.
|
||||
throw new IllegalArgumentException(
|
||||
"setReorderCodes(): duplicate or equivalent script " +
|
||||
scriptCodeString(script));
|
||||
}
|
||||
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
|
||||
lowStart = addLowScriptRange(table, index, lowStart);
|
||||
}
|
||||
|
||||
// Put all remaining scripts into the middle.
|
||||
// Avoid table[0] which must remain 0.
|
||||
for(int i = 1; i <= 0xff; ++i) {
|
||||
if(table[i] == 0) { table[i] = (byte)lowByte++; }
|
||||
for(int i = 1; i < scriptStarts.length - 1; ++i) {
|
||||
int leadByte = table[i];
|
||||
if(leadByte != 0) { continue; }
|
||||
int start = scriptStarts[i];
|
||||
if(!hasReorderToEnd && start > lowStart) {
|
||||
// No need to move this script.
|
||||
lowStart = start;
|
||||
}
|
||||
lowStart = addLowScriptRange(table, i, lowStart);
|
||||
}
|
||||
assert(lowByte == highByte + 1);
|
||||
if(lowStart > highLimit) {
|
||||
if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
|
||||
// Try not skipping the before-Latin reserved range.
|
||||
makeReorderRanges(reorder, true, ranges);
|
||||
return;
|
||||
}
|
||||
// We need more primary lead bytes than available, despite the reserved ranges.
|
||||
throw new ICUException(
|
||||
"setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
|
||||
}
|
||||
|
||||
// Turn lead bytes into a list of (limit, offset) pairs.
|
||||
// Encode each pair in one list element:
|
||||
// Upper 16 bits = limit, lower 16 = signed lead byte offset.
|
||||
int offset = 0;
|
||||
for(int i = 1;; ++i) {
|
||||
int nextOffset = offset;
|
||||
while(i < scriptStarts.length - 1) {
|
||||
int newLeadByte = table[i];
|
||||
if(newLeadByte == 0xff) {
|
||||
// "Don't care" lead byte for reserved range, continue with current offset.
|
||||
} else {
|
||||
nextOffset = newLeadByte - (scriptStarts[i] >> 8);
|
||||
if(nextOffset != offset) { break; }
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if(offset != 0 || i < scriptStarts.length - 1) {
|
||||
ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
|
||||
}
|
||||
if(i == scriptStarts.length - 1) { break; }
|
||||
offset = nextOffset;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Places the range scriptStarts[index]..scriptStarts[index + 1] at the current
 * low end of the reordered primaries and records its new lead byte.
 *
 * @param table receives, at table[index], the new lead byte for this range
 * @param index index of the range in scriptStarts
 * @param lowStart current low start value (16 bits: lead byte and second byte)
 * @return the updated low start, just past the range that was placed
 */
private int addLowScriptRange(short[] table, int index, int lowStart) {
|
||||
int start = scriptStarts[index];
|
||||
// If the range starts at a lower second byte than lowStart,
// move lowStart up by one lead byte.
if((start & 0xff) < (lowStart & 0xff)) {
|
||||
lowStart += 0x100;
|
||||
}
|
||||
table[index] = (short)(lowStart >> 8);
|
||||
int limit = scriptStarts[index + 1];
|
||||
// Advance the lead byte by the number of lead bytes the range spans,
// and take the second byte from the range's limit.
lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
|
||||
return lowStart;
|
||||
}
|
||||
|
||||
/**
 * Places the range scriptStarts[index]..scriptStarts[index + 1] at the current
 * high end of the reordered primaries and records its new lead byte.
 *
 * @param table receives, at table[index], the new lead byte for this range
 * @param index index of the range in scriptStarts
 * @param highLimit current high limit value (16 bits: lead byte and second byte)
 * @return the updated high limit, which is the new start of the range that was placed
 */
private int addHighScriptRange(short[] table, int index, int highLimit) {
|
||||
int limit = scriptStarts[index + 1];
|
||||
// If the range ends at a higher second byte than highLimit,
// move highLimit down by one lead byte.
if((limit & 0xff) > (highLimit & 0xff)) {
|
||||
highLimit -= 0x100;
|
||||
}
|
||||
int start = scriptStarts[index];
|
||||
// Move the lead byte down by the number of lead bytes the range spans,
// and take the second byte from the range's start.
highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
|
||||
table[index] = (short)(highLimit >> 8);
|
||||
return highLimit;
|
||||
}
|
||||
|
||||
private static String scriptCodeString(int script) {
|
||||
|
@ -423,21 +527,25 @@ public final class CollationData {
|
|||
* Data for scripts and reordering groups.
|
||||
* Uses include building a reordering permutation table and
|
||||
* providing script boundaries to AlphabeticIndex.
|
||||
*
|
||||
* This data is a sorted list of primary-weight lead byte ranges (reordering groups),
|
||||
* each with a list of pairs sorted in base collation order;
|
||||
* each pair contains a script/reorder code and the lowest primary weight for that script.
|
||||
*
|
||||
* Data structure:
|
||||
* - Each reordering group is encoded in n+2 16-bit integers.
|
||||
* - First integer:
|
||||
* Bits 15..8: First byte of the reordering group's range.
|
||||
* Bits 7..0: Last byte of the reordering group's range.
|
||||
* - Second integer:
|
||||
* Length n of the list of script/reordering codes.
|
||||
* - Each further integer is a script or reordering code.
|
||||
*/
|
||||
char[] scripts;
|
||||
int numScripts;
|
||||
/**
|
||||
* The length of scriptsIndex is numScripts+16.
|
||||
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
|
||||
* 16 special reorder codes (not all used) are mapped starting at numScripts.
|
||||
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
|
||||
* There are special codes at the end for reorder-reserved primary ranges.
|
||||
*
|
||||
* <p>Multiple scripts may share a range and index, for example Hira & Kana.
|
||||
*/
|
||||
char[] scriptsIndex;
|
||||
/**
|
||||
* Start primary weight (top 16 bits only) for a group/script/reserved range
|
||||
* indexed by scriptsIndex.
|
||||
* The first range (separators & terminators) and the last range (trailing weights)
|
||||
* are not reorderable, and no scriptsIndex entry points to them.
|
||||
*/
|
||||
char[] scriptStarts;
|
||||
|
||||
/**
|
||||
* Collation elements in the root collator.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
|
||||
|
@ -310,7 +310,9 @@ final class CollationDataBuilder { // not final in C++
|
|||
if(base != null) {
|
||||
data.numericPrimary = base.numericPrimary;
|
||||
data.compressibleBytes = base.compressibleBytes;
|
||||
data.scripts = base.scripts;
|
||||
data.numScripts = base.numScripts;
|
||||
data.scriptsIndex = base.scriptsIndex;
|
||||
data.scriptStarts = base.scriptStarts;
|
||||
}
|
||||
buildFastLatinTable(data);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationDataReader.java, ported from collationdatareader.h/.cpp
|
||||
|
@ -13,6 +13,7 @@ package com.ibm.icu.impl.coll;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.Arrays;
|
||||
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
|
@ -143,6 +144,7 @@ final class CollationDataReader /* all static */ {
|
|||
|
||||
CollationData baseData = base == null ? null : base.data;
|
||||
int[] reorderCodes;
|
||||
int reorderCodesLength;
|
||||
index = IX_REORDER_CODES_OFFSET;
|
||||
offset = inIndexes[index];
|
||||
length = inIndexes[index + 1] - offset;
|
||||
|
@ -152,13 +154,27 @@ final class CollationDataReader /* all static */ {
|
|||
// the base data does not have a reordering.
|
||||
throw new ICUException("Collation base data must not reorder scripts");
|
||||
}
|
||||
reorderCodes = new int[length / 4];
|
||||
for(int i = 0; i < length / 4; ++i) {
|
||||
reorderCodesLength = length / 4;
|
||||
reorderCodes = new int[reorderCodesLength];
|
||||
for(int i = 0; i < reorderCodesLength; ++i) {
|
||||
reorderCodes[i] = inBytes.getInt();
|
||||
}
|
||||
length &= 3;
|
||||
|
||||
// The reorderRanges (if any) are the trailing reorderCodes entries.
|
||||
// Split the array at the boundary.
|
||||
// Script or reorder codes do not exceed 16-bit values.
|
||||
// Range limits are stored in the upper 16 bits, and are never 0.
|
||||
int reorderRangesLength = 0;
|
||||
while(reorderRangesLength < reorderCodesLength &&
|
||||
(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
|
||||
++reorderRangesLength;
|
||||
}
|
||||
assert(reorderRangesLength < reorderCodesLength);
|
||||
reorderCodesLength -= reorderRangesLength;
|
||||
} else {
|
||||
reorderCodes = new int[0];
|
||||
reorderCodesLength = 0;
|
||||
}
|
||||
ICUBinary.skipBytes(inBytes, length);
|
||||
|
||||
|
@ -170,7 +186,7 @@ final class CollationDataReader /* all static */ {
|
|||
offset = inIndexes[index];
|
||||
length = inIndexes[index + 1] - offset;
|
||||
if(length >= 256) {
|
||||
if(reorderCodes.length == 0) {
|
||||
if(reorderCodesLength == 0) {
|
||||
throw new ICUException("Reordering table without reordering codes");
|
||||
}
|
||||
reorderTable = new byte[256];
|
||||
|
@ -410,15 +426,28 @@ final class CollationDataReader /* all static */ {
|
|||
if(data == null) {
|
||||
throw new ICUException("Script order data but no mappings");
|
||||
}
|
||||
data.scripts = new char[length / 2];
|
||||
for(int i = 0; i < length / 2; ++i) {
|
||||
data.scripts[i] = inBytes.getChar();
|
||||
int scriptsLength = length / 2;
|
||||
CharBuffer inChars = inBytes.asCharBuffer();
|
||||
data.numScripts = inChars.get();
|
||||
// There must be enough entries for both arrays, including more than two range starts.
|
||||
int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
|
||||
if(scriptStartsLength <= 2) {
|
||||
throw new ICUException("Script order data too short");
|
||||
}
|
||||
inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
|
||||
inChars.get(data.scriptStarts = new char[scriptStartsLength]);
|
||||
if(!(data.scriptStarts[0] == 0 &&
|
||||
data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
|
||||
data.scriptStarts[scriptStartsLength - 1] ==
|
||||
(Collation.TRAIL_WEIGHT_BYTE << 8))) {
|
||||
throw new ICUException("Script order data not valid");
|
||||
}
|
||||
length &= 1;
|
||||
} else if(data == null) {
|
||||
// Nothing to do.
|
||||
} else if(baseData != null) {
|
||||
data.scripts = baseData.scripts;
|
||||
data.numScripts = baseData.numScripts;
|
||||
data.scriptsIndex = baseData.scriptsIndex;
|
||||
data.scriptStarts = baseData.scriptStarts;
|
||||
}
|
||||
ICUBinary.skipBytes(inBytes, length);
|
||||
|
||||
|
@ -470,12 +499,8 @@ final class CollationDataReader /* all static */ {
|
|||
throw new ICUException("The maxVariable could not be mapped to a variableTop");
|
||||
}
|
||||
|
||||
if(reorderCodes.length == 0 || reorderTable != null) {
|
||||
settings.setReordering(reorderCodes, reorderTable);
|
||||
} else {
|
||||
byte[] table = new byte[256];
|
||||
baseData.makeReorderTable(reorderCodes, table);
|
||||
settings.setReordering(reorderCodes, table);
|
||||
if(reorderCodesLength != 0) {
|
||||
settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
|
||||
}
|
||||
|
||||
settings.fastLatinOptions = CollationFastLatin.getOptions(
|
||||
|
@ -486,7 +511,7 @@ final class CollationDataReader /* all static */ {
|
|||
/**
 * ICUBinary.Authenticate implementation that accepts data based only on
 * the first byte of the version array.
 */
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
// @Override when we switch to Java 6
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
// NOTE(review): this span is a rendered diff without +/- markers, so both return
// lines appear. The "== 4" line looks like the removed check and "== 5" like its
// replacement, matching the "data formatVersion 5" wording of the commit summary -- confirm.
return version[0] == 4;
|
||||
return version[0] == 5;
|
||||
}
|
||||
}
|
||||
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationFastLatin.java, ported from collationfastlatin.h/.cpp
|
||||
|
@ -23,7 +23,7 @@ public final class CollationFastLatin /* all static */ {
|
|||
* When the major version number of the main data format changes,
|
||||
* we can reset this fast Latin version to 1.
|
||||
*/
|
||||
public static final int VERSION = 1;
|
||||
public static final int VERSION = 2;
|
||||
|
||||
public static final int LATIN_MAX = 0x17f;
|
||||
public static final int LATIN_LIMIT = LATIN_MAX + 1;
|
||||
|
@ -211,33 +211,50 @@ public final class CollationFastLatin /* all static */ {
|
|||
// lowest long mini primary.
|
||||
miniVarTop = MIN_LONG - 1;
|
||||
} else {
|
||||
int v1 = (int)(settings.variableTop >> 24);
|
||||
int headerLength = header[0] & 0xff;
|
||||
int i = headerLength - 1;
|
||||
if(i <= 0 || v1 > (header[i] & 0x7f)) {
|
||||
int i = 1 + settings.getMaxVariable();
|
||||
if(i >= headerLength) {
|
||||
return -1; // variableTop >= digits, should not occur
|
||||
}
|
||||
while(i > 1 && v1 <= (header[i - 1] & 0x7f)) { --i; }
|
||||
// In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
|
||||
// Shift right to make it comparable with long mini primaries in bits 15..3.
|
||||
miniVarTop = (header[i] & 0xff80) >> 4;
|
||||
miniVarTop = header[i];
|
||||
}
|
||||
|
||||
byte[] reorderTable = settings.reorderTable;
|
||||
if(reorderTable != null) {
|
||||
char[] scripts = data.scripts;
|
||||
int length = data.scripts.length;
|
||||
int prevLastByte = 0;
|
||||
for(int i = 0; i < length;) {
|
||||
// reordered last byte of the group
|
||||
int lastByte = reorderTable[scripts[i] & 0xff] & 0xff;
|
||||
if(lastByte < prevLastByte) {
|
||||
// The permutation affects the groups up to Latin.
|
||||
return -1;
|
||||
boolean digitsAreReordered = false;
|
||||
if(settings.hasReordering()) {
|
||||
long prevStart = 0;
|
||||
long beforeDigitStart = 0;
|
||||
long digitStart = 0;
|
||||
long afterDigitStart = 0;
|
||||
for(int group = Collator.ReorderCodes.FIRST;
|
||||
group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES;
|
||||
++group) {
|
||||
long start = data.getFirstPrimaryForGroup(group);
|
||||
start = settings.reorder(start);
|
||||
if(group == Collator.ReorderCodes.DIGIT) {
|
||||
beforeDigitStart = prevStart;
|
||||
digitStart = start;
|
||||
} else if(start != 0) {
|
||||
if(start < prevStart) {
|
||||
// The permutation affects the groups up to Latin.
|
||||
return -1;
|
||||
}
|
||||
// In the future, there might be a special group between digits & Latin.
|
||||
if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
|
||||
afterDigitStart = start;
|
||||
}
|
||||
prevStart = start;
|
||||
}
|
||||
if(scripts[i + 2] == UScript.LATIN) { break; }
|
||||
i = i + 2 + scripts[i + 1];
|
||||
prevLastByte = lastByte;
|
||||
}
|
||||
long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN);
|
||||
latinStart = settings.reorder(latinStart);
|
||||
if(latinStart < prevStart) {
|
||||
return -1;
|
||||
}
|
||||
if(afterDigitStart == 0) {
|
||||
afterDigitStart = latinStart;
|
||||
}
|
||||
if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
|
||||
digitsAreReordered = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -253,7 +270,7 @@ public final class CollationFastLatin /* all static */ {
|
|||
}
|
||||
primaries[c] = (char)p;
|
||||
}
|
||||
if((settings.options & CollationSettings.NUMERIC) != 0) {
|
||||
if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) {
|
||||
// Bail out for digits.
|
||||
for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
|
||||
|
@ -127,38 +127,26 @@ final class CollationFastLatinBuilder {
|
|||
}
|
||||
|
||||
private boolean loadGroups(CollationData data) {
|
||||
result.append(0); // reserved for version & headerLength
|
||||
headerLength = 1 + NUM_SPECIAL_GROUPS;
|
||||
int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
|
||||
result.append((char)r0);
|
||||
// The first few reordering groups should be special groups
|
||||
// (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
|
||||
for(int i = 0;;) {
|
||||
if(i >= data.scripts.length) {
|
||||
throw new AssertionError("no Latn script");
|
||||
for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
|
||||
lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i);
|
||||
if(lastSpecialPrimaries[i] == 0) {
|
||||
// missing data
|
||||
return false;
|
||||
}
|
||||
int head = data.scripts[i];
|
||||
int lastByte = head & 0xff; // last primary byte in the group
|
||||
int group = data.scripts[i + 2];
|
||||
if(group == Collator.ReorderCodes.DIGIT) {
|
||||
firstDigitPrimary = (long)(head & 0xff00) << 16;
|
||||
headerLength = result.length();
|
||||
int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
|
||||
result.setCharAt(0, (char)r0);
|
||||
} else if(group == UScript.LATIN) {
|
||||
if(firstDigitPrimary == 0) {
|
||||
throw new AssertionError("no digit group");
|
||||
}
|
||||
firstLatinPrimary = (long)(head & 0xff00) << 16;
|
||||
lastLatinPrimary = ((long)lastByte << 24) | 0xffffff;
|
||||
break;
|
||||
} else if(firstDigitPrimary == 0) {
|
||||
// a group below digits
|
||||
if(lastByte > 0x7f) {
|
||||
// We only use 7 bits for the last byte of a below-digits group.
|
||||
// This does not warrant an errorCode, but we do not build a fast Latin table.
|
||||
return false;
|
||||
}
|
||||
result.append((char)lastByte);
|
||||
}
|
||||
i = i + 2 + data.scripts[i + 1];
|
||||
result.append(0); // reserve a slot for this group
|
||||
}
|
||||
|
||||
firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
|
||||
firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
|
||||
lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);
|
||||
if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
|
||||
// missing data
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -173,23 +161,21 @@ final class CollationFastLatinBuilder {
|
|||
}
|
||||
// Both or neither must be potentially-variable,
|
||||
// so that we can test only one and determine if both are variable.
|
||||
if(p >= firstDigitPrimary) {
|
||||
return q >= firstDigitPrimary;
|
||||
} else if(q >= firstDigitPrimary) {
|
||||
long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
|
||||
if(p > lastVariablePrimary) {
|
||||
return q > lastVariablePrimary;
|
||||
} else if(q > lastVariablePrimary) {
|
||||
return false;
|
||||
}
|
||||
// Both will be encoded with long mini primaries.
|
||||
// They must be in the same special reordering group,
|
||||
// so that we can test only one and determine if both are variable.
|
||||
p >>= 24; // first primary byte
|
||||
q >>= 24;
|
||||
assert(p != 0 && q != 0);
|
||||
assert(p <= result.charAt(headerLength - 1)); // the loop will terminate
|
||||
for(int i = 1;; ++i) {
|
||||
long lastByte = result.charAt(i);
|
||||
if(p <= lastByte) {
|
||||
return q <= lastByte;
|
||||
} else if(q <= lastByte) {
|
||||
for(int i = 0;; ++i) { // will terminate
|
||||
long lastPrimary = lastSpecialPrimaries[i];
|
||||
if(p <= lastPrimary) {
|
||||
return q <= lastPrimary;
|
||||
} else if(q <= lastPrimary) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -416,8 +402,8 @@ final class CollationFastLatinBuilder {
|
|||
|
||||
private void encodeUniqueCEs() {
|
||||
miniCEs = new char[uniqueCEs.size()];
|
||||
int group = 1;
|
||||
long lastGroupByte = result.charAt(group);
|
||||
int group = 0;
|
||||
long lastGroupPrimary = lastSpecialPrimaries[group];
|
||||
// The lowest unique CE must be at least a secondary CE.
|
||||
assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
|
||||
long prevPrimary = 0;
|
||||
|
@ -431,16 +417,15 @@ final class CollationFastLatinBuilder {
|
|||
// (uniqueCEs does not store case bits.)
|
||||
long p = ce >>> 32;
|
||||
if(p != prevPrimary) {
|
||||
int p1 = (int)(p >> 24);
|
||||
while(p1 > lastGroupByte) {
|
||||
while(p > lastGroupPrimary) {
|
||||
assert(pri <= CollationFastLatin.MAX_LONG);
|
||||
// Add the last "long primary" in or before the group
|
||||
// into the upper 9 bits of the group entry.
|
||||
result.setCharAt(group, (char)((pri << 4) | lastGroupByte));
|
||||
if(++group < headerLength) { // group is 1-based
|
||||
lastGroupByte = result.charAt(group);
|
||||
// Set the group's header entry to the
|
||||
// last "long primary" in or before the group.
|
||||
result.setCharAt(1 + group, (char)pri);
|
||||
if(++group < NUM_SPECIAL_GROUPS) {
|
||||
lastGroupPrimary = lastSpecialPrimaries[group];
|
||||
} else {
|
||||
lastGroupByte = 0xff;
|
||||
lastGroupPrimary = 0xffffffffL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -686,6 +671,10 @@ final class CollationFastLatinBuilder {
|
|||
return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE;
|
||||
}
|
||||
|
||||
// space, punct, symbol, currency (not digit)
|
||||
private static final int NUM_SPECIAL_GROUPS =
|
||||
Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1;
|
||||
|
||||
private static final long CONTRACTION_FLAG = 0x80000000L;
|
||||
|
||||
// temporary "buffer"
|
||||
|
@ -699,7 +688,8 @@ final class CollationFastLatinBuilder {
|
|||
/** One 16-bit mini CE per unique CE. */
|
||||
private char[] miniCEs;
|
||||
|
||||
// These are constant for a given list of CollationData.scripts.
|
||||
// These are constant for a given root collator.
|
||||
long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS];
|
||||
private long firstDigitPrimary;
|
||||
private long firstLatinPrimary;
|
||||
private long lastLatinPrimary;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationKeys.java, ported from collationkeys.h/.cpp
|
||||
|
@ -348,7 +348,6 @@ public final class CollationKeys /* all methods are static */ {
|
|||
// +1 so that we can use "<" and primary ignorables test out early.
|
||||
variableTop = settings.variableTop + 1;
|
||||
}
|
||||
byte[] reorderTable = settings.reorderTable;
|
||||
|
||||
int tertiaryMask = CollationSettings.getTertiaryMask(options);
|
||||
|
||||
|
@ -358,7 +357,7 @@ public final class CollationKeys /* all methods are static */ {
|
|||
SortKeyLevel tertiaries = getSortKeyLevel(levels, Collation.TERTIARY_LEVEL_FLAG);
|
||||
SortKeyLevel quaternaries = getSortKeyLevel(levels, Collation.QUATERNARY_LEVEL_FLAG);
|
||||
|
||||
int compressedP1 = 0; // 0==no compression; otherwise reordered compressible lead byte
|
||||
long prevReorderedPrimary = 0; // 0==no compression
|
||||
int commonCases = 0;
|
||||
int commonSecondaries = 0;
|
||||
int commonTertiaries = 0;
|
||||
|
@ -387,16 +386,15 @@ public final class CollationKeys /* all methods are static */ {
|
|||
}
|
||||
do {
|
||||
if ((levels & Collation.QUATERNARY_LEVEL_FLAG) != 0) {
|
||||
int p1 = (int) p >>> 24;
|
||||
if (reorderTable != null) {
|
||||
p1 = reorderTable[p1] & 0xff;
|
||||
if (settings.hasReordering()) {
|
||||
p = settings.reorder(p);
|
||||
}
|
||||
if (p1 >= QUAT_SHIFTED_LIMIT_BYTE) {
|
||||
if (((int) p >>> 24) >= QUAT_SHIFTED_LIMIT_BYTE) {
|
||||
// Prevent shifted primary lead bytes from
|
||||
// overlapping with the common compression range.
|
||||
quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE);
|
||||
}
|
||||
quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff));
|
||||
quaternaries.appendWeight32(p);
|
||||
}
|
||||
do {
|
||||
ce = iter.nextCE();
|
||||
|
@ -409,13 +407,15 @@ public final class CollationKeys /* all methods are static */ {
|
|||
// If ce==NO_CE, then write nothing for the primary level but
|
||||
// terminate compression on all levels and then exit the loop.
|
||||
if (p > Collation.NO_CE_PRIMARY && (levels & Collation.PRIMARY_LEVEL_FLAG) != 0) {
|
||||
int p1 = (int) p >>> 24;
|
||||
if (reorderTable != null) {
|
||||
p1 = reorderTable[p1] & 0xff;
|
||||
// Test the un-reordered primary for compressibility.
|
||||
boolean isCompressible = compressibleBytes[(int) p >>> 24];
|
||||
if(settings.hasReordering()) {
|
||||
p = settings.reorder(p);
|
||||
}
|
||||
if (p1 != compressedP1) {
|
||||
if (compressedP1 != 0) {
|
||||
if (p1 < compressedP1) {
|
||||
int p1 = (int) p >>> 24;
|
||||
if (!isCompressible || p1 != ((int) prevReorderedPrimary >>> 24)) {
|
||||
if (prevReorderedPrimary != 0) {
|
||||
if (p < prevReorderedPrimary) {
|
||||
// No primary compression terminator
|
||||
// at the end of the level or merged segment.
|
||||
if (p1 > Collation.MERGE_SEPARATOR_BYTE) {
|
||||
|
@ -426,12 +426,10 @@ public final class CollationKeys /* all methods are static */ {
|
|||
}
|
||||
}
|
||||
sink.Append(p1);
|
||||
// Test the un-reordered lead byte for compressibility but
|
||||
// remember the reordered lead byte.
|
||||
if (compressibleBytes[(int) p >>> 24]) {
|
||||
compressedP1 = p1;
|
||||
if(isCompressible) {
|
||||
prevReorderedPrimary = p;
|
||||
} else {
|
||||
compressedP1 = 0;
|
||||
prevReorderedPrimary = 0;
|
||||
}
|
||||
}
|
||||
byte p2 = (byte) (p >>> 16);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationRuleParser.java, ported from collationruleparser.h/.cpp
|
||||
|
@ -718,17 +718,14 @@ public final class CollationRuleParser {
|
|||
reorderCodes.add(code);
|
||||
i = limit;
|
||||
}
|
||||
int length = reorderCodes.size();
|
||||
if(length == 1 && reorderCodes.get(0) == Collator.ReorderCodes.NONE) {
|
||||
if(reorderCodes.isEmpty()) {
|
||||
settings.resetReordering();
|
||||
return;
|
||||
} else {
|
||||
int[] codes = new int[reorderCodes.size()];
|
||||
int j = 0;
|
||||
for(Integer code : reorderCodes) { codes[j++] = code; }
|
||||
settings.setReordering(baseData, codes);
|
||||
}
|
||||
int[] codes = new int[reorderCodes.size()];
|
||||
int j = 0;
|
||||
for(Integer code : reorderCodes) { codes[j++] = code; }
|
||||
byte[] table = new byte[256];
|
||||
baseData.makeReorderTable(codes, table);
|
||||
settings.setReordering(codes, table);
|
||||
}
|
||||
|
||||
private static final String[] gSpecialReorderCodes = {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationSettings.java, ported from collationsettings.h/.cpp
|
||||
|
@ -93,7 +93,7 @@ public final class CollationSettings extends SharedObject {
|
|||
@Override
|
||||
public CollationSettings clone() {
|
||||
CollationSettings newSettings = (CollationSettings)super.clone();
|
||||
// Note: The reorderTable and reorderCodes need not be cloned
|
||||
// Note: The reorderTable, reorderRanges, and reorderCodes need not be cloned
|
||||
// because, in Java, they only get replaced but not modified.
|
||||
newSettings.fastLatinPrimaries = fastLatinPrimaries.clone();
|
||||
return newSettings;
|
||||
|
@ -125,16 +125,180 @@ public final class CollationSettings extends SharedObject {
|
|||
// When we turn off reordering, we want to set a null permutation
|
||||
// rather than a no-op permutation.
|
||||
reorderTable = null;
|
||||
minHighNoReorder = 0;
|
||||
reorderRanges = null;
|
||||
reorderCodes = EMPTY_INT_ARRAY;
|
||||
}
|
||||
// No aliasReordering() in Java. Use setReordering(). See comments near reorderCodes.
|
||||
public void setReordering(int[] codes, byte[] table) {
|
||||
|
||||
void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) {
|
||||
int[] codes;
|
||||
if(codesLength == codesAndRanges.length) {
|
||||
codes = codesAndRanges;
|
||||
} else {
|
||||
// TODO: Java 6: Arrays.copyOf(codes, codesLength);
|
||||
codes = new int[codesLength];
|
||||
System.arraycopy(codesAndRanges, 0, codes, 0, codesLength);
|
||||
}
|
||||
int rangesStart = codesLength;
|
||||
int rangesLimit = codesAndRanges.length;
|
||||
int rangesLength = rangesLimit - rangesStart;
|
||||
if(table != null &&
|
||||
(rangesLength == 0 ?
|
||||
!reorderTableHasSplitBytes(table) :
|
||||
rangesLength >= 2 &&
|
||||
// The first offset must be 0. The last offset must not be 0.
|
||||
(codesAndRanges[rangesStart] & 0xffff) == 0 &&
|
||||
(codesAndRanges[rangesLimit - 1] & 0xffff) != 0)) {
|
||||
reorderTable = table;
|
||||
reorderCodes = codes;
|
||||
// Drop ranges before the first split byte. They are reordered by the table.
|
||||
// This then speeds up reordering of the remaining ranges.
|
||||
int firstSplitByteRangeIndex = rangesStart;
|
||||
while(firstSplitByteRangeIndex < rangesLimit &&
|
||||
(codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
|
||||
// The second byte of the primary limit is 0.
|
||||
++firstSplitByteRangeIndex;
|
||||
}
|
||||
if(firstSplitByteRangeIndex == rangesLimit) {
|
||||
assert(!reorderTableHasSplitBytes(table));
|
||||
minHighNoReorder = 0;
|
||||
reorderRanges = null;
|
||||
} else {
|
||||
assert(table[codesAndRanges[firstSplitByteRangeIndex] >>> 24] == 0);
|
||||
minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
|
||||
setReorderRanges(codesAndRanges, firstSplitByteRangeIndex,
|
||||
rangesLimit - firstSplitByteRangeIndex);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Regenerate missing data.
|
||||
setReordering(data, codes);
|
||||
}
|
||||
|
||||
public void setReordering(CollationData data, int[] codes) {
|
||||
if(codes.length == 0 || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE)) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
UVector32 rangesList = new UVector32();
|
||||
data.makeReorderRanges(codes, rangesList);
|
||||
int rangesLength = rangesList.size();
|
||||
if(rangesLength == 0) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
int[] ranges = rangesList.getBuffer();
|
||||
// ranges[] contains at least two (limit, offset) pairs.
|
||||
// The first offset must be 0. The last offset must not be 0.
|
||||
// Separators (at the low end) and trailing weights (at the high end)
|
||||
// are never reordered.
|
||||
assert(rangesLength >= 2);
|
||||
assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
|
||||
minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;
|
||||
|
||||
// Write the lead byte permutation table.
|
||||
// Set a 0 for each lead byte that has a range boundary in the middle.
|
||||
byte[] table = new byte[256];
|
||||
int b = 0;
|
||||
int firstSplitByteRangeIndex = -1;
|
||||
for(int i = 0; i < rangesLength; ++i) {
|
||||
int pair = ranges[i];
|
||||
int limit1 = pair >>> 24;
|
||||
while(b < limit1) {
|
||||
table[b] = (byte)(b + pair);
|
||||
++b;
|
||||
}
|
||||
// Check the second byte of the limit.
|
||||
if((pair & 0xff0000) != 0) {
|
||||
table[limit1] = 0;
|
||||
b = limit1 + 1;
|
||||
if(firstSplitByteRangeIndex < 0) {
|
||||
firstSplitByteRangeIndex = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
while(b <= 0xff) {
|
||||
table[b] = (byte)b;
|
||||
++b;
|
||||
}
|
||||
int rangesStart;
|
||||
if(firstSplitByteRangeIndex < 0) {
|
||||
// The lead byte permutation table alone suffices for reordering.
|
||||
rangesStart = rangesLength = 0;
|
||||
} else {
|
||||
// Remove the ranges below the first split byte.
|
||||
rangesStart = firstSplitByteRangeIndex;
|
||||
rangesLength -= firstSplitByteRangeIndex;
|
||||
}
|
||||
setReorderArrays(codes, ranges, rangesStart, rangesLength, table);
|
||||
}
|
||||
|
||||
private void setReorderArrays(int[] codes,
|
||||
int[] ranges, int rangesStart, int rangesLength, byte[] table) {
|
||||
// Very different from C++. See the comments after the reorderCodes declaration.
|
||||
if(codes == null) {
|
||||
codes = EMPTY_INT_ARRAY;
|
||||
}
|
||||
assert (codes.length == 0) == (table == null);
|
||||
reorderTable = table;
|
||||
reorderCodes = codes;
|
||||
setReorderRanges(ranges, rangesStart, rangesLength);
|
||||
}
|
||||
|
||||
private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) {
|
||||
if(rangesLength == 0) {
|
||||
reorderRanges = null;
|
||||
} else {
|
||||
reorderRanges = new long[rangesLength];
|
||||
int i = 0;
|
||||
do {
|
||||
reorderRanges[i++] = ranges[rangesStart++] & 0xffffffffL;
|
||||
} while(i < rangesLength);
|
||||
}
|
||||
}
|
||||
|
||||
public void copyReorderingFrom(CollationSettings other) {
|
||||
if(!other.hasReordering()) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
minHighNoReorder = other.minHighNoReorder;
|
||||
reorderTable = other.reorderTable;
|
||||
reorderRanges = other.reorderRanges;
|
||||
reorderCodes = other.reorderCodes;
|
||||
}
|
||||
|
||||
/** Returns true if a lead byte permutation table is set. */
public boolean hasReordering() {
    return reorderTable != null;
}
|
||||
|
||||
private static boolean reorderTableHasSplitBytes(byte[] table) {
|
||||
assert(table[0] == 0);
|
||||
for(int i = 1; i < 256; ++i) {
|
||||
if(table[i] == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public long reorder(long p) {
|
||||
byte b = reorderTable[(int)p >>> 24];
|
||||
if(b != 0 || p <= Collation.NO_CE_PRIMARY) {
|
||||
return ((b & 0xffL) << 24) | (p & 0xffffff);
|
||||
} else {
|
||||
return reorderEx(p);
|
||||
}
|
||||
}
|
||||
|
||||
private long reorderEx(long p) {
|
||||
assert minHighNoReorder > 0;
|
||||
if(p >= minHighNoReorder) { return p; }
|
||||
// Round up p so that its lower 16 bits are >= any offset bits.
|
||||
// Then compare q directly with (limit, offset) pairs.
|
||||
long q = p | 0xffff;
|
||||
long r;
|
||||
int i = 0;
|
||||
while(q >= (r = reorderRanges[i])) { ++i; }
|
||||
return p + ((long)(short)r << 24);
|
||||
}
|
||||
|
||||
// In C++, we use enums for attributes and their values, with a special value for the default.
|
||||
|
@ -276,11 +440,39 @@ public final class CollationSettings extends SharedObject {
|
|||
(MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
|
||||
/** Variable-top primary weight. */
|
||||
public long variableTop;
|
||||
/** 256-byte table for reordering permutation of primary lead bytes; null if no reordering. */
|
||||
/**
|
||||
* 256-byte table for reordering permutation of primary lead bytes; null if no reordering.
|
||||
* A 0 entry at a non-zero index means that the primary lead byte is "split"
|
||||
* (there are different offsets for primaries that share that lead byte)
|
||||
* and the reordering offset must be determined via the reorderRanges.
|
||||
*/
|
||||
public byte[] reorderTable;
|
||||
/** Limit of last reordered range. 0 if no reordering or no split bytes. */
|
||||
long minHighNoReorder;
|
||||
/**
|
||||
* Primary-weight ranges for script reordering,
|
||||
* to be used by reorder(p) for split-reordered primary lead bytes.
|
||||
*
|
||||
* <p>Each entry is a (limit, offset) pair.
|
||||
* The upper 16 bits of the entry are the upper 16 bits of the
|
||||
* exclusive primary limit of a range.
|
||||
* Primaries between the previous limit and this one have their lead bytes
|
||||
* modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
|
||||
*
|
||||
* <p>CollationData.makeReorderRanges() writes a full list where the first range
|
||||
* (at least for terminators and separators) has a 0 offset.
|
||||
* The last range has a non-zero offset.
|
||||
* minHighNoReorder is set to the limit of that last range.
|
||||
*
|
||||
* <p>In the settings object, the initial ranges before the first split lead byte
|
||||
* are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
|
||||
* If there are no split-reordered lead bytes, then no ranges are needed.
|
||||
*/
|
||||
long[] reorderRanges;
|
||||
/** Array of reorder codes; ignored if length == 0. */
|
||||
public int[] reorderCodes = EMPTY_INT_ARRAY;
|
||||
// Note: In C++, we keep a memory block around for the reorder codes and the permutation table,
|
||||
// Note: In C++, we keep a memory block around for the reorder codes,
|
||||
// the ranges, and the permutation table,
|
||||
// and modify them for new codes.
|
||||
// In Java, we simply copy references and then never modify the array contents.
|
||||
// The caller must abandon the arrays.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* CollationTailoring.java, ported from collationtailoring.h/.cpp
|
||||
|
@ -33,6 +33,7 @@ public final class CollationTailoring {
|
|||
if(baseSettings != null) {
|
||||
assert(baseSettings.readOnly().reorderCodes.length == 0);
|
||||
assert(baseSettings.readOnly().reorderTable == null);
|
||||
assert(baseSettings.readOnly().minHighNoReorder == 0);
|
||||
settings = baseSettings.clone();
|
||||
} else {
|
||||
settings = new SharedObject.Reference<CollationSettings>(new CollationSettings());
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -399,27 +399,35 @@ public abstract class Collator implements Comparator<Object>, Freezable<Collator
|
|||
|
||||
/**
|
||||
* Sets the reordering codes for this collator.
|
||||
* <p>Collation reordering allows scripts and some other defined blocks of characters
|
||||
* to be moved relative to each other as a block. This reordering is done on top of
|
||||
* Collation reordering allows scripts and some other groups of characters
|
||||
* to be moved relative to each other. This reordering is done on top of
|
||||
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
|
||||
* at the start and/or the end of the collation order. These groups are specified using
|
||||
* UScript codes and UColReorderCode entries.
|
||||
* UScript codes and {@link Collator.ReorderCodes} entries.
|
||||
*
|
||||
* <p>By default, reordering codes specified for the start of the order are placed in the
|
||||
* order given after a group of "special" non-script blocks. These special groups of characters
|
||||
* order given after several special non-script blocks. These special groups of characters
|
||||
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
|
||||
* UColReorderCode entries. Script groups can be intermingled with
|
||||
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
|
||||
* <p>The special code OTHERS stands for any script that is not explicitly
|
||||
* {@link Collator.ReorderCodes} entries. Script groups can be intermingled with
|
||||
* these special non-script groups if those special groups are explicitly specified in the reordering.
|
||||
*
|
||||
* <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS}
|
||||
* stands for any script that is not explicitly
|
||||
* mentioned in the list of reordering codes given. Anything that is after OTHERS
|
||||
* will go at the very end of the reordering in the order given.
|
||||
* <p>The special reorder code DEFAULT will reset the reordering for this collator
|
||||
*
|
||||
* <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT}
|
||||
* will reset the reordering for this collator
|
||||
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
|
||||
* was specified when this collator was created from resource data or from rules. The
|
||||
* DEFAULT code <b>must</b> be the sole code supplied when it used. If not
|
||||
* that will result in an U_ILLEGAL_ARGUMENT_ERROR being set.
|
||||
* <p>The special reorder code NONE will remove any reordering for this collator.
|
||||
* DEFAULT code <b>must</b> be the sole code supplied when it is used.
|
||||
* If not, then an {@link IllegalArgumentException} will be thrown.
|
||||
*
|
||||
* <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE}
|
||||
* will remove any reordering for this collator.
|
||||
* The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
|
||||
* NONE code <b>must</b> be the sole code supplied when it used.
|
||||
* NONE code <b>must</b> be the sole code supplied when it is used.
|
||||
*
|
||||
* @param order the reordering codes to apply to this collator; if this is null or an empty array
|
||||
* then this clears any existing reordering
|
||||
* @see #getReorderCodes
|
||||
|
@ -1401,7 +1409,9 @@ public abstract class Collator implements Comparator<Object>, Freezable<Collator
|
|||
/**
|
||||
* Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
|
||||
* codes are grouped and must reorder together.
|
||||
*
|
||||
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
|
||||
* for example Hiragana and Katakana.
|
||||
*
|
||||
* @param reorderCode The reorder code to determine equivalence for.
|
||||
* @return the set of all reorder codes in the same group as the given reorder code.
|
||||
* @see #setReorderCodes
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -34,6 +34,7 @@ import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
|
|||
import com.ibm.icu.impl.coll.SharedObject;
|
||||
import com.ibm.icu.impl.coll.TailoredSet;
|
||||
import com.ibm.icu.impl.coll.UTF16CollationIterator;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
|
@ -909,35 +910,18 @@ public final class RuleBasedCollator extends Collator {
|
|||
setFastLatinOptions(ownedSettings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the reordering codes for this collator.
|
||||
* Collation reordering allows scripts and some other defined blocks of characters
|
||||
* to be moved relative to each other as a block. This reordering is done on top of
|
||||
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
|
||||
* at the start and/or the end of the collation order.
|
||||
* <p>By default, reordering codes specified for the start of the order are placed in the
|
||||
* order given after a group of “special” non-script blocks. These special groups of characters
|
||||
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
|
||||
* {@link Collator.ReorderCodes}. Script groups can be intermingled with
|
||||
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
|
||||
* <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly
|
||||
* mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS}
|
||||
* will go at the very end of the reordering in the order given.
|
||||
* <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator
|
||||
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
|
||||
* was specified when this collator was created from resource data or from rules. The
|
||||
* {@link Collator.ReorderCodes#DEFAULT DEFAULT} code <b>must</b> be the sole code supplied when it used. If not
|
||||
* that will result in an {@link IllegalArgumentException} being thrown.
|
||||
* <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator.
|
||||
* The result of setting no reordering will be to have the DUCET/CLDR reordering used. The
|
||||
* {@link Collator.ReorderCodes#NONE NONE} code <b>must</b> be the sole code supplied when it used.
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @param order the reordering codes to apply to this collator; if this is null or an empty array
|
||||
* then this clears any existing reordering
|
||||
* @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts)
|
||||
* @see #getReorderCodes
|
||||
* @see Collator#getEquivalentReorderCodes
|
||||
* @see Collator.ReorderCodes
|
||||
* @see UScript
|
||||
* @stable ICU 4.8
|
||||
*/
|
||||
*/
|
||||
@Override
|
||||
public void setReorderCodes(int... order) {
|
||||
checkNotFrozen();
|
||||
|
@ -954,8 +938,7 @@ public final class RuleBasedCollator extends Collator {
|
|||
if(length == 1 && order[0] == Collator.ReorderCodes.DEFAULT) {
|
||||
if(settings.readOnly() != defaultSettings) {
|
||||
CollationSettings ownedSettings = getOwnedSettings();
|
||||
ownedSettings.setReordering(defaultSettings.reorderCodes,
|
||||
defaultSettings.reorderTable);
|
||||
ownedSettings.copyReorderingFrom(defaultSettings);
|
||||
setFastLatinOptions(ownedSettings);
|
||||
}
|
||||
return;
|
||||
|
@ -964,9 +947,7 @@ public final class RuleBasedCollator extends Collator {
|
|||
if(length == 0) {
|
||||
ownedSettings.resetReordering();
|
||||
} else {
|
||||
byte[] reorderTable = new byte[256];
|
||||
data.makeReorderTable(order, reorderTable);
|
||||
ownedSettings.setReordering(order.clone(), reorderTable);
|
||||
ownedSettings.setReordering(data, order.clone());
|
||||
}
|
||||
setFastLatinOptions(ownedSettings);
|
||||
}
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d7bf72e445a207052fe2e2de0d70a989b69bc55da3df272f8e3096d6d9cb2ad0
|
||||
size 11801973
|
||||
oid sha256:49983175d1f04593f311dab35e6db8ad4b802d8c5de99a03d0e7333bd6ffcfc0
|
||||
size 11802910
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2012-2014 International Business Machines
|
||||
# Copyright (c) 2012-2015 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# This file should be in UTF-8 with a signature byte sequence ("BOM").
|
||||
|
@ -2526,3 +2526,15 @@
|
|||
<3 あ
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
||||
** test: reorder single scripts not groups, ICU ticket 11449
|
||||
@ root
|
||||
% reorder Goth Latn
|
||||
* compare
|
||||
<1 4
|
||||
<1 𐌰 # Gothic
|
||||
<1 L
|
||||
<1 Ω
|
||||
# Before ICU 55, the following reordered together with Gothic.
|
||||
<1 𐌈 # Old Italic
|
||||
<1 𐑐 # Shavian
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
|
||||
/*
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2002-2014, International Business Machines Corporation and
|
||||
* Copyright (C) 2002-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -3153,7 +3152,7 @@ public class CollationMiscTest extends TestFmwk {
|
|||
{
|
||||
Collator myCollation;
|
||||
int[] reorderCodes = {UScript.GREEK, UScript.HAN, ReorderCodes.PUNCTUATION};
|
||||
int[] duplicateReorderCodes = {UScript.CUNEIFORM, UScript.GREEK, ReorderCodes.CURRENCY, UScript.EGYPTIAN_HIEROGLYPHS};
|
||||
int[] duplicateReorderCodes = {UScript.HIRAGANA, UScript.GREEK, ReorderCodes.CURRENCY, UScript.KATAKANA};
|
||||
int[] reorderCodesStartingWithDefault = {ReorderCodes.DEFAULT, UScript.GREEK, UScript.HAN, ReorderCodes.PUNCTUATION};
|
||||
int[] retrievedReorderCodes;
|
||||
String greekString = "\u03b1";
|
||||
|
@ -3283,47 +3282,7 @@ public class CollationMiscTest extends TestFmwk {
|
|||
errln("ERROR: retrieved reorder codes do not match set reorder codes.");
|
||||
}
|
||||
}
|
||||
|
||||
public void TestSameLeadBytScriptReorder(){
|
||||
String[] testSourceCases = {
|
||||
"\ud800\udf31", // Gothic
|
||||
"\ud801\udc50", // Shavian
|
||||
};
|
||||
|
||||
String[] testTargetCases = {
|
||||
"\u0100", // Latin Extended-A
|
||||
"\u2c74", // Latin Extended-C
|
||||
};
|
||||
|
||||
int[] results = {
|
||||
-1,
|
||||
-1,
|
||||
};
|
||||
|
||||
Collator myCollation;
|
||||
String rules = "[reorder Goth Latn]";
|
||||
try {
|
||||
myCollation = new RuleBasedCollator(rules);
|
||||
} catch (Exception e) {
|
||||
warnln("ERROR: in creation of rule based collator");
|
||||
return;
|
||||
}
|
||||
myCollation.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
|
||||
myCollation.setStrength(Collator.TERTIARY);
|
||||
for (int i = 0; i < testSourceCases.length ; i++)
|
||||
{
|
||||
CollationTest.doTest(this, (RuleBasedCollator)myCollation,
|
||||
testSourceCases[i], testTargetCases[i],
|
||||
results[i]);
|
||||
}
|
||||
|
||||
// ensure that the non-reordered and reordered collation is the same
|
||||
Collator nonReorderdCollator = RuleBasedCollator.getInstance();
|
||||
int nonReorderedResults = nonReorderdCollator.compare(testSourceCases[0], testSourceCases[1]);
|
||||
CollationTest.doTest(this, (RuleBasedCollator)myCollation,
|
||||
testSourceCases[0], testSourceCases[1], nonReorderedResults);
|
||||
}
|
||||
|
||||
static boolean containsExpectedScript(int[] scripts, int expectedScript) {
|
||||
for (int i = 0; i < scripts.length; ++i) {
|
||||
if (expectedScript == scripts[i]) { return true; }
|
||||
|
@ -3332,66 +3291,87 @@ public class CollationMiscTest extends TestFmwk {
|
|||
}
|
||||
|
||||
public void TestEquivalentReorderingScripts() {
|
||||
// Beginning with ICU 55, collation reordering moves single scripts
|
||||
// rather than groups of scripts,
|
||||
// except where scripts share a range and sort primary-equal.
|
||||
final int[] expectedScripts = {
|
||||
UScript.BOPOMOFO, //Bopo
|
||||
UScript.LISU, //Lisu
|
||||
UScript.LYCIAN, //Lyci
|
||||
UScript.CARIAN, //Cari
|
||||
UScript.LYDIAN, //Lydi
|
||||
UScript.YI, //Yiii
|
||||
UScript.OLD_ITALIC, //Ital
|
||||
UScript.GOTHIC, //Goth
|
||||
UScript.DESERET, //Dsrt
|
||||
UScript.SHAVIAN, //Shaw
|
||||
UScript.OSMANYA, //Osma
|
||||
UScript.LINEAR_B, //Linb
|
||||
UScript.CYPRIOT, //Cprt
|
||||
UScript.OLD_SOUTH_ARABIAN, //Sarb
|
||||
UScript.AVESTAN, //Avst
|
||||
UScript.IMPERIAL_ARAMAIC, //Armi
|
||||
UScript.INSCRIPTIONAL_PARTHIAN, //Prti
|
||||
UScript.INSCRIPTIONAL_PAHLAVI, //Phli
|
||||
UScript.UGARITIC, //Ugar
|
||||
UScript.OLD_PERSIAN, //Xpeo
|
||||
UScript.CUNEIFORM, //Xsux
|
||||
UScript.EGYPTIAN_HIEROGLYPHS, //Egyp
|
||||
UScript.PHONETIC_POLLARD, //Plrd
|
||||
UScript.SORA_SOMPENG, //Sora
|
||||
UScript.MEROITIC_CURSIVE, //Merc
|
||||
UScript.MEROITIC_HIEROGLYPHS //Mero
|
||||
UScript.HIRAGANA,
|
||||
UScript.KATAKANA,
|
||||
UScript.KATAKANA_OR_HIRAGANA
|
||||
};
|
||||
|
||||
int[] equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.GOTHIC);
|
||||
if (equivalentScripts.length < expectedScripts.length) {
|
||||
errln(String.format("ERROR/Gothic: retrieved equivalent script length wrong: " +
|
||||
"expected at least %d, was = %d",
|
||||
if (equivalentScripts.length != 1 || equivalentScripts[0] != UScript.GOTHIC) {
|
||||
errln(String.format("ERROR/Gothic: retrieved equivalent scripts wrong: " +
|
||||
"length expected 1, was = %d; expected [%d] was [%d]",
|
||||
equivalentScripts.length, UScript.GOTHIC, equivalentScripts[0]));
|
||||
}
|
||||
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.HIRAGANA);
|
||||
if (equivalentScripts.length != expectedScripts.length) {
|
||||
errln(String.format("ERROR/Hiragana: retrieved equivalent script length wrong: " +
|
||||
"expected %d, was = %d",
|
||||
expectedScripts.length, equivalentScripts.length));
|
||||
}
|
||||
int prevScript = -1;
|
||||
for (int i = 0; i < equivalentScripts.length; ++i) {
|
||||
int script = equivalentScripts[i];
|
||||
if (script <= prevScript) {
|
||||
errln("ERROR/Gothic: equivalent scripts out of order at index " + i);
|
||||
errln("ERROR/Hiragana: equivalent scripts out of order at index " + i);
|
||||
}
|
||||
prevScript = script;
|
||||
}
|
||||
for (int code : expectedScripts) {
|
||||
if (!containsExpectedScript(equivalentScripts, code)) {
|
||||
errln("ERROR/Gothic: equivalent scripts do not contain " + code);
|
||||
errln("ERROR/Hiragana: equivalent scripts do not contain " + code);
|
||||
}
|
||||
}
|
||||
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.SHAVIAN);
|
||||
if (equivalentScripts.length < expectedScripts.length) {
|
||||
errln(String.format("ERROR/Shavian: retrieved equivalent script length wrong: " +
|
||||
"expected at least %d, was = %d",
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.KATAKANA);
|
||||
if (equivalentScripts.length != expectedScripts.length) {
|
||||
errln(String.format("ERROR/Katakana: retrieved equivalent script length wrong: " +
|
||||
"expected %d, was = %d",
|
||||
expectedScripts.length, equivalentScripts.length));
|
||||
}
|
||||
for (int code : expectedScripts) {
|
||||
if (!containsExpectedScript(equivalentScripts, code)) {
|
||||
errln("ERROR/Shavian: equivalent scripts do not contain " + code);
|
||||
errln("ERROR/Katakana: equivalent scripts do not contain " + code);
|
||||
}
|
||||
}
|
||||
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.KATAKANA_OR_HIRAGANA);
|
||||
if (equivalentScripts.length != expectedScripts.length) {
|
||||
errln(String.format("ERROR/Hrkt: retrieved equivalent script length wrong: " +
|
||||
"expected %d, was = %d",
|
||||
expectedScripts.length, equivalentScripts.length));
|
||||
}
|
||||
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.HAN);
|
||||
if (equivalentScripts.length != 3) {
|
||||
errln("ERROR/Hani: retrieved equivalent script length wrong: " +
|
||||
"expected 3, was = " + equivalentScripts.length);
|
||||
}
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.SIMPLIFIED_HAN);
|
||||
if (equivalentScripts.length != 3) {
|
||||
errln("ERROR/Hans: retrieved equivalent script length wrong: " +
|
||||
"expected 3, was = " + equivalentScripts.length);
|
||||
}
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.TRADITIONAL_HAN);
|
||||
if (equivalentScripts.length != 3) {
|
||||
errln("ERROR/Hant: retrieved equivalent script length wrong: " +
|
||||
"expected 3, was = " + equivalentScripts.length);
|
||||
}
|
||||
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.MEROITIC_CURSIVE);
|
||||
if (equivalentScripts.length != 2) {
|
||||
errln("ERROR/Merc: retrieved equivalent script length wrong: " +
|
||||
"expected 2, was = " + equivalentScripts.length);
|
||||
}
|
||||
equivalentScripts = RuleBasedCollator.getEquivalentReorderCodes(UScript.MEROITIC_HIEROGLYPHS);
|
||||
if (equivalentScripts.length != 2) {
|
||||
errln("ERROR/Mero: retrieved equivalent script length wrong: " +
|
||||
"expected 2, was = " + equivalentScripts.length);
|
||||
}
|
||||
}
|
||||
|
||||
public void TestGreekFirstReorderCloning() {
|
||||
|
|
Loading…
Add table
Reference in a new issue