ICU-22368 Reduce langInfo.res size by ~200K by encoding each LSR into a 32-bit int.

See #2458
This commit is contained in:
Frank Tang 2023-06-21 18:24:40 +00:00 committed by Frank Yung-Fong Tang
parent 3fec4e718e
commit e83b0715a1
9 changed files with 8069 additions and 7803 deletions

View file

@ -11,6 +11,7 @@
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "charstr.h"
#include "cstring.h"
#include "loclikelysubtags.h"
@ -81,11 +82,18 @@ struct XLikelySubtagsData {
// Read all strings in the resource bundle and convert them to invariant char *.
LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
ResourceArray m49Array;
if (likelyTable.findValue("m49", value)) {
m49Array = value.getArray(errorCode);
} else {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!readStrings(likelyTable, "languageAliases", value,
languageIndexes, languagesLength, errorCode) ||
!readStrings(likelyTable, "regionAliases", value,
regionIndexes, regionsLength, errorCode) ||
!readStrings(likelyTable, "lsrs", value,
!readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
return;
}
@ -136,7 +144,7 @@ struct XLikelySubtagsData {
if (!readStrings(matchTable, "partitions", value,
partitionIndexes, partitionsLength, errorCode) ||
!readStrings(matchTable, "paradigms", value,
!readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
return;
}
@ -240,6 +248,88 @@ private:
}
return true;
}
// Unpacks the language subtag from a packed LSR integer.
// The values 0 and 1 are sentinels that map to "" and "skip". Otherwise the low
// 24 bits, taken modulo 27^3, hold two or three base-27 digits (1..26 standing
// for 'a'..'z'), least-significant digit first. A zero third digit means the
// language code has only two letters.
UnicodeString toLanguage(int encoded) {
    switch (encoded) {
    case 0:
        return UNICODE_STRING_SIMPLE("");
    case 1:
        return UNICODE_STRING_SIMPLE("skip");
    default:
        break;
    }
    const int packed = (encoded & 0x00ffffff) % (27 * 27 * 27);
    const int firstDigit = packed % 27;
    const int secondDigit = (packed / 27) % 27;
    const int thirdDigit = packed / (27 * 27);
    char letters[3];
    letters[0] = 'a' + (firstDigit - 1);
    letters[1] = 'a' + (secondDigit - 1);
    if (thirdDigit != 0) {
        letters[2] = 'a' + (thirdDigit - 1);
        return UnicodeString(letters, 3);
    }
    return UnicodeString(letters, 2);
}
// Unpacks the script subtag from a packed LSR integer.
// The values 0 and 1 are sentinels that map to "" and "script". Otherwise bits
// 24..31 are interpreted as a UScriptCode and its short name is returned; an
// empty string is returned when uscript_getShortName() yields no name.
UnicodeString toScript(int encoded) {
    if (encoded == 0) {
        return UNICODE_STRING_SIMPLE("");
    }
    if (encoded == 1) {
        return UNICODE_STRING_SIMPLE("script");
    }
    const UScriptCode scriptCode = static_cast<UScriptCode>((encoded >> 24) & 0x000000ff);
    const char* shortName = uscript_getShortName(scriptCode);
    if (shortName != nullptr) {
        // Exactly four characters of the short name are copied.
        return UnicodeString(shortName, 4);
    }
    return UNICODE_STRING_SIMPLE("");
}
// Looks up entry `index` of the "m49" resource array and returns it as a string.
// `value` is used as scratch storage and is overwritten by the lookup.
// Returns an empty string without touching the array if errorCode already
// indicates failure; sets U_MISSING_RESOURCE_ERROR if the array has no such entry.
UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        if (m49Array.getValue(index, value)) {
            return value.getUnicodeString(errorCode);
        }
        // "m49" does not include the index.
        errorCode = U_MISSING_RESOURCE_ERROR;
    }
    return UNICODE_STRING_SIMPLE("");
}
// Unpacks the region subtag from a packed LSR integer.
// The values 0 and 1 are sentinels that both map to an empty region. Otherwise
// the low 24 bits divided by 27^3 (then taken modulo 27^2) give the region value:
//   - a value below 27 is an index into the "m49" resource array;
//   - any other value is two base-27 digits (1..26 standing for 'A'..'Z'),
//     least-significant digit first, forming a two-letter region code.
// `value` is scratch storage for the "m49" lookup and is overwritten by it.
// errorCode is set by the lookup if the index is not present in "m49".
UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
    if (encoded == 0 || encoded == 1) {
        return UNICODE_STRING_SIMPLE("");
    }
    encoded &= 0x00ffffff;
    encoded /= 27 * 27 * 27;
    encoded %= 27 * 27;
    if (encoded < 27) {
        // Selected M49 code index, find the code from "m49" resource.
        // Use the decoded index itself; a fixed index would map every
        // M49 region to the same entry.
        return m49IndexToCode(m49Array, value, encoded, errorCode);
    }
    char region[2];
    region[0] = 'A' + ((encoded % 27) - 1);
    region[1] = 'A' + (((encoded / 27) % 27) - 1);
    return UnicodeString(region, 2);
}
bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (table.findValue(key, value)) {
const int32_t* vectors = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return false; }
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
rawIndexes[i*3] = strings.add(toLanguage(vectors[i]), errorCode);
rawIndexes[i*3+1] = strings.add(toScript(vectors[i]), errorCode);
rawIndexes[i*3+2] = strings.add(toRegion(m49Array, value, vectors[i], errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
length *= 3;
}
return true;
}
};
namespace {

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,9 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.locale;
import java.util.List;
import java.util.Objects;
import com.ibm.icu.lang.UScript;
public final class LSR {
public static final int REGION_INDEX_LIMIT = 1001 + 26 * 26;
@ -89,4 +91,100 @@ public final class LSR {
// Hash code computed from the language, script, region and flags fields.
public int hashCode() {
return Objects.hash(language, script, region, flags);
}
// Helper for encodeToIntForResource() only.
// Packs a language code matching [a-z]{2,3} into base-27 digits (1..26 for
// 'a'..'z'), first letter in the least-significant digit. A two-letter code
// leaves the third digit at 0. Other inputs are not supported.
private int encodeLanguageToInt() {
    final int length = language.length();
    assert length >= 2;
    assert length <= 3;
    final char first = language.charAt(0);
    assert first >= 'a';
    assert first <= 'z';
    final char second = language.charAt(1);
    assert second >= 'a';
    assert second <= 'z';
    assert length == 2 || language.charAt(2) >= 'a';
    assert length == 2 || language.charAt(2) <= 'z';
    final int lowDigits = (first - 'a' + 1) + 27 * (second - 'a' + 1);
    if (length == 2) {
        return lowDigits;
    }
    return lowDigits + 27 * 27 * (language.charAt(2) - 'a' + 1);
}
// Helper for encodeToIntForResource() only.
// Maps the script subtag to the numeric code returned by
// UScript.getCodeFromName(). Only script names known to UScript
// ([A-Z][a-z]{3}) are supported.
private int encodeScriptToInt() {
    final int scriptCode = UScript.getCodeFromName(script);
    assert scriptCode != UScript.INVALID_CODE;
    return scriptCode;
}
// Helper for encodeToIntForResource() only.
// Encodes the region subtag as an int:
//   - a three-character region is encoded as its index in the m49 list;
//   - a two-letter [A-Z]{2} region is encoded as two base-27 digits
//     (1..26 for 'A'..'Z'), first letter in the least-significant digit.
// Other inputs are not supported.
private int encodeRegionToInt(List<String> m49) {
    final int length = region.length();
    assert length >= 2;
    assert length <= 3;
    if (length != 3) {
        final char first = region.charAt(0);
        assert first >= 'A';
        assert first <= 'Z';
        final char second = region.charAt(1);
        assert second >= 'A';
        assert second <= 'Z';
        // 'AA' => 1+27*1 = 28
        // ...
        // 'AZ' => 1+27*26 = 703
        // 'BA' => 2+27*1 = 29
        // ...
        // 'IN' => 9+27*14 = 387
        // 'ZZ' => 26+27*26 = 728
        return (first - 'A' + 1) + 27 * (second - 'A' + 1);
    }
    final int m49Index = m49.indexOf(region);
    assert m49Index >= 0;
    if (m49Index < 0) {
        throw new IllegalStateException(
                "Please add '" + region + "' to M49 in LocaleDistanceMapper.java");
    }
    return m49Index;
}
// Packs this LSR into a single int for storage in a resource bundle.
// The language and region share the low bits as language + 27^3 * region;
// the script code is OR-ed in shifted left by 24 bits.
// Intended only for the subset of LSRs the helper encoders support.
public int encodeToIntForResource(List<String> m49) {
    final int languagePart = encodeLanguageToInt();
    final int regionPart = (27 * 27 * 27) * encodeRegionToInt(m49);
    final int scriptPart = encodeScriptToInt() << 24;
    return (languagePart + regionPart) | scriptPart;
}
// Unpacks the language subtag from a packed LSR int.
// 0 and 1 are sentinels for "" and "skip". Otherwise the low 24 bits, modulo
// 27^3, hold two or three base-27 digits (1..26 for 'a'..'z'), first letter in
// the least-significant digit; a zero third digit means a two-letter code.
private static String toLanguage(int encoded) {
    switch (encoded) {
    case 0:
        return "";
    case 1:
        return "skip";
    default:
        break;
    }
    final int packed = (encoded & 0x00ffffff) % (27 * 27 * 27);
    final int thirdDigit = packed / (27 * 27);
    final char[] letters = new char[thirdDigit == 0 ? 2 : 3];
    letters[0] = (char) ('a' + ((packed % 27) - 1));
    letters[1] = (char) ('a' + (((packed / 27) % 27) - 1));
    if (thirdDigit != 0) {
        letters[2] = (char) ('a' + (thirdDigit - 1));
    }
    return new String(letters);
}
// Unpacks the script subtag from a packed LSR int.
// 0 and 1 are sentinels for "" and "script". Otherwise bits 24..31 are passed
// to UScript.getShortName() as the script code.
private static String toScript(int encoded) {
    switch (encoded) {
    case 0:
        return "";
    case 1:
        return "script";
    default:
        final int scriptCode = (encoded >> 24) & 0x000000ff;
        return UScript.getShortName(scriptCode);
    }
}
// Unpacks the region subtag from a packed LSR int.
// 0 and 1 are sentinels that both yield an empty region. Otherwise the low
// 24 bits divided by 27^3 (modulo 27^2) give the region value: a value below 27
// is an index into m49, anything else is two base-27 digits (1..26 for
// 'A'..'Z'), first letter in the least-significant digit.
private static String toRegion(int encoded, String[] m49) {
    if (encoded == 0 || encoded == 1) {
        return "";
    }
    final int regionValue = ((encoded & 0x00ffffff) / (27 * 27 * 27)) % (27 * 27);
    if (regionValue < 27) {
        return m49[regionValue];
    }
    final char first = (char) ('A' + ((regionValue % 27) - 1));
    final char second = (char) ('A' + (((regionValue / 27) % 27) - 1));
    return new String(new char[] {first, second});
}
// Decodes an array of packed LSR ints into LSR objects, one per input element
// and in the same order. Each LSR is created with the LSR.IMPLICIT_LSR flags.
// m49 supplies the region codes for region values that are m49 indexes.
public static LSR[] decodeInts(int[] nums, String[] m49) {
    final LSR[] decoded = new LSR[nums.length];
    int next = 0;
    for (int packed : nums) {
        final String language = toLanguage(packed);
        final String script = toScript(packed);
        final String region = toRegion(packed, m49);
        decoded[next++] = new LSR(language, script, region, LSR.IMPLICIT_LSR);
    }
    return decoded;
}
}

View file

@ -151,14 +151,12 @@ public class LocaleDistance {
String[] partitions = getValue(matchTable, "partitions", value).getStringArray();
Set<LSR> paradigmLSRs;
if (matchTable.findValue("paradigms", value)) {
String[] paradigms = value.getStringArray();
if (matchTable.findValue("paradigmnum", value)) {
String[] m49 = getValue(langInfo.getValueWithFallback("likely").getTable(),
"m49", value).getStringArray();
LSR[] paradigms = LSR.decodeInts(getValue(matchTable, "paradigmnum", value).getIntVector(), m49);
// LinkedHashSet for stable order; otherwise a unit test is flaky.
paradigmLSRs = new LinkedHashSet<>(paradigms.length / 3);
for (int i = 0; i < paradigms.length; i += 3) {
paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2],
LSR.DONT_CARE_FLAGS));
}
paradigmLSRs = new LinkedHashSet<LSR>(Arrays.asList(paradigms));
} else {
paradigmLSRs = Collections.emptySet();
}

View file

@ -84,13 +84,8 @@ public final class XLikelySubtags {
byte[] trie = new byte[buffer.remaining()];
buffer.get(trie);
String[] lsrSubtags = getValue(likelyTable, "lsrs", value).getStringArray();
LSR[] lsrs = new LSR[lsrSubtags.length / 3];
for (int i = 0, j = 0; i < lsrSubtags.length; i += 3, ++j) {
lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2],
LSR.IMPLICIT_LSR);
}
String[] m49 = getValue(likelyTable, "m49", value).getStringArray();
LSR[] lsrs = LSR.decodeInts(getValue(likelyTable, "lsrnum", value).getIntVector(), m49);
return new Data(languageAliases, regionAliases, trie, lsrs);
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:08e5f687e1d5b05d79567f448b115fcf0a4f36d2340a9cb179113988d656aa2c
size 14336414
oid sha256:f31ccf7b60ffb8c4a8fe9aadba04eedfc1f1bb74af57b03dc070f6d1c28465c9
size 14320254

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9d0821499d99b54fa4bb80a5aaca12cd07cc71aea8a372e7c273691891973cb9
size 94829
oid sha256:9793b038249bdae5ddc4d3a4fef485047fa36bd49c7b1d81439ef02e3da9452d
size 94832

View file

@ -20,6 +20,7 @@ import java.util.Optional;
import java.util.Set;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.unicode.cldr.api.AttributeKey;
@ -42,6 +43,8 @@ import com.google.common.primitives.Bytes;
import com.ibm.icu.impl.locale.LSR;
import com.ibm.icu.impl.locale.LocaleDistance;
import com.ibm.icu.impl.locale.XLikelySubtags;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
/**
@ -102,19 +105,24 @@ public final class LocaleDistanceMapper {
// Output resource bundle paths, split into two basic groups for likely locale mappings
// and match data.
private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases");
private static final RbPath LIKELY_M49 = RbPath.of("likely", "m49");
private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases");
private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin");
private static final RbPath LIKELY_LSRS = RbPath.of("likely", "lsrs");
private static final RbPath LIKELY_LSRNUM = RbPath.of("likely", "lsrnum:intvector");
private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin");
private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin");
private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions");
private static final RbPath MATCH_PARADIGMS = RbPath.of("match", "paradigms");
private static final RbPath MATCH_PARADIGMNUM = RbPath.of("match", "paradigmnum:intvector");
private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector");
// To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS").
private static final Splitter UNDERSCORE = Splitter.on('_');
// The encoding scheme only allows up to 27 M.49 codes to be encoded; they are listed below.
// The size of the list is checked later, in process().
private static final List<String> M49 = Arrays.asList("001", "143", "419");
/**
* Processes data from the given supplier to generate locale matcher ICU data.
*
@ -129,17 +137,22 @@ public final class LocaleDistanceMapper {
static IcuData process(CldrData data) {
// All output goes into a single "langInfo" bundle.
IcuData icuData = new IcuData("langInfo", false);
// In the packed LSR encoding, region values 0..26 are interpreted as indexes
// into the M49 list, so the list may not have more than 27 entries.
if (M49.size() > 27) {
throw new IllegalStateException(
"The M49 list is too long. We can only encode up to 27 M49 codes.");
}
// "likely" section: aliases, the M49 lookup list, the trie and the LSR data.
XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data);
icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases));
icuData.add(LIKELY_M49, RbValue.of(M49));
icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases));
icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie));
// NOTE(review): both the string form (LIKELY_LSRS via ofLsrs) and the int-vector
// form (LIKELY_LSRNUM via ofLsrNum) appear here; this looks like old and new
// lines of a change shown side by side -- confirm which one is live.
icuData.add(LIKELY_LSRS, ofLsrs(asList(likelyData.lsrs)));
icuData.add(LIKELY_LSRNUM, ofLsrNum(asList(likelyData.lsrs)));
// "match" section: distance trie, region partitions, paradigm LSRs and distances.
LocaleDistance.Data distanceData = buildDistanceData(data);
icuData.add(MATCH_TRIE, ofBytes(distanceData.trie));
icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex));
icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays));
// NOTE(review): same situation as above for MATCH_PARADIGMS vs MATCH_PARADIGMNUM.
icuData.add(MATCH_PARADIGMS, ofLsrs(distanceData.paradigmLSRs));
icuData.add(MATCH_PARADIGMNUM, ofLsrNum(distanceData.paradigmLSRs));
// The distances are written as their decimal string representations.
icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString)));
return icuData;
}
@ -434,21 +447,88 @@ public final class LocaleDistanceMapper {
.elementsPerLine(2);
}
// Returns an RbValue serialized from a sequence of LSR instance as a sequence of repeating
// (language, region, script) tuples (formatted as one tuple per line in the IcuData file).
//
// E.g.
// foo{
// lang1, script1, region1,
// ...
// langN, scriptN, regionN,
// }
private static RbValue ofLsrs(Collection<LSR> lsrs) {
// Returns an RbValue serialized from a sequence of LSR instances as a sequence of numbers,
// each representing one (language, region, script) tuple (formatted as one number per line in the IcuData file).
private static RbValue ofLsrNum(Collection<LSR> lsrs) {
return RbValue.of(
lsrs.stream()
.flatMap(lsr -> Stream.of(lsr.language, lsr.script, lsr.region))
.collect(Collectors.toList()))
.elementsPerLine(3);
.flatMapToInt(lsr -> IntStream.of(LSRToNum(lsr)))
.mapToObj(Integer::toString));
}
// Helper for encodeToIntForResource() only.
// Packs a language code matching [a-z]{2,3} into base-27 digits (1..26 for
// 'a'..'z'), first letter in the least-significant digit. A two-letter code
// leaves the third digit at 0. Other inputs are not supported.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
private static int encodeLanguageToInt(String language) {
    final int length = language.length();
    assert length >= 2;
    assert length <= 3;
    final char first = language.charAt(0);
    assert first >= 'a';
    assert first <= 'z';
    final char second = language.charAt(1);
    assert second >= 'a';
    assert second <= 'z';
    assert length == 2 || language.charAt(2) >= 'a';
    assert length == 2 || language.charAt(2) <= 'z';
    final int lowDigits = (first - 'a' + 1) + 27 * (second - 'a' + 1);
    if (length == 2) {
        return lowDigits;
    }
    return lowDigits + 27 * 27 * (language.charAt(2) - 'a' + 1);
}
// Helper for encodeToIntForResource() only.
// Maps a script subtag to the numeric code returned by
// UScript.getCodeFromName(). Only script names known to UScript
// ([A-Z][a-z]{3}) are supported.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
private static int encodeScriptToInt(String script) {
    final int scriptCode = UScript.getCodeFromName(script);
    assert scriptCode != UScript.INVALID_CODE;
    return scriptCode;
}
// Helper for encodeToIntForResource() only.
// Supports [A-Z]{2}|001|143|419 and nothing else.
//   - A three-character region is encoded as its index in the m49 list; there
//     are not enough bits to store all 1000 possible \d{3} combinations, so
//     only the codes listed in m49 are supported.
//   - A two-letter region is encoded as two base-27 digits (1..26 for
//     'A'..'Z'), first letter in the least-significant digit.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
private static int encodeRegionToInt(String region, List<String> m49) {
    final int length = region.length();
    assert length >= 2;
    assert length <= 3;
    if (length != 3) {
        final char first = region.charAt(0);
        assert first >= 'A';
        assert first <= 'Z';
        final char second = region.charAt(1);
        assert second >= 'A';
        assert second <= 'Z';
        // 'AA' => 1+27*1 = 28
        // ...
        // 'AZ' => 1+27*26 = 703
        // 'BA' => 2+27*1 = 29
        // ...
        // 'IN' => 9+27*14 = 387
        // 'ZZ' => 26+27*26 = 728
        return (first - 'A' + 1) + 27 * (second - 'A' + 1);
    }
    final int m49Index = m49.indexOf(region);
    assert m49Index >= 0;
    if (m49Index < 0) {
        throw new IllegalStateException(
                "Please add '" + region + "' to M49 in LocaleDistanceMapper.java");
    }
    return m49Index;
}
// Packs an LSR into a single int for storage in a resource bundle.
// The language and region share the low bits as language + 27^3 * region;
// the script code is OR-ed in shifted left by 24 bits.
// Intended only for the subset of LSRs the helper encoders support.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
static int encodeToIntForResource(LSR lsr) {
    final int languagePart = encodeLanguageToInt(lsr.language);
    final int regionPart = (27 * 27 * 27) * encodeRegionToInt(lsr.region, M49);
    final int scriptPart = encodeScriptToInt(lsr.script) << 24;
    return (languagePart + regionPart) | scriptPart;
}
// Converts an LSR to the int written to the resource file.
// Two special LSRs get reserved numbers:
//   ("", "", "")           => 0
//   ("skip", "script", "") => 1
// Every other LSR is packed by encodeToIntForResource().
private static int LSRToNum(LSR lsr) {
    final boolean noRegion = lsr.region.isEmpty();
    if (noRegion && lsr.language.isEmpty() && lsr.script.isEmpty()) {
        return 0;
    }
    if (noRegion && lsr.language.equals("skip") && lsr.script.equals("script")) {
        return 1;
    }
    // TODO(ftang) Change to the following line after LSR.encodeToIntForResource is available to the tool.
    // return lsr.encodeToIntForResource();
    return encodeToIntForResource(lsr);
}
// Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of