ICU-21284 Correctly normalize Unit Identifiers

See #1527
This commit is contained in:
Hugo van der Merwe 2021-02-08 21:16:57 +00:00
parent 9ac51f21ed
commit c0a490d49d
15 changed files with 526 additions and 160 deletions

View file

@ -1904,50 +1904,140 @@ units:table(nofallback){
}
}
unitQuantities{
ampere{"electric-current"}
ampere-per-meter{"magnetic-field-strength"}
ampere-per-square-meter{"current-density"}
bit{"digital"}
candela{"luminous-intensity"}
candela-per-square-meter{"illuminance"}
candela-square-meter-per-square-meter{"luminous-flux"}
cubic-meter{"volume"}
cubic-meter-per-kilogram{"specific-volume"}
cubic-meter-per-meter{"consumption"}
cubic-second-square-ampere-per-kilogram-square-meter{"electric-conductance"}
em{"typewidth"}
item{"substance-amount"}
item-per-cubic-meter{"concentration"}
item-per-kilogram{"concentration-mass"}
kelvin{"temperature"}
kilogram{"mass"}
kilogram-meter-per-square-second{"force"}
kilogram-per-cubic-meter{"mass-density"}
kilogram-per-kilogram{"mass-fraction"}
kilogram-per-meter-square-second{"pressure"}
kilogram-per-square-meter-square-second{"pressure-per-length"}
kilogram-per-square-second-ampere{"magnetic-induction"}
kilogram-square-meter-per-cubic-second{"power"}
kilogram-square-meter-per-cubic-second-ampere{"voltage"}
kilogram-square-meter-per-cubic-second-square-ampere{"electric-resistance"}
kilogram-square-meter-per-square-second{"energy"}
kilogram-square-meter-per-square-second-ampere{"magnetic-flux"}
kilogram-square-meter-per-square-second-square-ampere{"electric-inductance"}
meter{"length"}
meter-per-second{"speed"}
meter-per-square-second{"acceleration"}
pixel{"graphics"}
pixel-per-meter{"resolution"}
portion{"portion"}
pow4-second-square-ampere-per-kilogram-square-meter{"electric-capacitance"}
revolution{"angle"}
revolution-per-meter{"wave-number"}
revolution-per-second{"frequency"}
second{"duration"}
second-ampere{"electric-charge"}
square-meter{"area"}
square-meter-per-square-second{"dose"}
square-revolution{"solid-angle"}
year{"year-duration"}
{
candela{"luminous-intensity"}
}
{
candela-per-square-meter{"illuminance"}
}
{
candela-square-meter-per-square-meter{"luminous-flux"}
}
{
kilogram{"mass"}
}
{
kilogram-per-kilogram{"mass-fraction"}
}
{
kilogram-per-cubic-meter{"mass-density"}
}
{
kilogram-per-meter-square-second{"pressure"}
}
{
kilogram-per-square-second-ampere{"magnetic-induction"}
}
{
kilogram-meter-per-square-second{"force"}
}
{
kilogram-square-meter-per-cubic-second{"power"}
}
{
kilogram-square-meter-per-cubic-second-ampere{"voltage"}
}
{
kilogram-square-meter-per-cubic-second-square-ampere{"electric-resistance"}
}
{
kilogram-square-meter-per-square-second{"energy"}
}
{
kilogram-square-meter-per-square-second-ampere{"magnetic-flux"}
}
{
kilogram-square-meter-per-square-second-square-ampere{"electric-inductance"}
}
{
cubic-meter{"volume"}
}
{
cubic-meter-per-kilogram{"specific-volume"}
}
{
cubic-meter-per-meter{"consumption"}
}
{
square-meter{"area"}
}
{
square-meter-per-square-second{"dose"}
}
{
meter{"length"}
}
{
meter-per-second{"speed"}
}
{
meter-per-square-second{"acceleration"}
}
{
kilogram-per-square-meter-square-second{"pressure-per-length"}
}
{
pow4-second-square-ampere-per-kilogram-square-meter{"electric-capacitance"}
}
{
cubic-second-square-ampere-per-kilogram-square-meter{"electric-conductance"}
}
{
second{"duration"}
}
{
second-ampere{"electric-charge"}
}
{
year{"year-duration"}
}
{
ampere{"electric-current"}
}
{
ampere-per-square-meter{"current-density"}
}
{
ampere-per-meter{"magnetic-field-strength"}
}
{
kelvin{"temperature"}
}
{
square-revolution{"solid-angle"}
}
{
revolution{"angle"}
}
{
revolution-per-meter{"wave-number"}
}
{
revolution-per-second{"frequency"}
}
{
item{"substance-amount"}
}
{
item-per-kilogram{"concentration-mass"}
}
{
item-per-cubic-meter{"concentration"}
}
{
portion{"portion"}
}
{
bit{"digital"}
}
{
pixel{"graphics"}
}
{
pixel-per-meter{"resolution"}
}
{
em{"typewidth"}
}
}
}

View file

@ -138,14 +138,17 @@ const struct UnitPrefixStrings {
* A ResourceSink that collects simple unit identifiers from the keys of the
* convertUnits table into an array, and adds these values to a TrieBuilder,
* with associated values being their index into this array plus a specified
* offset, to a trie.
* offset.
*
* Example code:
*
* UErrorCode status = U_ZERO_ERROR;
* BytesTrieBuilder b(status);
* const char *unitIdentifiers[200];
* SimpleUnitIdentifiersSink identifierSink(unitIdentifiers, 200, b, kTrieValueOffset);
* const int32_t ARR_SIZE = 200;
* const char *unitIdentifiers[ARR_SIZE];
* int32_t unitCategories[ARR_SIZE];
* SimpleUnitIdentifiersSink identifierSink(gSerializedUnitCategoriesTrie, unitIdentifiers,
* unitCategories, ARR_SIZE, b, kTrieValueOffset);
* LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
* ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
*/
@ -153,20 +156,27 @@ class SimpleUnitIdentifiersSink : public icu::ResourceSink {
public:
/**
* Constructor.
* @param out Array of char* to which the simple unit identifiers will be
* saved.
* @param outSize The size of `out`.
* @param quantitiesTrieData The data for constructing a quantitiesTrie,
* which maps from a simple unit identifier to an index into the
* gCategories array.
* @param out Array of char* to which pointers to the simple unit
* identifiers will be saved. (Does not take ownership.)
* @param outCategories Array of int32_t to which category indexes will be
* saved: this corresponds to simple unit IDs saved to `out`, mapping
* from the ID to the value produced by the quantitiesTrie (which is an
* index into the gCategories array).
* @param outSize The size of `out` and `outCategories`.
* @param trieBuilder The trie builder to which the simple unit identifier
* should be added. The trie builder must outlive this resource sink.
* @param trieValueOffset This is added to the index of the identifier in
* the `out` array, before adding to `trieBuilder` as the value
* associated with the identifier.
*/
explicit SimpleUnitIdentifiersSink(const char **out, int32_t outSize, BytesTrieBuilder &trieBuilder,
int32_t trieValueOffset)
: outArray(out), outSize(outSize), trieBuilder(trieBuilder), trieValueOffset(trieValueOffset),
outIndex(0) {
}
explicit SimpleUnitIdentifiersSink(StringPiece quantitiesTrieData, const char **out,
int32_t *outCategories, int32_t outSize,
BytesTrieBuilder &trieBuilder, int32_t trieValueOffset)
: outArray(out), outCategories(outCategories), outSize(outSize), trieBuilder(trieBuilder),
trieValueOffset(trieValueOffset), quantitiesTrieData(quantitiesTrieData), outIndex(0) {}
/**
* Adds the table keys found in value to the output vector.
@ -186,30 +196,120 @@ class SimpleUnitIdentifiersSink : public icu::ResourceSink {
return;
}
BytesTrie quantitiesTrie(quantitiesTrieData.data());
// Collect keys from the table resource.
const char *key;
for (int32_t i = 0; table.getKeyAndValue(i, key, value); ++i) {
const char *simpleUnitID;
for (int32_t i = 0; table.getKeyAndValue(i, simpleUnitID, value); ++i) {
U_ASSERT(i < table.getSize());
U_ASSERT(outIndex < outSize);
if (uprv_strcmp(key, "kilogram") == 0) {
if (uprv_strcmp(simpleUnitID, "kilogram") == 0) {
// For parsing, we use "gram", the prefixless metric mass unit. We
// thus ignore the SI Base Unit of Mass: it exists due to being the
// mass conversion target unit, but not needed for MeasureUnit
// parsing.
continue;
}
outArray[outIndex] = key;
trieBuilder.add(key, trieValueOffset + outIndex, status);
outArray[outIndex] = simpleUnitID;
trieBuilder.add(simpleUnitID, trieValueOffset + outIndex, status);
// Find the base target unit for this simple unit
ResourceTable table = value.getTable(status);
if (U_FAILURE(status)) { return; }
if (!table.findValue("target", value)) {
status = U_INVALID_FORMAT_ERROR;
break;
}
int32_t len;
const UChar* uTarget = value.getString(len, status);
CharString target;
target.appendInvariantChars(uTarget, len, status);
if (U_FAILURE(status)) { return; }
quantitiesTrie.reset();
UStringTrieResult result = quantitiesTrie.next(target.data(), target.length());
if (!USTRINGTRIE_HAS_VALUE(result)) {
status = U_INVALID_FORMAT_ERROR;
break;
}
outCategories[outIndex] = quantitiesTrie.getValue();
outIndex++;
}
}
private:
const char **outArray;
int32_t *outCategories;
int32_t outSize;
BytesTrieBuilder &trieBuilder;
int32_t trieValueOffset;
StringPiece quantitiesTrieData;
int32_t outIndex;
};
/**
* A ResourceSink that collects information from `unitQuantities` in the `units`
* resource to provide key->value lookups from base unit to category, as well as
* preserving ordering information for these categories. See `units.txt`.
*
* For example: "kilogram" -> "mass", "meter-per-second" -> "speed".
*
* In C++ unitQuantity values are collected in order into a UChar* array, while
* unitQuantity keys are added added to a TrieBuilder, with associated values
* being the index into the aforementioned UChar* array.
*/
class CategoriesSink : public icu::ResourceSink {
public:
/**
* Constructor.
* @param out Array of UChar* to which unitQuantity values will be saved.
* The pointers returned not owned: they point directly at the resource
* strings in static memory.
* @param outSize The size of the `out` array.
* @param trieBuilder The trie builder to which the keys (base units) of
* each unitQuantity will be added, each with value being the offset
* into `out`.
*/
explicit CategoriesSink(const UChar **out, int32_t &outSize, BytesTrieBuilder &trieBuilder)
: outQuantitiesArray(out), outSize(outSize), trieBuilder(trieBuilder), outIndex(0) {}
void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) {
ResourceArray array = value.getArray(status);
if (U_FAILURE(status)) {
return;
}
if (outIndex + array.getSize() > outSize) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
for (int32_t i = 0; array.getValue(i, value); ++i) {
U_ASSERT(outIndex < outSize);
ResourceTable table = value.getTable(status);
if (U_FAILURE(status)) {
return;
}
if (table.getSize() != 1) {
status = U_INVALID_FORMAT_ERROR;
return;
}
const char *key;
table.getKeyAndValue(0, key, value);
int32_t uTmpLen;
outQuantitiesArray[outIndex] = value.getString(uTmpLen, status);
trieBuilder.add(key, outIndex, status);
outIndex++;
}
}
private:
const UChar **outQuantitiesArray;
int32_t &outSize;
BytesTrieBuilder &trieBuilder;
int32_t outIndex;
};
@ -222,11 +322,34 @@ icu::UInitOnce gUnitExtrasInitOnce = U_INITONCE_INITIALIZER;
// by SingleUnitImpl::getSimpleUnitID().)
const char **gSimpleUnits = nullptr;
// Maps from the value associated with each simple unit ID to an index into the
// gCategories array.
int32_t *gSimpleUnitCategories = nullptr;
char *gSerializedUnitExtrasStemTrie = nullptr;
// Array of UChar* pointing at the unit categories (aka "quantities", aka
// "types"), as found in the `unitQuantities` resource. The array memory itself
// is owned by this pointer, but the individual UChar* in that array point at
// static memory.
const UChar **gCategories = nullptr;
// Number of items in `gCategories`.
int32_t gCategoriesCount = 0;
// TODO: rather save an index into gCategories?
const char *kConsumption = "consumption";
size_t kConsumptionLen = strlen("consumption");
// Serialized BytesTrie for mapping from base units to indices into gCategories.
char *gSerializedUnitCategoriesTrie = nullptr;
UBool U_CALLCONV cleanupUnitExtras() {
uprv_free(gSerializedUnitCategoriesTrie);
gSerializedUnitCategoriesTrie = nullptr;
uprv_free(gCategories);
gCategories = nullptr;
uprv_free(gSerializedUnitExtrasStemTrie);
gSerializedUnitExtrasStemTrie = nullptr;
uprv_free(gSimpleUnitCategories);
gSimpleUnitCategories = nullptr;
uprv_free(gSimpleUnits);
gSimpleUnits = nullptr;
gUnitExtrasInitOnce.reset();
@ -235,6 +358,36 @@ UBool U_CALLCONV cleanupUnitExtras() {
void U_CALLCONV initUnitExtras(UErrorCode& status) {
ucln_i18n_registerCleanup(UCLN_I18N_UNIT_EXTRAS, cleanupUnitExtras);
LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status));
// Collect unitQuantities information into gSerializedUnitCategoriesTrie and gCategories.
const char *CATEGORY_TABLE_NAME = "unitQuantities";
LocalUResourceBundlePointer unitQuantities(
ures_getByKey(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, nullptr, &status));
if (U_FAILURE(status)) { return; }
gCategoriesCount = unitQuantities.getAlias()->fSize;
size_t quantitiesMallocSize = sizeof(UChar *) * gCategoriesCount;
gCategories = static_cast<const UChar **>(uprv_malloc(quantitiesMallocSize));
if (gCategories == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_memset(gCategories, 0, quantitiesMallocSize);
BytesTrieBuilder quantitiesBuilder(status);
CategoriesSink categoriesSink(gCategories, gCategoriesCount, quantitiesBuilder);
ures_getAllItemsWithFallback(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, categoriesSink, status);
StringPiece resultQuantities = quantitiesBuilder.buildStringPiece(USTRINGTRIE_BUILD_FAST, status);
if (U_FAILURE(status)) { return; }
// Copy the result into the global constant pointer
size_t numBytesQuantities = resultQuantities.length();
gSerializedUnitCategoriesTrie = static_cast<char *>(uprv_malloc(numBytesQuantities));
if (gSerializedUnitCategoriesTrie == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_memcpy(gSerializedUnitCategoriesTrie, resultQuantities.data(), numBytesQuantities);
// Build the BytesTrie that Parser needs for parsing unit identifiers.
BytesTrieBuilder b(status);
if (U_FAILURE(status)) { return; }
@ -270,11 +423,8 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
// Add sanctioned simple units by offset: simple units all have entries in
// units/convertUnits resources.
// TODO(ICU-21059): confirm whether this is clean enough, or whether we need to
// filter units' validity list instead.
LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
LocalUResourceBundlePointer convertUnits(
ures_getByKey(unitsBundle.getAlias(), "convertUnits", NULL, &status));
ures_getByKey(unitsBundle.getAlias(), "convertUnits", nullptr, &status));
if (U_FAILURE(status)) { return; }
// Allocate enough space: with identifierSink below skipping kilogram, we're
@ -287,9 +437,17 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
return;
}
uprv_memset(gSimpleUnits, 0, arrayMallocSize);
arrayMallocSize = sizeof(int32_t) * simpleUnitsCount;
gSimpleUnitCategories = static_cast<int32_t *>(uprv_malloc(arrayMallocSize));
if (gSimpleUnitCategories == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_memset(gSimpleUnitCategories, 0, arrayMallocSize);
// Populate gSimpleUnits and build the associated trie.
SimpleUnitIdentifiersSink identifierSink(gSimpleUnits, simpleUnitsCount, b, kSimpleUnitOffset);
SimpleUnitIdentifiersSink identifierSink(resultQuantities, gSimpleUnits, gSimpleUnitCategories,
simpleUnitsCount, b, kSimpleUnitOffset);
ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
// Build the CharsTrie
@ -648,6 +806,23 @@ compareSingleUnits(const void* /*context*/, const void* left, const void* right)
return (*realLeft)->compareTo(**realRight);
}
// Looks up the "unitQuantity" (aka "type" or "category") of the given base
// unit identifier and returns its index into the gCategories array.
//
// Triggers the one-time unit extras initialization if needed. If that fails,
// -1 is returned with the failure left in `status`. If the identifier has no
// value in the categories trie, -1 is returned and `status` is set to
// U_UNSUPPORTED_ERROR.
int32_t getUnitCategoryIndex(StringPiece baseUnitIdentifier, UErrorCode &status) {
    umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status);
    if (U_FAILURE(status)) {
        return -1;
    }

    BytesTrie categoriesTrie(gSerializedUnitCategoriesTrie);
    const UStringTrieResult lookup =
        categoriesTrie.next(baseUnitIdentifier.data(), baseUnitIdentifier.length());
    if (USTRINGTRIE_HAS_VALUE(lookup)) {
        return categoriesTrie.getValue();
    }

    status = U_UNSUPPORTED_ERROR;
    return -1;
}
} // namespace
U_CAPI int32_t U_EXPORT2
@ -672,6 +847,33 @@ umeas_getPrefixBase(UMeasurePrefix unitPrefix) {
return 10;
}
// Returns the "unitQuantity" (aka "type" or "category") string for the given
// base unit identifier, e.g. "meter-per-second" -> "speed".
//
// If `status` is already a failure, an empty CharString is returned and
// nothing else happens. If the identifier has no category (or the category
// lookup fails for any other reason), `status` is set to
// U_INVALID_FORMAT_ERROR and an empty CharString is returned. The one
// exception is "meter-per-cubic-meter", which is mapped to "consumption".
CharString U_I18N_API getUnitQuantity(StringPiece baseUnitIdentifier, UErrorCode &status) {
    CharString result;
    U_ASSERT(result.length() == 0);
    if (U_FAILURE(status)) {
        return result;
    }

    // Use a local status so that a failed lookup can still be rescued by the
    // special case below without polluting the caller's status.
    UErrorCode localStatus = U_ZERO_ERROR;
    int32_t idx = getUnitCategoryIndex(baseUnitIdentifier, localStatus);
    if (U_FAILURE(localStatus)) {
        // TODO(icu-units#130): support inverting any unit, with correct
        // fallback logic: inversion and fallback may depend on presence or
        // absence of a usage for that category.
        //
        // A StringPiece is a (pointer, length) pair and is not guaranteed to
        // be NUL-terminated, so compare it as a StringPiece rather than
        // running a C string comparison over data().
        if (baseUnitIdentifier == StringPiece("meter-per-cubic-meter")) {
            result.append(kConsumption, static_cast<int32_t>(kConsumptionLen), status);
            return result;
        }
        status = U_INVALID_FORMAT_ERROR;
        return result;
    }

    // Defensive: the trie value must be a valid index into gCategories.
    if (idx < 0 || idx >= gCategoriesCount) {
        status = U_INVALID_FORMAT_ERROR;
        return result;
    }

    result.appendInvariantChars(gCategories[idx], u_strlen(gCategories[idx]), status);
    return result;
}
// In ICU4J, this is MeasureUnit.getSingleUnitImpl().
SingleUnitImpl SingleUnitImpl::forMeasureUnit(const MeasureUnit& measureUnit, UErrorCode& status) {
MeasureUnitImpl temp;
@ -743,6 +945,10 @@ void SingleUnitImpl::appendNeutralIdentifier(CharString &result, UErrorCode &sta
result.append(StringPiece(this->getSimpleUnitID()), status);
}
// Returns the category index recorded in gSimpleUnitCategories for this
// unit's simple unit index, or -1 when there is nothing to look up.
int32_t SingleUnitImpl::getUnitCategoryIndex() const {
    // `index` is -1 for the dimensionless unit (which is also the default
    // value), and the table is null until it has been allocated: in either
    // case indexing the array would read out of bounds.
    if (index < 0 || gSimpleUnitCategories == nullptr) {
        return -1;
    }
    return gSimpleUnitCategories[index];
}
MeasureUnitImpl::MeasureUnitImpl(const MeasureUnitImpl &other, UErrorCode &status) {
*this = other.copy(status);
}

View file

@ -41,6 +41,20 @@ struct U_I18N_API MeasureUnitImplWithIndex : public UMemory {
: index(index), unitImpl(unitImpl) {}
};
/**
 * Looks up the "unitQuantity" (aka "type" or "category") of a base unit
 * identifier. The category is returned by value as a CharString; an empty
 * CharString is returned if `status` is already a failure on entry, or if the
 * lookup fails.
 *
 * This only supports base units: other units must be resolved to base units
 * before passing to this function, otherwise `status` is set to
 * U_INVALID_FORMAT_ERROR. (The implementation special-cases
 * "meter-per-cubic-meter", returning "consumption" for it.)
 *
 * Categories are found in `unitQuantities` in the `units` resource (see
 * `units.txt`).
 */
CharString U_I18N_API getUnitQuantity(StringPiece baseUnitIdentifier, UErrorCode &status);
/**
* A struct representing a single unit (optional SI or binary prefix, and dimensionality).
*/
@ -70,10 +84,20 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
*/
void appendNeutralIdentifier(CharString &result, UErrorCode &status) const;
/**
* Returns the index of this unit's "quantity" in unitQuantities (in
* measunit_extra.cpp). The value of this index determines sort order for
* normalization of unit identifiers.
*/
int32_t getUnitCategoryIndex() const;
/**
* Compare this SingleUnitImpl to another SingleUnitImpl for the sake of
* sorting and coalescing.
*
* Sort order of units is specified by UTS #35
* (https://unicode.org/reports/tr35/tr35-info.html#Unit_Identifier_Normalization).
*
* Takes the sign of dimensionality into account, but not the absolute
* value: per-meter is not considered the same as meter, but meter is
* considered the same as square-meter.
@ -90,6 +114,16 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
if (dimensionality > 0 && other.dimensionality < 0) {
return -1;
}
// Sort by official quantity order
int32_t thisQuantity = this->getUnitCategoryIndex();
int32_t otherQuantity = other.getUnitCategoryIndex();
if (thisQuantity < otherQuantity) {
return -1;
}
if (thisQuantity > otherQuantity) {
return 1;
}
// If quantity order didn't help, then we go by index.
if (index < other.index) {
return -1;
}
@ -128,7 +162,8 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
/**
* Simple unit index, unique for every simple unit, -1 for the dimensionless
* unit. This is an index into a string list in measunit_extra.cpp.
* unit. This is an index into a string list in measunit_extra.cpp, as
* loaded by SimpleUnitIdentifiersSink.
*
* The default value is -1, meaning the dimensionless unit:
* isDimensionless() will return true, until index is changed.

View file

@ -364,29 +364,6 @@ int32_t UnitPreferenceMetadata::compareTo(const UnitPreferenceMetadata &other, b
return cmp;
}
CharString U_I18N_API getUnitCategory(const char *baseUnitIdentifier, UErrorCode &status) {
CharString result;
LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
LocalUResourceBundlePointer unitQuantities(
ures_getByKey(unitsBundle.getAlias(), "unitQuantities", NULL, &status));
int32_t categoryLength;
if (U_FAILURE(status)) { return result; }
const UChar *uCategory =
ures_getStringByKey(unitQuantities.getAlias(), baseUnitIdentifier, &categoryLength, &status);
if (U_FAILURE(status)) {
// TODO(icu-units#130): support inverting any unit, with correct
// fallback logic: inversion and fallback may depend on presence or
// absence of a usage for that category.
if (uprv_strcmp(baseUnitIdentifier, "meter-per-cubic-meter") == 0) {
status = U_ZERO_ERROR;
result.append("consumption", status);
return result;
}
}
result.appendInvariantChars(uCategory, categoryLength, status);
return result;
}
// TODO: this may be unnecessary. Fold into ConversionRates class? Or move to anonymous namespace?
void U_I18N_API getAllConversionRates(MaybeStackVector<ConversionRateInfo> &result, UErrorCode &status) {
LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));

View file

@ -17,22 +17,6 @@
U_NAMESPACE_BEGIN
namespace units {
/**
* Looks up the unit category of a base unit identifier.
*
* Only supports base units, other units must be resolved to base units before
* passing to this function.
*
* Categories are found in `unitQuantities` in the `units` resource (see
* `units.txt`).
*
* TODO(hugovdm): if we give units_data.cpp access to the functionality of
* `extractCompoundBaseUnit` which is currently in units_converter.cpp, we could
* support all units for which there is a category. Does it make sense to move
* that function to units_data.cpp?
*/
CharString U_I18N_API getUnitCategory(const char *baseUnitIdentifier, UErrorCode &status);
/**
* Encapsulates "convertUnits" information from units resources, specifying how
* to convert from one unit to another.

View file

@ -53,13 +53,18 @@ UnitsRouter::UnitsRouter(MeasureUnit inputUnit, StringPiece region, StringPiece
MeasureUnitImpl inputUnitImpl = MeasureUnitImpl::forMeasureUnitMaybeCopy(inputUnit, status);
MeasureUnit baseUnit =
(extractCompoundBaseUnit(inputUnitImpl, conversionRates, status)).build(status);
CharString category = getUnitCategory(baseUnit.getIdentifier(), status);
CharString category = getUnitQuantity(baseUnit.getIdentifier(), status);
if (U_FAILURE(status)) {
return;
}
const UnitPreference *const *unitPreferences;
int32_t preferencesCount = 0;
prefs.getPreferencesFor(category.data(), usage, region, unitPreferences, preferencesCount, status);
prefs.getPreferencesFor(category.toStringPiece(), usage, region, unitPreferences, preferencesCount,
status);
for (int i = 0; i < preferencesCount; ++i) {
U_ASSERT(unitPreferences[i] != nullptr);
const auto &preference = *unitPreferences[i];
MeasureUnitImpl complexTargetUnitImpl =

View file

@ -3672,13 +3672,12 @@ void MeasureFormatTest::TestIdentifiers() {
{"kilometer-per-second-per-megaparsec", "kilometer-per-megaparsec-second"},
// TODO(ICU-21284): Add more test cases once the proper ranking is available.
// TODO(ICU-21284,icu-units#70): These cases are the wrong way around:
{"pound-force-foot", "foot-pound-force"},
{"foot-pound-force", "foot-pound-force"},
{"kilowatt-hour", "hour-kilowatt"},
{"hour-kilowatt", "hour-kilowatt"},
{"newton-meter", "meter-newton"},
{"meter-newton", "meter-newton"},
{"newton-meter", "newton-meter"},
{"meter-newton", "newton-meter"},
{"pound-force-foot", "pound-force-foot"},
{"foot-pound-force", "pound-force-foot"},
{"kilowatt-hour", "kilowatt-hour"},
{"hour-kilowatt", "kilowatt-hour"},
// Testing prefixes are parsed and produced correctly (ensures no
// collisions in the enum values)

View file

@ -5,7 +5,9 @@
#if !UCONFIG_NO_FORMATTING
#include "measunit_impl.h"
#include "units_data.h"
#include "intltest.h"
using namespace ::icu::units;
@ -51,9 +53,10 @@ void UnitsDataTest::testGetUnitCategory() {
IcuTestErrorCode status(*this, "testGetUnitCategory");
for (const auto &t : testCases) {
CharString category = getUnitCategory(t.unit, status);
status.errIfFailureAndReset("getUnitCategory(%s)", t.unit);
assertEquals("category", t.expectedCategory, category.data());
CharString category = getUnitQuantity(t.unit, status);
if (!status.errIfFailureAndReset("getUnitCategory(%s)", t.unit)) {
assertEquals("category", t.expectedCategory, category.data());
}
}
}

View file

@ -5,6 +5,9 @@ package com.ibm.icu.impl.units;
import com.ibm.icu.util.MeasureUnit;
// TODO: revisit documentation in this file. E.g. we don't do dimensionless
// units in Java? We use null instead.
/**
* A class representing a single unit (optional SI or binary prefix, and dimensionality).
*/
@ -84,6 +87,9 @@ public class SingleUnitImpl {
* Compare this SingleUnitImpl to another SingleUnitImpl for the sake of
* sorting and coalescing.
* <p>
* Sort order of units is specified by UTS #35
* (https://unicode.org/reports/tr35/tr35-info.html#Unit_Identifier_Normalization).
* <p>
* Takes the sign of dimensionality into account, but not the absolute
* value: per-meter is not considered the same as meter, but meter is
* considered the same as square-meter.
@ -100,6 +106,16 @@ public class SingleUnitImpl {
if (dimensionality > 0 && other.dimensionality < 0) {
return -1;
}
// Sort by official quantity order
int thisCategoryIndex = UnitsData.getCategoryIndexOfSimpleUnit(index);
int otherCategoryIndex = UnitsData.getCategoryIndexOfSimpleUnit(other.index);
if (thisCategoryIndex < otherCategoryIndex) {
return -1;
}
if (thisCategoryIndex > otherCategoryIndex) {
return 1;
}
// If quantity order didn't help, then we go by index.
if (index < other.index) {
return -1;
}
@ -158,6 +174,7 @@ public class SingleUnitImpl {
this.unitPrefix = unitPrefix;
}
// TODO: unused? Delete?
public int getIndex() {
return index;
}

View file

@ -1,11 +1,11 @@
// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.units;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
@ -17,33 +17,38 @@ import com.ibm.icu.util.UResourceBundle;
* Responsible for all units data operations (retriever, analysis, extraction certain data ... etc.).
*/
public class UnitsData {
private volatile static String[] simpleUnits = null;
// TODO(icu-units#122): this class can use static initialization to load the
// data once, and provide access to it via static methods. (Partial change
// has been done already.)
// Array of simple unit IDs.
private static String[] simpleUnits = null;
// Maps from the value associated with each simple unit ID to a category
// index number.
private static int[] simpleUnitCategories = null;
private ConversionRates conversionRates;
private UnitPreferences unitPreferences;
/**
* Pairs of categories and the corresponding base units.
*/
private Categories categories;
public UnitsData() {
this.conversionRates = new ConversionRates();
this.unitPreferences = new UnitPreferences();
this.categories = new Categories();
}
public static String[] getSimpleUnits() {
if (simpleUnits != null) {
return simpleUnits;
}
return simpleUnits;
}
static {
// Read simple units
ICUResourceBundle resource;
resource = (ICUResourceBundle) UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, "units");
SimpleUnitIdentifiersSink sink = new SimpleUnitIdentifiersSink();
resource.getAllItemsWithFallback("convertUnits", sink);
simpleUnits = sink.simpleUnits;
return simpleUnits;
simpleUnitCategories = sink.simpleUnitCategories;
}
public ConversionRates getConversionRates() {
@ -54,6 +59,10 @@ public class UnitsData {
return unitPreferences;
}
/**
 * Returns the category index stored for a simple unit.
 *
 * @param simpleUnitIndex index of the simple unit in the simpleUnitCategories array.
 * @return the entry of simpleUnitCategories at that index.
 */
public static int getCategoryIndexOfSimpleUnit(int simpleUnitIndex) {
    final int[] categoryBySimpleUnit = simpleUnitCategories;
    return categoryBySimpleUnit[simpleUnitIndex];
}
/**
* @param measureUnit An instance of MeasureUnitImpl.
* @return the corresponding category.
@ -70,7 +79,8 @@ public class UnitsData {
return "consumption";
}
return this.categories.mapFromUnitToCategory.get(baseUnitIdentifier);
int index = Categories.baseUnitToIndex.get(baseUnitIdentifier);
return Categories.indexToCategory[index];
}
public UnitPreferences.UnitPreference[] getPreferencesFor(String category, String usage, String region) {
@ -79,6 +89,7 @@ public class UnitsData {
public static class SimpleUnitIdentifiersSink extends UResource.Sink {
String[] simpleUnits = null;
int[] simpleUnitCategories = null;
@Override
public void put(UResource.Key key, UResource.Value value, boolean noFallback) {
@ -87,6 +98,7 @@ public class UnitsData {
UResource.Table simpleUnitsTable = value.getTable();
ArrayList<String> simpleUnits = new ArrayList<>();
ArrayList<Integer> simpleUnitCategories = new ArrayList<>();
for (int i = 0; simpleUnitsTable.getKeyAndValue(i, key, value); i++) {
if (key.toString().equals("kilogram")) {
@ -97,10 +109,28 @@ public class UnitsData {
continue;
}
// Find the base target unit for this simple unit
UResource.Table table = value.getTable();
if (!table.findValue("target", value)) {
// TODO: is there a more idiomatic way to deal with Resource
// Sink data errors in ICU4J? For now we just assert-fail,
// and otherwise skip bad data:
assert false : "Could not find \"target\" for simple unit: " + key;
continue;
}
String target = value.getString();
simpleUnits.add(key.toString());
simpleUnitCategories.add(Categories.baseUnitToIndex.get(target));
}
this.simpleUnits = simpleUnits.toArray(new String[0]);
this.simpleUnitCategories = new int[simpleUnitCategories.size()];
Iterator<Integer> iter = simpleUnitCategories.iterator();
for (int i = 0; i < this.simpleUnitCategories.length; i++)
{
this.simpleUnitCategories[i] = iter.next().intValue();
}
}
}
@ -138,50 +168,71 @@ public class UnitsData {
public static final String DEFAULT_USAGE = "default";
}
// Deals with base units and categories, e.g. "meter-per-second" --> "speed".
public static class Categories {
/**
* Maps from base unit to an index value: an index into the
* indexToCategory array.
*/
static HashMap<String, Integer> baseUnitToIndex;
/**
* Contains the map between units in their base units into their category.
* For example: meter-per-second --> "speed"
* Our official array of category strings - categories are identified by
* indices into this array.
*/
HashMap<String, String> mapFromUnitToCategory;
static String[] indexToCategory;
public Categories() {
static {
// Read unit Categories
ICUResourceBundle resource;
resource = (ICUResourceBundle) UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, "units");
CategoriesSink sink = new CategoriesSink();
resource.getAllItemsWithFallback(Constants.CATEGORY_TABLE_NAME, sink);
this.mapFromUnitToCategory = sink.getMapFromUnitToCategory();
baseUnitToIndex = sink.mapFromUnitToIndex;
indexToCategory = sink.categories.toArray(new String[0]);
}
}
/**
* A Resource Sink that collects information from `unitQuantities` in the
* `units` resource to provide key->value lookups from base unit to
* category, as well as preserving ordering information for these
* categories. See `units.txt`.
*
* For example: "kilogram" -> "mass", "meter-per-second" -> "speed".
*
* In Java unitQuantity values are collected in order into an ArrayList,
* while unitQuantity key-to-index lookups are handled with a HashMap.
*/
public static class CategoriesSink extends UResource.Sink {
/**
* Contains the map between units in their base units into their category.
* For example: meter-per-second --> "speed"
*/
HashMap<String, String> mapFromUnitToCategory;
HashMap<String, Integer> mapFromUnitToIndex;
ArrayList<String> categories;
public CategoriesSink() {
mapFromUnitToCategory = new HashMap<>();
mapFromUnitToIndex = new HashMap<>();
categories = new ArrayList<>();
}
@Override
public void put(UResource.Key key, UResource.Value value, boolean noFallback) {
assert (key.toString().equals(Constants.CATEGORY_TABLE_NAME));
assert (value.getType() == UResourceBundle.TABLE);
assert (value.getType() == UResourceBundle.ARRAY);
UResource.Table categoryTable = value.getTable();
for (int i = 0; categoryTable.getKeyAndValue(i, key, value); i++) {
assert (value.getType() == UResourceBundle.STRING);
mapFromUnitToCategory.put(key.toString(), value.toString());
UResource.Array categoryArray = value.getArray();
for (int i=0; categoryArray.getValue(i, value); i++) {
assert (value.getType() == UResourceBundle.TABLE);
UResource.Table table = value.getTable();
assert (table.getSize() == 1)
: "expecting single-entry table, got size: " + table.getSize();
table.getKeyAndValue(0, key, value);
assert value.getType() == UResourceBundle.STRING : "expecting category string";
mapFromUnitToIndex.put(key.toString(), categories.size());
categories.add(value.toString());
}
}
public HashMap<String, String> getMapFromUnitToCategory() {
return mapFromUnitToCategory;
}
}
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e738e530bcd2dcafff1de1d603c79d5a1edc04c095ca52366259c354f19e56ed
size 13306751
oid sha256:f4a144335f9c6c6a6df5a95d882d8841de82be4e86db650c643c67ac84ef8f84
size 13306908

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19f02ee2a2dc722a729fa9258175a738fc6021d252769b85c023a927135c7c26
oid sha256:09736746668a9d57494331b4533ae8ba1e38f55f433f5ecd9026e1c57735a413
size 95080

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1970fbcc18ec8a8b86702fe73ffbba842e9379bd973edbfb4e189ac6ac6d2a83
oid sha256:056761b1169f3ba2b2c63e3f71c8bce2e61a7a80d7e21bcd9c38e98fbd3414a0
size 723496

View file

@ -3497,13 +3497,12 @@ public class MeasureUnitTest extends TestFmwk {
new TestCase("kilometer-per-second-per-megaparsec", "kilometer-per-megaparsec-second"),
// TODO(ICU-21284): Add more test cases once the proper ranking is available.
// TODO(ICU-21284,icu-units#70): These cases are the wrong way around:
new TestCase("pound-force-foot", "foot-pound-force"),
new TestCase("foot-pound-force", "foot-pound-force"),
new TestCase("kilowatt-hour", "hour-kilowatt"),
new TestCase("hour-kilowatt", "hour-kilowatt"),
new TestCase("newton-meter", "meter-newton"),
new TestCase("meter-newton", "meter-newton"),
new TestCase("newton-meter", "newton-meter"),
new TestCase("meter-newton", "newton-meter"),
new TestCase("pound-force-foot", "pound-force-foot"),
new TestCase("foot-pound-force", "pound-force-foot"),
new TestCase("kilowatt-hour", "kilowatt-hour"),
new TestCase("hour-kilowatt", "kilowatt-hour"),
// Testing prefixes are parsed and produced correctly (ensures no
// collisions in the enum values)

View file

@ -146,7 +146,7 @@
; /unitConstants/$1 ; values="$2"
//supplementalData/unitQuantities/unitQuantity[@baseUnit="(%W)"][@quantity="(%W)"](?:[@status="%W"])?
; /unitQuantities/$1 ; values="$2"
; /unitQuantities/<FIFO>/$1 ; values="$2"
//supplementalData/convertUnits/convertUnit[@source="(%W)"][@baseUnit="(%W)"](?:[@systems="%W"])?
; /convertUnits/$1/target ; values=$2