ICU-21123 Calculate built-in units' gender when CLDR lacks the data

See #1620
This commit is contained in:
Hugo van der Merwe 2021-03-09 20:21:01 +00:00
parent c263b5b370
commit 54b896962b
7 changed files with 756 additions and 40 deletions

View file

@ -270,11 +270,13 @@ ures_getByKeyWithFallback(const UResourceBundle *resB,
* function can perform fallback on the sub-resources of the table.
* @param resB a resource
* @param inKey a key associated with the requested resource
* @param len if not NULL, used to return the length of the string
* @param status: fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must delete it
* @return returns a pointer to a zero-terminated UChar array which lives in a
* memory mapped/DLL file.
*/
U_CAPI const UChar* U_EXPORT2
ures_getStringByKeyWithFallback(const UResourceBundle *resB,

View file

@ -183,9 +183,10 @@ void extractCorePattern(const UnicodeString &pattern,
// Gets the gender of a built-in unit: unit must be a built-in. Returns an empty
// string both in case of unknown gender and in case of unknown unit.
const char *getGenderForBuiltin(const Locale &locale, MeasureUnit builtinUnit, UErrorCode &status) {
UnicodeString
getGenderForBuiltin(const Locale &locale, const MeasureUnit &builtinUnit, UErrorCode &status) {
LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status));
if (U_FAILURE(status)) { return ""; }
if (U_FAILURE(status)) { return {}; }
// Map duration-year-person, duration-week-person, etc. to duration-year, duration-week, ...
// TODO(ICU-20400): Get duration-*-person data properly with aliases.
@ -205,18 +206,18 @@ const char *getGenderForBuiltin(const Locale &locale, MeasureUnit builtinUnit, U
key.append("/gender", status);
UErrorCode localStatus = status;
StackUResourceBundle fillIn;
ures_getByKeyWithFallback(unitsBundle.getAlias(), key.data(), fillIn.getAlias(), &localStatus);
int32_t resultLen = 0;
const UChar *result =
ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &resultLen, &localStatus);
if (U_SUCCESS(localStatus)) {
status = localStatus;
UnicodeString directString = ures_getUnicodeString(fillIn.getAlias(), &status);
return getGenderString(directString, status);
return UnicodeString(true, result, resultLen);
} else {
// TODO(icu-units#28): "$unitRes/gender" does not exist. Do we want to
// check whether the parent "$unitRes" exists? Then we could return
// U_MISSING_RESOURCE_ERROR for incorrect usage (e.g. builtinUnit not
// being a builtin).
return "";
return {};
}
}
@ -778,6 +779,132 @@ const UChar *trimSpaceChars(const UChar *s, int32_t &length) {
return s + start;
}
/**
 * Calculates the gender of an arbitrary unit: this is the *second*
 * implementation of an algorithm to do this:
 *
 * Gender is also calculated in "processPatternTimes": that code path is "bottom
 * up", loading the gender for every component of a compound unit (at the same
 * time as loading the Long Names formatting patterns), even if the gender is
 * unneeded, then combining the single units' genders into the compound unit's
 * gender, according to the rules. This algorithm does a lazier "top-down"
 * evaluation, starting with the compound unit, calculating which single unit's
 * gender is needed by breaking it down according to the rules, and then loading
 * only the gender of the one single unit whose gender is needed.
 *
 * For future refactorings:
 * 1. we could drop processPatternTimes' gender calculation and just call this
 *    function: for UNUM_UNIT_WIDTH_FULL_NAME, the unit gender is in the very
 *    same table as the formatting patterns, so loading it then may be
 *    efficient. For other unit widths however, it needs to be explicitly looked
 *    up anyway.
 * 2. alternatively, if CLDR is providing all the genders we need such that we
 *    don't need to calculate them in ICU anymore, we could drop this function
 *    and keep only processPatternTimes' calculation. (And optimise it a bit?)
 *
 * @param locale The desired locale.
 * @param unit The measure unit to calculate the gender for.
 * @param status Receives any error. Set to U_INTERNAL_PROGRAM_ERROR when unit
 *     is a MIXED unit, which this function does not support.
 * @return The gender string for the unit, or an empty string if unknown or
 *     ungendered.
 */
UnicodeString calculateGenderForUnit(const Locale &locale, const MeasureUnit &unit, UErrorCode &status) {
    MeasureUnitImpl impl;
    const MeasureUnitImpl& mui = MeasureUnitImpl::forMeasureUnit(unit, impl, status);
    if (U_FAILURE(status)) {
        // Without a successfully built impl, the single-unit list below cannot
        // be trusted to contain the element we are about to index.
        return {};
    }
    int32_t singleUnitIndex = 0;
    if (mui.complexity == UMEASURE_UNIT_COMPOUND) {
        int32_t startSlice = 0;
        // inclusive
        int32_t endSlice = mui.singleUnits.length()-1;
        U_ASSERT(endSlice > 0); // Else it would not be COMPOUND
        if (mui.singleUnits[endSlice]->dimensionality < 0) {
            // We have a -per- construct
            UnicodeString perRule = getDeriveCompoundRule(locale, "gender", "per", status);
            if (perRule.length() != 1) {
                // Fixed gender for -per- units
                return perRule;
            }
            if (perRule[0] == u'1') {
                // Find the start of the denominator. We already know there is one.
                while (mui.singleUnits[startSlice]->dimensionality >= 0) {
                    startSlice++;
                }
            } else {
                // Find the end of the numerator
                while (endSlice >= 0 && mui.singleUnits[endSlice]->dimensionality < 0) {
                    endSlice--;
                }
                if (endSlice < 0) {
                    // We have only a denominator, e.g. "per-second".
                    // TODO(icu-units#28): find out what gender to use in the
                    // absence of a first value - mentioned in CLDR-14253.
                    return {};
                }
            }
        }
        if (endSlice > startSlice) {
            // We have a -times- construct
            UnicodeString timesRule = getDeriveCompoundRule(locale, "gender", "times", status);
            if (timesRule.length() != 1) {
                // Fixed gender for -times- units
                return timesRule;
            }
            if (timesRule[0] == u'0') {
                // Gender comes from the first unit of the slice.
                endSlice = startSlice;
            } else {
                // We assume timesRule[0] == u'1': gender comes from the last
                // unit of the slice.
                startSlice = endSlice;
            }
        }
        U_ASSERT(startSlice == endSlice);
        singleUnitIndex = startSlice;
    } else if (mui.complexity == UMEASURE_UNIT_MIXED) {
        status = U_INTERNAL_PROGRAM_ERROR;
        return {};
    } else {
        U_ASSERT(mui.complexity == UMEASURE_UNIT_SINGLE);
        U_ASSERT(mui.singleUnits.length() == 1);
    }

    // Now we know which singleUnit's gender we want
    const SingleUnitImpl *singleUnit = mui.singleUnits[singleUnitIndex];
    // Check for any power-prefix gender override:
    if (std::abs(singleUnit->dimensionality) != 1) {
        UnicodeString powerRule = getDeriveCompoundRule(locale, "gender", "power", status);
        if (powerRule.length() != 1) {
            // Fixed gender for -powN- units
            return powerRule;
        }
        // powerRule[0] == u'0'; u'1' not currently in spec.
    }
    // Check for any SI and binary prefix gender override:
    // NOTE(review): this condition is identical to the power check above, so
    // the "prefix" rule is only consulted for units that carry a power. It
    // presumably should test whether the single unit has an SI/binary prefix
    // instead - confirm against SingleUnitImpl before changing.
    if (std::abs(singleUnit->dimensionality) != 1) {
        UnicodeString prefixRule = getDeriveCompoundRule(locale, "gender", "prefix", status);
        if (prefixRule.length() != 1) {
            // Fixed gender for prefixed units
            return prefixRule;
        }
        // prefixRule[0] == u'0'; u'1' not currently in spec.
    }
    // Now we've boiled it down to the gender of one simple unit identifier:
    return getGenderForBuiltin(locale, MeasureUnit::forIdentifier(singleUnit->getSimpleUnitID(), status),
                               status);
}
void maybeCalculateGender(const Locale &locale,
const MeasureUnit &unitRef,
UnicodeString *outArray,
UErrorCode &status) {
if (outArray[GENDER_INDEX].isBogus()) {
UnicodeString meterGender = getGenderForBuiltin(locale, MeasureUnit::getMeter(), status);
if (meterGender.isEmpty()) {
// No gender for meter: assume ungendered language
return;
}
// We have a gendered language, but are lacking gender for unitRef.
outArray[GENDER_INDEX] = calculateGenderForUnit(locale, unitRef, status);
}
}
} // namespace
void LongNameHandler::forMeasureUnit(const Locale &loc,
@ -802,6 +929,7 @@ void LongNameHandler::forMeasureUnit(const Locale &loc,
// - If result is not empty, return it
UnicodeString simpleFormats[ARRAY_LENGTH];
getMeasureData(loc, unitRef, width, unitDisplayCase, simpleFormats, status);
maybeCalculateGender(loc, unitRef, simpleFormats, status);
if (U_FAILURE(status)) {
return;
}
@ -1003,6 +1131,7 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit,
// - Do those unit tests cover this code path representatively?
if (builtinUnit != MeasureUnit()) {
getMeasureData(loc, builtinUnit, width, caseVariant, outArray, status);
maybeCalculateGender(loc, builtinUnit, outArray, status);
}
return;
}
@ -1059,8 +1188,8 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit,
}
// 4.2. Get the gender of that single_unit
MeasureUnit builtinUnit;
if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &builtinUnit)) {
MeasureUnit simpleUnit;
if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &simpleUnit)) {
// Ideally all simple units should be known, but they're not:
// 100-kilometer is internally treated as a simple unit, but it is
// not a built-in unit and does not have formatting data in CLDR 39.
@ -1069,7 +1198,7 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit,
status = U_UNSUPPORTED_ERROR;
return;
}
const char *gender = getGenderForBuiltin(loc, builtinUnit, status);
const char *gender = getGenderString(getGenderForBuiltin(loc, simpleUnit, status), status);
// 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-'
U_ASSERT(singleUnit->dimensionality > 0);
@ -1157,13 +1286,9 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit,
getDerivedGender(loc, "prefix", singleUnitArray, nullptr, status);
}
// Powers use compoundUnitPattern1, dimensionalityPrefixPatterns may
// have a "gender" element
//
// TODO(icu-units#28): untested: no locale data uses this currently:
if (dimensionality != 1) {
singleUnitArray[GENDER_INDEX] = getDerivedGender(loc, "power", singleUnitArray,
dimensionalityPrefixPatterns, status);
singleUnitArray[GENDER_INDEX] =
getDerivedGender(loc, "power", singleUnitArray, nullptr, status);
}
UnicodeString timesGenderRule = getDeriveCompoundRule(loc, "gender", "times", status);
@ -1448,6 +1573,8 @@ void MixedUnitLongNameHandler::forMeasureUnit(const Locale &loc,
// propagation of unitDisplayCase is correct:
getMeasureData(loc, impl.singleUnits[i]->build(status), width, unitDisplayCase, unitData,
status);
// TODO(ICU-21494): if we add support for gender for mixed units, we may
// need maybeCalculateGender() here.
}
// TODO(icu-units#120): Make sure ICU doesn't output zero-valued

View file

@ -2291,27 +2291,258 @@ void NumberFormatterApiTest::unitGender() {
const char *unitIdentifier;
const char *expectedGender;
} cases[] = {
{"de", "inch", "masculine"},
{"de", "yard", "neuter"},
{"de", "meter", "masculine"},
{"de", "liter", "masculine"},
{"de", "second", "feminine"},
{"de", "minute", "feminine"},
{"de", "hour", "feminine"},
{"de", "day", "masculine"},
{"de", "year", "neuter"},
{"de", "gram", "neuter"},
{"de", "watt", "neuter"},
{"de", "bit", "neuter"},
{"de", "byte", "neuter"},
{"fr", "inch", "masculine"},
{"fr", "yard", "masculine"},
{"fr", "meter", "masculine"},
{"fr", "liter", "masculine"},
{"fr", "second", "feminine"},
{"fr", "minute", "feminine"},
{"fr", "hour", "feminine"},
{"fr", "day", "masculine"},
{"fr", "year", "masculine"},
{"fr", "gram", "masculine"},
// grammaticalFeatures deriveCompound "per" rule takes the gender of the
// numerator unit:
{"de", "meter-per-hour", "masculine"},
{"fr", "meter-per-hour", "masculine"},
{"af", "meter-per-hour", ""}, // ungendered language
// French "times" takes its gender from the first value, German takes it
// from the second. Prefix and power have no impact on gender for these
// languages:
{"de", "square-decimeter-square-second", "feminine"},
{"fr", "square-decimeter-square-second", "masculine"},
// TODO(icu-units#149): percent and permille bypass LongNameHandler
// when unitWidth is not FULL_NAME:
// // Gender of per-second might be that of percent? TODO(icu-units#28)
// {"de", "percent", "neuter"},
// {"fr", "percent", "masculine"},
// Built-in units whose simple units lack gender in the CLDR data file
{"de", "kilopascal", "neuter"},
{"fr", "kilopascal", "masculine"},
// {"de", "pascal", ""},
// {"fr", "pascal", ""},
// Built-in units that lack gender in the CLDR data file
// {"de", "revolution", ""},
// {"de", "radian", ""},
// {"de", "arc-minute", ""},
// {"de", "arc-second", ""},
{"de", "square-yard", "neuter"}, // POWER
{"de", "square-inch", "masculine"}, // POWER
// {"de", "dunam", ""},
// {"de", "karat", ""},
// {"de", "milligram-ofglucose-per-deciliter", ""}, // COMPOUND, ofglucose
// {"de", "millimole-per-liter", ""}, // COMPOUND, mole
// {"de", "permillion", ""},
// {"de", "permille", ""},
// {"de", "permyriad", ""},
// {"de", "mole", ""},
{"de", "liter-per-kilometer", "masculine"}, // COMPOUND
{"de", "petabyte", "neuter"}, // PREFIX
{"de", "terabit", "neuter"}, // PREFIX
// {"de", "century", ""},
// {"de", "decade", ""},
{"de", "millisecond", "feminine"}, // PREFIX
{"de", "microsecond", "feminine"}, // PREFIX
{"de", "nanosecond", "feminine"}, // PREFIX
// {"de", "ampere", ""},
// {"de", "milliampere", ""}, // PREFIX, ampere
// {"de", "ohm", ""},
// {"de", "calorie", ""},
// {"de", "kilojoule", ""}, // PREFIX, joule
// {"de", "joule", ""},
{"de", "kilowatt-hour", "feminine"}, // COMPOUND
// {"de", "electronvolt", ""},
// {"de", "british-thermal-unit", ""},
// {"de", "therm-us", ""},
// {"de", "pound-force", ""},
// {"de", "newton", ""},
// {"de", "gigahertz", ""}, // PREFIX, hertz
// {"de", "megahertz", ""}, // PREFIX, hertz
// {"de", "kilohertz", ""}, // PREFIX, hertz
// {"de", "hertz", ""},
// {"de", "em", ""},
// {"de", "pixel", ""},
// {"de", "megapixel", ""},
// {"de", "pixel-per-centimeter", ""}, // COMPOUND, pixel
// {"de", "pixel-per-inch", ""}, // COMPOUND, pixel
// {"de", "dot-per-centimeter", ""}, // COMPOUND, dot
// {"de", "dot-per-inch", ""}, // COMPOUND, dot
// {"de", "dot", ""},
// {"de", "earth-radius", ""},
{"de", "decimeter", "masculine"}, // PREFIX
{"de", "micrometer", "masculine"}, // PREFIX
{"de", "nanometer", "masculine"}, // PREFIX
// {"de", "light-year", ""},
// {"de", "astronomical-unit", ""},
// {"de", "furlong", ""},
// {"de", "fathom", ""},
// {"de", "nautical-mile", ""},
// {"de", "mile-scandinavian", ""},
// {"de", "point", ""},
// {"de", "lux", ""},
// {"de", "candela", ""},
// {"de", "lumen", ""},
// {"de", "metric-ton", ""},
// {"de", "microgram", "neuter"}, // PREFIX
// {"de", "ton", ""},
// {"de", "stone", ""},
// {"de", "ounce-troy", ""},
// {"de", "carat", ""},
{"de", "gigawatt", "neuter"}, // PREFIX
{"de", "milliwatt", "neuter"}, // PREFIX
// {"de", "horsepower", ""},
// {"de", "millimeter-ofhg", ""},
// {"de", "pound-force-per-square-inch", ""}, // COMPOUND, pound-force
// {"de", "inch-ofhg", ""},
// {"de", "bar", ""},
// {"de", "millibar", ""}, // PREFIX, bar
// {"de", "atmosphere", ""},
// {"de", "pascal", ""}, // PREFIX, kilopascal? neuter?
// {"de", "hectopascal", ""}, // PREFIX, pascal, neuter?
// {"de", "megapascal", ""}, // PREFIX, pascal, neuter?
// {"de", "knot", ""},
{"de", "pound-force-foot", "masculine"}, // COMPOUND
{"de", "newton-meter", "masculine"}, // COMPOUND
{"de", "cubic-kilometer", "masculine"}, // POWER
{"de", "cubic-yard", "neuter"}, // POWER
{"de", "cubic-inch", "masculine"}, // POWER
{"de", "megaliter", "masculine"}, // PREFIX
{"de", "hectoliter", "masculine"}, // PREFIX
// {"de", "pint-metric", ""},
// {"de", "cup-metric", ""},
{"de", "acre-foot", "masculine"}, // COMPOUND
// {"de", "bushel", ""},
// {"de", "barrel", ""},
// Units missing gender in German are also missing gender in French:
// {"fr", "revolution", ""},
// {"fr", "radian", ""},
// {"fr", "arc-minute", ""},
// {"fr", "arc-second", ""},
{"fr", "square-yard", "masculine"}, // POWER
{"fr", "square-inch", "masculine"}, // POWER
// {"fr", "dunam", ""},
// {"fr", "karat", ""},
{"fr", "milligram-ofglucose-per-deciliter", "masculine"}, // COMPOUND
// {"fr", "millimole-per-liter", ""}, // COMPOUND, mole
// {"fr", "permillion", ""},
// {"fr", "permille", ""},
// {"fr", "permyriad", ""},
// {"fr", "mole", ""},
{"fr", "liter-per-kilometer", "masculine"}, // COMPOUND
// {"fr", "petabyte", ""}, // PREFIX
// {"fr", "terabit", ""}, // PREFIX
// {"fr", "century", ""},
// {"fr", "decade", ""},
{"fr", "millisecond", "feminine"}, // PREFIX
{"fr", "microsecond", "feminine"}, // PREFIX
{"fr", "nanosecond", "feminine"}, // PREFIX
// {"fr", "ampere", ""},
// {"fr", "milliampere", ""}, // PREFIX, ampere
// {"fr", "ohm", ""},
// {"fr", "calorie", ""},
// {"fr", "kilojoule", ""}, // PREFIX, joule
// {"fr", "joule", ""},
// {"fr", "kilowatt-hour", ""}, // COMPOUND
// {"fr", "electronvolt", ""},
// {"fr", "british-thermal-unit", ""},
// {"fr", "therm-us", ""},
// {"fr", "pound-force", ""},
// {"fr", "newton", ""},
// {"fr", "gigahertz", ""}, // PREFIX, hertz
// {"fr", "megahertz", ""}, // PREFIX, hertz
// {"fr", "kilohertz", ""}, // PREFIX, hertz
// {"fr", "hertz", ""},
// {"fr", "em", ""},
// {"fr", "pixel", ""},
// {"fr", "megapixel", ""},
// {"fr", "pixel-per-centimeter", ""}, // COMPOUND, pixel
// {"fr", "pixel-per-inch", ""}, // COMPOUND, pixel
// {"fr", "dot-per-centimeter", ""}, // COMPOUND, dot
// {"fr", "dot-per-inch", ""}, // COMPOUND, dot
// {"fr", "dot", ""},
// {"fr", "earth-radius", ""},
{"fr", "decimeter", "masculine"}, // PREFIX
{"fr", "micrometer", "masculine"}, // PREFIX
{"fr", "nanometer", "masculine"}, // PREFIX
// {"fr", "light-year", ""},
// {"fr", "astronomical-unit", ""},
// {"fr", "furlong", ""},
// {"fr", "fathom", ""},
// {"fr", "nautical-mile", ""},
// {"fr", "mile-scandinavian", ""},
// {"fr", "point", ""},
// {"fr", "lux", ""},
// {"fr", "candela", ""},
// {"fr", "lumen", ""},
// {"fr", "metric-ton", ""},
// {"fr", "microgram", "masculine"}, // PREFIX
// {"fr", "ton", ""},
// {"fr", "stone", ""},
// {"fr", "ounce-troy", ""},
// {"fr", "carat", ""},
// {"fr", "gigawatt", ""}, // PREFIX
// {"fr", "milliwatt", ""},
// {"fr", "horsepower", ""},
{"fr", "millimeter-ofhg", "masculine"},
// {"fr", "pound-force-per-square-inch", ""}, // COMPOUND, pound-force
{"fr", "inch-ofhg", "masculine"},
// {"fr", "bar", ""},
// {"fr", "millibar", ""}, // PREFIX, bar
// {"fr", "atmosphere", ""},
// {"fr", "pascal", ""}, // PREFIX, kilopascal?
// {"fr", "hectopascal", ""}, // PREFIX, pascal
// {"fr", "megapascal", ""}, // PREFIX, pascal
// {"fr", "knot", ""},
// {"fr", "pound-force-foot", ""},
// {"fr", "newton-meter", ""},
{"fr", "cubic-kilometer", "masculine"}, // POWER
{"fr", "cubic-yard", "masculine"}, // POWER
{"fr", "cubic-inch", "masculine"}, // POWER
{"fr", "megaliter", "masculine"}, // PREFIX
{"fr", "hectoliter", "masculine"}, // PREFIX
// {"fr", "pint-metric", ""},
// {"fr", "cup-metric", ""},
{"fr", "acre-foot", "feminine"}, // COMPOUND
// {"fr", "bushel", ""},
// {"fr", "barrel", ""},
// Some more French units missing gender:
// {"fr", "degree", ""},
{"fr", "square-meter", "masculine"}, // POWER
// {"fr", "terabyte", ""}, // PREFIX, byte
// {"fr", "gigabyte", ""}, // PREFIX, byte
// {"fr", "gigabit", ""}, // PREFIX, bit
// {"fr", "megabyte", ""}, // PREFIX, byte
// {"fr", "megabit", ""}, // PREFIX, bit
// {"fr", "kilobyte", ""}, // PREFIX, byte
// {"fr", "kilobit", ""}, // PREFIX, bit
// {"fr", "byte", ""},
// {"fr", "bit", ""},
// {"fr", "volt", ""},
// {"fr", "watt", ""},
{"fr", "cubic-meter", "masculine"}, // POWER
// gender-lacking builtins within compound units
{"de", "newton-meter-per-second", "masculine"},
// TODO(ICU-21494): determine whether list genders behave as follows,
// and implement proper getListGender support (covering more than just
// two genders):

View file

@ -609,6 +609,129 @@ public class LongNameHandler
/// END DATA LOADING ///
////////////////////////
/**
 * Derives the gender of an arbitrary unit. This is the *second*
 * implementation of such an algorithm:
 *
 * "processPatternTimes" also calculates gender, but works "bottom up": it
 * loads the gender of every component of a compound unit (together with the
 * Long Names formatting patterns), even when the gender is unneeded, and
 * then combines the single units' genders into the compound unit's gender
 * according to the rules. The algorithm here is a lazier "top-down"
 * evaluation: it starts from the compound unit, uses the rules to work out
 * which single unit's gender is needed, and loads only the gender of that
 * one single unit.
 *
 * For future refactorings:
 * 1. we could drop processPatternTimes' gender calculation and just call
 *    this function: for UNUM_UNIT_WIDTH_FULL_NAME, the unit gender is in
 *    the very same table as the formatting patterns, so loading it then may
 *    be efficient. For other unit widths however, it needs to be explicitly
 *    looked up anyway.
 * 2. alternatively, if CLDR is providing all the genders we need such that
 *    we don't need to calculate them in ICU anymore, we could drop this
 *    function and keep only processPatternTimes' calculation. (And optimise
 *    it a bit?)
 *
 * @param locale The desired locale.
 * @param unit The measure unit to calculate the gender for.
 * @return The gender string for the unit, or an empty string if unknown or
 *     ungendered.
 * @throws ICUException if the unit is a MIXED unit.
 */
private static String calculateGenderForUnit(ULocale locale, MeasureUnit unit) {
    MeasureUnitImpl unitImpl = unit.getCopyOfMeasureUnitImpl();
    ArrayList<SingleUnitImpl> parts = unitImpl.getSingleUnits();
    MeasureUnit.Complexity complexity = unitImpl.getComplexity();

    if (complexity == MeasureUnit.Complexity.MIXED) {
        throw new ICUException("calculateGenderForUnit does not support MIXED units");
    }

    int chosenIndex = 0;
    if (complexity == MeasureUnit.Complexity.COMPOUND) {
        // [first, last] is an inclusive window that gets narrowed down to
        // the one single unit that supplies the gender.
        int first = 0;
        int last = parts.size() - 1;
        assert last > 0 : "COMPOUND units have more than one single unit";

        if (parts.get(last).getDimensionality() < 0) {
            // A trailing negative power means this is a -per- construct.
            String perRule = getDeriveCompoundRule(locale, "gender", "per");
            if (perRule.length() != 1) {
                // The rule is a fixed gender for all -per- units.
                return perRule;
            }
            if (perRule.charAt(0) == '1') {
                // Skip the numerator; a denominator is known to exist.
                while (parts.get(first).getDimensionality() >= 0) {
                    first++;
                }
            } else {
                // Drop the denominator, keeping only the numerator.
                while (last >= 0 && parts.get(last).getDimensionality() < 0) {
                    last--;
                }
                if (last < 0) {
                    // Denominator only, e.g. "per-second".
                    // TODO(icu-units#28): find out what gender to use in the
                    // absence of a first value - mentioned in CLDR-14253.
                    return "";
                }
            }
        }

        if (last > first) {
            // More than one unit remains: a -times- construct.
            String timesRule = getDeriveCompoundRule(locale, "gender", "times");
            if (timesRule.length() != 1) {
                // The rule is a fixed gender for all -times- units.
                return timesRule;
            }
            // '0' selects the first unit; anything else is assumed to be '1'
            // and selects the last unit.
            int selected = (timesRule.charAt(0) == '0') ? first : last;
            first = selected;
            last = selected;
        }

        assert first == last;
        chosenIndex = first;
    } else {
        assert complexity == MeasureUnit.Complexity.SINGLE;
        assert parts.size() == 1;
    }

    // The single unit whose gender decides the result.
    SingleUnitImpl chosenUnit = parts.get(chosenIndex);
    boolean hasPower = Math.abs(chosenUnit.getDimensionality()) != 1;

    // A power (e.g. square-, cubic-) may override the gender:
    if (hasPower) {
        String powerRule = getDeriveCompoundRule(locale, "gender", "power");
        if (powerRule.length() != 1) {
            // Fixed gender for -powN- units
            return powerRule;
        }
        // Otherwise the rule is "0"; "1" is not currently in the spec.
    }

    // An SI or binary prefix may override the gender:
    // NOTE(review): this is guarded by the same dimensionality test as the
    // power rule above, so the "prefix" rule is only consulted for units
    // that carry a power. It presumably should test for the presence of a
    // prefix instead - confirm before changing.
    if (hasPower) {
        String prefixRule = getDeriveCompoundRule(locale, "gender", "prefix");
        if (prefixRule.length() != 1) {
            // Fixed gender returned by the "prefix" rule
            return prefixRule;
        }
        // Otherwise the rule is "0"; "1" is not currently in the spec.
    }

    // All that is left is the gender of one simple unit identifier:
    MeasureUnit simpleUnit = MeasureUnit.forIdentifier(chosenUnit.getSimpleUnitID());
    return getGenderForBuiltin(locale, simpleUnit);
}
/**
 * Stores a calculated gender in outArray[GENDER_INDEX] when that entry is
 * still null and the locale appears to be gendered.
 *
 * The gender of METER is used as the probe: when it is empty, the language
 * is assumed to be ungendered and the array is left untouched.
 *
 * @param locale The locale whose gender data is consulted.
 * @param unit The unit whose gender is wanted.
 * @param outArray Unit data array; only the GENDER_INDEX entry is read and
 *     possibly written.
 */
private static void maybeCalculateGender(ULocale locale, MeasureUnit unit, String[] outArray) {
    if (outArray[GENDER_INDEX] != null) {
        // A gender value is already present: nothing to calculate.
        return;
    }
    boolean ungenderedLanguage = getGenderForBuiltin(locale, MeasureUnit.METER).isEmpty();
    if (!ungenderedLanguage) {
        // Gendered language, but no gender was loaded for this unit: derive it.
        outArray[GENDER_INDEX] = calculateGenderForUnit(locale, unit);
    }
}
private final Map<StandardPlural, SimpleModifier> modifiers;
private final PluralRules rules;
private final MicroPropsGenerator parent;
@ -674,6 +797,7 @@ public class LongNameHandler
if (unit.getType() != null) {
String[] simpleFormats = new String[ARRAY_LENGTH];
getMeasureData(locale, unit, width, unitDisplayCase, simpleFormats);
maybeCalculateGender(locale, unit, simpleFormats);
// TODO(ICU4J): Reduce the number of object creations here?
Map<StandardPlural, SimpleModifier> modifiers = new EnumMap<>(StandardPlural.class);
LongNameHandler result = new LongNameHandler(modifiers, rules, parent);
@ -842,8 +966,8 @@ public class LongNameHandler
return;
}
MeasureUnit builtinUnit = MeasureUnit.findBySubType(productUnit.getIdentifier());
if (builtinUnit != null) {
MeasureUnit simpleUnit = MeasureUnit.findBySubType(productUnit.getIdentifier());
if (simpleUnit != null) {
// TODO(icu-units#145): spec doesn't cover builtin-per-builtin, it
// breaks them all down. Do we want to drop this?
// - findBySubType isn't super efficient, if we skip it and go to basic
@ -851,7 +975,8 @@ public class LongNameHandler
// - Check all the existing unit tests that fail without this: is it due
// to incorrect fallback via getMeasureData?
// - Do those unit tests cover this code path representatively?
getMeasureData(loc, builtinUnit, width, caseVariant, outArray);
getMeasureData(loc, simpleUnit, width, caseVariant, outArray);
maybeCalculateGender(loc, simpleUnit, outArray);
return;
}
@ -906,8 +1031,8 @@ public class LongNameHandler
}
// 4.2. Get the gender of that single_unit
builtinUnit = MeasureUnit.findBySubType(singleUnit.getSimpleUnitID());
if (builtinUnit == null) {
simpleUnit = MeasureUnit.findBySubType(singleUnit.getSimpleUnitID());
if (simpleUnit == null) {
// Ideally all simple units should be known, but they're not:
// 100-kilometer is internally treated as a simple unit, but it is
// not a built-in unit and does not have formatting data in CLDR 39.
@ -916,7 +1041,7 @@ public class LongNameHandler
throw new UnsupportedOperationException("Unsupported sinlgeUnit: " +
singleUnit.getSimpleUnitID());
}
String gender = getGenderForBuiltin(loc, builtinUnit);
String gender = getGenderForBuiltin(loc, simpleUnit);
// 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-'
assert singleUnit.getDimensionality() > 0;
@ -1003,13 +1128,9 @@ public class LongNameHandler
getDerivedGender(loc, "prefix", singleUnitArray, null);
}
// Powers use compoundUnitPattern1, dimensionalityPrefixPatterns may
// have a "gender" element
//
// TODO(icu-units#28): untested: no locale data uses this currently:
if (dimensionality != 1) {
singleUnitArray[GENDER_INDEX] =
getDerivedGender(loc, "power", singleUnitArray, dimensionalityPrefixPatterns);
getDerivedGender(loc, "power", singleUnitArray, null);
}
String timesGenderRule = getDeriveCompoundRule(loc, "gender", "times");

View file

@ -79,6 +79,8 @@ public class MixedUnitLongNameHandler
String[] unitData = new String[LongNameHandler.ARRAY_LENGTH];
LongNameHandler.getMeasureData(locale, individualUnits.get(i), width, unitDisplayCase,
unitData);
// TODO(ICU-21494): if we add support for gender for mixed units, we may
// need LongNameHandler.maybeCalculateGender() here.
result.fMixedUnitData.add(unitData);
}

View file

@ -144,6 +144,9 @@ public class FormattedNumber implements FormattedValue {
*/
@Deprecated
public String getGender() {
    // A gender that was never set is reported as the empty string, not null.
    return (this.gender == null) ? "" : this.gender;
}

View file

@ -2295,27 +2295,257 @@ public class NumberFormatterApiTest extends TestFmwk {
}
TestCase cases[] = {
new TestCase("de", "meter", "masculine"),
new TestCase("de", "second", "feminine"),
new TestCase("de", "minute", "feminine"),
new TestCase("de", "hour", "feminine"),
new TestCase("de", "day", "masculine"),
new TestCase("de", "year", "neuter"),
new TestCase("fr", "meter", "masculine"),
new TestCase("fr", "second", "feminine"),
new TestCase("fr", "minute", "feminine"),
new TestCase("fr", "hour", "feminine"),
new TestCase("fr", "day", "masculine"),
new TestCase("de", "inch", "masculine"), //
new TestCase("de", "yard", "neuter"), //
new TestCase("de", "meter", "masculine"), //
new TestCase("de", "liter", "masculine"), //
new TestCase("de", "second", "feminine"), //
new TestCase("de", "minute", "feminine"), //
new TestCase("de", "hour", "feminine"), //
new TestCase("de", "day", "masculine"), //
new TestCase("de", "year", "neuter"), //
new TestCase("de", "gram", "neuter"), //
new TestCase("de", "watt", "neuter"), //
new TestCase("de", "bit", "neuter"), //
new TestCase("de", "byte", "neuter"), //
new TestCase("fr", "inch", "masculine"), //
new TestCase("fr", "yard", "masculine"), //
new TestCase("fr", "meter", "masculine"), //
new TestCase("fr", "liter", "masculine"), //
new TestCase("fr", "second", "feminine"), //
new TestCase("fr", "minute", "feminine"), //
new TestCase("fr", "hour", "feminine"), //
new TestCase("fr", "day", "masculine"), //
new TestCase("fr", "year", "masculine"), //
new TestCase("fr", "gram", "masculine"), //
// grammaticalFeatures deriveCompound "per" rule takes the gender of the
// numerator unit:
new TestCase("de", "meter-per-hour", "masculine"),
new TestCase("fr", "meter-per-hour", "masculine"),
new TestCase("af", "meter-per-hour", null), // ungendered language
new TestCase("af", "meter-per-hour", ""), // ungendered language
// French "times" takes its gender from the first value, German takes it
// from the second. Prefix and power have no impact on gender for these
// languages:
new TestCase("de", "square-decimeter-square-second", "feminine"),
new TestCase("fr", "square-decimeter-square-second", "masculine"),
// TODO(icu-units#149): percent and permille bypass
// LongNameHandler when unitWidth is not FULL_NAME:
// // Gender of per-second might be that of percent? TODO(icu-units#28)
// new TestCase("de", "percent", "neuter"), //
// new TestCase("fr", "percent", "masculine"), //
// Built-in units whose simple units lack gender in the CLDR data file
new TestCase("de", "kilopascal", "neuter"), //
new TestCase("fr", "kilopascal", "masculine"), //
// new TestCase("de", "pascal", ""), //
// new TestCase("fr", "pascal", ""), //
// Built-in units that lack gender in the CLDR data file
// new TestCase("de", "revolution", ""), //
// new TestCase("de", "radian", ""), //
// new TestCase("de", "arc-minute", ""), //
// new TestCase("de", "arc-second", ""), //
new TestCase("de", "square-yard", "neuter"), // COMPOUND
new TestCase("de", "square-inch", "masculine"), // COMPOUND
// new TestCase("de", "dunam", ""), //
// new TestCase("de", "karat", ""), //
// new TestCase("de", "milligram-ofglucose-per-deciliter", ""), // COMPOUND, ofglucose
// new TestCase("de", "millimole-per-liter", ""), // COMPOUND, mole
// new TestCase("de", "permillion", ""), //
// new TestCase("de", "permille", ""), //
// new TestCase("de", "permyriad", ""), //
// new TestCase("de", "mole", ""), //
new TestCase("de", "liter-per-kilometer", "masculine"), // COMPOUND
new TestCase("de", "petabyte", "neuter"), // PREFIX
new TestCase("de", "terabit", "neuter"), // PREFIX
// new TestCase("de", "century", ""), //
// new TestCase("de", "decade", ""), //
new TestCase("de", "millisecond", "feminine"), // PREFIX
new TestCase("de", "microsecond", "feminine"), // PREFIX
new TestCase("de", "nanosecond", "feminine"), // PREFIX
// new TestCase("de", "ampere", ""), //
// new TestCase("de", "milliampere", ""), // PREFIX, ampere
// new TestCase("de", "ohm", ""), //
// new TestCase("de", "calorie", ""), //
// new TestCase("de", "kilojoule", ""), // PREFIX, joule
// new TestCase("de", "joule", ""), //
new TestCase("de", "kilowatt-hour", "feminine"), // COMPOUND
// new TestCase("de", "electronvolt", ""), //
// new TestCase("de", "british-thermal-unit", ""), //
// new TestCase("de", "therm-us", ""), //
// new TestCase("de", "pound-force", ""), //
// new TestCase("de", "newton", ""), //
// new TestCase("de", "gigahertz", ""), // PREFIX, hertz
// new TestCase("de", "megahertz", ""), // PREFIX, hertz
// new TestCase("de", "kilohertz", ""), // PREFIX, hertz
// new TestCase("de", "hertz", ""), // PREFIX, hertz
// new TestCase("de", "em", ""), //
// new TestCase("de", "pixel", ""), //
// new TestCase("de", "megapixel", ""), //
// new TestCase("de", "pixel-per-centimeter", ""), // COMPOUND, pixel
// new TestCase("de", "pixel-per-inch", ""), // COMPOUND, pixel
// new TestCase("de", "dot-per-centimeter", ""), // COMPOUND, dot
// new TestCase("de", "dot-per-inch", ""), // COMPOUND, dot
// new TestCase("de", "dot", ""), //
// new TestCase("de", "earth-radius", ""), //
new TestCase("de", "decimeter", "masculine"), // PREFIX
new TestCase("de", "micrometer", "masculine"), // PREFIX
new TestCase("de", "nanometer", "masculine"), // PREFIX
// new TestCase("de", "light-year", ""), //
// new TestCase("de", "astronomical-unit", ""), //
// new TestCase("de", "furlong", ""), //
// new TestCase("de", "fathom", ""), //
// new TestCase("de", "nautical-mile", ""), //
// new TestCase("de", "mile-scandinavian", ""), //
// new TestCase("de", "point", ""), //
// new TestCase("de", "lux", ""), //
// new TestCase("de", "candela", ""), //
// new TestCase("de", "lumen", ""), //
// new TestCase("de", "metric-ton", ""), //
new TestCase("de", "microgram", "neuter"), // PREFIX
// new TestCase("de", "ton", ""), //
// new TestCase("de", "stone", ""), //
// new TestCase("de", "ounce-troy", ""), //
// new TestCase("de", "carat", ""), //
new TestCase("de", "gigawatt", "neuter"), // PREFIX
new TestCase("de", "milliwatt", "neuter"), // PREFIX
// new TestCase("de", "horsepower", ""), //
// new TestCase("de", "millimeter-ofhg", ""), //
// new TestCase("de", "pound-force-per-square-inch", ""), // COMPOUND, pound-force
// new TestCase("de", "inch-ofhg", ""), //
// new TestCase("de", "bar", ""), //
// new TestCase("de", "millibar", ""), // PREFIX, bar
// new TestCase("de", "atmosphere", ""), //
// new TestCase("de", "pascal", ""), // PREFIX, kilopascal? neuter?
// new TestCase("de", "hectopascal", ""), // PREFIX, pascal, neuter?
// new TestCase("de", "megapascal", ""), // PREFIX, pascal, neuter?
// new TestCase("de", "knot", ""), //
new TestCase("de", "pound-force-foot", "masculine"), // COMPOUND
new TestCase("de", "newton-meter", "masculine"), // COMPOUND
new TestCase("de", "cubic-kilometer", "masculine"), // POWER
new TestCase("de", "cubic-yard", "neuter"), // POWER
new TestCase("de", "cubic-inch", "masculine"), // POWER
new TestCase("de", "megaliter", "masculine"), // PREFIX
new TestCase("de", "hectoliter", "masculine"), // PREFIX
// new TestCase("de", "pint-metric", ""), //
// new TestCase("de", "cup-metric", ""), //
new TestCase("de", "acre-foot", "masculine"), // COMPOUND
// new TestCase("de", "bushel", ""), //
// new TestCase("de", "barrel", ""), //
// Units missing gender in German are also missing gender in French:
// new TestCase("fr", "revolution", ""), //
// new TestCase("fr", "radian", ""), //
// new TestCase("fr", "arc-minute", ""), //
// new TestCase("fr", "arc-second", ""), //
new TestCase("fr", "square-yard", "masculine"), // COMPOUND
new TestCase("fr", "square-inch", "masculine"), // COMPOUND
// new TestCase("fr", "dunam", ""), //
// new TestCase("fr", "karat", ""), //
new TestCase("fr", "milligram-ofglucose-per-deciliter", "masculine"), // COMPOUND
// new TestCase("fr", "millimole-per-liter", ""), // COMPOUND, mole
// new TestCase("fr", "permillion", ""), //
// new TestCase("fr", "permille", ""), //
// new TestCase("fr", "permyriad", ""), //
// new TestCase("fr", "mole", ""), //
new TestCase("fr", "liter-per-kilometer", "masculine"), // COMPOUND
// new TestCase("fr", "petabyte", ""), // PREFIX
// new TestCase("fr", "terabit", ""), // PREFIX
// new TestCase("fr", "century", ""), //
// new TestCase("fr", "decade", ""), //
new TestCase("fr", "millisecond", "feminine"), // PREFIX
new TestCase("fr", "microsecond", "feminine"), // PREFIX
new TestCase("fr", "nanosecond", "feminine"), // PREFIX
// new TestCase("fr", "ampere", ""), //
// new TestCase("fr", "milliampere", ""), // PREFIX, ampere
// new TestCase("fr", "ohm", ""), //
// new TestCase("fr", "calorie", ""), //
// new TestCase("fr", "kilojoule", ""), // PREFIX, joule
// new TestCase("fr", "joule", ""), //
// new TestCase("fr", "kilowatt-hour", ""), // COMPOUND
// new TestCase("fr", "electronvolt", ""), //
// new TestCase("fr", "british-thermal-unit", ""), //
// new TestCase("fr", "therm-us", ""), //
// new TestCase("fr", "pound-force", ""), //
// new TestCase("fr", "newton", ""), //
// new TestCase("fr", "gigahertz", ""), // PREFIX, hertz
// new TestCase("fr", "megahertz", ""), // PREFIX, hertz
// new TestCase("fr", "kilohertz", ""), // PREFIX, hertz
// new TestCase("fr", "hertz", ""), // PREFIX, hertz
// new TestCase("fr", "em", ""), //
// new TestCase("fr", "pixel", ""), //
// new TestCase("fr", "megapixel", ""), //
// new TestCase("fr", "pixel-per-centimeter", ""), // COMPOUND, pixel
// new TestCase("fr", "pixel-per-inch", ""), // COMPOUND, pixel
// new TestCase("fr", "dot-per-centimeter", ""), // COMPOUND, dot
// new TestCase("fr", "dot-per-inch", ""), // COMPOUND, dot
// new TestCase("fr", "dot", ""), //
// new TestCase("fr", "earth-radius", ""), //
new TestCase("fr", "decimeter", "masculine"), // PREFIX
new TestCase("fr", "micrometer", "masculine"), // PREFIX
new TestCase("fr", "nanometer", "masculine"), // PREFIX
// new TestCase("fr", "light-year", ""), //
// new TestCase("fr", "astronomical-unit", ""), //
// new TestCase("fr", "furlong", ""), //
// new TestCase("fr", "fathom", ""), //
// new TestCase("fr", "nautical-mile", ""), //
// new TestCase("fr", "mile-scandinavian", ""), //
// new TestCase("fr", "point", ""), //
// new TestCase("fr", "lux", ""), //
// new TestCase("fr", "candela", ""), //
// new TestCase("fr", "lumen", ""), //
// new TestCase("fr", "metric-ton", ""), //
new TestCase("fr", "microgram", "masculine"), // PREFIX
// new TestCase("fr", "ton", ""), //
// new TestCase("fr", "stone", ""), //
// new TestCase("fr", "ounce-troy", ""), //
// new TestCase("fr", "carat", ""), //
// new TestCase("fr", "gigawatt", ""), // PREFIX
// new TestCase("fr", "milliwatt", ""), //
// new TestCase("fr", "horsepower", ""), //
new TestCase("fr", "millimeter-ofhg", "masculine"), //
// new TestCase("fr", "pound-force-per-square-inch", ""), // COMPOUND, pound-force
new TestCase("fr", "inch-ofhg", "masculine"), //
// new TestCase("fr", "bar", ""), //
// new TestCase("fr", "millibar", ""), // PREFIX, bar
// new TestCase("fr", "atmosphere", ""), //
// new TestCase("fr", "pascal", ""), // PREFIX, kilopascal?
// new TestCase("fr", "hectopascal", ""), // PREFIX, pascal
// new TestCase("fr", "megapascal", ""), // PREFIX, pascal
// new TestCase("fr", "knot", ""), //
// new TestCase("fr", "pound-force-foot", ""), //
// new TestCase("fr", "newton-meter", ""), //
new TestCase("fr", "cubic-kilometer", "masculine"), // POWER
new TestCase("fr", "cubic-yard", "masculine"), // POWER
new TestCase("fr", "cubic-inch", "masculine"), // POWER
new TestCase("fr", "megaliter", "masculine"), // PREFIX
new TestCase("fr", "hectoliter", "masculine"), // PREFIX
// new TestCase("fr", "pint-metric", ""), //
// new TestCase("fr", "cup-metric", ""), //
new TestCase("fr", "acre-foot", "feminine"), // COMPOUND
// new TestCase("fr", "bushel", ""), //
// new TestCase("fr", "barrel", ""), //
// Some more French units missing gender:
// new TestCase("fr", "degree", ""), //
new TestCase("fr", "square-meter", "masculine"), // COMPOUND
// new TestCase("fr", "terabyte", ""), // PREFIX, byte
// new TestCase("fr", "gigabyte", ""), // PREFIX, byte
// new TestCase("fr", "gigabit", ""), // PREFIX, bit
// new TestCase("fr", "megabyte", ""), // PREFIX, byte
// new TestCase("fr", "megabit", ""), // PREFIX, bit
// new TestCase("fr", "kilobyte", ""), // PREFIX, byte
// new TestCase("fr", "kilobit", ""), // PREFIX, bit
// new TestCase("fr", "byte", ""), //
// new TestCase("fr", "bit", ""), //
// new TestCase("fr", "volt", ""), //
new TestCase("fr", "cubic-meter", "masculine"), // POWER
// gender-lacking builtins within compound units
new TestCase("de", "newton-meter-per-second", "masculine"),
// TODO(ICU-21494): determine whether list genders behave as follows,
// and implement proper getListGender support (covering more than just
// two genders):