diff --git a/icu4c/source/common/uresimp.h b/icu4c/source/common/uresimp.h index 69d82566fe0..f038dedace3 100644 --- a/icu4c/source/common/uresimp.h +++ b/icu4c/source/common/uresimp.h @@ -270,11 +270,13 @@ ures_getByKeyWithFallback(const UResourceBundle *resB, * function can perform fallback on the sub-resources of the table. * @param resB a resource * @param inKey a key associated with the requested resource + * @param len if not NULL, used to return the length of the string * @param status: fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING - * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must delete it + * @return returns a pointer to a zero-terminated UChar array which lives in a + * memory mapped/DLL file. */ U_CAPI const UChar* U_EXPORT2 ures_getStringByKeyWithFallback(const UResourceBundle *resB, diff --git a/icu4c/source/i18n/number_longnames.cpp b/icu4c/source/i18n/number_longnames.cpp index 7e5e3cabdb3..6c7a56e5f93 100644 --- a/icu4c/source/i18n/number_longnames.cpp +++ b/icu4c/source/i18n/number_longnames.cpp @@ -183,9 +183,10 @@ void extractCorePattern(const UnicodeString &pattern, // Gets the gender of a built-in unit: unit must be a built-in. Returns an empty // string both in case of unknown gender and in case of unknown unit. -const char *getGenderForBuiltin(const Locale &locale, MeasureUnit builtinUnit, UErrorCode &status) { +UnicodeString +getGenderForBuiltin(const Locale &locale, const MeasureUnit &builtinUnit, UErrorCode &status) { LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status)); - if (U_FAILURE(status)) { return ""; } + if (U_FAILURE(status)) { return {}; } // Map duration-year-person, duration-week-person, etc. to duration-year, duration-week, ... // TODO(ICU-20400): Get duration-*-person data properly with aliases. 
@@ -205,18 +206,18 @@ const char *getGenderForBuiltin(const Locale &locale, MeasureUnit builtinUnit, U key.append("/gender", status); UErrorCode localStatus = status; - StackUResourceBundle fillIn; - ures_getByKeyWithFallback(unitsBundle.getAlias(), key.data(), fillIn.getAlias(), &localStatus); + int32_t resultLen = 0; + const UChar *result = + ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &resultLen, &localStatus); if (U_SUCCESS(localStatus)) { status = localStatus; - UnicodeString directString = ures_getUnicodeString(fillIn.getAlias(), &status); - return getGenderString(directString, status); + return UnicodeString(true, result, resultLen); } else { // TODO(icu-units#28): "$unitRes/gender" does not exist. Do we want to // check whether the parent "$unitRes" exists? Then we could return // U_MISSING_RESOURCE_ERROR for incorrect usage (e.g. builtinUnit not // being a builtin). - return ""; + return {}; } } @@ -778,6 +779,132 @@ const UChar *trimSpaceChars(const UChar *s, int32_t &length) { return s + start; } +/** + * Calculates the gender of an arbitrary unit: this is the *second* + * implementation of an algorithm to do this: + * + * Gender is also calculated in "processPatternTimes": that code path is "bottom + * up", loading the gender for every component of a compound unit (at the same + * time as loading the Long Names formatting patterns), even if the gender is + * unneeded, then combining the single units' genders into the compound unit's + * gender, according to the rules. This algorithm does a lazier "top-down" + * evaluation, starting with the compound unit, calculating which single unit's + * gender is needed by breaking it down according to the rules, and then loading + * only the gender of the one single unit whose gender is needed. + * + * For future refactorings: + * 1. 
we could drop processPatternTimes' gender calculation and just call this + * function: for UNUM_UNIT_WIDTH_FULL_NAME, the unit gender is in the very + * same table as the formatting patterns, so loading it then may be + * efficient. For other unit widths however, it needs to be explicitly looked + * up anyway. + * 2. alternatively, if CLDR is providing all the genders we need such that we + * don't need to calculate them in ICU anymore, we could drop this function + * and keep only processPatternTimes' calculation. (And optimise it a bit?) + * + * @param locale The desired locale. + * @param unit The measure unit to calculate the gender for. + * @return The gender string for the unit, or an empty string if unknown or + * ungendered. + */ +UnicodeString calculateGenderForUnit(const Locale &locale, const MeasureUnit &unit, UErrorCode &status) { + MeasureUnitImpl impl; + const MeasureUnitImpl& mui = MeasureUnitImpl::forMeasureUnit(unit, impl, status); + int32_t singleUnitIndex = 0; + if (mui.complexity == UMEASURE_UNIT_COMPOUND) { + int32_t startSlice = 0; + // inclusive + int32_t endSlice = mui.singleUnits.length()-1; + U_ASSERT(endSlice > 0); // Else it would not be COMPOUND + if (mui.singleUnits[endSlice]->dimensionality < 0) { + // We have a -per- construct + UnicodeString perRule = getDeriveCompoundRule(locale, "gender", "per", status); + if (perRule.length() != 1) { + // Fixed gender for -per- units + return perRule; + } + if (perRule[0] == u'1') { + // Find the start of the denominator. We already know there is one. + while (mui.singleUnits[startSlice]->dimensionality >= 0) { + startSlice++; + } + } else { + // Find the end of the numerator + while (endSlice >= 0 && mui.singleUnits[endSlice]->dimensionality < 0) { + endSlice--; + } + if (endSlice < 0) { + // We have only a denominator, e.g. "per-second". + // TODO(icu-units#28): find out what gender to use in the + // absence of a first value - mentioned in CLDR-14253. 
+ return {}; + } + } + } + if (endSlice > startSlice) { + // We have a -times- construct + UnicodeString timesRule = getDeriveCompoundRule(locale, "gender", "times", status); + if (timesRule.length() != 1) { + // Fixed gender for -times- units + return timesRule; + } + if (timesRule[0] == u'0') { + endSlice = startSlice; + } else { + // We assume timesRule[0] == u'1' + startSlice = endSlice; + } + } + U_ASSERT(startSlice == endSlice); + singleUnitIndex = startSlice; + } else if (mui.complexity == UMEASURE_UNIT_MIXED) { + status = U_INTERNAL_PROGRAM_ERROR; + return {}; + } else { + U_ASSERT(mui.complexity == UMEASURE_UNIT_SINGLE); + U_ASSERT(mui.singleUnits.length() == 1); + } + + // Now we know which singleUnit's gender we want + const SingleUnitImpl *singleUnit = mui.singleUnits[singleUnitIndex]; + // Check for any power-prefix gender override: + if (std::abs(singleUnit->dimensionality) != 1) { + UnicodeString powerRule = getDeriveCompoundRule(locale, "gender", "power", status); + if (powerRule.length() != 1) { + // Fixed gender for -powN- units + return powerRule; + } + // powerRule[0] == u'0'; u'1' not currently in spec. + } + // Check for any SI and binary prefix gender override. NOTE(review): the condition below repeats the dimensionality test of the power check above rather than inspecting the unit's prefix - confirm this is intended: + if (std::abs(singleUnit->dimensionality) != 1) { + UnicodeString prefixRule = getDeriveCompoundRule(locale, "gender", "prefix", status); + if (prefixRule.length() != 1) { + // Fixed gender for prefixed units + return prefixRule; + } + // prefixRule[0] == u'0'; u'1' not currently in spec. 
+ } + // Now we've boiled it down to the gender of one simple unit identifier: + return getGenderForBuiltin(locale, MeasureUnit::forIdentifier(singleUnit->getSimpleUnitID(), status), + status); +} + +void maybeCalculateGender(const Locale &locale, + const MeasureUnit &unitRef, + UnicodeString *outArray, + UErrorCode &status) { + if (outArray[GENDER_INDEX].isBogus()) { + UnicodeString meterGender = getGenderForBuiltin(locale, MeasureUnit::getMeter(), status); + if (meterGender.isEmpty()) { + // No gender for meter: assume ungendered language + return; + } + // We have a gendered language, but are lacking gender for unitRef. + outArray[GENDER_INDEX] = calculateGenderForUnit(locale, unitRef, status); + } +} + } // namespace void LongNameHandler::forMeasureUnit(const Locale &loc, @@ -802,6 +929,7 @@ void LongNameHandler::forMeasureUnit(const Locale &loc, // - If result is not empty, return it UnicodeString simpleFormats[ARRAY_LENGTH]; getMeasureData(loc, unitRef, width, unitDisplayCase, simpleFormats, status); + maybeCalculateGender(loc, unitRef, simpleFormats, status); if (U_FAILURE(status)) { return; } @@ -1003,6 +1131,7 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit, // - Do those unit tests cover this code path representatively? if (builtinUnit != MeasureUnit()) { getMeasureData(loc, builtinUnit, width, caseVariant, outArray, status); + maybeCalculateGender(loc, builtinUnit, outArray, status); } return; } @@ -1059,8 +1188,8 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit, } // 4.2. 
Get the gender of that single_unit - MeasureUnit builtinUnit; - if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &builtinUnit)) { + MeasureUnit simpleUnit; + if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &simpleUnit)) { // Ideally all simple units should be known, but they're not: // 100-kilometer is internally treated as a simple unit, but it is // not a built-in unit and does not have formatting data in CLDR 39. @@ -1069,7 +1198,7 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit, status = U_UNSUPPORTED_ERROR; return; } - const char *gender = getGenderForBuiltin(loc, builtinUnit, status); + const char *gender = getGenderString(getGenderForBuiltin(loc, simpleUnit, status), status); // 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-' U_ASSERT(singleUnit->dimensionality > 0); @@ -1157,13 +1286,9 @@ void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit, getDerivedGender(loc, "prefix", singleUnitArray, nullptr, status); } - // Powers use compoundUnitPattern1, dimensionalityPrefixPatterns may - // have a "gender" element - // - // TODO(icu-units#28): untested: no locale data uses this currently: if (dimensionality != 1) { - singleUnitArray[GENDER_INDEX] = getDerivedGender(loc, "power", singleUnitArray, - dimensionalityPrefixPatterns, status); + singleUnitArray[GENDER_INDEX] = + getDerivedGender(loc, "power", singleUnitArray, nullptr, status); } UnicodeString timesGenderRule = getDeriveCompoundRule(loc, "gender", "times", status); @@ -1448,6 +1573,8 @@ void MixedUnitLongNameHandler::forMeasureUnit(const Locale &loc, // propagation of unitDisplayCase is correct: getMeasureData(loc, impl.singleUnits[i]->build(status), width, unitDisplayCase, unitData, status); + // TODO(ICU-21494): if we add support for gender for mixed units, we may + // need maybeCalculateGender() here. 
} // TODO(icu-units#120): Make sure ICU doesn't output zero-valued diff --git a/icu4c/source/test/intltest/numbertest_api.cpp b/icu4c/source/test/intltest/numbertest_api.cpp index 0da4d2ddee5..4d74a55842b 100644 --- a/icu4c/source/test/intltest/numbertest_api.cpp +++ b/icu4c/source/test/intltest/numbertest_api.cpp @@ -2291,27 +2291,258 @@ void NumberFormatterApiTest::unitGender() { const char *unitIdentifier; const char *expectedGender; } cases[] = { + {"de", "inch", "masculine"}, + {"de", "yard", "neuter"}, {"de", "meter", "masculine"}, + {"de", "liter", "masculine"}, {"de", "second", "feminine"}, {"de", "minute", "feminine"}, {"de", "hour", "feminine"}, {"de", "day", "masculine"}, {"de", "year", "neuter"}, + {"de", "gram", "neuter"}, + {"de", "watt", "neuter"}, + {"de", "bit", "neuter"}, + {"de", "byte", "neuter"}, + + {"fr", "inch", "masculine"}, + {"fr", "yard", "masculine"}, {"fr", "meter", "masculine"}, + {"fr", "liter", "masculine"}, {"fr", "second", "feminine"}, {"fr", "minute", "feminine"}, {"fr", "hour", "feminine"}, {"fr", "day", "masculine"}, + {"fr", "year", "masculine"}, + {"fr", "gram", "masculine"}, + // grammaticalFeatures deriveCompound "per" rule takes the gender of the // numerator unit: {"de", "meter-per-hour", "masculine"}, {"fr", "meter-per-hour", "masculine"}, {"af", "meter-per-hour", ""}, // ungendered language + // French "times" takes gender from first value, German takes the // second. Prefix and power does not have impact on gender for these // languages: {"de", "square-decimeter-square-second", "feminine"}, {"fr", "square-decimeter-square-second", "masculine"}, + + // TODO(icu-units#149): percent and permille bypasses LongNameHandler + // when unitWidth is not FULL_NAME: + // // Gender of per-second might be that of percent? 
TODO(icu-units#28) + // {"de", "percent", "neuter"}, + // {"fr", "percent", "masculine"}, + + // Built-in units whose simple units lack gender in the CLDR data file + {"de", "kilopascal", "neuter"}, + {"fr", "kilopascal", "masculine"}, + // {"de", "pascal", ""}, + // {"fr", "pascal", ""}, + + // Built-in units that lack gender in the CLDR data file + // {"de", "revolution", ""}, + // {"de", "radian", ""}, + // {"de", "arc-minute", ""}, + // {"de", "arc-second", ""}, + {"de", "square-yard", "neuter"}, // POWER + {"de", "square-inch", "masculine"}, // POWER + // {"de", "dunam", ""}, + // {"de", "karat", ""}, + // {"de", "milligram-ofglucose-per-deciliter", ""}, // COMPOUND, ofglucose + // {"de", "millimole-per-liter", ""}, // COMPOUND, mole + // {"de", "permillion", ""}, + // {"de", "permille", ""}, + // {"de", "permyriad", ""}, + // {"de", "mole", ""}, + {"de", "liter-per-kilometer", "masculine"}, // COMPOUND + {"de", "petabyte", "neuter"}, // PREFIX + {"de", "terabit", "neuter"}, // PREFIX + // {"de", "century", ""}, + // {"de", "decade", ""}, + {"de", "millisecond", "feminine"}, // PREFIX + {"de", "microsecond", "feminine"}, // PREFIX + {"de", "nanosecond", "feminine"}, // PREFIX + // {"de", "ampere", ""}, + // {"de", "milliampere", ""}, // PREFIX, ampere + // {"de", "ohm", ""}, + // {"de", "calorie", ""}, + // {"de", "kilojoule", ""}, // PREFIX, joule + // {"de", "joule", ""}, + {"de", "kilowatt-hour", "feminine"}, // COMPOUND + // {"de", "electronvolt", ""}, + // {"de", "british-thermal-unit", ""}, + // {"de", "therm-us", ""}, + // {"de", "pound-force", ""}, + // {"de", "newton", ""}, + // {"de", "gigahertz", ""}, // PREFIX, hertz + // {"de", "megahertz", ""}, // PREFIX, hertz + // {"de", "kilohertz", ""}, // PREFIX, hertz + // {"de", "hertz", ""}, + // {"de", "em", ""}, + // {"de", "pixel", ""}, + // {"de", "megapixel", ""}, + // {"de", "pixel-per-centimeter", ""}, // COMPOUND, pixel + // {"de", "pixel-per-inch", ""}, // COMPOUND, pixel + // {"de", 
"dot-per-centimeter", ""}, // COMPOUND, dot + // {"de", "dot-per-inch", ""}, // COMPOUND, dot + // {"de", "dot", ""}, + // {"de", "earth-radius", ""}, + {"de", "decimeter", "masculine"}, // PREFIX + {"de", "micrometer", "masculine"}, // PREFIX + {"de", "nanometer", "masculine"}, // PREFIX + // {"de", "light-year", ""}, + // {"de", "astronomical-unit", ""}, + // {"de", "furlong", ""}, + // {"de", "fathom", ""}, + // {"de", "nautical-mile", ""}, + // {"de", "mile-scandinavian", ""}, + // {"de", "point", ""}, + // {"de", "lux", ""}, + // {"de", "candela", ""}, + // {"de", "lumen", ""}, + // {"de", "metric-ton", ""}, + // {"de", "microgram", "neuter"}, // PREFIX + // {"de", "ton", ""}, + // {"de", "stone", ""}, + // {"de", "ounce-troy", ""}, + // {"de", "carat", ""}, + {"de", "gigawatt", "neuter"}, // PREFIX + {"de", "milliwatt", "neuter"}, // PREFIX + // {"de", "horsepower", ""}, + // {"de", "millimeter-ofhg", ""}, + // {"de", "pound-force-per-square-inch", ""}, // COMPOUND, pound-force + // {"de", "inch-ofhg", ""}, + // {"de", "bar", ""}, + // {"de", "millibar", ""}, // PREFIX, bar + // {"de", "atmosphere", ""}, + // {"de", "pascal", ""}, // PREFIX, kilopascal? neuter? + // {"de", "hectopascal", ""}, // PREFIX, pascal, neuter? + // {"de", "megapascal", ""}, // PREFIX, pascal, neuter? 
+ // {"de", "knot", ""}, + {"de", "pound-force-foot", "masculine"}, // COMPOUND + {"de", "newton-meter", "masculine"}, // COMPOUND + {"de", "cubic-kilometer", "masculine"}, // POWER + {"de", "cubic-yard", "neuter"}, // POWER + {"de", "cubic-inch", "masculine"}, // POWER + {"de", "megaliter", "masculine"}, // PREFIX + {"de", "hectoliter", "masculine"}, // PREFIX + // {"de", "pint-metric", ""}, + // {"de", "cup-metric", ""}, + {"de", "acre-foot", "masculine"}, // COMPOUND + // {"de", "bushel", ""}, + // {"de", "barrel", ""}, + // Units missing gender in German also misses gender in French: + // {"fr", "revolution", ""}, + // {"fr", "radian", ""}, + // {"fr", "arc-minute", ""}, + // {"fr", "arc-second", ""}, + {"fr", "square-yard", "masculine"}, // POWER + {"fr", "square-inch", "masculine"}, // POWER + // {"fr", "dunam", ""}, + // {"fr", "karat", ""}, + {"fr", "milligram-ofglucose-per-deciliter", "masculine"}, // COMPOUND + // {"fr", "millimole-per-liter", ""}, // COMPOUND, mole + // {"fr", "permillion", ""}, + // {"fr", "permille", ""}, + // {"fr", "permyriad", ""}, + // {"fr", "mole", ""}, + {"fr", "liter-per-kilometer", "masculine"}, // COMPOUND + // {"fr", "petabyte", ""}, // PREFIX + // {"fr", "terabit", ""}, // PREFIX + // {"fr", "century", ""}, + // {"fr", "decade", ""}, + {"fr", "millisecond", "feminine"}, // PREFIX + {"fr", "microsecond", "feminine"}, // PREFIX + {"fr", "nanosecond", "feminine"}, // PREFIX + // {"fr", "ampere", ""}, + // {"fr", "milliampere", ""}, // PREFIX, ampere + // {"fr", "ohm", ""}, + // {"fr", "calorie", ""}, + // {"fr", "kilojoule", ""}, // PREFIX, joule + // {"fr", "joule", ""}, + // {"fr", "kilowatt-hour", ""}, // COMPOUND + // {"fr", "electronvolt", ""}, + // {"fr", "british-thermal-unit", ""}, + // {"fr", "therm-us", ""}, + // {"fr", "pound-force", ""}, + // {"fr", "newton", ""}, + // {"fr", "gigahertz", ""}, // PREFIX, hertz + // {"fr", "megahertz", ""}, // PREFIX, hertz + // {"fr", "kilohertz", ""}, // PREFIX, hertz + // {"fr", 
"hertz", ""}, + // {"fr", "em", ""}, + // {"fr", "pixel", ""}, + // {"fr", "megapixel", ""}, + // {"fr", "pixel-per-centimeter", ""}, // COMPOUND, pixel + // {"fr", "pixel-per-inch", ""}, // COMPOUND, pixel + // {"fr", "dot-per-centimeter", ""}, // COMPOUND, dot + // {"fr", "dot-per-inch", ""}, // COMPOUND, dot + // {"fr", "dot", ""}, + // {"fr", "earth-radius", ""}, + {"fr", "decimeter", "masculine"}, // PREFIX + {"fr", "micrometer", "masculine"}, // PREFIX + {"fr", "nanometer", "masculine"}, // PREFIX + // {"fr", "light-year", ""}, + // {"fr", "astronomical-unit", ""}, + // {"fr", "furlong", ""}, + // {"fr", "fathom", ""}, + // {"fr", "nautical-mile", ""}, + // {"fr", "mile-scandinavian", ""}, + // {"fr", "point", ""}, + // {"fr", "lux", ""}, + // {"fr", "candela", ""}, + // {"fr", "lumen", ""}, + // {"fr", "metric-ton", ""}, + // {"fr", "microgram", "masculine"}, // PREFIX + // {"fr", "ton", ""}, + // {"fr", "stone", ""}, + // {"fr", "ounce-troy", ""}, + // {"fr", "carat", ""}, + // {"fr", "gigawatt", ""}, // PREFIX + // {"fr", "milliwatt", ""}, + // {"fr", "horsepower", ""}, + {"fr", "millimeter-ofhg", "masculine"}, + // {"fr", "pound-force-per-square-inch", ""}, // COMPOUND, pound-force + {"fr", "inch-ofhg", "masculine"}, + // {"fr", "bar", ""}, + // {"fr", "millibar", ""}, // PREFIX, bar + // {"fr", "atmosphere", ""}, + // {"fr", "pascal", ""}, // PREFIX, kilopascal? 
+ // {"fr", "hectopascal", ""}, // PREFIX, pascal + // {"fr", "megapascal", ""}, // PREFIX, pascal + // {"fr", "knot", ""}, + // {"fr", "pound-force-foot", ""}, + // {"fr", "newton-meter", ""}, + {"fr", "cubic-kilometer", "masculine"}, // POWER + {"fr", "cubic-yard", "masculine"}, // POWER + {"fr", "cubic-inch", "masculine"}, // POWER + {"fr", "megaliter", "masculine"}, // PREFIX + {"fr", "hectoliter", "masculine"}, // PREFIX + // {"fr", "pint-metric", ""}, + // {"fr", "cup-metric", ""}, + {"fr", "acre-foot", "feminine"}, // COMPOUND + // {"fr", "bushel", ""}, + // {"fr", "barrel", ""}, + // Some more French units missing gender: + // {"fr", "degree", ""}, + {"fr", "square-meter", "masculine"}, // POWER + // {"fr", "terabyte", ""}, // PREFIX, byte + // {"fr", "gigabyte", ""}, // PREFIX, byte + // {"fr", "gigabit", ""}, // PREFIX, bit + // {"fr", "megabyte", ""}, // PREFIX, byte + // {"fr", "megabit", ""}, // PREFIX, bit + // {"fr", "kilobyte", ""}, // PREFIX, byte + // {"fr", "kilobit", ""}, // PREFIX, bit + // {"fr", "byte", ""}, + // {"fr", "bit", ""}, + // {"fr", "volt", ""}, + // {"fr", "watt", ""}, + {"fr", "cubic-meter", "masculine"}, // POWER + + // gender-lacking builtins within compound units + {"de", "newton-meter-per-second", "masculine"}, + // TODO(ICU-21494): determine whether list genders behave as follows, // and implement proper getListGender support (covering more than just // two genders): diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/LongNameHandler.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/LongNameHandler.java index eeb959d8943..080efa5a1b5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/LongNameHandler.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/LongNameHandler.java @@ -609,6 +609,129 @@ public class LongNameHandler /// END DATA LOADING /// //////////////////////// + /** + * Calculates the gender of an arbitrary unit: this is the *second* + * implementation of an algorithm 
to do this: + * + * Gender is also calculated in "processPatternTimes": that code path is + * "bottom up", loading the gender for every component of a compound unit + * (at the same time as loading the Long Names formatting patterns), even if + * the gender is unneeded, then combining the single units' genders into the + * compound unit's gender, according to the rules. This algorithm does a + * lazier "top-down" evaluation, starting with the compound unit, + * calculating which single unit's gender is needed by breaking it down + * according to the rules, and then loading only the gender of the one + * single unit whose gender is needed. + * + * For future refactorings: + * 1. we could drop processPatternTimes' gender calculation and just call + * this function: for UNUM_UNIT_WIDTH_FULL_NAME, the unit gender is in + * the very same table as the formatting patterns, so loading it then may + * be efficient. For other unit widths however, it needs to be explicitly + * looked up anyway. + * 2. alternatively, if CLDR is providing all the genders we need such that + * we don't need to calculate them in ICU anymore, we could drop this + * function and keep only processPatternTimes' calculation. (And optimise + * it a bit?) + * + * @param locale The desired locale. + * @param unit The measure unit to calculate the gender for. + * @return The gender string for the unit, or an empty string if unknown or + * ungendered. 
+ */ + private static String calculateGenderForUnit(ULocale locale, MeasureUnit unit) { + MeasureUnitImpl mui = unit.getCopyOfMeasureUnitImpl(); + ArrayList singleUnits = mui.getSingleUnits(); + int singleUnitIndex = 0; + if (mui.getComplexity() == MeasureUnit.Complexity.COMPOUND) { + int startSlice = 0; + // inclusive + int endSlice = singleUnits.size() - 1; + assert endSlice > 0 : "COMPOUND units have more than one single unit"; + if (singleUnits.get(endSlice).getDimensionality() < 0) { + // We have a -per- construct + String perRule = getDeriveCompoundRule(locale, "gender", "per"); + if (perRule.length() != 1) { + // Fixed gender for -per- units + return perRule; + } + if (perRule.charAt(0) == '1') { + // Find the start of the denominator. We already know there is one. + while (singleUnits.get(startSlice).getDimensionality() >= 0) { + startSlice++; + } + } else { + // Find the end of the numerator + while (endSlice >= 0 && singleUnits.get(endSlice).getDimensionality() < 0) { + endSlice--; + } + if (endSlice < 0) { + // We have only a denominator, e.g. "per-second". + // TODO(icu-units#28): find out what gender to use in the + // absence of a first value - mentioned in CLDR-14253. 
+ return ""; + } + } + } + if (endSlice > startSlice) { + // We have a -times- construct + String timesRule = getDeriveCompoundRule(locale, "gender", "times"); + if (timesRule.length() != 1) { + // Fixed gender for -times- units + return timesRule; + } + if (timesRule.charAt(0) == '0') { + endSlice = startSlice; + } else { + // We assume timesRule.charAt(0) == '1' + startSlice = endSlice; + } + } + assert startSlice == endSlice; + singleUnitIndex = startSlice; + } else if (mui.getComplexity() == MeasureUnit.Complexity.MIXED) { + throw new ICUException("calculateGenderForUnit does not support MIXED units"); + } else { + assert mui.getComplexity() == MeasureUnit.Complexity.SINGLE; + assert singleUnits.size() == 1; + } + + // Now we know which singleUnit's gender we want + SingleUnitImpl singleUnit = singleUnits.get(singleUnitIndex); + // Check for any power-prefix gender override: + if (Math.abs(singleUnit.getDimensionality()) != 1) { + String powerRule = getDeriveCompoundRule(locale, "gender", "power"); + if (powerRule.length() != 1) { + // Fixed gender for -powN- units + return powerRule; + } + // powerRule.charAt(0) == '0'; '1' not currently in spec. + } + // Check for any SI and binary prefix gender override. NOTE(review): the condition below repeats the dimensionality test of the power check above rather than inspecting the unit's prefix - confirm this is intended: + if (Math.abs(singleUnit.getDimensionality()) != 1) { + String prefixRule = getDeriveCompoundRule(locale, "gender", "prefix"); + if (prefixRule.length() != 1) { + // Fixed gender for prefixed units + return prefixRule; + } + // prefixRule.charAt(0) == '0'; '1' not currently in spec. 
+ } + // Now we've boiled it down to the gender of one simple unit identifier: + return getGenderForBuiltin(locale, MeasureUnit.forIdentifier(singleUnit.getSimpleUnitID())); + } + + private static void maybeCalculateGender(ULocale locale, MeasureUnit unit, String[] outArray) { + if (outArray[GENDER_INDEX] == null) { + String meterGender = getGenderForBuiltin(locale, MeasureUnit.METER); + if (meterGender.isEmpty()) { + // No gender for meter: assume ungendered language + return; + } + // We have a gendered language, but are lacking gender for unitRef. + outArray[GENDER_INDEX] = calculateGenderForUnit(locale, unit); + } + } + private final Map modifiers; private final PluralRules rules; private final MicroPropsGenerator parent; @@ -674,6 +797,7 @@ public class LongNameHandler if (unit.getType() != null) { String[] simpleFormats = new String[ARRAY_LENGTH]; getMeasureData(locale, unit, width, unitDisplayCase, simpleFormats); + maybeCalculateGender(locale, unit, simpleFormats); // TODO(ICU4J): Reduce the number of object creations here? Map modifiers = new EnumMap<>(StandardPlural.class); LongNameHandler result = new LongNameHandler(modifiers, rules, parent); @@ -842,8 +966,8 @@ public class LongNameHandler return; } - MeasureUnit builtinUnit = MeasureUnit.findBySubType(productUnit.getIdentifier()); - if (builtinUnit != null) { + MeasureUnit simpleUnit = MeasureUnit.findBySubType(productUnit.getIdentifier()); + if (simpleUnit != null) { // TODO(icu-units#145): spec doesn't cover builtin-per-builtin, it // breaks them all down. Do we want to drop this? // - findBySubType isn't super efficient, if we skip it and go to basic @@ -851,7 +975,8 @@ public class LongNameHandler // - Check all the existing unit tests that fail without this: is it due // to incorrect fallback via getMeasureData? // - Do those unit tests cover this code path representatively? 
- getMeasureData(loc, builtinUnit, width, caseVariant, outArray); + getMeasureData(loc, simpleUnit, width, caseVariant, outArray); + maybeCalculateGender(loc, simpleUnit, outArray); return; } @@ -906,8 +1031,8 @@ public class LongNameHandler } // 4.2. Get the gender of that single_unit - builtinUnit = MeasureUnit.findBySubType(singleUnit.getSimpleUnitID()); - if (builtinUnit == null) { + simpleUnit = MeasureUnit.findBySubType(singleUnit.getSimpleUnitID()); + if (simpleUnit == null) { // Ideally all simple units should be known, but they're not: // 100-kilometer is internally treated as a simple unit, but it is // not a built-in unit and does not have formatting data in CLDR 39. @@ -916,7 +1041,7 @@ public class LongNameHandler throw new UnsupportedOperationException("Unsupported sinlgeUnit: " + singleUnit.getSimpleUnitID()); } - String gender = getGenderForBuiltin(loc, builtinUnit); + String gender = getGenderForBuiltin(loc, simpleUnit); // 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-' assert singleUnit.getDimensionality() > 0; @@ -1003,13 +1128,9 @@ public class LongNameHandler getDerivedGender(loc, "prefix", singleUnitArray, null); } - // Powers use compoundUnitPattern1, dimensionalityPrefixPatterns may - // have a "gender" element - // - // TODO(icu-units#28): untested: no locale data uses this currently: if (dimensionality != 1) { singleUnitArray[GENDER_INDEX] = - getDerivedGender(loc, "power", singleUnitArray, dimensionalityPrefixPatterns); + getDerivedGender(loc, "power", singleUnitArray, null); } String timesGenderRule = getDeriveCompoundRule(loc, "gender", "times"); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/MixedUnitLongNameHandler.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/MixedUnitLongNameHandler.java index dcce2a204d3..b9f388d5a99 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/MixedUnitLongNameHandler.java +++ 
b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/MixedUnitLongNameHandler.java @@ -79,6 +79,8 @@ public class MixedUnitLongNameHandler String[] unitData = new String[LongNameHandler.ARRAY_LENGTH]; LongNameHandler.getMeasureData(locale, individualUnits.get(i), width, unitDisplayCase, unitData); + // TODO(ICU-21494): if we add support for gender for mixed units, we may + // need LongNameHandler.maybeCalculateGender() here. result.fMixedUnitData.add(unitData); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java b/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java index 5908d80fe5c..5f0cb47c798 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java @@ -144,6 +144,9 @@ public class FormattedNumber implements FormattedValue { */ @Deprecated public String getGender() { + if (this.gender == null) { + return ""; + } return this.gender; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java index 85670b2ef14..66a688039ba 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java @@ -2295,27 +2295,257 @@ public class NumberFormatterApiTest extends TestFmwk { } TestCase cases[] = { - new TestCase("de", "meter", "masculine"), - new TestCase("de", "second", "feminine"), - new TestCase("de", "minute", "feminine"), - new TestCase("de", "hour", "feminine"), - new TestCase("de", "day", "masculine"), - new TestCase("de", "year", "neuter"), - new TestCase("fr", "meter", "masculine"), - new TestCase("fr", "second", "feminine"), - new TestCase("fr", "minute", "feminine"), - new TestCase("fr", "hour", "feminine"), - new TestCase("fr", "day", "masculine"), + new TestCase("de", "inch", 
"masculine"), // + new TestCase("de", "yard", "neuter"), // + new TestCase("de", "meter", "masculine"), // + new TestCase("de", "liter", "masculine"), // + new TestCase("de", "second", "feminine"), // + new TestCase("de", "minute", "feminine"), // + new TestCase("de", "hour", "feminine"), // + new TestCase("de", "day", "masculine"), // + new TestCase("de", "year", "neuter"), // + new TestCase("de", "gram", "neuter"), // + new TestCase("de", "watt", "neuter"), // + new TestCase("de", "bit", "neuter"), // + new TestCase("de", "byte", "neuter"), // + + new TestCase("fr", "inch", "masculine"), // + new TestCase("fr", "yard", "masculine"), // + new TestCase("fr", "meter", "masculine"), // + new TestCase("fr", "liter", "masculine"), // + new TestCase("fr", "second", "feminine"), // + new TestCase("fr", "minute", "feminine"), // + new TestCase("fr", "hour", "feminine"), // + new TestCase("fr", "day", "masculine"), // + new TestCase("fr", "year", "masculine"), // + new TestCase("fr", "gram", "masculine"), // + // grammaticalFeatures deriveCompound "per" rule takes the gender of the // numerator unit: new TestCase("de", "meter-per-hour", "masculine"), new TestCase("fr", "meter-per-hour", "masculine"), - new TestCase("af", "meter-per-hour", null), // ungendered language + new TestCase("af", "meter-per-hour", ""), // ungendered language + // French "times" takes gender from first value, German takes the // second. Prefix and power does not have impact on gender for these // languages: new TestCase("de", "square-decimeter-square-second", "feminine"), new TestCase("fr", "square-decimeter-square-second", "masculine"), + + // TODO(icu-units#149): percent and permille bypasses + // LongNameHandler when unitWidth is not FULL_NAME: + // // Gender of per-second might be that of percent? 
TODO(icu-units#28) + // new TestCase("de", "percent", "neuter"), // + // new TestCase("fr", "percent", "masculine"), // + + // Built-in units whose simple units lack gender in the CLDR data file + new TestCase("de", "kilopascal", "neuter"), // + new TestCase("fr", "kilopascal", "masculine"), // + // new TestCase("de", "pascal", ""), // + // new TestCase("fr", "pascal", ""), // + + // Built-in units that lack gender in the CLDR data file + // new TestCase("de", "revolution", ""), // + // new TestCase("de", "radian", ""), // + // new TestCase("de", "arc-minute", ""), // + // new TestCase("de", "arc-second", ""), // + new TestCase("de", "square-yard", "neuter"), // COMPOUND + new TestCase("de", "square-inch", "masculine"), // COMPOUND + // new TestCase("de", "dunam", ""), // + // new TestCase("de", "karat", ""), // + // new TestCase("de", "milligram-ofglucose-per-deciliter", ""), // COMPOUND, ofglucose + // new TestCase("de", "millimole-per-liter", ""), // COMPOUND, mole + // new TestCase("de", "permillion", ""), // + // new TestCase("de", "permille", ""), // + // new TestCase("de", "permyriad", ""), // + // new TestCase("de", "mole", ""), // + new TestCase("de", "liter-per-kilometer", "masculine"), // COMPOUND + new TestCase("de", "petabyte", "neuter"), // PREFIX + new TestCase("de", "terabit", "neuter"), // PREFIX + // new TestCase("de", "century", ""), // + // new TestCase("de", "decade", ""), // + new TestCase("de", "millisecond", "feminine"), // PREFIX + new TestCase("de", "microsecond", "feminine"), // PREFIX + new TestCase("de", "nanosecond", "feminine"), // PREFIX + // new TestCase("de", "ampere", ""), // + // new TestCase("de", "milliampere", ""), // PREFIX, ampere + // new TestCase("de", "ohm", ""), // + // new TestCase("de", "calorie", ""), // + // new TestCase("de", "kilojoule", ""), // PREFIX, joule + // new TestCase("de", "joule", ""), // + new TestCase("de", "kilowatt-hour", "feminine"), // COMPOUND + // new TestCase("de", "electronvolt", ""), // + // 
new TestCase("de", "british-thermal-unit", ""), // + // new TestCase("de", "therm-us", ""), // + // new TestCase("de", "pound-force", ""), // + // new TestCase("de", "newton", ""), // + // new TestCase("de", "gigahertz", ""), // PREFIX, hertz + // new TestCase("de", "megahertz", ""), // PREFIX, hertz + // new TestCase("de", "kilohertz", ""), // PREFIX, hertz + // new TestCase("de", "hertz", ""), // PREFIX, hertz + // new TestCase("de", "em", ""), // + // new TestCase("de", "pixel", ""), // + // new TestCase("de", "megapixel", ""), // + // new TestCase("de", "pixel-per-centimeter", ""), // COMPOUND, pixel + // new TestCase("de", "pixel-per-inch", ""), // COMPOUND, pixel + // new TestCase("de", "dot-per-centimeter", ""), // COMPOUND, dot + // new TestCase("de", "dot-per-inch", ""), // COMPOUND, dot + // new TestCase("de", "dot", ""), // + // new TestCase("de", "earth-radius", ""), // + new TestCase("de", "decimeter", "masculine"), // PREFIX + new TestCase("de", "micrometer", "masculine"), // PREFIX + new TestCase("de", "nanometer", "masculine"), // PREFIX + // new TestCase("de", "light-year", ""), // + // new TestCase("de", "astronomical-unit", ""), // + // new TestCase("de", "furlong", ""), // + // new TestCase("de", "fathom", ""), // + // new TestCase("de", "nautical-mile", ""), // + // new TestCase("de", "mile-scandinavian", ""), // + // new TestCase("de", "point", ""), // + // new TestCase("de", "lux", ""), // + // new TestCase("de", "candela", ""), // + // new TestCase("de", "lumen", ""), // + // new TestCase("de", "metric-ton", ""), // + new TestCase("de", "microgram", "neuter"), // PREFIX + // new TestCase("de", "ton", ""), // + // new TestCase("de", "stone", ""), // + // new TestCase("de", "ounce-troy", ""), // + // new TestCase("de", "carat", ""), // + new TestCase("de", "gigawatt", "neuter"), // PREFIX + new TestCase("de", "milliwatt", "neuter"), // PREFIX + // new TestCase("de", "horsepower", ""), // + // new TestCase("de", "millimeter-ofhg", ""), // + // 
new TestCase("de", "pound-force-per-square-inch", ""), // COMPOUND, pound-force + // new TestCase("de", "inch-ofhg", ""), // + // new TestCase("de", "bar", ""), // + // new TestCase("de", "millibar", ""), // PREFIX, bar + // new TestCase("de", "atmosphere", ""), // + // new TestCase("de", "pascal", ""), // PREFIX, kilopascal? neuter? + // new TestCase("de", "hectopascal", ""), // PREFIX, pascal, neuter? + // new TestCase("de", "megapascal", ""), // PREFIX, pascal, neuter? + // new TestCase("de", "knot", ""), // + new TestCase("de", "pound-force-foot", "masculine"), // COMPOUND + new TestCase("de", "newton-meter", "masculine"), // COMPOUND + new TestCase("de", "cubic-kilometer", "masculine"), // POWER + new TestCase("de", "cubic-yard", "neuter"), // POWER + new TestCase("de", "cubic-inch", "masculine"), // POWER + new TestCase("de", "megaliter", "masculine"), // PREFIX + new TestCase("de", "hectoliter", "masculine"), // PREFIX + // new TestCase("de", "pint-metric", ""), // + // new TestCase("de", "cup-metric", ""), // + new TestCase("de", "acre-foot", "masculine"), // COMPOUND + // new TestCase("de", "bushel", ""), // + // new TestCase("de", "barrel", ""), // + // Units missing gender in German are also missing gender in French: + // new TestCase("fr", "revolution", ""), // + // new TestCase("fr", "radian", ""), // + // new TestCase("fr", "arc-minute", ""), // + // new TestCase("fr", "arc-second", ""), // + new TestCase("fr", "square-yard", "masculine"), // COMPOUND + new TestCase("fr", "square-inch", "masculine"), // COMPOUND + // new TestCase("fr", "dunam", ""), // + // new TestCase("fr", "karat", ""), // + new TestCase("fr", "milligram-ofglucose-per-deciliter", "masculine"), // COMPOUND + // new TestCase("fr", "millimole-per-liter", ""), // COMPOUND, mole + // new TestCase("fr", "permillion", ""), // + // new TestCase("fr", "permille", ""), // + // new TestCase("fr", "permyriad", ""), // + // new TestCase("fr", "mole", ""), // + new TestCase("fr", 
"liter-per-kilometer", "masculine"), // COMPOUND + // new TestCase("fr", "petabyte", ""), // PREFIX + // new TestCase("fr", "terabit", ""), // PREFIX + // new TestCase("fr", "century", ""), // + // new TestCase("fr", "decade", ""), // + new TestCase("fr", "millisecond", "feminine"), // PREFIX + new TestCase("fr", "microsecond", "feminine"), // PREFIX + new TestCase("fr", "nanosecond", "feminine"), // PREFIX + // new TestCase("fr", "ampere", ""), // + // new TestCase("fr", "milliampere", ""), // PREFIX, ampere + // new TestCase("fr", "ohm", ""), // + // new TestCase("fr", "calorie", ""), // + // new TestCase("fr", "kilojoule", ""), // PREFIX, joule + // new TestCase("fr", "joule", ""), // + // new TestCase("fr", "kilowatt-hour", ""), // COMPOUND + // new TestCase("fr", "electronvolt", ""), // + // new TestCase("fr", "british-thermal-unit", ""), // + // new TestCase("fr", "therm-us", ""), // + // new TestCase("fr", "pound-force", ""), // + // new TestCase("fr", "newton", ""), // + // new TestCase("fr", "gigahertz", ""), // PREFIX, hertz + // new TestCase("fr", "megahertz", ""), // PREFIX, hertz + // new TestCase("fr", "kilohertz", ""), // PREFIX, hertz + // new TestCase("fr", "hertz", ""), // PREFIX, hertz + // new TestCase("fr", "em", ""), // + // new TestCase("fr", "pixel", ""), // + // new TestCase("fr", "megapixel", ""), // + // new TestCase("fr", "pixel-per-centimeter", ""), // COMPOUND, pixel + // new TestCase("fr", "pixel-per-inch", ""), // COMPOUND, pixel + // new TestCase("fr", "dot-per-centimeter", ""), // COMPOUND, dot + // new TestCase("fr", "dot-per-inch", ""), // COMPOUND, dot + // new TestCase("fr", "dot", ""), // + // new TestCase("fr", "earth-radius", ""), // + new TestCase("fr", "decimeter", "masculine"), // PREFIX + new TestCase("fr", "micrometer", "masculine"), // PREFIX + new TestCase("fr", "nanometer", "masculine"), // PREFIX + // new TestCase("fr", "light-year", ""), // + // new TestCase("fr", "astronomical-unit", ""), // + // new 
TestCase("fr", "furlong", ""), // + // new TestCase("fr", "fathom", ""), // + // new TestCase("fr", "nautical-mile", ""), // + // new TestCase("fr", "mile-scandinavian", ""), // + // new TestCase("fr", "point", ""), // + // new TestCase("fr", "lux", ""), // + // new TestCase("fr", "candela", ""), // + // new TestCase("fr", "lumen", ""), // + // new TestCase("fr", "metric-ton", ""), // + new TestCase("fr", "microgram", "masculine"), // PREFIX + // new TestCase("fr", "ton", ""), // + // new TestCase("fr", "stone", ""), // + // new TestCase("fr", "ounce-troy", ""), // + // new TestCase("fr", "carat", ""), // + // new TestCase("fr", "gigawatt", ""), // PREFIX + // new TestCase("fr", "milliwatt", ""), // + // new TestCase("fr", "horsepower", ""), // + new TestCase("fr", "millimeter-ofhg", "masculine"), // + // new TestCase("fr", "pound-force-per-square-inch", ""), // COMPOUND, pound-force + new TestCase("fr", "inch-ofhg", "masculine"), // + // new TestCase("fr", "bar", ""), // + // new TestCase("fr", "millibar", ""), // PREFIX, bar + // new TestCase("fr", "atmosphere", ""), // + // new TestCase("fr", "pascal", ""), // PREFIX, kilopascal? 
+ // new TestCase("fr", "hectopascal", ""), // PREFIX, pascal + // new TestCase("fr", "megapascal", ""), // PREFIX, pascal + // new TestCase("fr", "knot", ""), // + // new TestCase("fr", "pound-force-foot", ""), // + // new TestCase("fr", "newton-meter", ""), // + new TestCase("fr", "cubic-kilometer", "masculine"), // POWER + new TestCase("fr", "cubic-yard", "masculine"), // POWER + new TestCase("fr", "cubic-inch", "masculine"), // POWER + new TestCase("fr", "megaliter", "masculine"), // PREFIX + new TestCase("fr", "hectoliter", "masculine"), // PREFIX + // new TestCase("fr", "pint-metric", ""), // + // new TestCase("fr", "cup-metric", ""), // + new TestCase("fr", "acre-foot", "feminine"), // COMPOUND + // new TestCase("fr", "bushel", ""), // + // new TestCase("fr", "barrel", ""), // + // Some more French units missing gender: + // new TestCase("fr", "degree", ""), // + new TestCase("fr", "square-meter", "masculine"), // COMPOUND + // new TestCase("fr", "terabyte", ""), // PREFIX, byte + // new TestCase("fr", "gigabyte", ""), // PREFIX, byte + // new TestCase("fr", "gigabit", ""), // PREFIX, bit + // new TestCase("fr", "megabyte", ""), // PREFIX, byte + // new TestCase("fr", "megabit", ""), // PREFIX, bit + // new TestCase("fr", "kilobyte", ""), // PREFIX, byte + // new TestCase("fr", "kilobit", ""), // PREFIX, bit + // new TestCase("fr", "byte", ""), // + // new TestCase("fr", "bit", ""), // + // new TestCase("fr", "volt", ""), // + new TestCase("fr", "cubic-meter", "masculine"), // POWER + + // gender-lacking builtins within compound units + new TestCase("de", "newton-meter-per-second", "masculine"), + // TODO(ICU-21494): determine whether list genders behave as follows, // and implement proper getListGender support (covering more than just // two genders):