ICU-22520 Update all users of ulocimp_get*() to ulocimp_getSubtags().

This simplifies the code by removing the need to find the positions of the subtags; all of that logic now lives in a single place.
2025-04-06 22:15:31 +00:00 · 2024-01-03 10:20:21 +09:00 · 2024-01-03 10:20:21 +09:00 · 1b768edbdf
commit 1b768edbdf
parent dc70b5a056
3 changed files with 55 additions and 124 deletions
--- a/icu4c/source/common/loclikely.cpp
+++ b/icu4c/source/common/loclikely.cpp
@ -156,52 +156,27 @@ parseTagString(
    icu::CharString& region,
    UErrorCode* err)
 {
+    icu::CharString variant;
    const char* position = localeID;

    if (U_FAILURE(*err) || localeID == nullptr) {
        goto error;
    }

-    lang = ulocimp_getLanguage(position, &position, *err);
+    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &position, *err);

-    /*
-     * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
-     * to be an error, because it indicates the user-supplied tag is
-     * not well-formed.
-     */
    if(U_FAILURE(*err)) {
        goto error;
    }

-    /*
-     * If no language was present, use the empty string instead.
-     * Otherwise, move past any separator.
-     */
+    if (!variant.isEmpty()) {
+        position -= 1 + variant.length();
+    }
+
    if (_isIDSeparator(*position)) {
        ++position;
    }

-    script = ulocimp_getScript(position, &position, *err);
-
-    if(U_FAILURE(*err)) {
-        goto error;
-    }
-
-    if (!script.isEmpty()) {
-        /*
-         * Move past any separator.
-         */
-        if (_isIDSeparator(*position)) {
-            ++position;
-        }    
-    }
-
-    region = ulocimp_getCountry(position, &position, *err);
-
-    if(U_FAILURE(*err)) {
-        goto error;
-    }
-
    if (region.isEmpty() && *position != 0 && *position != '@') {
        /* back up over consumed trailing separator */
        --position;
--- a/icu4c/source/common/uloc.cpp
+++ b/icu4c/source/common/uloc.cpp
@ -1569,32 +1569,18 @@ uloc_openKeywords(const char* localeID,
        tmpLocaleID=localeID;
    }

-    /* Skip the language */
-    ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
+    ulocimp_getSubtags(
+            tmpLocaleID,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr,
+            &tmpLocaleID,
+            *status);
    if (U_FAILURE(*status)) {
        return 0;
    }

-    if(_isIDSeparator(*tmpLocaleID)) {
-        const char *scriptID;
-        /* Skip the script if available */
-        ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
-        if (U_FAILURE(*status)) {
-            return 0;
-        }
-        if(scriptID != tmpLocaleID+1) {
-            /* Found optional script */
-            tmpLocaleID = scriptID;
-        }
-        /* Skip the Country */
-        if (_isIDSeparator(*tmpLocaleID)) {
-            ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
-            if (U_FAILURE(*status)) {
-                return 0;
-            }
-        }
-    }
-
    /* keywords are located after '@' */
    if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
        CharString keywords;
@ -1634,7 +1620,7 @@ _canonicalize(const char* localeID,
        return;
    }

-    int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
+    int32_t j, fieldCount=0;
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
    CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
    const char* origLocaleID;
@ -1671,57 +1657,41 @@ _canonicalize(const char* localeID,
    origLocaleID=tmpLocaleID;

    /* get all pieces, one after another, and separate with '_' */
-    CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
+    CharString tag;
+    CharString script;
+    CharString country;
+    CharString variant;
+    ulocimp_getSubtags(
+            tmpLocaleID,
+            &tag,
+            &script,
+            &country,
+            &variant,
+            &tmpLocaleID,
+            *err);

    if (tag.length() == I_DEFAULT_LENGTH &&
            uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
        tag.clear();
        tag.append(uloc_getDefault(), *err);
-    } else if(_isIDSeparator(*tmpLocaleID)) {
-        const char *scriptID;
-
-        ++fieldCount;
-        tag.append('_', *err);
-
-        CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
-        tag.append(script, *err);
-        scriptSize = script.length();
-        if(scriptSize > 0) {
-            /* Found optional script */
-            tmpLocaleID = scriptID;
+    } else {
+        if (!script.isEmpty()) {
            ++fieldCount;
-            if (_isIDSeparator(*tmpLocaleID)) {
-                /* If there is something else, then we add the _ */
+            tag.append('_', *err);
+            tag.append(script, *err);
+        }
+        if (!country.isEmpty()) {
+            ++fieldCount;
+            tag.append('_', *err);
+            tag.append(country, *err);
+        }
+        if (!variant.isEmpty()) {
+            ++fieldCount;
+            if (country.isEmpty()) {
                tag.append('_', *err);
            }
-        }
-
-        if (_isIDSeparator(*tmpLocaleID)) {
-            const char *cntryID;
-
-            CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
-            tag.append(country, *err);
-            if (!country.isEmpty()) {
-                /* Found optional country */
-                tmpLocaleID = cntryID;
-            }
-            if(_isIDSeparator(*tmpLocaleID)) {
-                /* If there is something else, then we add the _  if we found country before. */
-                if (!_isIDSeparator(*(tmpLocaleID+1))) {
-                    ++fieldCount;
-                    tag.append('_', *err);
-                }
-
-                variantSize = -tag.length();
-                {
-                    CharStringByteSink s(&tag);
-                    _getVariant(tmpLocaleID+1, *tmpLocaleID, &s, nullptr, false);
-                }
-                variantSize += tag.length();
-                if (variantSize > 0) {
-                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
-                }
-            }
+            tag.append('_', *err);
+            tag.append(variant, *err);
        }
    }

@ -1767,22 +1737,15 @@ _canonicalize(const char* localeID,
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
            /* Add missing '_' if needed */
-            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
+            if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
                do {
                    tag.append('_', *err);
                    ++fieldCount;
                } while(fieldCount<2);
            }

-            int32_t posixVariantSize = -tag.length();
-            {
-                CharStringByteSink s(&tag);
-                _getVariant(tmpLocaleID+1, '@', &s, nullptr, (UBool)(variantSize > 0));
-            }
-            posixVariantSize += tag.length();
-            if (posixVariantSize > 0) {
-                variantSize += posixVariantSize;
-            }
+            CharStringByteSink s(&tag);
+            _getVariant(tmpLocaleID+1, '@', &s, nullptr, !variant.isEmpty());
        }

        /* Look up the ID in the canonicalization map */
--- a/icu4c/source/common/uresbund.cpp
+++ b/icu4c/source/common/uresbund.cpp
@ -209,17 +209,11 @@ static bool getParentLocaleID(char *name, const char *origName, UResOpenType ope
    }
    
    UErrorCode err = U_ZERO_ERROR;
-    const char* tempNamePtr = name;
-    CharString language = ulocimp_getLanguage(tempNamePtr, &tempNamePtr, err);
-    if (*tempNamePtr == '_') {
-        ++tempNamePtr;
-    }
-    CharString script = ulocimp_getScript(tempNamePtr, &tempNamePtr, err);
-    if (*tempNamePtr == '_') {
-        ++tempNamePtr;
-    }
-    CharString region = ulocimp_getCountry(tempNamePtr, &tempNamePtr, err);
-    CharString workingLocale;
+    CharString language;
+    CharString script;
+    CharString region;
+    ulocimp_getSubtags(name, &language, &script, &region, nullptr, nullptr, err);
+
    if (U_FAILURE(err)) {
        // hopefully this never happens...
        return chopLocale(name);
@ -238,6 +232,8 @@ static bool getParentLocaleID(char *name, const char *origName, UResOpenType ope
        }
    }

+    CharString workingLocale;
+
    // if it's not in the parent locale table, figure out the fallback script algorithmically
    // (see CLDR-15265 for an explanation of the algorithm)
    if (!script.isEmpty() && !region.isEmpty()) {
@ -254,12 +250,9 @@ static bool getParentLocaleID(char *name, const char *origName, UResOpenType ope
        // - if yes, replace the region with the script from the original locale ID
        // - if no, replace the region with the default script for that language and region
        UErrorCode err = U_ZERO_ERROR;
-        tempNamePtr = origName;
-        CharString origNameLanguage = ulocimp_getLanguage(tempNamePtr, &tempNamePtr, err);
-        if (*tempNamePtr == '_') {
-            ++tempNamePtr;
-        }
-        CharString origNameScript = ulocimp_getScript(origName, nullptr, err);
+        CharString origNameLanguage;
+        CharString origNameScript;
+        ulocimp_getSubtags(origName, &origNameLanguage, &origNameScript, nullptr, nullptr, nullptr, err);
        if (!origNameScript.isEmpty()) {
            workingLocale.append(language, err).append("_", err).append(origNameScript, err);
        } else {