From 5a6545940ac0ed48bc8872424269e598388b7996 Mon Sep 17 00:00:00 2001 From: David Corbett Date: Fri, 28 Jan 2022 21:29:43 -0500 Subject: [PATCH 1/5] Add the language system tag INUK --- src/hb-ot-tag-table.hh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index 2c6316df4..c5ec1518b 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -6,8 +6,8 @@ * * on files with these headers: * - * - * File-Date: 2021-08-06 + * + * File-Date: 2021-12-29 */ #ifndef HB_OT_TAG_TABLE_HH @@ -624,7 +624,9 @@ static const LangTag ot_languages[] = { {"ijs", HB_TAG('I','J','O',' ')}, /* Southeast Ijo -> Ijo */ {"ik", HB_TAG('I','P','K',' ')}, /* Inupiaq [macrolanguage] -> Inupiat */ {"ike", HB_TAG('I','N','U',' ')}, /* Eastern Canadian Inuktitut -> Inuktitut */ + {"ike", HB_TAG('I','N','U','K')}, /* Eastern Canadian Inuktitut -> Nunavik Inuktitut */ {"ikt", HB_TAG('I','N','U',' ')}, /* Inuinnaqtun -> Inuktitut */ + {"ikt", HB_TAG('I','N','U','K')}, /* Inuinnaqtun -> Nunavik Inuktitut */ /*{"ilo", HB_TAG('I','L','O',' ')},*/ /* Iloko -> Ilokano */ {"in", HB_TAG('I','N','D',' ')}, /* Indonesian (retired code) */ {"in", HB_TAG('M','L','Y',' ')}, /* Indonesian (retired code) -> Malay */ @@ -638,6 +640,7 @@ static const LangTag ot_languages[] = { {"it", HB_TAG('I','T','A',' ')}, /* Italian */ {"itz", HB_TAG('M','Y','N',' ')}, /* Itzá -> Mayan */ {"iu", HB_TAG('I','N','U',' ')}, /* Inuktitut [macrolanguage] */ + {"iu", HB_TAG('I','N','U','K')}, /* Inuktitut [macrolanguage] -> Nunavik Inuktitut */ {"iw", HB_TAG('I','W','R',' ')}, /* Hebrew (retired code) */ {"ixl", HB_TAG('M','Y','N',' ')}, /* Ixil -> Mayan */ {"ja", HB_TAG('J','A','N',' ')}, /* Japanese */ From 0e31595e0d2e214262c4cf0d4136215bc4c89a0a Mon Sep 17 00:00:00 2001 From: David Corbett Date: Fri, 28 Jan 2022 22:26:38 -0500 Subject: [PATCH 2/5] Infer tag mappings for unregistered macrolanguages Every macrolanguage not mentioned in the OT language system tag registry is mapped to every tag of its individual languages, if those have registered tags. --- src/gen-tag-table.py | 34 +++++++++++++++++++--------------- src/hb-ot-tag-table.hh | 9 +++++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index cb612b982..fa98d29de 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser): from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` inverted. Its values start as unsorted sets; ``sort_languages`` converts them to sorted lists. + from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): + A copy of ``from_bcp_47``. It starts as ``None`` and is + populated at the beginning of the first call to + ``inherit_from_macrolanguages``. """ def __init__ (self): @@ -338,6 +342,7 @@ class OpenTypeRegistryParser (HTMLParser): self.ranks = collections.defaultdict (int) self.to_bcp_47 = collections.defaultdict (set) self.from_bcp_47 = collections.defaultdict (set) + self.from_bcp_47_uninherited = None # Whether the parser is in a element self._td = False # Whether the parser is after a
element within the current element @@ -463,29 +468,24 @@ class OpenTypeRegistryParser (HTMLParser): to SQI. If a BCP 47 tag for a macrolanguage has no OpenType mapping but - all of its individual languages do and they all map to the same - tags, the mapping is copied to the macrolanguage. + some of its individual languages do, their mappings are copied + to the macrolanguage. """ global bcp_47 - original_ot_from_bcp_47 = dict (self.from_bcp_47) + first_time = self.from_bcp_47_uninherited is None + if first_time: + self.from_bcp_47_uninherited = dict (self.from_bcp_47) for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): - ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) + ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ())) if ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages: for language in languages: self.add_language (language, ot_macrolanguage) self.ranks[ot_macrolanguage] += 1 - else: + elif first_time: for language in languages: - if language in original_ot_from_bcp_47: - if ot_macrolanguages: - ml = original_ot_from_bcp_47[language] - if ml: - ot_macrolanguages &= ml - else: - pass - else: - ot_macrolanguages |= original_ot_from_bcp_47[language] + if language in self.from_bcp_47_uninherited: + ot_macrolanguages |= self.from_bcp_47_uninherited[language] else: ot_macrolanguages.clear () if not ot_macrolanguages: @@ -1121,7 +1121,11 @@ def verify_disambiguation_dict (): elif len (primary_tags) == 0: expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) else: - macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]') + original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] + if len (original_languages) == 1: + macrolanguages = original_languages + else: + macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] if len (macrolanguages) != 1: macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]') if len (macrolanguages) != 1: diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index c5ec1518b..f6e9238c7 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -256,6 +256,8 @@ static const LangTag ot_languages[] = { {"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */ {"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */ {"chk", HB_TAG('C','H','K','0')}, /* Chuukese */ + {"chm", HB_TAG('H','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> High Mari */ + {"chm", HB_TAG('L','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> Low Mari */ {"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */ /*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */ {"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */ @@ -1311,6 +1313,9 @@ static const LangTag ot_languages[] = { {"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */ /*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */ {"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */ + {"sh", HB_TAG('B','O','S',' ')}, /* Serbo-Croatian [macrolanguage] -> Bosnian */ + {"sh", HB_TAG('H','R','V',' ')}, /* Serbo-Croatian [macrolanguage] -> Croatian */ + {"sh", HB_TAG('S','R','B',' ')}, /* Serbo-Croatian [macrolanguage] -> Serbian */ {"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */ {"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */ {"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */ @@ -2841,6 +2846,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */ case HB_TAG('G','O','N',' '): /* Gondi */ return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */ + case HB_TAG('H','M','A',' '): /* High Mari */ + return hb_language_from_string ("mrj", -1); /* Western Mari */ case HB_TAG('H','M','N',' '): /* Hmong */ return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */ case HB_TAG('H','N','D',' '): /* Hindko */ @@ -2881,6 +2888,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("uki", -1); /* Kui (India) */ case HB_TAG('K','U','R',' '): /* Kurdish */ return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */ + case HB_TAG('L','M','A',' '): /* Low Mari */ + return hb_language_from_string ("mhr", -1); /* Eastern Mari */ case HB_TAG('L','U','H',' '): /* Luyia */ return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */ case HB_TAG('L','V','I',' '): /* Latvian */ From 0b1bf89cc2ee6a8782c007e2b7362a4485be249a Mon Sep 17 00:00:00 2001 From: David Corbett Date: Fri, 28 Jan 2022 22:27:51 -0500 Subject: [PATCH 3/5] =?UTF-8?q?Replace=20=E2=80=9C[family]=E2=80=9D=20with?= =?UTF-8?q?=20=E2=80=9C[collection]=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all language collections are language families. --- src/gen-tag-table.py | 4 +-- src/hb-ot-tag-table.hh | 58 +++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index fa98d29de..3064240ee 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -570,7 +570,7 @@ class BCP47Parser (object): if scope == 'macrolanguage': scope = ' [macrolanguage]' elif scope == 'collection': - scope = ' [family]' + scope = ' [collection]' else: continue self.scopes[subtag] = scope @@ -1127,7 +1127,7 @@ def verify_disambiguation_dict (): else: macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] if len (macrolanguages) != 1: - macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]') + macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]') if len (macrolanguages) != 1: macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')) if len (macrolanguages) != 1: diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index f6e9238c7..a71d539d1 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -66,7 +66,7 @@ static const LangTag ot_languages[] = { {"an", HB_TAG('A','R','G',' ')}, /* Aragonese */ /*{"ang", HB_TAG('A','N','G',' ')},*/ /* Old English (ca. 450-1100) -> Anglo-Saxon */ {"aoa", HB_TAG('C','P','P',' ')}, /* Angolar -> Creoles */ - {"apa", HB_TAG('A','T','H',' ')}, /* Apache [family] -> Athapaskan */ + {"apa", HB_TAG('A','T','H',' ')}, /* Apache [collection] -> Athapaskan */ {"apc", HB_TAG('A','R','A',' ')}, /* North Levantine Arabic -> Arabic */ {"apd", HB_TAG('A','R','A',' ')}, /* Sudanese Arabic -> Arabic */ {"apj", HB_TAG('A','T','H',' ')}, /* Jicarilla Apache -> Athapaskan */ @@ -86,7 +86,7 @@ static const LangTag ot_languages[] = { {"arz", HB_TAG('A','R','A',' ')}, /* Egyptian Arabic -> Arabic */ {"as", HB_TAG('A','S','M',' ')}, /* Assamese */ /*{"ast", HB_TAG('A','S','T',' ')},*/ /* Asturian */ -/*{"ath", HB_TAG('A','T','H',' ')},*/ /* Athapascan [family] -> Athapaskan */ +/*{"ath", HB_TAG('A','T','H',' ')},*/ /* Athapascan [collection] -> Athapaskan */ {"atj", HB_TAG('R','C','R',' ')}, /* Atikamekw -> R-Cree */ {"atv", HB_TAG('A','L','T',' ')}, /* Northern Altai -> Altai */ {"auj", HB_TAG('B','B','R',' ')}, /* Awjilah -> Berber */ @@ -110,10 +110,10 @@ static const LangTag ot_languages[] = { {"azn", HB_TAG('N','A','H',' ')}, /* Western Durango Nahuatl -> Nahuatl */ {"azz", HB_TAG('N','A','H',' ')}, /* Highland Puebla Nahuatl -> Nahuatl */ {"ba", HB_TAG('B','S','H',' ')}, /* Bashkir */ - {"bad", HB_TAG('B','A','D','0')}, /* Banda [family] */ + {"bad", HB_TAG('B','A','D','0')}, /* Banda [collection] */ {"bag", HB_TAG_NONE }, /* Tuki != Baghelkhandi */ {"bah", HB_TAG('C','P','P',' ')}, /* Bahamas Creole English -> Creoles */ - {"bai", HB_TAG('B','M','L',' ')}, /* Bamileke [family] */ + {"bai", HB_TAG('B','M','L',' ')}, /* Bamileke [collection] */ {"bal", HB_TAG('B','L','I',' ')}, /* Baluchi [macrolanguage] */ /*{"ban", HB_TAG('B','A','N',' ')},*/ /* Balinese */ /*{"bar", HB_TAG('B','A','R',' ')},*/ /* Bavarian */ @@ -135,7 +135,7 @@ static const LangTag ot_languages[] = { {"bea", HB_TAG('A','T','H',' ')}, /* Beaver -> Athapaskan */ {"beb", HB_TAG('B','T','I',' ')}, /* Bebele -> Beti */ /*{"bem", HB_TAG('B','E','M',' ')},*/ /* Bemba (Zambia) */ - {"ber", HB_TAG('B','B','R',' ')}, /* Berber [family] */ + {"ber", HB_TAG('B','B','R',' ')}, /* Berber [collection] */ {"bew", HB_TAG('C','P','P',' ')}, /* Betawi -> Creoles */ {"bfl", HB_TAG('B','A','D','0')}, /* Banda-Ndélé -> Banda */ {"bfq", HB_TAG('B','A','D',' ')}, /* Badaga */ @@ -203,7 +203,7 @@ static const LangTag ot_languages[] = { {"btd", HB_TAG('B','T','K',' ')}, /* Batak Dairi -> Batak */ {"bti", HB_TAG_NONE }, /* Burate != Beti */ {"btj", HB_TAG('M','L','Y',' ')}, /* Bacanese Malay -> Malay */ -/*{"btk", HB_TAG('B','T','K',' ')},*/ /* Batak [family] */ +/*{"btk", HB_TAG('B','T','K',' ')},*/ /* Batak [collection] */ {"btm", HB_TAG('B','T','M',' ')}, /* Batak Mandailing */ {"btm", HB_TAG('B','T','K',' ')}, /* Batak Mandailing -> Batak */ {"bto", HB_TAG('B','I','K',' ')}, /* Rinconada Bikol -> Bikol */ @@ -299,10 +299,10 @@ static const LangTag ot_languages[] = { /*{"cop", HB_TAG('C','O','P',' ')},*/ /* Coptic */ {"coq", HB_TAG('A','T','H',' ')}, /* Coquille -> Athapaskan */ {"cpa", HB_TAG('C','C','H','N')}, /* Palantla Chinantec -> Chinantec */ - {"cpe", HB_TAG('C','P','P',' ')}, /* English-based creoles and pidgins [family] -> Creoles */ - {"cpf", HB_TAG('C','P','P',' ')}, /* French-based creoles and pidgins [family] -> Creoles */ + {"cpe", HB_TAG('C','P','P',' ')}, /* English-based creoles and pidgins [collection] -> Creoles */ + {"cpf", HB_TAG('C','P','P',' ')}, /* French-based creoles and pidgins [collection] -> Creoles */ {"cpi", HB_TAG('C','P','P',' ')}, /* Chinese Pidgin English -> Creoles */ -/*{"cpp", HB_TAG('C','P','P',' ')},*/ /* Portuguese-based creoles and pidgins [family] -> Creoles */ +/*{"cpp", HB_TAG('C','P','P',' ')},*/ /* Portuguese-based creoles and pidgins [collection] -> Creoles */ {"cpx", HB_TAG('Z','H','S',' ')}, /* Pu-Xian Chinese -> Chinese, Simplified */ {"cqd", HB_TAG('H','M','N',' ')}, /* Chuanqiandian Cluster Miao -> Hmong */ {"cqu", HB_TAG('Q','U','H',' ')}, /* Chilean Quechua (retired code) -> Quechua (Bolivia) */ @@ -322,7 +322,7 @@ static const LangTag ot_languages[] = { {"crm", HB_TAG('M','C','R',' ')}, /* Moose Cree */ {"crm", HB_TAG('L','C','R',' ')}, /* Moose Cree -> L-Cree */ {"crm", HB_TAG('C','R','E',' ')}, /* Moose Cree -> Cree */ - {"crp", HB_TAG('C','P','P',' ')}, /* Creoles and pidgins [family] -> Creoles */ + {"crp", HB_TAG('C','P','P',' ')}, /* Creoles and pidgins [collection] -> Creoles */ {"crr", HB_TAG_NONE }, /* Carolina Algonquian != Carrier */ {"crs", HB_TAG('C','P','P',' ')}, /* Seselwa Creole French -> Creoles */ {"crt", HB_TAG_NONE }, /* Iyojwa'ja Chorote != Crimean Tatar */ @@ -433,7 +433,7 @@ static const LangTag ot_languages[] = { {"et", HB_TAG('E','T','I',' ')}, /* Estonian [macrolanguage] */ {"eto", HB_TAG('B','T','I',' ')}, /* Eton (Cameroon) -> Beti */ {"eu", HB_TAG('E','U','Q',' ')}, /* Basque */ - {"euq", HB_TAG_NONE }, /* Basque [family] != Basque */ + {"euq", HB_TAG_NONE }, /* Basque [collection] != Basque */ {"eve", HB_TAG('E','V','N',' ')}, /* Even */ {"evn", HB_TAG('E','V','K',' ')}, /* Evenki */ {"ewo", HB_TAG('B','T','I',' ')}, /* Ewondo -> Beti */ @@ -622,7 +622,7 @@ static const LangTag ot_languages[] = { {"ijc", HB_TAG('I','J','O',' ')}, /* Izon -> Ijo */ {"ije", HB_TAG('I','J','O',' ')}, /* Biseni -> Ijo */ {"ijn", HB_TAG('I','J','O',' ')}, /* Kalabari -> Ijo */ -/*{"ijo", HB_TAG('I','J','O',' ')},*/ /* Ijo [family] */ +/*{"ijo", HB_TAG('I','J','O',' ')},*/ /* Ijo [collection] */ {"ijs", HB_TAG('I','J','O',' ')}, /* Southeast Ijo -> Ijo */ {"ik", HB_TAG('I','P','K',' ')}, /* Inupiaq [macrolanguage] -> Inupiat */ {"ike", HB_TAG('I','N','U',' ')}, /* Eastern Canadian Inuktitut -> Inuktitut */ @@ -672,7 +672,7 @@ static const LangTag ot_languages[] = { {"kab", HB_TAG('B','B','R',' ')}, /* Kabyle -> Berber */ {"kac", HB_TAG_NONE }, /* Kachin != Kachchi */ {"kam", HB_TAG('K','M','B',' ')}, /* Kamba (Kenya) */ - {"kar", HB_TAG('K','R','N',' ')}, /* Karen [family] */ + {"kar", HB_TAG('K','R','N',' ')}, /* Karen [collection] */ /*{"kaw", HB_TAG('K','A','W',' ')},*/ /* Kawi (Old Javanese) */ {"kbd", HB_TAG('K','A','B',' ')}, /* Kabardian */ {"kby", HB_TAG('K','N','R',' ')}, /* Manga Kanuri -> Kanuri */ @@ -881,7 +881,7 @@ static const LangTag ot_languages[] = { {"mam", HB_TAG('M','A','M',' ')}, /* Mam */ {"mam", HB_TAG('M','Y','N',' ')}, /* Mam -> Mayan */ {"man", HB_TAG('M','N','K',' ')}, /* Mandingo [macrolanguage] -> Maninka */ - {"map", HB_TAG_NONE }, /* Austronesian [family] != Mapudungun */ + {"map", HB_TAG_NONE }, /* Austronesian [collection] != Mapudungun */ {"maw", HB_TAG_NONE }, /* Mampruli != Marwari */ {"max", HB_TAG('M','L','Y',' ')}, /* North Moluccan Malay -> Malay */ {"max", HB_TAG('C','P','P',' ')}, /* North Moluccan Malay -> Creoles */ @@ -963,7 +963,7 @@ static const LangTag ot_languages[] = { {"mts", HB_TAG_NONE }, /* Yora != Maltese */ {"mud", HB_TAG('C','P','P',' ')}, /* Mednyj Aleut -> Creoles */ {"mui", HB_TAG('M','L','Y',' ')}, /* Musi -> Malay */ - {"mun", HB_TAG_NONE }, /* Munda [family] != Mundari */ + {"mun", HB_TAG_NONE }, /* Munda [collection] != Mundari */ {"mup", HB_TAG('R','A','J',' ')}, /* Malvi -> Rajasthani */ {"muq", HB_TAG('H','M','N',' ')}, /* Eastern Xiangxi Miao -> Hmong */ /*{"mus", HB_TAG('M','U','S',' ')},*/ /* Creek -> Muscogee */ @@ -978,7 +978,7 @@ static const LangTag ot_languages[] = { {"mww", HB_TAG('H','M','N',' ')}, /* Hmong Daw -> Hmong */ {"my", HB_TAG('B','R','M',' ')}, /* Burmese */ {"mym", HB_TAG('M','E','N',' ')}, /* Me’en */ -/*{"myn", HB_TAG('M','Y','N',' ')},*/ /* Mayan [family] */ +/*{"myn", HB_TAG('M','Y','N',' ')},*/ /* Mayan [collection] */ {"myq", HB_TAG('M','N','K',' ')}, /* Forest Maninka (retired code) -> Maninka */ {"myv", HB_TAG('E','R','Z',' ')}, /* Erzya */ {"mzb", HB_TAG('B','B','R',' ')}, /* Tumzabt -> Berber */ @@ -987,7 +987,7 @@ static const LangTag ot_languages[] = { {"na", HB_TAG('N','A','U',' ')}, /* Nauru -> Nauruan */ {"nag", HB_TAG('N','A','G',' ')}, /* Naga Pidgin -> Naga-Assamese */ {"nag", HB_TAG('C','P','P',' ')}, /* Naga Pidgin -> Creoles */ -/*{"nah", HB_TAG('N','A','H',' ')},*/ /* Nahuatl [family] */ +/*{"nah", HB_TAG('N','A','H',' ')},*/ /* Nahuatl [collection] */ {"nan", HB_TAG('Z','H','S',' ')}, /* Min Nan Chinese -> Chinese, Simplified */ /*{"nap", HB_TAG('N','A','P',' ')},*/ /* Neapolitan */ {"nas", HB_TAG_NONE }, /* Naasioi != Naskapi */ @@ -1098,7 +1098,7 @@ static const LangTag ot_languages[] = { {"otw", HB_TAG('O','J','B',' ')}, /* Ottawa -> Ojibway */ {"oua", HB_TAG('B','B','R',' ')}, /* Tagargrent -> Berber */ {"pa", HB_TAG('P','A','N',' ')}, /* Punjabi */ - {"paa", HB_TAG_NONE }, /* Papuan [family] != Palestinian Aramaic */ + {"paa", HB_TAG_NONE }, /* Papuan [collection] != Palestinian Aramaic */ /*{"pag", HB_TAG('P','A','G',' ')},*/ /* Pangasinan */ {"pal", HB_TAG_NONE }, /* Pahlavi != Pali */ /*{"pam", HB_TAG('P','A','M',' ')},*/ /* Pampanga -> Pampangan */ @@ -1337,7 +1337,7 @@ static const LangTag ot_languages[] = { {"skw", HB_TAG('C','P','P',' ')}, /* Skepi Creole Dutch -> Creoles */ {"sky", HB_TAG_NONE }, /* Sikaiana != Slovak */ {"sl", HB_TAG('S','L','V',' ')}, /* Slovenian */ - {"sla", HB_TAG_NONE }, /* Slavic [family] != Slavey */ + {"sla", HB_TAG_NONE }, /* Slavic [collection] != Slavey */ {"sm", HB_TAG('S','M','O',' ')}, /* Samoan */ {"sma", HB_TAG('S','S','M',' ')}, /* Southern Sami */ {"smj", HB_TAG('L','S','M',' ')}, /* Lule Sami */ @@ -1459,7 +1459,7 @@ static const LangTag ot_languages[] = { {"tpi", HB_TAG('C','P','P',' ')}, /* Tok Pisin -> Creoles */ {"tr", HB_TAG('T','R','K',' ')}, /* Turkish */ {"trf", HB_TAG('C','P','P',' ')}, /* Trinidadian Creole English -> Creoles */ - {"trk", HB_TAG_NONE }, /* Turkic [family] != Turkish */ + {"trk", HB_TAG_NONE }, /* Turkic [collection] != Turkish */ {"tru", HB_TAG('T','U','A',' ')}, /* Turoyo -> Turoyo Aramaic */ {"tru", HB_TAG('S','Y','R',' ')}, /* Turoyo -> Syriac */ {"ts", HB_TAG('T','S','G',' ')}, /* Tsonga */ @@ -1601,7 +1601,7 @@ static const LangTag ot_languages[] = { {"zlq", HB_TAG('Z','H','A',' ')}, /* Liuqian Zhuang -> Zhuang */ {"zmi", HB_TAG('M','L','Y',' ')}, /* Negeri Sembilan Malay -> Malay */ {"zmz", HB_TAG('B','A','D','0')}, /* Mbandja -> Banda */ - {"znd", HB_TAG_NONE }, /* Zande [family] != Zande */ + {"znd", HB_TAG_NONE }, /* Zande [collection] != Zande */ {"zne", HB_TAG('Z','N','D',' ')}, /* Zande */ {"zom", HB_TAG('Q','I','N',' ')}, /* Zou -> Chin */ {"zqe", HB_TAG('Z','H','A',' ')}, /* Qiubei Zhuang -> Zhuang */ @@ -2821,15 +2821,15 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) case HB_TAG('A','R','K',' '): /* Rakhine */ return hb_language_from_string ("rki", -1); /* Rakhine */ case HB_TAG('A','T','H',' '): /* Athapaskan */ - return hb_language_from_string ("ath", -1); /* Athapascan [family] */ + return hb_language_from_string ("ath", -1); /* Athapascan [collection] */ case HB_TAG('B','B','R',' '): /* Berber */ - return hb_language_from_string ("ber", -1); /* Berber [family] */ + return hb_language_from_string ("ber", -1); /* Berber [collection] */ case HB_TAG('B','I','K',' '): /* Bikol */ return hb_language_from_string ("bik", -1); /* Bikol [macrolanguage] */ case HB_TAG('B','T','K',' '): /* Batak */ - return hb_language_from_string ("btk", -1); /* Batak [family] */ + return hb_language_from_string ("btk", -1); /* Batak [collection] */ case HB_TAG('C','P','P',' '): /* Creoles */ - return hb_language_from_string ("crp", -1); /* Creoles and pidgins [family] */ + return hb_language_from_string ("crp", -1); /* Creoles and pidgins [collection] */ case HB_TAG('C','R','R',' '): /* Carrier */ return hb_language_from_string ("crx", -1); /* Carrier */ case HB_TAG('D','G','R',' '): /* Dogri (macrolanguage) */ @@ -2857,7 +2857,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) case HB_TAG('I','B','A',' '): /* Iban */ return hb_language_from_string ("iba", -1); /* Iban */ case HB_TAG('I','J','O',' '): /* Ijo */ - return hb_language_from_string ("ijo", -1); /* Ijo [family] */ + return hb_language_from_string ("ijo", -1); /* Ijo [collection] */ case HB_TAG('I','N','U',' '): /* Inuktitut */ return hb_language_from_string ("iu", -1); /* Inuktitut [macrolanguage] */ case HB_TAG('I','P','K',' '): /* Inupiat */ @@ -2883,7 +2883,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) case HB_TAG('K','P','L',' '): /* Kpelle */ return hb_language_from_string ("kpe", -1); /* Kpelle [macrolanguage] */ case HB_TAG('K','R','N',' '): /* Karen */ - return hb_language_from_string ("kar", -1); /* Karen [family] */ + return hb_language_from_string ("kar", -1); /* Karen [collection] */ case HB_TAG('K','U','I',' '): /* Kui */ return hb_language_from_string ("uki", -1); /* Kui (India) */ case HB_TAG('K','U','R',' '): /* Kurdish */ @@ -2909,9 +2909,9 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) case HB_TAG('M','O','N','T'): /* Thailand Mon */ return hb_language_from_string ("mnw-TH", -1); /* Mon; Thailand */ case HB_TAG('M','Y','N',' '): /* Mayan */ - return hb_language_from_string ("myn", -1); /* Mayan [family] */ + return hb_language_from_string ("myn", -1); /* Mayan [collection] */ case HB_TAG('N','A','H',' '): /* Nahuatl */ - return hb_language_from_string ("nah", -1); /* Nahuatl [family] */ + return hb_language_from_string ("nah", -1); /* Nahuatl [collection] */ case HB_TAG('N','E','P',' '): /* Nepali */ return hb_language_from_string ("ne", -1); /* Nepali [macrolanguage] */ case HB_TAG('N','I','S',' '): /* Nisi */ From a184c5f8518ab92b95947f23848ddde677e8cac1 Mon Sep 17 00:00:00 2001 From: David Corbett Date: Sun, 30 Jan 2022 13:28:23 -0500 Subject: [PATCH 4/5] =?UTF-8?q?Don=E2=80=99t=20always=20inherit=20from=20m?= =?UTF-8?q?acrolanguages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an OpenType tag maps to a BCP 47 macrolanguage, that is presumably to support the use of the macrolanguage as a vague stand-in for one of its individual languages. For example, "ar" and "zh" are often used for "arb" and "cmn". When the OpenType tag maps to a macrolanguage and some but not all of its individual languages, that indicates that the OpenType tag only corresponds to the listed individual languages (which may be referred to using the macrolanguage subtag) but not the missing individual languages. In particular, INUK (Nunavik Inuktitut) is mapped to "ike" (Eastern Canadian Inuktitut) and "iu" (Inuktitut) but not to "ikt" (Inuinnaqtun), so "ikt" should not inherit the INUK mapping from its macrolanguage "iu". --- src/gen-tag-table.py | 35 +++++++++++++++++++++++++++++++---- src/hb-ot-tag-table.hh | 12 ++---------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index 3064240ee..d1f8fe286 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -467,6 +467,14 @@ class OpenTypeRegistryParser (HTMLParser): explicit mapping, so it inherits from sq (Albanian) the mapping to SQI. + However, if an OpenType tag maps to a BCP 47 macrolanguage and + some but not all of its individual languages, the mapping is not + inherited from the macrolanguage to the missing individual + languages. For example, INUK (Nunavik Inuktitut) is mapped to + ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to + ikt (Inuinnaqtun, which is an individual language of iu), so + this method does not add a mapping from ikt to INUK. + If a BCP 47 tag for a macrolanguage has no OpenType mapping but some of its individual languages do, their mappings are copied to the macrolanguage. @@ -476,12 +484,30 @@ class OpenTypeRegistryParser (HTMLParser): if first_time: self.from_bcp_47_uninherited = dict (self.from_bcp_47) for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): - ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ())) + ot_macrolanguages = { + ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ()) + } + blocked_ot_macrolanguages = set () + if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''): + for ot_macrolanguage in ot_macrolanguages: + round_trip_macrolanguages = { + l for l in self.to_bcp_47[ot_macrolanguage] + if 'retired code' not in bcp_47.scopes.get (l, '') + } + round_trip_languages = { + l for l in languages + if 'retired code' not in bcp_47.scopes.get (l, '') + } + intersection = round_trip_macrolanguages & round_trip_languages + if intersection and intersection != round_trip_languages: + blocked_ot_macrolanguages.add (ot_macrolanguage) if ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages: - for language in languages: - self.add_language (language, ot_macrolanguage) - self.ranks[ot_macrolanguage] += 1 + if ot_macrolanguage not in blocked_ot_macrolanguages: + for language in languages: + self.add_language (language, ot_macrolanguage) + if not blocked_ot_macrolanguages: + self.ranks[ot_macrolanguage] += 1 elif first_time: for language in languages: if language in self.from_bcp_47_uninherited: @@ -715,6 +741,7 @@ ot.add_language ('no', 'NOR') ot.add_language ('oc-provenc', 'PRO') +ot.remove_language_ot ('QUZ') ot.add_language ('qu', 'QUZ') ot.add_language ('qub', 'QWH') ot.add_language ('qud', 'QVI') diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index a71d539d1..463e7a02a 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -628,7 +628,6 @@ static const LangTag ot_languages[] = { {"ike", HB_TAG('I','N','U',' ')}, /* Eastern Canadian Inuktitut -> Inuktitut */ {"ike", HB_TAG('I','N','U','K')}, /* Eastern Canadian Inuktitut -> Nunavik Inuktitut */ {"ikt", HB_TAG('I','N','U',' ')}, /* Inuinnaqtun -> Inuktitut */ - {"ikt", HB_TAG('I','N','U','K')}, /* Inuinnaqtun -> Nunavik Inuktitut */ /*{"ilo", HB_TAG('I','L','O',' ')},*/ /* Iloko -> Ilokano */ {"in", HB_TAG('I','N','D',' ')}, /* Indonesian (retired code) */ {"in", HB_TAG('M','L','Y',' ')}, /* Indonesian (retired code) -> Malay */ @@ -1044,7 +1043,6 @@ static const LangTag ot_languages[] = { {"nln", HB_TAG('N','A','H',' ')}, /* Durango Nahuatl (retired code) -> Nahuatl */ {"nlv", HB_TAG('N','A','H',' ')}, /* Orizaba Nahuatl -> Nahuatl */ {"nn", HB_TAG('N','Y','N',' ')}, /* Norwegian Nynorsk (Nynorsk, Norwegian) */ - {"nn", HB_TAG('N','O','R',' ')}, /* Norwegian Nynorsk -> Norwegian */ {"nnh", HB_TAG('B','M','L',' ')}, /* Ngiemboon -> Bamileke */ {"nnz", HB_TAG('B','M','L',' ')}, /* Nda'nda' -> Bamileke */ {"no", HB_TAG('N','O','R',' ')}, /* Norwegian [macrolanguage] */ @@ -2615,14 +2613,8 @@ hb_ot_tags_from_complex_language (const char *lang_str, if (0 == strcmp (&lang_str[1], "o-nyn")) { /* Norwegian Nynorsk (retired code) */ - unsigned int i; - hb_tag_t possible_tags[] = { - HB_TAG('N','Y','N',' '), /* Norwegian Nynorsk (Nynorsk, Norwegian) */ - HB_TAG('N','O','R',' '), /* Norwegian */ - }; - for (i = 0; i < 2 && i < *count; i++) - tags[i] = possible_tags[i]; - *count = i; + tags[0] = HB_TAG('N','Y','N',' '); /* Norwegian Nynorsk (Nynorsk, Norwegian) */ + *count = 1; return true; } break; From ae9afd9772e909476d28fb647d7f7aef6865f6cd Mon Sep 17 00:00:00 2001 From: David Corbett Date: Sun, 3 Oct 2021 20:09:33 -0400 Subject: [PATCH 5/5] Let BCP 47 tag "mo" fall back to OT tag 'ROM ' --- src/gen-tag-table.py | 5 ++++- src/hb-ot-tag-table.hh | 13 +++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index d1f8fe286..f8fb05f11 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -774,7 +774,6 @@ ot.add_language ('qxr', 'QVI') ot.add_language ('qxt', 'QWH') ot.add_language ('qxw', 'QWH') -bcp_47.macrolanguages['ro'].remove ('mo') bcp_47.macrolanguages['ro-MD'].add ('mo') ot.remove_language_ot ('SYRE') @@ -1014,6 +1013,8 @@ for initial, items in sorted (complex_tags.items ()): if initial != 'und': continue for lt, tags in items: + if not tags: + continue if lt.variant in bcp_47.prefixes: expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, '%s is not a valid prefix of %s' % (lt.language, lt.variant)) @@ -1048,6 +1049,8 @@ for initial, items in sorted (complex_tags.items ()): continue print (" case '%s':" % initial) for lt, tags in items: + if not tags: + continue print (' if (', end='') script = lt.script region = lt.region diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index 463e7a02a..61d2814e9 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -940,6 +940,7 @@ static const LangTag ot_languages[] = { {"mnw", HB_TAG('M','O','N','T')}, /* Mon -> Thailand Mon */ {"mnx", HB_TAG_NONE }, /* Manikion != Manx */ {"mo", HB_TAG('M','O','L',' ')}, /* Moldavian (retired code) */ + {"mo", HB_TAG('R','O','M',' ')}, /* Moldavian (retired code) -> Romanian */ {"mod", HB_TAG('C','P','P',' ')}, /* Mobilian -> Creoles */ /*{"moh", HB_TAG('M','O','H',' ')},*/ /* Mohawk */ {"mok", HB_TAG_NONE }, /* Morori != Moksha */ @@ -2623,8 +2624,14 @@ hb_ot_tags_from_complex_language (const char *lang_str, && subtag_matches (lang_str, limit, "-md")) { /* Romanian; Moldova */ - tags[0] = HB_TAG('M','O','L',' '); /* Moldavian */ - *count = 1; + unsigned int i; + hb_tag_t possible_tags[] = { + HB_TAG('M','O','L',' '), /* Moldavian */ + HB_TAG('R','O','M',' '), /* Romanian */ + }; + for (i = 0; i < 2 && i < *count; i++) + tags[i] = possible_tags[i]; + *count = i; return true; } break; @@ -2930,6 +2937,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("qwh", -1); /* Huaylas Ancash Quechua */ case HB_TAG('R','A','J',' '): /* Rajasthani */ return hb_language_from_string ("raj", -1); /* Rajasthani [macrolanguage] */ + case HB_TAG('R','O','M',' '): /* Romanian */ + return hb_language_from_string ("ro", -1); /* Romanian */ case HB_TAG('R','O','Y',' '): /* Romany */ return hb_language_from_string ("rom", -1); /* Romany [macrolanguage] */ case HB_TAG('S','Q','I',' '): /* Albanian */