diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index cb612b982..fa98d29de 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser): from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` inverted. Its values start as unsorted sets; ``sort_languages`` converts them to sorted lists. + from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): + A copy of ``from_bcp_47``. It starts as ``None`` and is + populated at the beginning of the first call to + ``inherit_from_macrolanguages``. """ def __init__ (self): @@ -338,6 +342,7 @@ class OpenTypeRegistryParser (HTMLParser): self.ranks = collections.defaultdict (int) self.to_bcp_47 = collections.defaultdict (set) self.from_bcp_47 = collections.defaultdict (set) + self.from_bcp_47_uninherited = None # Whether the parser is in a <td> element self._td = False # Whether the parser is after a <br>
element within the current <td> element @@ -463,29 +468,24 @@ class OpenTypeRegistryParser (HTMLParser): to SQI. If a BCP 47 tag for a macrolanguage has no OpenType mapping but - all of its individual languages do and they all map to the same - tags, the mapping is copied to the macrolanguage. + some of its individual languages do, their mappings are copied + to the macrolanguage. """ global bcp_47 - original_ot_from_bcp_47 = dict (self.from_bcp_47) + first_time = self.from_bcp_47_uninherited is None + if first_time: + self.from_bcp_47_uninherited = dict (self.from_bcp_47) for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): - ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) + ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ())) if ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages: for language in languages: self.add_language (language, ot_macrolanguage) self.ranks[ot_macrolanguage] += 1 - else: + elif first_time: for language in languages: - if language in original_ot_from_bcp_47: - if ot_macrolanguages: - ml = original_ot_from_bcp_47[language] - if ml: - ot_macrolanguages &= ml - else: - pass - else: - ot_macrolanguages |= original_ot_from_bcp_47[language] + if language in self.from_bcp_47_uninherited: + ot_macrolanguages |= self.from_bcp_47_uninherited[language] else: ot_macrolanguages.clear () if not ot_macrolanguages: @@ -1121,7 +1121,11 @@ def verify_disambiguation_dict (): elif len (primary_tags) == 0: expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) else: - macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]') + original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] + if len (original_languages) == 1: + macrolanguages = original_languages + else: + macrolanguages = [t for t in primary_tags if 
bcp_47.scopes.get (t) == ' [macrolanguage]'] if len (macrolanguages) != 1: macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]') if len (macrolanguages) != 1: diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index c5ec1518b..f6e9238c7 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -256,6 +256,8 @@ static const LangTag ot_languages[] = { {"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */ {"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */ {"chk", HB_TAG('C','H','K','0')}, /* Chuukese */ + {"chm", HB_TAG('H','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> High Mari */ + {"chm", HB_TAG('L','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> Low Mari */ {"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */ /*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */ {"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */ @@ -1311,6 +1313,9 @@ static const LangTag ot_languages[] = { {"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */ /*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */ {"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */ + {"sh", HB_TAG('B','O','S',' ')}, /* Serbo-Croatian [macrolanguage] -> Bosnian */ + {"sh", HB_TAG('H','R','V',' ')}, /* Serbo-Croatian [macrolanguage] -> Croatian */ + {"sh", HB_TAG('S','R','B',' ')}, /* Serbo-Croatian [macrolanguage] -> Serbian */ {"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */ {"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */ {"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */ @@ -2841,6 +2846,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */ case HB_TAG('G','O','N',' '): /* Gondi */ return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */ + case HB_TAG('H','M','A',' '): /* High Mari */ + return hb_language_from_string ("mrj", -1); /* Western Mari */ case HB_TAG('H','M','N',' '): /* Hmong */ return 
hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */ case HB_TAG('H','N','D',' '): /* Hindko */ @@ -2881,6 +2888,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("uki", -1); /* Kui (India) */ case HB_TAG('K','U','R',' '): /* Kurdish */ return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */ + case HB_TAG('L','M','A',' '): /* Low Mari */ + return hb_language_from_string ("mhr", -1); /* Eastern Mari */ case HB_TAG('L','U','H',' '): /* Luyia */ return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */ case HB_TAG('L','V','I',' '): /* Latvian */