diff --git a/docs/TRANSLATIONS.md b/docs/TRANSLATIONS.md index 193c3438ee..81b9ed8fed 100644 --- a/docs/TRANSLATIONS.md +++ b/docs/TRANSLATIONS.md @@ -58,9 +58,10 @@ By default, it searches `strings.txt`, to check `types_strings.txt` add a `-t` o There are many more other options, e.g. print various translation statistics, validate and re-format translation files. Check `tools/python/strings_utils.py -h` to see all of them. -To check consistency of types_strings.txt with categories.txt run: +To ensure that the category definitions in `categories.txt` are consistent with their +associated translations in `strings.txt`, you can use the category consistency script. ``` -ruby tools/ruby/category_consistency/check_consistency.rb +python tools/python/category/check_consistency.py ``` ## Automatic translations diff --git a/tools/python/category/check_consistency.py b/tools/python/category/check_consistency.py new file mode 100644 index 0000000000..3017c6c082 --- /dev/null +++ b/tools/python/category/check_consistency.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +""" +This script checks for consistency between categories defined in: +1. `displayed_categories.cpp`: A C++ source file containing category keys. +2. `categories.txt`: A data file defining category details. +3. `strings.txt`: A data file containing translations of category strings. + +The script: +- Parses category keys from the C++ file. +- Loads corresponding data from `categories.txt` and `strings.txt`. +- Compares the parsed data for inconsistencies in definitions and translations. +- Prints detailed messages if inconsistencies are found. + +It exits with: +- `0` on success (all categories are consistent), +- `1` on failure (inconsistencies detected). 
import os
import re
import sys

ROOT = os.path.dirname(os.path.abspath(__file__))
OMIM_ROOT = os.path.join(ROOT, '..', '..', '..')
CPP_CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'search', 'displayed_categories.cpp')
CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'data', 'categories.txt')
STRINGS_FILENAME = os.path.join(OMIM_ROOT, 'data', 'strings', 'strings.txt')
# Captures everything between the braces of the `m_keys = {...};` initializer.
CATEGORIES_MATCHER = re.compile(r"m_keys = \{(.*?)};", re.DOTALL)


def extract_cpp_categories(filename):
    """Return the category keys listed in the `m_keys = {...};` initializer.

    Args:
        filename: Path of the C++ source file to scan.

    Returns:
        The keys in source order, with surrounding whitespace and double
        quotes removed. An empty list is returned (after printing an error)
        when the file does not exist or contains no `m_keys` initializer.
    """
    try:
        with open(filename, "r", encoding="utf-8") as cpp_file:
            content = cpp_file.read()
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return []

    match = CATEGORIES_MATCHER.search(content)
    if not match:
        print(f"Error: No categories found in {filename}.")
        return []

    tokens = (token.strip().strip('"') for token in match.group(1).split(","))
    # A trailing comma or an empty initializer produces empty tokens; they are
    # not category keys and must not be reported as "extra" categories.
    return [token for token in tokens if token]


def compare_categories(string_cats, search_cats, cpp_cats):
    """Report every mismatch between the three category sources.

    Args:
        string_cats: Mapping category -> {lang: translation} parsed from
            strings.txt.
        search_cats: Mapping category -> {lang: translation} parsed from
            categories.txt.
        cpp_cats: Category keys extracted from displayed_categories.cpp.

    Returns:
        True when nothing is missing, extra or different; False otherwise.
        A human-readable report of the findings is printed to stdout.
    """
    missing_categories = [name for name in search_cats if name not in string_cats]
    extra_categories = [cpp_cat for cpp_cat in cpp_cats if cpp_cat not in search_cats]

    inconsistent_strings = {}
    for category_name, translations in search_cats.items():
        if category_name not in string_cats:
            # Already reported through missing_categories.
            continue

        string_translations = string_cats[category_name]
        for lang, search_translation in translations.items():
            if lang not in string_translations:
                strings_value = "Missing translation"
            elif string_translations[lang] != search_translation:
                strings_value = string_translations[lang]
            else:
                continue
            inconsistent_strings.setdefault(category_name, {})[lang] = (
                strings_value,
                search_translation,
            )

    if missing_categories:
        print("\nMissing translations for categories in strings.txt:")
        for category_name in missing_categories:
            print(f" - {category_name}")

    if extra_categories:
        print("\nExtra categories found in displayed_categories.cpp but not in categories.txt:")
        for cpp_cat in extra_categories:
            print(f" - {cpp_cat}")

    if inconsistent_strings:
        print("\nInconsistent category translations:")
        for category_name, langs in inconsistent_strings.items():
            print(f"Category \"{category_name}\":")
            for lang, (strings_value, search_value) in langs.items():
                print(f" {lang}: strings.txt=\"{strings_value}\" vs categories.txt=\"{search_value}\"")

    return not (missing_categories or extra_categories or inconsistent_strings)


def check_search_categories_consistent():
    """Run the whole consistency check.

    Returns:
        0 when all three sources agree, 1 when an input file is missing,
        no category keys can be read, or any inconsistency is found.
    """
    # Imported here so the pure helpers above stay importable without the
    # sibling parser module being on sys.path.
    from omim_parsers import CategoriesParser, StringsParser, LANGUAGES

    categories_txt_parser = CategoriesParser(LANGUAGES)
    strings_txt_parser = StringsParser(LANGUAGES)

    for required in (CATEGORIES_FILENAME, STRINGS_FILENAME):
        if not os.path.exists(required):
            print(f"Error: {required} not found.")
            return 1

    cpp_categories = extract_cpp_categories(CPP_CATEGORIES_FILENAME)
    if not cpp_categories:
        # Without the C++ keys the comparison would be vacuous and could
        # report success even though one of the inputs was never read.
        print(f"Failure: No category keys could be read from {CPP_CATEGORIES_FILENAME}.")
        return 1

    search_categories = categories_txt_parser.parse_file(CATEGORIES_FILENAME)
    string_categories = strings_txt_parser.parse_file(STRINGS_FILENAME)

    if compare_categories(string_categories, search_categories, cpp_categories):
        print("Success: All categories are consistent.")
        return 0

    print("Failure: Inconsistencies found in category definitions.")
    return 1


if __name__ == "__main__":
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module and is meant for interactive use.
    sys.exit(check_search_categories_consistent())
"ro", "ru", "sk", "sv", "sw", "th", "tr", "uk", "vi", "zh-Hans", "zh-Hant" +] + + +class AbstractParser: + def __init__(self, keys: List[str]): + self.keys = keys + + def parse_line(self, line: str) -> Optional[Tuple[str, str]]: + raise NotImplementedError("You must implement parse_line.") + + def match_category(self, line: str, result: Dict[str, Dict]): + category_match = self.category().search(line) + if category_match: + category = category_match.group(1) + if category in self.keys: + if category not in result: + result[category] = {} + + def parse_file(self, filename: str) -> Dict[str, Dict]: + result = {} + current_category = None + + with open(filename, "r", encoding="utf-8") as file: + for line in file: + line = line.strip() + + # Skip comments and empty lines + if self.should_exclude_line(line): + continue + + # Match a new category + category_match = self.category().match(line) + if category_match: + current_category = category_match.group(1) + if current_category not in result: + result[current_category] = {} + continue + + # Parse translations for the current category + if current_category: + parsed = self.parse_line(line) + if parsed: + lang, translation = parsed + result[current_category].setdefault(lang, []).append(translation) + + return result + + def category(self) -> re.Pattern: + raise NotImplementedError("You must implement category.") + + def is_new_category(self, line: str) -> bool: + return bool(self.category().match(line)) + + def extract_category(self, line: str) -> Optional[str]: + match = self.category().match(line) + return match.group(1) if match else None + + def should_exclude_line(self, line: str) -> bool: + return False + + +class CategoriesParser(AbstractParser): + def parse_line(self, line: str) -> Optional[Tuple[str, str]]: + line_match = re.match(r"^([^:]+):(.+)$", line) + if line_match: + lang = line_match.group(1).strip() + translation = line_match.group(2).strip() + return lang, translation + return None + + def 
category(self) -> re.Pattern: + return re.compile(r"^([a-zA-Z0-9_-]+)\|@(.+)$") + + def should_exclude_line(self, line: str) -> bool: + return line.startswith("#") or not line + + +class StringsParser(AbstractParser): + def parse_line(self, line: str) -> Optional[Tuple[str, str]]: + line_match = re.match(r"^([^=]+)=(.*)$", line) + if line_match: + lang = line_match.group(1).strip() + translation = line_match.group(2).strip() + return lang, translation + return None + + def category(self) -> re.Pattern: + return re.compile(r"^\[([a-zA-Z0-9_]+)]$") + + def should_exclude_line(self, line: str) -> bool: + return line.startswith("tags") or not line diff --git a/tools/ruby/category_consistency/check_consistency.rb b/tools/ruby/category_consistency/check_consistency.rb deleted file mode 100755 index f94dd030e5..0000000000 --- a/tools/ruby/category_consistency/check_consistency.rb +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env ruby - -require_relative './omim_parsers' - -ROOT = File.expand_path(File.dirname(__FILE__)) -OMIM_ROOT = File.join(ROOT, '..', '..', '..') -CPP_CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'search', 'displayed_categories.cpp') -CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'data', 'categories.txt') -STRINGS_FILENAME = File.join(OMIM_ROOT, 'data', 'strings', 'strings.txt') -CATEGORIES_MATCHER = /m_keys = \{(.*)\};/m - -def load_categories_from_cpp(filename) - raw_categories = File.read(CPP_CATEGORIES_FILENAME) - match = CATEGORIES_MATCHER.match(raw_categories) - if match - cpp_categories = match[1].split(/,\s+/) - # Delete quotes - cpp_categories.map { |cat| cat.gsub!(/^"|"$/, '') } - cpp_categories - end -end - -def compare_categories(string_cats, search_cats) - inconsistent_strings = {} - - string_cats.each do |category_name, category| - if !search_cats.include? category_name - puts "Category '#{category_name}' not found in categories.txt" - next - end - category.each do |lang, translation| - if search_cats[category_name].include? 
lang - if !search_cats[category_name][lang].include? translation - not_found_cats_list = search_cats[category_name][lang] - (inconsistent_strings[category_name] ||= {})[lang] = [translation, not_found_cats_list] - end - end - end - end - - inconsistent_strings.each do |name, languages| - puts "\nInconsistent category \"#{name}\"" - languages.each do |lang, values| - string_value, category_value = values - puts "\t#{lang} : \"#{string_value}\" is not matched by #{category_value}" - end - end - inconsistent_strings.empty? -end - -def check_search_categories_consistent - cpp_categories = load_categories_from_cpp(CPP_CATEGORIES_FILENAME) - categories_txt_parser = OmimParsers::CategoriesParser.new cpp_categories - strings_txt_parser = OmimParsers::StringsParser.new cpp_categories - - search_categories = categories_txt_parser.parse_file(CATEGORIES_FILENAME) - string_categories = strings_txt_parser.parse_file(STRINGS_FILENAME) - - compare_categories(string_categories, search_categories) ? 0 : 1 -end - - -if __FILE__ == $0 - exit check_search_categories_consistent() -end diff --git a/tools/ruby/category_consistency/omim_parsers.rb b/tools/ruby/category_consistency/omim_parsers.rb deleted file mode 100644 index 440946524b..0000000000 --- a/tools/ruby/category_consistency/omim_parsers.rb +++ /dev/null @@ -1,104 +0,0 @@ -module OmimParsers - - # To update the list, run in root directory: - # sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo - LANGUAGES = %w(af ar be bg ca cs da de el en en-GB es es-MX et - eu fa fi fr fr-CA he hi hu id it ja ko lt mr nb - nl pl pt pt-BR ro ru sk sv sw th tr uk vi - zh-Hans zh-Hant) - - class AbstractParser - def initialize(keys) - @keys = keys - end - - def parse_line(line) - raise NotImplementedError.new("You must implement parse_file.") - end - - def match_category(line, result) - category_match = category.match(line) - if !category_match.nil? 
- category = category_match[1] - if @keys.include? category - result[category] ||= {} - end - end - end - - def parse_file(filename) - current_string = nil - result = {} - File.open(filename, 'r:UTF-8').each do |line| - line.strip! - next if should_exclude_line? line - - # If line is empty -> next category block started - if line.empty? - current_string = nil - next - end - - current_string ||= match_category(line, result) - - parsed = parse_line(line) - if !parsed.nil? and !current_string.nil? - lang, translation = parsed - current_string[lang] = translation - end - end - result - end - - def category - raise NotImplementedError.new("You must implement category.") - end - - def should_exclude_line?(line) - false - end - end - - class CategoriesParser < AbstractParser - def parse_line(line) - line_match = /^([^:]+):(\S+)$/u.match(line) - return if !line_match - - lang = $1.strip - return if !LANGUAGES.include? lang - - translation = $2.strip - synonyms = [] - translation.split('|').each do |token| - token_match = /\d?\^?(.*)$/.match(token) - synonyms.push(token_match[1]) if token_match - end - [lang, synonyms] - end - - def should_exclude_line?(line) - line.start_with? '#' - end - - def category - # We match only global categories ('food', 'bank'...) - /^@([A-Za-z0-9]+)$/ - end - end - - class StringsParser < AbstractParser - def parse_line(line) - line_match = /^([^=]+)=(.*)$/.match(line) - if line_match - lang = $1.strip - if LANGUAGES.include? lang - [lang, $2.strip] - end - end - end - - def category - /^\[(.+)\]/ - end - end -end