[tools] Rewrite Ruby scripts in Python for consistency

Signed-off-by: Meenbeese <meenbeese@tutanota.com>
This commit is contained in:
Meenbeese 2024-11-29 23:44:00 -05:00
parent 3c0fbeaa9c
commit d8dafb7973
5 changed files with 224 additions and 171 deletions

View file

@ -58,9 +58,10 @@ By default, it searches `strings.txt`, to check `types_strings.txt` add a `-t` o
There are many other options, e.g. printing various translation statistics, and validating and re-formatting translation files.
Check `tools/python/strings_utils.py -h` to see all of them.
To check consistency of types_strings.txt with categories.txt run:
To check that the category definitions in `categories.txt` are consistent with their
associated translations in `strings.txt`, run the category consistency script:
```
ruby tools/ruby/category_consistency/check_consistency.rb
python tools/python/category/check_consistency.py
```
## Automatic translations

View file

@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
This script checks for consistency between categories defined in:
1. `displayed_categories.cpp`: A C++ source file containing category keys.
2. `categories.txt`: A data file defining category details.
3. `strings.txt`: A data file containing translations of category strings.
The script:
- Parses category keys from the C++ file.
- Loads corresponding data from `categories.txt` and `strings.txt`.
- Compares the parsed data for inconsistencies in definitions and translations.
- Prints detailed messages if inconsistencies are found.
It exits with:
- `0` on success (all categories are consistent),
- `1` on failure (inconsistencies detected).
"""
import os
import re
from omim_parsers import CategoriesParser, StringsParser, LANGUAGES
# Directory that contains this script; every other path is derived from it.
ROOT = os.path.dirname(os.path.abspath(__file__))
# Three directory levels above this script (presumably the repository root).
OMIM_ROOT = os.path.join(ROOT, '..', '..', '..')
# C++ source file that is scanned for the `m_keys = {...};` initializer.
CPP_CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'search', 'displayed_categories.cpp')
# Data file parsed with CategoriesParser.
CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'data', 'categories.txt')
# Data file parsed with StringsParser.
STRINGS_FILENAME = os.path.join(OMIM_ROOT, 'data', 'strings', 'strings.txt')
# Captures everything between `m_keys = {` and the first following `};`.
# DOTALL lets the initializer span several lines.
CATEGORIES_MATCHER = re.compile(r"m_keys = \{(.*?)};", re.DOTALL)


def extract_cpp_categories(filename):
    """Return the category keys listed in the `m_keys = {...};` initializer.

    The initializer body is split on commas; surrounding whitespace and
    double quotes are removed from every entry.

    Args:
        filename: Path of the C++ source file to scan.

    Returns:
        The keys in source order. An empty list is returned (after printing
        an error message) when the file does not exist or contains no
        `m_keys` initializer.
    """
    if not os.path.exists(filename):
        print(f"Error: {filename} not found.")
        return []

    with open(filename, "r", encoding="utf-8") as cpp_file:
        content = cpp_file.read()

    match = CATEGORIES_MATCHER.search(content)
    if not match:
        print(f"Error: No categories found in {filename}.")
        return []

    entries = (entry.strip().strip('"') for entry in match.group(1).split(","))
    # A trailing comma or an empty `{}` initializer produces empty entries;
    # drop them so they are not reported as bogus categories later on.
    return [entry for entry in entries if entry]
def compare_categories(string_cats, search_cats, cpp_cats):
    """Report every mismatch between the three category sources.

    Args:
        string_cats: Mapping of category name -> {language: translation},
            as parsed from strings.txt.
        search_cats: Mapping with the same shape, as parsed from
            categories.txt.
        cpp_cats: Iterable of category keys taken from
            displayed_categories.cpp.

    Returns:
        True when nothing was reported, False otherwise. Findings are
        printed to standard output.
    """
    # Categories known to categories.txt that strings.txt does not define.
    absent_from_strings = [name for name in search_cats if name not in string_cats]
    # Keys used by the C++ code that categories.txt does not define.
    unknown_cpp_keys = [key for key in cpp_cats if key not in search_cats]

    # category -> language -> (value seen in strings.txt, value in categories.txt)
    mismatches = {}
    for name, expected_by_lang in search_cats.items():
        if name in string_cats:
            actual_by_lang = string_cats[name]
            for lang, expected in expected_by_lang.items():
                if lang not in actual_by_lang:
                    found = "Missing translation"
                elif actual_by_lang[lang] != expected:
                    found = actual_by_lang[lang]
                else:
                    continue
                mismatches.setdefault(name, {})[lang] = (found, expected)

    if absent_from_strings:
        print("\nMissing translations for categories in strings.txt:")
        for name in absent_from_strings:
            print(f" - {name}")

    if unknown_cpp_keys:
        print("\nExtra categories found in displayed_categories.cpp but not in categories.txt:")
        for key in unknown_cpp_keys:
            print(f" - {key}")

    if mismatches:
        print("\nInconsistent category translations:")
        for name, by_lang in mismatches.items():
            print(f"Category \"{name}\":")
            for lang, pair in by_lang.items():
                strings_value, search_value = pair
                print(f" {lang}: strings.txt=\"{strings_value}\" vs categories.txt=\"{search_value}\"")

    return not absent_from_strings and not unknown_cpp_keys and not mismatches
def check_search_categories_consistent():
    """Run the whole consistency check and return a process exit code.

    Parses categories.txt and strings.txt, extracts the category keys from
    displayed_categories.cpp and compares the three with
    compare_categories().

    Returns:
        0 when every input could be loaded and no inconsistency was found,
        1 otherwise. Diagnostics are printed to standard output.
    """
    for required_file in (CATEGORIES_FILENAME, STRINGS_FILENAME):
        if not os.path.exists(required_file):
            print(f"Error: {required_file} not found.")
            return 1

    search_categories = CategoriesParser(LANGUAGES).parse_file(CATEGORIES_FILENAME)
    string_categories = StringsParser(LANGUAGES).parse_file(STRINGS_FILENAME)

    cpp_categories = extract_cpp_categories(CPP_CATEGORIES_FILENAME)
    if not cpp_categories:
        # extract_cpp_categories() has already printed why nothing was
        # loaded. Without the C++ keys the check is incomplete, so it must
        # not be reported as a success.
        print("Failure: No displayed categories could be loaded.")
        return 1

    if compare_categories(string_categories, search_categories, cpp_categories):
        print("Success: All categories are consistent.")
        return 0

    print("Failure: Inconsistencies found in category definitions.")
    return 1
if __name__ == "__main__":
    # The bare exit() helper is injected by the `site` module for interactive
    # sessions and is missing under `python -S`; raising SystemExit always
    # works and carries the same exit status.
    raise SystemExit(check_search_categories_consistent())

View file

@ -0,0 +1,98 @@
import re
from typing import Optional, Tuple, Dict, List
# Known translation language codes, some with a region or script suffix.
# To update the list, run in the repository root (command carried over from
# the former Ruby implementation -- verify before relying on it):
#   sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo
LANGUAGES = [
"af", "ar", "be", "bg", "ca", "cs", "da", "de", "el", "en", "en-GB", "es", "es-MX", "et",
"eu", "fa", "fi", "fr", "fr-CA", "he", "hi", "hu", "id", "it", "ja", "ko", "lt", "mr", "nb",
"nl", "pl", "pt", "pt-BR", "ro", "ru", "sk", "sv", "sw", "th", "tr", "uk", "vi", "zh-Hans", "zh-Hant"
]
class AbstractParser:
    """Base class for line-oriented "category header + key/value lines" files.

    Subclasses supply the header pattern (`category`), the key/value
    splitter (`parse_line`) and, optionally, a line filter
    (`should_exclude_line`).
    """

    def __init__(self, keys: List[str]):
        """Store `keys`, the names accepted by `match_category`."""
        self.keys = keys

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        """Return `(language, translation)` for a data line, or None.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError("You must implement parse_line.")

    def match_category(self, line: str, result: Dict[str, Dict]) -> Optional[Dict]:
        """Register the category named on `line` in `result`.

        The category pattern is searched anywhere in `line`. When its first
        group is one of `self.keys`, an empty dict is created for it in
        `result` unless an entry already exists.

        Returns:
            The dict stored in `result` for the category, or None when the
            line names no category or the category is not in `self.keys`.
        """
        category_match = self.category().search(line)
        if not category_match:
            return None
        category = category_match.group(1)
        if category not in self.keys:
            return None
        return result.setdefault(category, {})

    def parse_file(self, filename: str) -> Dict[str, Dict]:
        """Parse `filename` into `{category: {language: [translations]}}`.

        Lines rejected by `should_exclude_line` are skipped. A line matching
        the category pattern starts (or re-opens) that category; every later
        line that `parse_line` understands is appended to it. Data lines
        that appear before the first category header are ignored.

        Raises:
            OSError: if the file cannot be opened.
        """
        result: Dict[str, Dict] = {}
        current_category = None
        # The pattern does not depend on the line, so fetch it only once.
        category_pattern = self.category()

        with open(filename, "r", encoding="utf-8") as file:
            for raw_line in file:
                line = raw_line.strip()

                if self.should_exclude_line(line):
                    continue

                category_match = category_pattern.match(line)
                if category_match:
                    current_category = category_match.group(1)
                    result.setdefault(current_category, {})
                    continue

                if current_category:
                    parsed = self.parse_line(line)
                    if parsed:
                        lang, translation = parsed
                        result[current_category].setdefault(lang, []).append(translation)

        return result

    def category(self) -> re.Pattern:
        """Return the compiled pattern whose first group is the category name.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError("You must implement category.")

    def is_new_category(self, line: str) -> bool:
        """Tell whether `line` matches the category pattern at its start."""
        return bool(self.category().match(line))

    def extract_category(self, line: str) -> Optional[str]:
        """Return the category name found at the start of `line`, or None."""
        match = self.category().match(line)
        return match.group(1) if match else None

    def should_exclude_line(self, line: str) -> bool:
        """Tell whether `parse_file` should skip `line`; nothing is skipped by default."""
        return False
class CategoriesParser(AbstractParser):
    """Parser for `lang:translation` lines grouped under `name|@...` headers."""

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        """Split `line` at its first colon into `(language, translation)`.

        Both parts are stripped of surrounding whitespace. None is returned
        when the line has no colon or either side of it is empty.
        """
        found = re.match(r"^([^:]+):(.+)$", line)
        if found is None:
            return None
        lang, translation = (part.strip() for part in found.groups())
        return lang, translation

    def category(self) -> re.Pattern:
        """Return the header pattern; group 1 is the name in front of `|@`."""
        header_pattern = re.compile(r"^([a-zA-Z0-9_-]+)\|@(.+)$")
        return header_pattern

    def should_exclude_line(self, line: str) -> bool:
        """Skip empty lines and lines that start with `#`."""
        return not line or line.startswith("#")
class StringsParser(AbstractParser):
    """Parser for `lang = translation` lines grouped under `[name]` headers."""

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        """Split `line` at its first `=` into `(language, translation)`.

        Both parts are stripped of surrounding whitespace; the translation
        may be empty. None is returned when there is no `=` or nothing
        precedes it.
        """
        found = re.match(r"^([^=]+)=(.*)$", line)
        if found is None:
            return None
        lang, translation = (part.strip() for part in found.groups())
        return lang, translation

    def category(self) -> re.Pattern:
        """Return the header pattern; group 1 is the name inside the brackets."""
        header_pattern = re.compile(r"^\[([a-zA-Z0-9_]+)]$")
        return header_pattern

    def should_exclude_line(self, line: str) -> bool:
        """Skip empty lines and lines that start with `tags`."""
        return not line or line.startswith("tags")

View file

@ -1,65 +0,0 @@
#!/usr/bin/env ruby
require_relative './omim_parsers'
# Directory that contains this script; every other path is derived from it.
ROOT = File.expand_path(File.dirname(__FILE__))
# Three directory levels above this script (presumably the repository root).
OMIM_ROOT = File.join(ROOT, '..', '..', '..')
# C++ source file that is scanned for the `m_keys = {...};` initializer.
CPP_CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'search', 'displayed_categories.cpp')
# Data file parsed with OmimParsers::CategoriesParser.
CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'data', 'categories.txt')
# Data file parsed with OmimParsers::StringsParser.
STRINGS_FILENAME = File.join(OMIM_ROOT, 'data', 'strings', 'strings.txt')
# Captures everything between `m_keys = {` and the last following `};`.
# The /m flag lets `.` match newlines so the initializer may span lines.
CATEGORIES_MATCHER = /m_keys = \{(.*)\};/m

# Reads the category keys listed in the `m_keys = {...};` initializer of the
# given C++ file.
#
# filename - path of the C++ source file to read.
#
# Returns the keys with their surrounding double quotes removed, or an empty
# array when the file contains no `m_keys` initializer.
# Raises Errno::ENOENT (from File.read) when the file does not exist.
def load_categories_from_cpp(filename)
  # Read the file that was asked for rather than the global constant.
  raw_categories = File.read(filename)
  match = CATEGORIES_MATCHER.match(raw_categories)
  # Callers use the result with `include?`, so never hand back nil.
  return [] unless match

  match[1].split(/,\s+/).map { |cat| cat.gsub(/^"|"$/, '') }
end
# Compares the translations from strings.txt with the synonym lists from
# categories.txt and prints every mismatch.
#
# string_cats - Hash of category name => { language => translation }.
# search_cats - Hash of category name => { language => list of synonyms }.
#
# Returns true when no translation mismatch was found. A category that is
# missing from search_cats is only reported; it does not affect the result.
def compare_categories(string_cats, search_cats)
  inconsistent_strings = {}

  string_cats.each do |category_name, category|
    unless search_cats.include?(category_name)
      puts "Category '#{category_name}' not found in categories.txt"
      next
    end

    known = search_cats[category_name]
    category.each do |lang, translation|
      # Languages absent from categories.txt are not compared.
      next unless known.include?(lang)

      candidates = known[lang]
      next if candidates.include?(translation)

      (inconsistent_strings[category_name] ||= {})[lang] = [translation, candidates]
    end
  end

  inconsistent_strings.each do |name, languages|
    puts "\nInconsistent category \"#{name}\""
    languages.each do |lang, (string_value, category_value)|
      puts "\t#{lang} : \"#{string_value}\" is not matched by #{category_value}"
    end
  end

  inconsistent_strings.empty?
end
# Loads the category keys from the C++ source, parses categories.txt and
# strings.txt restricted to those keys, and compares the two.
#
# Returns 0 when compare_categories reports no mismatch, 1 otherwise.
def check_search_categories_consistent
  keys = load_categories_from_cpp(CPP_CATEGORIES_FILENAME)

  search_categories = OmimParsers::CategoriesParser.new(keys).parse_file(CATEGORIES_FILENAME)
  string_categories = OmimParsers::StringsParser.new(keys).parse_file(STRINGS_FILENAME)

  return 0 if compare_categories(string_categories, search_categories)

  1
end
# Run the check only when this file is executed directly, and use its result
# as the process exit status.
exit(check_search_categories_consistent) if __FILE__ == $PROGRAM_NAME

View file

@ -1,104 +0,0 @@
# Parsers for the line-oriented category/translation data files.
module OmimParsers
  # To update the list, run in root directory:
  # sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo
  LANGUAGES = %w(af ar be bg ca cs da de el en en-GB es es-MX et
                 eu fa fi fr fr-CA he hi hu id it ja ko lt mr nb
                 nl pl pt pt-BR ro ru sk sv sw th tr uk vi
                 zh-Hans zh-Hant)

  # Shared parsing loop. Subclasses provide the category pattern
  # (#category), the line splitter (#parse_line) and optionally a line
  # filter (#should_exclude_line?).
  class AbstractParser
    # keys - category names that #match_category accepts.
    def initialize(keys)
      @keys = keys
    end

    # Returns [language, value] for a data line, or nil. Must be overridden.
    def parse_line(line)
      raise NotImplementedError.new("You must implement parse_line.")
    end

    # Returns the Hash stored in +result+ for the category named on +line+
    # (creating it when needed), or nil when the line names no category or
    # the category is not one of the accepted keys.
    def match_category(line, result)
      category_match = category.match(line)
      return if category_match.nil?

      name = category_match[1]
      result[name] ||= {} if @keys.include?(name)
    end

    # Parses +filename+ into { category => { language => value } }.
    # An empty line ends the current category block.
    def parse_file(filename)
      current_string = nil
      result = {}

      # The block form closes the file even if parsing raises.
      File.open(filename, 'r:UTF-8') do |file|
        file.each_line do |line|
          line.strip!
          next if should_exclude_line? line

          # If line is empty -> next category block started
          if line.empty?
            current_string = nil
            next
          end

          current_string ||= match_category(line, result)
          parsed = parse_line(line)
          if !parsed.nil? && !current_string.nil?
            lang, translation = parsed
            current_string[lang] = translation
          end
        end
      end

      result
    end

    # Returns the Regexp whose first group is the category name.
    # Must be overridden.
    def category
      raise NotImplementedError.new("You must implement category.")
    end

    # Tells whether #parse_file should skip +line+; nothing is skipped by
    # default.
    def should_exclude_line?(line)
      false
    end
  end

  # Parser for `lang:synonym|synonym|...` lines under `@name` headers.
  class CategoriesParser < AbstractParser
    # Returns [language, synonyms] for a line of a known language, or nil.
    # An optional leading digit and an optional `^` are removed from every
    # `|`-separated synonym.
    def parse_line(line)
      line_match = /^([^:]+):(\S+)$/u.match(line)
      return if !line_match

      lang = line_match[1].strip
      return if !LANGUAGES.include? lang

      translation = line_match[2].strip
      synonyms = []
      translation.split('|').each do |token|
        token_match = /\d?\^?(.*)$/.match(token)
        synonyms.push(token_match[1]) if token_match
      end
      [lang, synonyms]
    end

    # Lines starting with `#` are skipped.
    def should_exclude_line?(line)
      line.start_with? '#'
    end

    def category
      # We match only global categories ('food', 'bank'...)
      /^@([A-Za-z0-9]+)$/
    end
  end

  # Parser for `lang = translation` lines under `[name]` headers.
  class StringsParser < AbstractParser
    # Returns [language, translation] for a line of a known language, or nil.
    def parse_line(line)
      line_match = /^([^=]+)=(.*)$/.match(line)
      return unless line_match

      lang = line_match[1].strip
      [lang, line_match[2].strip] if LANGUAGES.include? lang
    end

    def category
      /^\[(.+)\]/
    end
  end
end