[tools] Rewrite Ruby scripts in Python for consistency #9791

Open
meenbeese wants to merge 1 commit from meenbeese/py-rewrite into master
5 changed files with 224 additions and 171 deletions

View file

@ -58,9 +58,10 @@ By default, it searches `strings.txt`, to check `types_strings.txt` add a `-t` o
There are many more other options, e.g. print various translation statistics, validate and re-format translation files.
Check `tools/python/strings_utils.py -h` to see all of them.
Review

But the script operates on strings.txt...

Could you please add a comment to the script describing what exactly it checks?
I've tried to make some changes to the UI search categories in categories.txt but couldn't make the script fail.

But the script operates on `strings.txt`... Could you please add a comment to the script describing what exactly it checks? I've tried to make some changes to the UI search categories in categories.txt but couldn't make the script fail.
meenbeese commented 2024-12-03 19:22:33 +00:00 (Migrated from github.com)
Review

Added a comment in the script and updated the documentation.

Added a comment in the script and updated the documentation.
To check consistency of types_strings.txt with categories.txt run:
To ensure that the category definitions in `categories.txt` are consistent with their
associated translations in `strings.txt`, you can use the category consistency script.
```
ruby tools/ruby/category_consistency/check_consistency.rb
python tools/python/category/check_consistency.py
```
## Automatic translations

View file

@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
This script checks for consistency between categories defined in:
1. `displayed_categories.cpp`: A C++ source file containing category keys.
2. `categories.txt`: A data file defining category details.
3. `strings.txt`: A data file containing translations of category strings.
The script:
- Parses category keys from the C++ file.
- Loads corresponding data from `categories.txt` and `strings.txt`.
- Compares the parsed data for inconsistencies in definitions and translations.
- Prints detailed messages if inconsistencies are found.
It exits with:
- `0` on success (all categories are consistent),
- `1` on failure (inconsistencies detected).
"""
import os
import re
from typing import Dict, List

from omim_parsers import CategoriesParser, StringsParser, LANGUAGES
# Absolute path of the directory that contains this script.
ROOT = os.path.dirname(os.path.abspath(__file__))
# Three directory levels above ROOT (presumably the repository root -- confirm).
OMIM_ROOT = os.path.join(ROOT, '..', '..', '..')
# The three inputs that are cross-checked against each other.
CPP_CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'search', 'displayed_categories.cpp')
CATEGORIES_FILENAME = os.path.join(OMIM_ROOT, 'data', 'categories.txt')
STRINGS_FILENAME = os.path.join(OMIM_ROOT, 'data', 'strings', 'strings.txt')
# Captures (non-greedily) everything between the braces of `m_keys = {...};`.
CATEGORIES_MATCHER = re.compile(r"m_keys = \{(.*?)};", re.DOTALL)


def extract_cpp_categories(filename: str) -> List[str]:
    """Return the category keys listed in the `m_keys = {...};` initializer.

    Args:
        filename: Path of the C++ source file to scan.

    Returns:
        The comma-separated items of the initializer with surrounding
        whitespace and double quotes removed. An empty list is returned (and
        an error is printed) when the file does not exist or contains no
        `m_keys` initializer.
    """
    if not os.path.exists(filename):
        print(f"Error: {filename} not found.")
        return []

    with open(filename, "r", encoding="utf-8") as cpp_file:
        content = cpp_file.read()

    match = CATEGORIES_MATCHER.search(content)
    if not match:
        print(f"Error: No categories found in {filename}.")
        return []

    cleaned = (item.strip().strip('"') for item in match.group(1).split(","))
    # A trailing comma or an empty initializer produces empty items; keeping
    # them would report a bogus '' category later on.
    return [category for category in cleaned if category]
def compare_categories(
    string_cats: Dict[str, Dict[str, List[str]]],
    search_cats: Dict[str, Dict[str, List[str]]],
    cpp_cats: List[str],
) -> bool:
    """Print every mismatch between the three category sources.

    Review notes from the pull-request discussion, both applied here:
    - "Please add type annotations like you did in omim_parsers.py";
    - "You can simplify this code by using `set` collection", e.g.
      `missing_categories = search_cats.keys() - set(string_cats)` and
      `extra_categories = set(cpp_cats) - set(search_cats)`.

    Args:
        string_cats: Categories parsed from strings.txt, mapping
            category name -> language -> translations.
        search_cats: Categories parsed from categories.txt, same layout.
        cpp_cats: Category keys extracted from displayed_categories.cpp.

    Returns:
        True when nothing is missing, extra or inconsistent; False otherwise.
    """
    # sorted() keeps the report deterministic; the set difference also means
    # a key repeated in cpp_cats is reported only once.
    missing_categories = sorted(search_cats.keys() - set(string_cats))
    extra_categories = sorted(set(cpp_cats) - set(search_cats))

    # category name -> language -> (value in strings.txt, value in categories.txt)
    inconsistent_strings = {}
    for category_name, translations in search_cats.items():
        if category_name not in string_cats:
            # Already reported through missing_categories.
            continue
        string_translations = string_cats[category_name]
        for lang, search_translation in translations.items():
            if lang not in string_translations:
                strings_value = "Missing translation"
            elif string_translations[lang] != search_translation:
                strings_value = string_translations[lang]
            else:
                continue
            inconsistent_strings.setdefault(category_name, {})[lang] = (
                strings_value,
                search_translation,
            )

    if missing_categories:
        print("\nMissing translations for categories in strings.txt:")
        for category_name in missing_categories:
            print(f" - {category_name}")

    if extra_categories:
        print("\nExtra categories found in displayed_categories.cpp but not in categories.txt:")
        for cpp_cat in extra_categories:
            print(f" - {cpp_cat}")

    if inconsistent_strings:
        print("\nInconsistent category translations:")
        for category_name, langs in inconsistent_strings.items():
            print(f"Category \"{category_name}\":")
            for lang, (strings_value, search_value) in langs.items():
                print(f" {lang}: strings.txt=\"{strings_value}\" vs categories.txt=\"{search_value}\"")

    return not (missing_categories or extra_categories or inconsistent_strings)
def check_search_categories_consistent() -> int:
    """Run the consistency check over the three input files.

    Returns:
        0 when all categories are consistent, 1 when an input file is missing,
        no categories could be read from the C++ file, or inconsistencies
        were found. The value is meant to be used as the process exit code.
    """
    for required in (CATEGORIES_FILENAME, STRINGS_FILENAME):
        if not os.path.exists(required):
            print(f"Error: {required} not found.")
            return 1

    search_categories = CategoriesParser(LANGUAGES).parse_file(CATEGORIES_FILENAME)
    string_categories = StringsParser(LANGUAGES).parse_file(STRINGS_FILENAME)

    cpp_categories = extract_cpp_categories(CPP_CATEGORIES_FILENAME)
    if not cpp_categories:
        # An empty list means the C++ file was missing or had no usable
        # m_keys initializer; without it the C++ side cannot be verified, so
        # the check must not report success.
        print(f"Error: No categories could be loaded from {CPP_CATEGORIES_FILENAME}.")
        return 1

    if compare_categories(string_categories, search_categories, cpp_categories):
        print("Success: All categories are consistent.")
        return 0

    print("Failure: Inconsistencies found in category definitions.")
    return 1
if __name__ == "__main__":
    # The bare exit() helper is injected by the `site` module for interactive
    # sessions and is absent under `python -S`; raising SystemExit sets the
    # same exit status without needing any import.
    raise SystemExit(check_search_categories_consistent())

View file

@ -0,0 +1,98 @@
import re
from typing import Optional, Tuple, Dict, List
# Language codes shared by the parsers below.
# The Ruby version of this module documented how to regenerate the list
# (run in the repository root):
# sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo
LANGUAGES = [
"af", "ar", "be", "bg", "ca", "cs", "da", "de", "el", "en", "en-GB", "es", "es-MX", "et",
"eu", "fa", "fi", "fr", "fr-CA", "he", "hi", "hu", "id", "it", "ja", "ko", "lt", "mr", "nb",
"nl", "pl", "pt", "pt-BR", "ro", "ru", "sk", "sv", "sw", "th", "tr", "uk", "vi", "zh-Hans", "zh-Hant"
]
class AbstractParser:
    """Base class for parsers of category-structured text files.

    Subclasses supply `category()` (a regex whose first group is the category
    name) and `parse_line()` (splits a translation line into its parts), and
    may override `should_exclude_line()` to skip lines entirely.
    """

    def __init__(self, keys: List[str]):
        # Only consulted by match_category(); parse_file() does not filter on it.
        self.keys = keys

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        """Return `(lang, translation)` for a translation line, else None."""
        raise NotImplementedError("You must implement parse_line.")

    def match_category(self, line: str, result: Dict[str, Dict]) -> None:
        """Add an empty entry to `result` if `line` names a category in `self.keys`."""
        category_match = self.category().search(line)
        if category_match:
            category = category_match.group(1)
            if category in self.keys and category not in result:
                result[category] = {}

    def parse_file(self, filename: str) -> Dict[str, Dict]:
        """Parse `filename` into `{category: {lang: [translation, ...]}}`.

        Lines seen before the first category header are ignored.
        """
        result: Dict[str, Dict] = {}
        current_category = None
        # PR review note: "no need to call self.category() for every line" --
        # the pattern is fetched once here instead of inside the loop.
        category_regex = self.category()
        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()

                # Skip whatever the subclass declares irrelevant
                if self.should_exclude_line(line):
                    continue

                # Match a new category
                category_match = category_regex.match(line)
                if category_match:
                    current_category = category_match.group(1)
                    result.setdefault(current_category, {})
                    continue

                # Parse translations for the current category
                if current_category:
                    parsed = self.parse_line(line)
                    if parsed:
                        lang, translation = parsed
                        result[current_category].setdefault(lang, []).append(translation)
        return result

    # PR review note: "Let's rename method from `category(self)` to
    # `category_re(self)`." The name is kept here because subclasses override
    # the method as `category`; the rename has to be done in all of them at once.
    def category(self) -> re.Pattern:
        """Return the compiled pattern whose first group is the category name."""
        raise NotImplementedError("You must implement category.")

    def is_new_category(self, line: str) -> bool:
        """Tell whether `line` is a category header."""
        return bool(self.category().match(line))

    def extract_category(self, line: str) -> Optional[str]:
        """Return the category name from a header line, or None if it is not one."""
        match = self.category().match(line)
        return match.group(1) if match else None

    def should_exclude_line(self, line: str) -> bool:
        """Tell whether `line` must be skipped; the base class skips nothing."""
        return False
class CategoriesParser(AbstractParser):
    """Parser whose headers look like `<name>|@<rest>` and whose translation
    lines look like `<lang>:<translation>`."""

    def should_exclude_line(self, line: str) -> bool:
        # Blank lines and `#` comment lines carry no data.
        if not line:
            return True
        return line.startswith("#")

    def category(self) -> re.Pattern:
        header_pattern = r"^([a-zA-Z0-9_-]+)\|@(.+)$"
        return re.compile(header_pattern)

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        found = re.match(r"^([^:]+):(.+)$", line)
        if found is None:
            return None
        # Group 1 is the language code, group 2 the translation text.
        lang, translation = (part.strip() for part in found.groups())
        return lang, translation
class StringsParser(AbstractParser):
    """Parser whose headers look like `[name]` and whose translation lines
    look like `<lang> = <translation>`."""

    def should_exclude_line(self, line: str) -> bool:
        # Blank lines and lines starting with "tags" carry no translations.
        if not line:
            return True
        return line.startswith("tags")

    def category(self) -> re.Pattern:
        header_pattern = r"^\[([a-zA-Z0-9_]+)]$"
        return re.compile(header_pattern)

    def parse_line(self, line: str) -> Optional[Tuple[str, str]]:
        found = re.match(r"^([^=]+)=(.*)$", line)
        if found is None:
            return None
        # Group 1 is the language code, group 2 the (possibly empty) translation.
        lang, translation = (part.strip() for part in found.groups())
        return lang, translation

View file

@ -1,65 +0,0 @@
#!/usr/bin/env ruby
require_relative './omim_parsers'
# Absolute path of the directory that contains this script.
ROOT = File.expand_path(File.dirname(__FILE__))
# Three directory levels above ROOT (presumably the repository root -- confirm).
OMIM_ROOT = File.join(ROOT, '..', '..', '..')
# Input files that are cross-checked against each other.
CPP_CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'search', 'displayed_categories.cpp')
CATEGORIES_FILENAME = File.join(OMIM_ROOT, 'data', 'categories.txt')
STRINGS_FILENAME = File.join(OMIM_ROOT, 'data', 'strings', 'strings.txt')
# Captures everything between the braces of `m_keys = {...};`; /m lets `.` span newlines.
CATEGORIES_MATCHER = /m_keys = \{(.*)\};/m
# Extracts the category keys listed in the `m_keys = {...};` initializer of a
# C++ source file.
#
# @param filename [String] path of the C++ file to scan (previously ignored in
#   favour of the CPP_CATEGORIES_FILENAME constant)
# @return [Array<String>] the keys with surrounding whitespace and double
#   quotes removed; empty when the file has no `m_keys` initializer, so callers
#   can always treat the result as a collection
def load_categories_from_cpp(filename)
  match = CATEGORIES_MATCHER.match(File.read(filename))
  return [] unless match

  match[1]
    .split(',')
    .map { |cat| cat.strip.gsub(/\A"|"\z/, '') }
    .reject(&:empty?) # a trailing comma would otherwise yield an empty key
end
# Checks that every translation in +string_cats+ appears among the values
# listed for the same category and language in +search_cats+.
#
# Categories missing from +search_cats+ are reported but do not fail the
# check; languages missing from a known category are ignored.
#
# @return [Boolean] true when no mismatching translation was found
def compare_categories(string_cats, search_cats)
  mismatches = {}

  string_cats.each do |name, translations|
    unless search_cats.include?(name)
      puts "Category '#{name}' not found in categories.txt"
      next
    end

    known = search_cats[name]
    translations.each do |lang, text|
      next unless known.include?(lang)

      candidates = known[lang]
      next if candidates.include?(text)

      (mismatches[name] ||= {})[lang] = [text, candidates]
    end
  end

  mismatches.each do |name, by_lang|
    puts "\nInconsistent category \"#{name}\""
    by_lang.each do |lang, (string_value, category_value)|
      puts "\t#{lang} : \"#{string_value}\" is not matched by #{category_value}"
    end
  end

  mismatches.empty?
end
# Loads the category keys from the C++ source, parses categories.txt and
# strings.txt restricted to those keys, and compares the two results.
#
# @return [Integer] 0 when the comparison succeeds, 1 otherwise
def check_search_categories_consistent
  keys = load_categories_from_cpp(CPP_CATEGORIES_FILENAME)
  categories_parser = OmimParsers::CategoriesParser.new(keys)
  strings_parser = OmimParsers::StringsParser.new(keys)

  from_categories = categories_parser.parse_file(CATEGORIES_FILENAME)
  from_strings = strings_parser.parse_file(STRINGS_FILENAME)

  return 0 if compare_categories(from_strings, from_categories)

  1
end
# Run the check only when this file is executed directly, using its result as
# the process exit status.
exit(check_search_categories_consistent) if __FILE__ == $PROGRAM_NAME

View file

@ -1,104 +0,0 @@
# Parsers for the line-oriented category and translation data files.
module OmimParsers
  # To update the list, run in root directory:
  # sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo
  LANGUAGES = %w[af ar be bg ca cs da de el en en-GB es es-MX et
                 eu fa fi fr fr-CA he hi hu id it ja ko lt mr nb
                 nl pl pt pt-BR ro ru sk sv sw th tr uk vi
                 zh-Hans zh-Hant].freeze

  # Shared file-walking logic. Subclasses provide #category (a Regexp whose
  # first group is the category name) and #parse_line.
  class AbstractParser
    # @param keys [Array<String>] category names to collect; others are skipped
    def initialize(keys)
      @keys = keys
    end

    # @return [Array, nil] a [lang, translation] pair, or nil if the line is
    #   not a usable translation line
    def parse_line(line)
      raise NotImplementedError, 'You must implement parse_line.'
    end

    # Registers the category named on +line+ in +result+.
    #
    # @return [Hash, nil] the hash collecting the category's translations, or
    #   nil when the line is not a category header or is not in the keys
    def match_category(line, result)
      category_match = category.match(line)
      return unless category_match

      name = category_match[1]
      result[name] ||= {} if @keys.include?(name)
    end

    # Parses +filename+ into { category => { lang => translation } }.
    # An empty line ends the current category block.
    def parse_file(filename)
      result = {}
      current_string = nil

      # Block form closes the file even if parsing raises.
      File.open(filename, 'r:UTF-8') do |file|
        file.each_line do |raw_line|
          line = raw_line.strip
          next if should_exclude_line?(line)

          # If line is empty -> next category block started
          if line.empty?
            current_string = nil
            next
          end

          current_string ||= match_category(line, result)
          parsed = parse_line(line)
          next if parsed.nil? || current_string.nil?

          lang, translation = parsed
          current_string[lang] = translation
        end
      end

      result
    end

    # @return [Regexp] pattern whose first group is the category name
    def category
      raise NotImplementedError, 'You must implement category.'
    end

    # @return [Boolean] whether the line must be skipped; nothing by default
    def should_exclude_line?(line)
      false
    end
  end

  # Parses `lang:synonym|synonym` lines grouped under `@category` headers.
  class CategoriesParser < AbstractParser
    # @return [Array(String, Array<String>), nil] the language and its
    #   synonyms with any leading digit and `^` marker removed
    def parse_line(line)
      # NOTE(review): \S+ means values containing whitespace are not matched;
      # confirm this is intentional.
      line_match = /^([^:]+):(\S+)$/u.match(line)
      return unless line_match

      lang = line_match[1].strip
      return unless LANGUAGES.include?(lang)

      synonyms = line_match[2].strip.split('|').map { |token| token[/\d?\^?(.*)$/, 1] }.compact
      [lang, synonyms]
    end

    def should_exclude_line?(line)
      line.start_with?('#')
    end

    def category
      # We match only global categories ('food', 'bank'...)
      /^@([A-Za-z0-9]+)$/
    end
  end

  # Parses `lang = translation` lines grouped under `[category]` headers.
  class StringsParser < AbstractParser
    # @return [Array(String, String), nil] the pair, or nil when the line has
    #   no `=` or the language is not in LANGUAGES
    def parse_line(line)
      line_match = /^([^=]+)=(.*)$/.match(line)
      return unless line_match

      lang = line_match[1].strip
      [lang, line_match[2].strip] if LANGUAGES.include?(lang)
    end

    def category
      /^\[(.+)\]/
    end
  end
end