forked from organicmaps/organicmaps
[tools] Update find_untranslated_strings.py
- introduced CLI
- added languages stats
- filter output by selected languages
- plurals support
- more thorough file format check
- added validation mode
- preliminary "ref=" support
- general refactoring
- beautified by autopep8

Needed-for: #1703
Signed-off-by: Konstantin Pastbin <konstantin.pastbin@gmail.com>
parent a107eb9084
commit 51ac4c26a5
2 changed files with 433 additions and 171 deletions
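With the CLI introduced here (see get_args() in the second file below), the tool is driven by flags instead of dumping every report at once: for example, an invocation along the lines of `python3 <script>.py --validate --languages en,de` would validate strings.txt for two languages, and `python3 <script>.py -t -o /tmp/types_strings.txt` would write a formatted copy of types_strings.txt. These command lines are illustrative only; the flag names come from the diff, but the script path does not appear in this extract.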
@@ -128,11 +128,11 @@ def parenthesize(strings):


 def write_filtered_strings_txt(filtered, filepath, languages=None):
     logging.info("Writing strings to file {0}".format(filepath))
-    strings_txt = StringsTxt()
+    strings_txt = StringsTxt("{0}/{1}".format(OMIM_ROOT, StringsTxt.STRINGS_TXT_PATH))
     strings_dict = {key : dict(strings_txt.translations[key]) for key in filtered}
     strings_txt.translations = strings_dict
     strings_txt.comments_and_tags = {}
-    strings_txt.write_formatted(filepath, languages=languages)
+    strings_txt.write_formatted(target_file=filepath, langs=languages)


 def get_args():
@@ -201,7 +201,7 @@ def get_args():
     parser.add_argument(
         "-ct", "--categories",
-        dest="hardcoded_cagegories",
+        dest="hardcoded_categories",
         default="{0}/data/hardcoded_categories.txt".format(find_omim()),
         help="""Path to the list of the categories that are displayed in the
         interface, but are not taken from strings.txt"""
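For context on the dest fix above: argparse's dest sets the attribute name on the parsed namespace, so the misspelled dest="hardcoded_cagegories" forced an equally misspelled attribute lookup, corrected in the last hunk of this file. A minimal standalone illustration (hypothetical values):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    # dest names the attribute under which the parsed value is stored
    parser.add_argument("-ct", "--categories", dest="hardcoded_categories")
    args = parser.parse_args(["-ct", "data/hardcoded_categories.txt"])
    print(args.hardcoded_categories)  # -> data/hardcoded_categories.txt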
@@ -261,13 +261,13 @@ def do_single(args):
     filtered.update(android)
     filtered.update(core)

-    strings_txt = StringsTxt()
+    strings_txt = StringsTxt("{0}/{1}".format(OMIM_ROOT, StringsTxt.STRINGS_TXT_PATH))
     strings_txt.translations = {key: dict(strings_txt.translations[key]) for key in filtered}

     strings_txt.comments_and_tags = new_comments_and_tags(strings_txt, filtered, new_tags)

     path = args.output if isabs(args.output) else "{0}/{1}".format(OMIM_ROOT, args.output)
-    strings_txt.write_formatted(languages=args.langs, target_file=path)
+    strings_txt.write_formatted(target_file=path, langs=args.langs)

     if args.generate:
         exec_shell(
@@ -325,7 +325,7 @@ if __name__ == "__main__":
     OMIM_ROOT=args.omim_root

     HARDCODED_CATEGORIES = read_hardcoded_categories(
-        args.hardcoded_cagegories
+        args.hardcoded_categories
     )

     args.langs = set(args.langs) if args.langs else None
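Both call sites above now build StringsTxt from an explicit path and use the renamed keyword arguments of write_formatted(). A usage sketch under the same assumptions (OMIM_ROOT is a hypothetical checkout path; the class and its constants are defined in the diff below):

    OMIM_ROOT = "/home/user/organicmaps"  # hypothetical checkout root
    strings_txt = StringsTxt("{0}/{1}".format(OMIM_ROOT, StringsTxt.STRINGS_TXT_PATH))
    strings_txt.write_formatted(target_file="/tmp/strings.txt", langs=["en", "de"])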
@@ -1,39 +1,47 @@
 #!/usr/bin/env python3

+from argparse import ArgumentParser
 from collections import namedtuple, defaultdict
 from itertools import combinations
-from os.path import join, dirname
+from os.path import join, dirname, abspath, isabs
 import re
-from sys import argv
+import sys

-TransAndKey = namedtuple("TransAndKey", "translation, key")
-
-TRANSLATION = re.compile(r"(.*)\s*=\s*.*$", re.S | re.MULTILINE)
-MANY_DOTS = re.compile(r"\.{4,}")
-SPACE_PUNCTUATION = re.compile(r"\s[.,?!:;]")
-PLACEHOLDERS = re.compile(r"(%\d*\$@|%[@dqus]|\^)")
-
-SIMILARITY_THRESHOLD = 20.0 #%


 class StringsTxt:
+    STRINGS_TXT_PATH = "data/strings/strings.txt"
+    TYPES_STRINGS_TXT_PATH = "data/strings/types_strings.txt"

-    def __init__(self, strings_path=None):
-        if not strings_path:
-            self.strings_path = join(dirname(argv[0]), "..", "..", "data", "strings", "strings.txt")
-        else:
-            self.strings_path = strings_path
+    SECTION = re.compile(r"\[\[\w+.*\]\]")
+    DEFINITION = re.compile(r"\[[\w.]+\]")
+    LANG_KEY = re.compile(r"^[a-z]{2}(-[a-zA-Z]{2,4})?(:[a-z]+)?$")
+    TRANSLATION = re.compile(r"^\s*\S+\s*=\s*\S+.*$", re.S | re.MULTILINE)
+    MANY_DOTS = re.compile(r"\.{4,}")
+    SPACE_PUNCTUATION = re.compile(r"\s[.,?!:;]")
+    PLACEHOLDERS = re.compile(r"(%\d*\$@|%[@dqus]|\^)")

-        self.translations = defaultdict(lambda: defaultdict(str)) # dict<key, dict<lang, translation>>
-        self.translations_by_language = defaultdict(dict) # dict<lang, dict<key, translation>>
-        self.comments_and_tags = defaultdict(dict)
-        self.with_english = []
-        self.all_langs = set()
-        self.duplicates = {} # dict<lang, TransAndKey>
+    PLURAL_KEYS = frozenset(("zero", "one", "two", "few", "many", "other"))
+    SIMILARITY_THRESHOLD = 20.0  # %
+
+    TransAndKey = namedtuple("TransAndKey", "translation, key")
+
+    def __init__(self, strings_path):
+        self.strings_path = strings_path
+
+        # dict<key, dict<lang, translation>>
+        self.translations = defaultdict(lambda: defaultdict(str))
+        self.translations_by_language = defaultdict(
+            dict)  # dict<lang, dict<key, translation>>
+        self.comments_and_tags = defaultdict(
+            dict)  # dict<lang, dict<key, value>>
+        self.all_langs = set()  # including plural keys, e.g. en:few
+        self.langs = set()  # without plural keys
+        self.duplicates = {}  # dict<lang, TransAndKey>
         self.keys_in_order = []
-        self._read_file()
+        self.validation_errors = False
+
+        self._read_file()

     def process_file(self):
         self._populate_translations_by_langs()
@@ -43,63 +51,154 @@ class StringsTxt:
         self.similarity_indices = []
         self._find_most_similar()

     def add_translation(self, translation, key, lang):
         if key not in self.keys_in_order:
             self.keys_in_order.append(key)
         self.translations[key][lang] = translation
         self.all_langs.add(lang)
+        lang, plural_key = self._parse_lang(lang)
+        self.langs.add(lang)

     def append_to_translation(self, key, lang, tail):
         self.translations[key][lang] = self.translations[key][lang] + tail

     def _read_file(self):
         with open(self.strings_path, encoding='utf-8') as strings:
             for line in strings:
                 line = line.strip()
                 if not line:
                     continue
-                if line.startswith("[["):
+                if self.SECTION.match(line):
                     self.keys_in_order.append(line)
                     continue
-                if line.startswith("["):
-                    # if line in self.translations:
-                    #     print("Duplicate key {}".format(line))
-                    #     continue
+                if self.DEFINITION.match(line):
+                    if line in self.translations:
+                        self._print_validation_issue(
+                            "Duplicate definition: {0}".format(line))
                     self.translations[line] = {}
                     current_key = line
                     if current_key not in self.keys_in_order:
                         self.keys_in_order.append(current_key)
                     continue

-                if TRANSLATION.match(line):
+                if self.TRANSLATION.match(line):
                     lang, tran = self._parse_lang_and_translation(line)

-                    if lang == "comment" or lang == "tags":
+                    if lang == "comment" or lang == "tags" or lang == "ref":
                         self.comments_and_tags[current_key][lang] = tran
                         continue

                     self.translations[current_key][lang] = tran
                     self.all_langs.add(lang)
-                    if line.startswith("en = "):
-                        self.with_english.append(current_key)
-                    continue
+                    lang, plural_key = self._parse_lang(lang)
+                    self.langs.add(lang)
+
+                else:
+                    self._print_validation_issue(
+                        "Couldn't parse line: {0}".format(line))

-    def print_statistics(self):
-        stats = [(x, len(self.translations[x])) for x in list(self.translations.keys())]
-        stats.sort(key=lambda x: x[1], reverse=True)
+    def print_languages_stats(self, langs=None):
+        self._print_header("Languages statistics")
+        print("All languages in the file ({0} total):\n{1}\n".format(
+            len(self.langs), ",".join(sorted(self.langs)))
+        )
+        print("Regional languages:\n{0}\n".format(
+            ",".join([lang for lang in sorted(self.langs) if len(lang) > 2]))
+        )
+        print("Languages using plurals:\n{0}\n".format(
+            ",".join([lang for lang in sorted(self.all_langs) if lang.find(":") > -1]))
+        )

-        for k, v in stats:
-            print("{0}\t{1}".format(k, v))
+        self.print_invalid_languages()

+        print_plurals = True
+        if not langs:
+            print_plurals = False
+            langs = self.langs

-    def print_duplicates(self):
-        print(self._header("Duplicates:"))
-        for lang, trans_and_keys in list(self.duplicates.items()):
-            print("{0}\n {1}\n".format("=" * (len(lang) + 2), lang))
+        langs_stats = []
+        plurals_stats = defaultdict(dict)  # dict<lang, dict<plural, int>>
+        for lang in langs:
+            lang_defs = set()
+            if lang in self.translations_by_language:
+                lang_defs = set(self.translations_by_language[lang].keys())
+                plurals_stats[lang][lang] = len(lang_defs)
+            for plural_key in self.PLURAL_KEYS:
+                lang_plural = "{0}:{1}".format(lang, plural_key)
+                if lang_plural in self.translations_by_language:
+                    plural_defs = set(
+                        self.translations_by_language[lang_plural].keys())
+                    plurals_stats[lang][lang_plural] = len(plural_defs)
+                    lang_defs = lang_defs.union(plural_defs)
+            langs_stats.append((lang, len(lang_defs)))
+
+        print("\nNumber of translations out of total:\n")
+
+        langs_stats.sort(key=lambda x: x[1], reverse=True)
+
+        n_trans = len(self.translations)
+        for lang, lang_stat in langs_stats:
+            print("{0:7} : {1} / {2} ({3}%)".format(
+                lang, lang_stat, n_trans, round(100 * lang_stat / n_trans)
+            ))
+            if print_plurals and not (len(plurals_stats[lang]) == 1 and lang in plurals_stats[lang]):
+                for lang_plural, plural_stat in plurals_stats[lang].items():
+                    print(" {0:13} : {1}".format(lang_plural, plural_stat))
+
+    def print_invalid_languages(self):
+        invalid_langs = []
+        invalid_plurals = []
+        for lang in self.all_langs:
+            if not self.LANG_KEY.match(lang):
+                invalid_langs.append(lang)
+            lang_key, plural_key = self._parse_lang(lang)
+            if plural_key and plural_key not in self.PLURAL_KEYS:
+                invalid_plurals.append(lang)
+
+        if invalid_langs:
+            self._print_validation_issue("Invalid languages: {0}".format(
+                ",".join(sorted(invalid_langs))
+            ))
+
+        if invalid_plurals:
+            self._print_validation_issue("Invalid plurals: {0}".format(
+                ",".join(sorted(invalid_plurals))
+            ))
+
+    def print_definitions_stats(self, langs=None):
+        self._print_header("Definitions stats")
+        print("Number of translations out of total:\n")
+        if not langs:
+            langs = self.langs
+        def_stats = []
+        for definition in self.translations.keys():
+            def_langs = set()
+            for def_lang in self.translations[definition].keys():
+                def_lang, plural_key = self._parse_lang(def_lang)
+                if def_lang in langs:
+                    def_langs.add(def_lang)
+            def_stats.append((definition, len(def_langs)))
+        def_stats.sort(key=lambda x: x[1], reverse=True)
+
+        n_langs = len(langs)
+        for definition, n_trans in def_stats:
+            print("{0}\t{1} / {2} ({3}%)".format(
+                definition, n_trans, n_langs, round(100 * n_trans / n_langs)
+            ))
+
+    def print_duplicates(self, langs=None):
+        self._print_header("Duplicate translations")
+        print("Same translations used in several definitions:")
+        langs = self._expand_plurals(langs) if langs else self.all_langs
+        dups = list(self.duplicates.items())
+        dups.sort(key=lambda x: x[0])
+        for lang, trans_and_keys in dups:
+            if lang not in langs:
+                continue
+            print("\nLanguage: {0}".format(lang))
             last_one = ""
             keys = []
             for tr in trans_and_keys:
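The LANG_KEY pattern and the lang:plural convention introduced in this hunk can be checked in isolation; a small sketch with the pattern copied from the diff:

    import re

    LANG_KEY = re.compile(r"^[a-z]{2}(-[a-zA-Z]{2,4})?(:[a-z]+)?$")

    for lang in ("en", "pt-BR", "zh-Hans", "en:few", "english"):
        print(lang, bool(LANG_KEY.match(lang)))
    # "en", "pt-BR", "zh-Hans" and "en:few" match; "english" is rejected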
@@ -110,41 +209,47 @@ class StringsTxt:
                 keys.append(tr.key)
             self._print_keys_for_duplicates(keys, last_one)

     def _print_keys_for_duplicates(self, keys, last_one):
         if last_one:
-            print("{0}: {1}\n".format(", ".join(keys), last_one))
+            print("\t{0}: {1}".format(",".join(keys), last_one))
+
+    def _expand_plurals(self, langs):
+        expanded_langs = set()
+        for lang_plural in self.all_langs:
+            lang, plural_key = self._parse_lang(lang_plural)
+            if lang in langs:
+                expanded_langs.add(lang_plural)
+        return expanded_langs

-    def _process_string(self, string):
-        if MANY_DOTS.search(string):
-            print("WARNING: 4 or more dots in the string: {0}".format(string))
-        return str.strip(string).replace("...", "…")
+    def _parse_lang(self, lang):
+        plural_key = None
+        sep_pos = lang.find(":")
+        if sep_pos > -1:
+            lang, plural_key = lang.split(":")
+        return lang, plural_key

     def _parse_lang_and_translation(self, line):
-        ret = tuple(map(self._process_string, line.split("=", 1)))
-        if len(ret) < 2:
-            print("ERROR: Couldn't parse the line: {0}".format(line))
-        assert len(ret) == 2
-        return ret
+        lang, trans = line.split("=", 1)
+        if self.MANY_DOTS.search(trans):
+            self._print_validation_issue(
+                "4 or more dots in the string: {0}".format(line), warning=True)
+        return (lang.strip(), trans.strip())

     def _populate_translations_by_langs(self):
         for lang in self.all_langs:
             trans_for_lang = {}
-            for key, tran in list(self.translations.items()):  # (tran = dict<lang, translation>)
+            for key, tran in self.translations.items():  # (tran = dict<lang, translation>)
                 if lang not in tran:
                     continue
                 trans_for_lang[key] = tran[lang]
             self.translations_by_language[lang] = trans_for_lang

     def _find_duplicates(self):
-        for lang, tran in list(self.translations_by_language.items()):
-            trans_for_lang = [TransAndKey(x[1], x[0]) for x in list(tran.items())]
+        for lang, tran in self.translations_by_language.items():
+            trans_for_lang = [self.TransAndKey(
+                x[1], x[0]) for x in tran.items()]
             trans_for_lang.sort(key=lambda x: x.translation)
-            prev_tran = TransAndKey("", "")
+            prev_tran = self.TransAndKey("", "")
             possible_duplicates = set()
             for curr_tran in trans_for_lang:
                 if curr_tran.translation == prev_tran.translation:
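_parse_lang() splits an optional plural suffix off a language key, and _expand_plurals() uses it to widen a plain language filter to the plural variants actually present in the file. A standalone sketch of that behaviour (module-level rewrite for illustration):

    def parse_lang(lang):
        # split "en:few" into ("en", "few"); plain "en" yields ("en", None)
        plural_key = None
        if lang.find(":") > -1:
            lang, plural_key = lang.split(":")
        return lang, plural_key

    all_langs = {"en", "en:one", "en:other", "ru", "ru:few", "de"}
    requested = {"en", "ru"}
    expanded = {l for l in all_langs if parse_lang(l)[0] in requested}
    print(sorted(expanded))  # ['en', 'en:one', 'en:other', 'ru', 'ru:few']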
@@ -157,35 +262,53 @@

     def _find_most_duplicated(self):
         most_duplicated = defaultdict(int)
-        for trans_and_keys in list(self.duplicates.values()):
+        for trans_and_keys in self.duplicates.values():
             for trans_and_key in trans_and_keys:
                 most_duplicated[trans_and_key.key] += 1

-        self.most_duplicated = sorted(list(most_duplicated.items()), key=lambda x: x[1], reverse=True)
+        self.most_duplicated = sorted(
+            most_duplicated.items(), key=lambda x: x[1], reverse=True)

     def print_most_duplicated(self):
-        print(self._header("Most duplicated"))
+        self._print_header("Most duplicated")
+        print("Definitions with the most translations shared with other definitions:\n")
         for pair in self.most_duplicated:
             print("{}\t{}".format(pair[0], pair[1]))

-    def print_missing_translations(self):
-        print(self._header("Missing translations for languages:"))
-        print(self.all_langs)
+    def print_missing_translations(self, langs=None):
+        self._print_header("Untranslated definitions")
+        if not langs:
+            langs = sorted(self.langs)
         all_translation_keys = set(self.translations.keys())
-        for lang in self.all_langs:
+        for lang in langs:
             keys_for_lang = set(self.translations_by_language[lang].keys())
-            missing_keys = sorted(list(all_translation_keys - keys_for_lang))
-            print("{0}:\n{1}\n".format(lang, "\n".join(missing_keys)))
+            missing_keys = all_translation_keys - keys_for_lang
+            for plural_key in self.PLURAL_KEYS:
+                lang_plural = "{0}:{1}".format(lang, plural_key)
+                if lang_plural in self.translations_by_language:
+                    missing_keys -= set(
+                        self.translations_by_language[lang_plural].keys())
+            missing_keys = sorted(missing_keys)
+            print("Language: {0} ({1} missing)\n\t{2}\n".format(
+                lang, len(missing_keys), "\n\t".join(missing_keys)))

-    def write_formatted(self, target_file=None, languages=None):
+    def write_formatted(self, target_file=None, langs=None):
         before_block = ""
+        langs = self._expand_plurals(langs) if langs else self.all_langs
+        en_langs = []
+        other_langs = []
+        for lang in langs:
+            if lang.startswith("en"):
+                en_langs.append(lang)
+            else:
+                other_langs.append(lang)
+        sorted_langs = sorted(en_langs) + sorted(other_langs)
+
         if target_file is None:
             target_file = self.strings_path
         with open(target_file, "w") as outfile:
             for key in self.keys_in_order:
+                # TODO: sort definitions and sections too?
                 if not key:
                     continue
                 if key in self.translations:
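The ordering added to write_formatted() puts the en* languages first and sorts each group alphabetically, which is also what the --output help text below promises. The effect in miniature (sample languages are made up):

    langs = {"ru", "en", "de", "en-GB", "ar"}
    en_langs = sorted(lang for lang in langs if lang.startswith("en"))
    other_langs = sorted(lang for lang in langs if not lang.startswith("en"))
    print(en_langs + other_langs)  # ['en', 'en-GB', 'ar', 'de', 'ru']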
@@ -200,30 +323,17 @@
                     before_block = "\n"

                 if key in self.comments_and_tags:
-                    for k, v in list(self.comments_and_tags[key].items()):
+                    for k, v in self.comments_and_tags[key].items():
                         outfile.write(" {0} = {1}\n".format(k, v))
-                en_langs = []
-                for lang in self.all_langs:
-                    if lang.startswith('en'):
-                        en_langs.append(lang)
-                sorted_langs = sorted(en_langs) + sorted(self.all_langs - set(en_langs))
-                self._write_translations_for_langs(sorted_langs, tran, outfile, only_langs=languages)
+                self._write_translations_for_langs(sorted_langs, tran, outfile)

-    def _write_translations_for_langs(self, langs, tran, outfile, only_langs=None):
-        langs_to_write = []
-
-        if only_langs:
-            for lang in only_langs:
-                if lang in langs:
-                    langs_to_write.append(lang)
-        else:
-            langs_to_write = langs
-
-        for lang in langs_to_write:
+    def _write_translations_for_langs(self, langs, tran, outfile):
+        for lang in langs:
             if lang in tran:
-                outfile.write(" {0} = {1}\n".format(lang, tran[lang]))
+                outfile.write(" {0} = {1}\n".format(
+                    lang, tran[lang].replace("...", "…")
+                ))

     def _compare_blocks(self, key_1, key_2):
         block_1 = self.translations[key_1]
@@ -236,104 +346,256 @@
             if block_1[key] == block_2[key]:
                 common_elements += 1

-        return [x for x in [
-            (self._similarity_string(key_1, key_2), self._similarity_index(len(block_1), common_elements)),
-            (self._similarity_string(key_2, key_1), self._similarity_index(len(block_2), common_elements))
-        ] if x[1] > SIMILARITY_THRESHOLD]
-
-    def _similarity_string(self, key_1, key_2):
-        return "{} -> {}".format(key_1, key_2)
-
-    def _similarity_index(self, total_number, number_from_other):
-        return 100.0 * number_from_other / total_number
+        sim_index = round(100 * 2 * common_elements /
+                          (len(block_1) + len(block_2)))
+        if sim_index >= self.SIMILARITY_THRESHOLD:
+            return [("{} <-> {}".format(key_1, key_2), sim_index)]
+        return []

     def _find_most_similar(self):
-        search_scope = [x for x in self.most_duplicated if x[1] > len(self.translations[x[0]]) / 10]
+        search_scope = [x for x in self.most_duplicated if x[1]
+                        > len(self.translations[x[0]]) / 10]
         for one, two in combinations(search_scope, 2):
-            self.similarity_indices.extend(self._compare_blocks(one[0], two[0]))
+            self.similarity_indices.extend(
+                self._compare_blocks(one[0], two[0]))

         self.similarity_indices.sort(key=lambda x: x[1], reverse=True)

     def print_most_similar(self):
-        print(self._header("Most similar blocks"))
+        self._print_header("Most similar definitions")
+        print("Definitions most similar to other definitions, i.e. with a lot of same translations:\n")
         for index in self.similarity_indices:
-            print("{} : {}".format(index[0], index[1]))
+            print("{} : {}%".format(index[0], index[1]))

+    def _print_header(self, string):
+        # print headers in green colour
+        print("\n{line} \033[0;32m{str}\033[0m {line}\n".format(
+            line="=" * round((70 - len(string)) / 2),
+            str=string
+        ))

-    def _header(self, string):
-        return "\n\n{line}\n{string}\n{line}\n".format(
-            line="=" * 80,
-            string=string
-        )
+    def _print_validation_issue(self, issue, warning=False):
+        if warning:
+            # print warnings in yellow colour
+            print("\033[0;33mWARNING: {0}\033[0m".format(issue))
+            return
+        self.validation_errors = True
+        # print errors in red colour
+        print("\033[0;31mERROR: {0}\033[0m".format(issue))

     def _has_space_before_punctuation(self, lang, string):
-        if lang == "fr":
+        if lang == "fr":  # make exception for French
             return False
-        if SPACE_PUNCTUATION.search(string):
+        if self.SPACE_PUNCTUATION.search(string):
             return True
         return False

-    def print_strings_with_spaces_before_punctuation(self):
-        print(self._header("Strings with spaces before punctuation:"))
-        for key, lang_and_trans in list(self.translations.items()):
+    def print_strings_with_spaces_before_punctuation(self, langs=None):
+        self._print_header("Strings with spaces before punctuation")
+        langs = self._expand_plurals(langs) if langs else self.all_langs
+        for key, lang_and_trans in self.translations.items():
             wrote_key = False
-            for lang, translation in list(lang_and_trans.items()):
-                if self._has_space_before_punctuation(lang, translation):
-                    if not wrote_key:
-                        print("\n{}".format(key))
-                        wrote_key = True
-                    print("{} : {}".format(lang, translation))
+            for lang, translation in lang_and_trans.items():
+                if lang in langs:
+                    if self._has_space_before_punctuation(lang, translation):
+                        if not wrote_key:
+                            print("\n{}".format(key))
+                            wrote_key = True
+                        self._print_validation_issue(
+                            "{0} : {1}".format(lang, translation), warning=True)

-    def _check_placeholders_in_block(self, block_key):
+    def _check_placeholders_in_block(self, block_key, langs):
         wrong_placeholders_strings = []
-        key = self.translations[block_key].get("en")
-        if not key:
-            print("No english for key: {}".format(block_key))
-            print("Existing keys are: {}".format(",".join(list(self.translations[block_key].keys()))))
-            raise KeyError
+        en_lang = "en"
+        en_trans = self.translations[block_key].get(en_lang)
+        if not en_trans:
+            for plural_key in sorted(self.PLURAL_KEYS):
+                if en_trans:
+                    break
+                en_lang = "en:{0}".format(plural_key)
+                en_trans = self.translations[block_key].get(en_lang)
+            if not en_trans:
+                self._print_validation_issue(
+                    "No English for definition: {}".format(block_key))
+                return None, wrong_placeholders_strings

-        en_placeholders = sorted(PLACEHOLDERS.findall(key))
+        en_placeholders = sorted(self.PLACEHOLDERS.findall(en_trans))

-        for lang, translation in list(self.translations[block_key].items()):
-            if lang == "en":
-                continue
-            found = sorted(PLACEHOLDERS.findall(translation))
-            if not en_placeholders == found: #must be sorted
-                wrong_placeholders_strings.append("{} : {}".format(lang, translation))
+        for lang, translation in self.translations[block_key].items():
+            found = sorted(self.PLACEHOLDERS.findall(translation))
+            if not en_placeholders == found:  # must be sorted
+                wrong_placeholders_strings.append(
+                    "{} = {}".format(lang, translation))

-        return wrong_placeholders_strings
+        return en_lang, wrong_placeholders_strings

-    def print_strings_with_wrong_paceholders(self):
-        print(self._header("Strings with a wrong number of placeholders:"))
-        for key, lang_and_trans in list(self.translations.items()):
-            wrong_placeholders = self._check_placeholders_in_block(key)
+    def print_strings_with_wrong_placeholders(self, langs=None):
+        self._print_header("Strings with a wrong number of placeholders")
+        langs = self._expand_plurals(langs) if langs else self.all_langs
+        for key, lang_and_trans in self.translations.items():
+            en_lang, wrong_placeholders = self._check_placeholders_in_block(
+                key, langs)
             if not wrong_placeholders:
                 continue

             print("\n{0}".format(key))
-            print("English: {0}".format(lang_and_trans["en"]))
-            for string in wrong_placeholders:
-                print(string)
+            print("{0} = {1}".format(en_lang, lang_and_trans[en_lang]))
+            for wp in wrong_placeholders:
+                self._print_validation_issue(wp)

+    def validate(self, langs=None):
+        self._print_header("Validating the file...")
+        if self.validation_errors:
+            self._print_validation_issue(
+                "There were errors reading the file, check the output above")
+        self._print_header("Invalid languages")
+        self.print_invalid_languages()
+        self.print_strings_with_spaces_before_punctuation(langs=langs)
+        self.print_strings_with_wrong_placeholders(langs=langs)
+        return not self.validation_errors
+
+
+def find_project_root():
+    my_path = abspath(__file__)
+    tools_index = my_path.rfind("/tools/python")
+    project_root = my_path[:tools_index]
+    return project_root
+
+
+def get_args():
+    parser = ArgumentParser(
+        description="""
+        Validates and formats translation files (strings.txt, types_strings.txt),
+        prints file's statistics, finds duplicate and missing translations, etc."""
+    )
+
+    parser.add_argument(
+        "input",
+        nargs="?", default=None,
+        help="input file path, defaults to <organicmaps>/data/strings/strings.txt"
+    )
+
+    parser.add_argument(
+        "-t", "--types-strings",
+        action="store_true",
+        help="use <organicmaps>/data/strings/types_strings.txt as input file by default"
+    )
+
+    parser.add_argument(
+        "-o", "--output",
+        default=None, nargs="?", const=True,
+        help="""path to write formatted output file to with languages
+        sorted in alphabetic order except English translations going first
+        (overwrites the input file by default)"""
+    )
+
+    parser.add_argument(
+        "-l", "--languages",
+        dest="langs", default=None,
+        help="a comma-separated list of languages to limit output to, if applicable"
+    )
+
+    parser.add_argument(
+        "-pl", "--print-languages",
+        dest="print_langs",
+        action="store_true",
+        help="print languages statistics"
+    )
+
+    parser.add_argument(
+        "-pf", "--print-definitions",
+        dest="print_defs",
+        action="store_true",
+        help="print definitions statistics"
+    )
+
+    parser.add_argument(
+        "-pd", "--print-duplicates",
+        dest="print_dups",
+        action="store_true",
+        help="print same translations used in several definitions"
+    )
+
+    parser.add_argument(
+        "-po", "--print-most-duplicated",
+        dest="print_mdups",
+        action="store_true",
+        help="""print definitions with the most translations shared
+        with other definitions"""
+    )
+
+    parser.add_argument(
+        "-ps", "--print-similar",
+        dest="print_similar",
+        action="store_true",
+        help="""print definitions most similar to other definitions,
+        i.e. with a lot of same translations"""
+    )
+
+    parser.add_argument(
+        "-pm", "--missing-translations",
+        dest="print_missing",
+        action="store_true",
+        help="print untranslated definitions"
+    )
+
+    parser.add_argument(
+        "-v", "--validate",
+        dest="validate",
+        action="store_true",
+        help="""validate file format, placeholders usage, whitespace
+        before punctuation, etc; exit with error if not valid"""
+    )
+
+    return parser.parse_args()


 if __name__ == "__main__":
-    import sys
-    strings = StringsTxt(sys.argv[1] if len(sys.argv) > 1 else None)
+    args = get_args()
+
+    if not args.input:
+        args.input = StringsTxt.TYPES_STRINGS_TXT_PATH if args.types_strings else StringsTxt.STRINGS_TXT_PATH
+        args.input = "{0}/{1}".format(find_project_root(), args.input)
+    args.input = abspath(args.input)
+    print("Input file: {0}\n".format(args.input))
+
+    strings = StringsTxt(args.input)
     strings.process_file()
-    strings.print_statistics()
-    strings.print_duplicates()
-    strings.print_most_duplicated()
-    strings.print_most_similar()
-    strings.print_missing_translations()
-    strings.write_formatted()
-    strings.print_strings_with_spaces_before_punctuation()
-    strings.print_strings_with_wrong_paceholders()
+
+    if args.langs:
+        args.langs = args.langs.split(",")
+        print("Limit output to languages:\n{0}\n".format(",".join(args.langs)))
+
+    if args.print_langs:
+        strings.print_languages_stats(langs=args.langs)
+
+    if args.print_defs:
+        strings.print_definitions_stats(langs=args.langs)
+
+    if args.print_dups:
+        strings.print_duplicates(langs=args.langs)
+
+    if args.print_mdups:
+        strings.print_most_duplicated()
+
+    if args.print_similar:
+        strings.print_most_similar()
+
+    if args.print_missing:
+        strings.print_missing_translations(langs=args.langs)
+
+    if args.validate:
+        if not strings.validate(langs=args.langs):
+            # print in red color
+            print("\n\033[0;31mThe file is not valid, terminating\033[0m")
+            sys.exit(1)
+
+    if args.output:
+        if args.output == True:
+            args.output = args.input
+        else:
+            args.output = abspath(args.output)
+        print("\nWriting formatted output file: {0}\n".format(args.output))
+        strings.write_formatted(target_file=args.output, langs=args.langs)
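The placeholder validation above compares the sorted list of placeholders in each translation against the English ones. A standalone sketch using the PLACEHOLDERS pattern from the diff (the sample strings are made up):

    import re

    PLACEHOLDERS = re.compile(r"(%\d*\$@|%[@dqus]|\^)")

    en = "Take exit %d onto %s"
    fr = "Prenez la sortie %d"                # missing %s -> flagged
    de = "Nehmen Sie die Ausfahrt %d auf %s"  # same placeholders -> passes

    en_found = sorted(PLACEHOLDERS.findall(en))
    for lang, trans in (("fr", fr), ("de", de)):
        ok = sorted(PLACEHOLDERS.findall(trans)) == en_found
        print(lang, "OK" if ok else "placeholder mismatch")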