From e08a9d988ea63fd625bdcdaac646bfac41060c43 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Fri, 24 May 2019 15:19:15 +0300 Subject: [PATCH] [storage] Add optional country_name_synonyms parameter to countries.txt to find corresponding CountryId by country name without countries.txt keys modification. --- search/region_info_getter.cpp | 3 +- .../features_collector_tool.cpp | 3 +- search/search_quality/helpers.cpp | 6 ++- search/search_quality/helpers.hpp | 3 +- .../search_quality_tool.cpp | 3 +- storage/country_tree.cpp | 45 ++++++++++++++++--- storage/country_tree.hpp | 8 +++- storage/storage.cpp | 8 ++-- storage/storage.hpp | 3 ++ storage/storage_defines.hpp | 4 +- tools/python/maps_generator/generator/env.py | 4 ++ tools/python/maps_generator/maps_generator.py | 1 + tools/python/post_generation/__main__.py | 3 ++ .../post_generation/hierarchy_to_countries.py | 20 ++++++++- tools/unix/generate_planet.sh | 2 +- 15 files changed, 94 insertions(+), 22 deletions(-) diff --git a/search/region_info_getter.cpp b/search/region_info_getter.cpp index e5bfb8da0f..f28e06181e 100644 --- a/search/region_info_getter.cpp +++ b/search/region_info_getter.cpp @@ -43,7 +43,8 @@ void GetPathToRoot(storage::CountryId const & id, storage::CountryTree const & c void RegionInfoGetter::LoadCountriesTree() { storage::Affiliations affiliations; - storage::LoadCountriesFromFile(COUNTRIES_FILE, m_countries, affiliations); + storage::CountryNameSynonyms countryNameSynonyms; + storage::LoadCountriesFromFile(COUNTRIES_FILE, m_countries, affiliations, countryNameSynonyms); } void RegionInfoGetter::SetLocale(string const & locale) diff --git a/search/search_quality/features_collector_tool/features_collector_tool.cpp b/search/search_quality/features_collector_tool/features_collector_tool.cpp index 772ccc6a6c..5c1d815214 100644 --- a/search/search_quality/features_collector_tool/features_collector_tool.cpp +++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp @@ -108,7 
+108,8 @@ int main(int argc, char * argv[]) InitDataSource(dataSource, "" /* mwmListPath */); storage::Affiliations affiliations; - InitAffiliations(affiliations); + storage::CountryNameSynonyms countryNameSynonyms; + InitStorageData(affiliations, countryNameSynonyms); auto engine = InitSearchEngine(dataSource, affiliations, "en" /* locale */, 1 /* numThreads */); diff --git a/search/search_quality/helpers.cpp b/search/search_quality/helpers.cpp index bb3ba66a2a..8289038e74 100644 --- a/search/search_quality/helpers.cpp +++ b/search/search_quality/helpers.cpp @@ -175,12 +175,14 @@ void InitDataSource(FrozenDataSource & dataSource, string const & mwmListPath) LOG(LINFO, ()); } -void InitAffiliations(storage::Affiliations & affiliations) +void InitStorageData(storage::Affiliations & affiliations, + storage::CountryNameSynonyms & countryNameSynonyms) { auto const countriesFile = base::JoinPath(GetPlatform().ResourcesDir(), COUNTRIES_FILE); storage::CountryTree countries; - auto const rv = storage::LoadCountriesFromFile(countriesFile, countries, affiliations); + auto const rv = + storage::LoadCountriesFromFile(countriesFile, countries, affiliations, countryNameSynonyms); CHECK(rv != -1, ("Can't load countries from:", countriesFile)); } diff --git a/search/search_quality/helpers.hpp b/search/search_quality/helpers.hpp index 074c268b9f..b226f00643 100644 --- a/search/search_quality/helpers.hpp +++ b/search/search_quality/helpers.hpp @@ -34,7 +34,8 @@ void InitViewport(std::string viewportName, m2::RectD & viewport); void InitDataSource(FrozenDataSource & dataSource, std::string const & mwmListPath); -void InitAffiliations(storage::Affiliations & affiliations); +void InitStorageData(storage::Affiliations & affiliations, + storage::CountryNameSynonyms & countryNameSynonyms); std::unique_ptr InitSearchEngine( DataSource & dataSource, storage::Affiliations const & affiliations, std::string const & locale, diff --git 
a/search/search_quality/search_quality_tool/search_quality_tool.cpp b/search/search_quality/search_quality_tool/search_quality_tool.cpp index 6282ed1d7b..b2f267b93c 100644 --- a/search/search_quality/search_quality_tool/search_quality_tool.cpp +++ b/search/search_quality/search_quality_tool/search_quality_tool.cpp @@ -380,7 +380,8 @@ int main(int argc, char * argv[]) InitDataSource(dataSource, FLAGS_mwm_list_path); storage::Affiliations affiliations; - InitAffiliations(affiliations); + storage::CountryNameSynonyms countryNameSynonyms; + InitStorageData(affiliations, countryNameSynonyms); auto engine = InitSearchEngine(dataSource, affiliations, FLAGS_locale, FLAGS_num_threads); diff --git a/storage/country_tree.cpp b/storage/country_tree.cpp index 6ca3d728fb..bf6b47a207 100644 --- a/storage/country_tree.cpp +++ b/storage/country_tree.cpp @@ -33,6 +33,7 @@ public: CountryId const & parent) = 0; virtual void InsertOldMwmMapping(CountryId const & newId, CountryId const & oldId) = 0; virtual void InsertAffiliation(CountryId const & countryId, string const & affilation) = 0; + virtual void InsertCountryNameSynonym(CountryId const & countryId, string const & synonym) = 0; virtual OldMwmMapping GetMapping() const = 0; }; @@ -40,11 +41,15 @@ class StoreCountriesSingleMwms : public StoreSingleMwmInterface { CountryTree & m_countries; Affiliations & m_affiliations; + CountryNameSynonyms & m_countryNameSynonyms; OldMwmMapping m_idsMapping; public: - StoreCountriesSingleMwms(CountryTree & countries, Affiliations & affiliations) - : m_countries(countries), m_affiliations(affiliations) + StoreCountriesSingleMwms(CountryTree & countries, Affiliations & affiliations, + CountryNameSynonyms & countryNameSynonyms) + : m_countries(countries) + , m_affiliations(affiliations) + , m_countryNameSynonyms(countryNameSynonyms) { } ~StoreCountriesSingleMwms() @@ -81,6 +86,17 @@ public: m_affiliations[affilation].push_back(countryId); } + void InsertCountryNameSynonym(CountryId const & 
countryId, string const & synonym) override + { + ASSERT(!synonym.empty(), ()); + ASSERT(!countryId.empty(), ()); + ASSERT(m_countryNameSynonyms.find(synonym) == m_countryNameSynonyms.end(), + ("Synonym must identify CountryTree node where the country is located. Country cannot be " + "located at multiple nodes.")); + + m_countryNameSynonyms[synonym] = countryId; + } + OldMwmMapping GetMapping() const override { return m_idsMapping; } }; @@ -104,10 +120,17 @@ public: } void InsertOldMwmMapping(CountryId const & /* newId */, CountryId const & /* oldId */) override {} + void InsertAffiliation(CountryId const & /* countryId */, string const & /* affilation */) override { } + + void InsertCountryNameSynonym(CountryId const & /* countryId */, + string const & /* synonym */) override + { + } + OldMwmMapping GetMapping() const override { ASSERT(false, ()); @@ -122,6 +145,11 @@ TMwmSubtreeAttrs LoadGroupSingleMwmsImpl(size_t depth, json_t * node, CountryId CountryId id; FromJSONObject(node, "id", id); + vector<string> countryNameSynonyms; + FromJSONObjectOptionalField(node, "country_name_synonyms", countryNameSynonyms); + for (auto const & synonym : countryNameSynonyms) + store.InsertCountryNameSynonym(id, synonym); + // Mapping two component (big) mwms to one componenst (small) ones. 
vector oldIds; FromJSONObjectOptionalField(node, "old", oldIds); @@ -199,7 +227,8 @@ class StoreCountriesTwoComponentMwms : public StoreTwoComponentMwmInterface CountryTree & m_countries; public: - StoreCountriesTwoComponentMwms(CountryTree & countries, Affiliations & /* affiliations */) + StoreCountriesTwoComponentMwms(CountryTree & countries, Affiliations & /* affiliations */, + CountryNameSynonyms & /* countryNameSynonyms */) : m_countries(countries) { } @@ -308,6 +337,7 @@ bool LoadCountriesTwoComponentMwmsImpl(string const & jsonBuffer, int64_t LoadCountriesFromBuffer(string const & jsonBuffer, CountryTree & countries, Affiliations & affiliations, + CountryNameSynonyms & countryNameSynonyms, OldMwmMapping * mapping /* = nullptr */) { countries.Clear(); @@ -321,7 +351,7 @@ int64_t LoadCountriesFromBuffer(string const & jsonBuffer, CountryTree & countri if (version::IsSingleMwm(version)) { - StoreCountriesSingleMwms store(countries, affiliations); + StoreCountriesSingleMwms store(countries, affiliations, countryNameSynonyms); if (!LoadCountriesSingleMwmsImpl(jsonBuffer, store)) return -1; if (mapping) @@ -329,7 +359,7 @@ int64_t LoadCountriesFromBuffer(string const & jsonBuffer, CountryTree & countri } else { - StoreCountriesTwoComponentMwms store(countries, affiliations); + StoreCountriesTwoComponentMwms store(countries, affiliations, countryNameSynonyms); if (!LoadCountriesTwoComponentMwmsImpl(jsonBuffer, store)) return -1; } @@ -342,11 +372,12 @@ int64_t LoadCountriesFromBuffer(string const & jsonBuffer, CountryTree & countri } int64_t LoadCountriesFromFile(string const & path, CountryTree & countries, - Affiliations & affiliations, OldMwmMapping * mapping) + Affiliations & affiliations, + CountryNameSynonyms & countryNameSynonyms, OldMwmMapping * mapping) { string json; ReaderPtr(GetPlatform().GetReader(path)).ReadAsString(json); - return LoadCountriesFromBuffer(json, countries, affiliations, mapping); + return LoadCountriesFromBuffer(json, countries, 
affiliations, countryNameSynonyms, mapping); } void LoadCountryFile2CountryInfo(string const & jsonBuffer, map & id2info, diff --git a/storage/country_tree.hpp b/storage/country_tree.hpp index 6d266d298b..18c2ac1554 100644 --- a/storage/country_tree.hpp +++ b/storage/country_tree.hpp @@ -254,9 +254,13 @@ private: /// @return version of country file or -1 if error was encountered int64_t LoadCountriesFromBuffer(std::string const & buffer, CountryTree & countries, - Affiliations & affiliations, OldMwmMapping * mapping = nullptr); + Affiliations & affiliations, + CountryNameSynonyms & countryNameSynonyms, + OldMwmMapping * mapping = nullptr); int64_t LoadCountriesFromFile(std::string const & path, CountryTree & countries, - Affiliations & affiliations, OldMwmMapping * mapping = nullptr); + Affiliations & affiliations, + CountryNameSynonyms & countryNameSynonyms, + OldMwmMapping * mapping = nullptr); void LoadCountryFile2CountryInfo(std::string const & jsonBuffer, std::map & id2info, bool & isSingleMwm); diff --git a/storage/storage.cpp b/storage/storage.cpp index 6b3ddcd35d..f830a01262 100644 --- a/storage/storage.cpp +++ b/storage/storage.cpp @@ -141,8 +141,8 @@ Storage::Storage(string const & referenceCountriesTxtJsonForTesting, , m_downloadMapOnTheMap(nullptr) , m_maxMwmSizeBytes(0) { - m_currentVersion = - LoadCountriesFromBuffer(referenceCountriesTxtJsonForTesting, m_countries, m_affiliations); + m_currentVersion = LoadCountriesFromBuffer(referenceCountriesTxtJsonForTesting, m_countries, + m_affiliations, m_countryNameSynonyms); CHECK_LESS_OR_EQUAL(0, m_currentVersion, ("Can't load test countries file")); CalcMaxMwmSizeBytes(); } @@ -762,8 +762,8 @@ void Storage::LoadCountriesFile(string const & pathToCountriesFile, string const if (m_countries.IsEmpty()) { - m_currentVersion = - LoadCountriesFromFile(pathToCountriesFile, m_countries, m_affiliations, mapping); + m_currentVersion = LoadCountriesFromFile(pathToCountriesFile, m_countries, m_affiliations, + 
m_countryNameSynonyms, mapping); LOG_SHORT(LINFO, ("Loaded countries list for version:", m_currentVersion)); if (m_currentVersion < 0) LOG(LERROR, ("Can't load countries file", pathToCountriesFile)); diff --git a/storage/storage.hpp b/storage/storage.hpp index 1ac3322ba0..dd36b1a2a0 100644 --- a/storage/storage.hpp +++ b/storage/storage.hpp @@ -263,6 +263,7 @@ private: // Once filled |m_affiliations| is not changed. // Note. |m_affiliations| is empty in case of countries_obsolete.txt. Affiliations m_affiliations; + CountryNameSynonyms m_countryNameSynonyms; MwmSize m_maxMwmSizeBytes; @@ -469,6 +470,8 @@ public: Affiliations const & GetAffiliations() const { return m_affiliations; } + CountryNameSynonyms const & GetCountryNameSynonyms() const { return m_countryNameSynonyms; } + /// \brief Calls |toDo| for each node for subtree with |root|. /// For example ForEachInSubtree(GetRootId()) calls |toDo| for every node including /// the result of GetRootId() call. diff --git a/storage/storage_defines.hpp b/storage/storage_defines.hpp index c720dda172..4d4cefd36a 100644 --- a/storage/storage_defines.hpp +++ b/storage/storage_defines.hpp @@ -19,8 +19,10 @@ using CountriesSet = std::set; using CountriesVec = std::vector; using LocalFilePtr = std::shared_ptr; using OldMwmMapping = std::map; -/// Map from key affiliation words into MWM IDs (file names). +/// Map from key affiliation words into CountryIds. using Affiliations = std::unordered_map>; +/// Map from country name synonyms and old names into CountryId. 
+using CountryNameSynonyms = std::unordered_map<std::string, CountryId>; extern const storage::CountryId kInvalidCountryId; diff --git a/tools/python/maps_generator/generator/env.py b/tools/python/maps_generator/generator/env.py index 7459293c3d..a087a783eb 100644 --- a/tools/python/maps_generator/generator/env.py +++ b/tools/python/maps_generator/generator/env.py @@ -208,6 +208,10 @@ class Env: def borders_to_osm_path(self): return os.path.join(self.user_resource_path, "borders_vs_osm.csv") + @property + def countries_synonyms_path(self): + return os.path.join(self.user_resource_path, "countries_synonyms.csv") + @property def counties_txt_path(self): return os.path.join(self.mwm_path, "countries.txt") diff --git a/tools/python/maps_generator/maps_generator.py b/tools/python/maps_generator/maps_generator.py index 5e4ef8c215..4018a56863 100644 --- a/tools/python/maps_generator/maps_generator.py +++ b/tools/python/maps_generator/maps_generator.py @@ -223,6 +223,7 @@ def stage_descriptions(env): def stage_countries_txt(env): countries = hierarchy_to_countries(env.old_to_new_path, env.borders_to_osm_path, + env.countries_synonyms_path, env.hierarchy_path, env.mwm_path, env.mwm_version) with open(env.counties_txt_path, "w") as f: diff --git a/tools/python/post_generation/__main__.py b/tools/python/post_generation/__main__.py index c004c7deb9..deb526186c 100644 --- a/tools/python/post_generation/__main__.py +++ b/tools/python/post_generation/__main__.py @@ -66,12 +66,15 @@ The post_generation commands are: help="old_vs_new.csv file") parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file") + parser.add_argument("--countries_synonyms", required=True, + help="countries_synonyms.csv file") parser.add_argument("--mwm_version", type=int, required=True, help="Mwm version") parser.add_argument("-o", "--output", required=True, help="Output countries.txt file (default is stdout)") args = parser.parse_args(sys.argv[2:]) countries_json = hierarchy_to_countries_(args.old, args.osm, +
args.countries_synonyms, args.hierarchy, args.target, args.mwm_version) diff --git a/tools/python/post_generation/hierarchy_to_countries.py b/tools/python/post_generation/hierarchy_to_countries.py index e63f33cca2..b2c518e758 100755 --- a/tools/python/post_generation/hierarchy_to_countries.py +++ b/tools/python/post_generation/hierarchy_to_countries.py @@ -109,9 +109,24 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path): vsosm[m.group(1)] = [m.group(3)] return vsosm +def parse_countries_synonyms(countries_synonyms_csv_path): + countries_synonyms = {} + if not countries_synonyms_csv_path: + return countries_synonyms + + with open(countries_synonyms_csv_path) as f: + for line in f: + m = re.match(r"(.+)\t(.+)", line.strip()) + assert m + if m.group(1) in countries_synonyms: + countries_synonyms[m.group(1)].append(m.group(2)) + else: + countries_synonyms[m.group(1)] = [m.group(2)] + return countries_synonyms def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path, - hierarchy_path, target_path, version): + countries_synonyms_csv_path, hierarchy_path, + target_path, version): def fill_last(last, stack): name = last["id"] @@ -124,6 +139,7 @@ def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path, oldvs = parse_old_vs_new(old_vs_new_csv_path) vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path) + countries_synonyms = parse_countries_synonyms(countries_synonyms_csv_path) stack = [CountryDict(v=version, nameattr="Countries", g=[])] last = None with open(hierarchy_path) as f: @@ -151,6 +167,8 @@ def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path, last["old"] = oldvs[items[0]] if items[0] in vsosm: last["affiliations"] = vsosm[items[0]] + if items[0] in countries_synonyms: + last["country_name_synonyms"] = countries_synonyms[items[0]] # the last line is always a file del last["d"] diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh index 1f5e867bf7..0f642deac9 100755 --- 
a/tools/unix/generate_planet.sh +++ b/tools/unix/generate_planet.sh @@ -636,7 +636,7 @@ if [ "$MODE" == "resources" ]; then putmode "Step 8: Updating resource lists" # Update countries list $PYTHON36 -m $POST_GENERATION_MODULE hierarchy_to_countries --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --mwm_version "$COUNTRIES_VERSION" \ - --old "$DATA_PATH/old_vs_new.csv" --osm "$DATA_PATH/borders_vs_osm.csv" --output "$TARGET/countries.txt" >> "$PLANET_LOG" 2>&1 + --old "$DATA_PATH/old_vs_new.csv" --osm "$DATA_PATH/borders_vs_osm.csv" --countries_synonyms "$DATA_PATH/countries_synonyms.csv" --output "$TARGET/countries.txt" >> "$PLANET_LOG" 2>&1 # A quick fix: chmodding to a+rw all generated files for file in "$TARGET"/*.mwm*; do