From 3fac0435788352bd5cdc67764d9f17b59cbdf501 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Tue, 21 Jun 2016 18:48:36 +0300 Subject: [PATCH] [booking] Save all translations we've got --- base/base_tests/string_utils_test.cpp | 14 ++++++++++++++ base/string_utils.cpp | 13 +++++++++++++ base/string_utils.hpp | 3 +++ generator/booking_dataset.cpp | 21 +++++++++++---------- generator/booking_dataset.hpp | 8 ++------ tools/python/booking_hotels.py | 23 ++++++++++++----------- 6 files changed, 55 insertions(+), 27 deletions(-) diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index 1f1aa05bc2..49720cc45b 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -729,3 +729,17 @@ UNIT_TEST(NormalizeDigits_UniString) TEST_EQUAL(nd("a0192 "), "a0192 ", ()); TEST_EQUAL(nd("3456789"), "3456789", ()); } + +UNIT_TEST(Split) +{ + vector target; + strings::Split(";Test\\;проверка;0;", ';', target); + vector expected({"", "Test\\", "проверка", "0", ""}); + TEST_EQUAL(target, expected, ()); + strings::Split("and there was none", ' ', target); + vector expected2({"and", "there", "", "was", "none"}); + TEST_EQUAL(target, expected2, ()); + strings::Split("", '!', target); + vector expected3; + TEST_EQUAL(target, expected3, ()); +} diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 7b6b808995..2a8b029f37 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -328,4 +328,17 @@ bool AlmostEqual(string const & str1, string const & str2, size_t mismatchedCoun return false; } +void Split(string const & s, char delimiter, vector & target) +{ + target.clear(); + + // Special case: if the string is empty, return an empty array instead of {""}. + if (s.empty()) + return; + + using It = TokenizeIterator; + for (It it(s, SimpleDelimiter(delimiter)); it; ++it) + target.push_back(*it); +} + } // namespace strings diff --git a/base/string_utils.hpp b/base/string_utils.hpp index d557706d85..e42c8188e8 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -306,6 +306,9 @@ void Tokenize(string const & str, char const * delims, TFunctor && f) } } +/// Splits a string by the delimiter, keeps empty parts, on an empty string returns an empty vector. +void Split(string const & s, char delimiter, vector & target); + /// @return code of last symbol in string or 0 if s is empty UniChar LastUniChar(string const & s); diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 5d8ed73caf..2e619453b5 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -51,12 +51,7 @@ BookingDataset::Hotel::Hotel(string const & src) strings::to_uint(rec[Index(Fields::Type)], type); - langCode = rec[Index(Fields::Language)]; - if (!langCode.empty()) - { - nameLoc = rec[Index(Fields::NameLoc)]; - addressLoc = rec[Index(Fields::AddressLoc)]; - } + translations = rec[Index(Fields::Translations)]; } ostream & operator<<(ostream & s, BookingDataset::Hotel const & h) @@ -178,10 +173,15 @@ void BookingDataset::BuildFeatures(function const & fn) cons e.AddTag("price_rate", strings::to_string(hotel.priceCategory)); e.AddTag("addr:full", hotel.address); - if (!hotel.langCode.empty()) + if (!hotel.translations.empty()) { - e.AddTag("name:" + hotel.langCode, hotel.nameLoc); - e.AddTag("addr:full:" + hotel.langCode, hotel.addressLoc); + vector parts; + strings::Split(hotel.translations, '|', parts); + for (auto i = 0; i < parts.size(); i += 3) + { + e.AddTag("name:" + parts[i], parts[i + 1]); + e.AddTag("addr:full:" + parts[i], parts[i + 2]); + } } switch (hotel.type) @@ -278,7 +278,8 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const return false; // Find |kMaxSelectedElements| nearest values to a point. - auto const bookingIndexes = GetNearestHotels(e.lat, e.lon, kMaxSelectedElements, kDistanceLimitInMeters); + auto const bookingIndexes = + GetNearestHotels(e.lat, e.lon, kMaxSelectedElements, kDistanceLimitInMeters); bool matched = false; diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index 37f0a7a349..cac294db30 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -36,9 +36,7 @@ public: RatingUsers = 8, DescUrl = 9, Type = 10, - Language = 11, - NameLoc = 12, - AddressLoc = 13, + Translations = 11, Counter }; @@ -54,9 +52,7 @@ public: double ratingUser = 0.0; string descUrl; uint32_t type = 0; - string langCode; - string nameLoc; - string addressLoc; + string translations; static constexpr size_t Index(Fields field) { return static_cast(field); } static constexpr size_t FieldsCount() { return static_cast(Fields::Counter); } diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index 7e4a5cc9fc..f684505036 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -17,7 +17,7 @@ import urllib2 logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') # Names starting with '.' are calculated in get_hotel_field() below. -HOTEL_FIELDS = ('hotel_id', '.lat', '.lon', 'name', 'address', 'class', '.rate', 'ranking', 'review_score', 'url', 'hoteltype_id') +HOTEL_FIELDS = ('hotel_id', '.lat', '.lon', 'name', 'address', 'class', '.rate', 'ranking', 'review_score', 'url', 'hoteltype_id', '.trans') class BookingApi: @@ -160,6 +160,16 @@ def translate(source, output): return hotel['location']['longitude'] elif field == '.rate': return rate + elif field == '.trans': + # Translations are packed into a single column: lang1|name1|address1|lang2|name2|address2|... + if 'translations' in hotel: + tr_list = [] + for tr_lang, tr_values in hotel['translations'].items(): + tr_list.append(tr_lang) + tr_list.extend([tr_values[e] for e in ('name', 'address')]) + return '|'.join([s.replace('|', ';') for s in tr_list]) + else: + return '' elif field in hotel: return hotel[field] raise ValueError('Unknown hotel field: {0}'.format(field)) @@ -175,16 +185,7 @@ def translate(source, output): while rate <= len(rates) and price > avg * rates[rate - 1]: rate += 1 l = [get_hotel_field(hotel, e, rate) for e in HOTEL_FIELDS] - # Add translations for hotel name and address if present. - if 'translations' in hotel: - tr_lang = hotel['languagecode'] - if tr_lang not in hotel['translations']: - tr_lang = hotel['translations'].keys()[0] - l.append(tr_lang) - l.extend([hotel['translations'][tr_lang][e] for e in ('name', 'address')]) - else: - l.extend([''] * 3) - print('\t'.join([unicode(f).encode('utf8').replace('\t', ' ') for f in l]), file=fd) + print('\t'.join([unicode(f).encode('utf8').replace('\t', ' ').replace('\n', ' ').replace('\r', '') for f in l]), file=fd) def process_options():