diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 9444e7c3be..5d8ed73caf 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -50,6 +50,13 @@ BookingDataset::Hotel::Hotel(string const & src) descUrl = rec[Index(Fields::DescUrl)]; strings::to_uint(rec[Index(Fields::Type)], type); + + langCode = rec[Index(Fields::Language)]; + if (!langCode.empty()) + { + nameLoc = rec[Index(Fields::NameLoc)]; + addressLoc = rec[Index(Fields::AddressLoc)]; + } } ostream & operator<<(ostream & s, BookingDataset::Hotel const & h) @@ -171,6 +178,12 @@ void BookingDataset::BuildFeatures(function const & fn) cons e.AddTag("price_rate", strings::to_string(hotel.priceCategory)); e.AddTag("addr:full", hotel.address); + if (!hotel.langCode.empty()) + { + e.AddTag("name:" + hotel.langCode, hotel.nameLoc); + e.AddTag("addr:full:" + hotel.langCode, hotel.addressLoc); + } + switch (hotel.type) { case 19: diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index f8db7f6e09..37f0a7a349 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -36,6 +36,9 @@ public: RatingUsers = 8, DescUrl = 9, Type = 10, + Language = 11, + NameLoc = 12, + AddressLoc = 13, Counter }; @@ -51,6 +54,9 @@ public: double ratingUser = 0.0; string descUrl; uint32_t type = 0; + string langCode; + string nameLoc; + string addressLoc; static constexpr size_t Index(Fields field) { return static_cast(field); } static constexpr size_t FieldsCount() { return static_cast(Fields::Counter); } diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index ca63585e6e..7e4a5cc9fc 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -81,26 +81,45 @@ def download(user, password, path): countrycode = country['countrycode'] logging.info(u'Download[{0}]: {1}'.format(countrycode, country['name'])) - allhotels = [] + allhotels = {} while True: hotels = api.call('getHotels', dict(new_hotel_type=1, offset=len(allhotels), rows=maxrows, countrycodes=countrycode)) # Check for error. - if not hotels: + if hotels is None: exit(1) - allhotels.extend(hotels) + for h in hotels: + allhotels[h['hotel_id']] = h # If hotels in answer less then maxrows, we reach end of data. if len(hotels) < maxrows: break - logging.info('Num of hotels: {0}'.format(len(allhotels))) + # Now the same for hotel translations + offset = 0 + while True: + hotels = api.call('getHotelTranslations', dict(offset=offset, rows=maxrows, countrycodes=countrycode)) + if hotels is None: + exit(1) + + # Add translations for each hotel + for h in hotels: + if h['hotel_id'] in allhotels: + if 'translations' not in allhotels[h['hotel_id']]: + allhotels[h['hotel_id']]['translations'] = {} + allhotels[h['hotel_id']]['translations'][h['languagecode']] = {'name': h['name'], 'address': h['address']} + + offset += len(hotels) + if len(hotels) < maxrows: + break + + logging.info('Num of hotels: {0}, translations: {1}'.format(len(allhotels), offset)) filename = os.path.join(path, '{0} - {1}.pkl'.format(country['area'].encode('utf8'), country['name'].encode('utf8'))) with open(filename, 'wb') as fd: - pickle.dump(allhotels, fd, pickle.HIGHEST_PROTOCOL) + pickle.dump(allhotels.values(), fd, pickle.HIGHEST_PROTOCOL) def translate(source, output): @@ -110,7 +129,7 @@ def translate(source, output): files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')] data = [] - for filename in files: + for filename in sorted(files): logging.info('Processing {0}'.format(filename)) with open(filename, 'rb') as fd: data += pickle.load(fd) @@ -155,8 +174,17 @@ def translate(source, output): # Find a range that contains the price while rate <= len(rates) and price > avg * rates[rate - 1]: rate += 1 - l = [unicode(get_hotel_field(hotel, e, rate)).encode('utf8').replace('\t', ' ') for e in HOTEL_FIELDS] - print('\t'.join(l), file=fd) + l = [get_hotel_field(hotel, e, rate) for e in HOTEL_FIELDS] + # Add translations for hotel name and address if present. + if 'translations' in hotel: + tr_lang = hotel['languagecode'] + if tr_lang not in hotel['translations']: + tr_lang = hotel['translations'].keys()[0] + l.append(tr_lang) + l.extend([hotel['translations'][tr_lang][e] for e in ('name', 'address')]) + else: + l.extend([''] * 3) + print('\t'.join([unicode(f).encode('utf8').replace('\t', ' ') for f in l]), file=fd) def process_options():