diff --git a/defines.hpp b/defines.hpp index 430fd98afc..68e0e5a7c9 100644 --- a/defines.hpp +++ b/defines.hpp @@ -26,6 +26,7 @@ #define COMPRESSED_SEARCH_INDEX_FILE_TAG "csdx" #define FEATURE_OFFSETS_FILE_TAG "offs" #define RANKS_FILE_TAG "ranks" +#define SEARCH_TOKENS_FILE_TAG "stokens" #define ROUTING_MATRIX_FILE_TAG "mercedes" #define ROUTING_EDGEDATA_FILE_TAG "daewoo" diff --git a/generator/feature_builder.hpp b/generator/feature_builder.hpp index 4d1d86b2cb..43716303e2 100644 --- a/generator/feature_builder.hpp +++ b/generator/feature_builder.hpp @@ -242,6 +242,8 @@ public: bool PreSerialize(SupportingData const & data); void Serialize(SupportingData & data, serial::CodingParams const & params); //@} + + feature::AddressData const & GetAddressData() const { return m_params.GetAddressData(); } }; namespace feature diff --git a/generator/feature_sorter.cpp b/generator/feature_sorter.cpp index 2ff3f7f039..72b6dc181b 100644 --- a/generator/feature_sorter.cpp +++ b/generator/feature_sorter.cpp @@ -90,14 +90,15 @@ namespace feature vector m_geoFile, m_trgFile; - unique_ptr m_MetadataWriter; + unique_ptr m_metadataWriter; + unique_ptr m_searchTokensWriter; struct TMetadataIndexEntry { uint32_t key; uint32_t value; }; - vector m_MetadataIndex; + vector m_metadataIndex; DataHeader m_header; uint32_t m_versionDate; @@ -108,7 +109,8 @@ namespace feature FeaturesCollector2(string const & fName, DataHeader const & header, uint32_t versionDate) : FeaturesCollector(fName + DATA_FILE_TAG), m_writer(fName), m_header(header), m_versionDate(versionDate) { - m_MetadataWriter.reset(new FileWriter(fName + METADATA_FILE_TAG)); + m_metadataWriter.reset(new FileWriter(fName + METADATA_FILE_TAG)); + m_searchTokensWriter.reset(new FileWriter(fName + SEARCH_TOKENS_FILE_TAG)); for (size_t i = 0; i < m_header.GetScalesCount(); ++i) { @@ -162,19 +164,23 @@ namespace feature { FileWriter w = m_writer.GetWriter(METADATA_INDEX_FILE_TAG); - for (auto const & v : m_MetadataIndex) + for (auto 
const & v : m_metadataIndex) { WriteToSink(w, v.key); WriteToSink(w, v.value); } } - m_MetadataWriter->Flush(); - m_writer.Write(m_MetadataWriter->GetName(), METADATA_FILE_TAG); + m_metadataWriter->Flush(); + m_writer.Write(m_metadataWriter->GetName(), METADATA_FILE_TAG); + + m_searchTokensWriter->Flush(); + m_writer.Write(m_searchTokensWriter->GetName(), SEARCH_TOKENS_FILE_TAG); m_writer.Finish(); - FileWriter::DeleteFileX(m_MetadataWriter->GetName()); + FileWriter::DeleteFileX(m_metadataWriter->GetName()); + FileWriter::DeleteFileX(m_searchTokensWriter->GetName()); if (m_header.GetType() == DataHeader::country) { @@ -506,12 +512,14 @@ namespace feature uint32_t const ftID = WriteFeatureBase(holder.m_buffer.m_buffer, fb); + fb.GetAddressData().Serialize(*m_searchTokensWriter); + if (!fb.GetMetadata().Empty()) { - uint64_t offset = m_MetadataWriter->Pos(); + uint64_t offset = m_metadataWriter->Pos(); ASSERT_LESS_OR_EQUAL(offset, numeric_limits::max(), ()); - m_MetadataIndex.push_back({ ftID, static_cast(offset) }); - fb.GetMetadata().SerializeToMWM(*m_MetadataWriter); + m_metadataIndex.push_back({ ftID, static_cast(offset) }); + fb.GetMetadata().SerializeToMWM(*m_metadataWriter); } uint64_t const osmID = fb.GetWayIDForRouting(); diff --git a/generator/generator_tests/feature_builder_test.cpp b/generator/generator_tests/feature_builder_test.cpp index 01fac43596..6192a11e2d 100644 --- a/generator/generator_tests/feature_builder_test.cpp +++ b/generator/generator_tests/feature_builder_test.cpp @@ -149,58 +149,19 @@ UNIT_TEST(FBuilder_RemoveUselessNames) TEST(fb1.CheckValid(), ()); } -UNIT_TEST(FBuilder_WithoutName) +UNIT_TEST(FeatureParams_Parsing) { classificator::Load(); - char const * arr1[][1] = { { "amenity" } }; { FeatureParams params; - AddTypes(params, arr1); - params.AddName("default", "Name"); - - FeatureBuilder1 fb; - fb.SetParams(params); - fb.SetCenter(m2::PointD(0, 0)); - - TEST(fb.PreSerialize(), ()); - TEST(fb.RemoveInvalidTypes(), ()); + 
params.AddStreet("Embarcadero street \t\t 85"); + TEST_EQUAL(params.GetStreet(), "Embarcadero street", ()); } { FeatureParams params; - AddTypes(params, arr1); - - FeatureBuilder1 fb; - fb.SetParams(params); - fb.SetCenter(m2::PointD(0, 0)); - - TEST(fb.PreSerialize(), ()); - TEST(!fb.RemoveInvalidTypes(), ()); + params.AddAddress("165 \t\t Dolliver Street"); + TEST_EQUAL(params.GetStreet(), "Dolliver Street", ()); } } - -UNIT_TEST(FBuilder_PointAddress) -{ - classificator::Load(); - - char const * arr[][2] = { { "addr:housenumber", "39/79" } }; - - OsmElement e; - FillXmlElement(arr, ARRAY_SIZE(arr), &e); - - FeatureParams params; - ftype::GetNameAndType(&e, params); - - TEST_EQUAL(params.m_Types.size(), 1, ()); - TEST(params.IsTypeExist(GetType({"building", "address"})), ()); - TEST_EQUAL(params.house.Get(), "39/79", ()); - - FeatureBuilder1 fb; - fb.SetParams(params); - fb.SetCenter(m2::PointD(0, 0)); - - TEST(fb.PreSerialize(), ()); - TEST(fb.RemoveInvalidTypes(), ()); - TEST(fb.CheckValid(), ()); -} diff --git a/generator/osm2type.cpp b/generator/osm2type.cpp index 1a0331c9a3..b963c78dfb 100644 --- a/generator/osm2type.cpp +++ b/generator/osm2type.cpp @@ -107,6 +107,15 @@ namespace ftype }); } + string Normalize(string const & s) + { + // Unicode Compatibility Decomposition, + // followed by Canonical Composition (NFKC). + // Needed for better search matching. + QByteArray ba = QString::fromUtf8(s.c_str()).normalized(QString::NormalizationForm_KC).toUtf8(); + return ba.constData(); + } + class NamesExtractor { set m_savedNames; @@ -148,12 +157,7 @@ namespace ftype if (v.empty() || !GetLangByKey(k, lang)) return false; - // Unicode Compatibility Decomposition, - // followed by Canonical Composition (NFKC). 
- // Needed for better search matching - QByteArray const normBytes = QString::fromUtf8( - v.c_str()).normalized(QString::NormalizationForm_KC).toUtf8(); - m_params.AddName(lang, normBytes.constData()); + m_params.AddName(lang, Normalize(v)); k.clear(); v.clear(); return false; @@ -522,15 +526,17 @@ namespace ftype { "restaurant", "yes", [](string & k, string & v) { k.swap(v); k = "amenity"; }}, { "hotel", "yes", [](string & k, string & v) { k.swap(v); k = "tourism"; }}, { "building", "entrance", [](string & k, string & v) { k.swap(v); v = "yes"; }}, - { "addr:housename", "*", [&params](string & k, string & v) { params.AddHouseName(v); k.clear(); v.clear(); }}, - { "addr:street", "*", [&params](string & k, string & v) { params.AddStreetAddress(v); k.clear(); v.clear(); }}, - { "addr:housenumber", "*", [&params](string & k, string & v) - { - // Treat "numbers" like names if it's not an actual number. - if (!params.AddHouseNumber(v)) - params.AddHouseName(v); - k.clear(); v.clear(); - }}, + + { "addr:city", "*", [&params](string & k, string & v) { params.AddPlace(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:place", "*", [&params](string & k, string & v) { params.AddPlace(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:housenumber", "*", [&params](string & k, string & v) { params.AddHouseName(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:housename", "*", [&params](string & k, string & v) { params.AddHouseName(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:street", "*", [&params](string & k, string & v) { params.AddStreet(Normalize(v)); k.clear(); v.clear(); }}, + //{ "addr:streetnumber", "*", [&params](string & k, string & v) { params.AddStreet(Normalize(v)); k.clear(); v.clear(); }}, + //{ "addr:full", "*", [&params](string & k, string & v) { params.AddAddress(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:postcode", "*", [&params](string & k, string & v) { params.AddPostcode(Normalize(v)); k.clear(); v.clear(); }}, + { "addr:flats", "*", [&params](string & k, string & v) { params.flats = v; 
k.clear(); v.clear(); }}, + { "population", "*", [&params](string & k, string & v) { // Get population rank. diff --git a/generator/osm_translator.hpp b/generator/osm_translator.hpp index 074f06eda8..2e06c4d085 100644 --- a/generator/osm_translator.hpp +++ b/generator/osm_translator.hpp @@ -142,9 +142,12 @@ protected: for (auto const & p : e.tags) { // Store only this tags to use it in railway stations processing for the particular city. - if (p.first == "network" || p.first == "operator" || p.first == "route" || p.first == "maxspeed") + if (p.first == "network" || p.first == "operator" || p.first == "route" || p.first == "maxspeed" || + strings::StartsWith(p.first, "addr:")) + { if (!TBase::IsKeyTagExists(p.first)) TBase::m_current->AddTag(p.first, p.second); + } } } }; diff --git a/indexer/feature_data.cpp b/indexer/feature_data.cpp index ffaf297270..a0bf7e1091 100644 --- a/indexer/feature_data.cpp +++ b/indexer/feature_data.cpp @@ -171,6 +171,10 @@ struct IsBadChar } +///////////////////////////////////////////////////////////////////////////////////////// +// FeatureParams implementation +///////////////////////////////////////////////////////////////////////////////////////// + bool FeatureParams::AddName(string const & lang, string const & s) { if (IsDummyName(s)) @@ -218,29 +222,59 @@ bool FeatureParams::AddHouseNumber(string const & ss) return true; } -void FeatureParams::AddStreetAddress(string const & s) +void FeatureParams::AddStreet(string s) { - m_street = s; - // Erase bad chars (\n) because we write addresses to txt file. - m_street.erase(remove_if(m_street.begin(), m_street.end(), IsBadChar()), m_street.end()); + s.erase(remove_if(s.begin(), s.end(), IsBadChar()), s.end()); // Osm likes to put house numbers into addr:street field. 
- size_t i = m_street.find_last_of("\t "); + size_t i = s.find_last_of("\t "); if (i != string::npos) { - ++i; uint64_t n; - if (strings::to_uint64(m_street.substr(i), n)) - m_street.erase(i); + if (strings::to_uint64(s.substr(i+1), n)) + s.erase(s.find_last_not_of("\t ", i)+1); } + + m_addrTags.Add(AddressData::FAD_STREET, s); +} + +void FeatureParams::AddAddress(string const & s) +{ + size_t i = s.find_first_of("\t "); + if (i != string::npos) + { + string const house = s.substr(0, i); + if (feature::IsHouseNumber(house)) + { + AddHouseNumber(house); + i = s.find_first_not_of("\t ", i); + } + else + i = 0; + } + else + i = 0; + + AddStreet(s.substr(i, s.size()-i)); +} + +void FeatureParams::AddPlace(string const & s) +{ + m_addrTags.Add(AddressData::FAD_PLACE, s); +} + +void FeatureParams::AddPostcode(string const & s) +{ + m_addrTags.Add(AddressData::FAD_POSTCODE, s); } bool FeatureParams::FormatFullAddress(m2::PointD const & pt, string & res) const { - if (!m_street.empty() && !house.IsEmpty()) + string const street = GetStreet(); + if (!street.empty() && !house.IsEmpty()) { - res = m_street + "|" + house.Get() + "|" + res = street + "|" + house.Get() + "|" + strings::to_string_dac(MercatorBounds::YToLat(pt.y), 8) + "|" + strings::to_string_dac(MercatorBounds::XToLon(pt.x), 8) + '\n'; return true; @@ -249,6 +283,11 @@ bool FeatureParams::FormatFullAddress(m2::PointD const & pt, string & res) const return false; } +string FeatureParams::GetStreet() const +{ + return m_addrTags.Get(AddressData::FAD_STREET); +} + void FeatureParams::SetGeomType(feature::EGeomType t) { switch (t) diff --git a/indexer/feature_data.hpp b/indexer/feature_data.hpp index 07acea5c37..c12b309ea0 100644 --- a/indexer/feature_data.hpp +++ b/indexer/feature_data.hpp @@ -201,10 +201,8 @@ class FeatureParams : public FeatureParamsBase uint8_t m_geomType; - /// We use it now only for search unit tests - string m_street; - feature::Metadata m_metadata; + feature::AddressData m_addrTags; public: 
typedef vector TTypes; @@ -220,10 +218,18 @@ public: /// @name Used in storing full street address only. //@{ - void AddStreetAddress(string const & s); + void AddStreet(string s); + void AddPlace(string const & s); + void AddPostcode(string const & s); + void AddAddress(string const & s); + bool FormatFullAddress(m2::PointD const & pt, string & res) const; //@} + /// Used for testing purposes now. + string GetStreet() const; + feature::AddressData const & GetAddressData() const { return m_addrTags; } + /// Assign parameters except geometry type. /// Geometry is independent state and it's set by FeatureType's geometry functions. inline void SetParams(FeatureParams const & rhs) @@ -231,7 +237,7 @@ public: BaseT::operator=(rhs); m_Types = rhs.m_Types; - m_street = rhs.m_street; + m_addrTags = rhs.m_addrTags; m_metadata = rhs.m_metadata; } @@ -268,7 +274,7 @@ public: feature::Metadata const & GetMetadata() const { return m_metadata; } feature::Metadata & GetMetadata() { return m_metadata; } - template void Write(SinkT & sink, bool needStoreMetadata = true) const + template void Write(SinkT & sink, bool fullStoring) const { uint8_t const header = GetHeader(); @@ -277,13 +283,16 @@ public: for (size_t i = 0; i < m_Types.size(); ++i) WriteVarUint(sink, GetIndexForType(m_Types[i])); - if (needStoreMetadata) + if (fullStoring) + { m_metadata.Serialize(sink); + m_addrTags.Serialize(sink); + } BaseT::Write(sink, header); } - template void Read(SrcT & src, bool needReadMetadata = true) + template void Read(SrcT & src) { using namespace feature; @@ -294,8 +303,8 @@ public: for (size_t i = 0; i < count; ++i) m_Types.push_back(GetTypeForIndex(ReadVarUint(src))); - if (needReadMetadata) - m_metadata.Deserialize(src); + m_metadata.Deserialize(src); + m_addrTags.Deserialize(src); BaseT::Read(src, header); } diff --git a/indexer/feature_meta.hpp b/indexer/feature_meta.hpp index 98c8008c06..eda4527c84 100644 --- a/indexer/feature_meta.hpp +++ b/indexer/feature_meta.hpp @@ -12,7 
+12,62 @@ namespace feature { - class Metadata + class MetadataBase + { + public: + string Get(uint8_t type) const + { + auto it = m_metadata.find(type); + return (it == m_metadata.end()) ? string() : it->second; + } + + vector GetPresentTypes() const + { + vector types; + types.reserve(m_metadata.size()); + + for (auto const & item : m_metadata) + types.push_back(item.first); + + return types; + } + + void Drop(uint8_t type) + { + m_metadata.erase(type); + } + + inline bool Empty() const { return m_metadata.empty(); } + inline size_t Size() const { return m_metadata.size(); } + + template void Serialize(ArchiveT & ar) const + { + uint8_t const sz = m_metadata.size(); + WriteToSink(ar, sz); + for (auto const & it : m_metadata) + { + WriteToSink(ar, static_cast(it.first)); + utils::WriteString(ar, it.second); + } + } + + template void Deserialize(ArchiveT & ar) + { + uint8_t const sz = ReadPrimitiveFromSource(ar); + for (size_t i = 0; i < sz; ++i) + { + uint8_t const key = ReadPrimitiveFromSource(ar); + string value; + utils::ReadString(ar, value); + m_metadata.insert(make_pair(key, value)); + } + } + + protected: + map m_metadata; + }; + + class Metadata : public MetadataBase { public: /// @note! Do not change values here. @@ -45,8 +100,7 @@ namespace feature static_assert(FMD_COUNT <= 255, "Meta types count is limited to one byte."); - /// Empty value drops (clears) corresponding type. - void Set(EType type, string const & value) + void Add(EType type, string const & s) { auto found = m_metadata.find(type); if (found == m_metadata.end()) @@ -91,6 +145,11 @@ namespace feature string GetWikiURL() const; +======= + val = val + ", " + s; + } + +>>>>>>> 12268b5... [generator] Pass address tokens to the search index generation step. 
template void SerializeToMWM(ArchiveT & ar) const { for (auto const & e : m_metadata) @@ -112,43 +171,28 @@ namespace feature { ar.Read(header, sizeof(header)); ar.Read(buffer, header[1]); - m_metadata[ToType(header[0] & 0x7F)].assign(buffer, header[1]); + m_metadata[header[0] & 0x7F].assign(buffer, header[1]); } while (!(header[0] & 0x80)); } - template void Serialize(ArchiveT & ar) const - { - uint8_t const sz = m_metadata.size(); - WriteToSink(ar, sz); - for (auto const & it : m_metadata) - { - WriteToSink(ar, static_cast(it.first)); - utils::WriteString(ar, it.second); - } - } - - template void Deserialize(ArchiveT & ar) - { - uint8_t const sz = ReadPrimitiveFromSource(ar); - ASSERT_LESS_OR_EQUAL(sz, FMD_COUNT, ()); - - for (size_t i = 0; i < sz; ++i) - { - EType const key = ToType(ReadPrimitiveFromSource(ar)); - string value; - utils::ReadString(ar, value); - m_metadata.insert(make_pair(key, value)); - } - } - private: - static EType ToType(uint8_t key) - { - ASSERT(key > 0 && key < FMD_COUNT, (key)); - return static_cast(key); - } - enum { kMaxStringLength = 255 }; - map m_metadata; + }; + + class AddressData : public MetadataBase + { + public: + enum EType + { + FAD_PLACE = 1, + FAD_STREET = 2, + FAD_POSTCODE = 3, + }; + + void Add(EType type, string const & s) + { + /// @todo Probably, we need to add separator here and store multiple values. 
+ m_metadata[type] = s; + } }; } diff --git a/indexer/features_vector.hpp b/indexer/features_vector.hpp index be38523a73..ffc9d34861 100644 --- a/indexer/features_vector.hpp +++ b/indexer/features_vector.hpp @@ -68,4 +68,6 @@ public: feature::DataHeader const & GetHeader() const { return m_header; } FeaturesVector const & GetVector() const { return m_vector; } + + FilesContainerR::ReaderT GetReader(string const & tag) const { return m_cont.GetReader(tag); } }; diff --git a/indexer/search_index_builder.cpp b/indexer/search_index_builder.cpp index e25c72dc65..1b3be46766 100644 --- a/indexer/search_index_builder.cpp +++ b/indexer/search_index_builder.cpp @@ -273,6 +273,23 @@ void AddFeatureNameIndexPairs(FeaturesVectorTest & features, CategoriesHolder & features.GetVector().ForEach(FeatureInserter( synonyms.get(), keyValuePairs, categoriesHolder, header.GetScaleRange(), valueBuilder)); + + ReaderSource src = features.GetReader(SEARCH_TOKENS_FILE_TAG); + uint64_t index = 0; + FeatureNameInserter inserter(nullptr, keyValuePairs); + int8_t const lang = StringUtf8Multilang::GetLangIndex("default"); + + while (src.Size() > 0) + { + feature::AddressData data; + data.Deserialize(src); + + inserter.m_val.m_featureId = index++; + + string const street = data.Get(feature::AddressData::FAD_STREET); + if (!street.empty()) + inserter(lang, street); + } } } // namespace