forked from organicmaps/organicmaps
[geocoder] Optimize memory: name dictionary for address parts
This commit is contained in:
parent
f6cbdacaa7
commit
6837cafdbd
12 changed files with 245 additions and 93 deletions
|
@ -12,6 +12,8 @@ set(
|
|||
hierarchy_reader.hpp
|
||||
index.cpp
|
||||
index.hpp
|
||||
name_dictionary.cpp
|
||||
name_dictionary.hpp
|
||||
result.cpp
|
||||
result.hpp
|
||||
types.cpp
|
||||
|
|
|
@ -374,10 +374,14 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
|
|||
{
|
||||
m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) {
|
||||
auto const & bld = m_index.GetDoc(buildingDocId);
|
||||
auto const bt = static_cast<size_t>(Type::Building);
|
||||
auto const & realHN = MakeHouseNumber(bld.m_address[bt]);
|
||||
if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */))
|
||||
auto const & realHN = bld.GetNormalizedName(Type::Building,
|
||||
m_hierarchy.GetNormalizedNameDictionary());
|
||||
auto const & realHNUniStr = strings::MakeUniString(realHN);
|
||||
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
|
||||
false /* queryIsPrefix */))
|
||||
{
|
||||
curLayer.m_entries.emplace_back(buildingDocId);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
@ -405,7 +409,7 @@ bool Geocoder::HasParent(vector<Geocoder::Layer> const & layers, Hierarchy::Entr
|
|||
// Note that the relationship is somewhat inverted: every ancestor
|
||||
// is stored in the address but the nodes have no information
|
||||
// about their children.
|
||||
if (m_index.GetDoc(docId).IsParentTo(e))
|
||||
if (m_hierarchy.IsParentTo(m_index.GetDoc(docId), e))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
|
|
@ -24,13 +24,28 @@ void PrintResults(Hierarchy const & hierarchy, vector<Result> const & results)
|
|||
if (results.empty())
|
||||
return;
|
||||
cout << "Top results:" << endl;
|
||||
|
||||
auto const & dictionary = hierarchy.GetNormalizedNameDictionary();
|
||||
for (size_t i = 0; i < results.size(); ++i)
|
||||
{
|
||||
if (FLAGS_top >= 0 && static_cast<int32_t>(i) >= FLAGS_top)
|
||||
break;
|
||||
cout << " " << DebugPrint(results[i]);
|
||||
if (auto const && e = hierarchy.GetEntryForOsmId(results[i].m_osmId))
|
||||
cout << " " << DebugPrint(e->m_address);
|
||||
{
|
||||
cout << " [";
|
||||
auto const * delimiter = "";
|
||||
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
|
||||
{
|
||||
if (e->m_normalizedAddress[i])
|
||||
{
|
||||
auto type = static_cast<Type>(i);
|
||||
cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary);
|
||||
delimiter = ", ";
|
||||
}
|
||||
}
|
||||
cout << "]";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,13 +29,6 @@ C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates"
|
|||
C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}}
|
||||
C00000000059D6B5 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"locales": {"default": {"name": "Florencia", "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 6}}
|
||||
)#";
|
||||
|
||||
geocoder::Tokens Split(string const & s)
|
||||
{
|
||||
geocoder::Tokens result;
|
||||
search::NormalizeAndTokenizeAsUtf8(s, result);
|
||||
return result;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace geocoder
|
||||
|
@ -74,6 +67,8 @@ UNIT_TEST(Geocoder_Hierarchy)
|
|||
{
|
||||
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
|
||||
Geocoder geocoder(regionsJsonFile.GetFullPath());
|
||||
auto const & hierarchy = geocoder.GetHierarchy();
|
||||
auto const & dictionary = hierarchy.GetNormalizedNameDictionary();
|
||||
|
||||
vector<Hierarchy::Entry> entries;
|
||||
geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) {
|
||||
|
@ -81,9 +76,9 @@ UNIT_TEST(Geocoder_Hierarchy)
|
|||
});
|
||||
|
||||
TEST_EQUAL(entries.size(), 1, ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"), ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"), ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ());
|
||||
}
|
||||
|
||||
UNIT_TEST(Geocoder_OnlyBuildings)
|
||||
|
|
|
@ -16,12 +16,14 @@ using namespace std;
|
|||
namespace geocoder
|
||||
{
|
||||
// Hierarchy::Entry --------------------------------------------------------------------------------
|
||||
bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats & stats)
|
||||
bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr,
|
||||
NameDictionaryMaker & normalizedNameDictionaryMaker,
|
||||
ParsingStats & stats)
|
||||
{
|
||||
try
|
||||
{
|
||||
base::Json root(jsonStr.c_str());
|
||||
return DeserializeFromJSONImpl(root.get(), jsonStr, stats);
|
||||
return DeserializeFromJSONImpl(root.get(), jsonStr, normalizedNameDictionaryMaker, stats);
|
||||
}
|
||||
catch (base::Json::Exception const & e)
|
||||
{
|
||||
|
@ -32,6 +34,7 @@ bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats
|
|||
|
||||
// todo(@m) Factor out to geojson.hpp? Add geojson to myjansson?
|
||||
bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const & jsonStr,
|
||||
NameDictionaryMaker & normalizedNameDictionaryMaker,
|
||||
ParsingStats & stats)
|
||||
{
|
||||
if (!json_is_object(root))
|
||||
|
@ -43,7 +46,8 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales",
|
||||
"default");
|
||||
auto const address = base::GetJSONObligatoryField(defaultLocale, "address");
|
||||
bool hasDuplicateAddress = false;
|
||||
m_normalizedAddress= {};
|
||||
Tokens tokens;
|
||||
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
|
||||
{
|
||||
Type const type = static_cast<Type>(i);
|
||||
|
@ -60,69 +64,52 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
if (levelValue.empty())
|
||||
continue;
|
||||
|
||||
if (!m_address[i].empty())
|
||||
{
|
||||
LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr));
|
||||
hasDuplicateAddress = true;
|
||||
}
|
||||
search::NormalizeAndTokenizeAsUtf8(levelValue, tokens);
|
||||
if (tokens.empty())
|
||||
continue;
|
||||
|
||||
search::NormalizeAndTokenizeAsUtf8(levelValue, m_address[i]);
|
||||
|
||||
if (!m_address[i].empty())
|
||||
m_type = static_cast<Type>(i);
|
||||
auto normalizedValue = strings::JoinStrings(tokens, " ");
|
||||
m_normalizedAddress[i] = normalizedNameDictionaryMaker.Add(normalizedValue);
|
||||
m_type = static_cast<Type>(i);
|
||||
}
|
||||
|
||||
auto const & subregion = m_address[static_cast<size_t>(Type::Subregion)];
|
||||
auto const & locality = m_address[static_cast<size_t>(Type::Locality)];
|
||||
if (m_type == Type::Street && locality.empty() && subregion.empty() /* if locality detection fail */)
|
||||
auto const & subregion = m_normalizedAddress[static_cast<size_t>(Type::Subregion)];
|
||||
auto const & locality = m_normalizedAddress[static_cast<size_t>(Type::Locality)];
|
||||
if (m_type == Type::Street && !locality && !subregion)
|
||||
{
|
||||
++stats.m_noLocalityStreets;
|
||||
return false;
|
||||
}
|
||||
if (m_type == Type::Building && locality.empty() && subregion.empty() /* if locality detection fail */)
|
||||
if (m_type == Type::Building && !locality && !subregion)
|
||||
{
|
||||
++stats.m_noLocalityBuildings;
|
||||
return false;
|
||||
}
|
||||
|
||||
m_nameTokens.clear();
|
||||
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
|
||||
search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens);
|
||||
|
||||
if (m_name.empty())
|
||||
++stats.m_emptyNames;
|
||||
|
||||
if (hasDuplicateAddress)
|
||||
++stats.m_duplicateAddresses;
|
||||
|
||||
if (m_type == Type::Count)
|
||||
{
|
||||
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
|
||||
++stats.m_emptyAddresses;
|
||||
}
|
||||
else if (m_nameTokens != m_address[static_cast<size_t>(m_type)])
|
||||
{
|
||||
++stats.m_mismatchedNames;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
|
||||
std::string const & Hierarchy::Entry::GetNormalizedName(
|
||||
Type type, NameDictionary const & normalizedNameDictionary) const
|
||||
{
|
||||
for (size_t i = 0; i < static_cast<size_t>(geocoder::Type::Count); ++i)
|
||||
{
|
||||
if (!m_address[i].empty() && m_address[i] != e.m_address[i])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
return normalizedNameDictionary.Get(m_normalizedAddress[static_cast<size_t>(type)]);
|
||||
}
|
||||
|
||||
// Hierarchy ---------------------------------------------------------------------------------------
|
||||
Hierarchy::Hierarchy(vector<Entry> && entries, bool sorted)
|
||||
: m_entries{std::move(entries)}
|
||||
Hierarchy::Hierarchy(vector<Entry> && entries, NameDictionary && normalizedNameDictionary)
|
||||
: m_entries{move(entries)}
|
||||
, m_normalizedNameDictionary{move(normalizedNameDictionary)}
|
||||
{
|
||||
if (!sorted)
|
||||
if (!is_sorted(m_entries.begin(), m_entries.end()))
|
||||
{
|
||||
LOG(LINFO, ("Sorting entries..."));
|
||||
sort(m_entries.begin(), m_entries.end());
|
||||
|
@ -134,6 +121,11 @@ vector<Hierarchy::Entry> const & Hierarchy::GetEntries() const
|
|||
return m_entries;
|
||||
}
|
||||
|
||||
// Read-only access to the dictionary that the |m_normalizedAddress| positions
// of the entries are resolved against.
NameDictionary const & Hierarchy::GetNormalizedNameDictionary() const
{
  return m_normalizedNameDictionary;
}
|
||||
|
||||
Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & osmId) const
|
||||
{
|
||||
auto const cmp = [](Hierarchy::Entry const & e, base::GeoObjectId const & id) {
|
||||
|
@ -147,4 +139,24 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o
|
|||
|
||||
return &(*it);
|
||||
}
|
||||
|
||||
// Checks whether |entry| is a parent of |toEntry|.
// Every address level that is specified in |entry| must also be specified in
// |toEntry| and must resolve to the same normalized name. Levels that |entry|
// leaves unspecified impose no constraint.
bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const
{
  auto const & parentAddress = entry.m_normalizedAddress;
  auto const & childAddress = toEntry.m_normalizedAddress;

  for (size_t level = 0; level < static_cast<size_t>(geocoder::Type::Count); ++level)
  {
    auto const parentPos = parentAddress[level];
    if (!parentPos)
      continue;

    auto const childPos = childAddress[level];
    if (!childPos)
      return false;

    // Equal positions refer to the same dictionary string, no lookup is needed.
    if (parentPos == childPos)
      continue;

    // Different positions: fall back to comparing the names themselves.
    if (m_normalizedNameDictionary.Get(parentPos) != m_normalizedNameDictionary.Get(childPos))
      return false;
  }

  return true;
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "geocoder/name_dictionary.hpp"
|
||||
#include "geocoder/types.hpp"
|
||||
|
||||
#include "base/geo_object_id.hpp"
|
||||
|
@ -58,36 +59,38 @@ public:
|
|||
// part of the geojson entry.
|
||||
struct Entry
|
||||
{
|
||||
bool DeserializeFromJSON(std::string const & jsonStr, ParsingStats & stats);
|
||||
|
||||
bool DeserializeFromJSON(std::string const & jsonStr,
|
||||
NameDictionaryMaker & normalizedNameDictionaryMaker,
|
||||
ParsingStats & stats);
|
||||
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
|
||||
NameDictionaryMaker & normalizedNameDictionaryMaker,
|
||||
ParsingStats & stats);
|
||||
|
||||
// Checks whether this entry is a parent of |e|.
|
||||
bool IsParentTo(Entry const & e) const;
|
||||
|
||||
std::string const & GetNormalizedName(Type type,
|
||||
NameDictionary const & normalizedNameDictionary) const;
|
||||
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
|
||||
|
||||
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
|
||||
|
||||
// Original name of the entry. Useful for debugging.
|
||||
std::string m_name;
|
||||
// Tokenized and simplified name of the entry.
|
||||
Tokens m_nameTokens;
|
||||
|
||||
Type m_type = Type::Count;
|
||||
|
||||
// The address fields of this entry, one per Type.
|
||||
std::array<Tokens, static_cast<size_t>(Type::Count)> m_address;
|
||||
// The positions of entry address fields in normalized name dictionary, one per Type.
|
||||
std::array<NameDictionary::Position, static_cast<size_t>(Type::Count)> m_normalizedAddress{};
|
||||
};
|
||||
|
||||
explicit Hierarchy(std::vector<Entry> && entries, bool sorted);
|
||||
explicit Hierarchy(std::vector<Entry> && entries, NameDictionary && normalizeNameDictionary);
|
||||
|
||||
std::vector<Entry> const & GetEntries() const;
|
||||
NameDictionary const & GetNormalizedNameDictionary() const;
|
||||
|
||||
Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const;
|
||||
bool IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const;
|
||||
|
||||
private:
|
||||
std::vector<Entry> m_entries;
|
||||
NameDictionary m_normalizedNameDictionary;
|
||||
};
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -61,6 +61,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
|
|||
LOG(LINFO, ("Reading entries..."));
|
||||
|
||||
vector<Entry> entries;
|
||||
NameDictionaryMaker nameDictionaryMaker;
|
||||
ParsingStats stats{};
|
||||
|
||||
base::thread_pool::computational::ThreadPool threadPool{readersCount};
|
||||
|
@ -77,6 +78,18 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
|
|||
tasks.pop_front();
|
||||
|
||||
auto & taskEntries = taskResult.m_entries;
|
||||
auto const & taskNameDictionary = taskResult.m_nameDictionary;
|
||||
for (auto & entry : taskEntries)
|
||||
{
|
||||
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
|
||||
{
|
||||
if (auto & position = entry.m_normalizedAddress[i])
|
||||
{
|
||||
auto const & name = taskNameDictionary.Get(position);
|
||||
position = nameDictionaryMaker.Add(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
move(begin(taskEntries), end(taskEntries), back_inserter(entries));
|
||||
|
||||
stats += taskResult.m_stats;
|
||||
|
@ -105,7 +118,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
|
|||
("Entries whose names do not match their most specific addresses:", stats.m_mismatchedNames));
|
||||
LOG(LINFO, ("(End of stats.)"));
|
||||
|
||||
return Hierarchy{move(entries), true};
|
||||
return Hierarchy{move(entries), nameDictionaryMaker.Release()};
|
||||
}
|
||||
|
||||
void HierarchyReader::CheckDuplicateOsmIds(vector<geocoder::Hierarchy::Entry> const & entries,
|
||||
|
@ -155,6 +168,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
|
|||
{
|
||||
vector<Entry> entries;
|
||||
entries.reserve(bufferSize);
|
||||
NameDictionaryMaker nameDictionaryMaker;
|
||||
ParsingStats stats;
|
||||
|
||||
for (size_t i = 0; i < bufferSize; ++i)
|
||||
|
@ -178,7 +192,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
|
|||
auto const osmId = base::GeoObjectId(encodedId);
|
||||
entry.m_osmId = osmId;
|
||||
|
||||
if (!entry.DeserializeFromJSON(json, stats))
|
||||
if (!entry.DeserializeFromJSON(json, nameDictionaryMaker, stats))
|
||||
continue;
|
||||
|
||||
if (entry.m_type == Type::Count)
|
||||
|
@ -193,7 +207,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
|
|||
entries.push_back(move(entry));
|
||||
}
|
||||
|
||||
return {move(entries), move(stats)};
|
||||
return {move(entries), nameDictionaryMaker.Release(), move(stats)};
|
||||
}
|
||||
|
||||
// static
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "geocoder/hierarchy.hpp"
|
||||
#include "geocoder/name_dictionary.hpp"
|
||||
|
||||
#include "base/exception.hpp"
|
||||
#include "base/geo_object_id.hpp"
|
||||
|
@ -33,12 +34,13 @@ private:
|
|||
struct ParsingResult
|
||||
{
|
||||
std::vector<Entry> m_entries;
|
||||
NameDictionary m_nameDictionary;
|
||||
ParsingStats m_stats;
|
||||
};
|
||||
|
||||
ParsingResult ReadEntries(size_t count);
|
||||
ParsingResult DeserializeEntries(std::vector<std::string> const & linesBuffer,
|
||||
std::size_t const bufferSize);
|
||||
std::size_t const bufferSize);
|
||||
static bool DeserializeId(std::string const & str, uint64_t & id);
|
||||
static std::string SerializeId(uint64_t id);
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@ namespace geocoder
|
|||
{
|
||||
Index::Index(Hierarchy const & hierarchy, unsigned int loadThreadsCount)
|
||||
: m_docs(hierarchy.GetEntries())
|
||||
, m_hierarchy{hierarchy}
|
||||
{
|
||||
CHECK_GREATER_OR_EQUAL(loadThreadsCount, 1, ());
|
||||
|
||||
|
@ -55,6 +56,8 @@ string Index::MakeIndexKey(Tokens const & tokens)
|
|||
void Index::AddEntries()
|
||||
{
|
||||
size_t numIndexed = 0;
|
||||
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
|
||||
Tokens tokens;
|
||||
for (DocId docId = 0; docId < static_cast<DocId>(m_docs.size()); ++docId)
|
||||
{
|
||||
auto const & doc = m_docs[static_cast<size_t>(docId)];
|
||||
|
@ -72,8 +75,9 @@ void Index::AddEntries()
|
|||
}
|
||||
else
|
||||
{
|
||||
size_t const t = static_cast<size_t>(doc.m_type);
|
||||
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
|
||||
auto const & name = doc.GetNormalizedName(doc.m_type, dictionary);
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
}
|
||||
|
||||
++numIndexed;
|
||||
|
@ -88,28 +92,30 @@ void Index::AddEntries()
|
|||
void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
|
||||
{
|
||||
CHECK_EQUAL(doc.m_type, Type::Street, ());
|
||||
size_t const t = static_cast<size_t>(doc.m_type);
|
||||
|
||||
auto isStreetSynonym = [] (string const & s) {
|
||||
return search::IsStreetSynonym(strings::MakeUniString(s));
|
||||
};
|
||||
|
||||
if (all_of(begin(doc.m_address[t]), end(doc.m_address[t]), isStreetSynonym))
|
||||
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
|
||||
auto const & name = doc.GetNormalizedName(Type::Street, dictionary);
|
||||
Tokens tokens;
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
|
||||
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
|
||||
{
|
||||
LOG(LDEBUG, ("Undefined proper name in tokens", doc.m_address[t], "of street entry",
|
||||
doc.m_osmId, "(", doc.m_address, ")"));
|
||||
if (doc.m_address[t].size() > 1)
|
||||
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
|
||||
if (tokens.size() > 1)
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
return;
|
||||
}
|
||||
|
||||
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
|
||||
for (size_t i = 0; i < doc.m_address[t].size(); ++i)
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
if (!isStreetSynonym(doc.m_address[t][i]))
|
||||
if (!isStreetSynonym(tokens[i]))
|
||||
continue;
|
||||
auto addr = doc.m_address[t];
|
||||
auto addr = tokens;
|
||||
addr.erase(addr.begin() + i);
|
||||
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
|
||||
}
|
||||
|
@ -123,6 +129,8 @@ void Index::AddHouses(unsigned int loadThreadsCount)
|
|||
vector<thread> threads(loadThreadsCount);
|
||||
CHECK_GREATER(threads.size(), 0, ());
|
||||
|
||||
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
|
||||
|
||||
for (size_t t = 0; t < threads.size(); ++t)
|
||||
{
|
||||
threads[t] = thread([&, t, this]() {
|
||||
|
@ -137,31 +145,39 @@ void Index::AddHouses(unsigned int loadThreadsCount)
|
|||
if (buildingDoc.m_type != Type::Building)
|
||||
continue;
|
||||
|
||||
auto const & street = buildingDoc.m_address[static_cast<size_t>(Type::Street)];
|
||||
auto const & locality = buildingDoc.m_address[static_cast<size_t>(Type::Locality)];
|
||||
auto const & street = buildingDoc.m_normalizedAddress[static_cast<size_t>(Type::Street)];
|
||||
auto const & locality =
|
||||
buildingDoc.m_normalizedAddress[static_cast<size_t>(Type::Locality)];
|
||||
|
||||
Tokens const * relationName = nullptr;
|
||||
|
||||
if (!street.empty())
|
||||
relationName = &street;
|
||||
else if (!locality.empty())
|
||||
relationName = &locality;
|
||||
|
||||
if (!relationName)
|
||||
NameDictionary::Position relation = NameDictionary::kUnspecifiedPosition;
|
||||
if (street)
|
||||
relation = street;
|
||||
else if (locality)
|
||||
relation = locality;
|
||||
else
|
||||
continue;
|
||||
|
||||
ForEachDocId(*relationName, [&](DocId const & candidate) {
|
||||
auto const & relationName = dictionary.Get(relation);
|
||||
Tokens relationNameTokens;
|
||||
search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens);
|
||||
bool indexed = false;
|
||||
ForEachDocId(relationNameTokens, [&](DocId const & candidate) {
|
||||
auto const & candidateDoc = GetDoc(candidate);
|
||||
if (candidateDoc.IsParentTo(buildingDoc))
|
||||
if (m_hierarchy.IsParentTo(candidateDoc, buildingDoc))
|
||||
{
|
||||
indexed = true;
|
||||
|
||||
lock_guard<mutex> lock(buildingsMutex);
|
||||
m_relatedBuildings[candidate].emplace_back(docId);
|
||||
}
|
||||
});
|
||||
|
||||
auto processedCount = numIndexed.fetch_add(1) + 1;
|
||||
if (processedCount % kLogBatch == 0)
|
||||
LOG(LINFO, ("Indexed", processedCount, "houses"));
|
||||
if (indexed)
|
||||
{
|
||||
auto processedCount = numIndexed.fetch_add(1) + 1;
|
||||
if (processedCount % kLogBatch == 0)
|
||||
LOG(LINFO, ("Indexed", processedCount, "houses"));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
@ -70,6 +70,7 @@ private:
|
|||
void AddHouses(unsigned int loadThreadsCount);
|
||||
|
||||
std::vector<Doc> const & m_docs;
|
||||
Hierarchy const & m_hierarchy;
|
||||
|
||||
std::unordered_map<std::string, std::vector<DocId>> m_docIdsByTokens;
|
||||
|
||||
|
|
42
geocoder/name_dictionary.cpp
Normal file
42
geocoder/name_dictionary.cpp
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include "geocoder/name_dictionary.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// NameDictionary ----------------------------------------------------------------------------------
|
||||
// Returns the string stored at |position|.
// Positions are 1-based: |position| must lie in [1, m_stock.size()], which is
// enforced by the CHECKs below.
std::string const & NameDictionary::Get(Position position) const
{
  CHECK_GREATER(position, 0, ());
  CHECK_LESS_OR_EQUAL(position, m_stock.size(), ());

  // The string with position p is kept at index p - 1.
  auto const index = position - 1;
  return m_stock[index];
}
|
||||
|
||||
// Appends a copy of |s| to the stock and returns its 1-based position.
// No deduplication is done here: every call stores a new string.
NameDictionary::Position NameDictionary::Add(std::string const & s)
{
  // The new position equals the size after insertion, so it must still fit
  // into a Position.
  CHECK_LESS(m_stock.size(), UINT32_MAX, ());

  m_stock.emplace_back(s);

  // Position of the last element: its index + 1.
  return static_cast<Position>(m_stock.size());
}
|
||||
|
||||
// NameDictionaryMaker -----------------------------------------------------------------------------
|
||||
// Returns the position of |s| in the dictionary under construction.
// A string that was added before gets its previously assigned position;
// a new string is appended to the dictionary and remembered in |m_index|.
NameDictionary::Position NameDictionaryMaker::Add(std::string const & s)
{
  auto const known = m_index.find(s);
  if (known == m_index.end())
  {
    auto const position = m_dictionary.Add(s);

    // |s| was not in the index a moment ago, so the insertion has to succeed.
    auto indexEmplace = m_index.emplace(m_dictionary.Get(position), position);
    CHECK(indexEmplace.second, ());

    return position;
  }

  return known->second;
}
|
||||
|
||||
// Hands the accumulated dictionary over to the caller and drops the lookup
// index. |m_dictionary| is left in a moved-from state afterwards.
NameDictionary NameDictionaryMaker::Release()
{
  NameDictionary released = std::move(m_dictionary);
  m_index.clear();
  return released;
}
|
||||
} // namespace geocoder
|
46
geocoder/name_dictionary.hpp
Normal file
46
geocoder/name_dictionary.hpp
Normal file
|
@ -0,0 +1,46 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// Append-only storage of strings addressed by 1-based positions.
// The class is move-only.
class NameDictionary
{
public:
  // A Position is either kUnspecifiedPosition or a value >= 1.
  using Position = std::uint32_t;

  static constexpr Position kUnspecifiedPosition = 0;

  NameDictionary() = default;

  // Copying is disabled.
  NameDictionary(NameDictionary const &) = delete;
  NameDictionary & operator=(NameDictionary const &) = delete;

  // Moving is allowed.
  NameDictionary(NameDictionary &&) = default;
  NameDictionary & operator=(NameDictionary &&) = default;

  // Returns the string stored at |position|, which must be a position
  // previously returned by Add().
  std::string const & Get(Position position) const;
  // Stores |s| and returns its position (>= 1).
  Position Add(std::string const & s);

private:
  // The string with position p is kept at index p - 1.
  std::vector<std::string> m_stock;
};
|
||||
|
||||
class NameDictionaryMaker
|
||||
{
|
||||
public:
|
||||
NameDictionaryMaker() = default;
|
||||
NameDictionaryMaker(NameDictionaryMaker const &) = delete;
|
||||
NameDictionaryMaker & operator=(NameDictionaryMaker const &) = delete;
|
||||
|
||||
NameDictionary::Position Add(std::string const & s);
|
||||
NameDictionary Release();
|
||||
|
||||
private:
|
||||
NameDictionary m_dictionary;
|
||||
std::unordered_map<std::string, NameDictionary::Position> m_index;
|
||||
};
|
||||
} // namespace geocoder
|
Loading…
Add table
Reference in a new issue