[geocoder] Optimize memory: name dictionary for address parts

This commit is contained in:
Anatoly Serdtcev 2019-07-29 16:21:31 +03:00 committed by mpimenov
parent f6cbdacaa7
commit 6837cafdbd
12 changed files with 245 additions and 93 deletions

View file

@ -12,6 +12,8 @@ set(
hierarchy_reader.hpp
index.cpp
index.hpp
name_dictionary.cpp
name_dictionary.hpp
result.cpp
result.hpp
types.cpp

View file

@ -374,10 +374,14 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
{
m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) {
auto const & bld = m_index.GetDoc(buildingDocId);
auto const bt = static_cast<size_t>(Type::Building);
auto const & realHN = MakeHouseNumber(bld.m_address[bt]);
if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */))
auto const & realHN = bld.GetNormalizedName(Type::Building,
m_hierarchy.GetNormalizedNameDictionary());
auto const & realHNUniStr = strings::MakeUniString(realHN);
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
false /* queryIsPrefix */))
{
curLayer.m_entries.emplace_back(buildingDocId);
}
});
}
});
@ -405,7 +409,7 @@ bool Geocoder::HasParent(vector<Geocoder::Layer> const & layers, Hierarchy::Entr
// Note that the relationship is somewhat inverted: every ancestor
// is stored in the address but the nodes have no information
// about their children.
if (m_index.GetDoc(docId).IsParentTo(e))
if (m_hierarchy.IsParentTo(m_index.GetDoc(docId), e))
return true;
}
return false;

View file

@ -24,13 +24,28 @@ void PrintResults(Hierarchy const & hierarchy, vector<Result> const & results)
if (results.empty())
return;
cout << "Top results:" << endl;
auto const & dictionary = hierarchy.GetNormalizedNameDictionary();
for (size_t i = 0; i < results.size(); ++i)
{
if (FLAGS_top >= 0 && static_cast<int32_t>(i) >= FLAGS_top)
break;
cout << " " << DebugPrint(results[i]);
if (auto const && e = hierarchy.GetEntryForOsmId(results[i].m_osmId))
cout << " " << DebugPrint(e->m_address);
{
cout << " [";
auto const * delimiter = "";
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
{
if (e->m_normalizedAddress[i])
{
auto type = static_cast<Type>(i);
cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary);
delimiter = ", ";
}
}
cout << "]";
}
cout << endl;
}
}

View file

@ -29,13 +29,6 @@ C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates"
C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}}
C00000000059D6B5 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"locales": {"default": {"name": "Florencia", "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 6}}
)#";
geocoder::Tokens Split(string const & s)
{
geocoder::Tokens result;
search::NormalizeAndTokenizeAsUtf8(s, result);
return result;
}
} // namespace
namespace geocoder
@ -74,6 +67,8 @@ UNIT_TEST(Geocoder_Hierarchy)
{
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
auto const & hierarchy = geocoder.GetHierarchy();
auto const & dictionary = hierarchy.GetNormalizedNameDictionary();
vector<Hierarchy::Entry> entries;
geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) {
@ -81,9 +76,9 @@ UNIT_TEST(Geocoder_Hierarchy)
});
TEST_EQUAL(entries.size(), 1, ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"), ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"), ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ());
}
UNIT_TEST(Geocoder_OnlyBuildings)

View file

@ -16,12 +16,14 @@ using namespace std;
namespace geocoder
{
// Hierarchy::Entry --------------------------------------------------------------------------------
bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats & stats)
bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr,
NameDictionaryMaker & normalizedNameDictionaryMaker,
ParsingStats & stats)
{
try
{
base::Json root(jsonStr.c_str());
return DeserializeFromJSONImpl(root.get(), jsonStr, stats);
return DeserializeFromJSONImpl(root.get(), jsonStr, normalizedNameDictionaryMaker, stats);
}
catch (base::Json::Exception const & e)
{
@ -32,6 +34,7 @@ bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats
// todo(@m) Factor out to geojson.hpp? Add geojson to myjansson?
bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const & jsonStr,
NameDictionaryMaker & normalizedNameDictionaryMaker,
ParsingStats & stats)
{
if (!json_is_object(root))
@ -43,7 +46,8 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales",
"default");
auto const address = base::GetJSONObligatoryField(defaultLocale, "address");
bool hasDuplicateAddress = false;
m_normalizedAddress= {};
Tokens tokens;
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
{
Type const type = static_cast<Type>(i);
@ -60,69 +64,52 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
if (levelValue.empty())
continue;
if (!m_address[i].empty())
{
LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr));
hasDuplicateAddress = true;
}
search::NormalizeAndTokenizeAsUtf8(levelValue, tokens);
if (tokens.empty())
continue;
search::NormalizeAndTokenizeAsUtf8(levelValue, m_address[i]);
if (!m_address[i].empty())
m_type = static_cast<Type>(i);
auto normalizedValue = strings::JoinStrings(tokens, " ");
m_normalizedAddress[i] = normalizedNameDictionaryMaker.Add(normalizedValue);
m_type = static_cast<Type>(i);
}
auto const & subregion = m_address[static_cast<size_t>(Type::Subregion)];
auto const & locality = m_address[static_cast<size_t>(Type::Locality)];
if (m_type == Type::Street && locality.empty() && subregion.empty() /* if locality detection fail */)
auto const & subregion = m_normalizedAddress[static_cast<size_t>(Type::Subregion)];
auto const & locality = m_normalizedAddress[static_cast<size_t>(Type::Locality)];
if (m_type == Type::Street && !locality && !subregion)
{
++stats.m_noLocalityStreets;
return false;
}
if (m_type == Type::Building && locality.empty() && subregion.empty() /* if locality detection fail */)
if (m_type == Type::Building && !locality && !subregion)
{
++stats.m_noLocalityBuildings;
return false;
}
m_nameTokens.clear();
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens);
if (m_name.empty())
++stats.m_emptyNames;
if (hasDuplicateAddress)
++stats.m_duplicateAddresses;
if (m_type == Type::Count)
{
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
++stats.m_emptyAddresses;
}
else if (m_nameTokens != m_address[static_cast<size_t>(m_type)])
{
++stats.m_mismatchedNames;
}
return true;
}
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
std::string const & Hierarchy::Entry::GetNormalizedName(
Type type, NameDictionary const & normalizedNameDictionary) const
{
for (size_t i = 0; i < static_cast<size_t>(geocoder::Type::Count); ++i)
{
if (!m_address[i].empty() && m_address[i] != e.m_address[i])
return false;
}
return true;
return normalizedNameDictionary.Get(m_normalizedAddress[static_cast<size_t>(type)]);
}
// Hierarchy ---------------------------------------------------------------------------------------
Hierarchy::Hierarchy(vector<Entry> && entries, bool sorted)
: m_entries{std::move(entries)}
Hierarchy::Hierarchy(vector<Entry> && entries, NameDictionary && normalizedNameDictionary)
: m_entries{move(entries)}
, m_normalizedNameDictionary{move(normalizedNameDictionary)}
{
if (!sorted)
if (!is_sorted(m_entries.begin(), m_entries.end()))
{
LOG(LINFO, ("Sorting entries..."));
sort(m_entries.begin(), m_entries.end());
@ -134,6 +121,11 @@ vector<Hierarchy::Entry> const & Hierarchy::GetEntries() const
return m_entries;
}
// Returns the dictionary that stores the normalized address names of the entries;
// the positions kept in Entry::m_normalizedAddress are resolved against it.
NameDictionary const & Hierarchy::GetNormalizedNameDictionary() const
{
return m_normalizedNameDictionary;
}
Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & osmId) const
{
auto const cmp = [](Hierarchy::Entry const & e, base::GeoObjectId const & id) {
@ -147,4 +139,24 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o
return &(*it);
}
// Checks whether |entry| is a parent of |toEntry|: every address part that is
// specified in |entry| must also be specified in |toEntry| and must resolve to
// the same normalized name in the dictionary of this hierarchy.
bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const
{
  auto const levelsCount = static_cast<size_t>(geocoder::Type::Count);
  for (size_t level = 0; level < levelsCount; ++level)
  {
    auto const parentPos = entry.m_normalizedAddress[level];
    // A part that the candidate parent does not specify imposes no constraint.
    if (!parentPos)
      continue;

    auto const childPos = toEntry.m_normalizedAddress[level];
    if (!childPos)
      return false;

    // Equal positions refer to the same dictionary string, so the strings are
    // compared only when the positions differ.
    if (parentPos == childPos)
      continue;

    if (m_normalizedNameDictionary.Get(parentPos) != m_normalizedNameDictionary.Get(childPos))
      return false;
  }
  return true;
}
} // namespace geocoder

View file

@ -1,5 +1,6 @@
#pragma once
#include "geocoder/name_dictionary.hpp"
#include "geocoder/types.hpp"
#include "base/geo_object_id.hpp"
@ -58,36 +59,38 @@ public:
// part of the geojson entry.
struct Entry
{
bool DeserializeFromJSON(std::string const & jsonStr, ParsingStats & stats);
bool DeserializeFromJSON(std::string const & jsonStr,
NameDictionaryMaker & normalizedNameDictionaryMaker,
ParsingStats & stats);
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
NameDictionaryMaker & normalizedNameDictionaryMaker,
ParsingStats & stats);
// Checks whether this entry is a parent of |e|.
bool IsParentTo(Entry const & e) const;
std::string const & GetNormalizedName(Type type,
NameDictionary const & normalizedNameDictionary) const;
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
// Original name of the entry. Useful for debugging.
std::string m_name;
// Tokenized and simplified name of the entry.
Tokens m_nameTokens;
Type m_type = Type::Count;
// The address fields of this entry, one per Type.
std::array<Tokens, static_cast<size_t>(Type::Count)> m_address;
// The positions of entry address fields in normalized name dictionary, one per Type.
std::array<NameDictionary::Position, static_cast<size_t>(Type::Count)> m_normalizedAddress{};
};
explicit Hierarchy(std::vector<Entry> && entries, bool sorted);
explicit Hierarchy(std::vector<Entry> && entries, NameDictionary && normalizeNameDictionary);
std::vector<Entry> const & GetEntries() const;
NameDictionary const & GetNormalizedNameDictionary() const;
Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const;
bool IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const;
private:
std::vector<Entry> m_entries;
NameDictionary m_normalizedNameDictionary;
};
} // namespace geocoder

View file

@ -61,6 +61,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
LOG(LINFO, ("Reading entries..."));
vector<Entry> entries;
NameDictionaryMaker nameDictionaryMaker;
ParsingStats stats{};
base::thread_pool::computational::ThreadPool threadPool{readersCount};
@ -77,6 +78,18 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
tasks.pop_front();
auto & taskEntries = taskResult.m_entries;
auto const & taskNameDictionary = taskResult.m_nameDictionary;
for (auto & entry : taskEntries)
{
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
{
if (auto & position = entry.m_normalizedAddress[i])
{
auto const & name = taskNameDictionary.Get(position);
position = nameDictionaryMaker.Add(name);
}
}
}
move(begin(taskEntries), end(taskEntries), back_inserter(entries));
stats += taskResult.m_stats;
@ -105,7 +118,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
("Entries whose names do not match their most specific addresses:", stats.m_mismatchedNames));
LOG(LINFO, ("(End of stats.)"));
return Hierarchy{move(entries), true};
return Hierarchy{move(entries), nameDictionaryMaker.Release()};
}
void HierarchyReader::CheckDuplicateOsmIds(vector<geocoder::Hierarchy::Entry> const & entries,
@ -155,6 +168,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
{
vector<Entry> entries;
entries.reserve(bufferSize);
NameDictionaryMaker nameDictionaryMaker;
ParsingStats stats;
for (size_t i = 0; i < bufferSize; ++i)
@ -178,7 +192,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
auto const osmId = base::GeoObjectId(encodedId);
entry.m_osmId = osmId;
if (!entry.DeserializeFromJSON(json, stats))
if (!entry.DeserializeFromJSON(json, nameDictionaryMaker, stats))
continue;
if (entry.m_type == Type::Count)
@ -193,7 +207,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries(
entries.push_back(move(entry));
}
return {move(entries), move(stats)};
return {move(entries), nameDictionaryMaker.Release(), move(stats)};
}
// static

View file

@ -1,6 +1,7 @@
#pragma once
#include "geocoder/hierarchy.hpp"
#include "geocoder/name_dictionary.hpp"
#include "base/exception.hpp"
#include "base/geo_object_id.hpp"
@ -33,12 +34,13 @@ private:
// Result of deserializing one buffer of input lines.
struct ParsingResult
{
// Entries deserialized from the buffer.
std::vector<Entry> m_entries;
// Dictionary against which the m_normalizedAddress positions of |m_entries| are resolved.
NameDictionary m_nameDictionary;
// Parsing statistics collected while processing the buffer.
ParsingStats m_stats;
};
ParsingResult ReadEntries(size_t count);
ParsingResult DeserializeEntries(std::vector<std::string> const & linesBuffer,
std::size_t const bufferSize);
std::size_t const bufferSize);
static bool DeserializeId(std::string const & str, uint64_t & id);
static std::string SerializeId(uint64_t id);

View file

@ -26,6 +26,7 @@ namespace geocoder
{
Index::Index(Hierarchy const & hierarchy, unsigned int loadThreadsCount)
: m_docs(hierarchy.GetEntries())
, m_hierarchy{hierarchy}
{
CHECK_GREATER_OR_EQUAL(loadThreadsCount, 1, ());
@ -55,6 +56,8 @@ string Index::MakeIndexKey(Tokens const & tokens)
void Index::AddEntries()
{
size_t numIndexed = 0;
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
Tokens tokens;
for (DocId docId = 0; docId < static_cast<DocId>(m_docs.size()); ++docId)
{
auto const & doc = m_docs[static_cast<size_t>(docId)];
@ -72,8 +75,9 @@ void Index::AddEntries()
}
else
{
size_t const t = static_cast<size_t>(doc.m_type);
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
auto const & name = doc.GetNormalizedName(doc.m_type, dictionary);
search::NormalizeAndTokenizeAsUtf8(name, tokens);
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
}
++numIndexed;
@ -88,28 +92,30 @@ void Index::AddEntries()
void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
{
CHECK_EQUAL(doc.m_type, Type::Street, ());
size_t const t = static_cast<size_t>(doc.m_type);
auto isStreetSynonym = [] (string const & s) {
return search::IsStreetSynonym(strings::MakeUniString(s));
};
if (all_of(begin(doc.m_address[t]), end(doc.m_address[t]), isStreetSynonym))
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
auto const & name = doc.GetNormalizedName(Type::Street, dictionary);
Tokens tokens;
search::NormalizeAndTokenizeAsUtf8(name, tokens);
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
{
LOG(LDEBUG, ("Undefined proper name in tokens", doc.m_address[t], "of street entry",
doc.m_osmId, "(", doc.m_address, ")"));
if (doc.m_address[t].size() > 1)
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
if (tokens.size() > 1)
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
return;
}
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
for (size_t i = 0; i < doc.m_address[t].size(); ++i)
for (size_t i = 0; i < tokens.size(); ++i)
{
if (!isStreetSynonym(doc.m_address[t][i]))
if (!isStreetSynonym(tokens[i]))
continue;
auto addr = doc.m_address[t];
auto addr = tokens;
addr.erase(addr.begin() + i);
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
}
@ -123,6 +129,8 @@ void Index::AddHouses(unsigned int loadThreadsCount)
vector<thread> threads(loadThreadsCount);
CHECK_GREATER(threads.size(), 0, ());
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
for (size_t t = 0; t < threads.size(); ++t)
{
threads[t] = thread([&, t, this]() {
@ -137,31 +145,39 @@ void Index::AddHouses(unsigned int loadThreadsCount)
if (buildingDoc.m_type != Type::Building)
continue;
auto const & street = buildingDoc.m_address[static_cast<size_t>(Type::Street)];
auto const & locality = buildingDoc.m_address[static_cast<size_t>(Type::Locality)];
auto const & street = buildingDoc.m_normalizedAddress[static_cast<size_t>(Type::Street)];
auto const & locality =
buildingDoc.m_normalizedAddress[static_cast<size_t>(Type::Locality)];
Tokens const * relationName = nullptr;
if (!street.empty())
relationName = &street;
else if (!locality.empty())
relationName = &locality;
if (!relationName)
NameDictionary::Position relation = NameDictionary::kUnspecifiedPosition;
if (street)
relation = street;
else if (locality)
relation = locality;
else
continue;
ForEachDocId(*relationName, [&](DocId const & candidate) {
auto const & relationName = dictionary.Get(relation);
Tokens relationNameTokens;
search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens);
bool indexed = false;
ForEachDocId(relationNameTokens, [&](DocId const & candidate) {
auto const & candidateDoc = GetDoc(candidate);
if (candidateDoc.IsParentTo(buildingDoc))
if (m_hierarchy.IsParentTo(candidateDoc, buildingDoc))
{
indexed = true;
lock_guard<mutex> lock(buildingsMutex);
m_relatedBuildings[candidate].emplace_back(docId);
}
});
auto processedCount = numIndexed.fetch_add(1) + 1;
if (processedCount % kLogBatch == 0)
LOG(LINFO, ("Indexed", processedCount, "houses"));
if (indexed)
{
auto processedCount = numIndexed.fetch_add(1) + 1;
if (processedCount % kLogBatch == 0)
LOG(LINFO, ("Indexed", processedCount, "houses"));
}
}
});
}

View file

@ -70,6 +70,7 @@ private:
void AddHouses(unsigned int loadThreadsCount);
std::vector<Doc> const & m_docs;
Hierarchy const & m_hierarchy;
std::unordered_map<std::string, std::vector<DocId>> m_docIdsByTokens;

View file

@ -0,0 +1,42 @@
#include "geocoder/name_dictionary.hpp"
#include "base/assert.hpp"
#include <utility>
namespace geocoder
{
// NameDictionary ----------------------------------------------------------------------------------
// Returns the string stored at |position|.
// |position| must lie in the range [1, number of stored strings]; anything else
// (including kUnspecifiedPosition) fails a CHECK.
std::string const & NameDictionary::Get(Position position) const
{
  CHECK_GREATER(position, 0, ());
  CHECK_LESS_OR_EQUAL(position, m_stock.size(), ());

  // Positions are 1-based, the underlying storage is 0-based.
  auto const index = static_cast<std::size_t>(position) - 1;
  return m_stock[index];
}
// Appends a copy of |s| to the dictionary and returns its position.
// No deduplication is performed here: each call stores a new string.
NameDictionary::Position NameDictionary::Add(std::string const & s)
{
  // Guarantees that the position returned below fits into Position.
  CHECK_LESS(m_stock.size(), UINT32_MAX, ());

  m_stock.emplace_back(s);

  // Positions are 1-based, so the position of the last element equals the new size.
  return static_cast<Position>(m_stock.size());
}
// NameDictionaryMaker -----------------------------------------------------------------------------
// Returns the position of |s| in the dictionary being built.
// A string seen before yields its existing position; a new string is appended
// to the dictionary and registered in the lookup index.
NameDictionary::Position NameDictionaryMaker::Add(std::string const & s)
{
  auto const known = m_index.find(s);
  if (known == m_index.end())
  {
    NameDictionary::Position const position = m_dictionary.Add(s);
    auto const indexEmplace = m_index.emplace(m_dictionary.Get(position), position);
    // The lookup above found nothing, so the insertion must have taken place.
    CHECK(indexEmplace.second, ());
    return position;
  }

  return known->second;
}
// Hands the built dictionary over to the caller.
// The lookup index is cleared first, and the maker's own dictionary is left in
// a moved-from state afterwards.
NameDictionary NameDictionaryMaker::Release()
{
  m_index.clear();

  NameDictionary released{std::move(m_dictionary)};
  return released;
}
} // namespace geocoder

View file

@ -0,0 +1,46 @@
#pragma once
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
namespace geocoder
{
// Append-only storage of strings addressed by 1-based positions.
// Position 0 (kUnspecifiedPosition) is reserved to mean "no string".
// The class is movable but not copyable.
class NameDictionary
{
public:
  // Values of Position type: kUnspecifiedPosition or >= 1.
  using Position = std::uint32_t;

  static constexpr Position kUnspecifiedPosition = 0;

  NameDictionary() = default;

  NameDictionary(NameDictionary const &) = delete;
  NameDictionary & operator=(NameDictionary const &) = delete;

  NameDictionary(NameDictionary &&) = default;
  NameDictionary & operator=(NameDictionary &&) = default;

  // Returns the string stored at |position|, which must be a value returned by Add.
  std::string const & Get(Position position) const;

  // Stores a copy of |s| and returns the position assigned to it.
  Position Add(std::string const & s);

private:
  std::vector<std::string> m_stock;
};
// Builds a NameDictionary, using a lookup index so that a string added more
// than once is stored only once. The class is not copyable.
class NameDictionaryMaker
{
public:
  NameDictionaryMaker() = default;

  NameDictionaryMaker & operator=(NameDictionaryMaker const &) = delete;
  NameDictionaryMaker(NameDictionaryMaker const &) = delete;

  // Returns the position of |s| in the dictionary, adding |s| first if it is absent.
  NameDictionary::Position Add(std::string const & s);

  // Clears the lookup index and hands the built dictionary over to the caller.
  NameDictionary Release();

private:
  NameDictionary m_dictionary;
  // Maps every stored string to its position in |m_dictionary|.
  std::unordered_map<std::string, NameDictionary::Position> m_index;
};
} // namespace geocoder