[geocoder] Index all locales names

This commit is contained in:
Anatoly Serdtcev 2019-08-01 19:07:11 +03:00 committed by LaGrunge
parent e16f74102f
commit aaac8554bd
10 changed files with 248 additions and 77 deletions

View file

@ -374,8 +374,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
{
m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) {
auto const & bld = m_index.GetDoc(buildingDocId);
auto const & realHN = bld.GetNormalizedName(Type::Building,
m_hierarchy.GetNormalizedNameDictionary());
auto const & multipleHN = bld.GetNormalizedMultipleNames(
Type::Building, m_hierarchy.GetNormalizedNameDictionary());
auto const & realHN = multipleHN.GetMainName();
auto const & realHNUniStr = strings::MakeUniString(realHN);
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
false /* queryIsPrefix */))

View file

@ -40,7 +40,8 @@ void PrintResults(Hierarchy const & hierarchy, vector<Result> const & results)
if (e->m_normalizedAddress[i] != NameDictionary::kUnspecifiedPosition)
{
auto type = static_cast<Type>(i);
cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary);
auto multipleNames = e->GetNormalizedMultipleNames(type, dictionary);
cout << delimiter << ToString(type) << ": " << multipleNames.GetMainName();
delimiter = ", ";
}
}

View file

@ -76,9 +76,25 @@ UNIT_TEST(Geocoder_Hierarchy)
});
TEST_EQUAL(entries.size(), 1, ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ());
TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ());
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Country, dictionary).GetMainName(), "cuba",
());
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Region, dictionary).GetMainName(),
"ciego de avila", ());
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Subregion, dictionary).GetMainName(),
"florencia", ());
}
// Checks that names given in a non-default locale are searchable: each fixture
// entry has a Russian "default" address and an English "en" address, while the
// query below is written in English only.
UNIT_TEST(Geocoder_EnglishNames)
{
string const kData = R"#(
10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}, "en": {"address": {"locality": "Moscow"}}}}}
11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица Новый Арбат"}}, "en": {"address": {"locality": "Moscow", "street": "New Arbat Avenue"}}}}}
)#";
ScopedFile const regionsJsonFile("regions.jsonl", kData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
// Expected results: the street entry (id 0x11) paired with 1.0 and the
// locality-only entry (id 0x10) paired with 0.6 (presumably match
// certainties - confirm against TestGeocoder).
TestGeocoder(geocoder, "Moscow, New Arbat", {{Id{0x11}, 1.0}, {Id{0x10}, 0.6}});
}
UNIT_TEST(Geocoder_OnlyBuildings)

View file

@ -43,34 +43,44 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(
MYTHROW(base::Json::Exception, ("Not a json object."));
}
if (!DeserializeAddressFromJSON(root, normalizedNameDictionaryBuilder, stats))
return false;
auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales",
"default");
auto const address = base::GetJSONObligatoryField(defaultLocale, "address");
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
if (m_name.empty())
++stats.m_emptyNames;
if (m_type == Type::Count)
{
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
++stats.m_emptyAddresses;
}
return true;
}
bool Hierarchy::Entry::DeserializeAddressFromJSON(
json_t * const root, NameDictionaryBuilder & normalizedNameDictionaryBuilder,
ParsingStats & stats)
{
auto const locales = base::GetJSONObligatoryFieldByPath(root, "properties", "locales");
m_normalizedAddress= {};
Tokens tokens;
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
{
Type const type = static_cast<Type>(i);
string const & levelKey = ToString(type);
auto const levelJson = base::GetJSONOptionalField(address, levelKey);
if (!levelJson)
continue;
if (base::JSONIsNull(levelJson))
MultipleNames multipleNames;
if (!FetchAddressFieldNames(locales, type, multipleNames, normalizedNameDictionaryBuilder,
stats))
{
return false;
}
string levelValue;
FromJSON(levelJson, levelValue);
if (levelValue.empty())
continue;
search::NormalizeAndTokenizeAsUtf8(levelValue, tokens);
if (tokens.empty())
continue;
auto normalizedValue = strings::JoinStrings(tokens, " ");
m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(normalizedValue);
m_type = static_cast<Type>(i);
if (!multipleNames.GetMainName().empty())
{
m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(move(multipleNames));
m_type = static_cast<Type>(i);
}
}
auto const & subregion = m_normalizedAddress[static_cast<size_t>(Type::Subregion)];
@ -88,22 +98,53 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(
return false;
}
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
if (m_name.empty())
++stats.m_emptyNames;
if (m_type == Type::Count)
{
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
++stats.m_emptyAddresses;
}
return true;
}
std::string const & Hierarchy::Entry::GetNormalizedName(
// static
// Gathers the normalized names of the |type| address level from every locale
// object in |locales| into |multipleNames|. The value found under the "default"
// locale becomes the main name; values from all other locales are added as
// alternative names. Locales that lack the level, or whose value is empty or
// yields no tokens, are skipped.
//
// Returns false as soon as some locale holds a JSON null for the level, true
// otherwise. |normalizedNameDictionaryBuilder| and |stats| are not used here.
bool Hierarchy::Entry::FetchAddressFieldNames(
    json_t * const locales, Type type, MultipleNames & multipleNames,
    NameDictionaryBuilder & normalizedNameDictionaryBuilder, ParsingStats & stats)
{
  static std::string defaultLocale = "default";

  string const & fieldKey = ToString(type);
  Tokens nameTokens;

  // json_object_foreach() assigns these two on every iteration.
  char const * localeKey = nullptr;
  json_t * localeJson = nullptr;
  json_object_foreach(locales, localeKey, localeJson)
  {
    auto const addressJson = base::GetJSONObligatoryField(localeJson, "address");
    auto const fieldJson = base::GetJSONOptionalField(addressJson, fieldKey);
    if (!fieldJson)
      continue;

    // An explicit null is treated as a malformed entry rather than as a missing level.
    if (base::JSONIsNull(fieldJson))
      return false;

    string rawName;
    FromJSON(fieldJson, rawName);
    if (rawName.empty())
      continue;

    search::NormalizeAndTokenizeAsUtf8(rawName, nameTokens);
    if (nameTokens.empty())
      continue;

    auto const normalizedName = strings::JoinStrings(nameTokens, " ");
    if (localeKey != defaultLocale)
      multipleNames.AddAltName(normalizedName);
    else
      multipleNames.SetMainName(normalizedName);
  }

  return true;
}
MultipleNames const & Hierarchy::Entry::GetNormalizedMultipleNames(
Type type, NameDictionary const & normalizedNameDictionary) const
{
return normalizedNameDictionary.Get(m_normalizedAddress[static_cast<size_t>(type)]);
auto const & addressField = m_normalizedAddress[static_cast<size_t>(type)];
return normalizedNameDictionary.Get(addressField);
}
// Hierarchy ---------------------------------------------------------------------------------------
@ -151,13 +192,16 @@ bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry cons
if (toEntry.m_normalizedAddress[i] == NameDictionary::kUnspecifiedPosition)
return false;
auto const pos1 = entry.m_normalizedAddress[i];
auto const pos2 = toEntry.m_normalizedAddress[i];
if (pos1 != pos2 &&
m_normalizedNameDictionary.Get(pos1) != m_normalizedNameDictionary.Get(pos2))
{
if (pos1 == pos2)
continue;
auto const & name1 = m_normalizedNameDictionary.Get(pos1).GetMainName();
auto const & name2 = m_normalizedNameDictionary.Get(pos2).GetMainName();
if (name1 != name2)
return false;
}
}
return true;
}

View file

@ -65,9 +65,16 @@ public:
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
ParsingStats & stats);
bool DeserializeAddressFromJSON(json_t * const root,
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
ParsingStats & stats);
static bool FetchAddressFieldNames(json_t * const locales, Type type,
MultipleNames & multipleNames,
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
ParsingStats & stats);
std::string const & GetNormalizedName(Type type,
NameDictionary const & normalizedNameDictionary) const;
MultipleNames const & GetNormalizedMultipleNames(
Type type, NameDictionary const & normalizedNameDictionary) const;
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);

View file

@ -85,8 +85,8 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
{
if (auto & position = entry.m_normalizedAddress[i])
{
auto const & name = taskNameDictionary.Get(position);
position = nameDictionaryBuilder.Add(name);
auto const & multipleNames = taskNameDictionary.Get(position);
position = nameDictionaryBuilder.Add(MultipleNames{multipleNames});
}
}
}

View file

@ -75,9 +75,11 @@ void Index::AddEntries()
}
else
{
auto const & name = doc.GetNormalizedName(doc.m_type, dictionary);
search::NormalizeAndTokenizeAsUtf8(name, tokens);
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
for (auto const & name : doc.GetNormalizedMultipleNames(doc.m_type, dictionary))
{
search::NormalizeAndTokenizeAsUtf8(name, tokens);
InsertToIndex(tokens, docId);
}
}
++numIndexed;
@ -98,26 +100,28 @@ void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
};
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
auto const & name = doc.GetNormalizedName(Type::Street, dictionary);
Tokens tokens;
search::NormalizeAndTokenizeAsUtf8(name, tokens);
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
for (auto const & name : doc.GetNormalizedMultipleNames(Type::Street, dictionary))
{
if (tokens.size() > 1)
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
return;
}
search::NormalizeAndTokenizeAsUtf8(name, tokens);
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
{
if (tokens.size() > 1)
InsertToIndex(tokens, docId);
return;
}
for (size_t i = 0; i < tokens.size(); ++i)
{
if (!isStreetSynonym(tokens[i]))
continue;
auto addr = tokens;
addr.erase(addr.begin() + i);
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
InsertToIndex(tokens, docId);
for (size_t i = 0; i < tokens.size(); ++i)
{
if (!isStreetSynonym(tokens[i]))
continue;
auto addr = tokens;
addr.erase(addr.begin() + i);
InsertToIndex(addr, docId);
}
}
}
@ -157,9 +161,12 @@ void Index::AddHouses(unsigned int loadThreadsCount)
else
continue;
auto const & relationName = dictionary.Get(relation);
auto const & relationMultipleNames = dictionary.Get(relation);
auto const & relationName = relationMultipleNames.GetMainName();
Tokens relationNameTokens;
search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens);
CHECK(!relationNameTokens.empty(), ());
bool indexed = false;
ForEachDocId(relationNameTokens, [&](DocId const & candidate) {
auto const & candidateDoc = GetDoc(candidate);
@ -188,4 +195,11 @@ void Index::AddHouses(unsigned int loadThreadsCount)
if (numIndexed % kLogBatch != 0)
LOG(LINFO, ("Indexed", numIndexed, "houses"));
}
// Registers |docId| under the index key built from |tokens|. A doc id that is
// already listed for that key is not added again, so each key holds a given
// doc id at most once.
void Index::InsertToIndex(Tokens const & tokens, DocId docId)
{
  auto & docIds = m_docIdsByTokens[MakeIndexKey(tokens)];

  bool const alreadyIndexed = std::find(docIds.begin(), docIds.end(), docId) != docIds.end();
  if (alreadyIndexed)
    return;

  docIds.emplace_back(docId);
}
} // namespace geocoder

View file

@ -55,6 +55,8 @@ public:
}
private:
void InsertToIndex(Tokens const & tokens, DocId docId);
// Converts |tokens| to a single UTF-8 string that can be used
// as a key in the |m_docIdsByTokens| map.
static std::string MakeIndexKey(Tokens const & tokens);

View file

@ -2,34 +2,91 @@
#include "base/assert.hpp"
#include <algorithm>
#include <functional>
#include <limits>
#include <utility>
namespace geocoder
{
// MultipleNames -----------------------------------------------------------------------------------
// Creates a name set that holds exactly one name, |mainName|, at index 0.
MultipleNames::MultipleNames(std::string const & mainName)
  : m_names(1, mainName)
{
}
// Returns the main name, i.e. the first element of the name list.
std::string const & MultipleNames::GetMainName() const noexcept
{
  return m_names.front();
}
// Returns the whole name list: the main name at index 0 followed by the
// alternative names.
std::vector<std::string> const & MultipleNames::GetNames() const noexcept
{
return m_names;
}
// Start of the name list (the main name), which makes the class usable in a
// range-based for loop.
MultipleNames::const_iterator MultipleNames::begin() const noexcept
{
  return m_names.cbegin();
}
// Past-the-end position of the name list; the counterpart of begin().
MultipleNames::const_iterator MultipleNames::end() const noexcept
{
  return m_names.cend();
}
// Overwrites the main name (the first element of the name list) with |name|.
// Alternative names are left as they are.
void MultipleNames::SetMainName(std::string const & name)
{
  m_names.front() = name;
}
// Adds |name| as an alternative name. Alternative names are all the elements
// after the main name at index 0; they are kept sorted so that operator==,
// which compares the name lists element by element, does not depend on the
// order in which alternative names were added.
void MultipleNames::AddAltName(std::string const & name)
{
  // |name| is a const reference, so it is copied into the list.
  m_names.emplace_back(name);
  // Sort for operator==.
  ASSERT_GREATER_OR_EQUAL(m_names.size(), 2, ());
  // The previously added alternative names are already sorted, so merging the
  // single new element into that range restores the order.
  std::inplace_merge(std::next(m_names.begin()), std::prev(m_names.end()), m_names.end());
}
// Two name sets are equal when their name lists match element by element.
bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept
{
  return lhs.GetNames() == rhs.GetNames();
}
// Negation of operator==: true when the two name lists differ.
bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept
{
  return lhs.m_names != rhs.m_names;
}
// NameDictionary ----------------------------------------------------------------------------------
std::string const & NameDictionary::Get(Position position) const
// Returns the names stored at |position|. Positions are 1-based: the valid
// range is [1, m_stock.size()], and the entry lives at index |position| - 1.
MultipleNames const & NameDictionary::Get(Position position) const
{
  CHECK_GREATER(position, 0, ());
  CHECK_LESS_OR_EQUAL(position, m_stock.size(), ());

  auto const stockIndex = position - 1;
  return m_stock[stockIndex];
}
NameDictionary::Position NameDictionary::Add(std::string const & s)
NameDictionary::Position NameDictionary::Add(MultipleNames && names)
{
CHECK(!names.GetMainName().empty(), ());
CHECK_LESS(m_stock.size(), std::numeric_limits<uint32_t>::max(), ());
m_stock.push_back(s);
m_stock.push_back(std::move(names));
return m_stock.size(); // index + 1
}
// NameDictionaryBuilder -----------------------------------------------------------------------------
NameDictionary::Position NameDictionaryBuilder::Add(std::string const & s)
// NameDictionaryBuilder::Hash ---------------------------------------------------------------------
size_t NameDictionaryBuilder::Hash::operator()(MultipleNames const & names) const noexcept
{
auto indexItem = m_index.find(s);
return std::hash<std::string>{}(names.GetMainName());
}
// NameDictionaryBuilder -----------------------------------------------------------------------------
NameDictionary::Position NameDictionaryBuilder::Add(MultipleNames && names)
{
auto indexItem = m_index.find(names);
if (indexItem != m_index.end())
return indexItem->second;
auto p = m_dictionary.Add(s);
auto p = m_dictionary.Add(std::move(names));
auto indexEmplace = m_index.emplace(m_dictionary.Get(p), p);
CHECK(indexEmplace.second, ());
return p;

View file

@ -7,6 +7,30 @@
namespace geocoder
{
// A set of names for a single address field: one main name plus any number of
// alternative names. All names are kept in one vector with the main name at
// index 0.
class MultipleNames
{
public:
using const_iterator = std::vector<std::string>::const_iterator;
// Creates a set whose only name is |mainName| (an empty string by default).
explicit MultipleNames(std::string const & mainName = {});
// Returns the name stored at index 0.
std::string const & GetMainName() const noexcept;
// Returns all names: the main name followed by the alternative names.
std::vector<std::string> const & GetNames() const noexcept;
// Iteration visits the main name first, then the alternative names.
const_iterator begin() const noexcept;
const_iterator end() const noexcept;
// Overwrites the main name; alternative names are not touched.
void SetMainName(std::string const & name);
// Adds |name| to the alternative names and keeps them sorted.
// Complexity: O(N-1) - a best case, O(N*log(N)) - a worst case.
void AddAltName(std::string const & name);
// Equality compares the name vectors element by element.
friend bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept;
friend bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept;
private:
// m_names[0] is the main name; the remaining elements are the alternative
// names, which AddAltName() keeps sorted.
std::vector<std::string> m_names;
};
class NameDictionary
{
public:
@ -22,11 +46,11 @@ public:
NameDictionary(NameDictionary const &) = delete;
NameDictionary & operator=(NameDictionary const &) = delete;
std::string const & Get(Position position) const;
Position Add(std::string const & s);
MultipleNames const & Get(Position position) const;
Position Add(MultipleNames && s);
private:
std::vector<std::string> m_stock;
std::vector<MultipleNames> m_stock;
};
class NameDictionaryBuilder
@ -36,11 +60,16 @@ public:
NameDictionaryBuilder(NameDictionaryBuilder const &) = delete;
NameDictionaryBuilder & operator=(NameDictionaryBuilder const &) = delete;
NameDictionary::Position Add(std::string const & s);
NameDictionary::Position Add(MultipleNames && s);
NameDictionary Release();
private:
struct Hash
{
size_t operator()(MultipleNames const & names) const noexcept;
};
NameDictionary m_dictionary;
std::unordered_map<std::string, NameDictionary::Position> m_index;
std::unordered_map<MultipleNames, NameDictionary::Position, Hash> m_index;
};
} // namespace geocoder