forked from organicmaps/organicmaps
[geocoder] Index all locales names
This commit is contained in:
parent
e16f74102f
commit
aaac8554bd
10 changed files with 248 additions and 77 deletions
|
@ -374,8 +374,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
|
|||
{
|
||||
m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) {
|
||||
auto const & bld = m_index.GetDoc(buildingDocId);
|
||||
auto const & realHN = bld.GetNormalizedName(Type::Building,
|
||||
m_hierarchy.GetNormalizedNameDictionary());
|
||||
auto const & multipleHN = bld.GetNormalizedMultipleNames(
|
||||
Type::Building, m_hierarchy.GetNormalizedNameDictionary());
|
||||
auto const & realHN = multipleHN.GetMainName();
|
||||
auto const & realHNUniStr = strings::MakeUniString(realHN);
|
||||
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
|
||||
false /* queryIsPrefix */))
|
||||
|
|
|
@ -40,7 +40,8 @@ void PrintResults(Hierarchy const & hierarchy, vector<Result> const & results)
|
|||
if (e->m_normalizedAddress[i] != NameDictionary::kUnspecifiedPosition)
|
||||
{
|
||||
auto type = static_cast<Type>(i);
|
||||
cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary);
|
||||
auto multipleNames = e->GetNormalizedMultipleNames(type, dictionary);
|
||||
cout << delimiter << ToString(type) << ": " << multipleNames.GetMainName();
|
||||
delimiter = ", ";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -76,9 +76,25 @@ UNIT_TEST(Geocoder_Hierarchy)
|
|||
});
|
||||
|
||||
TEST_EQUAL(entries.size(), 1, ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Country, dictionary).GetMainName(), "cuba",
|
||||
());
|
||||
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Region, dictionary).GetMainName(),
|
||||
"ciego de avila", ());
|
||||
TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Subregion, dictionary).GetMainName(),
|
||||
"florencia", ());
|
||||
}
|
||||
|
||||
// Checks that objects are found by their names from a non-default ("en") locale:
// both the locality (id 0x10) and the street inside it (id 0x11) carry Russian
// default names and English alternative names, while the query is in English.
UNIT_TEST(Geocoder_EnglishNames)
{
  string const kHierarchyJsonl = R"#(
10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}, "en": {"address": {"locality": "Moscow"}}}}}
11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица Новый Арбат"}}, "en": {"address": {"locality": "Moscow", "street": "New Arbat Avenue"}}}}}
)#";

  // The geocoder is built from a scoped temporary file holding the data above.
  ScopedFile const hierarchyFile("regions.jsonl", kHierarchyJsonl);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  // The street is expected first with the full certainty, the locality second.
  TestGeocoder(geocoder, "Moscow, New Arbat", {{Id{0x11}, 1.0}, {Id{0x10}, 0.6}});
}
|
||||
|
||||
UNIT_TEST(Geocoder_OnlyBuildings)
|
||||
|
|
|
@ -43,34 +43,44 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(
|
|||
MYTHROW(base::Json::Exception, ("Not a json object."));
|
||||
}
|
||||
|
||||
if (!DeserializeAddressFromJSON(root, normalizedNameDictionaryBuilder, stats))
|
||||
return false;
|
||||
|
||||
auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales",
|
||||
"default");
|
||||
auto const address = base::GetJSONObligatoryField(defaultLocale, "address");
|
||||
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
|
||||
if (m_name.empty())
|
||||
++stats.m_emptyNames;
|
||||
|
||||
if (m_type == Type::Count)
|
||||
{
|
||||
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
|
||||
++stats.m_emptyAddresses;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Hierarchy::Entry::DeserializeAddressFromJSON(
|
||||
json_t * const root, NameDictionaryBuilder & normalizedNameDictionaryBuilder,
|
||||
ParsingStats & stats)
|
||||
{
|
||||
auto const locales = base::GetJSONObligatoryFieldByPath(root, "properties", "locales");
|
||||
m_normalizedAddress= {};
|
||||
Tokens tokens;
|
||||
for (size_t i = 0; i < static_cast<size_t>(Type::Count); ++i)
|
||||
{
|
||||
Type const type = static_cast<Type>(i);
|
||||
string const & levelKey = ToString(type);
|
||||
auto const levelJson = base::GetJSONOptionalField(address, levelKey);
|
||||
if (!levelJson)
|
||||
continue;
|
||||
|
||||
if (base::JSONIsNull(levelJson))
|
||||
MultipleNames multipleNames;
|
||||
if (!FetchAddressFieldNames(locales, type, multipleNames, normalizedNameDictionaryBuilder,
|
||||
stats))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
string levelValue;
|
||||
FromJSON(levelJson, levelValue);
|
||||
if (levelValue.empty())
|
||||
continue;
|
||||
|
||||
search::NormalizeAndTokenizeAsUtf8(levelValue, tokens);
|
||||
if (tokens.empty())
|
||||
continue;
|
||||
|
||||
auto normalizedValue = strings::JoinStrings(tokens, " ");
|
||||
m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(normalizedValue);
|
||||
m_type = static_cast<Type>(i);
|
||||
if (!multipleNames.GetMainName().empty())
|
||||
{
|
||||
m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(move(multipleNames));
|
||||
m_type = static_cast<Type>(i);
|
||||
}
|
||||
}
|
||||
|
||||
auto const & subregion = m_normalizedAddress[static_cast<size_t>(Type::Subregion)];
|
||||
|
@ -88,22 +98,53 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(
|
|||
return false;
|
||||
}
|
||||
|
||||
FromJSONObjectOptionalField(defaultLocale, "name", m_name);
|
||||
if (m_name.empty())
|
||||
++stats.m_emptyNames;
|
||||
|
||||
if (m_type == Type::Count)
|
||||
{
|
||||
LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr));
|
||||
++stats.m_emptyAddresses;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string const & Hierarchy::Entry::GetNormalizedName(
|
||||
// static
|
||||
// Collects the names of the address level |type| from every locale of |locales|
// into |multipleNames|: the value found in the "default" locale becomes the main
// name, values found in all other locales are added as alternative names.
// Each value is normalized, tokenized and re-joined with single spaces before
// it is stored.
//
// A locale is skipped when it has no such level, when the value is an empty
// string, or when the value produces no tokens.
// Returns false as soon as some locale holds an explicit JSON null for the
// level, true otherwise.
//
// |normalizedNameDictionaryBuilder| and |stats| are accepted but not used here.
bool Hierarchy::Entry::FetchAddressFieldNames(
    json_t * const locales, Type type, MultipleNames & multipleNames,
    NameDictionaryBuilder & normalizedNameDictionaryBuilder, ParsingStats & stats)
{
  // Name of the locale whose value is treated as the main name. It is a
  // constant, so it must not be a mutable function-local static.
  static std::string const kDefaultLocale = "default";

  char const * localeName = nullptr;
  json_t * localisedNames = nullptr;
  string const & levelKey = ToString(type);
  Tokens tokens;
  json_object_foreach(locales, localeName, localisedNames)
  {
    auto const address = base::GetJSONObligatoryField(localisedNames, "address");
    auto const levelJson = base::GetJSONOptionalField(address, levelKey);
    if (!levelJson)
      continue;

    // An explicit null for the level makes the whole fetch fail.
    if (base::JSONIsNull(levelJson))
      return false;

    string levelValue;
    FromJSON(levelJson, levelValue);
    if (levelValue.empty())
      continue;

    search::NormalizeAndTokenizeAsUtf8(levelValue, tokens);
    if (tokens.empty())
      continue;

    auto const normalizedValue = strings::JoinStrings(tokens, " ");
    if (localeName == kDefaultLocale)
      multipleNames.SetMainName(normalizedValue);
    else
      multipleNames.AddAltName(normalizedValue);
  }

  return true;
}
|
||||
|
||||
MultipleNames const & Hierarchy::Entry::GetNormalizedMultipleNames(
|
||||
Type type, NameDictionary const & normalizedNameDictionary) const
|
||||
{
|
||||
return normalizedNameDictionary.Get(m_normalizedAddress[static_cast<size_t>(type)]);
|
||||
auto const & addressField = m_normalizedAddress[static_cast<size_t>(type)];
|
||||
return normalizedNameDictionary.Get(addressField);
|
||||
}
|
||||
|
||||
// Hierarchy ---------------------------------------------------------------------------------------
|
||||
|
@ -151,13 +192,16 @@ bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry cons
|
|||
|
||||
if (toEntry.m_normalizedAddress[i] == NameDictionary::kUnspecifiedPosition)
|
||||
return false;
|
||||
|
||||
auto const pos1 = entry.m_normalizedAddress[i];
|
||||
auto const pos2 = toEntry.m_normalizedAddress[i];
|
||||
if (pos1 != pos2 &&
|
||||
m_normalizedNameDictionary.Get(pos1) != m_normalizedNameDictionary.Get(pos2))
|
||||
{
|
||||
if (pos1 == pos2)
|
||||
continue;
|
||||
|
||||
auto const & name1 = m_normalizedNameDictionary.Get(pos1).GetMainName();
|
||||
auto const & name2 = m_normalizedNameDictionary.Get(pos2).GetMainName();
|
||||
if (name1 != name2)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -65,9 +65,16 @@ public:
|
|||
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
|
||||
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
|
||||
ParsingStats & stats);
|
||||
bool DeserializeAddressFromJSON(json_t * const root,
|
||||
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
|
||||
ParsingStats & stats);
|
||||
static bool FetchAddressFieldNames(json_t * const locales, Type type,
|
||||
MultipleNames & multipleNames,
|
||||
NameDictionaryBuilder & normalizedNameDictionaryBuilder,
|
||||
ParsingStats & stats);
|
||||
|
||||
std::string const & GetNormalizedName(Type type,
|
||||
NameDictionary const & normalizedNameDictionary) const;
|
||||
MultipleNames const & GetNormalizedMultipleNames(
|
||||
Type type, NameDictionary const & normalizedNameDictionary) const;
|
||||
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
|
||||
|
||||
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
|
||||
|
|
|
@ -85,8 +85,8 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount)
|
|||
{
|
||||
if (auto & position = entry.m_normalizedAddress[i])
|
||||
{
|
||||
auto const & name = taskNameDictionary.Get(position);
|
||||
position = nameDictionaryBuilder.Add(name);
|
||||
auto const & multipleNames = taskNameDictionary.Get(position);
|
||||
position = nameDictionaryBuilder.Add(MultipleNames{multipleNames});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,9 +75,11 @@ void Index::AddEntries()
|
|||
}
|
||||
else
|
||||
{
|
||||
auto const & name = doc.GetNormalizedName(doc.m_type, dictionary);
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
for (auto const & name : doc.GetNormalizedMultipleNames(doc.m_type, dictionary))
|
||||
{
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
InsertToIndex(tokens, docId);
|
||||
}
|
||||
}
|
||||
|
||||
++numIndexed;
|
||||
|
@ -98,26 +100,28 @@ void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
|
|||
};
|
||||
|
||||
auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary();
|
||||
auto const & name = doc.GetNormalizedName(Type::Street, dictionary);
|
||||
Tokens tokens;
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
|
||||
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
|
||||
for (auto const & name : doc.GetNormalizedMultipleNames(Type::Street, dictionary))
|
||||
{
|
||||
if (tokens.size() > 1)
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
return;
|
||||
}
|
||||
search::NormalizeAndTokenizeAsUtf8(name, tokens);
|
||||
|
||||
m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId);
|
||||
if (all_of(begin(tokens), end(tokens), isStreetSynonym))
|
||||
{
|
||||
if (tokens.size() > 1)
|
||||
InsertToIndex(tokens, docId);
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
if (!isStreetSynonym(tokens[i]))
|
||||
continue;
|
||||
auto addr = tokens;
|
||||
addr.erase(addr.begin() + i);
|
||||
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
|
||||
InsertToIndex(tokens, docId);
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
if (!isStreetSynonym(tokens[i]))
|
||||
continue;
|
||||
auto addr = tokens;
|
||||
addr.erase(addr.begin() + i);
|
||||
InsertToIndex(addr, docId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -157,9 +161,12 @@ void Index::AddHouses(unsigned int loadThreadsCount)
|
|||
else
|
||||
continue;
|
||||
|
||||
auto const & relationName = dictionary.Get(relation);
|
||||
auto const & relationMultipleNames = dictionary.Get(relation);
|
||||
auto const & relationName = relationMultipleNames.GetMainName();
|
||||
Tokens relationNameTokens;
|
||||
search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens);
|
||||
CHECK(!relationNameTokens.empty(), ());
|
||||
|
||||
bool indexed = false;
|
||||
ForEachDocId(relationNameTokens, [&](DocId const & candidate) {
|
||||
auto const & candidateDoc = GetDoc(candidate);
|
||||
|
@ -188,4 +195,11 @@ void Index::AddHouses(unsigned int loadThreadsCount)
|
|||
if (numIndexed % kLogBatch != 0)
|
||||
LOG(LINFO, ("Indexed", numIndexed, "houses"));
|
||||
}
|
||||
|
||||
// Registers |docId| under the index key built from |tokens|, unless the same
// doc id is already stored under that key.
void Index::InsertToIndex(Tokens const & tokens, DocId docId)
{
  auto & docIds = m_docIdsByTokens[MakeIndexKey(tokens)];
  for (auto const & storedId : docIds)
  {
    // Already indexed under this key: nothing to do.
    if (storedId == docId)
      return;
  }
  docIds.emplace_back(docId);
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -55,6 +55,8 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
void InsertToIndex(Tokens const & tokens, DocId docId);
|
||||
|
||||
// Converts |tokens| to a single UTF-8 string that can be used
|
||||
// as a key in the |m_docIdsByTokens| map.
|
||||
static std::string MakeIndexKey(Tokens const & tokens);
|
||||
|
|
|
@ -2,34 +2,91 @@
|
|||
|
||||
#include "base/assert.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// MultipleNames -----------------------------------------------------------------------------------
|
||||
// Creates a name set whose only element is |mainName|; the main name always
// occupies the first slot of |m_names|.
MultipleNames::MultipleNames(std::string const & mainName)
  : m_names(1, mainName)
{
}
|
||||
|
||||
std::string const & MultipleNames::GetMainName() const noexcept
|
||||
{
|
||||
return m_names[0];
|
||||
}
|
||||
|
||||
std::vector<std::string> const & MultipleNames::GetNames() const noexcept
|
||||
{
|
||||
return m_names;
|
||||
}
|
||||
|
||||
// Iterator to the first stored name (the main name).
MultipleNames::const_iterator MultipleNames::begin() const noexcept
{
  return m_names.cbegin();
}
|
||||
|
||||
// Past-the-end iterator of the stored names.
MultipleNames::const_iterator MultipleNames::end() const noexcept
{
  return m_names.cend();
}
|
||||
|
||||
void MultipleNames::SetMainName(std::string const & name)
|
||||
{
|
||||
m_names[0] = name;
|
||||
}
|
||||
|
||||
void MultipleNames::AddAltName(std::string const & name)
|
||||
{
|
||||
m_names.emplace_back(std::move(name));
|
||||
// Sort for operator==.
|
||||
ASSERT_GREATER_OR_EQUAL(m_names.size(), 2, ());
|
||||
std::inplace_merge(std::next(m_names.begin()), std::prev(m_names.end()), m_names.end());
|
||||
}
|
||||
|
||||
// Two name sets are equal when their stored name sequences are equal
// element by element.
bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept
{
  return lhs.GetNames() == rhs.GetNames();
}
|
||||
|
||||
// Negation of operator== for name sets.
bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept
{
  return lhs.m_names != rhs.m_names;
}
|
||||
|
||||
// NameDictionary ----------------------------------------------------------------------------------
|
||||
std::string const & NameDictionary::Get(Position position) const
|
||||
// Returns the names stored at |position|. Positions are 1-based: the value
// returned by Add() is the index in |m_stock| plus one, so |position| must lie
// in [1, m_stock.size()].
MultipleNames const & NameDictionary::Get(Position position) const
{
  CHECK_GREATER(position, 0, ());
  CHECK_LESS_OR_EQUAL(position, m_stock.size(), ());

  auto const stockIndex = position - 1;
  return m_stock[stockIndex];
}
|
||||
|
||||
NameDictionary::Position NameDictionary::Add(std::string const & s)
|
||||
NameDictionary::Position NameDictionary::Add(MultipleNames && names)
|
||||
{
|
||||
CHECK(!names.GetMainName().empty(), ());
|
||||
CHECK_LESS(m_stock.size(), std::numeric_limits<uint32_t>::max(), ());
|
||||
m_stock.push_back(s);
|
||||
m_stock.push_back(std::move(names));
|
||||
return m_stock.size(); // index + 1
|
||||
}
|
||||
|
||||
// NameDictionaryBuilder -----------------------------------------------------------------------------
|
||||
NameDictionary::Position NameDictionaryBuilder::Add(std::string const & s)
|
||||
// NameDictionaryBuilder::Hash ---------------------------------------------------------------------
|
||||
size_t NameDictionaryBuilder::Hash::operator()(MultipleNames const & names) const noexcept
|
||||
{
|
||||
auto indexItem = m_index.find(s);
|
||||
return std::hash<std::string>{}(names.GetMainName());
|
||||
}
|
||||
|
||||
// NameDictionaryBuilder -----------------------------------------------------------------------------
|
||||
NameDictionary::Position NameDictionaryBuilder::Add(MultipleNames && names)
|
||||
{
|
||||
auto indexItem = m_index.find(names);
|
||||
if (indexItem != m_index.end())
|
||||
return indexItem->second;
|
||||
|
||||
auto p = m_dictionary.Add(s);
|
||||
auto p = m_dictionary.Add(std::move(names));
|
||||
auto indexEmplace = m_index.emplace(m_dictionary.Get(p), p);
|
||||
CHECK(indexEmplace.second, ());
|
||||
return p;
|
||||
|
|
|
@ -7,6 +7,30 @@
|
|||
|
||||
namespace geocoder
|
||||
{
|
||||
// A main name together with the alternative names of the same object.
// The main name is always the first stored element; the alternative names
// follow it and are kept sorted so that equality does not depend on the order
// in which they were added.
class MultipleNames
{
public:
  using const_iterator = std::vector<std::string>::const_iterator;

  explicit MultipleNames(std::string const & mainName = {});

  // Replaces the main name.
  void SetMainName(std::string const & name);
  // Adds an alternative name and merges it into the sorted alternative names.
  // Cost: linear in the number of stored names in the best case,
  // O(N*log(N)) in the worst case.
  void AddAltName(std::string const & name);

  std::string const & GetMainName() const noexcept;
  // All names: the main one first, then the alternative ones.
  std::vector<std::string> const & GetNames() const noexcept;

  // Iteration over all names, starting from the main name.
  const_iterator begin() const noexcept;
  const_iterator end() const noexcept;

  friend bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept;
  friend bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept;

private:
  std::vector<std::string> m_names;
};
|
||||
|
||||
class NameDictionary
|
||||
{
|
||||
public:
|
||||
|
@ -22,11 +46,11 @@ public:
|
|||
NameDictionary(NameDictionary const &) = delete;
|
||||
NameDictionary & operator=(NameDictionary const &) = delete;
|
||||
|
||||
std::string const & Get(Position position) const;
|
||||
Position Add(std::string const & s);
|
||||
MultipleNames const & Get(Position position) const;
|
||||
Position Add(MultipleNames && s);
|
||||
|
||||
private:
|
||||
std::vector<std::string> m_stock;
|
||||
std::vector<MultipleNames> m_stock;
|
||||
};
|
||||
|
||||
class NameDictionaryBuilder
|
||||
|
@ -36,11 +60,16 @@ public:
|
|||
NameDictionaryBuilder(NameDictionaryBuilder const &) = delete;
|
||||
NameDictionaryBuilder & operator=(NameDictionaryBuilder const &) = delete;
|
||||
|
||||
NameDictionary::Position Add(std::string const & s);
|
||||
NameDictionary::Position Add(MultipleNames && s);
|
||||
NameDictionary Release();
|
||||
|
||||
private:
|
||||
struct Hash
|
||||
{
|
||||
size_t operator()(MultipleNames const & names) const noexcept;
|
||||
};
|
||||
|
||||
NameDictionary m_dictionary;
|
||||
std::unordered_map<std::string, NameDictionary::Position> m_index;
|
||||
std::unordered_map<MultipleNames, NameDictionary::Position, Hash> m_index;
|
||||
};
|
||||
} // namespace geocoder
|
||||
|
|
Loading…
Add table
Reference in a new issue