forked from organicmaps/organicmaps
[geocoder] UTF-8 everywhere.
This commit is contained in:
parent
47767b1f18
commit
335c44894c
7 changed files with 33 additions and 21 deletions
|
@ -91,7 +91,7 @@ bool HasParent(vector<geocoder::Geocoder::Layer> const & layers,
|
|||
|
||||
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
|
||||
{
|
||||
return strings::JoinStrings(tokens, strings::MakeUniString(""));
|
||||
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
@ -100,7 +100,7 @@ namespace geocoder
|
|||
// Geocoder::Context -------------------------------------------------------------------------------
|
||||
Geocoder::Context::Context(string const & query) : m_beam(kMaxResults)
|
||||
{
|
||||
search::NormalizeAndTokenizeString(query, m_tokens);
|
||||
search::NormalizeAndTokenizeAsUtf8(query, m_tokens);
|
||||
m_tokenTypes.assign(m_tokens.size(), Type::Count);
|
||||
m_numUsedTokens = 0;
|
||||
}
|
||||
|
@ -115,7 +115,7 @@ size_t Geocoder::Context::GetNumUsedTokens() const
|
|||
return m_numUsedTokens;
|
||||
}
|
||||
|
||||
strings::UniString const & Geocoder::Context::GetToken(size_t id) const
|
||||
string const & Geocoder::Context::GetToken(size_t id) const
|
||||
{
|
||||
CHECK_LESS(id, m_tokens.size(), ());
|
||||
return m_tokens[id];
|
||||
|
@ -210,7 +210,7 @@ void Geocoder::Go(Context & ctx, Type type) const
|
|||
if (type == Type::Count)
|
||||
return;
|
||||
|
||||
vector<strings::UniString> subquery;
|
||||
Tokens subquery;
|
||||
for (size_t i = 0; i < ctx.GetNumTokens(); ++i)
|
||||
{
|
||||
subquery.clear();
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include "base/geo_object_id.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
@ -55,7 +56,7 @@ public:
|
|||
size_t GetNumTokens() const;
|
||||
size_t GetNumUsedTokens() const;
|
||||
|
||||
strings::UniString const & GetToken(size_t id) const;
|
||||
std::string const & GetToken(size_t id) const;
|
||||
|
||||
void MarkToken(size_t id, Type type);
|
||||
|
||||
|
@ -74,8 +75,7 @@ public:
|
|||
std::vector<Layer> const & GetLayers() const;
|
||||
|
||||
private:
|
||||
// todo(@m) std::string?
|
||||
std::vector<strings::UniString> m_tokens;
|
||||
Tokens m_tokens;
|
||||
std::vector<Type> m_tokenTypes;
|
||||
|
||||
size_t m_numUsedTokens = 0;
|
||||
|
|
|
@ -30,7 +30,7 @@ string const kRegionsData = R"#(
|
|||
geocoder::Tokens Split(string const & s)
|
||||
{
|
||||
geocoder::Tokens result;
|
||||
search::NormalizeAndTokenizeString(s, result);
|
||||
search::NormalizeAndTokenizeAsUtf8(s, result);
|
||||
return result;
|
||||
}
|
||||
} // namespace
|
||||
|
@ -71,7 +71,7 @@ UNIT_TEST(Geocoder_Hierarchy)
|
|||
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
|
||||
Geocoder geocoder(regionsJsonFile.GetFullPath());
|
||||
|
||||
auto entries = geocoder.GetHierarchy().GetEntries({strings::MakeUniString("florencia")});
|
||||
auto entries = geocoder.GetHierarchy().GetEntries({("florencia")});
|
||||
|
||||
TEST(entries, ());
|
||||
TEST_EQUAL(entries->size(), 1, ());
|
||||
|
|
|
@ -19,6 +19,11 @@ namespace
|
|||
{
|
||||
// Information will be logged for every |kLogBatch| entries.
|
||||
size_t const kLogBatch = 100000;
|
||||
|
||||
// Builds the string key used with |m_entriesByTokens|: the tokens are
// joined by strings::JoinStrings with a single space as the delimiter.
string MakeIndexKey(geocoder::Tokens const & tokens)
{
  string key = strings::JoinStrings(tokens, " ");
  return key;
}
|
||||
} // namespace
|
||||
|
||||
namespace geocoder
|
||||
|
@ -67,7 +72,8 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr));
|
||||
hasDuplicateAddress = true;
|
||||
}
|
||||
search::NormalizeAndTokenizeString(levelValue, m_address[i]);
|
||||
|
||||
search::NormalizeAndTokenizeAsUtf8(levelValue, m_address[i]);
|
||||
|
||||
if (!m_address[i].empty())
|
||||
m_type = static_cast<Type>(i);
|
||||
|
@ -75,7 +81,7 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
|
||||
m_nameTokens.clear();
|
||||
FromJSONObjectOptionalField(properties, "name", m_name);
|
||||
search::NormalizeAndTokenizeString(m_name, m_nameTokens);
|
||||
search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens);
|
||||
|
||||
if (m_name.empty())
|
||||
++stats.m_emptyNames;
|
||||
|
@ -140,6 +146,7 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
|||
++stats.m_numLoaded;
|
||||
if (stats.m_numLoaded % kLogBatch == 0)
|
||||
LOG(LINFO, ("Read", stats.m_numLoaded, "entries"));
|
||||
|
||||
m_entriesStorage.emplace_back(move(entry));
|
||||
}
|
||||
|
||||
|
@ -160,10 +167,9 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
|||
LOG(LINFO, ("(End of stats.)"));
|
||||
}
|
||||
|
||||
vector<Hierarchy::Entry *> const * const Hierarchy::GetEntries(
|
||||
vector<strings::UniString> const & tokens) const
|
||||
vector<Hierarchy::Entry *> const * const Hierarchy::GetEntries(Tokens const & tokens) const
|
||||
{
|
||||
auto it = m_entriesByTokens.find(tokens);
|
||||
auto it = m_entriesByTokens.find(MakeIndexKey(tokens));
|
||||
if (it == m_entriesByTokens.end())
|
||||
return {};
|
||||
|
||||
|
@ -181,7 +187,7 @@ void Hierarchy::IndexEntries()
|
|||
continue;
|
||||
|
||||
size_t const t = static_cast<size_t>(e.m_type);
|
||||
m_entriesByTokens[e.m_address[t]].emplace_back(&e);
|
||||
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
|
||||
|
||||
// Index every token but do not index prefixes.
|
||||
// for (auto const & tok : entry.m_address[t])
|
||||
|
|
|
@ -3,13 +3,12 @@
|
|||
#include "geocoder/types.hpp"
|
||||
|
||||
#include "base/geo_object_id.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
|
@ -85,8 +84,7 @@ public:
|
|||
// todo This method (and the whole class, in fact) is in the
|
||||
// prototype stage and may be too slow. Proper indexing should
|
||||
// be implemented to perform this type of query.
|
||||
std::vector<Entry *> const * const GetEntries(
|
||||
std::vector<strings::UniString> const & tokens) const;
|
||||
std::vector<Entry *> const * const GetEntries(Tokens const & tokens) const;
|
||||
|
||||
private:
|
||||
// Adds address information of entries to the index.
|
||||
|
@ -95,7 +93,7 @@ private:
|
|||
// Fills |m_buildingsOnStreet| field for all street entries.
|
||||
void IndexHouses();
|
||||
|
||||
std::map<Tokens, std::vector<Entry *>> m_entriesByTokens;
|
||||
std::unordered_map<std::string, std::vector<Entry *>> m_entriesByTokens;
|
||||
|
||||
std::vector<Entry> m_entriesStorage;
|
||||
};
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
namespace geocoder
|
||||
{
|
||||
using Tokens = std::vector<strings::UniString>;
|
||||
using Tokens = std::vector<std::string>;
|
||||
|
||||
enum class Type
|
||||
{
|
||||
|
|
|
@ -36,6 +36,14 @@ void NormalizeAndTokenizeString(std::string const & s, Tokens & tokens)
|
|||
search::Delimiters());
|
||||
}
|
||||
|
||||
template <typename Tokens>
|
||||
void NormalizeAndTokenizeAsUtf8(std::string const & s, Tokens & tokens)
|
||||
{
|
||||
tokens.clear();
|
||||
auto const fn = [&](strings::UniString const & s) { tokens.emplace_back(strings::ToUtf8(s)); };
|
||||
SplitUniString(NormalizeAndSimplifyString(s), fn, search::Delimiters());
|
||||
}
|
||||
|
||||
template <typename Fn>
|
||||
void ForEachNormalizedToken(std::string const & s, Fn && fn)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue