[geocoder] UTF-8 everywhere.

This commit is contained in:
Maxim Pimenov 2018-11-27 20:23:53 +03:00 committed by Sergey Yershov
parent 47767b1f18
commit 335c44894c
7 changed files with 33 additions and 21 deletions

View file

@ -91,7 +91,7 @@ bool HasParent(vector<geocoder::Geocoder::Layer> const & layers,
// Joins |tokens| into a single strings::UniString that is used as a house number.
// NOTE(review): this span comes from a rendered diff hunk. The two consecutive
// return statements are presumably the pre-change line (tokens joined with an
// empty UniString delimiter) and its replacement (tokens joined with " " and
// then converted with strings::MakeUniString). Confirm against the real file
// before relying on either one.
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
{
return strings::JoinStrings(tokens, strings::MakeUniString(""));
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
}
} // namespace
@ -100,7 +100,7 @@ namespace geocoder
// Geocoder::Context -------------------------------------------------------------------------------
// Builds a search context for |query|: fills |m_tokens| from the query,
// assigns Type::Count to every token in |m_tokenTypes| and resets the
// used-token counter. |m_beam| is constructed with kMaxResults.
// NOTE(review): both tokenizer calls below appear in the rendered diff hunk;
// presumably the first is the pre-change line and the second its replacement.
// Confirm against the real file.
Geocoder::Context::Context(string const & query) : m_beam(kMaxResults)
{
search::NormalizeAndTokenizeString(query, m_tokens);
search::NormalizeAndTokenizeAsUtf8(query, m_tokens);
// One type slot per token, all starting as Type::Count.
m_tokenTypes.assign(m_tokens.size(), Type::Count);
m_numUsedTokens = 0;
}
@ -115,7 +115,7 @@ size_t Geocoder::Context::GetNumUsedTokens() const
return m_numUsedTokens;
}
strings::UniString const & Geocoder::Context::GetToken(size_t id) const
string const & Geocoder::Context::GetToken(size_t id) const
{
CHECK_LESS(id, m_tokens.size(), ());
return m_tokens[id];
@ -210,7 +210,7 @@ void Geocoder::Go(Context & ctx, Type type) const
if (type == Type::Count)
return;
vector<strings::UniString> subquery;
Tokens subquery;
for (size_t i = 0; i < ctx.GetNumTokens(); ++i)
{
subquery.clear();

View file

@ -8,6 +8,7 @@
#include "base/geo_object_id.hpp"
#include "base/string_utils.hpp"
#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>
@ -55,7 +56,7 @@ public:
size_t GetNumTokens() const;
size_t GetNumUsedTokens() const;
// Returns a reference to the token stored at position |id|.
// NOTE(review): two declarations that differ only in return type appear here
// because this span is a rendered diff; presumably the UniString variant is the
// removed line and the std::string variant is its replacement. Confirm.
strings::UniString const & GetToken(size_t id) const;
std::string const & GetToken(size_t id) const;
void MarkToken(size_t id, Type type);
@ -74,8 +75,7 @@ public:
std::vector<Layer> const & GetLayers() const;
private:
// todo(@m) std::string?
std::vector<strings::UniString> m_tokens;
Tokens m_tokens;
std::vector<Type> m_tokenTypes;
size_t m_numUsedTokens = 0;

View file

@ -30,7 +30,7 @@ string const kRegionsData = R"#(
// Test helper: tokenizes |s| with the search normalizer and returns the
// resulting tokens by value.
// NOTE(review): both tokenizer calls below appear in the rendered diff hunk;
// presumably the first is the pre-change line and the second its replacement.
// Confirm against the real file.
geocoder::Tokens Split(string const & s)
{
geocoder::Tokens result;
search::NormalizeAndTokenizeString(s, result);
search::NormalizeAndTokenizeAsUtf8(s, result);
return result;
}
} // namespace
@ -71,7 +71,7 @@ UNIT_TEST(Geocoder_Hierarchy)
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
auto entries = geocoder.GetHierarchy().GetEntries({strings::MakeUniString("florencia")});
auto entries = geocoder.GetHierarchy().GetEntries({("florencia")});
TEST(entries, ());
TEST_EQUAL(entries->size(), 1, ());

View file

@ -19,6 +19,11 @@ namespace
{
// Information will be logged for every |kLogBatch| entries.
size_t const kLogBatch = 100000;
// Produces the string under which |tokens| are stored in, and looked up from,
// the entries index: all tokens joined with a single space between them.
string MakeIndexKey(geocoder::Tokens const & tokens)
{
  string key = strings::JoinStrings(tokens, " ");
  return key;
}
} // namespace
namespace geocoder
@ -67,7 +72,8 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr));
hasDuplicateAddress = true;
}
search::NormalizeAndTokenizeString(levelValue, m_address[i]);
search::NormalizeAndTokenizeAsUtf8(levelValue, m_address[i]);
if (!m_address[i].empty())
m_type = static_cast<Type>(i);
@ -75,7 +81,7 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
m_nameTokens.clear();
FromJSONObjectOptionalField(properties, "name", m_name);
search::NormalizeAndTokenizeString(m_name, m_nameTokens);
search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens);
if (m_name.empty())
++stats.m_emptyNames;
@ -140,6 +146,7 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
++stats.m_numLoaded;
if (stats.m_numLoaded % kLogBatch == 0)
LOG(LINFO, ("Read", stats.m_numLoaded, "entries"));
m_entriesStorage.emplace_back(move(entry));
}
@ -160,10 +167,9 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
LOG(LINFO, ("(End of stats.)"));
}
vector<Hierarchy::Entry *> const * const Hierarchy::GetEntries(
vector<strings::UniString> const & tokens) const
vector<Hierarchy::Entry *> const * const Hierarchy::GetEntries(Tokens const & tokens) const
{
auto it = m_entriesByTokens.find(tokens);
auto it = m_entriesByTokens.find(MakeIndexKey(tokens));
if (it == m_entriesByTokens.end())
return {};
@ -181,7 +187,7 @@ void Hierarchy::IndexEntries()
continue;
size_t const t = static_cast<size_t>(e.m_type);
m_entriesByTokens[e.m_address[t]].emplace_back(&e);
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
// Index every token but do not index prefixes.
// for (auto const & tok : entry.m_address[t])

View file

@ -3,13 +3,12 @@
#include "geocoder/types.hpp"
#include "base/geo_object_id.hpp"
#include "base/string_utils.hpp"
#include <array>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@ -85,8 +84,7 @@ public:
// Looks up the entries that were indexed under |tokens|.
// The visible part of the definition returns {} (i.e. a null pointer) when
// the key built from |tokens| is not present in the index.
//
// todo This method (and the whole class, in fact) is in the
// prototype stage and may be too slow. Proper indexing should
// be implemented to perform this type of queries.
//
// NOTE(review): two declarations appear here because this span is a rendered
// diff; presumably the two-line UniString variant is the removed one and the
// Tokens variant is its replacement. Confirm.
std::vector<Entry *> const * const GetEntries(
std::vector<strings::UniString> const & tokens) const;
std::vector<Entry *> const * const GetEntries(Tokens const & tokens) const;
private:
// Adds address information of entries to the index.
@ -95,7 +93,7 @@ private:
// Fills |m_buildingsOnStreet| field for all street entries.
void IndexHouses();
// Index from an address key to pointers to the entries that have that address;
// it is populated in IndexEntries() and queried in GetEntries().
// NOTE(review): two declarations of the same member appear here because this
// span is a rendered diff; presumably the std::map keyed by Tokens is the
// removed line and the std::unordered_map keyed by std::string is its
// replacement. Confirm.
std::map<Tokens, std::vector<Entry *>> m_entriesByTokens;
std::unordered_map<std::string, std::vector<Entry *>> m_entriesByTokens;
std::vector<Entry> m_entriesStorage;
};

View file

@ -7,7 +7,7 @@
namespace geocoder
{
// A sequence of tokens as used throughout the geocoder.
// NOTE(review): two alias lines appear here because this span is a rendered
// diff; presumably the UniString-based alias is the removed line and the
// std::string-based alias is its replacement. Confirm.
using Tokens = std::vector<strings::UniString>;
using Tokens = std::vector<std::string>;
enum class Type
{

View file

@ -36,6 +36,14 @@ void NormalizeAndTokenizeString(std::string const & s, Tokens & tokens)
search::Delimiters());
}
template <typename Tokens>
void NormalizeAndTokenizeAsUtf8(std::string const & s, Tokens & tokens)
{
tokens.clear();
auto const fn = [&](strings::UniString const & s) { tokens.emplace_back(strings::ToUtf8(s)); };
SplitUniString(NormalizeAndSimplifyString(s), fn, search::Delimiters());
}
template <typename Fn>
void ForEachNormalizedToken(std::string const & s, Fn && fn)
{