[generator] Normalize bbq cuisines.

This commit is contained in:
Alex Zolotarev 2016-01-28 00:24:57 +03:00 committed by Sergey Yershov
parent 77dabd4807
commit f0bdff2195
2 changed files with 65 additions and 5 deletions

View file

@ -11,4 +11,6 @@ UNIT_TEST(ValidateAndFormat_cuisine_test)
TEST_EQUAL(tagProc.ValidateAndFormat_cuisine("abc bca"), "abc_bca", ());
TEST_EQUAL(tagProc.ValidateAndFormat_cuisine("abc def gh"), "abc_def_gh", ());
TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(""), "", ());
TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(" ; , "), "", ());
TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(" Korean bbq;barbeque;grill,bbq; "), "korean_bbq;barbecue;grill", ());
}

View file

@ -5,8 +5,57 @@
#include "base/logging.hpp"
#include "base/string_utils.hpp"
#include "std/regex.hpp"
#include "std/algorithm.hpp"
#include "std/cctype.hpp"
#include "std/unordered_set.hpp"
namespace
{
constexpr char const * kOSMMultivalueDelimiter = ";";
template <class T>
void RemoveDuplicatesAndKeepOrder(vector<T> & vec)
{
unordered_set<T> seen;
auto const predicate = [&seen](T const & value)
{
if (seen.find(value) != seen.end())
return true;
seen.insert(value);
return false;
};
vec.erase(std::remove_if(vec.begin(), vec.end(), predicate), vec.end());
}
// Also filters out duplicates.
class MultivalueCollector
{
public:
void operator()(string const & value)
{
if (value.empty() || value == kOSMMultivalueDelimiter)
return;
m_values.push_back(value);
}
string GetString()
{
if (m_values.empty())
return string();
RemoveDuplicatesAndKeepOrder(m_values);
return strings::JoinStrings(m_values, kOSMMultivalueDelimiter);
}
private:
vector<string> m_values;
};
void CollapseMultipleConsecutiveCharsIntoOne(char c, string & str)
{
auto const comparator = [c](char lhs, char rhs) { return lhs == rhs && lhs == c; };
str.erase(unique(str.begin(), str.end(), comparator), str.end());
}
} // namespace
string MetadataTagProcessorImpl::ValidateAndFormat_maxspeed(string const & v) const
{
@ -128,10 +177,19 @@ string MetadataTagProcessorImpl::ValidateAndFormat_denomination(string const & v
string MetadataTagProcessorImpl::ValidateAndFormat_cuisine(string v) const
{
strings::MakeLowerCaseInplace(v);
v = regex_replace(v, regex("[;,]\\s*"), ";");
v = regex_replace(v, regex("\\s+"), "_");
strings::Trim(v, ";_");
return v;
strings::SimpleTokenizer iter(v, ",;");
MultivalueCollector collector;
while (iter) {
string normalized = *iter;
strings::Trim(normalized, " ");
CollapseMultipleConsecutiveCharsIntoOne(' ', normalized);
replace(normalized.begin(), normalized.end(), ' ', '_');
if (normalized == "bbq" || normalized == "barbeque")
normalized = "barbecue";
collector(normalized);
++iter;
}
return collector.GetString();
}
string MetadataTagProcessorImpl::ValidateAndFormat_wikipedia(string v) const