From f0bdff219512d68770ae4c0483dd225b5c9953d7 Mon Sep 17 00:00:00 2001
From: Alex Zolotarev
Date: Thu, 28 Jan 2016 00:24:57 +0300
Subject: [PATCH] [generator] Normalize bbq cuisines.

---
 generator/generator_tests/osm2meta_test.cpp |  2 +
 generator/osm2meta.cpp                      | 68 +++++++++++++++++++--
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/generator/generator_tests/osm2meta_test.cpp b/generator/generator_tests/osm2meta_test.cpp
index 57ab14884f..fb984abdc2 100644
--- a/generator/generator_tests/osm2meta_test.cpp
+++ b/generator/generator_tests/osm2meta_test.cpp
@@ -11,4 +11,6 @@ UNIT_TEST(ValidateAndFormat_cuisine_test)
   TEST_EQUAL(tagProc.ValidateAndFormat_cuisine("abc bca"), "abc_bca", ());
   TEST_EQUAL(tagProc.ValidateAndFormat_cuisine("abc def gh"), "abc_def_gh", ());
   TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(""), "", ());
+  TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(" ; , "), "", ());
+  TEST_EQUAL(tagProc.ValidateAndFormat_cuisine(" Korean bbq;barbeque;grill,bbq; "), "korean_bbq;barbecue;grill", ());
 }
diff --git a/generator/osm2meta.cpp b/generator/osm2meta.cpp
index 4aa398a937..f5fafe6c80 100644
--- a/generator/osm2meta.cpp
+++ b/generator/osm2meta.cpp
@@ -5,8 +5,57 @@
 #include "base/logging.hpp"
 #include "base/string_utils.hpp"
 
-#include "std/regex.hpp"
+#include "std/algorithm.hpp"
 #include "std/cctype.hpp"
+#include "std/unordered_set.hpp"
+
+namespace
+{
+
+constexpr char const * kOSMMultivalueDelimiter = ";";
+
+template <class T>
+void RemoveDuplicatesAndKeepOrder(vector<T> & vec)
+{
+  unordered_set<T> seen;
+  auto const predicate = [&seen](T const & value)
+  {
+    if (seen.find(value) != seen.end())
+      return true;
+    seen.insert(value);
+    return false;
+  };
+  vec.erase(std::remove_if(vec.begin(), vec.end(), predicate), vec.end());
+}
+
+// Also filters out duplicates.
+class MultivalueCollector
+{
+public:
+  void operator()(string const & value)
+  {
+    if (value.empty() || value == kOSMMultivalueDelimiter)
+      return;
+    m_values.push_back(value);
+  }
+  string GetString()
+  {
+    if (m_values.empty())
+      return string();
+
+    RemoveDuplicatesAndKeepOrder(m_values);
+    return strings::JoinStrings(m_values, kOSMMultivalueDelimiter);
+  }
+private:
+  vector<string> m_values;
+};
+
+void CollapseMultipleConsecutiveCharsIntoOne(char c, string & str)
+{
+  auto const comparator = [c](char lhs, char rhs) { return lhs == rhs && lhs == c; };
+  str.erase(unique(str.begin(), str.end(), comparator), str.end());
+}
+} // namespace
 
 string MetadataTagProcessorImpl::ValidateAndFormat_maxspeed(string const & v) const
 {
@@ -128,10 +177,19 @@ string MetadataTagProcessorImpl::ValidateAndFormat_denomination(string const & v
 string MetadataTagProcessorImpl::ValidateAndFormat_cuisine(string v) const
 {
   strings::MakeLowerCaseInplace(v);
-  v = regex_replace(v, regex("[;,]\\s*"), ";");
-  v = regex_replace(v, regex("\\s+"), "_");
-  strings::Trim(v, ";_");
-  return v;
+  strings::SimpleTokenizer iter(v, ",;");
+  MultivalueCollector collector;
+  while (iter) {
+    string normalized = *iter;
+    strings::Trim(normalized, " ");
+    CollapseMultipleConsecutiveCharsIntoOne(' ', normalized);
+    replace(normalized.begin(), normalized.end(), ' ', '_');
+    if (normalized == "bbq" || normalized == "barbeque")
+      normalized = "barbecue";
+    collector(normalized);
+    ++iter;
+  }
+  return collector.GetString();
 }
 
 string MetadataTagProcessorImpl::ValidateAndFormat_wikipedia(string v) const