diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp
index 49720cc45b..bac640f4d3 100644
--- a/base/base_tests/string_utils_test.cpp
+++ b/base/base_tests/string_utils_test.cpp
@@ -733,13 +733,28 @@ UNIT_TEST(NormalizeDigits_UniString)
 UNIT_TEST(Split)
 {
   vector target;
-  strings::Split(";Test\\;проверка;0;", ';', target);
+  TEST(strings::ParseCSVRow(",Test\\,проверка,0,", target), ());
   vector expected({"", "Test\\", "проверка", "0", ""});
   TEST_EQUAL(target, expected, ());
-  strings::Split("and there  was none", ' ', target);
-  vector expected2({"and", "there", "", "was", "none"});
+  TEST(strings::ParseCSVRow("and there  \"was none\"", target, ' '), ());
+  vector expected2({"and", "there", "", "was none"});
   TEST_EQUAL(target, expected2, ());
-  strings::Split("", '!', target);
+  TEST(!strings::ParseCSVRow("", target), ());
   vector expected3;
   TEST_EQUAL(target, expected3, ());
+  TEST(!strings::ParseCSVRow("\"this, a line.\"", target, ',', 2), (target));
+  vector expected4({"this, a line."});
+  TEST_EQUAL(target, expected4, ());
+  // Two quoted columns in one row: the second must not inherit the text of the first.
+  TEST(strings::ParseCSVRow("\"a,b\",\"c,d\"", target, ',', 2), (target));
+  vector expected5({"a,b", "c,d"});
+  TEST_EQUAL(target, expected5, ());
+  // A quoted column that consists of the delimiter only.
+  TEST(strings::ParseCSVRow("x,\",\",y", target, ',', 3), (target));
+  vector expected6({"x", ",", "y"});
+  TEST_EQUAL(target, expected6, ());
+  // Unterminated quote: the row is reported as malformed, but no text is lost.
+  TEST(!strings::ParseCSVRow("a,\"b,c", target), (target));
+  vector expected7({"a", "\"b,c"});
+  TEST_EQUAL(target, expected7, ());
 }
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 2a8b029f37..ab53723635 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -328,17 +328,78 @@ bool AlmostEqual(string const & str1, string const & str2, size_t mismatchedCoun
   return false;
 }
 
-void Split(string const & s, char delimiter, vector & target)
+bool ParseCSVRow(string const & s, vector & target, char const delimiter, size_t const columns)
 {
   target.clear();
+  using It = TokenizeIterator;
+  // Pieces of a quoted column that the tokenizer has split at |delimiter| characters.
+  string quoted;
+  bool insideQuotes = false;
+  for (It it(s, SimpleDelimiter(delimiter)); it; ++it)
+  {
+    string column = *it;
+    if (insideQuotes)
+    {
+      // The delimiter that ended the previous token is a part of the quoted text: put it back.
+      quoted += delimiter;
+      if (!column.empty() && column.back() == '"')
+      {
+        // Found the tail quote: remove it and add |quoted| to the vector.
+        column.pop_back();
+        quoted += column;
+        target.push_back(quoted);
+        // Empty the buffer, so the next quoted column starts from scratch.
+        quoted.clear();
+        insideQuotes = false;
+      }
+      else
+      {
+        quoted += column;
+      }
+    }
+    else if (!column.empty() && column.front() == '"')
+    {
+      // Found the front quote: if there is the last one also, remove both and append column,
+      // otherwise push the column into a |quoted| buffer.
+      column.erase(0, 1);
+      // |column| is empty here when the token was a lone quote (the quoted text starts with
+      // a delimiter), and back() must not be called on an empty string.
+      if (!column.empty() && column.back() == '"')
+      {
+        column.pop_back();
+        strings::Trim(column);
+        target.push_back(column);
+      }
+      else
+      {
+        quoted = column;
+        insideQuotes = true;
+      }
+    }
+    else
+    {
+      strings::Trim(column);
+      target.push_back(column);
+    }
+  }
+
+  if (insideQuotes)
+  {
+    // The closing quote is missing: keep the collected text, with its opening quote,
+    // instead of dropping it silently, and report the row as malformed.
+    target.push_back('"' + quoted);
+    return false;
+  }
 
   // Special case: if the string is empty, return an empty array instead of {""}.
-  if (s.empty())
-    return;
+  if (target.size() == 1 && target[0].empty())
+  {
+    target.clear();
+    return false;
+  }
 
-  using It = TokenizeIterator;
-  for (It it(s, SimpleDelimiter(delimiter)); it; ++it)
-    target.push_back(*it);
+  // |columns| is unsigned: zero means "do not check the number of columns".
+  return columns == 0 || target.size() == columns;
 }
 
 } // namespace strings
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index e42c8188e8..f9883a544c 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -307,7 +307,13 @@ void Tokenize(string const & str, char const * delims, TFunctor && f)
 }
 
 /// Splits a string by the delimiter, keeps empty parts, on an empty string returns an empty vector.
-void Split(string const & s, char delimiter, vector & target);
+/// Supports quoted columns, does not support newlines in columns and escaped quotes.
+/// Columns are passed through strings::Trim, except for quoted columns with delimiters inside.
+/// If a quoted column has no closing quote, the rest of the line is stored as the last column,
+/// with its opening quote.
+/// @param columns expected number of columns, zero turns the check off.
+/// @return false if the line is empty, a quoted column is not closed
+/// or number of columns differs from |columns|.
+bool ParseCSVRow(string const & s, vector & target, char const delimiter = ',', size_t const columns = 0);
 
 /// @return code of last symbol in string or 0 if s is empty
 UniChar LastUniChar(string const & s);
diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp
index 2e619453b5..325beaee3c 100644
--- a/generator/booking_dataset.cpp
+++ b/generator/booking_dataset.cpp
@@ -30,10 +30,8 @@ bool CheckForValues(string const & value)
 
 BookingDataset::Hotel::Hotel(string const & src)
 {
-  vector rec(FieldsCount());
-  strings::SimpleTokenizer token(src, "\t");
-  for (size_t i = 0; token && i < rec.size(); ++i, ++token)
-    rec[i] = *token;
+  vector rec;
+  CHECK(strings::ParseCSVRow(src, rec, '\t', FieldsCount()), ("Error parsing hotels.tsv line:", src));
 
   strings::to_uint(rec[Index(Fields::Id)], id);
   strings::to_double(rec[Index(Fields::Latitude)], lat);
@@ -176,7 +174,7 @@ void BookingDataset::BuildFeatures(function const & fn) cons
     if (!hotel.translations.empty())
     {
       vector parts;
-      strings::Split(hotel.translations, '|', parts);
+      strings::ParseCSVRow(hotel.translations, parts, '|');
       for (auto i = 0; i < parts.size(); i += 3)
       {
         e.AddTag("name:" + parts[i], parts[i + 1]);