diff --git a/generator/generator_tests/metadata_test.cpp b/generator/generator_tests/metadata_test.cpp
index a9d043115c..fd2056fb9f 100644
--- a/generator/generator_tests/metadata_test.cpp
+++ b/generator/generator_tests/metadata_test.cpp
@@ -129,6 +129,48 @@ UNIT_TEST(Metadata_ValidateAndFormat_ele)
   params.GetMetadata().Drop(feature::Metadata::FMD_ELE);
 }
 
+// Checks normalization of the OSM "wikipedia" tag into the "lang:title" form.
+UNIT_TEST(Metadata_ValidateAndFormat_wikipedia)
+{
+  FeatureParams params;
+  MetadataTagProcessor p(params);
+  string const lanaWoodUrlEncoded = "%D0%9B%D0%B0%D0%BD%D0%B0_%D0%92%D1%83%D0%B4";
+
+  // "lang:title" form: the title is URL-encoded, spaces become underscores.
+  p("wikipedia", "ru:Лана Вуд");
+  TEST_EQUAL(params.GetMetadata().Get(feature::Metadata::FMD_WIKIPEDIA), "ru:" + lanaWoodUrlEncoded, ("ru:"));
+  params.GetMetadata().Drop(feature::Metadata::FMD_WIKIPEDIA);
+
+  // Full article URL is converted to "lang:title".
+  p("wikipedia", "https://ru.wikipedia.org/wiki/" + lanaWoodUrlEncoded);
+  TEST_EQUAL(params.GetMetadata().Get(feature::Metadata::FMD_WIKIPEDIA), "ru:" + lanaWoodUrlEncoded, ("https:"));
+  params.GetMetadata().Drop(feature::Metadata::FMD_WIKIPEDIA);
+
+  // No "prefix:" part at all.
+  p("wikipedia", "Test");
+  TEST(params.GetMetadata().Empty(), ("Test"));
+
+  // URL without an article title.
+  p("wikipedia", "https://en.wikipedia.org/wiki/");
+  TEST(params.GetMetadata().Empty(), ("Null wiki"));
+
+  // URL without a language subdomain.
+  p("wikipedia", "http://.wikipedia.org/wiki/Whatever");
+  TEST(params.GetMetadata().Empty(), ("Null lang", params.GetMetadata().Get(feature::Metadata::FMD_WIKIPEDIA)));
+
+  // We ignore incorrect prefixes
+  p("wikipedia", "ht.tps://en.wikipedia.org/wiki/Whuh");
+  TEST_EQUAL(params.GetMetadata().Get(feature::Metadata::FMD_WIKIPEDIA), "en:Whuh", ("ht.tp:"));
+  params.GetMetadata().Drop(feature::Metadata::FMD_WIKIPEDIA);
+
+  // ... but the complete "//" separator after the prefix is required.
+  p("wikipedia", "http:/wen.wikipedia.org/wiki/Whuh");
+  TEST(params.GetMetadata().Empty(), ("Single slash"));
+
+  // Not a wikipedia host.
+  p("wikipedia", "http://ru.google.com/wiki/wutlol");
+  TEST(params.GetMetadata().Empty(), ("Google"));
+}
 
 UNIT_TEST(Metadata_ReadWrite_Intermediate)
 {
diff --git a/generator/osm2meta.hpp b/generator/osm2meta.hpp
index 3dad97adc6..82d7f1b14e 100644
--- a/generator/osm2meta.hpp
+++ b/generator/osm2meta.hpp
@@ -105,6 +105,12 @@ public:
       if (!value.empty())
         md.Add(Metadata::FMD_EMAIL, value);
     }
+    else if (k == "wikipedia")
+    {
+      string const & value = ValidateAndFormat_wikipedia(v);
+      if (!value.empty())
+        md.Add(Metadata::FMD_WIKIPEDIA, value);
+    }
     return false;
   }
 
@@ -184,4 +190,71 @@ protected:
     return v;
   }
 
+  // Special URL encoding for wikipedia article titles:
+  // ASCII letters, digits and "-_.~" are kept as is, spaces are replaced with
+  // underscores, and every other byte is written as an uppercase %HH code.
+  string WikiUrlEncode(string const & value) const
+  {
+    static char const hexDigits[] = "0123456789ABCDEF";
+
+    string escaped;
+    // Worst case: every byte expands to "%HH".
+    escaped.reserve(value.length() * 3);
+
+    for (char const c : value)
+    {
+      // Explicit ASCII ranges instead of isalnum(): passing a negative char
+      // (any UTF-8 byte >= 0x80) to isalnum() is undefined behaviour, and its
+      // result depends on the current locale.
+      bool const isUnreserved = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+                                (c >= '0' && c <= '9') ||
+                                c == '-' || c == '_' || c == '.' || c == '~';
+      if (isUnreserved)
+        escaped += c;
+      else if (c == ' ')
+        escaped += '_';
+      else
+      {
+        // Go through unsigned char so that bytes >= 0x80 are not sign-extended.
+        unsigned char const uc = static_cast<unsigned char>(c);
+        escaped += '%';
+        escaped += hexDigits[uc >> 4];
+        escaped += hexDigits[uc & 0x0F];
+      }
+    }
+
+    return escaped;
+  }
+
+  // Normalizes an OSM "wikipedia" tag value to "lang:title".
+  // Accepts "prefix:Title" where prefix has 2-3 characters or is "be-x-old" (the title
+  // is then URL-encoded), and article URLs "<scheme>://lang.wikipedia.org/wiki/Title"
+  // (the title is taken from the URL verbatim).
+  // Returns an empty string for any other value.
+  string ValidateAndFormat_wikipedia(string const & v) const
+  {
+    // Find prefix before ':'. It must have at least two characters
+    // and must be followed by at least one more character.
+    string::size_type i = v.find(':');
+    if (i == string::npos || i < 2 || i + 2 > v.length())
+      return string();
+
+    // URL encode lang:title, so URL can be reconstructed faster
+    if (i <= 3 || v.compare(0, i, "be-x-old") == 0)
+      return v.substr(0, i + 1) + WikiUrlEncode(v.substr(i + 1));
+
+    static string const wikiUrlPart = ".wikipedia.org/wiki/";
+    // Coarse length check: what follows ':' must be longer than this shortest sample.
+    static string::size_type const minUrlPartLength = string("//be.wikipedia.org/wiki/AB").length();
+    // Require the complete "//" separator: the code below skips all three characters of "://".
+    if (v.compare(i + 1, 2, "//") == 0 && i + minUrlPartLength < v.length())
+    {
+      // Convert URL to "lang:title"
+      i += 3;
+      string::size_type const j = v.find('.', i + 1);
+      if (j != string::npos && v.compare(j, wikiUrlPart.length(), wikiUrlPart) == 0)
+        return v.substr(i, j - i) + ":" + v.substr(j + wikiUrlPart.length());
+    }
+    return string();
+  }
+
 };
diff --git a/indexer/feature_meta.hpp b/indexer/feature_meta.hpp
index 2ac7ef1a79..f81ef76198 100644
--- a/indexer/feature_meta.hpp
+++ b/indexer/feature_meta.hpp
@@ -34,6 +34,7 @@ namespace feature
       FMD_TURN_LANES_BACKWARD = 13,
       FMD_EMAIL = 14,
       FMD_POSTCODE = 15,
+      FMD_WIKIPEDIA = 16,
       FMD_COUNT
     };
 