From a7aa9a9b6e6dfd73ca89719017044a341092a146 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:07:13 +0200 Subject: [PATCH 01/13] Fixed non-working Wikimedia links with ? character in a title Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index f57ac1d655..1d53341c4c 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -50,11 +50,22 @@ std::string Metadata::GetWikiURL() const return ToWikiURL(string(Get(FMD_WIKIPEDIA))); } -string Metadata::ToWikimediaCommonsURL(std::string const & v) +string Metadata::ToWikimediaCommonsURL(std::string v) { if (v.empty()) return v; + // ? character should be corrected to form a valid URL's path. + for (auto i = 0; i < v.size(); ++i) + { + auto & c = v[i]; + if (c == '?') + { + c = '%'; + v.insert(i + 1, "3F"); // ? => %3F + } + } + return "https://commons.wikimedia.org/wiki/" + v; } -- 2.45.3 From 83bf35c42bdbbc739042f1b61829f1fbac5e9917 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:08:12 +0200 Subject: [PATCH 02/13] Added test for Wikimedia links with ? character in a title Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- generator/generator_tests/metadata_parser_test.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generator/generator_tests/metadata_parser_test.cpp b/generator/generator_tests/metadata_parser_test.cpp index bd535a7293..ddeb55bf53 100644 --- a/generator/generator_tests/metadata_parser_test.cpp +++ b/generator/generator_tests/metadata_parser_test.cpp @@ -212,6 +212,9 @@ UNIT_TEST(Metadata_ValidateAndFormat_wikimedia_commons) p(kWikiKey, "Category:Bosphorus"); TEST_EQUAL(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS), "Category:Bosphorus", ()); + p(kWikiKey, "Category:What If? (Bonn)"); + TEST_EQUAL(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS), "Category:What If%3F (Bonn)", ()); + md.Drop(Metadata::FMD_WIKIMEDIA_COMMONS); p(kWikiKey, "incorrect_wikimedia_content"); TEST(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS).empty(), ()); -- 2.45.3 From 3a180b81f291bab0b24760268ec97fc87b06cda5 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Tue, 2 Jul 2024 22:49:04 +0200 Subject: [PATCH 03/13] Apply suggestions from code review Co-authored-by: Alexander Borsuk <170263+biodranik@users.noreply.github.com> Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index 1d53341c4c..2c77376936 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -50,7 +50,7 @@ std::string Metadata::GetWikiURL() const return ToWikiURL(string(Get(FMD_WIKIPEDIA))); } -string Metadata::ToWikimediaCommonsURL(std::string v) +std::string Metadata::ToWikimediaCommonsURL(std::string v) { if (v.empty()) return v; @@ -62,7 +62,7 @@ string Metadata::ToWikimediaCommonsURL(std::string v) if (c == '?') { c = '%'; - v.insert(i + 1, "3F"); // ? => %3F + v.insert(++i, "3F"); // ? => %3F } } -- 2.45.3 From 6952bc8f0e2df919a5133d428a1d969593bfe3ad Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Tue, 2 Jul 2024 22:53:30 +0200 Subject: [PATCH 04/13] Fixed non-working Wikimedia links with ? character in a title Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer/feature_meta.hpp b/indexer/feature_meta.hpp index 84a823303a..bf22b7bf54 100644 --- a/indexer/feature_meta.hpp +++ b/indexer/feature_meta.hpp @@ -177,7 +177,7 @@ public: static std::string ToWikiURL(std::string v); std::string GetWikiURL() const; - static std::string ToWikimediaCommonsURL(std::string const & v); + static std::string ToWikimediaCommonsURL(std::string v); void ClearPOIAttribs(); }; -- 2.45.3 From 3dbc70e7f5bfca479fd10dfc0b5677bb35500bc6 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Tue, 2 Jul 2024 23:48:38 +0200 Subject: [PATCH 05/13] Fixed Wikimedia test Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- generator/generator_tests/metadata_parser_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/generator/generator_tests/metadata_parser_test.cpp b/generator/generator_tests/metadata_parser_test.cpp index ddeb55bf53..80e89c6873 100644 --- a/generator/generator_tests/metadata_parser_test.cpp +++ b/generator/generator_tests/metadata_parser_test.cpp @@ -212,8 +212,7 @@ UNIT_TEST(Metadata_ValidateAndFormat_wikimedia_commons) p(kWikiKey, "Category:Bosphorus"); TEST_EQUAL(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS), "Category:Bosphorus", ()); - p(kWikiKey, "Category:What If? (Bonn)"); - TEST_EQUAL(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS), "Category:What If%3F (Bonn)", ()); + TEST_EQUAL(md.ToWikimediaCommonsURL("Category:What If? (Bonn)"), "https://commons.wikimedia.org/wiki/Category:What If%3F (Bonn)", ()); md.Drop(Metadata::FMD_WIKIMEDIA_COMMONS); p(kWikiKey, "incorrect_wikimedia_content"); -- 2.45.3 From 9b495262c0ddf02539f5351ffe786f793e294e40 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 4 Jul 2024 21:09:05 +0200 Subject: [PATCH 06/13] Spaces to underscores in Wikimedia links Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index 2c77376936..63c1e85d1c 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -55,11 +55,13 @@ std::string Metadata::ToWikimediaCommonsURL(std::string v) if (v.empty()) return v; - // ? character should be corrected to form a valid URL's path. + // Spaces and ? characters should be corrected to form a valid URL's path. for (auto i = 0; i < v.size(); ++i) { auto & c = v[i]; - if (c == '?') + if (c == ' ') + c = '_'; + else if (c == '?') { c = '%'; v.insert(++i, "3F"); // ? => %3F -- 2.45.3 From 433143c4c6dede9e1332a9dc7e54c8ce9147451a Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 4 Jul 2024 21:10:37 +0200 Subject: [PATCH 07/13] Test for spaces/underscores in Wikimedia links Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- generator/generator_tests/metadata_parser_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/generator_tests/metadata_parser_test.cpp b/generator/generator_tests/metadata_parser_test.cpp index 80e89c6873..bf9798ba2f 100644 --- a/generator/generator_tests/metadata_parser_test.cpp +++ b/generator/generator_tests/metadata_parser_test.cpp @@ -212,7 +212,7 @@ UNIT_TEST(Metadata_ValidateAndFormat_wikimedia_commons) p(kWikiKey, "Category:Bosphorus"); TEST_EQUAL(md.Get(Metadata::FMD_WIKIMEDIA_COMMONS), "Category:Bosphorus", ()); - TEST_EQUAL(md.ToWikimediaCommonsURL("Category:What If? (Bonn)"), "https://commons.wikimedia.org/wiki/Category:What If%3F (Bonn)", ()); + TEST_EQUAL(md.ToWikimediaCommonsURL("Category:What If? (Bonn)"), "https://commons.wikimedia.org/wiki/Category:What_If%3F_(Bonn)", ()); md.Drop(Metadata::FMD_WIKIMEDIA_COMMONS); p(kWikiKey, "incorrect_wikimedia_content"); -- 2.45.3 From 371771c85e8cdde4c3a42bfda6a356e4e4902a45 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 5 Sep 2024 22:59:59 +0200 Subject: [PATCH 08/13] Unified logic for wikipedia and wikimedia encoding Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index 63c1e85d1c..89e76a4c85 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -23,21 +23,7 @@ string Metadata::ToWikiURL(std::string v) if (colon == string::npos) return v; - // Spaces, % and ? characters should be corrected to form a valid URL's path. - // Standard percent encoding also encodes other characters like (), which lead to an unnecessary HTTP redirection. - for (auto i = colon; i < v.size(); ++i) - { - auto & c = v[i]; - if (c == ' ') - c = '_'; - else if (c == '%') - v.insert(i + 1, "25"); // % => %25 - else if (c == '?') - { - c = '%'; - v.insert(i + 1, "3F"); // ? => %3F - } - } + v = encodeWikiURL(v, colon); // Trying to avoid redirects by constructing the right link. // TODO: Wikipedia article could be opened in a user's language, but need @@ -55,8 +41,15 @@ std::string Metadata::ToWikimediaCommonsURL(std::string v) if (v.empty()) return v; + v = encodeWikiURL(v, 0); + + return "https://commons.wikimedia.org/wiki/" + v; +} + +std::string Metadata::encodeWikiURL(std::string v, int startIndex) { // Spaces and ? characters should be corrected to form a valid URL's path. - for (auto i = 0; i < v.size(); ++i) + // Standard percent encoding also encodes other characters like (), which lead to an unnecessary HTTP redirection. + for (auto i = startIndex; i < v.size(); ++i) { auto & c = v[i]; if (c == ' ') @@ -64,11 +57,10 @@ std::string Metadata::ToWikimediaCommonsURL(std::string v) else if (c == '?') { c = '%'; - v.insert(++i, "3F"); // ? => %3F + v.insert(i + 1, "3F"); // ? => %3F } } - - return "https://commons.wikimedia.org/wiki/" + v; + return v; } // static -- 2.45.3 From 179d74a5fb837cd5e1335826085e84c3b0e73844 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:01:42 +0200 Subject: [PATCH 09/13] Fixed Wikipedia test Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- generator/generator_tests/metadata_parser_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generator/generator_tests/metadata_parser_test.cpp b/generator/generator_tests/metadata_parser_test.cpp index bf9798ba2f..5f63b3fbae 100644 --- a/generator/generator_tests/metadata_parser_test.cpp +++ b/generator/generator_tests/metadata_parser_test.cpp @@ -154,8 +154,8 @@ UNIT_TEST(Metadata_ValidateAndFormat_wikipedia) char const * url; }; constexpr Test tests[] = { - {"en:Bad %20Data", "en:Bad %20Data", "https://en." WIKIHOST "/wiki/Bad_%2520Data"}, - {"ru:Это тест_со знаками %, ? (и скобками)", "ru:Это тест со знаками %, ? (и скобками)", "https://ru." WIKIHOST "/wiki/Это_тест_со_знаками_%25,_%3F_(и_скобками)"}, + {"en:Bad %20Data", "en:Bad %20Data", "https://en." WIKIHOST "/wiki/Bad_%20Data"}, + {"ru:Это тест_со знаками %, ? (и скобками)", "ru:Это тест со знаками %, ? (и скобками)", "https://ru." WIKIHOST "/wiki/Это_тест_со_знаками_%,_%3F_(и_скобками)"}, {"https://be-tarask.wikipedia.org/wiki/Вялікае_Княства_Літоўскае", "be-tarask:Вялікае Княства Літоўскае", "https://be-tarask." WIKIHOST "/wiki/Вялікае_Княства_Літоўскае"}, // Final link points to https and mobile version. {"http://en.wikipedia.org/wiki/A#id", "en:A#id", "https://en." WIKIHOST "/wiki/A#id"}, -- 2.45.3 From d9433b53fbb3b1da9eba07469e3eceff33ac4ec5 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:11:02 +0200 Subject: [PATCH 10/13] Fixed error in feature_meta.hpp Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/indexer/feature_meta.hpp b/indexer/feature_meta.hpp index bf22b7bf54..b3e6f724c0 100644 --- a/indexer/feature_meta.hpp +++ b/indexer/feature_meta.hpp @@ -178,6 +178,7 @@ public: static std::string ToWikiURL(std::string v); std::string GetWikiURL() const; static std::string ToWikimediaCommonsURL(std::string v); + static std::string encodeWikiURL(std::string v, int startIndex); void ClearPOIAttribs(); }; -- 2.45.3 From 826cf7b7dc553e9fa1614fa1844f3d29bbbdcde2 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:50:27 +0200 Subject: [PATCH 11/13] Small fix in the cycle as suggested Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index 89e76a4c85..fd97dbd641 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -57,7 +57,7 @@ std::string Metadata::encodeWikiURL(std::string v, int startIndex) { else if (c == '?') { c = '%'; - v.insert(i + 1, "3F"); // ? => %3F + v.insert(++i, "3F"); // ? => %3F } } return v; -- 2.45.3 From 2302e280cb57d6e2bf6deeae06c43c4a60ffb0e2 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:32:23 +0200 Subject: [PATCH 12/13] Function name to CamelCase Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/indexer/feature_meta.cpp b/indexer/feature_meta.cpp index fd97dbd641..ed607eea5f 100644 --- a/indexer/feature_meta.cpp +++ b/indexer/feature_meta.cpp @@ -23,7 +23,7 @@ string Metadata::ToWikiURL(std::string v) if (colon == string::npos) return v; - v = encodeWikiURL(v, colon); + v = EncodeWikiURL(v, colon); // Trying to avoid redirects by constructing the right link. // TODO: Wikipedia article could be opened in a user's language, but need @@ -41,12 +41,12 @@ std::string Metadata::ToWikimediaCommonsURL(std::string v) if (v.empty()) return v; - v = encodeWikiURL(v, 0); + v = EncodeWikiURL(v, 0); return "https://commons.wikimedia.org/wiki/" + v; } -std::string Metadata::encodeWikiURL(std::string v, int startIndex) { +std::string Metadata::EncodeWikiURL(std::string v, int startIndex) { // Spaces and ? characters should be corrected to form a valid URL's path. // Standard percent encoding also encodes other characters like (), which lead to an unnecessary HTTP redirection. for (auto i = startIndex; i < v.size(); ++i) -- 2.45.3 From 01f2466dbcfe4f010de3e67501a9ec45de62c500 Mon Sep 17 00:00:00 2001 From: alnzrv <7187657+alnzrv@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:33:35 +0200 Subject: [PATCH 13/13] Function name to CamelCase Signed-off-by: alnzrv <7187657+alnzrv@users.noreply.github.com> --- indexer/feature_meta.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer/feature_meta.hpp b/indexer/feature_meta.hpp index b3e6f724c0..89783ddf23 100644 --- a/indexer/feature_meta.hpp +++ b/indexer/feature_meta.hpp @@ -178,7 +178,7 @@ public: static std::string ToWikiURL(std::string v); std::string GetWikiURL() const; static std::string ToWikimediaCommonsURL(std::string v); - static std::string encodeWikiURL(std::string v, int startIndex); + static std::string EncodeWikiURL(std::string v, int startIndex); void ClearPOIAttribs(); }; -- 2.45.3