diff --git a/Cargo.lock b/Cargo.lock index 089cfe5..68dd02e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,76 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if", + "getrandom 0.2.9", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" + +[[package]] +name = "anstyle-parse" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + [[package]] name = "anyhow" version = "1.0.71" @@ -26,6 +96,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + [[package]] name = "backtrace" version = "0.3.67" @@ -41,6 +117,18 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "cc" version = "1.0.79" @@ -53,12 +141,285 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "clap" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "401a4694d2bf92537b6867d94de48c4842089645fdcdf6c71865b175d836e9c2" +dependencies = [ + "clap_builder", + "clap_derive", + "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72394f3339a76daf211e57d4bcb374410f3965dcc606dd0e03738c7888766980" +dependencies = [ + "anstream", + "anstyle", + "bitflags", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.17", +] + +[[package]] +name = "clap_lex" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "cssparser" +version = "0.29.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "matches", + "phf 0.10.1", + "proc-macro2", + "quote", + "smallvec", + "syn 1.0.109", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" +dependencies = [ + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", +] + +[[package]] +name = "dtoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" + +[[package]] +name = "dtoa-short" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" + +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "form_urlencoded" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + [[package]] name = "gimli" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "idna" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "is-terminal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +dependencies = [ + "hermit-abi", + "io-lifetimes", + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "itoa" version = "1.0.6" @@ -71,6 +432,54 @@ version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "518ef76f2f87365916b142844c16d8fefd85039bc5699050210a7778ee1cd1de" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen 0.10.0", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "matches" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" + [[package]] name = "memchr" version = "2.5.0" @@ -86,6 +495,18 @@ dependencies = [ "adler", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "object" version = "0.30.3" @@ -100,10 +521,161 @@ name = "om-wikiparser" version = "0.0.0" dependencies = [ "anyhow", + "clap", + "env_logger", + "log", + "scraper", "serde", "serde_json", + "url", + "urlencoding", ] +[[package]] +name = "once_cell" +version = "1.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys 0.45.0", +] + +[[package]] +name = "percent-encoding" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" + +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared 0.8.0", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_macros", + "phf_shared 0.10.0", + "proc-macro-hack", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator 0.8.0", + "phf_shared 0.8.0", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared 0.8.0", + "rand 0.7.3", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.59" @@ -122,18 +694,195 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", + "rand_pcg", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.9", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + [[package]] name = "rustc-demangle" version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.37.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys 0.48.0", +] + [[package]] name = "ryu" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "scraper" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59e25654b5e9fd557a67dbaab5a5d36b8c448d0561beb4c041b6dbb902eddfa6" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "smallvec", + "tendril", +] + +[[package]] +name = "selectors" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "phf 0.8.0", + "phf_codegen 0.8.0", + "precomputed-hash", + "servo_arc", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" + [[package]] name = "serde" version = "1.0.163" @@ -151,7 +900,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.17", ] [[package]] @@ -165,6 +914,77 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + +[[package]] +name = "siphasher" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.17" @@ -176,8 +996,274 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "unicode-bidi" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" + [[package]] name = "unicode-ident" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "url" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/Cargo.toml b/Cargo.toml index 6003f4e..3cc52c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,10 +4,20 @@ version = "0.0.0" license = "AGPL-3.0-only" edition = "2021" repository = "https://github.com/organicmaps/wikiparser/" - +default-run = "om-wikiparser" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] anyhow = { version = "1.0.71", features = ["backtrace"] } +clap = { version = "4.3.2", features = ["derive"] } +env_logger = "0.10.0" +log = "0.4.18" +scraper = "0.16.0" serde = { version = "1.0.163", features = ["derive"] } serde_json = "1.0.96" +url = "2.3.1" +urlencoding = "2.1.2" + +[profile.release] +debug = true +overflow-checks = true diff --git a/src/bin/simplify_html.rs b/src/bin/simplify_html.rs new file mode 100644 index 0000000..d24c7f5 --- /dev/null +++ b/src/bin/simplify_html.rs @@ -0,0 +1,18 @@ +//! Apply html article simplification to stdin, and write it to stdout. +//! +//! Usage: +//! simplify_html < article.html > simplified.html +use std::io::{stdin, stdout, Read, Write}; + +use om_wikiparser::html::simplify; + +fn main() -> anyhow::Result<()> { + let mut input = String::new(); + stdin().read_to_string(&mut input)?; + + let output = simplify(&input); + + stdout().write_all(output.as_bytes())?; + + Ok(()) +} diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 0000000..9143021 --- /dev/null +++ b/src/html.rs @@ -0,0 +1,68 @@ +use scraper::{ElementRef, Html, Selector}; + +pub fn simplify(html: &str) -> String { + // TODO: handle multiple languages + let bad_sections = [ + "External links", + "Sources", + "See also", + "Bibliography", + "Further reading", + "References", + ]; + + let mut document = Html::parse_document(html); + + // TODO: evaluate this only once + let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap(); + + let mut to_remove = Vec::new(); + + // remove sections + for header in document.select(&headers) { + // TODO: should this join all text nodes? + let Some(title) = header.text().next() else { + continue + }; + if bad_sections.contains(&title) { + to_remove.push(header.id()); + let header_level = header.value().name(); + // strip trailing nodes + for sibling in header.next_siblings() { + if let Some(element) = sibling.value().as_element() { + if element.name() == header_level { + // TODO: should this check for a higher level? + break; + } + } + to_remove.push(sibling.id()); + } + } + } + + for id in to_remove.drain(..) { + if let Some(mut node) = document.tree.get_mut(id) { + node.detach(); + } + } + + // remove elements with no text that isn't whitespace + + for element in document + .root_element() + .descendants() + .filter_map(ElementRef::wrap) + { + if element.text().all(|t| t.trim().is_empty()) { + to_remove.push(element.id()); + } + } + + for id in to_remove.drain(..) { + if let Some(mut node) = document.tree.get_mut(id) { + node.detach(); + } + } + + document.html() +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..5648444 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +pub mod html; +pub mod wm; diff --git a/src/main.rs b/src/main.rs index f811ed2..477f604 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,40 +1,65 @@ // Usage: -// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null +// # prep outputs from map generator +// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt +// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt +// # feed gzipped tarfile +// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \ +// | cargo run --release -- \ +// --wikidata-ids /tmp/wikidata_ids.txt \ +// --wikipedia-urls /tmp/wikipedia_urls.txt \ +// output_dir +use std::{ + fs::File, + io::{stdin, BufRead, BufReader, Write}, + path::PathBuf, +}; -use serde::Deserialize; -use std::io::{self, stdin, BufRead, BufReader, Write}; +use anyhow::bail; +use clap::Parser; +#[macro_use] +extern crate log; -#[derive(Deserialize)] -struct Page { - // TODO: check if CoW has a performance impact - name: String, - date_modified: String, - #[serde(default)] - url: String, - main_entity: Option, - // TODO: see what impact parsing/unescaping/allocating this has - article_body: ArticleBody, - #[serde(default)] - redirects: Vec, -} +use om_wikiparser::{ + html::simplify, + wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page}, +}; -#[derive(Deserialize)] -struct Wikidata { - identifier: String, -} - -#[derive(Deserialize)] -struct ArticleBody { - html: String, -} - -#[derive(Deserialize)] -struct Redirect { - url: String, - name: String, +#[derive(Parser)] +struct Args { + output_dir: PathBuf, + #[arg(long)] + wikidata_ids: Option, + #[arg(long)] + wikipedia_urls: Option, } fn main() -> anyhow::Result<()> { + env_logger::Builder::new() + .filter_level(log::LevelFilter::Info) + .parse_default_env() + .try_init()?; + + let args = Args::parse(); + + info!("Loading urls"); + let wikipedia_titles = args + .wikipedia_urls + .map(parse_wikipedia_file) + .transpose()? + .unwrap_or_default(); + + info!("Loading ids"); + let wikidata_ids = args + .wikidata_ids + .map(parse_wikidata_file) + .transpose()? + .unwrap_or_default(); + + if !args.output_dir.is_dir() { + bail!("output dir {:?} does not exist", args.output_dir) + } + + info!("Processing dump"); let dump = BufReader::new(stdin()); // TODO: compare different deserialization methods @@ -45,10 +70,33 @@ fn main() -> anyhow::Result<()> { .and_then(|s| serde_json::from_str::(&s).map_err(anyhow::Error::new)) }); - let mut stdout = io::stdout(); for page in stream { let page = page?; - writeln!(stdout, "{}", page.name)?; + + if !(is_wikidata_match(&wikidata_ids, &page).is_some() + || is_wikipedia_match(&wikipedia_titles, &page).is_some()) + { + continue; + } + + let Some(qid) = page.main_entity.map(|e| e.identifier) else { + warn!("Page in list but without wikidata qid: {:?}", page.name); + continue; + }; + + let filename = args.output_dir.join(qid).with_extension("html"); + + debug!("{:?}: {:?}", page.name, filename); + + if filename.exists() { + debug!("Exists, skipping"); + continue; + } + + let html = simplify(&page.article_body.html); + + let mut file = File::create(filename)?; + file.write_all(html.as_bytes())?; } Ok(()) diff --git a/src/wm/mod.rs b/src/wm/mod.rs new file mode 100644 index 0000000..ceb39fb --- /dev/null +++ b/src/wm/mod.rs @@ -0,0 +1,177 @@ +//! Wikimedia types +use std::{ + collections::HashSet, + ffi::OsStr, + fs::{self}, + num::ParseIntError, + str::FromStr, +}; + +use anyhow::{anyhow, bail, Context}; + +use url::Url; + +mod page; +pub use page::Page; + +/// Read from a file of urls on each line. +pub fn parse_wikidata_file(path: impl AsRef) -> anyhow::Result> { + let contents = fs::read_to_string(path.as_ref())?; + contents + .lines() + .enumerate() + .map(|(i, line)| { + WikidataQid::from_str(line).with_context(|| { + let line_num = i + 1; + format!("bad QID value on line {line_num}: {line:?}") + }) + }) + .collect() +} + +/// Read article titles from a file of urls on each line. +pub fn parse_wikipedia_file( + path: impl AsRef, +) -> anyhow::Result> { + let contents = fs::read_to_string(path.as_ref())?; + contents + .lines() + .enumerate() + .map(|(i, line)| { + WikipediaTitleNorm::from_url(line).with_context(|| { + let line_num = i + 1; + format!("bad wikipedia url on line {line_num}: {line:?}") + }) + }) + .collect() +} + +pub fn is_wikidata_match(ids: &HashSet, page: &Page) -> Option { + let Some(wikidata) = &page.main_entity else { return None;}; + let wikidata_id = &wikidata.identifier; + let wikidata_id = match WikidataQid::from_str(wikidata_id) { + Ok(qid) => qid, + Err(e) => { + eprintln!("Could not parse QID: {:?}: {}", wikidata_id, e); + return None; + } + }; + + ids.get(&wikidata_id).map(|_| wikidata_id) +} + +pub fn is_wikipedia_match( + titles: &HashSet, + page: &Page, +) -> Option { + // TODO: handle multiple languages + let title = WikipediaTitleNorm::from_title(&page.name, "en"); + + if titles.get(&title).is_some() { + return Some(title); + } + + for redirect in &page.redirects { + let title = WikipediaTitleNorm::from_title(&redirect.name, "en"); + + if titles.get(&title).is_some() { + return Some(title); + } + } + + None +} + +/// Wikidata QID/Q Number +/// +/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID +/// +/// ``` +/// use std::str::FromStr; +/// use om_wikiparser::wm::WikidataQid; +/// +/// let with_q = WikidataQid::from_str("Q12345").unwrap(); +/// let without_q = WikidataQid::from_str("12345").unwrap(); +/// assert_eq!(with_q, without_q); +/// ``` +#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct WikidataQid(u32); + +impl FromStr for WikidataQid { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let s = s.strip_prefix('Q').unwrap_or(s); + u32::from_str(s).map(WikidataQid) + } +} + +/// Normalized wikipedia article title that can compare: +/// - titles `Spatial Database` +/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase` +/// - osm-style tags `en:Spatial Database` +/// +/// ``` +/// use om_wikiparser::wm::WikipediaTitleNorm; +/// +/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title/").unwrap(); +/// let title = WikipediaTitleNorm::from_title("Article Title", "en"); +/// assert_eq!(url, title); +/// ``` +#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct WikipediaTitleNorm { + lang: String, + name: String, +} + +impl WikipediaTitleNorm { + fn normalize_title(title: &str) -> String { + // TODO: compare with generator url creation + title.replace(' ', "_") + } + + // https://en.wikipedia.org/wiki/Article_Title + pub fn from_url(url: &str) -> anyhow::Result { + let url = Url::parse(url)?; + + let (subdomain, host) = url + .host_str() + .ok_or(anyhow!("Expected host"))? + .split_once('.') + .ok_or(anyhow!("Expected subdomain"))?; + if host != "wikipedia.org" { + bail!("Expected wikipedia.org for domain") + } + let lang = subdomain; + + let mut paths = url.path_segments().ok_or(anyhow!("Expected path"))?; + + let root = paths + .next() + .ok_or(anyhow!("Expected first segment in path"))?; + + if root != "wiki" { + bail!("Expected 'wiki' in path") + } + + let title = paths + .next() + .ok_or(anyhow!("Expected second segment in path"))?; + let title = urlencoding::decode(title)?; + + Ok(Self::from_title(&title, lang)) + } + + // en:Article Title + fn _from_osm_tag(tag: &str) -> anyhow::Result { + let (lang, title) = tag.split_once(':').ok_or(anyhow!("Expected ':'"))?; + + Ok(Self::from_title(title, lang)) + } + + pub fn from_title(title: &str, lang: &str) -> Self { + let name = Self::normalize_title(title); + let lang = lang.to_owned(); + Self { name, lang } + } +} diff --git a/src/wm/page.rs b/src/wm/page.rs new file mode 100644 index 0000000..b830fd9 --- /dev/null +++ b/src/wm/page.rs @@ -0,0 +1,36 @@ +use serde::Deserialize; + +/// Deserialized Wikimedia Enterprise API Article +/// +/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/ +#[allow(dead_code)] // TODO: reevaluate fields +#[derive(Deserialize)] +pub struct Page { + // TODO: check if CoW has a performance impact + pub name: String, + pub date_modified: String, + #[serde(default)] + pub url: String, + pub main_entity: Option, + // TODO: see what impact parsing/unescaping/allocating this has + pub article_body: ArticleBody, + #[serde(default)] + pub redirects: Vec, +} + +#[derive(Deserialize)] +pub struct Wikidata { + pub identifier: String, +} + +#[derive(Deserialize)] +pub struct ArticleBody { + pub html: String, +} + +#[allow(dead_code)] // TODO: reevaluate fields +#[derive(Deserialize)] +pub struct Redirect { + pub url: String, + pub name: String, +}