diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..089cfe5 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,183 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "anyhow" +version = "1.0.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +dependencies = [ + "backtrace", +] + +[[package]] +name = "backtrace" +version = "0.3.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "gimli" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" + +[[package]] +name = "itoa" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" + +[[package]] +name = "libc" +version = "0.2.144" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "miniz_oxide" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +dependencies = [ + "adler", +] + +[[package]] +name = "object" +version = "0.30.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439" +dependencies = [ + "memchr", +] + +[[package]] +name = "om-wikiparser" +version = "0.0.0" +dependencies = [ + "anyhow", + "serde", + "serde_json", +] + +[[package]] +name = "proc-macro2" +version = "1.0.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "ryu" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" + +[[package]] +name = "serde" +version = "1.0.163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.96" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0a2f9cb --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "om-wikiparser" +version = "0.0.0" +license = "AGPL-3.0-or-later" +edition = "2021" +repository = "https://github.com/organicmaps/wikiparser/" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = { version = "1.0.71", features = ["backtrace"] } +serde = { version = "1.0.163", features = ["derive"] } +serde_json = "1.0.96" diff --git a/README.md b/README.md new file mode 100644 index 0000000..2c6ee28 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# wikiparser + +_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..e58c983 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,57 @@ +// Usage: +// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null + +use serde::Deserialize; +use std::{ + io::{self, stdin, BufRead, BufReader, Write}, +}; + +#[derive(Deserialize)] +struct Page { + // TODO: check if CoW has a performance impact + name: String, + date_modified: String, + #[serde(default)] + url: String, + main_entity: Option, + // TODO: see what impact parsing/unescaping/allocating this has + article_body: ArticleBody, + #[serde(default)] + redirects: Vec, +} + +#[derive(Deserialize)] +struct Wikidata { + identifier: String, +} + +#[derive(Deserialize)] +struct ArticleBody { + html: String, +} + +#[derive(Deserialize)] +struct Redirect { + url: String, + name: String, +} + +fn main() -> anyhow::Result<()> { + let dump = BufReader::new(stdin()); + + // TODO: compare different deserialization methods + // docs warn against using a reader directly, and it's slower than tar can decompress the dump + // let stream = serde_json::Deserializer::from_reader(dump).into_iter::(); + let stream = dump.lines().map(|r| { + r.map_err(anyhow::Error::new) + .and_then(|s| serde_json::from_str::(&s).map_err(anyhow::Error::new)) + }); + + let mut stdout = io::stdout(); + for page in stream { + let page = page?; + writeln!(stdout, "{}", page.name)?; + } + + Ok(()) +}