Initial rust setup #1

Merged
newsch merged 2 commits from rust-init into main 2023-05-30 17:00:05 +00:00
5 changed files with 257 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
target/

183
Cargo.lock generated Normal file
View file

@ -0,0 +1,183 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "addr2line"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
dependencies = [
"gimli",
]
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "anyhow"
version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
dependencies = [
"backtrace",
]
[[package]]
name = "backtrace"
version = "0.3.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
]
[[package]]
name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "gimli"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
[[package]]
name = "itoa"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "libc"
version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "miniz_oxide"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
dependencies = [
"adler",
]
[[package]]
name = "object"
version = "0.30.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
dependencies = [
"memchr",
]
[[package]]
name = "om-wikiparser"
version = "0.0.0"
dependencies = [
"anyhow",
"serde",
"serde_json",
]
[[package]]
name = "proc-macro2"
version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "ryu"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "serde"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "syn"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

13
Cargo.toml Normal file
View file

@ -0,0 +1,13 @@
[package]
name = "om-wikiparser"
version = "0.0.0"
license = "AGPL-3.0-or-later"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"

3
README.md Normal file
View file

@ -0,0 +1,3 @@
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
# wikiparser
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```

57
src/main.rs Normal file
View file

@ -0,0 +1,57 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
use serde::Deserialize;
use std::{
io::{self, stdin, BufRead, BufReader, Write},
};
#[derive(Deserialize)]
struct Page {
// TODO: check if CoW has a performance impact
name: String,
date_modified: String,
#[serde(default)]
url: String,
main_entity: Option<Wikidata>,
// TODO: see what impact parsing/unescaping/allocating this has
article_body: ArticleBody,
#[serde(default)]
redirects: Vec<Redirect>,
}
#[derive(Deserialize)]
struct Wikidata {
identifier: String,
}
#[derive(Deserialize)]
struct ArticleBody {
html: String,
}
#[derive(Deserialize)]
struct Redirect {
url: String,
name: String,
}
fn main() -> anyhow::Result<()> {
let dump = BufReader::new(stdin());
// TODO: compare different deserialization methods
// docs warn against using a reader directly, and it's slower than tar can decompress the dump
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let stream = dump.lines().map(|r| {
r.map_err(anyhow::Error::new)
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut stdout = io::stdout();
for page in stream {
let page = page?;
writeln!(stdout, "{}", page.name)?;
}
Ok(())
}