Initial rust setup #1

Merged
newsch merged 2 commits from rust-init into main 2023-05-30 17:00:05 +00:00
5 changed files with 257 additions and 0 deletions
Showing only changes of commit bf08579dc4 - Show all commits

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
target/

183
Cargo.lock generated Normal file
View file

@ -0,0 +1,183 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "addr2line"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
dependencies = [
"gimli",
]
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "anyhow"
version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
dependencies = [
"backtrace",
]
[[package]]
name = "backtrace"
version = "0.3.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
]
[[package]]
name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "gimli"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
[[package]]
name = "itoa"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "libc"
version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "miniz_oxide"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
dependencies = [
"adler",
]
[[package]]
name = "object"
version = "0.30.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
dependencies = [
"memchr",
]
[[package]]
name = "om-wikiparser"
version = "0.0.0"
dependencies = [
"anyhow",
"serde",
"serde_json",
]
[[package]]
name = "proc-macro2"
version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "ryu"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "serde"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "syn"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

13
Cargo.toml Normal file
View file

@ -0,0 +1,13 @@
# Manifest for the `om-wikiparser` binary crate.
[package]
name = "om-wikiparser"
version = "0.0.0"
license = "AGPL-3.0-or-later"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Application-level error type used as the return type of `main`.
# NOTE(review): the `backtrace` feature pulls in the `backtrace` crate; presumably
# enabled so reported errors carry a stack trace — confirm it is still wanted.
anyhow = { version = "1.0.71", features = ["backtrace"] }
# The `derive` feature provides `#[derive(Deserialize)]`, used on the dump record types.
serde = { version = "1.0.163", features = ["derive"] }
# JSON decoding of the dump's records.
serde_json = "1.0.96"

3
README.md Normal file
View file

@ -0,0 +1,3 @@
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
# wikiparser
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```
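_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._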
biodranik commented 2023-05-30 16:42:10 +00:00 (Migrated from github.com)
Review

... for later embedding into mwm map files created by Organic Maps generator (link here)

... for later embedding into mwm map files created by Organic Maps generator (link here)
newsch commented 2023-05-30 16:55:07 +00:00 (Migrated from github.com)
Review

How's this?

_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
How's this? ```suggestion _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ ```

57
src/main.rs Normal file
View file

@ -0,0 +1,57 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
use serde::Deserialize;
use std::{
io::{self, stdin, BufRead, BufReader, Write},
};
/// A single article record, deserialized from one JSON object of the dump.
///
/// Only the keys declared here are extracted; serde ignores any other keys
/// present in the object because `deny_unknown_fields` is not set.
#[derive(Deserialize)]
struct Page {
    // TODO: check if CoW has a performance impact
    /// Required. This is the value `main` prints for every page.
    name: String,

    /// Required. Kept as the raw string from the JSON; it is not parsed here.
    date_modified: String,

    /// Optional in the input: an absent key yields an empty string.
    #[serde(default)]
    url: String,

    /// `None` when the key is absent or `null`.
    main_entity: Option<Wikidata>,

    // TODO: see what impact parsing/unescaping/allocating this has
    /// Required nested object holding the article markup.
    article_body: ArticleBody,

    /// Optional in the input: an absent key yields an empty list.
    #[serde(default)]
    redirects: Vec<Redirect>,
}
/// Value of a page's `main_entity` key.
#[derive(Deserialize)]
struct Wikidata {
    /// Required string key of the entity object.
    /// NOTE(review): presumably the Wikidata item id (e.g. a `Q…` id) — confirm against the dump schema.
    identifier: String,
}
/// Value of a page's `article_body` key.
#[derive(Deserialize)]
struct ArticleBody {
    /// Required. The whole value is unescaped into an owned `String` on deserialization.
    html: String,
}
/// One element of a page's `redirects` list.
#[derive(Deserialize)]
struct Redirect {
    /// Required string key of the redirect entry.
    url: String,

    /// Required string key of the redirect entry.
    name: String,
}
fn main() -> anyhow::Result<()> {
let dump = BufReader::new(stdin());
// TODO: compare different deserialization methods
// docs warn against using a reader directly, and it's slower than tar can decompress the dump
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let stream = dump.lines().map(|r| {
r.map_err(anyhow::Error::new)
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut stdout = io::stdout();
for page in stream {
let page = page?;
writeln!(stdout, "{}", page.name)?;
}
Ok(())
}