Extract tags in parallel in rust

- Use rayon and osmpbf crates, output intermediate TSV file in the same
  format as osmconvert, for use with the new `--osm-tags` flag.
- Number of threads spawned can be configured with `--procs` flag.
- Replace all wikidata id references with QID.
- Update script and documentation to use new subcommands.
- run.sh now expects a pbf file to extract tags from.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-08 13:12:26 -04:00 committed by Evan Lloyd New-Schmidt
parent b6db70f74c
commit 6d242a62aa
7 changed files with 560 additions and 98 deletions

332
Cargo.lock generated
View file

@ -112,7 +112,7 @@ dependencies = [
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"miniz_oxide 0.6.2",
"object",
"rustc-demangle",
]
@ -123,6 +123,12 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
[[package]]
name = "byteorder"
version = "1.4.3"
@ -160,7 +166,7 @@ checksum = "72394f3339a76daf211e57d4bcb374410f3965dcc606dd0e03738c7888766980"
dependencies = [
"anstream",
"anstyle",
"bitflags",
"bitflags 1.3.2",
"clap_lex",
"strsim",
]
@ -174,7 +180,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.17",
"syn 2.0.28",
]
[[package]]
@ -195,6 +201,58 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "crc32fast"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
dependencies = [
"cfg-if",
]
[[package]]
name = "cssparser"
version = "0.29.6"
@ -277,6 +335,12 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "either"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "env_logger"
version = "0.10.0"
@ -311,6 +375,22 @@ dependencies = [
"libc",
]
[[package]]
name = "fastrand"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
[[package]]
name = "flate2"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [
"crc32fast",
"miniz_oxide 0.7.1",
]
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@ -376,6 +456,12 @@ version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "heck"
version = "0.4.1"
@ -418,6 +504,16 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown",
]
[[package]]
name = "io-lifetimes"
version = "1.0.11"
@ -437,7 +533,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"io-lifetimes",
"rustix",
"rustix 0.37.19",
"windows-sys 0.48.0",
]
@ -449,9 +545,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "libc"
version = "0.2.144"
version = "0.2.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
[[package]]
name = "linux-raw-sys"
@ -459,6 +555,12 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
[[package]]
name = "lock_api"
version = "0.4.9"
@ -507,6 +609,24 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memmap2"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
dependencies = [
"autocfg",
]
[[package]]
name = "miniz_oxide"
version = "0.6.2"
@ -516,6 +636,15 @@ dependencies = [
"adler",
]
[[package]]
name = "miniz_oxide"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [
"adler",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
@ -528,6 +657,16 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "num_cpus"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "object"
version = "0.30.3"
@ -548,6 +687,8 @@ dependencies = [
"env_logger",
"log",
"once_cell",
"osmpbf",
"rayon",
"scraper",
"serde",
"serde_json",
@ -561,6 +702,20 @@ version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
[[package]]
name = "osmpbf"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3bec2671f8eb1e9a353adfe8aafe44c9c5207e0012d469a4b61fb7bf33adf37"
dependencies = [
"byteorder",
"flate2",
"memmap2",
"protobuf",
"protobuf-codegen",
"rayon",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -579,7 +734,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"redox_syscall 0.2.16",
"smallvec",
"windows-sys 0.45.0",
]
@ -702,18 +857,69 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.59"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.28"
name = "protobuf"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
checksum = "b55bad9126f378a853655831eb7363b7b01b81d19f8cb1218861086ca4a1a61e"
dependencies = [
"once_cell",
"protobuf-support",
"thiserror",
]
[[package]]
name = "protobuf-codegen"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dd418ac3c91caa4032d37cb80ff0d44e2ebe637b2fb243b6234bf89cdac4901"
dependencies = [
"anyhow",
"once_cell",
"protobuf",
"protobuf-parse",
"regex",
"tempfile",
"thiserror",
]
[[package]]
name = "protobuf-parse"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d39b14605eaa1f6a340aec7f320b34064feb26c93aec35d6a9a2272a8ddfa49"
dependencies = [
"anyhow",
"indexmap",
"log",
"protobuf",
"protobuf-support",
"tempfile",
"thiserror",
"which",
]
[[package]]
name = "protobuf-support"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5d4d7b8601c814cfb36bcebb79f0e61e45e1e93640cf778837833bbed05c372"
dependencies = [
"thiserror",
]
[[package]]
name = "quote"
version = "1.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
dependencies = [
"proc-macro2",
]
@ -799,13 +1005,44 @@ dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rayon"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"num_cpus",
]
[[package]]
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
@ -846,11 +1083,24 @@ version = "0.37.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"linux-raw-sys 0.3.8",
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399"
dependencies = [
"bitflags 2.3.3",
"errno",
"libc",
"linux-raw-sys 0.4.5",
"windows-sys 0.48.0",
]
@ -889,7 +1139,7 @@ version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"cssparser",
"derive_more",
"fxhash",
@ -924,7 +1174,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.17",
"syn 2.0.28",
]
[[package]]
@ -1011,15 +1261,28 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.17"
version = "2.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall 0.3.5",
"rustix 0.38.7",
"windows-sys 0.48.0",
]
[[package]]
name = "tendril"
version = "0.4.3"
@ -1040,6 +1303,26 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "thiserror"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@ -1129,6 +1412,17 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "which"
version = "4.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
dependencies = [
"either",
"libc",
"once_cell",
]
[[package]]
name = "winapi"
version = "0.3.9"

View file

@ -15,6 +15,8 @@ ego-tree = "0.6.2"
env_logger = "0.10.0"
log = "0.4.18"
once_cell = "1.18.0"
osmpbf = "0.3.1"
rayon = "1.7.0"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"

View file

@ -13,7 +13,7 @@ It defines article sections that are not important for users and should be remov
## Usage
To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
It handles preparing the inputs, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
It handles extracting the tags, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
To run the wikiparser manually or for development, see below.
@ -29,41 +29,64 @@ Run the program with the `--help` flag to see all supported arguments.
```
$ cargo run --release -- --help
Extract article HTML from Wikipedia Enterprise HTML dumps.
Extract articles from Wikipedia Enterprise HTML dumps
Expects an uncompressed dump connected to stdin.
Usage: om-wikiparser <COMMAND>
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
Commands:
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
help Print this message or the help of the given subcommand(s)
Options:
-h, --help Print help (see more with '--help')
-V, --version Print version
```
Each command has its own additional help:
```
$ cargo run -- get-articles --help
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
Usage: om-wikiparser get-articles [OPTIONS] <OUTPUT_DIR>
Arguments:
<OUTPUT_DIR>
Directory to write the extracted articles to
Options:
--write-new-ids <WRITE_NEW_IDS>
--write-new-qids <FILE>
Append to the provided file path the QIDs of articles matched by title but not QID.
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
FILTERS:
--wikidata-ids <WIKIDATA_IDS>
--osm-tags <FILE.tsv>
Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
--wikidata-qids <FILE>
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
--wikipedia-urls <WIKIPEDIA_URLS>
--wikipedia-urls <FILE>
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
```
It takes as inputs:
- A wikidata enterprise JSON dump, extracted and connected to `stdin`.
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
- A directory to write the extracted articles to, as a CLI argument.
- Any number of filters passed:
- A TSV file of wikidata qids and wikipedia urls, created by the `get-tags` command or `osmconvert`, passed as the CLI flag `--osm-tags`.
  - A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-qids`.
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
As an example of manual usage with the map generator:
- Assuming this program is installed to `$PATH` as `om-wikiparser`.
@ -74,7 +97,7 @@ As an example of manual usage with the map generator:
```shell
# Transform intermediate files from generator.
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
cut -f 2 id_to_wikidata.csv > wikidata_qids.txt
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
# Enable backtraces in errors and panics.
export RUST_BACKTRACE=1
@ -83,9 +106,38 @@ export RUST_LOG=om_wikiparser=debug
# Begin extraction.
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
do
tar xzf $dump | om-wikiparser \
--wikidata-ids wikidata_ids.txt \
tar xzf $dump | om-wikiparser get-articles \
    --wikidata-qids wikidata_qids.txt \
--wikipedia-urls wikipedia_urls.txt \
--write-new-qids new_qids.txt \
descriptions/
done
# Extract discovered QIDs.
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
do
tar xzf $dump | om-wikiparser get-articles \
    --wikidata-qids new_qids.txt \
descriptions/
done
```
Alternatively, extract the tags directly from a `.osm.pbf` file (referenced here as `planet-latest.osm.pbf`):
```shell
# Extract tags
om-wikiparser get-tags planet-latest.osm.pbf > osm_tags.tsv
# Begin extraction.
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
do
tar xzf $dump | om-wikiparser get-articles \
--osm-tags osm_tags.tsv \
--write-new-qids new_qids.txt \
descriptions/
done
# Extract discovered QIDs.
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
do
tar xzf $dump | om-wikiparser get-articles \
    --wikidata-qids new_qids.txt \
descriptions/
done
```

41
run.sh
View file

@ -1,17 +1,16 @@
#! /usr/bin/env bash
# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <OSM_FILE.osm.pbf> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.
Arguments:
<BUILD_DIR> An existing directory to place descriptions in.
The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
maps generator must be placed in this directory before running.
The extracted articles will be placed in a `descriptions`
subdirectory within this directory.
The `intermediate_data` subfolder of a maps build directory may
be used for this. The same folder may be used for multiple runs.
<OSM_FILE> An OpenStreetMap dump in PBF format to extract tags from.
<DUMP_FILE> A wikipedia enterprise html dump. These take the form of
`enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
dumps in the same language SHOULD NOT be provided, and will
@ -21,7 +20,7 @@ Options:
-h Print this help screen
1. Builds wikiparser.
2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
2. Extracts wikidata QIDs and wikipedia urls from the OpenStreetMap PBF file (NOTE: this spawns as many threads as there are cores).
3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump file).
For information on running the wikiparser manually, see README.md.
@ -43,8 +42,8 @@ do
done
shift $((OPTIND - 1))
if [ -z "${2-}" ]; then
echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
if [ -z "${3-}" ]; then
echo "BUILD_DIR, OSM_FILE, and at least one DUMP_FILE are required" >&2
echo -n "$USAGE" >&2
exit 1
fi
@ -58,6 +57,13 @@ if [ ! -d "$BUILD_DIR" ]; then
exit 1
fi
OSM_FILE=$(readlink -f -- "$1")
shift
if [ ! -f "$OSM_FILE" ]; then
echo "OSM_FILE '$OSM_FILE' does not exist or is not a file" >&2
exit 1
fi
DUMP_FILES=()
while (( $# > 0 )); do
dump_file="$(readlink -f -- "$1")"
@ -91,16 +97,8 @@ wikiparser=$(pwd)/target/release/om-wikiparser
log "Changing to maps build dir '$BUILD_DIR'"
cd "$BUILD_DIR"
log "Transforming intermediate generator data"
for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
if [ ! -e "$intermediate_file" ]; then
echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
exit 1
fi
done
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
log "Extracting tags from '$OSM_FILE'"
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
# Enable backtraces in errors and panics.
export RUST_BACKTRACE=1
@ -129,10 +127,9 @@ trap 'kill_jobs' SIGINT SIGTERM EXIT
for dump in "${DUMP_FILES[@]}"; do
log "Extracting '$dump'"
tar xzOf "$dump" | "$wikiparser" \
--wikidata-ids wikidata_ids.txt \
--wikipedia-urls wikipedia_urls.txt \
--write-new-ids new_qids.txt \
tar xzOf "$dump" | "$wikiparser" get-articles \
--osm-tags osm_tags.tsv \
--write-new-qids new_qids.txt \
"$OUTPUT_DIR" &
done
@ -142,8 +139,8 @@ log "Beginning extraction of discovered QIDs"
# Extract new qids from other dumps in parallel.
for dump in "${DUMP_FILES[@]}"; do
tar xzOf "$dump" | "$wikiparser" \
--wikidata-ids new_qids.txt \
tar xzOf "$dump" | "$wikiparser" get-articles \
--wikidata-qids new_qids.txt \
"$OUTPUT_DIR" &
done

View file

@ -12,7 +12,7 @@ use om_wikiparser::{
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
#[derive(clap::Args)]
@ -22,27 +22,27 @@ pub struct Args {
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
/// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
#[arg(long, help_heading = "FILTERS")]
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
#[arg(long, help_heading = "FILTERS", value_name = "FILE.tsv")]
pub osm_tags: Option<PathBuf>,
/// Path to file that contains a Wikidata QID to extract on each line
/// (e.g. `Q12345`).
#[arg(long, help_heading = "FILTERS")]
pub wikidata_ids: Option<PathBuf>,
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
pub wikidata_qids: Option<PathBuf>,
/// Path to file that contains a Wikipedia article url to extract on each line
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
#[arg(long, help_heading = "FILTERS")]
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
pub wikipedia_urls: Option<PathBuf>,
/// Append to the provided file path the QIDs of articles matched by title but not QID.
///
/// Use this to save the QIDs of articles you know the url of, but not the QID.
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
/// The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump.
/// Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
#[arg(long, requires("wikipedia_urls"))]
pub write_new_ids: Option<PathBuf>,
#[arg(long, value_name = "FILE")]
pub write_new_qids: Option<PathBuf>,
}
pub fn run(args: Args) -> anyhow::Result<()> {
@ -53,8 +53,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
Default::default()
};
let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
info!("Loading wikidata ids from {path:?}");
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
info!("Loading wikidata QIDs from {path:?}");
parse_wikidata_file(path)?
} else {
Default::default()
@ -62,11 +62,11 @@ pub fn run(args: Args) -> anyhow::Result<()> {
if let Some(path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?;
}
debug!("Parsed {} unique article urls", wikipedia_titles.len());
debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
debug!("Parsed {} unique article titles", wikipedia_titles.len());
debug!("Parsed {} unique wikidata QIDs", wikidata_qids.len());
// NOTE: For atomic writes to the same file across threads/processes:
// - The file needs to be opened in APPEND mode (`.append(true)`).
@ -77,8 +77,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
let mut write_new_ids = args
.write_new_ids
let mut write_new_qids = args
.write_new_qids
.as_ref()
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
@ -105,7 +105,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
let is_wikidata_match = qid
.as_ref()
.map(|qid| wikidata_ids.contains(qid))
.map(|qid| wikidata_qids.contains(qid))
.unwrap_or_default();
let matching_titles = if wikipedia_titles.is_empty() {
@ -127,16 +127,16 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
// Write matched new QIDs back to file.
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
if !is_wikidata_match && !matching_titles.is_empty() {
debug!("Writing new id {} for article {:?}", qid, page.name);
// NOTE: Write to string buffer first to have a single atomic write syscall.
// See `write_new_ids` for more info.
// See `write_new_qids` for more info.
let line = format!("{}\n", qid);
write!(f, "{}", line).with_context(|| {
format!(
"writing new id to file {:?}",
args.write_new_ids.as_ref().unwrap()
"writing new QID to file {:?}",
args.write_new_qids.as_ref().unwrap()
)
})?;
}

93
src/get_tags.rs Normal file
View file

@ -0,0 +1,93 @@
use std::{
io::{stdout, Read},
sync::mpsc,
thread,
};
use osmpbf::{BlobDecode, BlobReader, Element};
use rayon::prelude::*;
/// One output row: an OSM element id plus its wiki-related tag values.
///
/// Serialized by `write` as one TSV line; fields that had no matching
/// tag are left as empty strings (see `make_record`).
struct Record {
    // OSM element id, stringified; emitted under the `@id` column.
    id: String,
    // Trimmed value of the element's `wikidata` tag, or empty if absent.
    wikidata: String,
    // Trimmed value of the element's `wikipedia` tag, or empty if absent.
    wikipedia: String,
}
/// Extract matching tags from an osm pbf file and write to stdout in TSV.
///
/// PBF blobs are decoded in parallel on the rayon thread pool (via
/// `par_bridge`), while a single dedicated "writer" thread drains a channel
/// and serializes records to stdout, so only one thread ever touches the
/// output. Because blobs are processed in parallel, the order of records in
/// the output is unspecified.
///
/// # Errors
/// Returns an error if the writer thread cannot be spawned, if a blob fails
/// to read or decode, or if the writer fails to serialize a record.
pub fn run(pbf: impl Read + Send) -> anyhow::Result<()> {
    let reader = BlobReader::new(pbf);
    // Bounded channel (128 records) provides backpressure: the parallel
    // decoders block rather than running arbitrarily far ahead of the
    // single writer.
    let (send, recv) = mpsc::sync_channel(128);
    let writer_thread = thread::Builder::new()
        .name("writer".to_string())
        .spawn(move || write(recv))?;
    reader
        .par_bridge()
        // `move` transfers ownership of `send` into the closure: when the
        // parallel loop finishes (or bails on an error), the sender is
        // dropped, the channel disconnects, and the writer's receive loop
        // terminates. Without the `move`, the writer would never see EOF.
        .try_for_each(move |blob| -> anyhow::Result<()> {
            // Based on `osmpbf` implementation of `ElementReader`.
            // Non-data blobs (e.g. headers) are skipped.
            let BlobDecode::OsmData(block) = blob?.decode()? else { return Ok(()) };
            for record in block.elements().filter_map(extract_tags) {
                send.send(record)?;
            }
            Ok(())
        })?;
    // Propagate any error from the writer thread; the `unwrap` only fires
    // if the writer panicked, which would itself be a bug.
    let record_count = writer_thread.join().unwrap()?;
    info!("Finished processing {record_count} records");
    Ok(())
}
/// Drain `recv` and serialize each record to stdout as one TSV row.
///
/// Intended to run on a dedicated thread so only one writer touches stdout.
/// The loop ends when every sender has been dropped and the channel is
/// drained. Returns the number of records written, excluding the header.
///
/// # Errors
/// Returns an error if writing the header, a record, or the final flush
/// fails.
fn write(recv: mpsc::Receiver<Record>) -> anyhow::Result<usize> {
    let mut output = csv::WriterBuilder::new()
        .delimiter(b'\t')
        .from_writer(stdout().lock());
    // Header row, matching the osmconvert-style TSV format described in
    // the README.
    output.write_record(["@id", "wikidata", "wikipedia"])?;
    let mut count = 0;
    for Record {
        id,
        wikidata,
        wikipedia,
    } in recv
    {
        output.write_record([id, wikidata, wikipedia])?;
        count += 1;
    }
    // `csv::Writer` flushes when dropped but silently swallows any error
    // there; flush explicitly so write failures surface as an `Err`.
    output.flush()?;
    Ok(count)
}
/// Pull the wikidata/wikipedia tags out of a single OSM element.
///
/// Every `Element` variant exposes `id()` and `tags()`, but each through
/// its own concrete type, so the variants are unpacked individually before
/// delegating to `make_record`. Returns `None` when the element has
/// neither tag.
fn extract_tags(el: Element) -> Option<Record> {
    match el {
        // Nodes appear in two PBF encodings (plain and dense); both carry
        // the same logical id/tag data.
        Element::Node(n) => make_record(n.id(), n.tags()),
        Element::DenseNode(n) => make_record(n.id(), n.tags()),
        Element::Way(w) => make_record(w.id(), w.tags()),
        Element::Relation(r) => make_record(r.id(), r.tags()),
    }
}
/// Build a `Record` from an element id and its tag iterator.
///
/// Keeps only the `wikidata` and `wikipedia` tag values, trimmed of
/// surrounding whitespace. If a tag key appears more than once, the last
/// occurrence wins. Returns `None` when the element carries neither tag.
fn make_record<'i>(id: i64, tags: impl 'i + Iterator<Item = (&'i str, &'i str)>) -> Option<Record> {
    let mut wikidata = String::new();
    let mut wikipedia = String::new();
    for (key, value) in tags {
        // Route the value to the matching field; ignore all other keys.
        let slot = match key {
            "wikidata" => &mut wikidata,
            "wikipedia" => &mut wikipedia,
            _ => continue,
        };
        *slot = value.trim().to_owned();
    }
    if wikidata.is_empty() && wikipedia.is_empty() {
        None
    } else {
        Some(Record {
            id: id.to_string(),
            wikidata,
            wikipedia,
        })
    }
}

View file

@ -1,24 +1,20 @@
use std::{
fs::File,
io::{stdin, stdout, BufReader, Read, Write},
num::NonZeroUsize,
path::PathBuf,
};
use clap::{CommandFactory, Parser, Subcommand};
#[macro_use]
extern crate log;
mod get_articles;
mod get_tags;
/// Get the version returned by `git describe`, e.g.:
/// - `v2.0` if a git tag
/// - the commit hash `034ac04` if not a tag
/// - `034ac04-dirty` if uncommited changes are present,
/// or the crate version if not available (if installed from crates.io).
///
/// See `build.rs` file for more info.
fn version() -> &'static str {
option_env!("CARGO_GIT_VERSION")
.or(option_env!("CARGO_PKG_VERSION"))
.unwrap_or("unknown")
}
/// Extract articles from Wikipedia Enterprise HTML dumps.
#[derive(Parser)]
#[command(version = crate::version())]
#[command(author, version, about, long_about, version = crate::version())]
struct Args {
#[command(subcommand)]
cmd: Cmd,
@ -31,7 +27,16 @@ enum Cmd {
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
///
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
GetTags,
GetTags {
/// The `.osm.pbf` file to use.
pbf_file: PathBuf,
/// The number of threads to spawn to parse and decompress the pbf file.
///
/// Defaults to the number of cores.
#[arg(short, long)]
procs: Option<NonZeroUsize>,
},
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
///
@ -57,24 +62,30 @@ fn main() -> anyhow::Result<()> {
match args.cmd {
Cmd::GetArticles(args) => {
if args.wikidata_ids.is_none()
if args.wikidata_qids.is_none()
&& args.wikipedia_urls.is_none()
&& args.osm_tags.is_none()
{
let mut cmd = Args::command();
cmd.error(
clap::error::ErrorKind::MissingRequiredArgument,
"at least one of --osm-tags --wikidata-ids --wikipedia-urls is required",
"at least one of --osm-tags --wikidata-qids --wikipedia-urls is required",
)
.exit()
}
get_articles::run(args)
}
Cmd::GetTags => todo!(),
Cmd::Simplify { lang } => {
use std::io::{stdin, stdout, Read, Write};
Cmd::GetTags { pbf_file, procs } => {
rayon::ThreadPoolBuilder::new()
.thread_name(|num| format!("worker{num}"))
.num_threads(procs.map(usize::from).unwrap_or_default())
.build_global()?;
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
get_tags::run(pbf_file)
}
Cmd::Simplify { lang } => {
let mut input = String::new();
stdin().read_to_string(&mut input)?;
@ -86,3 +97,16 @@ fn main() -> anyhow::Result<()> {
}
}
}
/// Get the version returned by `git describe`, e.g.:
/// - `v2.0` if a git tag
/// - the commit hash `034ac04` if not a tag
/// - `034ac04-dirty` if uncommitted changes are present,
/// or the crate version if not available (if installed from crates.io),
/// falling back to `"unknown"` if neither build-time variable was set.
///
/// See `build.rs` file for more info.
fn version() -> &'static str {
    match option_env!("CARGO_GIT_VERSION") {
        Some(git_version) => git_version,
        None => option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"),
    }
}