Extract tags in parallel in rust
- Use rayon and osmpbf crates, output intermediate TSV file in the same format as osmconvert, for use with the new `--osm-tags` flag. - Number of threads spawned can be configured with `--procs` flag. - Replace all wikidata id references with QID. - Update script and documentation to use new subcommands. - run.sh now expects a pbf file to extract tags from. Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
b6db70f74c
commit
6d242a62aa
7 changed files with 560 additions and 98 deletions
332
Cargo.lock
generated
332
Cargo.lock
generated
|
@ -112,7 +112,7 @@ dependencies = [
|
|||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"miniz_oxide 0.6.2",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
|
@ -123,6 +123,12 @@ version = "1.3.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
|
@ -160,7 +166,7 @@ checksum = "72394f3339a76daf211e57d4bcb374410f3965dcc606dd0e03738c7888766980"
|
|||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
@ -174,7 +180,7 @@ dependencies = [
|
|||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -195,6 +201,58 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.29.6"
|
||||
|
@ -277,6 +335,12 @@ version = "0.6.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
|
@ -311,6 +375,22 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
|
@ -376,6 +456,12 @@ version = "0.27.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
|
@ -418,6 +504,16 @@ dependencies = [
|
|||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
|
@ -437,7 +533,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
|||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix",
|
||||
"rustix 0.37.19",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -449,9 +545,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.144"
|
||||
version = "0.2.147"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
|
||||
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
|
@ -459,6 +555,12 @@ version = "0.3.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
|
@ -507,6 +609,24 @@ version = "2.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memmap2"
|
||||
version = "0.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.6.2"
|
||||
|
@ -516,6 +636,15 @@ dependencies = [
|
|||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.4"
|
||||
|
@ -528,6 +657,16 @@ version = "0.1.14"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.30.3"
|
||||
|
@ -548,6 +687,8 @@ dependencies = [
|
|||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
"osmpbf",
|
||||
"rayon",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -561,6 +702,20 @@ version = "1.18.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
|
||||
|
||||
[[package]]
|
||||
name = "osmpbf"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3bec2671f8eb1e9a353adfe8aafe44c9c5207e0012d469a4b61fb7bf33adf37"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"flate2",
|
||||
"memmap2",
|
||||
"protobuf",
|
||||
"protobuf-codegen",
|
||||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
|
@ -579,7 +734,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
@ -702,18 +857,69 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.59"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.28"
|
||||
name = "protobuf"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
|
||||
checksum = "b55bad9126f378a853655831eb7363b7b01b81d19f8cb1218861086ca4a1a61e"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"protobuf-support",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dd418ac3c91caa4032d37cb80ff0d44e2ebe637b2fb243b6234bf89cdac4901"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"once_cell",
|
||||
"protobuf",
|
||||
"protobuf-parse",
|
||||
"regex",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-parse"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d39b14605eaa1f6a340aec7f320b34064feb26c93aec35d6a9a2272a8ddfa49"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"indexmap",
|
||||
"log",
|
||||
"protobuf",
|
||||
"protobuf-support",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-support"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5d4d7b8601c814cfb36bcebb79f0e61e45e1e93640cf778837833bbed05c372"
|
||||
dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
@ -799,13 +1005,44 @@ dependencies = [
|
|||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -846,11 +1083,24 @@ version = "0.37.19"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"linux-raw-sys 0.3.8",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.5",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -889,7 +1139,7 @@ version = "0.24.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
|
@ -924,7 +1174,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1011,15 +1261,28 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.17"
|
||||
version = "2.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
|
||||
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.38.7",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
|
@ -1040,6 +1303,26 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1129,6 +1412,17 @@ version = "0.11.0+wasi-snapshot-preview1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "4.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
|
||||
dependencies = [
|
||||
"either",
|
||||
"libc",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
|
|
@ -15,6 +15,8 @@ ego-tree = "0.6.2"
|
|||
env_logger = "0.10.0"
|
||||
log = "0.4.18"
|
||||
once_cell = "1.18.0"
|
||||
osmpbf = "0.3.1"
|
||||
rayon = "1.7.0"
|
||||
scraper = "0.16.0"
|
||||
serde = { version = "1.0.163", features = ["derive"] }
|
||||
serde_json = "1.0.96"
|
||||
|
|
84
README.md
84
README.md
|
@ -13,7 +13,7 @@ It defines article sections that are not important for users and should be remov
|
|||
## Usage
|
||||
|
||||
To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
|
||||
It handles preparing the inputs, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
It handles extracting the tags, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
|
||||
To run the wikiparser manually or for development, see below.
|
||||
|
||||
|
@ -29,41 +29,64 @@ Run the program with the `--help` flag to see all supported arguments.
|
|||
|
||||
```
|
||||
$ cargo run --release -- --help
|
||||
Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
Extract articles from Wikipedia Enterprise HTML dumps
|
||||
|
||||
Expects an uncompressed dump connected to stdin.
|
||||
Usage: om-wikiparser <COMMAND>
|
||||
|
||||
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
|
||||
Commands:
|
||||
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
|
||||
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
|
||||
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
-h, --help Print help (see more with '--help')
|
||||
-V, --version Print version
|
||||
```
|
||||
|
||||
Each command has its own additional help:
|
||||
|
||||
```
|
||||
$ cargo run -- get-articles --help
|
||||
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
|
||||
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
|
||||
Usage: om-wikiparser get-articles [OPTIONS] <OUTPUT_DIR>
|
||||
|
||||
Arguments:
|
||||
<OUTPUT_DIR>
|
||||
Directory to write the extracted articles to
|
||||
|
||||
Options:
|
||||
--write-new-ids <WRITE_NEW_IDS>
|
||||
--write-new-qids <FILE>
|
||||
Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
|
||||
-h, --help
|
||||
Print help (see a summary with '-h')
|
||||
|
||||
-V, --version
|
||||
Print version
|
||||
|
||||
FILTERS:
|
||||
--wikidata-ids <WIKIDATA_IDS>
|
||||
--osm-tags <FILE.tsv>
|
||||
Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
|
||||
This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
|
||||
--wikidata-qids <FILE>
|
||||
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
|
||||
|
||||
--wikipedia-urls <WIKIPEDIA_URLS>
|
||||
--wikipedia-urls <FILE>
|
||||
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
|
||||
```
|
||||
|
||||
It takes as inputs:
|
||||
- A Wikipedia Enterprise HTML dump (newline-delimited JSON), extracted and connected to `stdin`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
- A directory to write the extracted articles to, as a CLI argument.
|
||||
- Any number of filters passed:
|
||||
- A TSV file of wikidata qids and wikipedia urls, created by the `get-tags` command or `osmconvert`, passed as the CLI flag `--osm-tags`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-qids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
|
||||
As an example of manual usage with the map generator:
|
||||
- Assuming this program is installed to `$PATH` as `om-wikiparser`.
|
||||
|
@ -74,7 +97,7 @@ As an example of manual usage with the map generator:
|
|||
|
||||
```shell
|
||||
# Transform intermediate files from generator.
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_qids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -83,9 +106,38 @@ export RUST_LOG=om_wikiparser=debug
|
|||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
tar xzOf $dump | om-wikiparser get-articles \
|
||||
--wikidata-qids wikidata_qids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzOf $dump | om-wikiparser get-articles \
|
||||
--wikidata-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
||||
Alternatively, extract the tags directly from a `.osm.pbf` file (referenced here as `planet-latest.osm.pbf`):
|
||||
```shell
|
||||
# Extract tags
|
||||
om-wikiparser get-tags planet-latest.osm.pbf > osm_tags.tsv
|
||||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzOf $dump | om-wikiparser get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzOf $dump | om-wikiparser get-articles \
|
||||
--wikidata-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
|
41
run.sh
41
run.sh
|
@ -1,17 +1,16 @@
|
|||
#! /usr/bin/env bash
|
||||
# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <OSM_FILE.osm.pbf> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
|
||||
A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.
|
||||
|
||||
Arguments:
|
||||
<BUILD_DIR> An existing directory to place descriptions in.
|
||||
The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
|
||||
maps generator must be placed in this directory before running.
|
||||
The extracted articles will be placed in a `descriptions`
|
||||
subdirectory within this directory.
|
||||
The `intermediate_data` subfolder of a maps build directory may
|
||||
be used for this. The same folder may be used for multiple runs.
|
||||
<OSM_FILE> An OpenStreetMap dump in PBF format to extract tags from.
|
||||
<DUMP_FILE> A wikipedia enterprise html dump. These take the form of
|
||||
`enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
|
||||
dumps in the same language SHOULD NOT be provided, and will
|
||||
|
@ -21,7 +20,7 @@ Options:
|
|||
-h Print this help screen
|
||||
|
||||
1. Builds wikiparser.
|
||||
2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
|
||||
2. Extracts wikidata qids and wikipedia urls from OpenStreetMap pbf file (NOTE: this spawns as many threads as there are cores).
|
||||
3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump file).
|
||||
|
||||
For information on running the wikiparser manually, see README.md.
|
||||
|
@ -43,8 +42,8 @@ do
|
|||
done
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
if [ -z "${2-}" ]; then
|
||||
echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
|
||||
if [ -z "${3-}" ]; then
|
||||
echo "BUILD_DIR, OSM_FILE, and at least one DUMP_FILE are required" >&2
|
||||
echo -n "$USAGE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
@ -58,6 +57,13 @@ if [ ! -d "$BUILD_DIR" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
OSM_FILE=$(readlink -f -- "$1")
|
||||
shift
|
||||
if [ ! -f "$OSM_FILE" ]; then
|
||||
echo "OSM_FILE '$OSM_FILE' does not exist or is not a file" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DUMP_FILES=()
|
||||
while (( $# > 0 )); do
|
||||
dump_file="$(readlink -f -- "$1")"
|
||||
|
@ -91,16 +97,8 @@ wikiparser=$(pwd)/target/release/om-wikiparser
|
|||
log "Changing to maps build dir '$BUILD_DIR'"
|
||||
cd "$BUILD_DIR"
|
||||
|
||||
log "Transforming intermediate generator data"
|
||||
for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
|
||||
if [ ! -e "$intermediate_file" ]; then
|
||||
echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
log "Extracting tags from '$OSM_FILE'"
|
||||
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
|
||||
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -129,10 +127,9 @@ trap 'kill_jobs' SIGINT SIGTERM EXIT
|
|||
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
log "Extracting '$dump'"
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
@ -142,8 +139,8 @@ log "Beginning extraction of discovered QIDs"
|
|||
|
||||
# Extract new qids from other dumps in parallel.
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--wikidata-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ use om_wikiparser::{
|
|||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
|
||||
};
|
||||
|
||||
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
#[derive(clap::Args)]
|
||||
|
@ -22,27 +22,27 @@ pub struct Args {
|
|||
|
||||
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
///
|
||||
/// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE.tsv")]
|
||||
pub osm_tags: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikidata QID to extract on each line
|
||||
/// (e.g. `Q12345`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
pub wikidata_ids: Option<PathBuf>,
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikidata_qids: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikipedia article url to extract on each line
|
||||
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikipedia_urls: Option<PathBuf>,
|
||||
|
||||
/// Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
///
|
||||
/// Use this to save the QIDs of articles you know the url of, but not the QID.
|
||||
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
/// The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump.
|
||||
/// Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, requires("wikipedia_urls"))]
|
||||
pub write_new_ids: Option<PathBuf>,
|
||||
#[arg(long, value_name = "FILE")]
|
||||
pub write_new_qids: Option<PathBuf>,
|
||||
}
|
||||
|
||||
pub fn run(args: Args) -> anyhow::Result<()> {
|
||||
|
@ -53,8 +53,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
Default::default()
|
||||
};
|
||||
|
||||
let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
|
||||
info!("Loading wikidata ids from {path:?}");
|
||||
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
|
||||
info!("Loading wikidata QIDs from {path:?}");
|
||||
parse_wikidata_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
|
@ -62,11 +62,11 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
|
||||
if let Some(path) = args.osm_tags {
|
||||
info!("Loading wikipedia/wikidata osm tags from {path:?}");
|
||||
parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
|
||||
parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?;
|
||||
}
|
||||
|
||||
debug!("Parsed {} unique article urls", wikipedia_titles.len());
|
||||
debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
|
||||
debug!("Parsed {} unique article titles", wikipedia_titles.len());
|
||||
debug!("Parsed {} unique wikidata QIDs", wikidata_qids.len());
|
||||
|
||||
// NOTE: For atomic writes to the same file across threads/processes:
|
||||
// - The file needs to be opened in APPEND mode (`.append(true)`).
|
||||
|
@ -77,8 +77,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
|
||||
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
|
||||
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
|
||||
let mut write_new_ids = args
|
||||
.write_new_ids
|
||||
let mut write_new_qids = args
|
||||
.write_new_qids
|
||||
.as_ref()
|
||||
.map(|p| File::options().create(true).append(true).open(p))
|
||||
.transpose()?;
|
||||
|
@ -105,7 +105,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_ids.contains(qid))
|
||||
.map(|qid| wikidata_qids.contains(qid))
|
||||
.unwrap_or_default();
|
||||
|
||||
let matching_titles = if wikipedia_titles.is_empty() {
|
||||
|
@ -127,16 +127,16 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
}
|
||||
|
||||
// Write matched new QIDs back to file.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_ids` for more info.
|
||||
// See `write_new_qids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
write!(f, "{}", line).with_context(|| {
|
||||
format!(
|
||||
"writing new id to file {:?}",
|
||||
args.write_new_ids.as_ref().unwrap()
|
||||
"writing new QID to file {:?}",
|
||||
args.write_new_qids.as_ref().unwrap()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
|
93
src/get_tags.rs
Normal file
93
src/get_tags.rs
Normal file
|
@ -0,0 +1,93 @@
|
|||
use std::{
|
||||
io::{stdout, Read},
|
||||
sync::mpsc,
|
||||
thread,
|
||||
};
|
||||
|
||||
use osmpbf::{BlobDecode, BlobReader, Element};
|
||||
use rayon::prelude::*;
|
||||
|
||||
/// One output row: the wikidata/wikipedia tags extracted from a single OSM element.
///
/// Fields are stored as strings because they are written verbatim as TSV columns.
/// A tag that was not present on the element is represented by an empty string.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Record {
    /// The element's OSM id, formatted as a decimal string.
    id: String,
    /// Trimmed value of the `wikidata` tag, or empty.
    wikidata: String,
    /// Trimmed value of the `wikipedia` tag, or empty.
    wikipedia: String,
}
|
||||
|
||||
/// Extract matching tags from an osm pbf file and write to stdout in TSV.
|
||||
pub fn run(pbf: impl Read + Send) -> anyhow::Result<()> {
|
||||
let reader = BlobReader::new(pbf);
|
||||
|
||||
let (send, recv) = mpsc::sync_channel(128);
|
||||
let writer_thread = thread::Builder::new()
|
||||
.name("writer".to_string())
|
||||
.spawn(move || write(recv))?;
|
||||
|
||||
reader
|
||||
.par_bridge()
|
||||
.try_for_each(move |blob| -> anyhow::Result<()> {
|
||||
// Based on `osmpbf` implementation of `ElementReader`.
|
||||
let BlobDecode::OsmData(block) = blob?.decode()? else { return Ok(()) };
|
||||
for record in block.elements().filter_map(extract_tags) {
|
||||
send.send(record)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let record_count = writer_thread.join().unwrap()?;
|
||||
info!("Finished processing {record_count} records");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write(recv: mpsc::Receiver<Record>) -> anyhow::Result<usize> {
|
||||
let mut output = csv::WriterBuilder::new()
|
||||
.delimiter(b'\t')
|
||||
.from_writer(stdout().lock());
|
||||
output.write_record(["@id", "wikidata", "wikipedia"])?;
|
||||
|
||||
let mut count = 0;
|
||||
|
||||
for Record {
|
||||
id,
|
||||
wikidata,
|
||||
wikipedia,
|
||||
} in recv
|
||||
{
|
||||
output.write_record([id, wikidata, wikipedia])?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
fn extract_tags(el: Element) -> Option<Record> {
|
||||
match el {
|
||||
Element::Node(n) => make_record(n.id(), n.tags()),
|
||||
Element::DenseNode(n) => make_record(n.id(), n.tags()),
|
||||
Element::Way(w) => make_record(w.id(), w.tags()),
|
||||
Element::Relation(r) => make_record(r.id(), r.tags()),
|
||||
}
|
||||
}
|
||||
|
||||
fn make_record<'i>(id: i64, tags: impl 'i + Iterator<Item = (&'i str, &'i str)>) -> Option<Record> {
|
||||
let mut wikipedia = String::new();
|
||||
let mut wikidata = String::new();
|
||||
|
||||
for (key, value) in tags {
|
||||
match key {
|
||||
"wikipedia" => wikipedia = value.trim().to_owned(),
|
||||
"wikidata" => wikidata = value.trim().to_owned(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if wikidata.is_empty() && wikipedia.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Record {
|
||||
id: id.to_string(),
|
||||
wikipedia,
|
||||
wikidata,
|
||||
})
|
||||
}
|
64
src/main.rs
64
src/main.rs
|
@ -1,24 +1,20 @@
|
|||
use std::{
|
||||
fs::File,
|
||||
io::{stdin, stdout, BufReader, Read, Write},
|
||||
num::NonZeroUsize,
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
mod get_articles;
|
||||
mod get_tags;
|
||||
|
||||
/// Get the version returned by `git describe`, e.g.:
|
||||
/// - `v2.0` if a git tag
|
||||
/// - the commit hash `034ac04` if not a tag
|
||||
/// - `034ac04-dirty` if uncommitted changes are present,
|
||||
/// or the crate version if not available (if installed from crates.io).
|
||||
///
|
||||
/// See `build.rs` file for more info.
|
||||
fn version() -> &'static str {
|
||||
option_env!("CARGO_GIT_VERSION")
|
||||
.or(option_env!("CARGO_PKG_VERSION"))
|
||||
.unwrap_or("unknown")
|
||||
}
|
||||
|
||||
/// Extract articles from Wikipedia Enterprise HTML dumps.
|
||||
#[derive(Parser)]
|
||||
#[command(version = crate::version())]
|
||||
#[command(author, version, about, long_about, version = crate::version())]
|
||||
struct Args {
|
||||
#[command(subcommand)]
|
||||
cmd: Cmd,
|
||||
|
@ -31,7 +27,16 @@ enum Cmd {
|
|||
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
|
||||
///
|
||||
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
|
||||
GetTags,
|
||||
GetTags {
|
||||
/// The `.osm.pbf` file to use.
|
||||
pbf_file: PathBuf,
|
||||
|
||||
/// The number of threads to spawn to parse and decompress the pbf file.
|
||||
///
|
||||
/// Defaults to the number of cores.
|
||||
#[arg(short, long)]
|
||||
procs: Option<NonZeroUsize>,
|
||||
},
|
||||
|
||||
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
|
||||
///
|
||||
|
@ -57,24 +62,30 @@ fn main() -> anyhow::Result<()> {
|
|||
|
||||
match args.cmd {
|
||||
Cmd::GetArticles(args) => {
|
||||
if args.wikidata_ids.is_none()
|
||||
if args.wikidata_qids.is_none()
|
||||
&& args.wikipedia_urls.is_none()
|
||||
&& args.osm_tags.is_none()
|
||||
{
|
||||
let mut cmd = Args::command();
|
||||
cmd.error(
|
||||
clap::error::ErrorKind::MissingRequiredArgument,
|
||||
"at least one of --osm-tags --wikidata-ids --wikipedia-urls is required",
|
||||
"at least one of --osm-tags --wikidata-qids --wikipedia-urls is required",
|
||||
)
|
||||
.exit()
|
||||
}
|
||||
|
||||
get_articles::run(args)
|
||||
}
|
||||
Cmd::GetTags => todo!(),
|
||||
Cmd::Simplify { lang } => {
|
||||
use std::io::{stdin, stdout, Read, Write};
|
||||
Cmd::GetTags { pbf_file, procs } => {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
.thread_name(|num| format!("worker{num}"))
|
||||
.num_threads(procs.map(usize::from).unwrap_or_default())
|
||||
.build_global()?;
|
||||
|
||||
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
|
||||
get_tags::run(pbf_file)
|
||||
}
|
||||
Cmd::Simplify { lang } => {
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
|
@ -86,3 +97,16 @@ fn main() -> anyhow::Result<()> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Report the version string for this build.
///
/// Prefers the `git describe` output captured at compile time, e.g.:
/// - `v2.0` if a git tag
/// - the commit hash `034ac04` if not a tag
/// - `034ac04-dirty` if uncommitted changes are present,
///
/// then falls back to the crate version (e.g. if installed from crates.io),
/// and finally to `"unknown"` when neither is available.
///
/// See `build.rs` file for more info.
fn version() -> &'static str {
    if let Some(git_version) = option_env!("CARGO_GIT_VERSION") {
        return git_version;
    }

    match option_env!("CARGO_PKG_VERSION") {
        Some(pkg_version) => pkg_version,
        None => "unknown",
    }
}
|
||||
|
|
Loading…
Add table
Reference in a new issue