Add osm tag file parsing #23
355
Cargo.lock
generated
|
@ -112,7 +112,7 @@ dependencies = [
|
|||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"miniz_oxide 0.6.2",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
|
@ -123,6 +123,12 @@ version = "1.3.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
|
@ -160,7 +166,7 @@ checksum = "72394f3339a76daf211e57d4bcb374410f3965dcc606dd0e03738c7888766980"
|
|||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
@ -174,7 +180,7 @@ dependencies = [
|
|||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -195,6 +201,58 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.29.6"
|
||||
|
@ -222,6 +280,27 @@ dependencies = [
|
|||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
|
||||
dependencies = [
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
|
@ -256,6 +335,12 @@ version = "0.6.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
|
@ -290,6 +375,22 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
|
@ -355,6 +456,12 @@ version = "0.27.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
|
@ -397,6 +504,16 @@ dependencies = [
|
|||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
|
@ -416,7 +533,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
|||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix",
|
||||
"rustix 0.37.19",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -428,9 +545,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.144"
|
||||
version = "0.2.147"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
|
||||
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
|
@ -438,6 +555,12 @@ version = "0.3.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
|
@ -486,6 +609,24 @@ version = "2.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memmap2"
|
||||
version = "0.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.6.2"
|
||||
|
@ -495,6 +636,15 @@ dependencies = [
|
|||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.4"
|
||||
|
@ -507,6 +657,16 @@ version = "0.1.14"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.30.3"
|
||||
|
@ -522,13 +682,17 @@ version = "0.0.0"
|
|||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"csv",
|
||||
"ego-tree",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
"osmpbf",
|
||||
"rayon",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"url",
|
||||
"urlencoding",
|
||||
]
|
||||
|
@ -539,6 +703,20 @@ version = "1.18.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
|
||||
|
||||
[[package]]
|
||||
name = "osmpbf"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3bec2671f8eb1e9a353adfe8aafe44c9c5207e0012d469a4b61fb7bf33adf37"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"flate2",
|
||||
"memmap2",
|
||||
"protobuf",
|
||||
"protobuf-codegen",
|
||||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
|
@ -557,7 +735,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
@ -680,18 +858,69 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.59"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.28"
|
||||
name = "protobuf"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
|
||||
checksum = "b55bad9126f378a853655831eb7363b7b01b81d19f8cb1218861086ca4a1a61e"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"protobuf-support",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dd418ac3c91caa4032d37cb80ff0d44e2ebe637b2fb243b6234bf89cdac4901"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"once_cell",
|
||||
"protobuf",
|
||||
"protobuf-parse",
|
||||
"regex",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-parse"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d39b14605eaa1f6a340aec7f320b34064feb26c93aec35d6a9a2272a8ddfa49"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"indexmap",
|
||||
"log",
|
||||
"protobuf",
|
||||
"protobuf-support",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-support"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5d4d7b8601c814cfb36bcebb79f0e61e45e1e93640cf778837833bbed05c372"
|
||||
dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
@ -777,13 +1006,44 @@ dependencies = [
|
|||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -824,11 +1084,24 @@ version = "0.37.19"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"linux-raw-sys 0.3.8",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.5",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -867,7 +1140,7 @@ version = "0.24.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
|
@ -902,7 +1175,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -989,15 +1262,28 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.17"
|
||||
version = "2.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
|
||||
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.38.7",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
|
@ -1018,6 +1304,26 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1107,6 +1413,17 @@ version = "0.11.0+wasi-snapshot-preview1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "4.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
|
||||
dependencies = [
|
||||
"either",
|
||||
"libc",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
|
|
@ -10,13 +10,17 @@ default-run = "om-wikiparser"
|
|||
[dependencies]
|
||||
anyhow = { version = "1.0.71", features = ["backtrace"] }
|
||||
clap = { version = "4.3.2", features = ["derive"] }
|
||||
csv = "1.2.2"
|
||||
ego-tree = "0.6.2"
|
||||
env_logger = "0.10.0"
|
||||
log = "0.4.18"
|
||||
once_cell = "1.18.0"
|
||||
osmpbf = "0.3.1"
|
||||
rayon = "1.7.0"
|
||||
scraper = "0.16.0"
|
||||
serde = { version = "1.0.163", features = ["derive"] }
|
||||
serde_json = "1.0.96"
|
||||
thiserror = "1.0.44"
|
||||
url = "2.3.1"
|
||||
urlencoding = "2.1.2"
|
||||
|
||||
|
|
84
README.md
|
@ -13,7 +13,7 @@ It defines article sections that are not important for users and should be remov
|
|||
## Usage
|
||||
|
||||
To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
|
||||
It handles preparing the inputs, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
It handles extracting the tags, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
|
||||
To run the wikiparser manually or for development, see below.
|
||||
|
||||
|
@ -29,41 +29,64 @@ Run the program with the `--help` flag to see all supported arguments.
|
|||
|
||||
```
|
||||
$ cargo run --release -- --help
|
||||
Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
Extract articles from Wikipedia Enterprise HTML dumps
|
||||
|
||||
Expects an uncompressed dump connected to stdin.
|
||||
Usage: om-wikiparser <COMMAND>
|
||||
|
||||
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
|
||||
Commands:
|
||||
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
|
||||
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
|
||||
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
-h, --help Print help (see more with '--help')
|
||||
-V, --version Print version
|
||||
```
|
||||
|
||||
Each command has its own additional help:
|
||||
|
||||
```
|
||||
$ cargo run -- get-articles --help
|
||||
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
|
||||
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
|
||||
Usage: om-wikiparser get-articles [OPTIONS] <OUTPUT_DIR>
|
||||
|
||||
Arguments:
|
||||
<OUTPUT_DIR>
|
||||
Directory to write the extracted articles to
|
||||
|
||||
Options:
|
||||
--write-new-ids <WRITE_NEW_IDS>
|
||||
--write-new-qids <FILE>
|
||||
Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
|
||||
-h, --help
|
||||
Print help (see a summary with '-h')
|
||||
|
||||
-V, --version
|
||||
Print version
|
||||
|
||||
FILTERS:
|
||||
--wikidata-ids <WIKIDATA_IDS>
|
||||
--osm-tags <FILE.tsv>
|
||||
Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
|
||||
This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
|
||||
--wikidata-qids <FILE>
|
||||
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
|
||||
|
||||
--wikipedia-urls <WIKIPEDIA_URLS>
|
||||
--wikipedia-urls <FILE>
|
||||
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
|
||||
```
|
||||
|
||||
It takes as inputs:
|
||||
- A wikidata enterprise JSON dump, extracted and connected to `stdin`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
- A directory to write the extracted articles to, as a CLI argument.
|
||||
- Any number of filters passed:
|
||||
- A TSV file of wikidata qids and wikipedia urls, created by the `get-tags` command or `osmconvert`, passed as the CLI flag `--osm-tags`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
|
||||
As an example of manual usage with the map generator:
|
||||
- Assuming this program is installed to `$PATH` as `om-wikiparser`.
|
||||
|
@ -74,7 +97,7 @@ As an example of manual usage with the map generator:
|
|||
|
||||
```shell
|
||||
# Transform intermediate files from generator.
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_qids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -83,9 +106,38 @@ export RUST_LOG=om_wikiparser=debug
|
|||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids wikidata_qids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
||||
Alternatively, extract the tags directly from a `.osm.pbf` file (referenced here as `planet-latest.osm.pbf`):
|
||||
```shell
|
||||
# Extract tags
|
||||
om-wikiparser get-tags planet-latest.osm.pbf > osm_tags.tsv
|
||||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
|
|
@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
|
|||
extern crate om_wikiparser;
|
||||
extern crate test;
|
||||
|
||||
![]() I'm not sure, renaming it to the shorter I'm not sure, renaming it to the shorter `Title` must have altered `rustfmt`'s heuristics.
|
||||
use om_wikiparser::wm::{Qid, Title};
|
||||
|
||||
const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
|
||||
const QID: &str = "Q123456789";
|
||||
|
||||
#[bench]
|
||||
fn parse_wikipedia(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
|
||||
"https://en.wikipedia.org/wiki/Article_Title",
|
||||
)
|
||||
.unwrap();
|
||||
Title::from_url(TITLE).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn hash_wikipedia(b: &mut test::Bencher) {
|
||||
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
|
||||
"https://en.wikipedia.org/wiki/Article_Title",
|
||||
)
|
||||
.unwrap();
|
||||
let title = Title::from_url(TITLE).unwrap();
|
||||
let mut set = HashSet::new();
|
||||
b.iter(|| {
|
||||
set.insert(&title);
|
||||
|
@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
|
|||
#[bench]
|
||||
fn parse_wikidata(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
|
||||
Qid::from_str(QID).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn hash_wikidata(b: &mut test::Bencher) {
|
||||
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
|
||||
let qid = Qid::from_str(QID).unwrap();
|
||||
let mut set = HashSet::new();
|
||||
b.iter(|| {
|
||||
set.insert(&qid);
|
||||
|
|
41
run.sh
|
@ -1,17 +1,16 @@
|
|||
#! /usr/bin/env bash
|
||||
# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <OSM_FILE.osm.pbf> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
|
||||
A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.
|
||||
|
||||
Arguments:
|
||||
<BUILD_DIR> An existing directory to place descriptions in.
|
||||
The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
|
||||
maps generator must be placed in this directory before running.
|
||||
The extracted articles will be placed in a `descriptions`
|
||||
subdirectory within this directory.
|
||||
The `intermediate_data` subfolder of a maps build directory may
|
||||
be used for this. The same folder may be used for multiple runs.
|
||||
<OSM_FILE> An OpenStreetMap dump in PBF format to extract tags from.
|
||||
<DUMP_FILE> A wikipedia enterprise html dump. These take the form of
|
||||
`enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
|
||||
dumps in the same language SHOULD NOT be provided, and will
|
||||
|
@ -21,7 +20,7 @@ Options:
|
|||
-h Print this help screen
|
||||
|
||||
1. Builds wikiparser.
|
||||
2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
|
||||
2. Extracts wikidata qids and wikipedia urls from OpenStreetMap pbf file (NOTE: this spawns as many threads as there are cores).
|
||||
3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump files).
|
||||
|
||||
For information on running the wikiparser manually, see README.md.
|
||||
|
@ -43,8 +42,8 @@ do
|
|||
done
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
if [ -z "${2-}" ]; then
|
||||
echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
|
||||
if [ -z "${3-}" ]; then
|
||||
echo "BUILD_DIR, OSM_FILE, and at least one DUMP_FILE are required" >&2
|
||||
echo -n "$USAGE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
@ -58,6 +57,13 @@ if [ ! -d "$BUILD_DIR" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
OSM_FILE=$(readlink -f -- "$1")
|
||||
shift
|
||||
if [ ! -f "$OSM_FILE" ]; then
|
||||
echo "OSM_FILE '$OSM_FILE' does not exist or is not a file" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DUMP_FILES=()
|
||||
while (( $# > 0 )); do
|
||||
dump_file="$(readlink -f -- "$1")"
|
||||
|
@ -91,16 +97,8 @@ wikiparser=$(pwd)/target/release/om-wikiparser
|
|||
log "Changing to maps build dir '$BUILD_DIR'"
|
||||
cd "$BUILD_DIR"
|
||||
|
||||
log "Transforming intermediate generator data"
|
||||
for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
|
||||
if [ ! -e "$intermediate_file" ]; then
|
||||
echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
log "Extracting tags from '$OSM_FILE'"
|
||||
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
|
||||
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -129,10 +127,9 @@ trap 'kill_jobs' SIGINT SIGTERM EXIT
|
|||
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
log "Extracting '$dump'"
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
@ -142,8 +139,8 @@ log "Beginning extraction of discovered QIDs"
|
|||
|
||||
# Extract new qids from other dumps in parallel.
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--wikidata-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
//! Apply html article simplification to stdin, and write it to stdout.
|
||||
//!
|
||||
//! Usage:
|
||||
//! simplify_html < article.html > simplified.html
|
||||
use std::io::{stdin, stdout, Read, Write};
|
||||
|
||||
use om_wikiparser::html::simplify;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let output = simplify(&input, "en");
|
||||
|
||||
stdout().write_all(output.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
288
src/get_articles.rs
Normal file
|
@ -0,0 +1,288 @@
|
|||
use std::{
|
||||
fs::{self, File},
|
||||
io::{stdin, BufRead, Write},
|
||||
os::unix,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use om_wikiparser::{
|
||||
html::simplify,
|
||||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
|
||||
};
|
||||
|
||||
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
#[derive(clap::Args)]
|
||||
pub struct Args {
|
||||
/// Directory to write the extracted articles to.
|
||||
pub output_dir: PathBuf,
|
||||
|
||||
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
///
|
||||
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE.tsv")]
|
||||
pub osm_tags: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikidata QID to extract on each line
|
||||
/// (e.g. `Q12345`).
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikidata_qids: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikipedia article url to extract on each line
|
||||
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikipedia_urls: Option<PathBuf>,
|
||||
|
||||
/// Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
///
|
||||
/// Use this to save the QIDs of articles you know the url of, but not the QID.
|
||||
/// The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump.
|
||||
/// Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
pub write_new_qids: Option<PathBuf>,
|
||||
}
|
||||
|
||||
pub fn run(args: Args) -> anyhow::Result<()> {
|
||||
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
|
||||
info!("Loading article urls from {path:?}");
|
||||
parse_wikipedia_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
|
||||
info!("Loading wikidata QIDs from {path:?}");
|
||||
parse_wikidata_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
if let Some(ref path) = args.osm_tags {
|
||||
info!("Loading wikipedia/wikidata osm tags from {path:?}");
|
||||
|
||||
let original_items = wikidata_qids.len() + wikipedia_titles.len();
|
||||
let mut line_errors = Vec::new();
|
||||
parse_osm_tag_file(
|
||||
path,
|
||||
&mut wikidata_qids,
|
||||
&mut wikipedia_titles,
|
||||
Some(&mut line_errors),
|
||||
)?;
|
||||
|
||||
if !line_errors.is_empty() {
|
||||
let error_count = line_errors.len();
|
||||
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
|
||||
let expected_threshold = 0.02;
|
||||
let percentage = 100.0 * error_count as f64 / new_items as f64;
|
||||
let level = if percentage >= expected_threshold {
|
||||
log::Level::Error
|
||||
} else {
|
||||
log::Level::Info
|
||||
};
|
||||
|
||||
log!(
|
||||
level,
|
||||
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Parsed {} unique article titles", wikipedia_titles.len());
|
||||
debug!("Parsed {} unique wikidata QIDs", wikidata_qids.len());
|
||||
|
||||
// NOTE: For atomic writes to the same file across threads/processes:
|
||||
// - The file needs to be opened in APPEND mode (`.append(true)`).
|
||||
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
|
||||
// - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
|
||||
//
|
||||
// For more information, see:
|
||||
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
|
||||
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
|
||||
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
|
||||
let mut write_new_qids = args
|
||||
.write_new_qids
|
||||
.as_ref()
|
||||
.map(|p| File::options().create(true).append(true).open(p))
|
||||
.transpose()?;
|
||||
|
||||
if !args.output_dir.is_dir() {
|
||||
bail!("output dir {:?} does not exist", args.output_dir)
|
||||
}
|
||||
|
||||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
|
||||
});
|
||||
|
||||
for page in stream {
|
||||
let page = page?;
|
||||
|
||||
let qid = page.wikidata();
|
||||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_qids.contains(qid))
|
||||
.unwrap_or_default();
|
||||
|
||||
let matching_titles = if wikipedia_titles.is_empty() {
|
||||
Default::default()
|
||||
![]() What is the benefit of hiding errors under a threshold? Isn't it beneficial to see all errors and be able to estimate/compare the quality of the dump, and to easily grep/find what is most important, or feed the whole log to contributors for fixes? What is the benefit of hiding errors under a threshold? Isn't it beneficial to see all errors and be able to estimate/compare the quality of the dump, and to easily grep/find what is most important, or feed the whole log to contributors for fixes?
![]() The threshold only determines if the message is I'm open to other ideas. The threshold only determines if the message is `info` vs `error` level.
When you use the `run.sh` script with multiple languages it prints a copy of the hundreds of errors for each language.
I think writing the parse errors to a file separately will be easier to read and deal with.
I'm open to other ideas.
|
||||
} else {
|
||||
page.all_titles()
|
||||
.filter_map(|r| {
|
||||
r.map(Some).unwrap_or_else(|e| {
|
||||
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
|
||||
None
|
||||
})
|
||||
})
|
||||
.filter(|t| wikipedia_titles.contains(t))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
if !is_wikidata_match && matching_titles.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Write matched new QIDs back to file.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_qids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
write!(f, "{}", line).with_context(|| {
|
||||
format!(
|
||||
"writing new QID to file {:?}",
|
||||
args.write_new_qids.as_ref().unwrap()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
|
||||
fn create_article_dir(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = Title>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let base = base.as_ref();
|
||||
let mut redirects = redirects.into_iter();
|
||||
|
||||
let main_dir = match page.wikidata() {
|
||||
None => {
|
||||
// Write to wikipedia title directory.
|
||||
// Prefer first redirect, fall back to page title if none exist
|
||||
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
|
||||
redirects
|
||||
.next()
|
||||
.or_else(|| match page.title() {
|
||||
Ok(title) => Some(title),
|
||||
Err(e) => {
|
||||
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
|
||||
None
|
||||
}
|
||||
})
|
||||
// hard fail when no titles can be parsed
|
||||
.ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
|
||||
.get_dir(base.to_owned())
|
||||
}
|
||||
Some(qid) => {
|
||||
// Otherwise use wikidata as main directory and symlink from wikipedia titles.
|
||||
qid.get_dir(base.to_owned())
|
||||
}
|
||||
};
|
||||
|
||||
if main_dir.is_symlink() {
|
||||
fs::remove_file(&main_dir)
|
||||
.with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
|
||||
}
|
||||
fs::create_dir_all(&main_dir)
|
||||
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
|
||||
|
||||
// Write symlinks to main directory.
|
||||
for title in redirects {
|
||||
let wikipedia_dir = title.get_dir(base.to_owned());
|
||||
|
||||
// Build required directory.
|
||||
//
|
||||
// Possible states from previous run:
|
||||
// - Does not exist (and is not a symlink)
|
||||
// - Exists, is a directory
|
||||
// - Exists, is a valid symlink to correct location
|
||||
// - Exists, is a valid symlink to incorrect location
|
||||
if wikipedia_dir.exists() {
|
||||
if wikipedia_dir.is_symlink() {
|
||||
// Only replace if not valid
|
||||
if fs::read_link(&wikipedia_dir)? == main_dir {
|
||||
continue;
|
||||
}
|
||||
fs::remove_file(&wikipedia_dir)?;
|
||||
} else {
|
||||
fs::remove_dir_all(&wikipedia_dir)?;
|
||||
}
|
||||
} else {
|
||||
// titles can contain `/`, so ensure necessary subdirs exist
|
||||
let parent_dir = wikipedia_dir.parent().unwrap();
|
||||
fs::create_dir_all(parent_dir)
|
||||
.with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
|
||||
}
|
||||
|
||||
unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
|
||||
format!(
|
||||
"creating symlink from {:?} to {:?}",
|
||||
wikipedia_dir, main_dir
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(main_dir)
|
||||
}
|
||||
|
||||
/// Write selected article to disk.
|
||||
///
|
||||
/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
|
||||
/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
|
||||
/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
|
||||
fn write(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = Title>,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
|
||||
// Write html to determined file.
|
||||
let mut filename = article_dir;
|
||||
filename.push(&page.in_language.identifier);
|
||||
filename.set_extension("html");
|
||||
|
||||
debug!("{:?}: {:?}", page.name, filename);
|
||||
|
||||
if filename.exists() {
|
||||
debug!("Overwriting existing file");
|
||||
}
|
||||
|
||||
let html = simplify(&page.article_body.html, &page.in_language.identifier);
|
||||
|
||||
let mut file =
|
||||
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
|
||||
file.write_all(html.as_bytes())
|
||||
.with_context(|| format!("writing html file {:?}", filename))?;
|
||||
|
||||
Ok(())
|
||||
}
|
93
src/get_tags.rs
Normal file
|
@ -0,0 +1,93 @@
|
|||
use std::{
|
||||
io::{stdout, Read},
|
||||
sync::mpsc,
|
||||
thread,
|
||||
};
|
||||
|
||||
use osmpbf::{BlobDecode, BlobReader, Element};
|
||||
use rayon::prelude::*;
|
||||
|
||||
struct Record {
|
||||
id: String,
|
||||
wikidata: String,
|
||||
wikipedia: String,
|
||||
}
|
||||
|
||||
/// Extract matching tags from an osm pbf file and write to stdout in TSV.
|
||||
pub fn run(pbf: impl Read + Send) -> anyhow::Result<()> {
|
||||
let reader = BlobReader::new(pbf);
|
||||
|
||||
let (send, recv) = mpsc::sync_channel(128);
|
||||
let writer_thread = thread::Builder::new()
|
||||
.name("writer".to_string())
|
||||
.spawn(move || write(recv))?;
|
||||
|
||||
reader
|
||||
.par_bridge()
|
||||
.try_for_each(move |blob| -> anyhow::Result<()> {
|
||||
// Based on `osmpbf` implementation of `ElementReader`.
|
||||
let BlobDecode::OsmData(block) = blob?.decode()? else { return Ok(()) };
|
||||
for record in block.elements().filter_map(extract_tags) {
|
||||
send.send(record)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let record_count = writer_thread.join().unwrap()?;
|
||||
info!("Finished processing {record_count} records");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write(recv: mpsc::Receiver<Record>) -> anyhow::Result<usize> {
|
||||
let mut output = csv::WriterBuilder::new()
|
||||
.delimiter(b'\t')
|
||||
.from_writer(stdout().lock());
|
||||
output.write_record(["@id", "wikidata", "wikipedia"])?;
|
||||
|
||||
let mut count = 0;
|
||||
|
||||
for Record {
|
||||
id,
|
||||
wikidata,
|
||||
wikipedia,
|
||||
} in recv
|
||||
{
|
||||
output.write_record([id, wikidata, wikipedia])?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
fn extract_tags(el: Element) -> Option<Record> {
|
||||
match el {
|
||||
Element::Node(n) => make_record(n.id(), n.tags()),
|
||||
Element::DenseNode(n) => make_record(n.id(), n.tags()),
|
||||
Element::Way(w) => make_record(w.id(), w.tags()),
|
||||
Element::Relation(r) => make_record(r.id(), r.tags()),
|
||||
}
|
||||
}
|
||||
|
||||
fn make_record<'i>(id: i64, tags: impl 'i + Iterator<Item = (&'i str, &'i str)>) -> Option<Record> {
|
||||
let mut wikipedia = String::new();
|
||||
let mut wikidata = String::new();
|
||||
|
||||
for (key, value) in tags {
|
||||
match key {
|
||||
"wikipedia" => wikipedia = value.trim().to_owned(),
|
||||
"wikidata" => wikidata = value.trim().to_owned(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if wikidata.is_empty() && wikipedia.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Record {
|
||||
id: id.to_string(),
|
||||
wikipedia,
|
||||
wikidata,
|
||||
})
|
||||
}
|
364
src/main.rs
|
@ -1,19 +1,102 @@
|
|||
use std::{
|
||||
fs::{self, File},
|
||||
io::{stdin, BufRead, Write},
|
||||
os::unix,
|
||||
path::{Path, PathBuf},
|
||||
fs::File,
|
||||
io::{stdin, stdout, BufReader, Read, Write},
|
||||
num::NonZeroUsize,
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use clap::{CommandFactory, Parser};
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
use om_wikiparser::{
|
||||
html::simplify,
|
||||
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
|
||||
};
|
||||
mod get_articles;
|
||||
mod get_tags;
|
||||
|
||||
/// Extract articles from Wikipedia Enterprise HTML dumps.
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about, version = crate::version())]
|
||||
struct Args {
|
||||
#[command(subcommand)]
|
||||
cmd: Cmd,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Cmd {
|
||||
GetArticles(get_articles::Args),
|
||||
|
||||
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
|
||||
///
|
||||
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
|
||||
GetTags {
|
||||
/// The `.osm.pbf` file to use.
|
||||
pbf_file: PathBuf,
|
||||
|
||||
/// The number of threads to spawn to parse and decompress the pbf file.
|
||||
///
|
||||
/// Defaults to the number of cores.
|
||||
#[arg(short, long)]
|
||||
procs: Option<NonZeroUsize>,
|
||||
},
|
||||
|
||||
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
|
||||
///
|
||||
/// This is meant for testing and debugging.
|
||||
Simplify {
|
||||
/// The language to use when processing the article (defaults to `en`).
|
||||
#[arg(long, default_value_t = String::from("en"))]
|
||||
lang: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
info!("{} {}", Args::command().get_name(), version());
|
||||
|
||||
match args.cmd {
|
||||
Cmd::GetArticles(args) => {
|
||||
if args.wikidata_qids.is_none()
|
||||
&& args.wikipedia_urls.is_none()
|
||||
&& args.osm_tags.is_none()
|
||||
{
|
||||
let mut cmd = Args::command();
|
||||
cmd.error(
|
||||
clap::error::ErrorKind::MissingRequiredArgument,
|
||||
"at least one of --osm-tags --wikidata-qids --wikipedia-urls is required",
|
||||
)
|
||||
.exit()
|
||||
}
|
||||
|
||||
get_articles::run(args)
|
||||
}
|
||||
Cmd::GetTags { pbf_file, procs } => {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
.thread_name(|num| format!("worker{num}"))
|
||||
.num_threads(procs.map(usize::from).unwrap_or_default())
|
||||
.build_global()?;
|
||||
|
||||
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
|
||||
get_tags::run(pbf_file)
|
||||
}
|
||||
Cmd::Simplify { lang } => {
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let output = om_wikiparser::html::simplify(&input, &lang);
|
||||
|
||||
stdout().write_all(output.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the version returned by `git describe`, e.g.:
|
||||
/// - `v2.0` if a git tag
|
||||
|
@ -27,264 +110,3 @@ fn version() -> &'static str {
|
|||
.or(option_env!("CARGO_PKG_VERSION"))
|
||||
.unwrap_or("unknown")
|
||||
}
|
||||
|
||||
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump connected to stdin.
|
||||
#[derive(Parser)]
|
||||
#[command(version = crate::version())]
|
||||
struct Args {
|
||||
/// Directory to write the extracted articles to.
|
||||
output_dir: PathBuf,
|
||||
|
||||
/// Path to file that contains a Wikidata QID to extract on each line
|
||||
/// (e.g. `Q12345`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
wikidata_ids: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikipedia article url to extract on each line
|
||||
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
wikipedia_urls: Option<PathBuf>,
|
||||
|
||||
/// Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
///
|
||||
/// Use this to save the QIDs of articles you know the url of, but not the QID.
|
||||
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
/// Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, requires("wikipedia_urls"))]
|
||||
write_new_ids: Option<PathBuf>,
|
||||
}
|
||||
|
||||
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
|
||||
fn create_article_dir(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let base = base.as_ref();
|
||||
let mut redirects = redirects.into_iter();
|
||||
|
||||
let main_dir = match page.wikidata() {
|
||||
None => {
|
||||
// Write to wikipedia title directory.
|
||||
// Prefer first redirect, fall back to page title if none exist
|
||||
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
|
||||
redirects
|
||||
.next()
|
||||
.or_else(|| match page.title() {
|
||||
Ok(title) => Some(title),
|
||||
Err(e) => {
|
||||
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
|
||||
None
|
||||
}
|
||||
})
|
||||
// hard fail when no titles can be parsed
|
||||
.ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
|
||||
.get_dir(base.to_owned())
|
||||
}
|
||||
Some(qid) => {
|
||||
// Otherwise use wikidata as main directory and symlink from wikipedia titles.
|
||||
qid.get_dir(base.to_owned())
|
||||
}
|
||||
};
|
||||
|
||||
if main_dir.is_symlink() {
|
||||
fs::remove_file(&main_dir)
|
||||
.with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
|
||||
}
|
||||
fs::create_dir_all(&main_dir)
|
||||
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
|
||||
|
||||
// Write symlinks to main directory.
|
||||
for title in redirects {
|
||||
let wikipedia_dir = title.get_dir(base.to_owned());
|
||||
|
||||
// Build required directory.
|
||||
//
|
||||
// Possible states from previous run:
|
||||
// - Does not exist (and is not a symlink)
|
||||
// - Exists, is a directory
|
||||
// - Exists, is a valid symlink to correct location
|
||||
// - Exists, is a valid symlink to incorrect location
|
||||
if wikipedia_dir.exists() {
|
||||
if wikipedia_dir.is_symlink() {
|
||||
// Only replace if not valid
|
||||
if fs::read_link(&wikipedia_dir)? == main_dir {
|
||||
continue;
|
||||
}
|
||||
fs::remove_file(&wikipedia_dir)?;
|
||||
} else {
|
||||
fs::remove_dir_all(&wikipedia_dir)?;
|
||||
}
|
||||
} else {
|
||||
// titles can contain `/`, so ensure necessary subdirs exist
|
||||
let parent_dir = wikipedia_dir.parent().unwrap();
|
||||
fs::create_dir_all(parent_dir)
|
||||
.with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
|
||||
}
|
||||
|
||||
unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
|
||||
format!(
|
||||
"creating symlink from {:?} to {:?}",
|
||||
wikipedia_dir, main_dir
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(main_dir)
|
||||
}
|
||||
|
||||
/// Write selected article to disk.
|
||||
///
|
||||
/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
|
||||
/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
|
||||
/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
|
||||
fn write(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
|
||||
// Write html to determined file.
|
||||
let mut filename = article_dir;
|
||||
filename.push(&page.in_language.identifier);
|
||||
filename.set_extension("html");
|
||||
|
||||
debug!("{:?}: {:?}", page.name, filename);
|
||||
|
||||
if filename.exists() {
|
||||
debug!("Overwriting existing file");
|
||||
}
|
||||
|
||||
let html = simplify(&page.article_body.html, &page.in_language.identifier);
|
||||
|
||||
let mut file =
|
||||
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
|
||||
file.write_all(html.as_bytes())
|
||||
.with_context(|| format!("writing html file {:?}", filename))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
|
||||
let mut cmd = Args::command();
|
||||
cmd.error(
|
||||
clap::error::ErrorKind::MissingRequiredArgument,
|
||||
"one or both of --wikidata-ids and --wikipedia-urls is required",
|
||||
)
|
||||
.exit()
|
||||
}
|
||||
|
||||
info!("{} {}", Args::command().get_name(), version());
|
||||
|
||||
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
|
||||
info!("Loading article urls from {path:?}");
|
||||
let urls = parse_wikipedia_file(path)?;
|
||||
debug!("Parsed {} unique article urls", urls.len());
|
||||
urls
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
let wikidata_ids = if let Some(path) = args.wikidata_ids {
|
||||
info!("Loading wikidata ids from {path:?}");
|
||||
let ids = parse_wikidata_file(path)?;
|
||||
debug!("Parsed {} unique wikidata ids", ids.len());
|
||||
ids
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
// NOTE: For atomic writes to the same file across threads/processes:
|
||||
// - The file needs to be opened in APPEND mode (`.append(true)`).
|
||||
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
|
||||
// - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
|
||||
//
|
||||
// For more information, see:
|
||||
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
|
||||
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
|
||||
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
|
||||
let mut write_new_ids = args
|
||||
.write_new_ids
|
||||
.as_ref()
|
||||
.map(|p| File::options().create(true).append(true).open(p))
|
||||
.transpose()?;
|
||||
|
||||
if !args.output_dir.is_dir() {
|
||||
bail!("output dir {:?} does not exist", args.output_dir)
|
||||
}
|
||||
|
||||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
|
||||
});
|
||||
|
||||
for page in stream {
|
||||
let page = page?;
|
||||
|
||||
let qid = page.wikidata();
|
||||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_ids.contains(qid))
|
||||
.unwrap_or_default();
|
||||
|
||||
let matching_titles = if wikipedia_titles.is_empty() {
|
||||
Default::default()
|
||||
} else {
|
||||
page.all_titles()
|
||||
.filter_map(|r| {
|
||||
r.map(Some).unwrap_or_else(|e| {
|
||||
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
|
||||
None
|
||||
})
|
||||
})
|
||||
.filter(|t| wikipedia_titles.contains(t))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
if !is_wikidata_match && matching_titles.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Write matched new QIDs back to fild.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_ids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
write!(f, "{}", line).with_context(|| {
|
||||
format!(
|
||||
"writing new id to file {:?}",
|
||||
args.write_new_ids.as_ref().unwrap()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
260
src/wm/mod.rs
|
@ -1,24 +1,23 @@
|
|||
//! Wikimedia types
|
||||
use std::{
|
||||
collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
|
||||
str::FromStr,
|
||||
};
|
||||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use url::Url;
|
||||
|
||||
mod page;
|
||||
pub use page::Page;
|
||||
mod title;
|
||||
pub use title::*;
|
||||
mod qid;
|
||||
pub use qid::*;
|
||||
|
||||
/// Read from a file of urls on each line.
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
WikidataQid::from_str(line).with_context(|| {
|
||||
Qid::from_str(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
|
@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Wi
|
|||
}
|
||||
|
||||
/// Read article titles from a file of urls on each line.
|
||||
pub fn parse_wikipedia_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
|
||||
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
WikipediaTitleNorm::from_url(line).with_context(|| {
|
||||
Title::from_url(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
|
@ -57,147 +54,120 @@ pub fn parse_wikipedia_file(
|
|||
.collect())
|
||||
}
|
||||
|
||||
/// Wikidata QID/Q Number
|
||||
///
|
||||
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
|
||||
///
|
||||
/// ```
|
||||
/// use std::str::FromStr;
|
||||
/// use om_wikiparser::wm::WikidataQid;
|
||||
///
|
||||
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
|
||||
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
|
||||
/// assert_eq!(with_q, without_q);
|
||||
///
|
||||
/// assert!(WikidataQid::from_str("q12345").is_ok());
|
||||
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
|
||||
/// assert!(WikidataQid::from_str("Article_Title").is_err());
|
||||
/// assert!(WikidataQid::from_str("Q").is_err());
|
||||
/// assert!(WikidataQid::from_str("").is_err());
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct WikidataQid(u32);
|
||||
pub fn parse_osm_tag_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
|
||||
impl FromStr for WikidataQid {
|
||||
type Err = ParseIntError;
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
};
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.trim();
|
||||
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
|
||||
u32::from_str(s).map(WikidataQid)
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
match title {
|
||||
"wikidata" => qid_col = Some(column),
|
||||
"wikipedia" => title_col = Some(column),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
|
||||
let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
|
||||
|
||||
let mut row = csv::StringRecord::new();
|
||||
loop {
|
||||
match rdr.read_record(&mut row) {
|
||||
Ok(true) => {}
|
||||
// finished
|
||||
Ok(false) => break,
|
||||
// attempt to recover from parsing errors
|
||||
Err(e) => {
|
||||
if e.is_io_error() {
|
||||
bail!(e)
|
||||
}
|
||||
push_error(ParseLineError {
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let qid = &row[qid_col].trim();
|
||||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
}
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
let title = &row[title_col].trim();
|
||||
if !title.is_empty() {
|
||||
match Title::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Display for WikidataQid {
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
#[error("bad title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
#[error("bad QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
#[error("bad TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
text: String,
|
||||
line: u64,
|
||||
kind: ParseErrorKind,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Q{}", self.0)
|
||||
// write source chain to ensure they are logged
|
||||
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
source = e.source();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl WikidataQid {
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
path.push("wikidata");
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(self.to_string());
|
||||
|
||||
path
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalized wikipedia article title that can compare:
|
||||
/// - titles `Spatial Database`
|
||||
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
|
||||
/// - osm-style tags `en:Spatial Database`
|
||||
///
|
||||
/// ```
|
||||
/// use om_wikiparser::wm::WikipediaTitleNorm;
|
||||
///
|
||||
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
|
||||
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// assert_eq!(url, title);
|
||||
///
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
|
||||
///
|
||||
/// assert!(
|
||||
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
|
||||
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
|
||||
/// );
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct WikipediaTitleNorm {
|
||||
lang: String,
|
||||
name: String,
|
||||
}
|
||||
|
||||
impl WikipediaTitleNorm {
|
||||
fn normalize_title(title: &str) -> String {
|
||||
// TODO: Compare with map generator url creation, ensure covers all cases.
|
||||
title.trim().replace(' ', "_")
|
||||
}
|
||||
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
pub fn from_url(url: &str) -> anyhow::Result<Self> {
|
||||
let url = Url::parse(url.trim())?;
|
||||
|
||||
let (subdomain, host) = url
|
||||
.host_str()
|
||||
.ok_or_else(|| anyhow!("Expected host"))?
|
||||
.split_once('.')
|
||||
.ok_or_else(|| anyhow!("Expected subdomain"))?;
|
||||
if host != "wikipedia.org" {
|
||||
bail!("Expected wikipedia.org for domain")
|
||||
}
|
||||
let lang = subdomain;
|
||||
|
||||
let path = url.path();
|
||||
|
||||
let (root, title) = path
|
||||
.strip_prefix('/')
|
||||
.unwrap_or(path)
|
||||
.split_once('/')
|
||||
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
|
||||
|
||||
if root != "wiki" {
|
||||
bail!("Expected 'wiki' as root path, got: {:?}", root)
|
||||
}
|
||||
let title = urlencoding::decode(title)?;
|
||||
|
||||
Self::from_title(&title, lang)
|
||||
}
|
||||
|
||||
// en:Article Title
|
||||
fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
|
||||
let (lang, title) = tag
|
||||
.trim()
|
||||
.split_once(':')
|
||||
.ok_or_else(|| anyhow!("Expected ':'"))?;
|
||||
|
||||
Self::from_title(title, lang)
|
||||
}
|
||||
|
||||
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
|
||||
let title = title.trim();
|
||||
let lang = lang.trim();
|
||||
if title.is_empty() {
|
||||
bail!("title cannot be empty or whitespace");
|
||||
}
|
||||
if lang.is_empty() {
|
||||
bail!("lang cannot be empty or whitespace");
|
||||
}
|
||||
let name = Self::normalize_title(title);
|
||||
let lang = lang.to_owned();
|
||||
Ok(Self { name, lang })
|
||||
}
|
||||
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(format!("{}.wikipedia.org", self.lang));
|
||||
path.push("wiki");
|
||||
path.push(&self.name);
|
||||
|
||||
path
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// return nothing b/c Display prints source chain
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
use std::{iter, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use serde::Deserialize;
|
||||
|
||||
use super::{WikidataQid, WikipediaTitleNorm};
|
||||
use super::{Qid, Title};
|
||||
|
||||
// TODO: consolidate into single struct
|
||||
/// Deserialized Wikimedia Enterprise API Article
|
||||
|
@ -25,27 +26,29 @@ pub struct Page {
|
|||
}
|
||||
|
||||
impl Page {
|
||||
pub fn wikidata(&self) -> Option<WikidataQid> {
|
||||
pub fn wikidata(&self) -> Option<Qid> {
|
||||
// TODO: return error
|
||||
self.main_entity
|
||||
.as_ref()
|
||||
.map(|e| WikidataQid::from_str(&e.identifier).unwrap())
|
||||
.map(|e| Qid::from_str(&e.identifier).unwrap())
|
||||
}
|
||||
|
||||
/// Title of the article
|
||||
pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
|
||||
WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
|
||||
pub fn title(&self) -> anyhow::Result<Title> {
|
||||
Title::from_title(&self.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad title {:?}", self.name))
|
||||
}
|
||||
|
||||
/// All titles that lead to the article, the main title followed by any redirects.
|
||||
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
|
||||
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
|
||||
iter::once(self.title()).chain(self.redirects())
|
||||
}
|
||||
|
||||
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
|
||||
self.redirects
|
||||
.iter()
|
||||
.map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
|
||||
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
|
||||
self.redirects.iter().map(|r| {
|
||||
Title::from_title(&r.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad redirect {:?}", self.name))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
64
src/wm/qid.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
|
||||
|
||||
/// Wikidata QID/Q Number
|
||||
///
|
||||
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
|
||||
///
|
||||
/// ```
|
||||
/// use std::str::FromStr;
|
||||
/// use om_wikiparser::wm::Qid;
|
||||
///
|
||||
/// let with_q = Qid::from_str("Q12345").unwrap();
|
||||
/// let without_q = Qid::from_str(" 12345 ").unwrap();
|
||||
/// assert_eq!(with_q, without_q);
|
||||
///
|
||||
/// assert!(Qid::from_str("q12345").is_ok());
|
||||
/// assert!(Qid::from_str("https://wikidata.org/wiki/Q12345").is_err());
|
||||
/// assert!(Qid::from_str("Article_Title").is_err());
|
||||
/// assert!(Qid::from_str("Q").is_err());
|
||||
/// assert!(Qid::from_str("").is_err());
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct Qid(u32);
|
||||
|
||||
impl FromStr for Qid {
|
||||
type Err = ParseQidError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.trim();
|
||||
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
|
||||
u32::from_str(s).map(Qid).map_err(ParseQidError)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Qid {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Q{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Qid {
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
path.push("wikidata");
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(self.to_string());
|
||||
|
||||
path
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct ParseQidError(ParseIntError);
|
||||
|
||||
impl Display for ParseQidError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for ParseQidError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
self.0.source()
|
||||
}
|
||||
}
|
150
src/wm/title.rs
Normal file
|
@ -0,0 +1,150 @@
|
|||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
use std::{fmt::Display, path::PathBuf, string::FromUtf8Error};
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
use url::Url;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Normalized wikipedia article title that can compare:
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - titles `Spatial Database`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - osm-style tags `en:Spatial Database`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// ```
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// use om_wikiparser::wm::Title;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let title = Title::from_title("Article Title", "en").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url = Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let mobile = Title::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url_tag1 = Title::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url_tag2 = Title::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, mobile);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, url_tag1);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, url_tag2);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(Title::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(Title::from_url("https://wikidata.org/wiki/Q12345").is_err());
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Title::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Title::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// );
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// ```
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub struct Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
lang: String,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
name: String,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
impl Display for Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
write!(f, "{}:{}", self.lang, self.name)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
impl Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
fn normalize_title(title: &str) -> String {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// TODO: Compare with map generator url creation, ensure covers all cases.
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
title.trim().replace(' ', "_")
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let url = Url::parse(url.trim())?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (subdomain, host) = url
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.host_str()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::NoHost)?
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once('.')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::NoSubdomain)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let host = host.strip_prefix("m.").unwrap_or(host);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if host != "wikipedia.org" {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::BadDomain);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = subdomain;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let path = url.path();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (root, title) = path
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.strip_prefix('/')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.unwrap_or(path)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once('/')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::ShortPath)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if root != "wiki" {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::BadPath);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = urlencoding::decode(title)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Self::from_title(&title, lang)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// en:Article Title
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (lang, title) = tag
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.trim()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once(':')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::MissingColon)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.trim_start();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = title.trim_start();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if matches!(lang, "http" | "https") {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Self::from_url(tag);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if title.starts_with("http://") || title.starts_with("https://") {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Self::from_url(title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Self::from_title(title, lang)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = title.trim();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.trim();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if title.is_empty() {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::NoTitle);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if lang.is_empty() {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::NoLang);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let name = Self::normalize_title(title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.to_owned();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Ok(Self { name, lang })
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let mut path = base;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push(format!("{}.wikipedia.org", self.lang));
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push("wiki");
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push(&self.name);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub enum ParseTitleError {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("title cannot be empty or whitespace")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoTitle,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("lang cannot be empty or whitespace")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoLang,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no ':' separating lang and title")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
MissingColon,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// url-specific
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("cannot parse url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Url(#[from] url::ParseError),
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("cannot decode url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
UrlDecode(#[from] FromUtf8Error),
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no host in url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoHost,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no subdomain in url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoSubdomain,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("url base domain is wikipedia.org")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
BadDomain,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("url base path is not /wiki/")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
BadPath,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("path has less than 2 segments")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
ShortPath,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
Why is this split across multiple lines instead of written as one?
Could this be extracted into a constant to avoid the copy-paste?