Add osm tag file parsing #23
355
Cargo.lock
generated
|
@ -112,7 +112,7 @@ dependencies = [
|
|||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"miniz_oxide 0.6.2",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
|
@ -123,6 +123,12 @@ version = "1.3.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
|
@ -160,7 +166,7 @@ checksum = "72394f3339a76daf211e57d4bcb374410f3965dcc606dd0e03738c7888766980"
|
|||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
@ -174,7 +180,7 @@ dependencies = [
|
|||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -195,6 +201,58 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.29.6"
|
||||
|
@ -222,6 +280,27 @@ dependencies = [
|
|||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
|
||||
dependencies = [
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
|
@ -256,6 +335,12 @@ version = "0.6.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
|
@ -290,6 +375,22 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
|
@ -355,6 +456,12 @@ version = "0.27.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
|
@ -397,6 +504,16 @@ dependencies = [
|
|||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
|
@ -416,7 +533,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
|||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix",
|
||||
"rustix 0.37.19",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -428,9 +545,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.144"
|
||||
version = "0.2.147"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
|
||||
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
|
@ -438,6 +555,12 @@ version = "0.3.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
|
@ -486,6 +609,24 @@ version = "2.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memmap2"
|
||||
version = "0.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.6.2"
|
||||
|
@ -495,6 +636,15 @@ dependencies = [
|
|||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.4"
|
||||
|
@ -507,6 +657,16 @@ version = "0.1.14"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.30.3"
|
||||
|
@ -522,13 +682,17 @@ version = "0.0.0"
|
|||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"csv",
|
||||
"ego-tree",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
"osmpbf",
|
||||
"rayon",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"url",
|
||||
"urlencoding",
|
||||
]
|
||||
|
@ -539,6 +703,20 @@ version = "1.18.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
|
||||
|
||||
[[package]]
|
||||
name = "osmpbf"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3bec2671f8eb1e9a353adfe8aafe44c9c5207e0012d469a4b61fb7bf33adf37"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"flate2",
|
||||
"memmap2",
|
||||
"protobuf",
|
||||
"protobuf-codegen",
|
||||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
|
@ -557,7 +735,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
@ -680,18 +858,69 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.59"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.28"
|
||||
name = "protobuf"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
|
||||
checksum = "b55bad9126f378a853655831eb7363b7b01b81d19f8cb1218861086ca4a1a61e"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"protobuf-support",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dd418ac3c91caa4032d37cb80ff0d44e2ebe637b2fb243b6234bf89cdac4901"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"once_cell",
|
||||
"protobuf",
|
||||
"protobuf-parse",
|
||||
"regex",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-parse"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d39b14605eaa1f6a340aec7f320b34064feb26c93aec35d6a9a2272a8ddfa49"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"indexmap",
|
||||
"log",
|
||||
"protobuf",
|
||||
"protobuf-support",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-support"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5d4d7b8601c814cfb36bcebb79f0e61e45e1e93640cf778837833bbed05c372"
|
||||
dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
@ -777,13 +1006,44 @@ dependencies = [
|
|||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -824,11 +1084,24 @@ version = "0.37.19"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"linux-raw-sys 0.3.8",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.5",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
|
@ -867,7 +1140,7 @@ version = "0.24.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 1.3.2",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
|
@ -902,7 +1175,7 @@ checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.17",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -989,15 +1262,28 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.17"
|
||||
version = "2.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45b6ddbb36c5b969c182aec3c4a0bce7df3fbad4b77114706a49aacc80567388"
|
||||
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.38.7",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
|
@ -1018,6 +1304,26 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1107,6 +1413,17 @@ version = "0.11.0+wasi-snapshot-preview1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "4.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
|
||||
dependencies = [
|
||||
"either",
|
||||
"libc",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
|
|
@ -10,13 +10,17 @@ default-run = "om-wikiparser"
|
|||
[dependencies]
|
||||
anyhow = { version = "1.0.71", features = ["backtrace"] }
|
||||
clap = { version = "4.3.2", features = ["derive"] }
|
||||
csv = "1.2.2"
|
||||
ego-tree = "0.6.2"
|
||||
env_logger = "0.10.0"
|
||||
log = "0.4.18"
|
||||
once_cell = "1.18.0"
|
||||
osmpbf = "0.3.1"
|
||||
rayon = "1.7.0"
|
||||
scraper = "0.16.0"
|
||||
serde = { version = "1.0.163", features = ["derive"] }
|
||||
serde_json = "1.0.96"
|
||||
thiserror = "1.0.44"
|
||||
url = "2.3.1"
|
||||
urlencoding = "2.1.2"
|
||||
|
||||
|
|
84
README.md
|
@ -13,7 +13,7 @@ It defines article sections that are not important for users and should be remov
|
|||
## Usage
|
||||
|
||||
To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
|
||||
It handles preparing the inputs, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
It handles extracting the tags, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages.
|
||||
|
||||
To run the wikiparser manually or for development, see below.
|
||||
|
||||
|
@ -29,41 +29,64 @@ Run the program with the `--help` flag to see all supported arguments.
|
|||
|
||||
```
|
||||
$ cargo run --release -- --help
|
||||
Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
Extract articles from Wikipedia Enterprise HTML dumps
|
||||
|
||||
Expects an uncompressed dump connected to stdin.
|
||||
Usage: om-wikiparser <COMMAND>
|
||||
|
||||
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
|
||||
Commands:
|
||||
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
|
||||
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
|
||||
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
-h, --help Print help (see more with '--help')
|
||||
-V, --version Print version
|
||||
```
|
||||
|
||||
Each command has its own additional help:
|
||||
|
||||
```
|
||||
$ cargo run -- get-articles --help
|
||||
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
|
||||
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
|
||||
Usage: om-wikiparser get-articles [OPTIONS] <OUTPUT_DIR>
|
||||
|
||||
Arguments:
|
||||
<OUTPUT_DIR>
|
||||
Directory to write the extracted articles to
|
||||
|
||||
Options:
|
||||
--write-new-ids <WRITE_NEW_IDS>
|
||||
--write-new-qids <FILE>
|
||||
Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
|
||||
-h, --help
|
||||
Print help (see a summary with '-h')
|
||||
|
||||
-V, --version
|
||||
Print version
|
||||
|
||||
FILTERS:
|
||||
--wikidata-ids <WIKIDATA_IDS>
|
||||
--osm-tags <FILE.tsv>
|
||||
Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
|
||||
This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
|
||||
--wikidata-qids <FILE>
|
||||
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
|
||||
|
||||
--wikipedia-urls <WIKIPEDIA_URLS>
|
||||
--wikipedia-urls <FILE>
|
||||
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
|
||||
```
|
||||
|
||||
It takes as inputs:
|
||||
- A wikidata enterprise JSON dump, extracted and connected to `stdin`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
- A directory to write the extracted articles to, as a CLI argument.
|
||||
- Any number of filters passed:
|
||||
- A TSV file of wikidata qids and wikipedia urls, created by the `get-tags` command or `osmconvert`, passed as the CLI flag `--osm-tags`.
|
||||
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
|
||||
- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`.
|
||||
|
||||
As an example of manual usage with the map generator:
|
||||
- Assuming this program is installed to `$PATH` as `om-wikiparser`.
|
||||
|
@ -74,7 +97,7 @@ As an example of manual usage with the map generator:
|
|||
|
||||
```shell
|
||||
# Transform intermediate files from generator.
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_qids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -83,9 +106,38 @@ export RUST_LOG=om_wikiparser=debug
|
|||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids wikidata_qids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
||||
Alternatively, extract the tags directly from a `.osm.pbf` file (referenced here as `planet-latest.osm.pbf`):
|
||||
```shell
|
||||
# Extract tags
|
||||
om-wikiparser get-tags planet-latest.osm.pbf > osm_tags.tsv
|
||||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
# Extract discovered QIDs.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
tar xzf $dump | om-wikiparser get-articles \
|
||||
--wikidata-ids new_qids.txt \
|
||||
descriptions/
|
||||
done
|
||||
```
|
||||
|
|
|
@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
|
|||
extern crate om_wikiparser;
|
||||
extern crate test;
|
||||
|
||||
![]() I'm not sure, renaming it to the shorter I'm not sure, renaming it to the shorter `Title` must have altered `rustfmt`'s heuristics.
|
||||
use om_wikiparser::wm::{Qid, Title};
|
||||
|
||||
const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
|
||||
const QID: &str = "Q123456789";
|
||||
|
||||
#[bench]
|
||||
fn parse_wikipedia(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
|
||||
"https://en.wikipedia.org/wiki/Article_Title",
|
||||
)
|
||||
.unwrap();
|
||||
Title::from_url(TITLE).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn hash_wikipedia(b: &mut test::Bencher) {
|
||||
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
|
||||
"https://en.wikipedia.org/wiki/Article_Title",
|
||||
)
|
||||
.unwrap();
|
||||
let title = Title::from_url(TITLE).unwrap();
|
||||
let mut set = HashSet::new();
|
||||
b.iter(|| {
|
||||
set.insert(&title);
|
||||
|
@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
|
|||
#[bench]
|
||||
fn parse_wikidata(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
|
||||
Qid::from_str(QID).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn hash_wikidata(b: &mut test::Bencher) {
|
||||
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
|
||||
let qid = Qid::from_str(QID).unwrap();
|
||||
let mut set = HashSet::new();
|
||||
b.iter(|| {
|
||||
set.insert(&qid);
|
||||
|
|
41
run.sh
|
@ -1,17 +1,16 @@
|
|||
#! /usr/bin/env bash
|
||||
# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <OSM_FILE.osm.pbf> <DUMP_FILE.json.tar.gz> [<DUMP_FILE.json.tar.gz>...]
|
||||
|
||||
A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.
|
||||
|
||||
Arguments:
|
||||
<BUILD_DIR> An existing directory to place descriptions in.
|
||||
The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
|
||||
maps generator must be placed in this directory before running.
|
||||
The extracted articles will be placed in a `descriptions`
|
||||
subdirectory within this directory.
|
||||
The `intermediate_data` subfolder of a maps build directory may
|
||||
be used for this. The same folder may be used for multiple runs.
|
||||
<OSM_FILE> An OpenStreetMap dump in PBF format to extract tags from.
|
||||
<DUMP_FILE> A wikipedia enterprise html dump. These take the form of
|
||||
`enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
|
||||
dumps in the same language SHOULD NOT be provided, and will
|
||||
|
@ -21,7 +20,7 @@ Options:
|
|||
-h Print this help screen
|
||||
|
||||
1. Builds wikiparser.
|
||||
2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
|
||||
2. Extracts wikidata qids and wikipedia urls from OpenStreetMap pbf file (NOTE: this spawns as many threads as there are cores).
|
||||
3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump files).
|
||||
|
||||
For information on running the wikiparser manually, see README.md.
|
||||
|
@ -43,8 +42,8 @@ do
|
|||
done
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
if [ -z "${2-}" ]; then
|
||||
echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
|
||||
if [ -z "${3-}" ]; then
|
||||
echo "BUILD_DIR, OSM_FILE, and at least one DUMP_FILE are required" >&2
|
||||
echo -n "$USAGE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
@ -58,6 +57,13 @@ if [ ! -d "$BUILD_DIR" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
OSM_FILE=$(readlink -f -- "$1")
|
||||
shift
|
||||
if [ ! -f "$OSM_FILE" ]; then
|
||||
echo "OSM_FILE '$OSM_FILE' does not exist or is not a file" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DUMP_FILES=()
|
||||
while (( $# > 0 )); do
|
||||
dump_file="$(readlink -f -- "$1")"
|
||||
|
@ -91,16 +97,8 @@ wikiparser=$(pwd)/target/release/om-wikiparser
|
|||
log "Changing to maps build dir '$BUILD_DIR'"
|
||||
cd "$BUILD_DIR"
|
||||
|
||||
log "Transforming intermediate generator data"
|
||||
for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
|
||||
if [ ! -e "$intermediate_file" ]; then
|
||||
echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
log "Extracting tags from '$OSM_FILE'"
|
||||
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
|
||||
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
|
@ -129,10 +127,9 @@ trap 'kill_jobs' SIGINT SIGTERM EXIT
|
|||
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
log "Extracting '$dump'"
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids wikidata_ids.txt \
|
||||
--wikipedia-urls wikipedia_urls.txt \
|
||||
--write-new-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--osm-tags osm_tags.tsv \
|
||||
--write-new-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
@ -142,8 +139,8 @@ log "Beginning extraction of discovered QIDs"
|
|||
|
||||
# Extract new qids from other dumps in parallel.
|
||||
for dump in "${DUMP_FILES[@]}"; do
|
||||
tar xzOf "$dump" | "$wikiparser" \
|
||||
--wikidata-ids new_qids.txt \
|
||||
tar xzOf "$dump" | "$wikiparser" get-articles \
|
||||
--wikidata-qids new_qids.txt \
|
||||
"$OUTPUT_DIR" &
|
||||
done
|
||||
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
//! Apply html article simplification to stdin, and write it to stdout.
|
||||
//!
|
||||
//! Usage:
|
||||
//! simplify_html < article.html > simplified.html
|
||||
use std::io::{stdin, stdout, Read, Write};
|
||||
|
||||
use om_wikiparser::html::simplify;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let output = simplify(&input, "en");
|
||||
|
||||
stdout().write_all(output.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
288
src/get_articles.rs
Normal file
|
@ -0,0 +1,288 @@
|
|||
use std::{
|
||||
fs::{self, File},
|
||||
io::{stdin, BufRead, Write},
|
||||
os::unix,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use om_wikiparser::{
|
||||
html::simplify,
|
||||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
|
||||
};
|
||||
|
||||
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
#[derive(clap::Args)]
|
||||
pub struct Args {
|
||||
/// Directory to write the extracted articles to.
|
||||
pub output_dir: PathBuf,
|
||||
|
||||
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
///
|
||||
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE.tsv")]
|
||||
pub osm_tags: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikidata QID to extract on each line
|
||||
/// (e.g. `Q12345`).
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikidata_qids: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikipedia article url to extract on each line
|
||||
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
|
||||
#[arg(long, help_heading = "FILTERS", value_name = "FILE")]
|
||||
pub wikipedia_urls: Option<PathBuf>,
|
||||
|
||||
/// Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
///
|
||||
/// Use this to save the QIDs of articles you know the url of, but not the QID.
|
||||
/// The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump.
|
||||
/// Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
pub write_new_qids: Option<PathBuf>,
|
||||
}
|
||||
|
||||
pub fn run(args: Args) -> anyhow::Result<()> {
|
||||
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
|
||||
info!("Loading article urls from {path:?}");
|
||||
parse_wikipedia_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
|
||||
info!("Loading wikidata QIDs from {path:?}");
|
||||
parse_wikidata_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
if let Some(ref path) = args.osm_tags {
|
||||
info!("Loading wikipedia/wikidata osm tags from {path:?}");
|
||||
|
||||
let original_items = wikidata_qids.len() + wikipedia_titles.len();
|
||||
let mut line_errors = Vec::new();
|
||||
parse_osm_tag_file(
|
||||
path,
|
||||
&mut wikidata_qids,
|
||||
&mut wikipedia_titles,
|
||||
Some(&mut line_errors),
|
||||
)?;
|
||||
|
||||
if !line_errors.is_empty() {
|
||||
let error_count = line_errors.len();
|
||||
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
|
||||
let expected_threshold = 0.02;
|
||||
let percentage = 100.0 * error_count as f64 / new_items as f64;
|
||||
let level = if percentage >= expected_threshold {
|
||||
log::Level::Error
|
||||
} else {
|
||||
log::Level::Info
|
||||
};
|
||||
|
||||
log!(
|
||||
level,
|
||||
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Parsed {} unique article titles", wikipedia_titles.len());
|
||||
debug!("Parsed {} unique wikidata QIDs", wikidata_qids.len());
|
||||
|
||||
// NOTE: For atomic writes to the same file across threads/processes:
|
||||
// - The file needs to be opened in APPEND mode (`.append(true)`).
|
||||
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
|
||||
// - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
|
||||
//
|
||||
// For more information, see:
|
||||
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
|
||||
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
|
||||
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
|
||||
let mut write_new_qids = args
|
||||
.write_new_qids
|
||||
.as_ref()
|
||||
.map(|p| File::options().create(true).append(true).open(p))
|
||||
.transpose()?;
|
||||
|
||||
if !args.output_dir.is_dir() {
|
||||
bail!("output dir {:?} does not exist", args.output_dir)
|
||||
}
|
||||
|
||||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
|
||||
});
|
||||
|
||||
for page in stream {
|
||||
let page = page?;
|
||||
|
||||
let qid = page.wikidata();
|
||||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_qids.contains(qid))
|
||||
.unwrap_or_default();
|
||||
|
||||
let matching_titles = if wikipedia_titles.is_empty() {
|
||||
Default::default()
|
||||
![]() What is the benefit of hiding errors under a threshold? Isn't it beneficial to see all errors and be able to estimate/compare the quality of the dump, and to easily grep/find what is most important, or feed the whole log to contributors for fixes? What is the benefit of hiding errors under a threshold? Isn't it beneficial to see all errors and be able to estimate/compare the quality of the dump, and to easily grep/find what is most important, or feed the whole log to contributors for fixes?
![]() The threshold only determines if the message is I'm open to other ideas. The threshold only determines if the message is `info` vs `error` level.
When you use the `run.sh` script with multiple languages it prints a copy of the hundreds of errors for each language.
I think writing the parse errors to a file separately will be easier to read and deal with.
I'm open to other ideas.
|
||||
} else {
|
||||
page.all_titles()
|
||||
.filter_map(|r| {
|
||||
r.map(Some).unwrap_or_else(|e| {
|
||||
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
|
||||
None
|
||||
})
|
||||
})
|
||||
.filter(|t| wikipedia_titles.contains(t))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
if !is_wikidata_match && matching_titles.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Write matched new QIDs back to file.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_qids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
write!(f, "{}", line).with_context(|| {
|
||||
format!(
|
||||
"writing new QID to file {:?}",
|
||||
args.write_new_qids.as_ref().unwrap()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
|
||||
fn create_article_dir(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = Title>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let base = base.as_ref();
|
||||
let mut redirects = redirects.into_iter();
|
||||
|
||||
let main_dir = match page.wikidata() {
|
||||
None => {
|
||||
// Write to wikipedia title directory.
|
||||
// Prefer first redirect, fall back to page title if none exist
|
||||
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
|
||||
redirects
|
||||
.next()
|
||||
.or_else(|| match page.title() {
|
||||
Ok(title) => Some(title),
|
||||
Err(e) => {
|
||||
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
|
||||
None
|
||||
}
|
||||
})
|
||||
// hard fail when no titles can be parsed
|
||||
.ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
|
||||
.get_dir(base.to_owned())
|
||||
}
|
||||
Some(qid) => {
|
||||
// Otherwise use wikidata as main directory and symlink from wikipedia titles.
|
||||
qid.get_dir(base.to_owned())
|
||||
}
|
||||
};
|
||||
|
||||
if main_dir.is_symlink() {
|
||||
fs::remove_file(&main_dir)
|
||||
.with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
|
||||
}
|
||||
fs::create_dir_all(&main_dir)
|
||||
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
|
||||
|
||||
// Write symlinks to main directory.
|
||||
for title in redirects {
|
||||
let wikipedia_dir = title.get_dir(base.to_owned());
|
||||
|
||||
// Build required directory.
|
||||
//
|
||||
// Possible states from previous run:
|
||||
// - Does not exist (and is not a symlink)
|
||||
// - Exists, is a directory
|
||||
// - Exists, is a valid symlink to correct location
|
||||
// - Exists, is a valid symlink to incorrect location
|
||||
if wikipedia_dir.exists() {
|
||||
if wikipedia_dir.is_symlink() {
|
||||
// Only replace if not valid
|
||||
if fs::read_link(&wikipedia_dir)? == main_dir {
|
||||
continue;
|
||||
}
|
||||
fs::remove_file(&wikipedia_dir)?;
|
||||
} else {
|
||||
fs::remove_dir_all(&wikipedia_dir)?;
|
||||
}
|
||||
} else {
|
||||
// titles can contain `/`, so ensure necessary subdirs exist
|
||||
let parent_dir = wikipedia_dir.parent().unwrap();
|
||||
fs::create_dir_all(parent_dir)
|
||||
.with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
|
||||
}
|
||||
|
||||
unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
|
||||
format!(
|
||||
"creating symlink from {:?} to {:?}",
|
||||
wikipedia_dir, main_dir
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(main_dir)
|
||||
}
|
||||
|
||||
/// Write selected article to disk.
|
||||
///
|
||||
/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
|
||||
/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
|
||||
/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
|
||||
fn write(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = Title>,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
|
||||
// Write html to determined file.
|
||||
let mut filename = article_dir;
|
||||
filename.push(&page.in_language.identifier);
|
||||
filename.set_extension("html");
|
||||
|
||||
debug!("{:?}: {:?}", page.name, filename);
|
||||
|
||||
if filename.exists() {
|
||||
debug!("Overwriting existing file");
|
||||
}
|
||||
|
||||
let html = simplify(&page.article_body.html, &page.in_language.identifier);
|
||||
|
||||
let mut file =
|
||||
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
|
||||
file.write_all(html.as_bytes())
|
||||
.with_context(|| format!("writing html file {:?}", filename))?;
|
||||
|
||||
Ok(())
|
||||
}
|
93
src/get_tags.rs
Normal file
|
@ -0,0 +1,93 @@
|
|||
use std::{
|
||||
io::{stdout, Read},
|
||||
sync::mpsc,
|
||||
thread,
|
||||
};
|
||||
|
||||
use osmpbf::{BlobDecode, BlobReader, Element};
|
||||
use rayon::prelude::*;
|
||||
|
||||
struct Record {
|
||||
id: String,
|
||||
wikidata: String,
|
||||
wikipedia: String,
|
||||
}
|
||||
|
||||
/// Extract matching tags from an osm pbf file and write to stdout in TSV.
|
||||
pub fn run(pbf: impl Read + Send) -> anyhow::Result<()> {
|
||||
let reader = BlobReader::new(pbf);
|
||||
|
||||
let (send, recv) = mpsc::sync_channel(128);
|
||||
let writer_thread = thread::Builder::new()
|
||||
.name("writer".to_string())
|
||||
.spawn(move || write(recv))?;
|
||||
|
||||
reader
|
||||
.par_bridge()
|
||||
.try_for_each(move |blob| -> anyhow::Result<()> {
|
||||
// Based on `osmpbf` implementation of `ElementReader`.
|
||||
let BlobDecode::OsmData(block) = blob?.decode()? else { return Ok(()) };
|
||||
for record in block.elements().filter_map(extract_tags) {
|
||||
send.send(record)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let record_count = writer_thread.join().unwrap()?;
|
||||
info!("Finished processing {record_count} records");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write(recv: mpsc::Receiver<Record>) -> anyhow::Result<usize> {
|
||||
let mut output = csv::WriterBuilder::new()
|
||||
.delimiter(b'\t')
|
||||
.from_writer(stdout().lock());
|
||||
output.write_record(["@id", "wikidata", "wikipedia"])?;
|
||||
|
||||
let mut count = 0;
|
||||
|
||||
for Record {
|
||||
id,
|
||||
wikidata,
|
||||
wikipedia,
|
||||
} in recv
|
||||
{
|
||||
output.write_record([id, wikidata, wikipedia])?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
fn extract_tags(el: Element) -> Option<Record> {
|
||||
match el {
|
||||
Element::Node(n) => make_record(n.id(), n.tags()),
|
||||
Element::DenseNode(n) => make_record(n.id(), n.tags()),
|
||||
Element::Way(w) => make_record(w.id(), w.tags()),
|
||||
Element::Relation(r) => make_record(r.id(), r.tags()),
|
||||
}
|
||||
}
|
||||
|
||||
fn make_record<'i>(id: i64, tags: impl 'i + Iterator<Item = (&'i str, &'i str)>) -> Option<Record> {
|
||||
let mut wikipedia = String::new();
|
||||
let mut wikidata = String::new();
|
||||
|
||||
for (key, value) in tags {
|
||||
match key {
|
||||
"wikipedia" => wikipedia = value.trim().to_owned(),
|
||||
"wikidata" => wikidata = value.trim().to_owned(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if wikidata.is_empty() && wikipedia.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Record {
|
||||
id: id.to_string(),
|
||||
wikipedia,
|
||||
wikidata,
|
||||
})
|
||||
}
|
364
src/main.rs
|
@ -1,19 +1,102 @@
|
|||
use std::{
|
||||
fs::{self, File},
|
||||
io::{stdin, BufRead, Write},
|
||||
os::unix,
|
||||
path::{Path, PathBuf},
|
||||
fs::File,
|
||||
io::{stdin, stdout, BufReader, Read, Write},
|
||||
num::NonZeroUsize,
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use clap::{CommandFactory, Parser};
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
use om_wikiparser::{
|
||||
html::simplify,
|
||||
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
|
||||
};
|
||||
mod get_articles;
|
||||
mod get_tags;
|
||||
|
||||
/// Extract articles from Wikipedia Enterprise HTML dumps.
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about, version = crate::version())]
|
||||
struct Args {
|
||||
#[command(subcommand)]
|
||||
cmd: Cmd,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Cmd {
|
||||
GetArticles(get_articles::Args),
|
||||
|
||||
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
|
||||
///
|
||||
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
|
||||
GetTags {
|
||||
/// The `.osm.pbf` file to use.
|
||||
pbf_file: PathBuf,
|
||||
|
||||
/// The number of threads to spawn to parse and decompress the pbf file.
|
||||
///
|
||||
/// Defaults to the number of cores.
|
||||
#[arg(short, long)]
|
||||
procs: Option<NonZeroUsize>,
|
||||
},
|
||||
|
||||
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
|
||||
///
|
||||
/// This is meant for testing and debugging.
|
||||
Simplify {
|
||||
/// The language to use when processing the article (defaults to `en`).
|
||||
#[arg(long, default_value_t = String::from("en"))]
|
||||
lang: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
info!("{} {}", Args::command().get_name(), version());
|
||||
|
||||
match args.cmd {
|
||||
Cmd::GetArticles(args) => {
|
||||
if args.wikidata_qids.is_none()
|
||||
&& args.wikipedia_urls.is_none()
|
||||
&& args.osm_tags.is_none()
|
||||
{
|
||||
let mut cmd = Args::command();
|
||||
cmd.error(
|
||||
clap::error::ErrorKind::MissingRequiredArgument,
|
||||
"at least one of --osm-tags --wikidata-qids --wikipedia-urls is required",
|
||||
)
|
||||
.exit()
|
||||
}
|
||||
|
||||
get_articles::run(args)
|
||||
}
|
||||
Cmd::GetTags { pbf_file, procs } => {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
.thread_name(|num| format!("worker{num}"))
|
||||
.num_threads(procs.map(usize::from).unwrap_or_default())
|
||||
.build_global()?;
|
||||
|
||||
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
|
||||
get_tags::run(pbf_file)
|
||||
}
|
||||
Cmd::Simplify { lang } => {
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let output = om_wikiparser::html::simplify(&input, &lang);
|
||||
|
||||
stdout().write_all(output.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the version returned by `git describe`, e.g.:
|
||||
/// - `v2.0` if a git tag
|
||||
|
@ -27,264 +110,3 @@ fn version() -> &'static str {
|
|||
.or(option_env!("CARGO_PKG_VERSION"))
|
||||
.unwrap_or("unknown")
|
||||
}
|
||||
|
||||
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump connected to stdin.
|
||||
#[derive(Parser)]
|
||||
#[command(version = crate::version())]
|
||||
struct Args {
|
||||
/// Directory to write the extracted articles to.
|
||||
output_dir: PathBuf,
|
||||
|
||||
/// Path to file that contains a Wikidata QID to extract on each line
|
||||
/// (e.g. `Q12345`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
wikidata_ids: Option<PathBuf>,
|
||||
|
||||
/// Path to file that contains a Wikipedia article url to extract on each line
|
||||
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
|
||||
#[arg(long, help_heading = "FILTERS")]
|
||||
wikipedia_urls: Option<PathBuf>,
|
||||
|
||||
/// Append to the provided file path the QIDs of articles matched by title but not QID.
|
||||
///
|
||||
/// Use this to save the QIDs of articles you know the url of, but not the QID.
|
||||
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
|
||||
/// Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, requires("wikipedia_urls"))]
|
||||
write_new_ids: Option<PathBuf>,
|
||||
}
|
||||
|
||||
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
|
||||
fn create_article_dir(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let base = base.as_ref();
|
||||
let mut redirects = redirects.into_iter();
|
||||
|
||||
let main_dir = match page.wikidata() {
|
||||
None => {
|
||||
// Write to wikipedia title directory.
|
||||
// Prefer first redirect, fall back to page title if none exist
|
||||
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
|
||||
redirects
|
||||
.next()
|
||||
.or_else(|| match page.title() {
|
||||
Ok(title) => Some(title),
|
||||
Err(e) => {
|
||||
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
|
||||
None
|
||||
}
|
||||
})
|
||||
// hard fail when no titles can be parsed
|
||||
.ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
|
||||
.get_dir(base.to_owned())
|
||||
}
|
||||
Some(qid) => {
|
||||
// Otherwise use wikidata as main directory and symlink from wikipedia titles.
|
||||
qid.get_dir(base.to_owned())
|
||||
}
|
||||
};
|
||||
|
||||
if main_dir.is_symlink() {
|
||||
fs::remove_file(&main_dir)
|
||||
.with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
|
||||
}
|
||||
fs::create_dir_all(&main_dir)
|
||||
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
|
||||
|
||||
// Write symlinks to main directory.
|
||||
for title in redirects {
|
||||
let wikipedia_dir = title.get_dir(base.to_owned());
|
||||
|
||||
// Build required directory.
|
||||
//
|
||||
// Possible states from previous run:
|
||||
// - Does not exist (and is not a symlink)
|
||||
// - Exists, is a directory
|
||||
// - Exists, is a valid symlink to correct location
|
||||
// - Exists, is a valid symlink to incorrect location
|
||||
if wikipedia_dir.exists() {
|
||||
if wikipedia_dir.is_symlink() {
|
||||
// Only replace if not valid
|
||||
if fs::read_link(&wikipedia_dir)? == main_dir {
|
||||
continue;
|
||||
}
|
||||
fs::remove_file(&wikipedia_dir)?;
|
||||
} else {
|
||||
fs::remove_dir_all(&wikipedia_dir)?;
|
||||
}
|
||||
} else {
|
||||
// titles can contain `/`, so ensure necessary subdirs exist
|
||||
let parent_dir = wikipedia_dir.parent().unwrap();
|
||||
fs::create_dir_all(parent_dir)
|
||||
.with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
|
||||
}
|
||||
|
||||
unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
|
||||
format!(
|
||||
"creating symlink from {:?} to {:?}",
|
||||
wikipedia_dir, main_dir
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(main_dir)
|
||||
}
|
||||
|
||||
/// Write selected article to disk.
|
||||
///
|
||||
/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
|
||||
/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
|
||||
/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
|
||||
fn write(
|
||||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
|
||||
// Write html to determined file.
|
||||
let mut filename = article_dir;
|
||||
filename.push(&page.in_language.identifier);
|
||||
filename.set_extension("html");
|
||||
|
||||
debug!("{:?}: {:?}", page.name, filename);
|
||||
|
||||
if filename.exists() {
|
||||
debug!("Overwriting existing file");
|
||||
}
|
||||
|
||||
let html = simplify(&page.article_body.html, &page.in_language.identifier);
|
||||
|
||||
let mut file =
|
||||
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
|
||||
file.write_all(html.as_bytes())
|
||||
.with_context(|| format!("writing html file {:?}", filename))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
|
||||
let mut cmd = Args::command();
|
||||
cmd.error(
|
||||
clap::error::ErrorKind::MissingRequiredArgument,
|
||||
"one or both of --wikidata-ids and --wikipedia-urls is required",
|
||||
)
|
||||
.exit()
|
||||
}
|
||||
|
||||
info!("{} {}", Args::command().get_name(), version());
|
||||
|
||||
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
|
||||
info!("Loading article urls from {path:?}");
|
||||
let urls = parse_wikipedia_file(path)?;
|
||||
debug!("Parsed {} unique article urls", urls.len());
|
||||
urls
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
let wikidata_ids = if let Some(path) = args.wikidata_ids {
|
||||
info!("Loading wikidata ids from {path:?}");
|
||||
let ids = parse_wikidata_file(path)?;
|
||||
debug!("Parsed {} unique wikidata ids", ids.len());
|
||||
ids
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
// NOTE: For atomic writes to the same file across threads/processes:
|
||||
// - The file needs to be opened in APPEND mode (`.append(true)`).
|
||||
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
|
||||
// - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
|
||||
//
|
||||
// For more information, see:
|
||||
// - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
|
||||
// - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
|
||||
// - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
|
||||
let mut write_new_ids = args
|
||||
.write_new_ids
|
||||
.as_ref()
|
||||
.map(|p| File::options().create(true).append(true).open(p))
|
||||
.transpose()?;
|
||||
|
||||
if !args.output_dir.is_dir() {
|
||||
bail!("output dir {:?} does not exist", args.output_dir)
|
||||
}
|
||||
|
||||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
|
||||
});
|
||||
|
||||
for page in stream {
|
||||
let page = page?;
|
||||
|
||||
let qid = page.wikidata();
|
||||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_ids.contains(qid))
|
||||
.unwrap_or_default();
|
||||
|
||||
let matching_titles = if wikipedia_titles.is_empty() {
|
||||
Default::default()
|
||||
} else {
|
||||
page.all_titles()
|
||||
.filter_map(|r| {
|
||||
r.map(Some).unwrap_or_else(|e| {
|
||||
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
|
||||
None
|
||||
})
|
||||
})
|
||||
.filter(|t| wikipedia_titles.contains(t))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
if !is_wikidata_match && matching_titles.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Write matched new QIDs back to fild.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_ids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
write!(f, "{}", line).with_context(|| {
|
||||
format!(
|
||||
"writing new id to file {:?}",
|
||||
args.write_new_ids.as_ref().unwrap()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
260
src/wm/mod.rs
|
@ -1,24 +1,23 @@
|
|||
//! Wikimedia types
|
||||
use std::{
|
||||
collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
|
||||
str::FromStr,
|
||||
};
|
||||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use url::Url;
|
||||
|
||||
mod page;
|
||||
pub use page::Page;
|
||||
mod title;
|
||||
pub use title::*;
|
||||
mod qid;
|
||||
pub use qid::*;
|
||||
|
||||
/// Read from a file of urls on each line.
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
WikidataQid::from_str(line).with_context(|| {
|
||||
Qid::from_str(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
|
@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Wi
|
|||
}
|
||||
|
||||
/// Read article titles from a file of urls on each line.
|
||||
pub fn parse_wikipedia_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
|
||||
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
WikipediaTitleNorm::from_url(line).with_context(|| {
|
||||
Title::from_url(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
|
@ -57,147 +54,120 @@ pub fn parse_wikipedia_file(
|
|||
.collect())
|
||||
}
|
||||
|
||||
/// Wikidata QID/Q Number
|
||||
///
|
||||
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
|
||||
///
|
||||
/// ```
|
||||
/// use std::str::FromStr;
|
||||
/// use om_wikiparser::wm::WikidataQid;
|
||||
///
|
||||
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
|
||||
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
|
||||
/// assert_eq!(with_q, without_q);
|
||||
///
|
||||
/// assert!(WikidataQid::from_str("q12345").is_ok());
|
||||
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
|
||||
/// assert!(WikidataQid::from_str("Article_Title").is_err());
|
||||
/// assert!(WikidataQid::from_str("Q").is_err());
|
||||
/// assert!(WikidataQid::from_str("").is_err());
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct WikidataQid(u32);
|
||||
pub fn parse_osm_tag_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
|
||||
impl FromStr for WikidataQid {
|
||||
type Err = ParseIntError;
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
};
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.trim();
|
||||
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
|
||||
u32::from_str(s).map(WikidataQid)
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
match title {
|
||||
"wikidata" => qid_col = Some(column),
|
||||
"wikipedia" => title_col = Some(column),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
|
||||
let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
|
||||
|
||||
let mut row = csv::StringRecord::new();
|
||||
loop {
|
||||
match rdr.read_record(&mut row) {
|
||||
Ok(true) => {}
|
||||
// finished
|
||||
Ok(false) => break,
|
||||
// attempt to recover from parsing errors
|
||||
Err(e) => {
|
||||
if e.is_io_error() {
|
||||
bail!(e)
|
||||
}
|
||||
push_error(ParseLineError {
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let qid = &row[qid_col].trim();
|
||||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
}
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
let title = &row[title_col].trim();
|
||||
if !title.is_empty() {
|
||||
match Title::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Display for WikidataQid {
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
#[error("bad title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
#[error("bad QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
#[error("bad TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
text: String,
|
||||
line: u64,
|
||||
kind: ParseErrorKind,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Q{}", self.0)
|
||||
// write source chain to ensure they are logged
|
||||
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
source = e.source();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl WikidataQid {
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
path.push("wikidata");
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(self.to_string());
|
||||
|
||||
path
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalized wikipedia article title that can compare:
|
||||
/// - titles `Spatial Database`
|
||||
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
|
||||
/// - osm-style tags `en:Spatial Database`
|
||||
///
|
||||
/// ```
|
||||
/// use om_wikiparser::wm::WikipediaTitleNorm;
|
||||
///
|
||||
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
|
||||
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// assert_eq!(url, title);
|
||||
///
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
|
||||
///
|
||||
/// assert!(
|
||||
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
|
||||
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
|
||||
/// );
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct WikipediaTitleNorm {
|
||||
lang: String,
|
||||
name: String,
|
||||
}
|
||||
|
||||
impl WikipediaTitleNorm {
|
||||
fn normalize_title(title: &str) -> String {
|
||||
// TODO: Compare with map generator url creation, ensure covers all cases.
|
||||
title.trim().replace(' ', "_")
|
||||
}
|
||||
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
pub fn from_url(url: &str) -> anyhow::Result<Self> {
|
||||
let url = Url::parse(url.trim())?;
|
||||
|
||||
let (subdomain, host) = url
|
||||
.host_str()
|
||||
.ok_or_else(|| anyhow!("Expected host"))?
|
||||
.split_once('.')
|
||||
.ok_or_else(|| anyhow!("Expected subdomain"))?;
|
||||
if host != "wikipedia.org" {
|
||||
bail!("Expected wikipedia.org for domain")
|
||||
}
|
||||
let lang = subdomain;
|
||||
|
||||
let path = url.path();
|
||||
|
||||
let (root, title) = path
|
||||
.strip_prefix('/')
|
||||
.unwrap_or(path)
|
||||
.split_once('/')
|
||||
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
|
||||
|
||||
if root != "wiki" {
|
||||
bail!("Expected 'wiki' as root path, got: {:?}", root)
|
||||
}
|
||||
let title = urlencoding::decode(title)?;
|
||||
|
||||
Self::from_title(&title, lang)
|
||||
}
|
||||
|
||||
// en:Article Title
|
||||
fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
|
||||
let (lang, title) = tag
|
||||
.trim()
|
||||
.split_once(':')
|
||||
.ok_or_else(|| anyhow!("Expected ':'"))?;
|
||||
|
||||
Self::from_title(title, lang)
|
||||
}
|
||||
|
||||
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
|
||||
let title = title.trim();
|
||||
let lang = lang.trim();
|
||||
if title.is_empty() {
|
||||
bail!("title cannot be empty or whitespace");
|
||||
}
|
||||
if lang.is_empty() {
|
||||
bail!("lang cannot be empty or whitespace");
|
||||
}
|
||||
let name = Self::normalize_title(title);
|
||||
let lang = lang.to_owned();
|
||||
Ok(Self { name, lang })
|
||||
}
|
||||
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(format!("{}.wikipedia.org", self.lang));
|
||||
path.push("wiki");
|
||||
path.push(&self.name);
|
||||
|
||||
path
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// return nothing b/c Display prints source chain
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
use std::{iter, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use serde::Deserialize;
|
||||
|
||||
use super::{WikidataQid, WikipediaTitleNorm};
|
||||
use super::{Qid, Title};
|
||||
|
||||
// TODO: consolidate into single struct
|
||||
/// Deserialized Wikimedia Enterprise API Article
|
||||
|
@ -25,27 +26,29 @@ pub struct Page {
|
|||
}
|
||||
|
||||
impl Page {
|
||||
pub fn wikidata(&self) -> Option<WikidataQid> {
|
||||
pub fn wikidata(&self) -> Option<Qid> {
|
||||
// TODO: return error
|
||||
self.main_entity
|
||||
.as_ref()
|
||||
.map(|e| WikidataQid::from_str(&e.identifier).unwrap())
|
||||
.map(|e| Qid::from_str(&e.identifier).unwrap())
|
||||
}
|
||||
|
||||
/// Title of the article
|
||||
pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
|
||||
WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
|
||||
pub fn title(&self) -> anyhow::Result<Title> {
|
||||
Title::from_title(&self.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad title {:?}", self.name))
|
||||
}
|
||||
|
||||
/// All titles that lead to the article, the main title followed by any redirects.
|
||||
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
|
||||
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
|
||||
iter::once(self.title()).chain(self.redirects())
|
||||
}
|
||||
|
||||
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
|
||||
self.redirects
|
||||
.iter()
|
||||
.map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
|
||||
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
|
||||
self.redirects.iter().map(|r| {
|
||||
Title::from_title(&r.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad redirect {:?}", self.name))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
64
src/wm/qid.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
|
||||
|
||||
/// Wikidata QID/Q Number
|
||||
///
|
||||
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
|
||||
///
|
||||
/// ```
|
||||
/// use std::str::FromStr;
|
||||
/// use om_wikiparser::wm::Qid;
|
||||
///
|
||||
/// let with_q = Qid::from_str("Q12345").unwrap();
|
||||
/// let without_q = Qid::from_str(" 12345 ").unwrap();
|
||||
/// assert_eq!(with_q, without_q);
|
||||
///
|
||||
/// assert!(Qid::from_str("q12345").is_ok());
|
||||
/// assert!(Qid::from_str("https://wikidata.org/wiki/Q12345").is_err());
|
||||
/// assert!(Qid::from_str("Article_Title").is_err());
|
||||
/// assert!(Qid::from_str("Q").is_err());
|
||||
/// assert!(Qid::from_str("").is_err());
|
||||
/// ```
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct Qid(u32);
|
||||
|
||||
impl FromStr for Qid {
|
||||
type Err = ParseQidError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.trim();
|
||||
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
|
||||
u32::from_str(s).map(Qid).map_err(ParseQidError)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Qid {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Q{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Qid {
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
let mut path = base;
|
||||
path.push("wikidata");
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
path.push(self.to_string());
|
||||
|
||||
path
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct ParseQidError(ParseIntError);
|
||||
|
||||
impl Display for ParseQidError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for ParseQidError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
self.0.source()
|
||||
}
|
||||
}
|
150
src/wm/title.rs
Normal file
|
@ -0,0 +1,150 @@
|
|||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
use std::{fmt::Display, path::PathBuf, string::FromUtf8Error};
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
use url::Url;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Normalized wikipedia article title that can compare:
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - titles `Spatial Database`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// - osm-style tags `en:Spatial Database`
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// ```
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// use om_wikiparser::wm::Title;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let title = Title::from_title("Article Title", "en").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url = Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let mobile = Title::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url_tag1 = Title::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// let url_tag2 = Title::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, mobile);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, url_tag1);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert_eq!(url, url_tag2);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(Title::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(Title::from_url("https://wikidata.org/wiki/Q12345").is_err());
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
///
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// assert!(
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Title::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// Title::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// );
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
/// ```
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub struct Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
lang: String,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
name: String,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
impl Display for Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
write!(f, "{}:{}", self.lang, self.name)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
impl Title {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
fn normalize_title(title: &str) -> String {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// TODO: Compare with map generator url creation, ensure covers all cases.
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
title.trim().replace(' ', "_")
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let url = Url::parse(url.trim())?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (subdomain, host) = url
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.host_str()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::NoHost)?
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once('.')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::NoSubdomain)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let host = host.strip_prefix("m.").unwrap_or(host);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if host != "wikipedia.org" {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::BadDomain);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = subdomain;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let path = url.path();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (root, title) = path
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.strip_prefix('/')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.unwrap_or(path)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once('/')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::ShortPath)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if root != "wiki" {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::BadPath);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = urlencoding::decode(title)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Self::from_title(&title, lang)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// en:Article Title
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let (lang, title) = tag
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.trim()
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.split_once(':')
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
.ok_or(ParseTitleError::MissingColon)?;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.trim_start();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = title.trim_start();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if matches!(lang, "http" | "https") {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Self::from_url(tag);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if title.starts_with("http://") || title.starts_with("https://") {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Self::from_url(title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Self::from_title(title, lang)
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let title = title.trim();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.trim();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if title.is_empty() {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::NoTitle);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
if lang.is_empty() {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
return Err(ParseTitleError::NoLang);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let name = Self::normalize_title(title);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let lang = lang.to_owned();
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Ok(Self { name, lang })
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
let mut path = base;
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// TODO: can use as_mut_os_string with 1.70.0
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push(format!("{}.wikipedia.org", self.lang));
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push("wiki");
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path.push(&self.name);
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
path
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
pub enum ParseTitleError {
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("title cannot be empty or whitespace")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoTitle,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("lang cannot be empty or whitespace")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoLang,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no ':' separating lang and title")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
MissingColon,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
// url-specific
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("cannot parse url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
Url(#[from] url::ParseError),
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("cannot decode url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
UrlDecode(#[from] FromUtf8Error),
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no host in url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoHost,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("no subdomain in url")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
NoSubdomain,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("url base domain is wikipedia.org")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
BadDomain,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("url base path is not /wiki/")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
BadPath,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
#[error("path has less than 2 segments")]
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
ShortPath,
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
||||
}
|
||||
![]() Does it make sense to print wrong hosts in a log to fix/support them? Does it make sense to print wrong hosts in a log to fix/support them?
![]() ditto ditto
![]() They are caught at a higher level and logged/saved with the full string They are caught at a higher level and logged/saved with the full string
|
Why is this split across multiple lines instead of written as one?
Could this be extracted into a constant to avoid the copy-paste?