diff --git a/Cargo.lock b/Cargo.lock index c398fbf..40b7ffa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -347,19 +347,6 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" -[[package]] -name = "env_logger" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" -dependencies = [ - "humantime", - "is-terminal", - "log", - "regex", - "termcolor", -] - [[package]] name = "errno" version = "0.3.1" @@ -504,12 +491,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "idna" version = "0.3.0" @@ -559,6 +540,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.147" @@ -613,6 +600,15 @@ dependencies = [ "tendril", ] +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata", +] + [[package]] name = "matches" version = "0.1.10" @@ -673,6 +669,16 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num_cpus" version = "1.16.0" @@ -700,7 +706,6 @@ dependencies = [ "clap", "csv", "ego-tree", - "env_logger", "expect-test", "html5ever", "log", @@ -712,6 +717,8 @@ dependencies = [ "serde", "serde_json", "thiserror", + "tracing", + "tracing-subscriber", "url", "urlencoding", ] @@ -736,6 +743,12 @@ dependencies = [ "rayon", ] +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "parking_lot" version = "0.12.1" @@ -857,6 +870,12 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1073,9 +1092,24 @@ checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.2", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.7.2" @@ -1218,6 +1252,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "siphasher" version = "0.3.10" @@ -1314,15 +1357,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "termcolor" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" -dependencies = [ - "winapi-util", -] - [[package]] name = "thiserror" version = "1.0.44" @@ -1343,6 +1377,16 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -1358,6 +1402,68 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tracing" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "tracing-core" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -1414,6 +1520,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "version_check" version = "0.9.4" @@ -1459,15 +1571,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 7be520a..ce3577d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] } clap = { version = "4.3.2", features = ["derive"] } csv = "1.2.2" ego-tree = "0.6.2" -env_logger = "0.10.0" expect-test = "1.4.1" html5ever = "0.26.0" log = "0.4.18" @@ -25,6 +24,8 @@ scraper = "0.16.0" serde = { version = "1.0.163", features = ["derive"] } serde_json = "1.0.96" thiserror = "1.0.44" +tracing = "0.1.37" +tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } url = "2.3.1" urlencoding = "2.1.2" diff --git a/src/get_articles.rs b/src/get_articles.rs index b77bcad..8730acf 100644 --- a/src/get_articles.rs +++ b/src/get_articles.rs @@ -80,18 +80,8 @@ pub fn run(args: Args) -> anyhow::Result<()> { if !line_errors.is_empty() { let error_count = line_errors.len(); let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items; - let expected_threshold = 0.02; let percentage = 100.0 * error_count as f64 / new_items as f64; - let level = if percentage >= expected_threshold { - log::Level::Error - } else { - log::Level::Info - }; - - log!( - level, - "{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}", - ); + warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",); } } @@ -118,21 +108,38 @@ pub fn run(args: Args) -> anyhow::Result<()> { } info!("Processing dump"); - let dump = stdin().lock(); + let mut dump = stdin().lock(); - // TODO: Compare different deserialization methods. - // The docs warn against using a reader directly, and it's slower than tar can decompress the dump. - // let stream = serde_json::Deserializer::from_reader(dump).into_iter::(); - let stream = dump.lines().map(|r| { - r.map_err(anyhow::Error::new) - .and_then(|s| serde_json::from_str::(&s).map_err(anyhow::Error::new)) - }); + let mut buffer = String::new(); + let mut line = 0; + let mut byte = 1; + loop { + line += 1; + byte += buffer.len(); + buffer.clear(); - for page in stream { - let page = page?; + if 0 == dump.read_line(&mut buffer).context("reading dump")? { + // Reached end of file. + break; + } + + // TODO: Compare different deserialization methods. + // The docs warn against using a reader directly, and it's slower than tar can decompress the dump. + // let stream = serde_json::Deserializer::from_reader(dump).into_iter::(); + let page: Page = serde_json::from_str(&buffer).context("deserializing json")?; + + let span = info_span!( + "page", + lang = page.in_language.identifier, + title = page.name, + url = page.url, + qid = page.main_entity.as_ref().map(|w| &w.identifier), + line, + byte, + ); + let _handle = span.enter(); let qid = page.wikidata(); - let is_wikidata_match = qid .as_ref() .map(|qid| wikidata_qids.contains(qid)) @@ -144,7 +151,7 @@ pub fn run(args: Args) -> anyhow::Result<()> { page.all_titles() .filter_map(|r| { r.map(Some).unwrap_or_else(|e| { - warn!("Could not parse title for {:?}: {:#}", &page.name, e); + warn!("Could not parse title: {:#}", e); None }) }) @@ -159,7 +166,7 @@ pub fn run(args: Args) -> anyhow::Result<()> { // Write matched new QIDs back to file. if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) { if !is_wikidata_match && !matching_titles.is_empty() { - debug!("Writing new id {} for article {:?}", qid, page.name); + debug!("Writing new id {}", qid); // NOTE: Write to string buffer first to have a single atomic write syscall. // See `write_new_qids` for more info. let line = format!("{}\n", qid); @@ -173,7 +180,7 @@ pub fn run(args: Args) -> anyhow::Result<()> { } if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) { - error!("Error writing article {:?}: {:#}", page.name, e); + error!("Error writing article: {:#}", e); } } @@ -193,13 +200,13 @@ fn create_article_dir( None => { // Write to wikipedia title directory. // Prefer first redirect, fall back to page title if none exist - info!("Page without wikidata qid: {:?} ({})", page.name, page.url); + info!("Page without wikidata qid"); redirects .next() .or_else(|| match page.title() { Ok(title) => Some(title), Err(e) => { - warn!("Unable to parse title for page {:?}: {:#}", page.name, e); + warn!("Unable to parse title: {:#}", e); None } }) diff --git a/src/main.rs b/src/main.rs index ddd67a3..9c125a9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,9 +2,10 @@ use std::{ collections::HashSet, env, fs::File, - io::{stdin, stdout, BufReader, Read, Write}, + io::{stderr, stdin, stdout, BufReader, Read, Write}, num::NonZeroUsize, path::PathBuf, + process, str::FromStr, thread::available_parallelism, time::Instant, @@ -12,9 +13,11 @@ use std::{ use anyhow::Context; use clap::{CommandFactory, Parser, Subcommand}; -use om_wikiparser::osm; #[macro_use] -extern crate log; +extern crate tracing; +use tracing_subscriber::filter::EnvFilter; + +use om_wikiparser::osm; mod get_articles; mod get_tags; @@ -75,11 +78,12 @@ enum Cmd { fn main() -> anyhow::Result<()> { // Use info level by default, load overrides from `RUST_LOG` env variable. - // See https://docs.rs/env_logger/latest/env_logger/index.html#example - env_logger::Builder::new() - .filter_level(log::LevelFilter::Info) - .parse_default_env() - .try_init()?; + // See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env()) + .compact() + .with_writer(stderr) + .init(); let args = Args::parse(); @@ -99,6 +103,9 @@ fn main() -> anyhow::Result<()> { .exit() } + let pid = process::id(); + let span = info_span!("", pid); + let _handle = span.enter(); get_articles::run(args) } Cmd::GetTags { pbf_file, threads } => {