Use tracing with pid, lang, title, url, line, and byte fields

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-10-09 13:49:14 -04:00 committed by Evan Lloyd New-Schmidt
parent bfdb3c17a9
commit 580b60bdd4
4 changed files with 193 additions and 75 deletions

181
Cargo.lock generated
View file

@ -347,19 +347,6 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "env_logger"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
dependencies = [
"humantime",
"is-terminal",
"log",
"regex",
"termcolor",
]
[[package]]
name = "errno"
version = "0.3.1"
@ -504,12 +491,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "idna"
version = "0.3.0"
@ -559,6 +540,12 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.147"
@ -613,6 +600,15 @@ dependencies = [
"tendril",
]
[[package]]
name = "matchers"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
"regex-automata",
]
[[package]]
name = "matches"
version = "0.1.10"
@ -673,6 +669,16 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num_cpus"
version = "1.16.0"
@ -700,7 +706,6 @@ dependencies = [
"clap",
"csv",
"ego-tree",
"env_logger",
"expect-test",
"html5ever",
"log",
@ -712,6 +717,8 @@ dependencies = [
"serde",
"serde_json",
"thiserror",
"tracing",
"tracing-subscriber",
"url",
"urlencoding",
]
@ -736,6 +743,12 @@ dependencies = [
"rayon",
]
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -857,6 +870,12 @@ dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@ -1073,9 +1092,24 @@ checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"regex-syntax 0.7.2",
]
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-syntax"
version = "0.6.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.7.2"
@ -1218,6 +1252,15 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"lazy_static",
]
[[package]]
name = "siphasher"
version = "0.3.10"
@ -1314,15 +1357,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "termcolor"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
[[package]]
name = "thiserror"
version = "1.0.44"
@ -1343,6 +1377,16 @@ dependencies = [
"syn 2.0.28",
]
[[package]]
name = "thread_local"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@ -1358,6 +1402,68 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tracing"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
]
[[package]]
name = "tracing-core"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
]
[[package]]
name = "unicode-bidi"
version = "0.3.13"
@ -1414,6 +1520,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "version_check"
version = "0.9.4"
@ -1459,15 +1571,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"

View file

@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
csv = "1.2.2"
ego-tree = "0.6.2"
env_logger = "0.10.0"
expect-test = "1.4.1"
html5ever = "0.26.0"
log = "0.4.18"
@ -25,6 +24,8 @@ scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
thiserror = "1.0.44"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
url = "2.3.1"
urlencoding = "2.1.2"

View file

@ -80,18 +80,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
if !line_errors.is_empty() {
let error_count = line_errors.len();
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let expected_threshold = 0.02;
let percentage = 100.0 * error_count as f64 / new_items as f64;
let level = if percentage >= expected_threshold {
log::Level::Error
} else {
log::Level::Info
};
log!(
level,
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
);
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
}
}
@ -118,21 +108,38 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
info!("Processing dump");
let dump = stdin().lock();
let mut dump = stdin().lock();
// TODO: Compare different deserialization methods.
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let stream = dump.lines().map(|r| {
r.map_err(anyhow::Error::new)
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut buffer = String::new();
let mut line = 0;
let mut byte = 1;
loop {
line += 1;
byte += buffer.len();
buffer.clear();
for page in stream {
let page = page?;
if 0 == dump.read_line(&mut buffer).context("reading dump")? {
// Reached end of file.
break;
}
// TODO: Compare different deserialization methods.
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let page: Page = serde_json::from_str(&buffer).context("deserializing json")?;
let span = info_span!(
"page",
lang = page.in_language.identifier,
title = page.name,
url = page.url,
qid = page.main_entity.as_ref().map(|w| &w.identifier),
line,
byte,
);
let _handle = span.enter();
let qid = page.wikidata();
let is_wikidata_match = qid
.as_ref()
.map(|qid| wikidata_qids.contains(qid))
@ -144,7 +151,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
page.all_titles()
.filter_map(|r| {
r.map(Some).unwrap_or_else(|e| {
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
warn!("Could not parse title: {:#}", e);
None
})
})
@ -159,7 +166,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
// Write matched new QIDs back to file.
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
if !is_wikidata_match && !matching_titles.is_empty() {
debug!("Writing new id {} for article {:?}", qid, page.name);
debug!("Writing new id {}", qid);
// NOTE: Write to string buffer first to have a single atomic write syscall.
// See `write_new_qids` for more info.
let line = format!("{}\n", qid);
@ -173,7 +180,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
error!("Error writing article {:?}: {:#}", page.name, e);
error!("Error writing article: {:#}", e);
}
}
@ -193,13 +200,13 @@ fn create_article_dir(
None => {
// Write to wikipedia title directory.
// Prefer first redirect, fall back to page title if none exist
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
info!("Page without wikidata qid");
redirects
.next()
.or_else(|| match page.title() {
Ok(title) => Some(title),
Err(e) => {
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
warn!("Unable to parse title: {:#}", e);
None
}
})

View file

@ -2,9 +2,10 @@ use std::{
collections::HashSet,
env,
fs::File,
io::{stdin, stdout, BufReader, Read, Write},
io::{stderr, stdin, stdout, BufReader, Read, Write},
num::NonZeroUsize,
path::PathBuf,
process,
str::FromStr,
thread::available_parallelism,
time::Instant,
@ -12,9 +13,11 @@ use std::{
use anyhow::Context;
use clap::{CommandFactory, Parser, Subcommand};
use om_wikiparser::osm;
#[macro_use]
extern crate log;
extern crate tracing;
use tracing_subscriber::filter::EnvFilter;
use om_wikiparser::osm;
mod get_articles;
mod get_tags;
@ -75,11 +78,12 @@ enum Cmd {
fn main() -> anyhow::Result<()> {
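// Use info level by default, load overrides from `RUST_LOG` env variable.
// NOTE(review): `EnvFilter::from_default_env()` does not default to the info level; when `RUST_LOG` is
// unset it falls back to ERROR, so an explicit default directive is needed to keep the info default.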
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()
.try_init()?;
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.compact()
.with_writer(stderr)
.init();
let args = Args::parse();
@ -99,6 +103,9 @@ fn main() -> anyhow::Result<()> {
.exit()
}
let pid = process::id();
let span = info_span!("", pid);
let _handle = span.enter();
get_articles::run(args)
}
Cmd::GetTags { pbf_file, threads } => {