Use tracing with pid, lang, title, url, line, and byte fields
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
bfdb3c17a9
commit
580b60bdd4
4 changed files with 193 additions and 75 deletions
181
Cargo.lock
generated
181
Cargo.lock
generated
|
@ -347,19 +347,6 @@ version = "1.9.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||
dependencies = [
|
||||
"humantime",
|
||||
"is-terminal",
|
||||
"log",
|
||||
"regex",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.1"
|
||||
|
@ -504,12 +491,6 @@ dependencies = [
|
|||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.3.0"
|
||||
|
@ -559,6 +540,12 @@ version = "1.0.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.147"
|
||||
|
@ -613,6 +600,15 @@ dependencies = [
|
|||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.10"
|
||||
|
@ -673,6 +669,16 @@ version = "0.1.14"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
|
@ -700,7 +706,6 @@ dependencies = [
|
|||
"clap",
|
||||
"csv",
|
||||
"ego-tree",
|
||||
"env_logger",
|
||||
"expect-test",
|
||||
"html5ever",
|
||||
"log",
|
||||
|
@ -712,6 +717,8 @@ dependencies = [
|
|||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"urlencoding",
|
||||
]
|
||||
|
@ -736,6 +743,12 @@ dependencies = [
|
|||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
|
@ -857,6 +870,12 @@ dependencies = [
|
|||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
|
@ -1073,9 +1092,24 @@ checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
|
|||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
"regex-syntax 0.7.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
dependencies = [
|
||||
"regex-syntax 0.6.29",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.7.2"
|
||||
|
@ -1218,6 +1252,15 @@ dependencies = [
|
|||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.10"
|
||||
|
@ -1314,15 +1357,6 @@ dependencies = [
|
|||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.44"
|
||||
|
@ -1343,6 +1377,16 @@ dependencies = [
|
|||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1358,6 +1402,68 @@ version = "0.1.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.13"
|
||||
|
@ -1414,6 +1520,12 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
|
@ -1459,15 +1571,6 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
|
|
|
@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
|
|||
clap = { version = "4.3.2", features = ["derive"] }
|
||||
csv = "1.2.2"
|
||||
ego-tree = "0.6.2"
|
||||
env_logger = "0.10.0"
|
||||
expect-test = "1.4.1"
|
||||
html5ever = "0.26.0"
|
||||
log = "0.4.18"
|
||||
|
@ -25,6 +24,8 @@ scraper = "0.16.0"
|
|||
serde = { version = "1.0.163", features = ["derive"] }
|
||||
serde_json = "1.0.96"
|
||||
thiserror = "1.0.44"
|
||||
tracing = "0.1.37"
|
||||
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
|
||||
url = "2.3.1"
|
||||
urlencoding = "2.1.2"
|
||||
|
||||
|
|
|
@ -80,18 +80,8 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
if !line_errors.is_empty() {
|
||||
let error_count = line_errors.len();
|
||||
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
|
||||
let expected_threshold = 0.02;
|
||||
let percentage = 100.0 * error_count as f64 / new_items as f64;
|
||||
let level = if percentage >= expected_threshold {
|
||||
log::Level::Error
|
||||
} else {
|
||||
log::Level::Info
|
||||
};
|
||||
|
||||
log!(
|
||||
level,
|
||||
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
|
||||
);
|
||||
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -118,21 +108,38 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
}
|
||||
|
||||
info!("Processing dump");
|
||||
let dump = stdin().lock();
|
||||
let mut dump = stdin().lock();
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let stream = dump.lines().map(|r| {
|
||||
r.map_err(anyhow::Error::new)
|
||||
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
|
||||
});
|
||||
let mut buffer = String::new();
|
||||
let mut line = 0;
|
||||
let mut byte = 1;
|
||||
loop {
|
||||
line += 1;
|
||||
byte += buffer.len();
|
||||
buffer.clear();
|
||||
|
||||
for page in stream {
|
||||
let page = page?;
|
||||
if 0 == dump.read_line(&mut buffer).context("reading dump")? {
|
||||
// Reached end of file.
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: Compare different deserialization methods.
|
||||
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
|
||||
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
|
||||
let page: Page = serde_json::from_str(&buffer).context("deserializing json")?;
|
||||
|
||||
let span = info_span!(
|
||||
"page",
|
||||
lang = page.in_language.identifier,
|
||||
title = page.name,
|
||||
url = page.url,
|
||||
qid = page.main_entity.as_ref().map(|w| &w.identifier),
|
||||
line,
|
||||
byte,
|
||||
);
|
||||
let _handle = span.enter();
|
||||
|
||||
let qid = page.wikidata();
|
||||
|
||||
let is_wikidata_match = qid
|
||||
.as_ref()
|
||||
.map(|qid| wikidata_qids.contains(qid))
|
||||
|
@ -144,7 +151,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
page.all_titles()
|
||||
.filter_map(|r| {
|
||||
r.map(Some).unwrap_or_else(|e| {
|
||||
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
|
||||
warn!("Could not parse title: {:#}", e);
|
||||
None
|
||||
})
|
||||
})
|
||||
|
@ -159,7 +166,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
// Write matched new QIDs back to file.
|
||||
if let (Some(f), Some(qid)) = (&mut write_new_qids, &qid) {
|
||||
if !is_wikidata_match && !matching_titles.is_empty() {
|
||||
debug!("Writing new id {} for article {:?}", qid, page.name);
|
||||
debug!("Writing new id {}", qid);
|
||||
// NOTE: Write to string buffer first to have a single atomic write syscall.
|
||||
// See `write_new_qids` for more info.
|
||||
let line = format!("{}\n", qid);
|
||||
|
@ -173,7 +180,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
error!("Error writing article: {:#}", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -193,13 +200,13 @@ fn create_article_dir(
|
|||
None => {
|
||||
// Write to wikipedia title directory.
|
||||
// Prefer first redirect, fall back to page title if none exist
|
||||
info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
|
||||
info!("Page without wikidata qid");
|
||||
redirects
|
||||
.next()
|
||||
.or_else(|| match page.title() {
|
||||
Ok(title) => Some(title),
|
||||
Err(e) => {
|
||||
warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
|
||||
warn!("Unable to parse title: {:#}", e);
|
||||
None
|
||||
}
|
||||
})
|
||||
|
|
23
src/main.rs
23
src/main.rs
|
@ -2,9 +2,10 @@ use std::{
|
|||
collections::HashSet,
|
||||
env,
|
||||
fs::File,
|
||||
io::{stdin, stdout, BufReader, Read, Write},
|
||||
io::{stderr, stdin, stdout, BufReader, Read, Write},
|
||||
num::NonZeroUsize,
|
||||
path::PathBuf,
|
||||
process,
|
||||
str::FromStr,
|
||||
thread::available_parallelism,
|
||||
time::Instant,
|
||||
|
@ -12,9 +13,11 @@ use std::{
|
|||
|
||||
use anyhow::Context;
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
use om_wikiparser::osm;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate tracing;
|
||||
use tracing_subscriber::filter::EnvFilter;
|
||||
|
||||
use om_wikiparser::osm;
|
||||
|
||||
mod get_articles;
|
||||
mod get_tags;
|
||||
|
@ -75,11 +78,12 @@ enum Cmd {
|
|||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(EnvFilter::from_default_env())
|
||||
.compact()
|
||||
.with_writer(stderr)
|
||||
.init();
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
|
@ -99,6 +103,9 @@ fn main() -> anyhow::Result<()> {
|
|||
.exit()
|
||||
}
|
||||
|
||||
let pid = process::id();
|
||||
let span = info_span!("", pid);
|
||||
let _handle = span.enter();
|
||||
get_articles::run(args)
|
||||
}
|
||||
Cmd::GetTags { pbf_file, threads } => {
|
||||
|
|
Loading…
Add table
Reference in a new issue