Add option to dump new QIDs to file

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt 2023-07-11 12:02:19 -04:00
parent 45efd77c0d
commit 7d287bd5a4
2 changed files with 90 additions and 24 deletions

--- a/README.md
+++ b/README.md

@@ -19,6 +19,35 @@ Alternatively, build it with `cargo build --release`, which places the binary in
 
+Run the program with the `--help` flag to see all supported arguments.
+
+```shell
+$ cargo run --release -- --help
+Extract article HTML from Wikipedia Enterprise HTML dumps.
+
+Expects an uncompressed dump connected to stdin.
+
+Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
+
+Arguments:
+  <OUTPUT_DIR>
+          Directory to write the extracted articles to
+
+Options:
+      --wikidata-ids <WIKIDATA_IDS>
+          File of Wikidata QIDs to extract, one per line (e.g. `Q12345`)
+
+      --wikipedia-urls <WIKIPEDIA_URLS>
+          File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
+
+      --write-new-ids <WRITE_NEW_IDS>
+          Append QIDs of articles matched by title but not QID to the provided file
+
+  -h, --help
+          Print help (see a summary with '-h')
+
+  -V, --version
+          Print version
+```
+
 It takes as inputs:
 - A Wikipedia Enterprise JSON dump, extracted and connected to `stdin`.
 - A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
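Putting those inputs together, a hypothetical invocation using the new flag might look like the sketch below (the file and directory names are placeholders, and the `tar` step assumes a gzipped tar dump; any command that streams the decompressed dump to `stdin` works):

```shell
# Stream the decompressed dump to stdin, extract articles matching either
# list, and append QIDs of articles that matched only by title to
# `new_qids.txt`. Note that `--write-new-ids` requires `--wikipedia-urls`.
$ tar xzOf enterprise-html-dump.json.tar.gz | om-wikiparser \
    --wikidata-ids wikidata_ids.txt \
    --wikipedia-urls wikipedia_urls.txt \
    --write-new-ids new_qids.txt \
    descriptions/
```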

--- a/src/main.rs
+++ b/src/main.rs

@@ -15,13 +15,23 @@ use om_wikiparser::{
     wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
 
+/// Extract article HTML from Wikipedia Enterprise HTML dumps.
+///
+/// Expects an uncompressed dump connected to stdin.
 #[derive(Parser)]
+#[command(version)]
 struct Args {
+    /// Directory to write the extracted articles to.
     output_dir: PathBuf,
+    /// File of Wikidata QIDs to extract, one per line (e.g. `Q12345`).
     #[arg(long)]
     wikidata_ids: Option<PathBuf>,
+    /// File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
     #[arg(long)]
     wikipedia_urls: Option<PathBuf>,
+    /// Append QIDs of articles matched by title but not QID to the provided file.
+    #[arg(long, requires("wikipedia_urls"))]
+    write_new_ids: Option<PathBuf>,
 }
 
 /// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
@@ -147,19 +157,29 @@ fn main() -> anyhow::Result<()> {
     let args = Args::parse();
 
-    info!("Loading urls");
-    let wikipedia_titles = args
-        .wikipedia_urls
-        .map(parse_wikipedia_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+        info!("Loading article urls from {path:?}");
+        let urls = parse_wikipedia_file(path)?;
+        debug!("Parsed {} unique article urls", urls.len());
+        urls
+    } else {
+        Default::default()
+    };
 
-    info!("Loading ids");
-    let wikidata_ids = args
-        .wikidata_ids
-        .map(parse_wikidata_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikidata_ids = if let Some(path) = args.wikidata_ids {
+        info!("Loading wikidata ids from {path:?}");
+        let ids = parse_wikidata_file(path)?;
+        debug!("Parsed {} unique wikidata ids", ids.len());
+        ids
+    } else {
+        Default::default()
+    };
+
+    let mut write_new_ids = args
+        .write_new_ids
+        .as_ref()
+        .map(|p| File::options().create(true).append(true).open(p))
+        .transpose()?;
 
     if !args.output_dir.is_dir() {
         bail!("output dir {:?} does not exist", args.output_dir)
@@ -179,26 +199,43 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        let is_wikidata_match = page
-            .wikidata()
-            .map(|qid| wikidata_ids.contains(&qid))
+        let qid = page.wikidata();
+
+        let is_wikidata_match = qid
+            .as_ref()
+            .map(|qid| wikidata_ids.contains(qid))
             .unwrap_or_default();
 
-        let matching_titles = page
-            .all_titles()
-            .filter_map(|r| {
-                r.map(Some).unwrap_or_else(|e| {
-                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
-                    None
-                })
-            })
-            .filter(|t| wikipedia_titles.contains(t))
-            .collect::<Vec<_>>();
+        let matching_titles = if wikipedia_titles.is_empty() {
+            Default::default()
+        } else {
+            page.all_titles()
+                .filter_map(|r| {
+                    r.map(Some).unwrap_or_else(|e| {
+                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                        None
+                    })
+                })
+                .filter(|t| wikipedia_titles.contains(t))
+                .collect::<Vec<_>>()
+        };
 
         if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
+        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
+            if !is_wikidata_match && !matching_titles.is_empty() {
+                debug!("Writing new id {} for article {:?}", qid, page.name);
+                writeln!(f, "{}", qid).with_context(|| {
+                    format!(
+                        "writing new id to file {:?}",
+                        args.write_new_ids.as_ref().unwrap()
+                    )
+                })?;
+            }
+        }
+
         if let Err(e) = write(&args.output_dir, &page, matching_titles) {
             error!("Error writing article {:?}: {:#}", page.name, e);
         }