Add option to dump new QIDs (#20)

This allows us to extract articles that we know the title of but not the QID of from other languages' dumps in another pass.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt 2023-07-13 14:04:52 -04:00 committed by GitHub
parent 45efd77c0d
commit 75f4f6a21b
3 changed files with 133 additions and 34 deletions


@@ -19,6 +19,38 @@ Alternatively, build it with `cargo build --release`, which places the binary in
Run the program with the `--help` flag to see all supported arguments.
```shell
$ cargo run --release -- --help
Extract article HTML from Wikipedia Enterprise HTML dumps.
Expects an uncompressed dump connected to stdin.
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
Arguments:
<OUTPUT_DIR>
Directory to write the extracted articles to
Options:
--write-new-ids <WRITE_NEW_IDS>
Append to the provided file path the QIDs of articles matched by title but not QID.
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
FILTERS:
--wikidata-ids <WIKIDATA_IDS>
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
--wikipedia-urls <WIKIPEDIA_URLS>
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
```
It takes as inputs:
- A Wikipedia Enterprise JSON dump, decompressed and connected to `stdin`.
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
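The `--write-new-ids` option enables a two-pass workflow. A minimal sketch, assuming hypothetical file names (`enwiki.json` and `dewiki.json` for decompressed dumps, `urls.txt` for the URL list, `new_qids.txt` for the recorded QIDs, and `descriptions/` as the output directory):
```shell
# Pass 1: extract articles matched by URL from the English dump, and record
# the QIDs of articles that matched by title but not by QID.
$ mkdir -p descriptions
$ cat enwiki.json | cargo run --release -- \
    --wikipedia-urls urls.txt \
    --write-new-ids new_qids.txt \
    descriptions

# Pass 2: extract those same articles from another language's dump by QID.
$ cat dewiki.json | cargo run --release -- \
    --wikidata-ids new_qids.txt \
    descriptions
```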


@@ -6,7 +6,7 @@ use std::{
};
use anyhow::{anyhow, bail, Context};
use clap::Parser;
use clap::{CommandFactory, Parser};
#[macro_use]
extern crate log;
@@ -15,13 +15,31 @@ use om_wikiparser::{
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump connected to stdin.
#[derive(Parser)]
#[command(version)]
struct Args {
/// Directory to write the extracted articles to.
output_dir: PathBuf,
#[arg(long)]
/// Path to file that contains a Wikidata QID to extract on each line
/// (e.g. `Q12345`).
#[arg(long, help_heading = "FILTERS")]
wikidata_ids: Option<PathBuf>,
#[arg(long)]
/// Path to file that contains a Wikipedia article url to extract on each line
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
#[arg(long, help_heading = "FILTERS")]
wikipedia_urls: Option<PathBuf>,
/// Append to the provided file path the QIDs of articles matched by title but not QID.
///
/// Use this to save the QIDs of articles you know the url of, but not the QID.
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
#[arg(long, requires("wikipedia_urls"))]
write_new_ids: Option<PathBuf>,
}
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
@@ -65,7 +83,6 @@ fn create_article_dir(
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
// Write symlinks to main directory.
// TODO: Only write redirects that we care about.
for title in redirects {
let wikipedia_dir = title.get_dir(base.to_owned());
@@ -147,19 +164,38 @@ fn main() -> anyhow::Result<()> {
let args = Args::parse();
info!("Loading urls");
let wikipedia_titles = args
.wikipedia_urls
.map(parse_wikipedia_file)
.transpose()?
.unwrap_or_default();
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
let mut cmd = Args::command();
cmd.error(
clap::error::ErrorKind::MissingRequiredArgument,
"one or both of --wikidata-ids and --wikipedia-urls is required",
)
.exit()
}
info!("Loading ids");
let wikidata_ids = args
.wikidata_ids
.map(parse_wikidata_file)
.transpose()?
.unwrap_or_default();
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
let urls = parse_wikipedia_file(path)?;
debug!("Parsed {} unique article urls", urls.len());
urls
} else {
Default::default()
};
let wikidata_ids = if let Some(path) = args.wikidata_ids {
info!("Loading wikidata ids from {path:?}");
let ids = parse_wikidata_file(path)?;
debug!("Parsed {} unique wikidata ids", ids.len());
ids
} else {
Default::default()
};
let mut write_new_ids = args
.write_new_ids
.as_ref()
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
@@ -179,26 +215,43 @@ fn main() -> anyhow::Result<()> {
for page in stream {
let page = page?;
let is_wikidata_match = page
.wikidata()
.map(|qid| wikidata_ids.contains(&qid))
let qid = page.wikidata();
let is_wikidata_match = qid
.as_ref()
.map(|qid| wikidata_ids.contains(qid))
.unwrap_or_default();
let matching_titles = page
.all_titles()
.filter_map(|r| {
r.map(Some).unwrap_or_else(|e| {
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
None
let matching_titles = if wikipedia_titles.is_empty() {
Default::default()
} else {
page.all_titles()
.filter_map(|r| {
r.map(Some).unwrap_or_else(|e| {
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
None
})
})
})
.filter(|t| wikipedia_titles.contains(t))
.collect::<Vec<_>>();
.filter(|t| wikipedia_titles.contains(t))
.collect::<Vec<_>>()
};
if !is_wikidata_match && matching_titles.is_empty() {
continue;
}
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
if !is_wikidata_match && !matching_titles.is_empty() {
debug!("Writing new id {} for article {:?}", qid, page.name);
writeln!(f, "{}", qid).with_context(|| {
format!(
"writing new id to file {:?}",
args.write_new_ids.as_ref().unwrap()
)
})?;
}
}
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
error!("Error writing article {:?}: {:#}", page.name, e);
}


@@ -14,16 +14,23 @@ pub use page::Page;
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
let contents = fs::read_to_string(path.as_ref())?;
contents
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("bad QID value on line {line_num}: {line:?}")
format!("on line {line_num}: {line:?}")
})
})
.collect()
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
}
})
.collect())
}
/// Read article titles from a file of urls on each line.
@@ -31,16 +38,23 @@ pub fn parse_wikipedia_file(
path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
let contents = fs::read_to_string(path.as_ref())?;
contents
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikipediaTitleNorm::from_url(line).with_context(|| {
let line_num = i + 1;
format!("bad wikipedia url on line {line_num}: {line:?}")
format!("on line {line_num}: {line:?}")
})
})
.collect()
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
}
})
.collect())
}
/// Wikidata QID/Q Number
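For reference, a rough sketch of the filter files these parsers expect, one entry per line (hypothetical values); with this change, lines that fail to parse are logged as warnings and skipped instead of aborting the run:
```shell
$ cat wikidata_ids.txt
Q12345
Q67890
$ cat wikipedia_urls.txt
https://en.wikipedia.org/wiki/Article_Title
https://de.wikipedia.org/wiki/Artikel_Titel
```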