Add option to dump new QIDs to file
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

parent 45efd77c0d
commit 7d287bd5a4

2 changed files with 90 additions and 24 deletions

README.md (29 lines changed)

@@ -19,6 +19,35 @@ Alternatively, build it with `cargo build --release`, which places the binary in
+Run the program with the `--help` flag to see all supported arguments.
+```shell
+$ cargo run --release -- --help
+Extract article HTML from Wikipedia Enterprise HTML dumps.
+
+Expects an uncompressed dump connected to stdin.
+
+Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
+
+Arguments:
+  <OUTPUT_DIR>
+          Directory to write the extracted articles to
+
+Options:
+      --wikidata-ids <WIKIDATA_IDS>
+          File of Wikidata QIDs to extract, one per line (e.g. `Q12345`)
+
+      --wikipedia-urls <WIKIPEDIA_URLS>
+          File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
+
+      --write-new-ids <WRITE_NEW_IDS>
+          Append QIDs of articles matched by title but not QID to the provided file
+
+  -h, --help
+          Print help (see a summary with '-h')
+
+  -V, --version
+          Print version
+```
 
 It takes as inputs:
 - A wikidata enterprise JSON dump, extracted and connected to `stdin`.
 - A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
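
For context, this is one way the new flag could be combined with the existing ones, following the help text above; the dump, list files, and output directory names here are hypothetical:

```shell
# Sketch: extract articles matched by QID or URL, and append QIDs of articles
# that were matched only by title to new_qids.txt (file names are made up).
cat dump.ndjson | cargo run --release -- \
    --wikidata-ids wikidata_ids.txt \
    --wikipedia-urls wikipedia_urls.txt \
    --write-new-ids new_qids.txt \
    descriptions/
```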

src/main.rs (85 lines changed)

@@ -15,13 +15,23 @@ use om_wikiparser::{
     wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
 
+/// Extract article HTML from Wikipedia Enterprise HTML dumps.
+///
+/// Expects an uncompressed dump connected to stdin.
 #[derive(Parser)]
+#[command(version)]
 struct Args {
+    /// Directory to write the extracted articles to.
     output_dir: PathBuf,
+    /// File of Wikidata QIDs to extract, one per line (e.g. `Q12345`).
     #[arg(long)]
     wikidata_ids: Option<PathBuf>,
+    /// File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
     #[arg(long)]
     wikipedia_urls: Option<PathBuf>,
+    /// Append QIDs of articles matched by title but not QID to the provided file.
+    #[arg(long, requires("wikipedia_urls"))]
+    write_new_ids: Option<PathBuf>,
 }
 
 /// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
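
The `requires("wikipedia_urls")` attribute ties the new flag to `--wikipedia-urls`, so clap rejects `--write-new-ids` on its own at argument-parsing time. A quick illustration, with hypothetical paths:

```shell
# Rejected by clap: --write-new-ids given without --wikipedia-urls.
cargo run --release -- --write-new-ids new_qids.txt descriptions/

# Accepted: the two flags are given together, as in the README example above.
cargo run --release -- --wikipedia-urls urls.txt --write-new-ids new_qids.txt descriptions/
```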

@@ -147,19 +157,29 @@ fn main() -> anyhow::Result<()> {
     let args = Args::parse();
 
-    info!("Loading urls");
-    let wikipedia_titles = args
-        .wikipedia_urls
-        .map(parse_wikipedia_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+        info!("Loading article urls from {path:?}");
+        let urls = parse_wikipedia_file(path)?;
+        debug!("Parsed {} unique article urls", urls.len());
+        urls
+    } else {
+        Default::default()
+    };
 
-    info!("Loading ids");
-    let wikidata_ids = args
-        .wikidata_ids
-        .map(parse_wikidata_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikidata_ids = if let Some(path) = args.wikidata_ids {
+        info!("Loading wikidata ids from {path:?}");
+        let ids = parse_wikidata_file(path)?;
+        debug!("Parsed {} unique wikidata ids", ids.len());
+        ids
+    } else {
+        Default::default()
+    };
+
+    let mut write_new_ids = args
+        .write_new_ids
+        .as_ref()
+        .map(|p| File::options().create(true).append(true).open(p))
+        .transpose()?;
 
     if !args.output_dir.is_dir() {
         bail!("output dir {:?} does not exist", args.output_dir)

@@ -179,26 +199,43 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        let is_wikidata_match = page
-            .wikidata()
-            .map(|qid| wikidata_ids.contains(&qid))
+        let qid = page.wikidata();
+
+        let is_wikidata_match = qid
+            .as_ref()
+            .map(|qid| wikidata_ids.contains(qid))
             .unwrap_or_default();
 
-        let matching_titles = page
-            .all_titles()
-            .filter_map(|r| {
-                r.map(Some).unwrap_or_else(|e| {
-                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
-                    None
-                })
-            })
-            .filter(|t| wikipedia_titles.contains(t))
-            .collect::<Vec<_>>();
+        let matching_titles = if wikipedia_titles.is_empty() {
+            Default::default()
+        } else {
+            page.all_titles()
+                .filter_map(|r| {
+                    r.map(Some).unwrap_or_else(|e| {
+                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                        None
+                    })
+                })
+                .filter(|t| wikipedia_titles.contains(t))
+                .collect::<Vec<_>>()
+        };
 
         if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
+        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
+            if !is_wikidata_match && !matching_titles.is_empty() {
+                debug!("Writing new id {} for article {:?}", qid, page.name);
+                writeln!(f, "{}", qid).with_context(|| {
+                    format!(
+                        "writing new id to file {:?}",
+                        args.write_new_ids.as_ref().unwrap()
+                    )
+                })?;
+            }
+        }
+
         if let Err(e) = write(&args.output_dir, &page, matching_titles) {
             error!("Error writing article {:?}: {:#}", page.name, e);
         }
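
Since the file passed to `--write-new-ids` is opened with `create(true).append(true)`, repeated runs keep appending to it rather than truncating it. A small follow-up sketch, with a hypothetical file name, for tidying the accumulated list before reusing it:

```shell
# Deduplicate the appended QIDs in place (sort reads the whole file before writing).
sort -u new_qids.txt -o new_qids.txt
```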