Add option to dump new QIDs (#20)
This allows us to extract articles that we know the title of, but not the QID of, from other languages' dumps in another pass.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
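A rough sketch of the two-pass workflow this enables; the file names (`urls.txt`, `new_qids.txt`, the dump files) and the `output_dir/` path are hypothetical, and the dump must already be uncompressed on stdin as described in the help text below:

```shell
# First pass: filter by article URL and append the QIDs of articles
# that matched by title but not by QID to new_qids.txt.
om-wikiparser \
    --wikipedia-urls urls.txt \
    --write-new-ids new_qids.txt \
    output_dir/ < enwiki-dump.json

# Second pass: feed the collected QIDs back in to extract the same
# articles from another language's dump.
om-wikiparser \
    --wikidata-ids new_qids.txt \
    output_dir/ < dewiki-dump.json
```

Because Wikidata QIDs are shared across languages, the ids recorded in the first pass select the corresponding articles in any other language's dump.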
This commit is contained in: parent 45efd77c0d, commit 75f4f6a21b
3 changed files with 133 additions and 34 deletions
32 README.md
@@ -19,6 +19,38 @@ Alternatively, build it with `cargo build --release`, which places the binary in

Run the program with the `--help` flag to see all supported arguments.

```shell
$ cargo run --release -- --help
Extract article HTML from Wikipedia Enterprise HTML dumps.

Expects an uncompressed dump connected to stdin.

Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>

Arguments:
  <OUTPUT_DIR>
          Directory to write the extracted articles to

Options:
      --write-new-ids <WRITE_NEW_IDS>
          Append to the provided file path the QIDs of articles matched by title but not QID.

          Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.

  -h, --help
          Print help (see a summary with '-h')

  -V, --version
          Print version

FILTERS:
      --wikidata-ids <WIKIDATA_IDS>
          Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)

      --wikipedia-urls <WIKIPEDIA_URLS>
          Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
```

It takes as inputs:
- A wikidata enterprise JSON dump, extracted and connected to `stdin`.
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
109 src/main.rs
@@ -6,7 +6,7 @@ use std::{
};

use anyhow::{anyhow, bail, Context};
use clap::Parser;
use clap::{CommandFactory, Parser};
#[macro_use]
extern crate log;
@@ -15,13 +15,31 @@ use om_wikiparser::{
    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};

/// Extract article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump connected to stdin.
#[derive(Parser)]
#[command(version)]
struct Args {
    /// Directory to write the extracted articles to.
    output_dir: PathBuf,
    #[arg(long)]

    /// Path to file that contains a Wikidata QID to extract on each line
    /// (e.g. `Q12345`).
    #[arg(long, help_heading = "FILTERS")]
    wikidata_ids: Option<PathBuf>,
    #[arg(long)]

    /// Path to file that contains a Wikipedia article url to extract on each line
    /// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
    #[arg(long, help_heading = "FILTERS")]
    wikipedia_urls: Option<PathBuf>,

    /// Append to the provided file path the QIDs of articles matched by title but not QID.
    ///
    /// Use this to save the QIDs of articles you know the url of, but not the QID.
    /// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
    #[arg(long, requires("wikipedia_urls"))]
    write_new_ids: Option<PathBuf>,
}

/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
@@ -65,7 +83,6 @@ fn create_article_dir(
        .with_context(|| format!("creating main directory {:?}", &main_dir))?;

    // Write symlinks to main directory.
    // TODO: Only write redirects that we care about.
    for title in redirects {
        let wikipedia_dir = title.get_dir(base.to_owned());
@@ -147,19 +164,38 @@ fn main() -> anyhow::Result<()> {

    let args = Args::parse();

    info!("Loading urls");
    let wikipedia_titles = args
        .wikipedia_urls
        .map(parse_wikipedia_file)
        .transpose()?
        .unwrap_or_default();
    if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
        let mut cmd = Args::command();
        cmd.error(
            clap::error::ErrorKind::MissingRequiredArgument,
            "one or both of --wikidata-ids and --wikipedia-urls is required",
        )
        .exit()
    }

    info!("Loading ids");
    let wikidata_ids = args
        .wikidata_ids
        .map(parse_wikidata_file)
        .transpose()?
        .unwrap_or_default();
    let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
        info!("Loading article urls from {path:?}");
        let urls = parse_wikipedia_file(path)?;
        debug!("Parsed {} unique article urls", urls.len());
        urls
    } else {
        Default::default()
    };

    let wikidata_ids = if let Some(path) = args.wikidata_ids {
        info!("Loading wikidata ids from {path:?}");
        let ids = parse_wikidata_file(path)?;
        debug!("Parsed {} unique wikidata ids", ids.len());
        ids
    } else {
        Default::default()
    };

    let mut write_new_ids = args
        .write_new_ids
        .as_ref()
        .map(|p| File::options().create(true).append(true).open(p))
        .transpose()?;

    if !args.output_dir.is_dir() {
        bail!("output dir {:?} does not exist", args.output_dir)
@@ -179,26 +215,43 @@ fn main() -> anyhow::Result<()> {
    for page in stream {
        let page = page?;

        let is_wikidata_match = page
            .wikidata()
            .map(|qid| wikidata_ids.contains(&qid))
        let qid = page.wikidata();

        let is_wikidata_match = qid
            .as_ref()
            .map(|qid| wikidata_ids.contains(qid))
            .unwrap_or_default();

        let matching_titles = page
            .all_titles()
            .filter_map(|r| {
                r.map(Some).unwrap_or_else(|e| {
                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
                    None
        let matching_titles = if wikipedia_titles.is_empty() {
            Default::default()
        } else {
            page.all_titles()
                .filter_map(|r| {
                    r.map(Some).unwrap_or_else(|e| {
                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
                        None
                    })
                })
                })
            .filter(|t| wikipedia_titles.contains(t))
            .collect::<Vec<_>>();
                .filter(|t| wikipedia_titles.contains(t))
                .collect::<Vec<_>>()
        };

        if !is_wikidata_match && matching_titles.is_empty() {
            continue;
        }

        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
            if !is_wikidata_match && !matching_titles.is_empty() {
                debug!("Writing new id {} for article {:?}", qid, page.name);
                writeln!(f, "{}", qid).with_context(|| {
                    format!(
                        "writing new id to file {:?}",
                        args.write_new_ids.as_ref().unwrap()
                    )
                })?;
            }
        }

        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
            error!("Error writing article {:?}: {:#}", page.name, e);
        }
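A brief illustration of the argument handling introduced in the hunks above, namely the `requires("wikipedia_urls")` attribute and the missing-filter check; file names are hypothetical and the exact error output comes from clap:

```shell
# Rejected: --write-new-ids requires --wikipedia-urls.
om-wikiparser --write-new-ids new_qids.txt output_dir/ < dump.json

# Rejected: at least one of --wikidata-ids / --wikipedia-urls must be given.
om-wikiparser output_dir/ < dump.json

# Accepted: a title filter plus the file to append new QIDs to.
om-wikiparser --wikipedia-urls urls.txt --write-new-ids new_qids.txt output_dir/ < dump.json
```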
@@ -14,16 +14,23 @@ pub use page::Page;
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
    Ok(contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikidataQid::from_str(line).with_context(|| {
                let line_num = i + 1;
                format!("bad QID value on line {line_num}: {line:?}")
                format!("on line {line_num}: {line:?}")
            })
        })
        .collect()
        .filter_map(|r| match r {
            Ok(qid) => Some(qid),
            Err(e) => {
                warn!("Could not parse QID: {:#}", e);
                None
            }
        })
        .collect())
}

/// Read article titles from a file of urls on each line.
@@ -31,16 +38,23 @@ pub fn parse_wikipedia_file(
    path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
    Ok(contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikipediaTitleNorm::from_url(line).with_context(|| {
                let line_num = i + 1;
                format!("bad wikipedia url on line {line_num}: {line:?}")
                format!("on line {line_num}: {line:?}")
            })
        })
        .collect()
        .filter_map(|r| match r {
            Ok(qid) => Some(qid),
            Err(e) => {
                warn!("Could not parse wikipedia title: {:#}", e);
                None
            }
        })
        .collect())
}

/// Wikidata QID/Q Number
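A practical consequence of the parsing changes above is that a malformed line is now logged and skipped instead of aborting the run; a sketch with hypothetical paths:

```shell
# new_qids.txt mixes valid QIDs with a stray line.
printf 'Q12345\nnot-a-qid\nQ67890\n' > new_qids.txt

# The bad line is reported via the warn! call and skipped;
# Q12345 and Q67890 are still used as filters.
om-wikiparser --wikidata-ids new_qids.txt output_dir/ < dump.json
```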