Add option to not simplify when extracting
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
941d2b1032
commit
c9eb7a160a
1 changed files with 12 additions and 3 deletions
|
@ -8,7 +8,7 @@ use std::{
|
|||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use om_wikiparser::{
|
||||
html::simplify,
|
||||
html,
|
||||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
|
||||
};
|
||||
|
||||
|
@ -43,6 +43,10 @@ pub struct Args {
|
|||
/// Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
pub write_new_qids: Option<PathBuf>,
|
||||
|
||||
/// Don't process extracted HTML; write the original text to disk.
|
||||
#[arg(long)]
|
||||
pub no_simplify: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: Args) -> anyhow::Result<()> {
|
||||
|
@ -167,7 +171,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
}
|
||||
}
|
||||
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
|
||||
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
|
||||
error!("Error writing article {:?}: {:#}", page.name, e);
|
||||
}
|
||||
}
|
||||
|
@ -263,6 +267,7 @@ fn write(
|
|||
base: impl AsRef<Path>,
|
||||
page: &Page,
|
||||
redirects: impl IntoIterator<Item = Title>,
|
||||
simplify: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
|
||||
|
@ -277,7 +282,11 @@ fn write(
|
|||
debug!("Overwriting existing file");
|
||||
}
|
||||
|
||||
let html = simplify(&page.article_body.html, &page.in_language.identifier);
|
||||
let html = if simplify {
|
||||
html::simplify(&page.article_body.html, &page.in_language.identifier)
|
||||
} else {
|
||||
page.article_body.html.to_string()
|
||||
};
|
||||
|
||||
let mut file =
|
||||
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
|
||||
|
|
Loading…
Add table
Reference in a new issue