Add support for multiple languages

Per-language section removal is configured with a static JSON file.

A test ensures that the config file exists and is formatted correctly.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt, 2023-06-07 15:55:18 -04:00 (committed by Evan Lloyd New-Schmidt)
parent 35faadc693
commit 8435682ddf
10 changed files with 159 additions and 57 deletions
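In short, `simplify` now takes a language code and consults the bundled config for that language's list of removable section titles. A minimal sketch of the new call (the crate/module path `om_wikiparser::html` is an assumption based on the library layout in this diff; the HTML is invented for illustration):

```rust
use om_wikiparser::html::simplify;

fn main() {
    // "Weblinks" appears in the "de" list in article_processing_config.json,
    // so that heading and everything after it (up to the next heading of the
    // same level) is stripped; "Geschichte" is not listed and is kept.
    let html = "<h2>Geschichte</h2><p>kept</p><h2>Weblinks</h2><p>removed</p>";
    let simplified = simplify(html, "de");
    assert!(!simplified.contains("Weblinks"));
}
```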

Cargo.lock (generated)

@@ -524,6 +524,7 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
+ "once_cell",
  "scraper",
  "serde",
  "serde_json",
@@ -533,9 +534,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.2"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "parking_lot"

Cargo.toml

@@ -12,6 +12,7 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.18"
+once_cell = "1.18.0"
 scraper = "0.16.0"
 serde = { version = "1.0.163", features = ["derive"] }
 serde_json = "1.0.96"

README.md

@@ -1,3 +1,8 @@
 # wikiparser
 
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
+
+## Usage
+
+[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
+It defines article sections that are not important for users and should be removed.
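A hypothetical example of what adding a new language looks like (the `it` entry and its section titles are illustrative only, following the structure of the file added below):

```json
{
    "sections_to_remove": {
        "it": [
            "Bibliografia",
            "Collegamenti esterni",
            "Note",
            "Voci correlate"
        ]
    }
}
```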

article_processing_config.json (new)

@@ -0,0 +1,44 @@
+{
+    "sections_to_remove": {
+        "de": [
+            "Anmerkungen",
+            "Anmerkungen und Einzelnachweise",
+            "Einzelbelege",
+            "Einzelnachweise",
+            "Filme",
+            "Literatur",
+            "Siehe auch",
+            "Weblinks"
+        ],
+        "en": [
+            "Bibliography",
+            "External links",
+            "Further reading",
+            "References",
+            "See also",
+            "Sources"
+        ],
+        "es": [
+            "Enlaces externos",
+            "Referencias",
+            "Véase también",
+            "Vínculos de interés"
+        ],
+        "fr": [
+            "Articles connexes",
+            "Bibliographie",
+            "Lien externe",
+            "Liens externes",
+            "Notes et références",
+            "Références",
+            "Voir aussi"
+        ],
+        "ru": [
+            "Библиография",
+            "Литература",
+            "Примечания",
+            "См. также",
+            "Ссылки"
+        ]
+    }
+}

src/bin/simplify_html.rs

@@ -10,7 +10,7 @@ fn main() -> anyhow::Result<()> {
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
 
-    let output = simplify(&input);
+    let output = simplify(&input, "en");
 
     stdout().write_all(output.as_bytes())?;

src/html.rs

@@ -1,49 +1,63 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
+use serde::Deserialize;
+
+#[derive(Debug, Deserialize)]
+struct Config<'a> {
+    #[serde(borrow)]
+    sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
+}
+
+static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
+    serde_json::from_str(include_str!(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/article_processing_config.json"
+    )))
+    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+});
+
+static HEADERS: Lazy<Selector> =
+    Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
 
-pub fn simplify(html: &str) -> String {
-    // TODO: handle multiple languages
-    let bad_sections = [
-        "External links",
-        "Sources",
-        "See also",
-        "Bibliography",
-        "Further reading",
-        "References",
-    ];
+pub fn simplify(html: &str, lang: &str) -> String {
     let mut document = Html::parse_document(html);
-    // TODO: evaluate this only once
-    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();
 
     let mut to_remove = Vec::new();
 
     // remove sections
-    for header in document.select(&headers) {
-        // TODO: should this join all text nodes?
-        let Some(title) = header.text().next() else {
-            continue
-        };
-        if bad_sections.contains(&title) {
-            to_remove.push(header.id());
-            let header_level = header.value().name();
-            // strip trailing nodes
-            for sibling in header.next_siblings() {
-                if let Some(element) = sibling.value().as_element() {
-                    if element.name() == header_level {
-                        // TODO: should this check for a higher level?
-                        break;
+    if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
+        for header in document.select(&HEADERS) {
+            // TODO: should this join all text nodes?
+            let Some(title) = header.text().next() else {
+                continue
+            };
+            if bad_sections.contains(&title.trim()) {
+                to_remove.push(header.id());
+                let header_level = header.value().name();
+                // strip trailing nodes
+                for sibling in header.next_siblings() {
+                    if let Some(element) = sibling.value().as_element() {
+                        if element.name() == header_level {
+                            // TODO: should this check for a higher level?
+                            break;
+                        }
                     }
+                    to_remove.push(sibling.id());
                 }
-                to_remove.push(sibling.id());
             }
         }
-    }
-
-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
+
+        for id in to_remove.drain(..) {
+            if let Some(mut node) = document.tree.get_mut(id) {
+                node.detach();
+            }
         }
+    } else {
+        warn!("No sections to remove configured for lang {lang:?}");
     }
 
     // remove elements with no text that isn't whitespace
@@ -66,3 +80,13 @@ pub fn simplify(html: &str) -> String {
 
     document.html()
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn static_config_parses() {
+        assert!(!CONFIG.sections_to_remove.is_empty());
+    }
+}
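The removal logic drops a matched header plus every following sibling until the next header of the same level, so lower-level subsections go with their parent section. A test-style sketch of that behavior that could sit alongside `static_config_parses` (the test name and HTML are invented):

```rust
#[test]
fn removes_section_up_to_next_same_level_header() {
    // "References" is in the "en" list; the <h3> after it is lower-level,
    // so removal continues until the next <h2>.
    let html = "<h2>History</h2><p>kept</p>\
                <h2>References</h2><p>dropped</p><h3>Notes</h3><p>also dropped</p>\
                <h2>Geography</h2><p>kept</p>";
    let out = simplify(html, "en");
    assert!(out.contains("Geography"));
    assert!(!out.contains("dropped"));
}
```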

src/lib.rs

@@ -1,2 +1,5 @@
 pub mod html;
 pub mod wm;
+
+#[macro_use]
+extern crate log;
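With `#[macro_use]`, the `log` macros (`debug!`, `warn!`, `error!`) used by the new code become available crate-wide without per-module imports. The equivalent per-module style, as a side note rather than what this commit does:

```rust
// Hypothetical helper, for illustration only.
use log::warn;

fn lookup_failed(lang: &str) {
    warn!("No sections to remove configured for lang {lang:?}");
}
```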

src/main.rs

@@ -9,9 +9,9 @@
 // --wikipedia-urls /tmp/wikipedia_urls.txt \
 // output_dir
 use std::{
-    fs::File,
+    fs::{create_dir, File},
     io::{stdin, BufRead, Write},
-    path::PathBuf,
+    path::{Path, PathBuf},
 };
 
 use anyhow::bail;
@@ -33,6 +33,37 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
+fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
+    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
+        // TODO: handle and still write
+        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+    };
+
+    let mut filename = dir.as_ref().to_owned();
+    filename.push(qid);
+    filename.push(&page.in_language.identifier);
+    filename.set_extension("html");
+
+    debug!("{:?}: {:?}", page.name, filename);
+
+    if filename.exists() {
+        debug!("Exists, skipping");
+        return Ok(());
+    }
+
+    let subfolder = filename.parent().unwrap();
+    if !subfolder.exists() {
+        create_dir(subfolder)?;
+    }
+
+    let html = simplify(&page.article_body.html, &page.in_language.identifier);
+
+    let mut file = File::create(&filename)?;
+    file.write_all(html.as_bytes())?;
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
@@ -79,24 +110,9 @@ fn main() -> anyhow::Result<()> {
             continue;
         }
 
-        let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-            warn!("Page in list but without wikidata qid: {:?}", page.name);
-            continue;
-        };
-
-        let filename = args.output_dir.join(qid).with_extension("html");
-        debug!("{:?}: {:?}", page.name, filename);
-
-        if filename.exists() {
-            debug!("Exists, skipping");
-            continue;
+        if let Err(e) = write(&args.output_dir, page) {
+            error!("Error writing article: {}", e);
         }
-
-        let html = simplify(&page.article_body.html);
-
-        let mut file = File::create(filename)?;
-        file.write_all(html.as_bytes())?;
     }
 
     Ok(())
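The new `write` helper nests output per language, producing `<output_dir>/<QID>/<lang>.html` where the old code wrote a flat `<output_dir>/<QID>.html`. A small sketch of the path construction (the QID and language are example values):

```rust
use std::path::PathBuf;

fn main() {
    // Mirrors the push/push/set_extension sequence in `write`.
    let mut filename = PathBuf::from("output_dir");
    filename.push("Q42"); // example Wikidata QID
    filename.push("en"); // example language identifier
    filename.set_extension("html");
    assert_eq!(filename, PathBuf::from("output_dir/Q42/en.html"));
}
```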

src/wm/mod.rs

@@ -58,15 +58,14 @@ pub fn is_wikipedia_match(
     titles: &HashSet<WikipediaTitleNorm>,
     page: &Page,
 ) -> Option<WikipediaTitleNorm> {
-    // TODO: handle multiple languages
-    let title = WikipediaTitleNorm::from_title(&page.name, "en");
+    let title = WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier);
     if titles.get(&title).is_some() {
         return Some(title);
     }
 
     for redirect in &page.redirects {
-        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
+        let title = WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier);
         if titles.get(&title).is_some() {
             return Some(title);
src/wm/page.rs

@@ -1,5 +1,6 @@
 use serde::Deserialize;
 
+// TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
 /// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
@@ -9,6 +10,7 @@ pub struct Page {
     // TODO: check if CoW has a performance impact
     pub name: String,
     pub date_modified: String,
+    pub in_language: Language,
     #[serde(default)]
     pub url: String,
     pub main_entity: Option<Wikidata>,
@@ -25,6 +27,8 @@ pub struct Wikidata {
 
 #[derive(Deserialize)]
 pub struct ArticleBody {
+    // TODO: look into RawValue to lazily parse/allocate this:
+    // https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
     pub html: String,
 }
@@ -34,3 +38,8 @@ pub struct Redirect {
     pub url: String,
     pub name: String,
 }
+
+#[derive(Deserialize)]
+pub struct Language {
+    pub identifier: String,
+}
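`Language` maps the dump's `in_language` object onto a plain identifier string. A minimal sketch of deserializing it in isolation (the test name and value are invented):

```rust
#[test]
fn language_identifier_parses() {
    // The dump nests the code as {"in_language": {"identifier": "en"}}.
    let lang: Language = serde_json::from_str(r#"{ "identifier": "en" }"#).unwrap();
    assert_eq!(lang.identifier, "en");
}
```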