Add support for multiple languages
Per-language section removal is configured with a static JSON file. This includes a test to make sure the file exists and is formatted correctly.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
parent 35faadc693
commit 8435682ddf

10 changed files with 159 additions and 57 deletions
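
The mechanism this commit introduces is a compile-time-embedded JSON config that is parsed once, on first use. As a minimal standalone sketch of that pattern (distilled from the src/html.rs diff below; the error message and demo main are illustrative):

    use std::collections::{BTreeMap, BTreeSet};

    use once_cell::sync::Lazy;
    use serde::Deserialize;

    /// Section titles to strip, keyed by language code ("en", "de", ...).
    #[derive(Debug, Deserialize)]
    struct Config<'a> {
        // Borrow keys and titles from the embedded &'static str,
        // avoiding per-entry allocations.
        #[serde(borrow)]
        sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
    }

    /// Embedded at compile time, parsed lazily; a malformed file panics
    /// on first access rather than mid-run.
    static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
        serde_json::from_str(include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/article_processing_config.json"
        )))
        .expect("article_processing_config.json is invalid or has the wrong structure")
    });

    fn main() {
        // Languages without an entry simply return None.
        if let Some(sections) = CONFIG.sections_to_remove.get("en") {
            assert!(sections.contains("References"));
        }
    }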
Cargo.lock (generated, 5 changes)

@@ -524,6 +524,7 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
+ "once_cell",
  "scraper",
  "serde",
  "serde_json",
@@ -533,9 +534,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.2"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "parking_lot"
Cargo.toml

@@ -12,6 +12,7 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.18"
+once_cell = "1.18.0"
 scraper = "0.16.0"
 serde = { version = "1.0.163", features = ["derive"] }
 serde_json = "1.0.96"
README.md

@@ -1,3 +1,8 @@
 # wikiparser
 
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
+
+## Usage
+
+[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
+It defines article sections that are not important for users and should be removed.
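
Adding a language is then a one-entry change to `sections_to_remove` in that file. A hypothetical Italian entry might look like this (the titles are illustrative, not part of this commit):

    "it": [
        "Altri progetti",
        "Bibliografia",
        "Collegamenti esterni",
        "Note",
        "Voci correlate"
    ]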
article_processing_config.json (new file, 44 lines)

@@ -0,0 +1,44 @@
+{
+    "sections_to_remove": {
+        "de": [
+            "Anmerkungen",
+            "Anmerkungen und Einzelnachweise",
+            "Einzelbelege",
+            "Einzelnachweise",
+            "Filme",
+            "Literatur",
+            "Siehe auch",
+            "Weblinks"
+        ],
+        "en": [
+            "Bibliography",
+            "External links",
+            "Further reading",
+            "References",
+            "See also",
+            "Sources"
+        ],
+        "es": [
+            "Enlaces externos",
+            "Referencias",
+            "Véase también",
+            "Vínculos de interés"
+        ],
+        "fr": [
+            "Articles connexes",
+            "Bibliographie",
+            "Lien externe",
+            "Liens externes",
+            "Notes et références",
+            "Références",
+            "Voir aussi"
+        ],
+        "ru": [
+            "Библиография",
+            "Литература",
+            "Примечания",
+            "См. также",
+            "Ссылки"
+        ]
+    }
+}
|
@ -10,7 +10,7 @@ fn main() -> anyhow::Result<()> {
|
|||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let output = simplify(&input);
|
||||
let output = simplify(&input, "en");
|
||||
|
||||
stdout().write_all(output.as_bytes())?;
|
||||
|
||||
|
|
src/html.rs (88 changes)

@@ -1,49 +1,63 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
+use serde::Deserialize;
 
-pub fn simplify(html: &str) -> String {
-    // TODO: handle multiple languages
-    let bad_sections = [
-        "External links",
-        "Sources",
-        "See also",
-        "Bibliography",
-        "Further reading",
-        "References",
-    ];
+#[derive(Debug, Deserialize)]
+struct Config<'a> {
+    #[serde(borrow)]
+    sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
+}
+
+static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
+    serde_json::from_str(include_str!(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/article_processing_config.json"
+    )))
+    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+});
+
+static HEADERS: Lazy<Selector> =
+    Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
+
+pub fn simplify(html: &str, lang: &str) -> String {
     let mut document = Html::parse_document(html);
 
-    // TODO: evaluate this only once
-    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();
-
     let mut to_remove = Vec::new();
 
     // remove sections
-    for header in document.select(&headers) {
-        // TODO: should this join all text nodes?
-        let Some(title) = header.text().next() else {
-            continue
-        };
-        if bad_sections.contains(&title) {
-            to_remove.push(header.id());
-            let header_level = header.value().name();
-            // strip trailing nodes
-            for sibling in header.next_siblings() {
-                if let Some(element) = sibling.value().as_element() {
-                    if element.name() == header_level {
-                        // TODO: should this check for a higher level?
-                        break;
-                    }
-                }
-                to_remove.push(sibling.id());
-            }
-        }
-    }
-
-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
-        }
-    }
+    if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
+        for header in document.select(&HEADERS) {
+            // TODO: should this join all text nodes?
+            let Some(title) = header.text().next() else {
+                continue
+            };
+
+            if bad_sections.contains(&title.trim()) {
+                to_remove.push(header.id());
+                let header_level = header.value().name();
+                // strip trailing nodes
+                for sibling in header.next_siblings() {
+                    if let Some(element) = sibling.value().as_element() {
+                        if element.name() == header_level {
+                            // TODO: should this check for a higher level?
+                            break;
+                        }
+                    }
+                    to_remove.push(sibling.id());
+                }
+            }
+        }
+
+        for id in to_remove.drain(..) {
+            if let Some(mut node) = document.tree.get_mut(id) {
+                node.detach();
+            }
+        }
+    } else {
+        warn!("No sections to remove configured for lang {lang:?}");
+    }
 
     // remove elements with no text that isn't whitespace
@@ -66,3 +80,13 @@ pub fn simplify(html: &str) -> String {
 
     document.html()
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn static_config_parses() {
+        assert!(!CONFIG.sections_to_remove.is_empty());
+    }
+}
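
The new unit test only asserts that the embedded config parses and is non-empty. A behavioral test along the following lines sketches what the per-language lookup buys (hypothetical input and module, not part of this commit; it assumes the later whitespace-pruning pass keeps elements with real text):

    #[cfg(test)]
    mod behavior_sketch {
        use super::*;

        #[test]
        fn removes_references_section_for_en() {
            let html = "<html><body>\
                <h2>Description</h2><p>Keep me.</p>\
                <h2>References</h2><ul><li>Drop me.</li></ul>\
                </body></html>";

            let out = simplify(html, "en");
            // "References" is in the "en" list, so that header and its
            // trailing siblings are detached; earlier sections survive.
            assert!(out.contains("Keep me."));
            assert!(!out.contains("Drop me."));
        }
    }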
src/lib.rs

@@ -1,2 +1,5 @@
 pub mod html;
 pub mod wm;
+
+#[macro_use]
+extern crate log;
src/main.rs (54 changes)

@@ -9,9 +9,9 @@
 // --wikipedia-urls /tmp/wikipedia_urls.txt \
 // output_dir
 use std::{
-    fs::File,
+    fs::{create_dir, File},
     io::{stdin, BufRead, Write},
-    path::PathBuf,
+    path::{Path, PathBuf},
 };
 
 use anyhow::bail;
@@ -33,6 +33,37 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
+fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
+    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
+        // TODO: handle and still write
+        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+    };
+
+    let mut filename = dir.as_ref().to_owned();
+    filename.push(qid);
+    filename.push(&page.in_language.identifier);
+    filename.set_extension("html");
+
+    debug!("{:?}: {:?}", page.name, filename);
+
+    if filename.exists() {
+        debug!("Exists, skipping");
+        return Ok(());
+    }
+
+    let subfolder = filename.parent().unwrap();
+    if !subfolder.exists() {
+        create_dir(subfolder)?;
+    }
+
+    let html = simplify(&page.article_body.html, &page.in_language.identifier);
+
+    let mut file = File::create(&filename)?;
+    file.write_all(html.as_bytes())?;
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
@@ -79,24 +110,9 @@ fn main() -> anyhow::Result<()> {
             continue;
         }
 
-        let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-            warn!("Page in list but without wikidata qid: {:?}", page.name);
-            continue;
-        };
-
-        let filename = args.output_dir.join(qid).with_extension("html");
-
-        debug!("{:?}: {:?}", page.name, filename);
-
-        if filename.exists() {
-            debug!("Exists, skipping");
-            continue;
+        if let Err(e) = write(&args.output_dir, page) {
+            error!("Error writing article: {}", e);
         }
-
-        let html = simplify(&page.article_body.html);
-
-        let mut file = File::create(filename)?;
-        file.write_all(html.as_bytes())?;
     }
 
     Ok(())
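
The net effect of the new write helper on the output layout is one directory per Wikidata QID with one file per language, e.g. (paths illustrative):

    output_dir/Q90/en.html
    output_dir/Q90/fr.html

where the old code wrote a single output_dir/Q90.html, simplified with the hardcoded English section list regardless of the article's language.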
|
@ -58,15 +58,14 @@ pub fn is_wikipedia_match(
|
|||
titles: &HashSet<WikipediaTitleNorm>,
|
||||
page: &Page,
|
||||
) -> Option<WikipediaTitleNorm> {
|
||||
// TODO: handle multiple languages
|
||||
let title = WikipediaTitleNorm::from_title(&page.name, "en");
|
||||
let title = WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier);
|
||||
|
||||
if titles.get(&title).is_some() {
|
||||
return Some(title);
|
||||
}
|
||||
|
||||
for redirect in &page.redirects {
|
||||
let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
|
||||
let title = WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier);
|
||||
|
||||
if titles.get(&title).is_some() {
|
||||
return Some(title);
|
||||
|
|
|
@@ -1,5 +1,6 @@
 use serde::Deserialize;
 
+// TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
 /// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
@@ -9,6 +10,7 @@ pub struct Page {
     // TODO: check if CoW has a performance impact
     pub name: String,
     pub date_modified: String,
+    pub in_language: Language,
     #[serde(default)]
     pub url: String,
     pub main_entity: Option<Wikidata>,
@@ -25,6 +27,8 @@ pub struct Wikidata {
 
 #[derive(Deserialize)]
 pub struct ArticleBody {
+    // TODO: look into RawValue to lazily parse/allocate this:
+    // https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
     pub html: String,
 }
 
@@ -34,3 +38,8 @@ pub struct Redirect {
     pub url: String,
     pub name: String,
 }
+
+#[derive(Deserialize)]
+pub struct Language {
+    pub identifier: String,
+}
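
For reference, the slice of a Wikimedia Enterprise dump record that these structs pick out now includes the in_language object (values hypothetical, unrelated fields omitted):

    {
        "name": "Example",
        "date_modified": "2023-06-01T00:00:00Z",
        "in_language": { "identifier": "en" },
        "url": "https://en.wikipedia.org/wiki/Example",
        "main_entity": { "identifier": "Q123456" },
        "article_body": { "html": "<html>...</html>" },
        "redirects": []
    }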