Initial parsing and processing

The HTML processing should perform both of the main steps handled by the original
`descriptions_downloader.py` script:
- remove specific sections, e.g. "References"
- remove elements with no non-whitespace text

Determining how similar the output is to the original script's will require more testing.

A separate binary target is included for standalone HTML processing.
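
As an illustration, this is the intended effect on a toy fragment (a hedged sketch of the library call, not code included in this commit):

// Sketch only: assumes om-wikiparser is available as a library dependency.
use om_wikiparser::html::simplify;

fn main() {
    let raw = r#"<p>Kept text.</p>
        <div>   </div>
        <h2>References</h2>
        <ul><li>a citation</li></ul>"#;
    let cleaned = simplify(raw);
    // The "References" header and everything up to the next header of the same
    // level is stripped, and the whitespace-only <div> is dropped.
    assert!(cleaned.contains("Kept text."));
    assert!(!cleaned.contains("citation"));
    assert!(!cleaned.contains("<div>"));
}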

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt <evan@new-schmidt.com>, 2023-06-01 10:14:46 -04:00 (committed by Evan Lloyd New-Schmidt)
parent aba31775fa
commit d55d3cc7e0
8 changed files with 1479 additions and 34 deletions

Cargo.lock (generated, 1088 changed lines): diff suppressed because it is too large.

Cargo.toml
@@ -4,10 +4,20 @@ version = "0.0.0"
license = "AGPL-3.0-only"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"
default-run = "om-wikiparser"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4.18"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
url = "2.3.1"
urlencoding = "2.1.2"
[profile.release]
debug = true
overflow-checks = true

src/bin/simplify_html.rs (new file, 18 lines)

@@ -0,0 +1,18 @@
//! Apply html article simplification to stdin, and write it to stdout.
//!
//! Usage:
//! simplify_html < article.html > simplified.html
use std::io::{stdin, stdout, Read, Write};

use om_wikiparser::html::simplify;

fn main() -> anyhow::Result<()> {
    let mut input = String::new();
    stdin().read_to_string(&mut input)?;
    let output = simplify(&input);
    stdout().write_all(output.as_bytes())?;
    Ok(())
}

src/html.rs (new file, 68 lines)

@@ -0,0 +1,68 @@
use scraper::{ElementRef, Html, Selector};

pub fn simplify(html: &str) -> String {
    // TODO: handle multiple languages
    let bad_sections = [
        "External links",
        "Sources",
        "See also",
        "Bibliography",
        "Further reading",
        "References",
    ];

    let mut document = Html::parse_document(html);

    // TODO: evaluate this only once
    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();

    let mut to_remove = Vec::new();

    // remove sections
    for header in document.select(&headers) {
        // TODO: should this join all text nodes?
        let Some(title) = header.text().next() else {
            continue
        };
        if bad_sections.contains(&title) {
            to_remove.push(header.id());
            let header_level = header.value().name();
            // strip trailing nodes
            for sibling in header.next_siblings() {
                if let Some(element) = sibling.value().as_element() {
                    if element.name() == header_level {
                        // TODO: should this check for a higher level?
                        break;
                    }
                }
                to_remove.push(sibling.id());
            }
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    // remove elements with no text that isn't whitespace
    for element in document
        .root_element()
        .descendants()
        .filter_map(ElementRef::wrap)
    {
        if element.text().all(|t| t.trim().is_empty()) {
            to_remove.push(element.id());
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    document.html()
}
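
A hypothetical boundary test (not part of this commit) for the section stripping above: removal stops at the next header of the same level, so unrelated sections survive.

#[cfg(test)]
mod tests {
    use super::simplify;

    #[test]
    fn section_removal_stops_at_next_same_level_header() {
        let html = "<h2>References</h2><ul><li>a citation</li></ul><h2>Other</h2><p>kept</p>";
        let out = simplify(html);
        // The "References" header and the list after it are removed.
        assert!(!out.contains("citation"));
        // The following same-level section is untouched.
        assert!(out.contains("Other"));
        assert!(out.contains("kept"));
    }
}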

src/lib.rs (new file, 2 lines)

@@ -0,0 +1,2 @@
pub mod html;
pub mod wm;

src/main.rs

@@ -1,40 +1,65 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
// # prep outputs from map generator
// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
// # feed gzipped tarfile
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
// | cargo run --release -- \
// --wikidata-ids /tmp/wikidata_ids.txt \
// --wikipedia-urls /tmp/wikipedia_urls.txt \
// output_dir
use std::{
fs::File,
io::{stdin, BufRead, BufReader, Write},
path::PathBuf,
};
use serde::Deserialize;
use std::io::{self, stdin, BufRead, BufReader, Write};
use anyhow::bail;
use clap::Parser;
#[macro_use]
extern crate log;
#[derive(Deserialize)]
struct Page {
// TODO: check if CoW has a performance impact
name: String,
date_modified: String,
#[serde(default)]
url: String,
main_entity: Option<Wikidata>,
// TODO: see what impact parsing/unescaping/allocating this has
article_body: ArticleBody,
#[serde(default)]
redirects: Vec<Redirect>,
}
use om_wikiparser::{
html::simplify,
wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
};
#[derive(Deserialize)]
struct Wikidata {
identifier: String,
}
#[derive(Deserialize)]
struct ArticleBody {
html: String,
}
#[derive(Deserialize)]
struct Redirect {
url: String,
name: String,
#[derive(Parser)]
struct Args {
output_dir: PathBuf,
#[arg(long)]
wikidata_ids: Option<PathBuf>,
#[arg(long)]
wikipedia_urls: Option<PathBuf>,
}
fn main() -> anyhow::Result<()> {
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()
.try_init()?;
let args = Args::parse();
info!("Loading urls");
let wikipedia_titles = args
.wikipedia_urls
.map(parse_wikipedia_file)
.transpose()?
.unwrap_or_default();
info!("Loading ids");
let wikidata_ids = args
.wikidata_ids
.map(parse_wikidata_file)
.transpose()?
.unwrap_or_default();
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
}
info!("Processing dump");
let dump = BufReader::new(stdin());
// TODO: compare different deserialization methods
@@ -45,10 +70,33 @@ fn main() -> anyhow::Result<()> {
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut stdout = io::stdout();
for page in stream {
let page = page?;
writeln!(stdout, "{}", page.name)?;
if !(is_wikidata_match(&wikidata_ids, &page).is_some()
|| is_wikipedia_match(&wikipedia_titles, &page).is_some())
{
continue;
}
let Some(qid) = page.main_entity.map(|e| e.identifier) else {
warn!("Page in list but without wikidata qid: {:?}", page.name);
continue;
};
let filename = args.output_dir.join(qid).with_extension("html");
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Exists, skipping");
continue;
}
let html = simplify(&page.article_body.html);
let mut file = File::create(filename)?;
file.write_all(html.as_bytes())?;
}
Ok(())

src/wm/mod.rs (new file, 177 lines)

@@ -0,0 +1,177 @@
//! Wikimedia types
use std::{
    collections::HashSet,
    ffi::OsStr,
    fs::{self},
    num::ParseIntError,
    str::FromStr,
};

use anyhow::{anyhow, bail, Context};
use url::Url;

mod page;
pub use page::Page;

/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikidataQid::from_str(line).with_context(|| {
                let line_num = i + 1;
                format!("bad QID value on line {line_num}: {line:?}")
            })
        })
        .collect()
}

/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
    path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikipediaTitleNorm::from_url(line).with_context(|| {
                let line_num = i + 1;
                format!("bad wikipedia url on line {line_num}: {line:?}")
            })
        })
        .collect()
}

pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
    let Some(wikidata) = &page.main_entity else { return None; };
    let wikidata_id = &wikidata.identifier;
    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
        Ok(qid) => qid,
        Err(e) => {
            eprintln!("Could not parse QID: {:?}: {}", wikidata_id, e);
            return None;
        }
    };
    ids.get(&wikidata_id).map(|_| wikidata_id)
}

pub fn is_wikipedia_match(
    titles: &HashSet<WikipediaTitleNorm>,
    page: &Page,
) -> Option<WikipediaTitleNorm> {
    // TODO: handle multiple languages
    let title = WikipediaTitleNorm::from_title(&page.name, "en");

    if titles.get(&title).is_some() {
        return Some(title);
    }

    for redirect in &page.redirects {
        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
        if titles.get(&title).is_some() {
            return Some(title);
        }
    }

    None
}

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str("12345").unwrap();
/// assert_eq!(with_q, without_q);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);

impl FromStr for WikidataQid {
    type Err = ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.strip_prefix('Q').unwrap_or(s);
        u32::from_str(s).map(WikidataQid)
    }
}

/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title/").unwrap();
/// let title = WikipediaTitleNorm::from_title("Article Title", "en");
/// assert_eq!(url, title);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
    lang: String,
    name: String,
}

impl WikipediaTitleNorm {
    fn normalize_title(title: &str) -> String {
        // TODO: compare with generator url creation
        title.replace(' ', "_")
    }

    // https://en.wikipedia.org/wiki/Article_Title
    pub fn from_url(url: &str) -> anyhow::Result<Self> {
        let url = Url::parse(url)?;

        let (subdomain, host) = url
            .host_str()
            .ok_or(anyhow!("Expected host"))?
            .split_once('.')
            .ok_or(anyhow!("Expected subdomain"))?;
        if host != "wikipedia.org" {
            bail!("Expected wikipedia.org for domain")
        }
        let lang = subdomain;

        let mut paths = url.path_segments().ok_or(anyhow!("Expected path"))?;

        let root = paths
            .next()
            .ok_or(anyhow!("Expected first segment in path"))?;
        if root != "wiki" {
            bail!("Expected 'wiki' in path")
        }

        let title = paths
            .next()
            .ok_or(anyhow!("Expected second segment in path"))?;
        let title = urlencoding::decode(title)?;

        Ok(Self::from_title(&title, lang))
    }

    // en:Article Title
    fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
        let (lang, title) = tag.split_once(':').ok_or(anyhow!("Expected ':'"))?;
        Ok(Self::from_title(title, lang))
    }

    pub fn from_title(title: &str, lang: &str) -> Self {
        let name = Self::normalize_title(title);
        let lang = lang.to_owned();
        Self { name, lang }
    }
}
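
A hypothetical caller-side sketch (not in this commit) of how the URL set and the matcher fit together; the helper names are invented, and the example URL reuses the one from the doc comment above.

use std::collections::HashSet;

use om_wikiparser::wm::{is_wikipedia_match, Page, WikipediaTitleNorm};

/// Build the set of wanted titles from a handful of article urls.
fn build_titles() -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    ["https://en.wikipedia.org/wiki/Spatial_database"]
        .iter()
        .map(|url| WikipediaTitleNorm::from_url(url))
        .collect()
}

/// True if the page's title or one of its redirects is in the wanted set.
fn wanted(titles: &HashSet<WikipediaTitleNorm>, page: &Page) -> bool {
    is_wikipedia_match(titles, page).is_some()
}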

src/wm/page.rs (new file, 36 lines)

@@ -0,0 +1,36 @@
use serde::Deserialize;

/// Deserialized Wikimedia Enterprise API Article
///
/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Page {
    // TODO: check if CoW has a performance impact
    pub name: String,
    pub date_modified: String,
    #[serde(default)]
    pub url: String,
    pub main_entity: Option<Wikidata>,
    // TODO: see what impact parsing/unescaping/allocating this has
    pub article_body: ArticleBody,
    #[serde(default)]
    pub redirects: Vec<Redirect>,
}

#[derive(Deserialize)]
pub struct Wikidata {
    pub identifier: String,
}

#[derive(Deserialize)]
pub struct ArticleBody {
    pub html: String,
}

#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Redirect {
    pub url: String,
    pub name: String,
}
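
A hypothetical round-trip sketch (field values invented, not part of this commit) showing the shape of one dump line that this struct accepts:

use om_wikiparser::wm::Page;

fn main() -> anyhow::Result<()> {
    // One JSON object per line in the Enterprise dump; values here are made up.
    let line = r#"{
        "name": "Spatial database",
        "date_modified": "2023-04-01T00:00:00Z",
        "url": "https://en.wikipedia.org/wiki/Spatial_database",
        "main_entity": { "identifier": "Q1234" },
        "article_body": { "html": "<html>...</html>" },
        "redirects": []
    }"#;
    let page: Page = serde_json::from_str(line)?;
    assert_eq!(page.main_entity.map(|e| e.identifier).as_deref(), Some("Q1234"));
    Ok(())
}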