Initial parsing and processing
The html processing should perform both of the main steps handled by the original `descriptions_downloader.py` script:
- remove specific sections, e.g. "References"
- remove elements with no non-whitespace text

Determining how similar the output is to the original script's will require more testing.

A separate binary target is included for standalone html processing.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
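As a rough illustration of those two steps, here is a minimal sketch (not part of this commit) that feeds a hypothetical HTML snippet through `om_wikiparser::html::simplify`, the same entry point used by the standalone binary. The snippet and the assertions are illustrative only; the exact serialized output depends on how `scraper` normalizes the document.

use om_wikiparser::html::simplify;

fn main() {
    // Hypothetical input: a kept section, a whitespace-only element,
    // and a "References" section that should be stripped.
    let input = r#"<article>
  <h2>History</h2><p>Some text.</p>
  <p>   </p>
  <h2>References</h2><ul><li>A citation</li></ul>
</article>"#;

    let output = simplify(input);

    // The "References" header and its trailing sibling nodes are removed,
    // as is the whitespace-only <p> element.
    assert!(!output.contains("References"));
    assert!(!output.contains("A citation"));
    assert!(output.contains("Some text."));
    println!("{output}");
}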
parent aba31775fa, commit d55d3cc7e0
8 changed files with 1479 additions and 34 deletions
Cargo.lock (generated) | 1088 changed lines
File diff suppressed because it is too large.
Cargo.toml | 12 changed lines
@@ -4,10 +4,20 @@ version = "0.0.0"
license = "AGPL-3.0-only"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"

default-run = "om-wikiparser"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4.18"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
url = "2.3.1"
urlencoding = "2.1.2"

[profile.release]
debug = true
overflow-checks = true
src/bin/simplify_html.rs (new file) | 18 lines
@@ -0,0 +1,18 @@
//! Apply html article simplification to stdin, and write it to stdout.
//!
//! Usage:
//!     simplify_html < article.html > simplified.html
use std::io::{stdin, stdout, Read, Write};

use om_wikiparser::html::simplify;

fn main() -> anyhow::Result<()> {
    let mut input = String::new();
    stdin().read_to_string(&mut input)?;

    let output = simplify(&input);

    stdout().write_all(output.as_bytes())?;

    Ok(())
}
src/html.rs (new file) | 68 lines
@@ -0,0 +1,68 @@
use scraper::{ElementRef, Html, Selector};

pub fn simplify(html: &str) -> String {
    // TODO: handle multiple languages
    let bad_sections = [
        "External links",
        "Sources",
        "See also",
        "Bibliography",
        "Further reading",
        "References",
    ];

    let mut document = Html::parse_document(html);

    // TODO: evaluate this only once
    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();

    let mut to_remove = Vec::new();

    // remove sections
    for header in document.select(&headers) {
        // TODO: should this join all text nodes?
        let Some(title) = header.text().next() else {
            continue;
        };
        if bad_sections.contains(&title) {
            to_remove.push(header.id());
            let header_level = header.value().name();
            // strip trailing nodes
            for sibling in header.next_siblings() {
                if let Some(element) = sibling.value().as_element() {
                    if element.name() == header_level {
                        // TODO: should this check for a higher level?
                        break;
                    }
                }
                to_remove.push(sibling.id());
            }
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    // remove elements with no text that isn't whitespace
    for element in document
        .root_element()
        .descendants()
        .filter_map(ElementRef::wrap)
    {
        if element.text().all(|t| t.trim().is_empty()) {
            to_remove.push(element.id());
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    document.html()
}
src/lib.rs (new file) | 2 lines
@@ -0,0 +1,2 @@
pub mod html;
pub mod wm;
src/main.rs | 112 changed lines
@@ -1,40 +1,65 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
// # prep outputs from map generator
// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
// # feed gzipped tarfile
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
//   | cargo run --release -- \
//     --wikidata-ids /tmp/wikidata_ids.txt \
//     --wikipedia-urls /tmp/wikipedia_urls.txt \
//     output_dir
use std::{
    fs::File,
    io::{stdin, BufRead, BufReader, Write},
    path::PathBuf,
};

use serde::Deserialize;
use std::io::{self, stdin, BufRead, BufReader, Write};
use anyhow::bail;
use clap::Parser;
#[macro_use]
extern crate log;

#[derive(Deserialize)]
struct Page {
    // TODO: check if CoW has a performance impact
    name: String,
    date_modified: String,
    #[serde(default)]
    url: String,
    main_entity: Option<Wikidata>,
    // TODO: see what impact parsing/unescaping/allocating this has
    article_body: ArticleBody,
    #[serde(default)]
    redirects: Vec<Redirect>,
}
use om_wikiparser::{
    html::simplify,
    wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
};

#[derive(Deserialize)]
struct Wikidata {
    identifier: String,
}

#[derive(Deserialize)]
struct ArticleBody {
    html: String,
}

#[derive(Deserialize)]
struct Redirect {
    url: String,
    name: String,
#[derive(Parser)]
struct Args {
    output_dir: PathBuf,
    #[arg(long)]
    wikidata_ids: Option<PathBuf>,
    #[arg(long)]
    wikipedia_urls: Option<PathBuf>,
}

fn main() -> anyhow::Result<()> {
    env_logger::Builder::new()
        .filter_level(log::LevelFilter::Info)
        .parse_default_env()
        .try_init()?;

    let args = Args::parse();

    info!("Loading urls");
    let wikipedia_titles = args
        .wikipedia_urls
        .map(parse_wikipedia_file)
        .transpose()?
        .unwrap_or_default();

    info!("Loading ids");
    let wikidata_ids = args
        .wikidata_ids
        .map(parse_wikidata_file)
        .transpose()?
        .unwrap_or_default();

    if !args.output_dir.is_dir() {
        bail!("output dir {:?} does not exist", args.output_dir)
    }

    info!("Processing dump");
    let dump = BufReader::new(stdin());

    // TODO: compare different deserialization methods

@@ -45,10 +70,33 @@ fn main() -> anyhow::Result<()> {
        .and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
    });

    let mut stdout = io::stdout();
    for page in stream {
        let page = page?;
        writeln!(stdout, "{}", page.name)?;

        if !(is_wikidata_match(&wikidata_ids, &page).is_some()
            || is_wikipedia_match(&wikipedia_titles, &page).is_some())
        {
            continue;
        }

        let Some(qid) = page.main_entity.map(|e| e.identifier) else {
            warn!("Page in list but without wikidata qid: {:?}", page.name);
            continue;
        };

        let filename = args.output_dir.join(qid).with_extension("html");

        debug!("{:?}: {:?}", page.name, filename);

        if filename.exists() {
            debug!("Exists, skipping");
            continue;
        }

        let html = simplify(&page.article_body.html);

        let mut file = File::create(filename)?;
        file.write_all(html.as_bytes())?;
    }

    Ok(())
src/wm/mod.rs (new file) | 177 lines
@@ -0,0 +1,177 @@
//! Wikimedia types
use std::{
    collections::HashSet,
    ffi::OsStr,
    fs::{self},
    num::ParseIntError,
    str::FromStr,
};

use anyhow::{anyhow, bail, Context};

use url::Url;

mod page;
pub use page::Page;

/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikidataQid::from_str(line).with_context(|| {
                let line_num = i + 1;
                format!("bad QID value on line {line_num}: {line:?}")
            })
        })
        .collect()
}

/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
    path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikipediaTitleNorm::from_url(line).with_context(|| {
                let line_num = i + 1;
                format!("bad wikipedia url on line {line_num}: {line:?}")
            })
        })
        .collect()
}

pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
    let Some(wikidata) = &page.main_entity else { return None; };
    let wikidata_id = &wikidata.identifier;
    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
        Ok(qid) => qid,
        Err(e) => {
            eprintln!("Could not parse QID: {:?}: {}", wikidata_id, e);
            return None;
        }
    };

    ids.get(&wikidata_id).map(|_| wikidata_id)
}

pub fn is_wikipedia_match(
    titles: &HashSet<WikipediaTitleNorm>,
    page: &Page,
) -> Option<WikipediaTitleNorm> {
    // TODO: handle multiple languages
    let title = WikipediaTitleNorm::from_title(&page.name, "en");

    if titles.get(&title).is_some() {
        return Some(title);
    }

    for redirect in &page.redirects {
        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");

        if titles.get(&title).is_some() {
            return Some(title);
        }
    }

    None
}

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str("12345").unwrap();
/// assert_eq!(with_q, without_q);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);

impl FromStr for WikidataQid {
    type Err = ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.strip_prefix('Q').unwrap_or(s);
        u32::from_str(s).map(WikidataQid)
    }
}

/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title/").unwrap();
/// let title = WikipediaTitleNorm::from_title("Article Title", "en");
/// assert_eq!(url, title);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
    lang: String,
    name: String,
}

impl WikipediaTitleNorm {
    fn normalize_title(title: &str) -> String {
        // TODO: compare with generator url creation
        title.replace(' ', "_")
    }

    // https://en.wikipedia.org/wiki/Article_Title
    pub fn from_url(url: &str) -> anyhow::Result<Self> {
        let url = Url::parse(url)?;

        let (subdomain, host) = url
            .host_str()
            .ok_or(anyhow!("Expected host"))?
            .split_once('.')
            .ok_or(anyhow!("Expected subdomain"))?;
        if host != "wikipedia.org" {
            bail!("Expected wikipedia.org for domain")
        }
        let lang = subdomain;

        let mut paths = url.path_segments().ok_or(anyhow!("Expected path"))?;

        let root = paths
            .next()
            .ok_or(anyhow!("Expected first segment in path"))?;

        if root != "wiki" {
            bail!("Expected 'wiki' in path")
        }

        let title = paths
            .next()
            .ok_or(anyhow!("Expected second segment in path"))?;
        let title = urlencoding::decode(title)?;

        Ok(Self::from_title(&title, lang))
    }

    // en:Article Title
    fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
        let (lang, title) = tag.split_once(':').ok_or(anyhow!("Expected ':'"))?;

        Ok(Self::from_title(title, lang))
    }

    pub fn from_title(title: &str, lang: &str) -> Self {
        let name = Self::normalize_title(title);
        let lang = lang.to_owned();
        Self { name, lang }
    }
}
src/wm/page.rs (new file) | 36 lines
@@ -0,0 +1,36 @@
use serde::Deserialize;

/// Deserialized Wikimedia Enterprise API Article
///
/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Page {
    // TODO: check if CoW has a performance impact
    pub name: String,
    pub date_modified: String,
    #[serde(default)]
    pub url: String,
    pub main_entity: Option<Wikidata>,
    // TODO: see what impact parsing/unescaping/allocating this has
    pub article_body: ArticleBody,
    #[serde(default)]
    pub redirects: Vec<Redirect>,
}

#[derive(Deserialize)]
pub struct Wikidata {
    pub identifier: String,
}

#[derive(Deserialize)]
pub struct ArticleBody {
    pub html: String,
}

#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Redirect {
    pub url: String,
    pub name: String,
}
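As a closing note, here is a minimal sketch (not part of this commit) of how the wm helpers compose: parse the allow-list once, then check each deserialized Page against it. The JSON literal is a hypothetical, trimmed-down Enterprise dump record, and the path /tmp/wikidata_ids.txt is taken from the usage comments in src/main.rs; serde_json and anyhow are assumed to be available as in this crate.

use om_wikiparser::wm::{is_wikidata_match, parse_wikidata_file, Page};

fn main() -> anyhow::Result<()> {
    // Load the set of allowed QIDs, one per line (as produced by the map generator prep step).
    let ids = parse_wikidata_file("/tmp/wikidata_ids.txt")?;

    // Hypothetical, trimmed-down dump record; url and redirects fall back to their defaults.
    let page: Page = serde_json::from_str(
        r#"{
            "name": "Spatial database",
            "date_modified": "2023-04-01T00:00:00Z",
            "main_entity": { "identifier": "Q1234" },
            "article_body": { "html": "<html>...</html>" }
        }"#,
    )?;

    if is_wikidata_match(&ids, &page).is_some() {
        println!("{} is in the allow-list", page.name);
    }
    Ok(())
}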