Refactor and rename title/qid wrappers

- Move Qid and Title to separate modules
- Reformat benchmark

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-09 12:09:40 -04:00 committed by Evan Lloyd New-Schmidt
parent bdf6f1a68c
commit 34bb9318d5
6 changed files with 211 additions and 207 deletions

View file

@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
extern crate om_wikiparser;
extern crate test;
use om_wikiparser::wm::{Qid, Title};
const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
const QID: &str = "Q123456789";
#[bench]
fn parse_wikipedia(b: &mut test::Bencher) {
b.iter(|| {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
Title::from_url(TITLE).unwrap();
});
}
#[bench]
fn hash_wikipedia(b: &mut test::Bencher) {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
let title = Title::from_url(TITLE).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&title);
@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
#[bench]
fn parse_wikidata(b: &mut test::Bencher) {
b.iter(|| {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
Qid::from_str(QID).unwrap();
});
}
#[bench]
fn hash_wikidata(b: &mut test::Bencher) {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
let qid = Qid::from_str(QID).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&qid);

View file

@ -9,7 +9,7 @@ use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
html::simplify,
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
};
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
@ -154,7 +154,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
fn create_article_dir(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<PathBuf> {
let base = base.as_ref();
let mut redirects = redirects.into_iter();
@ -237,7 +237,7 @@ fn create_article_dir(
fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<()> {
let article_dir = create_article_dir(base, page, redirects)?;

View file

@ -1,24 +1,23 @@
//! Wikimedia types
use std::{
collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
str::FromStr,
};
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use anyhow::{anyhow, bail, Context};
use url::Url;
use anyhow::{anyhow, Context};
mod page;
pub use page::Page;
mod title;
pub use title::*;
mod qid;
pub use qid::*;
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Wi
}
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikipediaTitleNorm::from_url(line).with_context(|| {
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@ -59,8 +56,8 @@ pub fn parse_wikipedia_file(
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<WikidataQid>,
titles: &mut HashSet<WikipediaTitleNorm>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
@ -93,7 +90,7 @@ pub fn parse_osm_tag_file(
let qid = &row[qid_col].trim();
if !qid.is_empty() {
match WikidataQid::from_str(qid) {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
}
@ -109,7 +106,7 @@ pub fn parse_osm_tag_file(
let title = &row[title_col].trim();
if !title.is_empty() {
match WikipediaTitleNorm::from_osm_tag(title) {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
}
@ -126,172 +123,3 @@ pub fn parse_osm_tag_file(
Ok(())
}
/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
/// assert_eq!(with_q, without_q);
///
/// assert!(WikidataQid::from_str("q12345").is_ok());
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
/// assert!(WikidataQid::from_str("Article_Title").is_err());
/// assert!(WikidataQid::from_str("Q").is_err());
/// assert!(WikidataQid::from_str("").is_err());
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);
impl FromStr for WikidataQid {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(WikidataQid)
}
}
impl Display for WikidataQid {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Q{}", self.0)
}
}
impl WikidataQid {
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
path.push("wikidata");
// TODO: can use as_mut_os_string with 1.70.0
path.push(self.to_string());
path
}
}
/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
/// assert_eq!(url, mobile);
/// assert_eq!(url, url_tag1);
/// assert_eq!(url, url_tag2);
///
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
///
/// assert!(
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
/// );
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
lang: String,
name: String,
}
impl Display for WikipediaTitleNorm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}", self.lang, self.name)
}
}
impl WikipediaTitleNorm {
fn normalize_title(title: &str) -> String {
// TODO: Compare with map generator url creation, ensure covers all cases.
title.trim().replace(' ', "_")
}
// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
let url = Url::parse(url.trim())?;
let (subdomain, host) = url
.host_str()
.ok_or_else(|| anyhow!("Expected host"))?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
}
let lang = subdomain;
let path = url.path();
let (root, title) = path
.strip_prefix('/')
.unwrap_or(path)
.split_once('/')
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
if root != "wiki" {
bail!("Expected 'wiki' as root path, got: {:?}", root)
}
let title = urlencoding::decode(title)?;
Self::from_title(&title, lang)
}
// en:Article Title
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or_else(|| anyhow!("Expected ':'"))?;
let lang = lang.trim_start();
let title = title.trim_start();
if matches!(lang, "http" | "https") {
return Self::from_url(tag);
}
if title.starts_with("http://") || title.starts_with("https://") {
return Self::from_url(title);
}
Self::from_title(title, lang)
}
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
let title = title.trim();
let lang = lang.trim();
if title.is_empty() {
bail!("title cannot be empty or whitespace");
}
if lang.is_empty() {
bail!("lang cannot be empty or whitespace");
}
let name = Self::normalize_title(title);
let lang = lang.to_owned();
Ok(Self { name, lang })
}
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
// TODO: can use as_mut_os_string with 1.70.0
path.push(format!("{}.wikipedia.org", self.lang));
path.push("wiki");
path.push(&self.name);
path
}
}

View file

@ -2,7 +2,7 @@ use std::{iter, str::FromStr};
use serde::Deserialize;
use super::{WikidataQid, WikipediaTitleNorm};
use super::{Qid, Title};
// TODO: consolidate into single struct
/// Deserialized Wikimedia Enterprise API Article
@ -25,27 +25,27 @@ pub struct Page {
}
impl Page {
pub fn wikidata(&self) -> Option<WikidataQid> {
pub fn wikidata(&self) -> Option<Qid> {
// TODO: return error
self.main_entity
.as_ref()
.map(|e| WikidataQid::from_str(&e.identifier).unwrap())
.map(|e| Qid::from_str(&e.identifier).unwrap())
}
/// Title of the article
pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
pub fn title(&self) -> anyhow::Result<Title> {
Title::from_title(&self.name, &self.in_language.identifier)
}
/// All titles that lead to the article, the main title followed by any redirects.
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
iter::once(self.title()).chain(self.redirects())
}
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
self.redirects
.iter()
.map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
}
}

51
src/wm/qid.rs Normal file
View file

@ -0,0 +1,51 @@
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
/// Wikidata QID/Q Number
///
/// See <https://www.wikidata.org/wiki/Wikidata:Glossary#QID>
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::Qid;
///
/// let with_q = Qid::from_str("Q12345").unwrap();
/// let without_q = Qid::from_str(" 12345 ").unwrap();
/// assert_eq!(with_q, without_q);
///
/// assert!(Qid::from_str("q12345").is_ok());
/// assert!(Qid::from_str("https://wikidata.org/wiki/Q12345").is_err());
/// assert!(Qid::from_str("Article_Title").is_err());
/// assert!(Qid::from_str("Q").is_err());
/// assert!(Qid::from_str("").is_err());
/// ```
// Qid wraps a single u32, so it is trivially copyable; deriving `Clone` and
// `Copy` lets callers duplicate ids freely and is backward compatible.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Qid(u32);

/// Error returned when parsing a [`Qid`] fails.
pub type ParseQidError = ParseIntError;

impl FromStr for Qid {
    type Err = ParseQidError;

    /// Parses a QID from `Q12345`, `q12345`, or a bare number, ignoring
    /// surrounding whitespace. Anything else (urls, titles, empty strings)
    /// is an error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.trim();
        // Accept an upper- or lowercase `Q` prefix, or none at all.
        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
        u32::from_str(s).map(Qid)
    }
}

impl Display for Qid {
    /// Formats in canonical `Q{number}` form.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Q{}", self.0)
    }
}

impl Qid {
    /// Returns the directory `base/wikidata/Q{number}` where this item's
    /// article data is stored.
    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
        let mut path = base;
        path.push("wikidata");
        // TODO: can use as_mut_os_string with 1.70.0
        path.push(self.to_string());
        path
    }
}

126
src/wm/title.rs Normal file
View file

@ -0,0 +1,126 @@
use std::{fmt::Display, path::PathBuf};
use anyhow::{anyhow, bail};
use url::Url;
/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::Title;
///
/// let title = Title::from_title("Article Title", "en").unwrap();
/// let url = Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let mobile = Title::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag1 = Title::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag2 = Title::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
/// assert_eq!(url, mobile);
/// assert_eq!(url, url_tag1);
/// assert_eq!(url, url_tag2);
///
/// assert!(Title::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(Title::from_url("https://wikidata.org/wiki/Q12345").is_err());
///
/// assert!(
///     Title::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
///     Title::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
/// );
/// ```
// Both fields are owned `String`s, so deriving `Clone` is cheap and lets
// callers keep a title while also inserting it into a collection; it is
// backward compatible with all existing uses.
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Title {
    // Language subdomain, e.g. `en`. Compared first by the derived `Ord`.
    lang: String,
    // Normalized article name, e.g. `Article_Title`.
    name: String,
}
// Renders the osm-style tag form `lang:name`, e.g. `en:Article_Title`.
impl Display for Title {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { lang, name } = self;
        write!(f, "{lang}:{name}")
    }
}
impl Title {
    /// Normalizes a raw title for comparison: trims surrounding whitespace
    /// and replaces spaces with underscores (the url form of a title).
    fn normalize_title(title: &str) -> String {
        // TODO: Compare with map generator url creation, ensure covers all cases.
        title.trim().replace(' ', "_")
    }

    /// Parses a wikipedia article url such as
    /// `https://en.wikipedia.org/wiki/Article_Title/More_Title`.
    ///
    /// The language is taken from the first subdomain; a single mobile `m.`
    /// segment after it is ignored. The remaining host must be exactly
    /// `wikipedia.org`, and the path must be `/wiki/<title>`; the title is
    /// percent-decoded and normalized via [`Self::from_title`].
    pub fn from_url(url: &str) -> anyhow::Result<Self> {
        let url = Url::parse(url.trim())?;

        // Split the host into the leading (language) subdomain and the rest.
        let (subdomain, host) = url
            .host_str()
            .ok_or_else(|| anyhow!("Expected host"))?
            .split_once('.')
            .ok_or_else(|| anyhow!("Expected subdomain"))?;
        // Drop the mobile-site segment, e.g. `en.m.wikipedia.org`.
        let host = host.strip_prefix("m.").unwrap_or(host);
        if host != "wikipedia.org" {
            bail!("Expected wikipedia.org for domain")
        }
        let lang = subdomain;

        // Fragments like `#Section` are not part of `Url::path`, so they
        // are discarded automatically.
        let path = url.path();

        let (root, title) = path
            .strip_prefix('/')
            .unwrap_or(path)
            .split_once('/')
            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;

        if root != "wiki" {
            bail!("Expected 'wiki' as root path, got: {:?}", root)
        }
        let title = urlencoding::decode(title)?;

        Self::from_title(&title, lang)
    }

    /// Parses an osm-style tag such as `en:Article Title`.
    ///
    /// Bare urls (which split on the scheme's `:`) and language-prefixed
    /// urls like `de:https://…` are delegated to [`Self::from_url`].
    pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
        let (lang, title) = tag
            .trim()
            .split_once(':')
            .ok_or_else(|| anyhow!("Expected ':'"))?;

        let lang = lang.trim_start();
        let title = title.trim_start();

        // A plain url split on its first ':' leaves the scheme as `lang`.
        if matches!(lang, "http" | "https") {
            return Self::from_url(tag);
        }

        // A url following a language prefix, e.g. `de:https://…`.
        if title.starts_with("http://") || title.starts_with("https://") {
            return Self::from_url(title);
        }

        Self::from_title(title, lang)
    }

    /// Builds a normalized title from a raw title and language code.
    ///
    /// Errors if either argument is empty or whitespace-only after trimming.
    pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
        let title = title.trim();
        let lang = lang.trim();
        if title.is_empty() {
            bail!("title cannot be empty or whitespace");
        }
        if lang.is_empty() {
            bail!("lang cannot be empty or whitespace");
        }
        let name = Self::normalize_title(title);
        let lang = lang.to_owned();
        Ok(Self { name, lang })
    }

    /// Returns the directory `base/{lang}.wikipedia.org/wiki/{name}` where
    /// this article's data is stored.
    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
        let mut path = base;
        // TODO: can use as_mut_os_string with 1.70.0
        path.push(format!("{}.wikipedia.org", self.lang));
        path.push("wiki");
        path.push(&self.name);
        path
    }
}