Refactor and rename title/qid wrappers

- Move Qid and Title to separate modules
- Reformat benchmark

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

parent bdf6f1a68c
commit 34bb9318d5

6 changed files with 211 additions and 207 deletions
```diff
@@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
 extern crate om_wikiparser;
 extern crate test;
 
+use om_wikiparser::wm::{Qid, Title};
+
+const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
+const QID: &str = "Q123456789";
+
 #[bench]
 fn parse_wikipedia(b: &mut test::Bencher) {
     b.iter(|| {
-        let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
-            "https://en.wikipedia.org/wiki/Article_Title",
-        )
-        .unwrap();
+        Title::from_url(TITLE).unwrap();
     });
 }
 
 #[bench]
 fn hash_wikipedia(b: &mut test::Bencher) {
-    let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
-        "https://en.wikipedia.org/wiki/Article_Title",
-    )
-    .unwrap();
+    let title = Title::from_url(TITLE).unwrap();
     let mut set = HashSet::new();
     b.iter(|| {
         set.insert(&title);
@@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
 #[bench]
 fn parse_wikidata(b: &mut test::Bencher) {
     b.iter(|| {
-        let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
+        Qid::from_str(QID).unwrap();
     });
 }
 
 #[bench]
 fn hash_wikidata(b: &mut test::Bencher) {
-    let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
+    let qid = Qid::from_str(QID).unwrap();
     let mut set = HashSet::new();
     b.iter(|| {
         set.insert(&qid);
```
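`extern crate test` and `#[bench]` require a nightly toolchain. For reference, a minimal stable-Rust sketch of the operations being measured, using the renamed wrappers and the same inputs as the `TITLE` and `QID` constants above:

```rust
use std::str::FromStr;

use om_wikiparser::wm::{Qid, Title};

fn main() {
    // Same inputs as the TITLE and QID constants in the benchmark.
    let title = Title::from_url("https://en.wikipedia.org/wiki/Article_Title").unwrap();
    let qid = Qid::from_str("Q123456789").unwrap();
    // The Display impls reproduce the canonical forms.
    println!("{title}"); // en:Article_Title
    println!("{qid}");   // Q123456789
}
```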
```diff
@@ -9,7 +9,7 @@ use anyhow::{anyhow, bail, Context};
 
 use om_wikiparser::{
     html::simplify,
-    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
+    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
 };
 
 /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
@@ -154,7 +154,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
 fn create_article_dir(
     base: impl AsRef<Path>,
     page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+    redirects: impl IntoIterator<Item = Title>,
 ) -> anyhow::Result<PathBuf> {
     let base = base.as_ref();
     let mut redirects = redirects.into_iter();
@@ -237,7 +237,7 @@ fn create_article_dir(
 fn write(
     base: impl AsRef<Path>,
     page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+    redirects: impl IntoIterator<Item = Title>,
 ) -> anyhow::Result<()> {
     let article_dir = create_article_dir(base, page, redirects)?;
```
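A hypothetical call-site sketch, not part of this diff (the `process` wrapper is an assumption, and `write` is assumed visible from the same module): the new `IntoIterator<Item = Title>` bound is satisfied by filtering the fallible redirect iterator down to the titles that parsed.

```rust
use std::path::Path;

use om_wikiparser::wm::{Page, Title};

// Hypothetical helper in the same module as the `write` function above:
// keep only the redirect titles that parsed, then pass them through.
fn process(base: &Path, page: &Page) -> anyhow::Result<()> {
    let redirects: Vec<Title> = page.redirects().filter_map(Result::ok).collect();
    write(base, page, redirects)
}
```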
src/wm/mod.rs (200 changed lines)
```diff
@@ -1,24 +1,23 @@
 //! Wikimedia types
-use std::{
-    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
-    str::FromStr,
-};
+use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
 
-use anyhow::{anyhow, bail, Context};
-
-use url::Url;
+use anyhow::{anyhow, Context};
 
 mod page;
 pub use page::Page;
+mod title;
+pub use title::*;
+mod qid;
+pub use qid::*;
 
 /// Read from a file of urls on each line.
-pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
+pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
     let contents = fs::read_to_string(path.as_ref())?;
     Ok(contents
         .lines()
         .enumerate()
         .map(|(i, line)| {
-            WikidataQid::from_str(line).with_context(|| {
+            Qid::from_str(line).with_context(|| {
                 let line_num = i + 1;
                 format!("on line {line_num}: {line:?}")
             })
```
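Since `title` and `qid` stay private modules re-exported with `pub use ...::*`, the file move is invisible to downstream code; a sketch of the unchanged import path:

```rust
// Unchanged by this refactor: the renamed types are still reachable at the flat `wm` path.
use om_wikiparser::wm::{Page, Qid, Title};
```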
```diff
@@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
 }
 
 /// Read article titles from a file of urls on each line.
-pub fn parse_wikipedia_file(
-    path: impl AsRef<OsStr>,
-) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
+pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
     let contents = fs::read_to_string(path.as_ref())?;
     Ok(contents
         .lines()
         .enumerate()
         .map(|(i, line)| {
-            WikipediaTitleNorm::from_url(line).with_context(|| {
+            Title::from_url(line).with_context(|| {
                 let line_num = i + 1;
                 format!("on line {line_num}: {line:?}")
             })
```
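A short usage sketch of the two parsers above (the file names are assumptions); both annotate a failed parse with the offending line number via `with_context`:

```rust
use om_wikiparser::wm::{parse_wikidata_file, parse_wikipedia_file};

fn load_filters() -> anyhow::Result<()> {
    // One "Q…" id per line.
    let qids = parse_wikidata_file("wikidata_ids.txt")?;
    // One article url per line, e.g. "https://en.wikipedia.org/wiki/Spatial_database".
    let titles = parse_wikipedia_file("wikipedia_urls.txt")?;
    println!("{} qids, {} titles", qids.len(), titles.len());
    Ok(())
}
```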
```diff
@@ -59,8 +56,8 @@ pub fn parse_wikipedia_file(
 pub fn parse_osm_tag_file(
     path: impl AsRef<OsStr>,
-    qids: &mut HashSet<WikidataQid>,
-    titles: &mut HashSet<WikipediaTitleNorm>,
+    qids: &mut HashSet<Qid>,
+    titles: &mut HashSet<Title>,
 ) -> anyhow::Result<()> {
     let path = path.as_ref();
     let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
@@ -93,7 +90,7 @@ pub fn parse_osm_tag_file(
         let qid = &row[qid_col].trim();
         if !qid.is_empty() {
-            match WikidataQid::from_str(qid) {
+            match Qid::from_str(qid) {
                 Ok(qid) => {
                     qids.insert(qid);
                 }
@@ -109,7 +106,7 @@ pub fn parse_osm_tag_file(
         let title = &row[title_col].trim();
         if !title.is_empty() {
-            match WikipediaTitleNorm::from_osm_tag(title) {
+            match Title::from_osm_tag(title) {
                 Ok(title) => {
                     titles.insert(title);
                 }
```
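And a sketch of the tab-separated entry point (the file name is an assumption; the column handling via `qid_col`/`title_col` is shown above): valid wikidata and wikipedia tag values are accumulated into the caller's sets.

```rust
use std::collections::HashSet;

use om_wikiparser::wm::{parse_osm_tag_file, Qid, Title};

fn load_osm_tags() -> anyhow::Result<()> {
    let mut qids: HashSet<Qid> = HashSet::new();
    let mut titles: HashSet<Title> = HashSet::new();
    // Reads the TSV and inserts every value that parses as a Qid or Title.
    parse_osm_tag_file("osm_tags.tsv", &mut qids, &mut titles)?;
    Ok(())
}
```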
```diff
@@ -126,172 +123,3 @@ pub fn parse_osm_tag_file(
 
     Ok(())
 }
-
-/// Wikidata QID/Q Number
-///
-/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
-///
-/// ```
-/// use std::str::FromStr;
-/// use om_wikiparser::wm::WikidataQid;
-///
-/// let with_q = WikidataQid::from_str("Q12345").unwrap();
-/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
-/// assert_eq!(with_q, without_q);
-///
-/// assert!(WikidataQid::from_str("q12345").is_ok());
-/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
-/// assert!(WikidataQid::from_str("Article_Title").is_err());
-/// assert!(WikidataQid::from_str("Q").is_err());
-/// assert!(WikidataQid::from_str("").is_err());
-/// ```
-#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct WikidataQid(u32);
-
-impl FromStr for WikidataQid {
-    type Err = ParseIntError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.trim();
-        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
-        u32::from_str(s).map(WikidataQid)
-    }
-}
-
-impl Display for WikidataQid {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Q{}", self.0)
-    }
-}
-
-impl WikidataQid {
-    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
-        let mut path = base;
-        path.push("wikidata");
-        // TODO: can use as_mut_os_string with 1.70.0
-        path.push(self.to_string());
-
-        path
-    }
-}
-
-/// Normalized wikipedia article title that can compare:
-/// - titles `Spatial Database`
-/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
-/// - osm-style tags `en:Spatial Database`
-///
-/// ```
-/// use om_wikiparser::wm::WikipediaTitleNorm;
-///
-/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
-/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// assert_eq!(url, title);
-/// assert_eq!(url, mobile);
-/// assert_eq!(url, url_tag1);
-/// assert_eq!(url, url_tag2);
-///
-/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
-/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
-///
-/// assert!(
-///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
-///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
-/// );
-/// ```
-#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct WikipediaTitleNorm {
-    lang: String,
-    name: String,
-}
-
-impl Display for WikipediaTitleNorm {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}:{}", self.lang, self.name)
-    }
-}
-
-impl WikipediaTitleNorm {
-    fn normalize_title(title: &str) -> String {
-        // TODO: Compare with map generator url creation, ensure covers all cases.
-        title.trim().replace(' ', "_")
-    }
-
-    // https://en.wikipedia.org/wiki/Article_Title/More_Title
-    pub fn from_url(url: &str) -> anyhow::Result<Self> {
-        let url = Url::parse(url.trim())?;
-
-        let (subdomain, host) = url
-            .host_str()
-            .ok_or_else(|| anyhow!("Expected host"))?
-            .split_once('.')
-            .ok_or_else(|| anyhow!("Expected subdomain"))?;
-        let host = host.strip_prefix("m.").unwrap_or(host);
-        if host != "wikipedia.org" {
-            bail!("Expected wikipedia.org for domain")
-        }
-        let lang = subdomain;
-
-        let path = url.path();
-
-        let (root, title) = path
-            .strip_prefix('/')
-            .unwrap_or(path)
-            .split_once('/')
-            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
-
-        if root != "wiki" {
-            bail!("Expected 'wiki' as root path, got: {:?}", root)
-        }
-        let title = urlencoding::decode(title)?;
-
-        Self::from_title(&title, lang)
-    }
-
-    // en:Article Title
-    pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
-        let (lang, title) = tag
-            .trim()
-            .split_once(':')
-            .ok_or_else(|| anyhow!("Expected ':'"))?;
-
-        let lang = lang.trim_start();
-        let title = title.trim_start();
-
-        if matches!(lang, "http" | "https") {
-            return Self::from_url(tag);
-        }
-
-        if title.starts_with("http://") || title.starts_with("https://") {
-            return Self::from_url(title);
-        }
-
-        Self::from_title(title, lang)
-    }
-
-    pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
-        let title = title.trim();
-        let lang = lang.trim();
-        if title.is_empty() {
-            bail!("title cannot be empty or whitespace");
-        }
-        if lang.is_empty() {
-            bail!("lang cannot be empty or whitespace");
-        }
-        let name = Self::normalize_title(title);
-        let lang = lang.to_owned();
-        Ok(Self { name, lang })
-    }
-
-    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
-        let mut path = base;
-        // TODO: can use as_mut_os_string with 1.70.0
-        path.push(format!("{}.wikipedia.org", self.lang));
-        path.push("wiki");
-        path.push(&self.name);
-
-        path
-    }
-}
```
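The roughly 170 deleted lines above are the old `WikidataQid` and `WikipediaTitleNorm` definitions; they reappear below in the new src/wm/qid.rs and src/wm/title.rs files, essentially verbatim apart from the rename and the added `ParseQidError` alias.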
src/wm/page.rs

```diff
@@ -2,7 +2,7 @@ use std::{iter, str::FromStr};
 
 use serde::Deserialize;
 
-use super::{WikidataQid, WikipediaTitleNorm};
+use super::{Qid, Title};
 
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
@@ -25,27 +25,27 @@ pub struct Page {
 }
 
 impl Page {
-    pub fn wikidata(&self) -> Option<WikidataQid> {
+    pub fn wikidata(&self) -> Option<Qid> {
         // TODO: return error
         self.main_entity
             .as_ref()
-            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+            .map(|e| Qid::from_str(&e.identifier).unwrap())
     }
 
     /// Title of the article
-    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
-        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    pub fn title(&self) -> anyhow::Result<Title> {
+        Title::from_title(&self.name, &self.in_language.identifier)
     }
 
     /// All titles that lead to the article, the main title followed by any redirects.
-    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
         iter::once(self.title()).chain(self.redirects())
     }
 
-    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
         self.redirects
             .iter()
-            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+            .map(|r| Title::from_title(&r.name, &self.in_language.identifier))
     }
 }
```
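A sketch of consuming the fallible iterators above; `all_titles` yields the main title first, then any redirects, each wrapped in `anyhow::Result`:

```rust
use om_wikiparser::wm::Page;

fn log_titles(page: &Page) {
    for title in page.all_titles() {
        match title {
            Ok(t) => println!("{t}"),
            Err(e) => eprintln!("skipping bad title: {e}"),
        }
    }
}
```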
src/wm/qid.rs (new file, 51 lines)
```diff
@@ -0,0 +1,51 @@
+use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
+
+/// Wikidata QID/Q Number
+///
+/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
+///
+/// ```
+/// use std::str::FromStr;
+/// use om_wikiparser::wm::Qid;
+///
+/// let with_q = Qid::from_str("Q12345").unwrap();
+/// let without_q = Qid::from_str(" 12345 ").unwrap();
+/// assert_eq!(with_q, without_q);
+///
+/// assert!(Qid::from_str("q12345").is_ok());
+/// assert!(Qid::from_str("https://wikidata.org/wiki/Q12345").is_err());
+/// assert!(Qid::from_str("Article_Title").is_err());
+/// assert!(Qid::from_str("Q").is_err());
+/// assert!(Qid::from_str("").is_err());
+/// ```
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct Qid(u32);
+
+pub type ParseQidError = ParseIntError;
+
+impl FromStr for Qid {
+    type Err = ParseQidError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.trim();
+        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
+        u32::from_str(s).map(Qid)
+    }
+}
+
+impl Display for Qid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl Qid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+
+        path
+    }
+}
```
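Beyond the doctest, a sketch of the parse/format/path round-trip `Qid` provides (the `out` directory is an arbitrary example):

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::Qid;

fn main() {
    // A lowercase prefix and surrounding whitespace are tolerated.
    let qid = Qid::from_str(" q123456789 ").unwrap();
    assert_eq!(qid.to_string(), "Q123456789");
    // get_dir nests articles under a "wikidata" folder.
    assert_eq!(
        qid.get_dir(PathBuf::from("out")),
        PathBuf::from("out/wikidata/Q123456789")
    );
}
```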
src/wm/title.rs (new file, 126 lines)
```diff
@@ -0,0 +1,126 @@
+use std::{fmt::Display, path::PathBuf};
+
+use anyhow::{anyhow, bail};
+use url::Url;
+
+/// Normalized wikipedia article title that can compare:
+/// - titles `Spatial Database`
+/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
+/// - osm-style tags `en:Spatial Database`
+///
+/// ```
+/// use om_wikiparser::wm::Title;
+///
+/// let title = Title::from_title("Article Title", "en").unwrap();
+/// let url = Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let mobile = Title::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let url_tag1 = Title::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let url_tag2 = Title::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// assert_eq!(url, title);
+/// assert_eq!(url, mobile);
+/// assert_eq!(url, url_tag1);
+/// assert_eq!(url, url_tag2);
+///
+/// assert!(Title::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
+/// assert!(Title::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     Title::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     Title::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
+/// ```
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct Title {
+    lang: String,
+    name: String,
+}
+
+impl Display for Title {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}", self.lang, self.name)
+    }
+}
+
+impl Title {
+    fn normalize_title(title: &str) -> String {
+        // TODO: Compare with map generator url creation, ensure covers all cases.
+        title.trim().replace(' ', "_")
+    }
+
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
+    pub fn from_url(url: &str) -> anyhow::Result<Self> {
+        let url = Url::parse(url.trim())?;
+
+        let (subdomain, host) = url
+            .host_str()
+            .ok_or_else(|| anyhow!("Expected host"))?
+            .split_once('.')
+            .ok_or_else(|| anyhow!("Expected subdomain"))?;
+        let host = host.strip_prefix("m.").unwrap_or(host);
+        if host != "wikipedia.org" {
+            bail!("Expected wikipedia.org for domain")
+        }
+        let lang = subdomain;
+
+        let path = url.path();
+
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
+
+        if root != "wiki" {
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
+        }
+        let title = urlencoding::decode(title)?;
+
+        Self::from_title(&title, lang)
+    }
+
+    // en:Article Title
+    pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
+        let (lang, title) = tag
+            .trim()
+            .split_once(':')
+            .ok_or_else(|| anyhow!("Expected ':'"))?;
+
+        let lang = lang.trim_start();
+        let title = title.trim_start();
+
+        if matches!(lang, "http" | "https") {
+            return Self::from_url(tag);
+        }
+
+        if title.starts_with("http://") || title.starts_with("https://") {
+            return Self::from_url(title);
+        }
+
+        Self::from_title(title, lang)
+    }
+
+    pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
+        let title = title.trim();
+        let lang = lang.trim();
+        if title.is_empty() {
+            bail!("title cannot be empty or whitespace");
+        }
+        if lang.is_empty() {
+            bail!("lang cannot be empty or whitespace");
+        }
+        let name = Self::normalize_title(title);
+        let lang = lang.to_owned();
+        Ok(Self { name, lang })
+    }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+
+        path
+    }
+}
```
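Likewise for `Title`, a sketch of how the constructors converge on one normalized form and how `get_dir` mirrors the site layout (again, `out` is an arbitrary base):

```rust
use std::path::PathBuf;

use om_wikiparser::wm::Title;

fn main() -> anyhow::Result<()> {
    // Mobile url and osm-style tag normalize identically: the "m." subdomain
    // and "#fragment" are dropped, and spaces become underscores.
    let title = Title::from_url("https://en.m.wikipedia.org/wiki/Spatial_database#Geodatabase")?;
    assert_eq!(title, Title::from_osm_tag("en:Spatial database")?);
    assert_eq!(title.to_string(), "en:Spatial_database");
    assert_eq!(
        title.get_dir(PathBuf::from("out")),
        PathBuf::from("out/en.wikipedia.org/wiki/Spatial_database")
    );
    Ok(())
}
```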