Improve url handling
- Check for urls in osm tags - Handle mobile urls Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
6d242a62aa
commit
bdf6f1a68c
1 changed files with 26 additions and 2 deletions
|
@ -109,7 +109,7 @@ pub fn parse_osm_tag_file(
|
|||
|
||||
let title = &row[title_col].trim();
|
||||
if !title.is_empty() {
|
||||
match WikipediaTitleNorm::_from_osm_tag(title) {
|
||||
match WikipediaTitleNorm::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
|
@ -185,7 +185,13 @@ impl WikidataQid {
|
|||
///
|
||||
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
|
||||
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
|
||||
/// assert_eq!(url, title);
|
||||
/// assert_eq!(url, mobile);
|
||||
/// assert_eq!(url, url_tag1);
|
||||
/// assert_eq!(url, url_tag2);
|
||||
///
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
|
||||
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
|
||||
|
@ -201,6 +207,12 @@ pub struct WikipediaTitleNorm {
|
|||
name: String,
|
||||
}
|
||||
|
||||
impl Display for WikipediaTitleNorm {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}:{}", self.lang, self.name)
|
||||
}
|
||||
}
|
||||
|
||||
impl WikipediaTitleNorm {
|
||||
fn normalize_title(title: &str) -> String {
|
||||
// TODO: Compare with map generator url creation, ensure covers all cases.
|
||||
|
@ -216,6 +228,7 @@ impl WikipediaTitleNorm {
|
|||
.ok_or_else(|| anyhow!("Expected host"))?
|
||||
.split_once('.')
|
||||
.ok_or_else(|| anyhow!("Expected subdomain"))?;
|
||||
let host = host.strip_prefix("m.").unwrap_or(host);
|
||||
if host != "wikipedia.org" {
|
||||
bail!("Expected wikipedia.org for domain")
|
||||
}
|
||||
|
@ -238,12 +251,23 @@ impl WikipediaTitleNorm {
|
|||
}
|
||||
|
||||
// en:Article Title
|
||||
fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
|
||||
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
|
||||
let (lang, title) = tag
|
||||
.trim()
|
||||
.split_once(':')
|
||||
.ok_or_else(|| anyhow!("Expected ':'"))?;
|
||||
|
||||
let lang = lang.trim_start();
|
||||
let title = title.trim_start();
|
||||
|
||||
if matches!(lang, "http" | "https") {
|
||||
return Self::from_url(tag);
|
||||
}
|
||||
|
||||
if title.starts_with("http://") || title.starts_with("https://") {
|
||||
return Self::from_url(title);
|
||||
}
|
||||
|
||||
Self::from_title(title, lang)
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue