Write to generator-compatible folder structure (#6)
The map generator expects a certain folder structure created by the current scraper to add the article content into the mwm files:

- Article html is written to a wikidata directory.
- Directories are created for any matched titles and symlinked to the wikidata directory.
- Articles without a QID are written to an article title directory.
- Article titles containing `/` are not escaped, so multiple subdirectories are possible.

The output folder hierarchy looks like this:

    .
    ├── de.wikipedia.org
    │   └── wiki
    │       ├── Coal_River_Springs_Territorial_Park
    │       │   ├── de.html
    │       │   └── ru.html
    │       ├── Ni'iinlii_Njik_(Fishing_Branch)_Territorial_Park
    │       │   ├── de.html
    │       │   └── en.html
    │       ...
    ├── en.wikipedia.org
    │   └── wiki
    │       ├── Arctic_National_Wildlife_Refuge
    │       │   ├── de.html
    │       │   ├── en.html
    │       │   ├── es.html
    │       │   ├── fr.html
    │       │   └── ru.html
    │       ├── Baltimore
    │       │   └── Washington_International_Airport
    │       │       ├── de.html
    │       │       ├── en.html
    │       │       ├── es.html
    │       │       ├── fr.html
    │       │       └── ru.html
    │       ...
    └── wikidata
        ├── Q59320
        │   ├── de.html
        │   ├── en.html
        │   ├── es.html
        │   ├── fr.html
        │   └── ru.html
        ├── Q120306
        │   ├── de.html
        │   ├── en.html
        │   ├── es.html
        │   ├── fr.html
        │   └── ru.html
        ...

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
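The on-disk contract described above can be pictured with plain `std` calls (a sketch only, not code from this commit; `Q59320` comes from the example tree and `Some_Title` is a hypothetical article):

```rust
use std::{fs, io, os::unix, path::Path};

// Sketch: an article's html lives under wikidata/QID/, and every matched
// wikipedia title is a directory symlinked to that QID directory.
fn sketch(base: &Path) -> io::Result<()> {
    let main_dir = base.join("wikidata/Q59320");
    fs::create_dir_all(&main_dir)?;

    let title_dir = base.join("en.wikipedia.org/wiki/Some_Title");
    // Titles may contain `/`, so parent directories must exist first.
    fs::create_dir_all(title_dir.parent().unwrap())?;
    unix::fs::symlink(&main_dir, &title_dir)?;

    Ok(())
}
```

Either path then resolves to the same per-language `lang.html` files.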
parent bb1f897cd2
commit 9036e3413f

4 changed files with 233 additions and 95 deletions
README.md (42 changed lines)

@@ -2,7 +2,45 @@
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._

-## Usage
-
 ## Configuring

 [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
-It defines article sections that are not important for users and should be removed.
+It defines article sections that are not important for users and should be removed from the extracted HTML.
+
+## Usage
+
+First, install [the rust language tools](https://www.rust-lang.org/).
+
+For best performance, use `--release` when building or running.
+
+You can run the program from within this directory using `cargo run --release --`.
+
+Alternatively, build it with `cargo build --release`, which places the binary in `./target/release/om-wikiparser`.
+
+Run the program with the `--help` flag to see all supported arguments.
+
+It takes as inputs:
+
+- A wikidata enterprise JSON dump, extracted and connected to `stdin`.
+- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
+- A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as the CLI flag `--wikipedia-urls`.
+- A directory to write the extracted articles to, as a CLI argument.
+
+As an example of usage with the map generator:
+
+- Assuming this program is installed to `$PATH` as `om-wikiparser`.
+- Download [the dumps in the desired languages](https://dumps.wikimedia.org/other/enterprise_html/runs/) (use the files with the format `${LANG}wiki-NS0-${DATE}-ENTERPRISE-HTML.json.tar.gz`).
+  Set `DUMP_DOWNLOAD_DIR` to the location they are downloaded.
+- Run the following from within the `intermediate_data` subdirectory of the maps build directory:
+
+```shell
+# Transform intermediate files from generator.
+cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
+tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Begin extraction.
+for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
+do
+  tar xzf $dump --to-stdout | om-wikiparser \
+    --wikidata-ids wikidata_ids.txt \
+    --wikipedia-urls wikipedia_urls.txt \
+    descriptions/
+done
+```
src/main.rs (153 changed lines)

@@ -1,27 +1,18 @@
-// Usage:
-// # prep outputs from map generator
-// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
-// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
-// # feed gzipped tarfile
-// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
-// | cargo run --release -- \
-// --wikidata-ids /tmp/wikidata_ids.txt \
-// --wikipedia-urls /tmp/wikipedia_urls.txt \
-// output_dir
 use std::{
-    fs::{create_dir, File},
+    fs::{self, File},
     io::{stdin, BufRead, Write},
+    os::unix,
     path::{Path, PathBuf},
 };

-use anyhow::bail;
+use anyhow::{anyhow, bail, Context};
 use clap::Parser;
 #[macro_use]
 extern crate log;

 use om_wikiparser::{
     html::simplify,
-    wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
+    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };

 #[derive(Parser)]
@@ -33,33 +24,115 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }

-fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
-    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-        // TODO: handle and still write
-        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
-    };
+/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
+fn create_article_dir(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<PathBuf> {
+    let base = base.as_ref();
+    let mut redirects = redirects.into_iter();
+
+    let main_dir = match page.wikidata() {
+        None => {
+            // Write to wikipedia title directory.
+            // Prefer first redirect, fall back to page title if none exist
+            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
+            redirects
+                .next()
+                .or_else(|| match page.title() {
+                    Ok(title) => Some(title),
+                    Err(e) => {
+                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
+                        None
+                    }
+                })
+                // hard fail when no titles can be parsed
+                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
+                .get_dir(base.to_owned())
+        }
+        Some(qid) => {
+            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
+            qid.get_dir(base.to_owned())
+        }
+    };

-    let mut filename = dir.as_ref().to_owned();
-    filename.push(qid);
+    if main_dir.is_symlink() {
+        fs::remove_file(&main_dir)
+            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
+    }
+    fs::create_dir_all(&main_dir)
+        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
+
+    // Write symlinks to main directory.
+    // TODO: Only write redirects that we care about.
+    for title in redirects {
+        let wikipedia_dir = title.get_dir(base.to_owned());
+
+        // Build required directory.
+        //
+        // Possible states from previous run:
+        // - Does not exist (and is not a symlink)
+        // - Exists, is a directory
+        // - Exists, is a valid symlink to correct location
+        // - Exists, is a valid symlink to incorrect location
+        if wikipedia_dir.exists() {
+            if wikipedia_dir.is_symlink() {
+                // Only replace if not valid
+                if fs::read_link(&wikipedia_dir)? == main_dir {
+                    continue;
+                }
+                fs::remove_file(&wikipedia_dir)?;
+            } else {
+                fs::remove_dir_all(&wikipedia_dir)?;
+            }
+        } else {
+            // titles can contain `/`, so ensure necessary subdirs exist
+            let parent_dir = wikipedia_dir.parent().unwrap();
+            fs::create_dir_all(parent_dir)
+                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
+        }
+
+        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
+            format!(
+                "creating symlink from {:?} to {:?}",
+                wikipedia_dir, main_dir
+            )
+        })?;
+    }
+
+    Ok(main_dir)
+}
+
+/// Write selected article to disk.
+///
+/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
+/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
+/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
+fn write(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let article_dir = create_article_dir(base, page, redirects)?;

+    // Write html to determined file.
+    let mut filename = article_dir;
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");

     debug!("{:?}: {:?}", page.name, filename);

     if filename.exists() {
-        debug!("Exists, skipping");
-        return Ok(());
-    }
-
-    let subfolder = filename.parent().unwrap();
-    if !subfolder.exists() {
-        create_dir(subfolder)?;
+        debug!("Overwriting existing file");
     }

     let html = simplify(&page.article_body.html, &page.in_language.identifier);

-    let mut file = File::create(&filename)?;
-    file.write_all(html.as_bytes())?;
+    let mut file =
+        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
+    file.write_all(html.as_bytes())
+        .with_context(|| format!("writing html file {:?}", filename))?;

     Ok(())
 }
@@ -104,14 +177,28 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;

-        if !(is_wikidata_match(&wikidata_ids, &page).is_some()
-            || is_wikipedia_match(&wikipedia_titles, &page).is_some())
-        {
+        let is_wikidata_match = page
+            .wikidata()
+            .map(|qid| wikidata_ids.contains(&qid))
+            .unwrap_or_default();
+
+        let matching_titles = page
+            .all_titles()
+            .filter_map(|r| {
+                r.map(Some).unwrap_or_else(|e| {
+                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                    None
+                })
+            })
+            .filter(|t| wikipedia_titles.contains(t))
+            .collect::<Vec<_>>();
+
+        if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }

-        if let Err(e) = write(&args.output_dir, page) {
-            error!("Error writing article: {}", e);
+        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
+            error!("Error writing article {:?}: {:#}", page.name, e);
         }
     }
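The rewritten matching step leans on `WikipediaTitleNorm` equality instead of the removed `is_wikidata_match`/`is_wikipedia_match` helpers. A rough sketch of why a page title matches a stored URL (assuming `from_title` is public like `from_url`; the article name echoes the module's doc-comment examples):

```rust
use std::collections::HashSet;

use om_wikiparser::wm::WikipediaTitleNorm;

fn main() -> anyhow::Result<()> {
    // Titles passed via `--wikipedia-urls` are stored in normalized form...
    let mut titles = HashSet::new();
    titles.insert(WikipediaTitleNorm::from_url(
        "https://en.wikipedia.org/wiki/Spatial_database",
    )?);

    // ...and a page's own title normalizes to the same value, so the
    // `filter(|t| wikipedia_titles.contains(t))` step above matches it.
    let from_page = WikipediaTitleNorm::from_title("Spatial database", "en")?;
    assert!(titles.contains(&from_page));

    Ok(())
}
```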
src/wm/mod.rs (104 changed lines)

@@ -1,5 +1,8 @@
 //! Wikimedia types
-use std::{collections::HashSet, ffi::OsStr, fs, num::ParseIntError, str::FromStr};
+use std::{
+    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
+    str::FromStr,
+};

 use anyhow::{anyhow, bail, Context};
@@ -40,53 +43,6 @@ pub fn parse_wikipedia_file(
         .collect()
 }

-pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
-    let Some(wikidata) = &page.main_entity else { return None;};
-    let wikidata_id = &wikidata.identifier;
-    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
-        Ok(qid) => qid,
-        Err(e) => {
-            warn!(
-                "Could not parse QID for {:?}: {:?}: {:#}",
-                page.name, wikidata_id, e
-            );
-            return None;
-        }
-    };
-
-    ids.get(&wikidata_id).map(|_| wikidata_id)
-}
-
-pub fn is_wikipedia_match(
-    titles: &HashSet<WikipediaTitleNorm>,
-    page: &Page,
-) -> Option<WikipediaTitleNorm> {
-    match WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier) {
-        Err(e) => warn!("Could not parse title for {:?}: {:#}", page.name, e),
-        Ok(title) => {
-            if titles.get(&title).is_some() {
-                return Some(title);
-            }
-        }
-    }
-
-    for redirect in &page.redirects {
-        match WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier) {
-            Err(e) => warn!(
-                "Could not parse redirect title for {:?}: {:?}: {:#}",
-                page.name, redirect.name, e
-            ),
-            Ok(title) => {
-                if titles.get(&title).is_some() {
-                    return Some(title);
-                }
-            }
-        }
-    }
-
-    None
-}
-
 /// Wikidata QID/Q Number
 ///
 /// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
@@ -118,6 +74,23 @@ impl FromStr for WikidataQid {
     }
 }

+impl Display for WikidataQid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl WikidataQid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+
+        path
+    }
+}
+
 /// Normalized wikipedia article title that can compare:
 /// - titles `Spatial Database`
 /// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
@@ -132,6 +105,11 @@ impl FromStr for WikidataQid {
 ///
 /// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
 /// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
 /// ```
 #[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct WikipediaTitleNorm {
@@ -145,7 +123,7 @@ impl WikipediaTitleNorm {
         title.trim().replace(' ', "_")
     }

     // https://en.wikipedia.org/wiki/Article_Title
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
     pub fn from_url(url: &str) -> anyhow::Result<Self> {
         let url = Url::parse(url.trim())?;
@@ -159,21 +137,17 @@ impl WikipediaTitleNorm {
         }
         let lang = subdomain;

-        let mut paths = url
-            .path_segments()
-            .ok_or_else(|| anyhow!("Expected path"))?;
+        let path = url.path();

-        let root = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected first segment in path"))?;
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;

         if root != "wiki" {
-            bail!("Expected 'wiki' in path")
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
         }

-        let title = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected second segment in path"))?;
         let title = urlencoding::decode(title)?;

         Self::from_title(&title, lang)
@@ -202,4 +176,14 @@ impl WikipediaTitleNorm {
         let lang = lang.to_owned();
         Ok(Self { name, lang })
     }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+
+        path
+    }
 }
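Together, the two `get_dir` helpers produce exactly the hierarchy from the commit message. A quick sketch of the paths they build (the `descriptions/` base directory is hypothetical):

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::{WikidataQid, WikipediaTitleNorm};

fn main() -> anyhow::Result<()> {
    let base = PathBuf::from("descriptions");

    // QIDs map into the shared wikidata/ directory.
    let qid = WikidataQid::from_str("Q59320")?;
    assert_eq!(
        qid.get_dir(base.clone()),
        PathBuf::from("descriptions/wikidata/Q59320")
    );

    // Titles map into a per-language directory; percent-encoded spaces
    // normalize to the underscore form.
    let title = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article%20Title")?;
    assert_eq!(
        title.get_dir(base),
        PathBuf::from("descriptions/en.wikipedia.org/wiki/Article_Title")
    );

    Ok(())
}
```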
src/wm/page.rs (29 changed lines)

@@ -1,5 +1,9 @@
+use std::{iter, str::FromStr};
+
 use serde::Deserialize;

+use super::{WikidataQid, WikipediaTitleNorm};
+
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
@@ -20,6 +24,31 @@ pub struct Page {
     pub redirects: Vec<Redirect>,
 }

+impl Page {
+    pub fn wikidata(&self) -> Option<WikidataQid> {
+        // TODO: return error
+        self.main_entity
+            .as_ref()
+            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+    }
+
+    /// Title of the article
+    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
+        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    }
+
+    /// All titles that lead to the article, the main title followed by any redirects.
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        iter::once(self.title()).chain(self.redirects())
+    }
+
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        self.redirects
+            .iter()
+            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+    }
+}
+
 #[derive(Deserialize)]
 pub struct Wikidata {
     pub identifier: String,