diff --git a/.github/workflows/rust-checks.yml b/.github/workflows/rust-checks.yml index a7ac273..279d944 100644 --- a/.github/workflows/rust-checks.yml +++ b/.github/workflows/rust-checks.yml @@ -3,9 +3,9 @@ name: Rust Checks on: pull_request: paths-ignore: - - .gitignore - - LICENSE - - README.md + - .gitignore + - LICENSE + - README.md jobs: test: diff --git a/.github/workflows/shell-checks.yml b/.github/workflows/shell-checks.yml new file mode 100644 index 0000000..b09a46b --- /dev/null +++ b/.github/workflows/shell-checks.yml @@ -0,0 +1,18 @@ +name: Shell Checks + +on: + pull_request: + paths: + - "**.sh" + +jobs: + test: + name: shellcheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Run shellcheck + run: | + shellcheck --version + shellcheck -x *.sh diff --git a/README.md b/README.md index d1a94ad..60ecb21 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ +Extracted articles are identified by Wikipedia article titles in url or text form (language-specific), and [Wikidata QIDs](https://www.wikidata.org/wiki/Wikidata:Glossary#QID) (language-agnostic). +OpenStreetMap commonly stores these as [`wikipedia*=`](https://wiki.openstreetmap.org/wiki/Key:wikipedia) and [`wikidata=`](https://wiki.openstreetmap.org/wiki/Key:wikidata) tags on objects. + ## Configuring [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language. @@ -9,6 +12,11 @@ It defines article sections that are not important for users and should be remov ## Usage +To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation. +It handles preparing the inputs, using multiple dumps, and re-running to convert titles to QIDs and extract them across languages. + +To run the wikiparser manually or for development, see below. + First, install [the rust language tools](https://www.rust-lang.org/) For best performance, use `--release` when building or running. @@ -19,7 +27,7 @@ Alternatively, build it with `cargo build --release`, which places the binary in Run the program with the `--help` flag to see all supported arguments. -```shell +``` $ cargo run --release -- --help Extract article HTML from Wikipedia Enterprise HTML dumps. @@ -57,10 +65,11 @@ It takes as inputs: - A file of Wikipedia article titles to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as a CLI flag `--wikipedia-urls`. - A directory to write the extracted articles to, as a CLI argument. -As an example of usage with the map generator: +As an example of manual usage with the map generator: - Assuming this program is installed to `$PATH` as `om-wikiparser`. - Download [the dumps in the desired languages](https://dumps.wikimedia.org/other/enterprise_html/runs/) (Use the files with the format `${LANG}wiki-NS0-${DATE}-ENTERPRISE-HTML.json.tar.gz`). Set `DUMP_DOWNLOAD_DIR` to the location they are downloaded. +- Run a maps build with descriptions enabled to generate the `id_to_wikidata.csv` and `wiki_urls.txt` files. 
 - Run the following from within the `intermediate_data` subdirectory of the maps build directory:
 
 ```shell
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..7777e48
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,32 @@
+use std::process::Command;
+
+/// Pass git-describe through CARGO_GIT_VERSION env variable
+///
+/// NOTE: Cargo.toml still needs to be updated on releases
+fn set_version_from_git() {
+    let cmd = Command::new("git")
+        .arg("describe")
+        .arg("--always")
+        .arg("--dirty")
+        .arg("--tags")
+        .output();
+
+    match cmd {
+        Ok(output) if output.status.success() => {
+            let version = String::from_utf8_lossy(&output.stdout);
+            let version = version.trim();
+            println!("cargo:rustc-env=CARGO_GIT_VERSION={}", version);
+            // rerun when git checks out another ref or any ref changes
+            println!("cargo:rerun-if-changed=.git/refs/");
+            println!("cargo:rerun-if-changed=.git/HEAD");
+        }
+        _ => {
+            // crates.io builds without git, so ignore here
+            eprintln!("git describe failed; ignoring");
+        }
+    }
+}
+
+fn main() {
+    set_version_from_git();
+}
diff --git a/lib.sh b/lib.sh
new file mode 100644
index 0000000..1c8746c
--- /dev/null
+++ b/lib.sh
@@ -0,0 +1,7 @@
+# Shared functions for scripts
+# shellcheck shell=bash
+
+# Write message to stderr with a timestamp and line ending.
+log () {
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%SZ')" "$@" >&2
+}
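The `log` helper in `lib.sh` above prefixes a timestamp and writes to stderr. A minimal usage sketch (the message text and timestamp are illustrative only):

```shell
# Sketch: source the shared helpers, then report progress on stderr.
source lib.sh
log "Building wikiparser"  # emits e.g. "2023-07-01T12:00:00Z Building wikiparser" on stderr
```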
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..4767b77
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,152 @@
+#! /usr/bin/env bash
+# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
+USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE> [<DUMP_FILE> ...]
+
+A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.
+
+Arguments:
+    <BUILD_DIR>   An existing directory to place descriptions in.
+                  The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
+                  maps generator must be placed in this directory before running.
+                  The extracted articles will be placed in a `descriptions`
+                  subdirectory within this directory.
+                  The `intermediate_data` subfolder of a maps build directory may
+                  be used for this. The same folder may be used for multiple runs.
+    <DUMP_FILE>   A wikipedia enterprise html dump. These take the form of
+                  `enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
+                  dumps in the same language SHOULD NOT be provided, and will
+                  result in inconsistent data.
+
+Options:
+    -h      Print this help screen
+
+1. Builds wikiparser.
+2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
+3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump file).
+
+For information on running the wikiparser manually, see README.md.
+
+For more information on the map generator, see
+<https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md>.
+'
+
+set -euo pipefail
+# set -x
+
+# Parse options.
+while getopts "h" opt
+do
+    case $opt in
+    h)  echo -n "$USAGE" >&2; exit 0;;
+    ?)  echo "$USAGE" | head -n1 >&2; exit 1;;
+    esac
+done
+shift $((OPTIND - 1))
+
+if [ -z "${2-}" ]; then
+    echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
+    echo -n "$USAGE" >&2
+    exit 1
+fi
+
+# Process and canonicalize all path arguments before changing directories.
+
+BUILD_DIR=$(readlink -f -- "$1")
+shift
+if [ ! -d "$BUILD_DIR" ]; then
+    echo "BUILD_DIR '$BUILD_DIR' does not exist or is not a directory" >&2
+    exit 1
+fi
+
+DUMP_FILES=()
+while (( $# > 0 )); do
+    dump_file="$(readlink -f -- "$1")"
+    if [ ! -f "$dump_file" ]; then
+        echo "DUMP_FILE '$dump_file' does not exist or is not a file" >&2
+        exit 1
+    fi
+    DUMP_FILES+=("$dump_file")
+    shift
+done
+
+# Ensure we're running in the directory of this script.
+SCRIPT_PATH=$(dirname "$0")
+cd "$SCRIPT_PATH"
+SCRIPT_PATH=$(pwd)
+
+# only load library after changing to script directory
+source lib.sh
+
+log "Using maps build directory '$BUILD_DIR'"
+
+if ! command -v "cargo" > /dev/null; then
+    echo -e "'cargo' is not installed, cannot build wikiparser.\nSee <https://www.rust-lang.org/>." >&2
+    exit 1
+fi
+
+log "Building wikiparser"
+cargo build --release
+wikiparser=$(pwd)/target/release/om-wikiparser
+
+log "Changing to maps build dir '$BUILD_DIR'"
+cd "$BUILD_DIR"
+
+log "Transforming intermediate generator data"
+for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
+    if [ ! -e "$intermediate_file" ]; then
+        echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
+        exit 1
+    fi
+done
+
+cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
+tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set log level.
+export RUST_LOG=om_wikiparser=info
+
+# Begin extraction.
+OUTPUT_DIR=$(pwd)/descriptions
+if [ ! -e "$OUTPUT_DIR" ]; then
+    mkdir "$OUTPUT_DIR"
+fi
+log "Extracting articles to '$OUTPUT_DIR'"
+
+kill_jobs() {
+    pids=$(jobs -p)
+    if [ -n "$pids" ]; then
+        log "Killing background jobs"
+        # shellcheck disable=SC2086 # PIDs are intentionally expanded.
+        kill $pids
+        log "Waiting for background jobs to stop"
+        wait
+    fi
+}
+
+trap 'kill_jobs' SIGINT SIGTERM EXIT
+
+for dump in "${DUMP_FILES[@]}"; do
+    log "Extracting '$dump'"
+    tar xzOf "$dump" | "$wikiparser" \
+        --wikidata-ids wikidata_ids.txt \
+        --wikipedia-urls wikipedia_urls.txt \
+        --write-new-ids new_qids.txt \
+        "$OUTPUT_DIR" &
+done
+
+wait
+
+log "Beginning extraction of discovered QIDs"
+
+# Extract new qids from other dumps in parallel.
+for dump in "${DUMP_FILES[@]}"; do
+    tar xzOf "$dump" | "$wikiparser" \
+        --wikidata-ids new_qids.txt \
+        "$OUTPUT_DIR" &
+done
+
+wait
+
+log "Finished"
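For reference, a hypothetical invocation of `run.sh` as documented in its usage text above (all paths are illustrative, not taken from this patch):

```shell
# Sketch: pass the maps build's intermediate_data directory plus one dump per language.
./run.sh ~/maps_build/intermediate_data \
    ~/dumps/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz \
    ~/dumps/dewiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz
```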
diff --git a/src/html.rs b/src/html.rs
index 24d1d4f..35f855b 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -22,6 +22,19 @@ static CONFIG: Lazy> = Lazy::new(|| {
 static HEADERS: Lazy<Selector> =
     Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
 
+/// Elements that should always be kept, regardless of other metrics.
+static ELEMENT_ALLOW_LIST: Lazy<Selector> = Lazy::new(|| {
+    Selector::parse(
+        &[
+            // Meta tags that affect rendering.
+            "head > meta[charset]",
+            "head > meta[http-equiv]",
+        ]
+        .join(", "),
+    )
+    .unwrap()
+});
+
 pub fn simplify(html: &str, lang: &str) -> String {
     let mut document = Html::parse_document(html);
 
@@ -53,8 +66,6 @@ pub fn simplify(html: &str, lang: &str) -> String {
         }
 
         remove_ids(&mut document, to_remove.drain(..));
-    } else {
-        warn!("No sections to remove configured for lang {lang:?}");
     }
 
     for el in document
@@ -62,7 +73,7 @@
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if is_image(&el) || is_empty_or_whitespace(&el) {
+        if (is_image(&el) || is_empty_or_whitespace(&el)) && !ELEMENT_ALLOW_LIST.matches(&el) {
             to_remove.push(el.id());
         }
     }
diff --git a/src/main.rs b/src/main.rs
index 637e9b5..f95d656 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,11 +15,24 @@ use om_wikiparser::{
     wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
 
+/// Get the version returned by `git describe`, e.g.:
+/// - `v2.0` if a git tag
+/// - the commit hash `034ac04` if not a tag
+/// - `034ac04-dirty` if uncommitted changes are present,
+/// or the crate version if not available (if installed from crates.io).
+///
+/// See `build.rs` file for more info.
+fn version() -> &'static str {
+    option_env!("CARGO_GIT_VERSION")
+        .or(option_env!("CARGO_PKG_VERSION"))
+        .unwrap_or("unknown")
+}
+
 /// Extract article HTML from Wikipedia Enterprise HTML dumps.
 ///
 /// Expects an uncompressed dump connected to stdin.
 #[derive(Parser)]
-#[command(version)]
+#[command(version = crate::version())]
 struct Args {
     /// Directory to write the extracted articles to.
     output_dir: PathBuf,
@@ -38,6 +51,7 @@ struct Args {
     ///
     /// Use this to save the QIDs of articles you know the url of, but not the QID.
     /// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
+    /// Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
     #[arg(long, requires("wikipedia_urls"))]
     write_new_ids: Option<PathBuf>,
 }
@@ -173,6 +187,8 @@ fn main() -> anyhow::Result<()> {
             .exit()
     }
 
+    info!("{} {}", Args::command().get_name(), version());
+
     let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
         info!("Loading article urls from {path:?}");
         let urls = parse_wikipedia_file(path)?;
@@ -191,6 +207,15 @@
         Default::default()
     };
 
+    // NOTE: For atomic writes to the same file across threads/processes:
+    // - The file needs to be opened in APPEND mode (`.append(true)`).
+    // - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
+    // - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
+    //
+    // For more information, see:
+    // - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
+    // - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
+    // - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
     let mut write_new_ids = args
         .write_new_ids
         .as_ref()
@@ -240,10 +265,14 @@
             continue;
         }
 
+        // Write matched new QIDs back to file.
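The `main.rs` changes below wire the `build.rs` git-describe string into clap's version flag; a hypothetical way to check the result (the printed version string is illustrative only):

```shell
# Sketch: the binary should now report the git-describe output instead of only the crate version.
$ cargo run --release -- --version
om-wikiparser v0.1.0-3-g034ac04-dirty
```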
         if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
             if !is_wikidata_match && !matching_titles.is_empty() {
                 debug!("Writing new id {} for article {:?}", qid, page.name);
-                writeln!(f, "{}", qid).with_context(|| {
+                // NOTE: Write to string buffer first to have a single atomic write syscall.
+                // See `write_new_ids` for more info.
+                let line = format!("{}\n", qid);
+                write!(f, "{}", line).with_context(|| {
                     format!(
                         "writing new id to file {:?}",
                         args.write_new_ids.as_ref().unwrap()
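The atomic-append notes above are what make the concurrent `--write-new-ids` usage in `run.sh` safe; a hypothetical manual equivalent (dump names and paths are illustrative only):

```shell
# Sketch: two dumps processed in parallel, both appending discovered QIDs to the same file.
# Each QID is written with a single append-mode write under PIPE_BUF, so lines do not interleave.
tar xzOf enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | om-wikiparser \
    --wikipedia-urls wikipedia_urls.txt \
    --write-new-ids new_qids.txt \
    descriptions/ &
tar xzOf dewiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | om-wikiparser \
    --wikipedia-urls wikipedia_urls.txt \
    --write-new-ids new_qids.txt \
    descriptions/ &
wait
```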