diff --git a/.github/workflows/rust-checks.yml b/.github/workflows/rust-checks.yml index a7ac273..279d944 100644 --- a/.github/workflows/rust-checks.yml +++ b/.github/workflows/rust-checks.yml @@ -3,9 +3,9 @@ name: Rust Checks on: pull_request: paths-ignore: - - .gitignore - - LICENSE - - README.md + - .gitignore + - LICENSE + - README.md jobs: test: diff --git a/.github/workflows/shell-checks.yml b/.github/workflows/shell-checks.yml new file mode 100644 index 0000000..b09a46b --- /dev/null +++ b/.github/workflows/shell-checks.yml @@ -0,0 +1,18 @@ +name: Shell Checks + +on: + pull_request: + paths: + - "**.sh" + +jobs: + test: + name: shellcheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Run shellcheck + run: | + shellcheck --version + shellcheck -x *.sh diff --git a/README.md b/README.md index d1a94ad..60ecb21 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._ +Extracted articles are identified by Wikipedia article titles in url or text form (language-specific), and [Wikidata QIDs](https://www.wikidata.org/wiki/Wikidata:Glossary#QID) (language-agnostic). +OpenStreetMap commonly stores these as [`wikipedia*=`](https://wiki.openstreetmap.org/wiki/Key:wikipedia) and [`wikidata=`](https://wiki.openstreetmap.org/wiki/Key:wikidata) tags on objects. + ## Configuring [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language. @@ -9,6 +12,11 @@ It defines article sections that are not important for users and should be remov ## Usage +To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation. 
# Shared functions for scripts
# shellcheck shell=bash

# Write a message to stderr, prefixed with a UTC timestamp and followed by a
# line ending.
#
# Arguments: the words of the message. They are joined with single spaces and
#            backslash escapes in them (e.g. `\n`) are expanded by `echo -e`.
# Outputs:   one line on stderr; nothing on stdout.
log() {
    local timestamp
    # The format ends in a literal `Z` (the ISO 8601 UTC designator), so the
    # time has to be taken in UTC (`-u`) rather than in the local timezone.
    timestamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
    echo -e "$timestamp" "$@" >&2
}
#!/usr/bin/env bash
# NOTE(review): the <BUILD_DIR>/<DUMP_FILE> placeholders and the two URLs in
# the messages below were missing from the reviewed copy; they are restored
# from this script's own error messages and the project README - confirm.
# shellcheck disable=SC2016 # Backticks not used as expansions in documentation.
USAGE='Usage: ./run.sh [-h] <BUILD_DIR> <DUMP_FILE> [<DUMP_FILE>...]

A convenience script to run the wikiparser with the maps generator as a drop-in replacement for the descriptions scraper.

Arguments:
    <BUILD_DIR>   An existing directory to place descriptions in.
                  The `id_to_wikidata.csv` and `wiki_urls.txt` files output by the
                  maps generator must be placed in this directory before running.
                  The extracted articles will be placed in a `descriptions`
                  subdirectory within this directory.
                  The `intermediate_data` subfolder of a maps build directory may
                  be used for this. The same folder may be used for multiple runs.
    <DUMP_FILE>   A wikipedia enterprise html dump. These take the form of
                  `enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz`. Multiple
                  dumps in the same language SHOULD NOT be provided, and will
                  result in inconsistent data.

Options:
    -h      Print this help screen

1. Builds wikiparser.
2. Extracts wikidata ids and wikipedia urls from generator intermediate files `id_to_wikidata.csv` and `wiki_urls.txt`.
3. Runs wikiparser in parallel for all input dump files (NOTE: this currently starts 2 processes for each dump file).

For information on running the wikiparser manually, see README.md.

For more information on the map generator, see
<https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md>.
'

set -euo pipefail
# set -x

# Parse options.
while getopts "h" opt; do
    case "$opt" in
        h)
            echo -n "$USAGE" >&2
            exit 0
            ;;
        # getopts reports an unknown option as a literal `?`. The pattern is
        # escaped because a bare `?` is a glob that matches any character.
        \?)
            echo "$USAGE" | head -n1 >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))

if [ -z "${2-}" ]; then
    echo "BUILD_DIR and at least one DUMP_FILE are required" >&2
    echo -n "$USAGE" >&2
    exit 1
fi

# Print the canonical form of path $1. If it cannot be resolved, print it
# unchanged so that the caller's existence check reports the problem instead
# of `set -e` ending the script without any message.
canonicalize() {
    readlink -f -- "$1" || printf '%s\n' "$1"
}

# Process and canonicalize all path arguments before changing directories.

BUILD_DIR=$(canonicalize "$1")
shift
if [ ! -d "$BUILD_DIR" ]; then
    echo "BUILD_DIR '$BUILD_DIR' does not exist or is not a directory" >&2
    exit 1
fi

DUMP_FILES=()
while (( $# > 0 )); do
    dump_file=$(canonicalize "$1")
    if [ ! -f "$dump_file" ]; then
        echo "DUMP_FILE '$dump_file' does not exist or is not a file" >&2
        exit 1
    fi
    DUMP_FILES+=("$dump_file")
    shift
done

# Ensure we're running in the directory of this script.
SCRIPT_PATH=$(dirname -- "$0")
cd -- "$SCRIPT_PATH"
SCRIPT_PATH=$(pwd)

# only load library after changing to script directory
source lib.sh

log "Using maps build directory '$BUILD_DIR'"

if ! command -v "cargo" > /dev/null; then
    echo -e "'cargo' is not installed, cannot build wikiparser.\nSee <https://www.rust-lang.org/>." >&2
    exit 1
fi

log "Building wikiparser"
cargo build --release
wikiparser=$SCRIPT_PATH/target/release/om-wikiparser

log "Changing to maps build dir '$BUILD_DIR'"
cd -- "$BUILD_DIR"

log "Transforming intermediate generator data"
for intermediate_file in id_to_wikidata.csv wiki_urls.txt; do
    if [ ! -e "$intermediate_file" ]; then
        echo -e "Cannot find intermediate generator file '$intermediate_file' in maps build dir '$BUILD_DIR/'\nWas the descriptions step run?" >&2
        exit 1
    fi
done

cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt

# Enable backtraces in errors and panics.
export RUST_BACKTRACE=1
# Set log level.
export RUST_LOG=om_wikiparser=info

# Begin extraction.
OUTPUT_DIR=$(pwd)/descriptions
# -p: succeed when the directory is left over from an earlier run.
mkdir -p -- "$OUTPUT_DIR"
log "Extracting articles to '$OUTPUT_DIR'"

# Stop every background job that is still running and wait for them to end.
# Installed as the EXIT trap, so it runs on every way out of the script.
kill_jobs() {
    local running
    running=$(jobs -p)
    if [ -n "$running" ]; then
        log "Killing background jobs"
        # A job may already have exited on its own; that must not abort the
        # cleanup under `set -e`, hence `|| true`.
        # shellcheck disable=SC2086 # PIDs are intentionally expanded.
        kill $running || true
        log "Waiting for background jobs to stop"
        wait
    fi
}

# Wait for each PID given as an argument.
# Returns 0 only if every one of them exited successfully. A bare `wait`
# cannot be used for this because it returns 0 whatever the jobs returned.
wait_for_jobs() {
    local pid status=0
    for pid in "$@"; do
        wait "$pid" || status=1
    done
    return "$status"
}

# Cleanup lives in the EXIT trap. The signal traps only have to end the
# script, so an interrupt also stops it when it arrives outside of `wait`.
trap 'kill_jobs' EXIT
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM

# NOTE(review): every parser started in this loop is given the same
# new_qids.txt; presumably om-wikiparser appends to it safely - confirm.
extract_pids=()
for dump in "${DUMP_FILES[@]}"; do
    log "Extracting '$dump'"
    tar xzOf "$dump" | "$wikiparser" \
        --wikidata-ids wikidata_ids.txt \
        --wikipedia-urls wikipedia_urls.txt \
        --write-new-ids new_qids.txt \
        "$OUTPUT_DIR" &
    extract_pids+=("$!")
done

if ! wait_for_jobs "${extract_pids[@]}"; then
    log "Extraction failed for at least one dump"
    exit 1
fi

log "Beginning extraction of discovered QIDs"

# Extract new qids from other dumps in parallel.
extract_pids=()
for dump in "${DUMP_FILES[@]}"; do
    tar xzOf "$dump" | "$wikiparser" \
        --wikidata-ids new_qids.txt \
        "$OUTPUT_DIR" &
    extract_pids+=("$!")
done

if ! wait_for_jobs "${extract_pids[@]}"; then
    log "Extraction of discovered QIDs failed for at least one dump"
    exit 1
fi

log "Finished"