From 481ace45ce29262f1b257d10b45e4ea7ef082f8c Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Tue, 26 Sep 2023 11:45:07 -0400
Subject: [PATCH] Add Download script (#22)

- Downloads latest enterprise dumps in requested languages
- Uses parallel downloading with wget2 if available
- Dumps are stored in subdirectories by date

Signed-off-by: Evan Lloyd New-Schmidt
---
 README.md   |  36 ++++++++++
 download.sh | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 run.sh      |   2 +-
 3 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100755 download.sh

diff --git a/README.md b/README.md
index 7f2bd0d..666597d 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,42 @@ OpenStreetMap commonly stores these as [`wikipedia*=`](https://wiki.openstreetma
 [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
 It defines article sections that are not important for users and should be removed from the extracted HTML.
 
+## Downloading Dumps
+
+[Enterprise HTML dumps, updated twice a month, are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). Please note that each language's dump is tens of gigabytes in size.
+
+Wikimedia requests no more than 2 concurrent downloads, which the included [`download.sh`](./download.sh) script respects:
+> If you are reading this on Wikimedia servers, please note that we have rate limited downloaders and we are capping the number of per-ip connections to 2.
+> This will help to ensure that everyone can access the files with reasonable download times.
+> Clients that try to evade these limits may be blocked.
+> Our mirror sites do not have this cap.
+
+See [the list of available mirrors](https://dumps.wikimedia.org/mirrors.html) for other options. Note that most of them do not include the enterprise dumps; check that the `other/enterprise_html/runs/` path includes subdirectories with files. The following two mirrors are known to include the Enterprise HTML dumps as of August 2023:
+- (US) https://dumps.wikimedia.your.org
+- (Sweden) https://mirror.accum.se/mirror/wikimedia.org
+
+For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files.
+
+They are gzipped tar files containing a single file of newline-delimited JSON matching the [Wikimedia Enterprise API schema](https://enterprise.wikimedia.com/docs/data-dictionary/).
+
+The included [`download.sh`](./download.sh) script handles downloading the latest set of dumps in specific languages.
+It maintains a directory with the following layout:
+```
+<DUMP_DIR>/
+├── latest -> 20230701/
+├── 20230701/
+│   ├── dewiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
+│   ├── enwiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
+│   ├── eswiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
+│   ...
+├── 20230620/
+│   ├── dewiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
+│   ├── enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
+│   ├── eswiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
+│   ...
+...
+```
+
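+For example, to fetch the English and German dumps into a new directory (the `~/wikipedia-dumps` path and the language codes here are illustrative, not defaults):
+```
+mkdir -p ~/wikipedia-dumps
+LANGUAGES="en de" ./download.sh ~/wikipedia-dumps
+```
+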
 ## Usage
 
 To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
diff --git a/download.sh b/download.sh
new file mode 100755
index 0000000..70876e0
--- /dev/null
+++ b/download.sh
@@ -0,0 +1,191 @@
+#! /usr/bin/env bash
+USAGE="Usage: ./download.sh [-hD] [-c <NUM>] <DUMP_DIR>
+
+Download the latest Wikipedia Enterprise HTML dumps.
+
+Arguments:
+  <DUMP_DIR>  An existing directory to store dumps in. Dumps will be grouped
+              into subdirectories by date, and a link 'latest' will point to
+              the latest complete dump subdirectory, if it exists.
+
+Options:
+  -h          Print this help screen.
+  -D          Delete all old dump subdirectories if the latest is downloaded.
+  -c <NUM>    Number of concurrent downloads to allow. Ignored if wget2 is
+              not present or MIRROR is not set. Defaults to 2.
+
+Environment Variables:
+  LANGUAGES   A whitespace-separated list of wikipedia language codes to
+              download dumps of.
+              Defaults to the languages in 'article_processing_config.json'.
+              See <https://meta.wikimedia.org/wiki/List_of_Wikipedias>.
+  MIRROR      A wikimedia dump mirror to use instead of the main wikimedia
+              server. See <https://dumps.wikimedia.org/mirrors.html> for a
+              list of available mirrors; note that many do not include the
+              required Enterprise HTML dumps.
+              For example: MIRROR=https://mirror.accum.se/mirror/wikimedia.org
+
+Exit codes:
+  0   The latest dumps are already present or were downloaded successfully.
+  1   Argument error.
+  16  Some of the languages were not available to download. The latest dump
+      may be in progress, some of the specified languages may not exist, or
+      the chosen mirror may not host the files.
+  _   Subprocess error.
+"
+
+set -euo pipefail
+# set -x
+
+build_user_agent() {
+  # While the dump websites are not part of the API, it's still polite to identify yourself.
+  # See https://meta.wikimedia.org/wiki/User-Agent_policy
+  subcommand=$1
+  name="OrganicMapsWikiparserDownloaderBot"
+  version="1.0"
+  url="https://github.com/organicmaps/wikiparser"
+  email="hello@organicmaps.app"
+  echo -n "$name/$version ($url; $email) $subcommand"
+}
+
+# Parse options.
+DELETE_OLD_DUMPS=false
+CONCURRENT_DOWNLOADS=
+while getopts "hDc:" opt
+do
+  case $opt in
+    h) echo -n "$USAGE"; exit 0;;
+    D) DELETE_OLD_DUMPS=true;;
+    c) CONCURRENT_DOWNLOADS=$OPTARG;;
+    ?) echo "$USAGE" | head -n1 >&2; exit 1;;
+  esac
+done
+shift $((OPTIND - 1))
+
+if [ -z "${1:-}" ]; then
+  echo "DUMP_DIR is required" >&2
+  echo -n "$USAGE" >&2
+  exit 1
+fi
+
+# The parent directory to store groups of dumps in.
+DUMP_DIR=$(readlink -f "$1")
+shift
+
+if [ -n "${1:-}" ]; then
+  echo "Unexpected extra argument: '$1'" >&2
+  echo "$USAGE" | head -n1 >&2
+  exit 1
+fi
+
+if [ ! -d "$DUMP_DIR" ]; then
+  echo "DUMP_DIR '$DUMP_DIR' does not exist" >&2
+  exit 1
+fi
+
+if [ -n "$CONCURRENT_DOWNLOADS" ]; then
+  if [ ! "$CONCURRENT_DOWNLOADS" -ge 1 ]; then
+    echo "Number of concurrent downloads (-c) must be >= 1" >&2
+    echo "$USAGE" | head -n1 >&2
+    exit 1
+  fi
+  if [ -z "${MIRROR:-}" ]; then
+    # NOTE: Wikimedia requests no more than 2 concurrent downloads.
+    # See https://dumps.wikimedia.org/ for more info.
+    echo "WARN: MIRROR is not set; ignoring -c" >&2
+    CONCURRENT_DOWNLOADS=
+  fi
+fi
+
+# Ensure we're running in the directory of this script.
+SCRIPT_PATH=$(dirname "$0")
+cd "$SCRIPT_PATH"
+SCRIPT_PATH=$(pwd)
+
+# Only load library after changing to script directory.
+source lib.sh
+
+
+if [ -n "${MIRROR:-}" ]; then
+  log "Using mirror '$MIRROR'"
+  BASE_URL=$MIRROR
+else
+  BASE_URL="https://dumps.wikimedia.org"
+fi
+
+if [ -z "${LANGUAGES:-}" ]; then
+  # Load languages from config.
+  LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json)
+fi
+# shellcheck disable=SC2086 # LANGUAGES is intentionally expanded.
+log "Selected languages:" $LANGUAGES
+
+log "Fetching run index"
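+# A sketch of the format this scrape assumes: the run index is a plain HTML
+# directory listing with entries like `<a href="20230701/">20230701/</a>`,
+# so extracting href values, keeping 8-digit names, and reverse-sorting
+# leaves the most recent run directory first.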
+# The date of the latest dump, YYYYMMDD.
+LATEST_DUMP=$(wget "$BASE_URL/other/enterprise_html/runs/" --no-verbose -O - \
+  | grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1)
+LATEST_DUMP="${LATEST_DUMP%/}"
+
+log "Checking latest dump $LATEST_DUMP"
+
+URLS=
+MISSING_DUMPS=0
+for lang in $LANGUAGES; do
+  url="$BASE_URL/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz"
+  if ! wget --no-verbose --method=HEAD "$url"; then
+    MISSING_DUMPS=$(( MISSING_DUMPS + 1 ))
+    log "Dump for '$lang' does not exist at '$url'"
+    continue
+  fi
+  URLS="$URLS $url"
+done
+
+if [ -z "$URLS" ]; then
+  log "No dumps available"
+  exit 16
+fi
+
+# The subdir to store the latest dump in.
+DOWNLOAD_DIR="$DUMP_DIR/$LATEST_DUMP"
+if [ ! -e "$DOWNLOAD_DIR" ]; then
+  mkdir "$DOWNLOAD_DIR"
+fi
+
+log "Downloading available dumps"
+if type wget2 > /dev/null; then
+  # shellcheck disable=SC2086 # URLS should be expanded on spaces.
+  wget2 --verbose --progress=bar --continue \
+    --user-agent "$(build_user_agent wget2)" \
+    --max-threads "${CONCURRENT_DOWNLOADS:-2}" \
+    --directory-prefix "$DOWNLOAD_DIR" \
+    $URLS
+else
+  log "WARN: wget2 is not available, falling back to sequential downloads"
+  # shellcheck disable=SC2086 # URLS should be expanded on spaces.
+  wget --continue \
+    --user-agent "$(build_user_agent wget)" \
+    --directory-prefix "$DOWNLOAD_DIR" \
+    $URLS
+fi
+
+if [ $MISSING_DUMPS -gt 0 ]; then
+  log "$MISSING_DUMPS dumps not available yet"
+  exit 16
+fi
+
+log "Linking 'latest' to '$LATEST_DUMP'"
+LATEST_LINK="$DUMP_DIR/latest"
+ln -sf -T "$LATEST_DUMP" "$LATEST_LINK"
+
+if [ "$DELETE_OLD_DUMPS" = true ]; then
+  # shellcheck disable=SC2010 # Only matching files with numeric names are used.
+  mapfile -t OLD_DUMPS < <(ls "$DUMP_DIR" | grep -P '^\d{8}$' | grep -vF "$LATEST_DUMP")
+  if [ "${#OLD_DUMPS[@]}" -gt 0 ]; then
+    log "Deleting old dumps" "${OLD_DUMPS[@]}"
+    for old_dump in "${OLD_DUMPS[@]}"; do
+      rm -r "${DUMP_DIR:?}/${old_dump:?}/"
+    done
+  else
+    log "No old dumps to delete"
+  fi
+fi
diff --git a/run.sh b/run.sh
index dc29e9c..3345119 100755
--- a/run.sh
+++ b/run.sh
@@ -36,7 +36,7 @@ set -euo pipefail
 while getopts "h" opt
 do
   case $opt in
-    h) echo -n "$USAGE" >&2; exit 0;;
+    h) echo -n "$USAGE"; exit 0;;
     ?) echo "$USAGE" | head -n1 >&2; exit 1;;
   esac
 done