#! /usr/bin/env bash
USAGE="Usage: ./download.sh [-hD] [-c <NUM>] <DUMP_DIR>

Download the latest Wikipedia Enterprise HTML dumps.

Arguments:
  <DUMP_DIR>  An existing directory to store dumps in. Dumps will be grouped
              into subdirectories by date, and a link 'latest' will point to
              the latest complete dump subdirectory, if it exists.

Options:
  -h          Print this help screen.
  -D          Delete all old dump subdirectories if the latest is downloaded.
  -c <NUM>    Number of concurrent downloads to allow. Ignored if wget2 is
              not present or MIRROR is not set. Defaults to 2.

Environment Variables:
  LANGUAGES   A whitespace-separated list of wikipedia language codes to
              download dumps of. Defaults to the languages in
              'article_processing_config.json'.
              See <https://meta.wikimedia.org/wiki/List_of_Wikipedias>.
  MIRROR      A wikimedia dump mirror to use instead of the main wikimedia
              server. See <https://dumps.wikimedia.org/mirrors.html> for a
              list of available mirrors; note that many do not include the
              required Enterprise HTML dumps.
              For example: MIRROR=https://mirror.accum.se/mirror/wikimedia.org

Exit codes:
  0   The latest dumps are already present or were downloaded successfully.
  1   Argument error.
  16  Some of the languages were not available to download. The latest dump
      may be in progress, some of the specified languages may not exist, or
      the chosen mirror may not host the files.
  _   Subprocess error.
"

set -euo pipefail
# set -x

build_user_agent() {
    # While the dump websites are not part of the API, it's still polite to
    # identify yourself.
    # See https://meta.wikimedia.org/wiki/User-Agent_policy
    subcommand=$1
    name="OrganicMapsWikiparserDownloaderBot"
    version="1.0"
    url="https://github.com/organicmaps/wikiparser"
    email="hello@organicmaps.app"
    echo -n "$name/$version ($url; $email) $subcommand"
}

# Parse options.
DELETE_OLD_DUMPS=false
CONCURRENT_DOWNLOADS=
while getopts "hDc:" opt
do
    case $opt in
        h)  echo -n "$USAGE"; exit 0;;
        D)  DELETE_OLD_DUMPS=true;;
        c)  CONCURRENT_DOWNLOADS=$OPTARG;;
        ?)  echo "$USAGE" | head -n1 >&2; exit 1;;
    esac
done
shift $((OPTIND - 1))

if [ -z "${1:-}" ]; then
    echo "DUMP_DIR is required" >&2
    echo -n "$USAGE" >&2
    exit 1
fi
# The parent directory to store groups of dumps in.
DUMP_DIR=$(readlink -f "$1")
shift

if [ -n "${1:-}" ]; then
    echo "Unexpected extra argument: '$1'" >&2
    echo "$USAGE" | head -n1 >&2
    exit 1
fi

if [ ! -d "$DUMP_DIR" ]; then
    echo "DUMP_DIR does not exist: '$DUMP_DIR'" >&2
    exit 1
fi

if [ -n "$CONCURRENT_DOWNLOADS" ]; then
    if [ ! "$CONCURRENT_DOWNLOADS" -ge 1 ]; then
        echo "Number of concurrent downloads (-c) must be >= 1" >&2
        echo "$USAGE" | head -n1 >&2
        exit 1
    fi
    if [ -z "${MIRROR:-}" ]; then
        # NOTE: Wikipedia requests no more than 2 concurrent downloads.
        # See https://dumps.wikimedia.org/ for more info.
        echo "WARN: MIRROR is not set; ignoring -c" >&2
        CONCURRENT_DOWNLOADS=
    fi
fi

# Ensure we're running in the directory of this script.
SCRIPT_PATH=$(dirname "$0")
cd "$SCRIPT_PATH"
SCRIPT_PATH=$(pwd)

# Only load library after changing to script directory.
source lib.sh

if [ -n "${MIRROR:-}" ]; then
    log "Using mirror '$MIRROR'"
    BASE_URL=$MIRROR
else
    BASE_URL="https://dumps.wikimedia.org"
fi

if [ -z "${LANGUAGES:-}" ]; then
    # Load languages from config.
    LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json)
fi
# shellcheck disable=SC2086 # LANGUAGES is intentionally expanded.
log "Selected languages:" $LANGUAGES

log "Fetching run index"
# The date of the latest dump, YYYYMMDD.
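# NOTE: this assumes the runs index is a plain HTML directory listing whose
# entries look like href="YYYYMMDD/"; extracting the href values and
# reverse-sorting the 8-digit names selects the most recent run.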
LATEST_DUMP=$(wget "$BASE_URL/other/enterprise_html/runs/" --no-verbose -O - \
    | grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1)
# Trim the trailing slash from the directory link.
LATEST_DUMP="${LATEST_DUMP%/}"

log "Checking latest dump $LATEST_DUMP"

URLS=
MISSING_DUMPS=0
for lang in $LANGUAGES; do
    url="$BASE_URL/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz"
    # A HEAD request checks that the dump exists without downloading it.
    if ! wget --no-verbose --method=HEAD "$url"; then
        MISSING_DUMPS=$(( MISSING_DUMPS + 1 ))
        log "Dump for '$lang' does not exist at '$url'"
        continue
    fi
    URLS="$URLS $url"
done

if [ -z "$URLS" ]; then
    log "No dumps available"
    exit 16
fi

# The subdir to store the latest dump in.
DOWNLOAD_DIR="$DUMP_DIR/$LATEST_DUMP"
if [ ! -e "$DOWNLOAD_DIR" ]; then
    mkdir "$DOWNLOAD_DIR"
fi

log "Downloading available dumps"

if type wget2 > /dev/null; then
    # shellcheck disable=SC2086 # URLS should be expanded on spaces.
    wget2 --verbose --progress=bar --continue \
        --user-agent "$(build_user_agent wget2)" \
        --max-threads "${CONCURRENT_DOWNLOADS:-2}" \
        --directory-prefix "$DOWNLOAD_DIR" \
        $URLS
else
    log "WARN: wget2 is not available, falling back to sequential downloads"
    # shellcheck disable=SC2086 # URLS should be expanded on spaces.
    wget --continue \
        --user-agent "$(build_user_agent wget)" \
        --directory-prefix "$DOWNLOAD_DIR" \
        $URLS
fi

if [ $MISSING_DUMPS -gt 0 ]; then
    log "$MISSING_DUMPS dumps not available yet"
    exit 16
fi

log "Linking 'latest' to '$LATEST_DUMP'"
LATEST_LINK="$DUMP_DIR/latest"
ln -sf -T "$LATEST_DUMP" "$LATEST_LINK"

if [ "$DELETE_OLD_DUMPS" = true ]; then
    # shellcheck disable=SC2010 # Only matching files with numeric names are used.
    mapfile -t OLD_DUMPS < <(ls "$DUMP_DIR" | grep -P '^\d{8}$' | grep -vF "$LATEST_DUMP")
    if [ "${#OLD_DUMPS[@]}" -gt 0 ]; then
        log "Deleting old dumps" "${OLD_DUMPS[@]}"
        for old_dump in "${OLD_DUMPS[@]}"; do
            rm -r "${DUMP_DIR:?}/${old_dump:?}/"
        done
    else
        log "No old dumps to delete"
    fi
fi
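# Example invocation (illustrative values only; 'en'/'de' are real language
# codes, the mirror is the one named in the usage text, './dumps' is any
# existing directory):
#   LANGUAGES="en de" MIRROR=https://mirror.accum.se/mirror/wikimedia.org \
#       ./download.sh -D -c 4 ./dumps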