- Warn on unexpected file extensions - Move filename to end of errors Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
191 lines
5.9 KiB
Bash
Executable file
191 lines
5.9 KiB
Bash
Executable file
#! /usr/bin/env bash
|
|
USAGE="Usage: ./download.sh [-hD] [-c <NUM>] <DUMP_DIR>
|
|
|
|
Download the latest Wikipedia Enterprise HTML dumps.
|
|
|
|
Arguments:
|
|
<DUMP_DIR> An existing directory to store dumps in. Dumps will be grouped
|
|
into subdirectories by date, and a link 'latest' will point to
|
|
the latest complete dump subdirectory, if it exists.
|
|
|
|
Options:
|
|
-h Print this help screen.
|
|
-D Delete all old dump subdirectories if the latest is downloaded.
|
|
-c <NUM> Number of concurrent downloads to allow. Ignored if wget2 is not
|
|
present or MIRROR is not set. Defaults to 2.
|
|
|
|
Environment Variables:
|
|
LANGUAGES A whitespace-separated list of wikipedia language codes to
|
|
download dumps of.
|
|
Defaults to the languages in 'article_processing_config.json'.
|
|
See <https://meta.wikimedia.org/wiki/List_of_Wikipedias>.
|
|
MIRROR A wikimedia dump mirror to use instead of the main wikimedia
|
|
server. See <https://dumps.wikimedia.org/mirrors.html> for a
|
|
list of available mirrors, note that many do not include the
|
|
required Enterprise HTML dumps.
|
|
For example: MIRROR=https://mirror.accum.se/mirror/wikimedia.org
|
|
|
|
Exit codes:
|
|
0 The latest dumps are already present or were downloaded successfully.
|
|
1 Argument error.
|
|
16 Some of languages were not available to download. The latest dump may
|
|
be in progress, some of the specified languages may not exist, or the
|
|
chosen mirror may not host the files.
|
|
_ Subprocess error.
|
|
"
|
|
|
|
set -euo pipefail
|
|
# set -x
|
|
|
|
build_user_agent() {
|
|
# While the dump websites are not part of the API, it's still polite to identify yourself.
|
|
# See https://meta.wikimedia.org/wiki/User-Agent_policy
|
|
subcommand=$1
|
|
name="OrganicMapsWikiparserDownloaderBot"
|
|
version="1.0"
|
|
url="https://github.com/organicmaps/wikiparser"
|
|
email="hello@organicmaps.app"
|
|
echo -n "$name/$version ($url; $email) $subcommand"
|
|
}
|
|
|
|
# Parse options.
|
|
DELETE_OLD_DUMPS=false
|
|
CONCURRENT_DOWNLOADS=
|
|
while getopts "hDc:" opt
|
|
do
|
|
case $opt in
|
|
h) echo -n "$USAGE"; exit 0;;
|
|
D) DELETE_OLD_DUMPS=true;;
|
|
c) CONCURRENT_DOWNLOADS=$OPTARG;;
|
|
?) echo "$USAGE" | head -n1 >&2; exit 1;;
|
|
esac
|
|
done
|
|
shift $((OPTIND - 1))
|
|
|
|
if [ -z "${1:-}" ]; then
|
|
echo "DUMP_DIR is required" >&2
|
|
echo -n "$USAGE" >&2
|
|
exit 1
|
|
fi
|
|
|
|
# The parent directory to store groups of dumps in.
|
|
DUMP_DIR=$(readlink -f "$1")
|
|
shift
|
|
|
|
if [ -n "${1:-}" ]; then
|
|
echo "Unexpected extra argument: '$1'" >&2
|
|
echo "$USAGE" | head -n1 >&2
|
|
exit 1
|
|
fi
|
|
|
|
if [ ! -d "$DUMP_DIR" ]; then
|
|
echo "DUMP_DIR does not exist: '$DUMP_DIR'" >&2
|
|
exit 1
|
|
fi
|
|
|
|
if [ -n "$CONCURRENT_DOWNLOADS" ]; then
|
|
if [ ! "$CONCURRENT_DOWNLOADS" -ge 1 ]; then
|
|
echo "Number of concurrent downloads (-n) must be >= 1" >&2
|
|
echo "$USAGE" | head -n1 >&2
|
|
exit 1
|
|
fi
|
|
if [ -z "${MIRROR:-}" ]; then
|
|
# NOTE: Wikipedia requests no more than 2 concurrent downloads.
|
|
# See https://dumps.wikimedia.org/ for more info.
|
|
echo "WARN: MIRROR is not set; ignoring -n" >&2
|
|
CONCURRENT_DOWNLOADS=
|
|
fi
|
|
fi
|
|
|
|
# Ensure we're running in the directory of this script.
|
|
SCRIPT_PATH=$(dirname "$0")
|
|
cd "$SCRIPT_PATH"
|
|
SCRIPT_PATH=$(pwd)
|
|
|
|
# Only load library after changing to script directory.
|
|
source lib.sh
|
|
|
|
|
|
if [ -n "${MIRROR:-}" ]; then
|
|
log "Using mirror '$MIRROR'"
|
|
BASE_URL=$MIRROR
|
|
else
|
|
BASE_URL="https://dumps.wikimedia.org"
|
|
fi
|
|
|
|
if [ -z "${LANGUAGES:-}" ]; then
|
|
# Load languages from config.
|
|
LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json)
|
|
fi
|
|
# shellcheck disable=SC2086 # LANGUAGES is intentionally expanded.
|
|
log "Selected languages:" $LANGUAGES
|
|
|
|
log "Fetching run index"
|
|
# The date of the latest dump, YYYYMMDD.
|
|
LATEST_DUMP=$(wget "$BASE_URL/other/enterprise_html/runs/" --no-verbose -O - \
|
|
| grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1)
|
|
LATEST_DUMP="${LATEST_DUMP%/}"
|
|
|
|
log "Checking latest dump $LATEST_DUMP"
|
|
|
|
URLS=
|
|
MISSING_DUMPS=0
|
|
for lang in $LANGUAGES; do
|
|
url="$BASE_URL/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz"
|
|
if ! wget --no-verbose --method=HEAD "$url"; then
|
|
MISSING_DUMPS=$(( MISSING_DUMPS + 1 ))
|
|
log "Dump for '$lang' does not exist at '$url'"
|
|
continue
|
|
fi
|
|
URLS="$URLS $url"
|
|
done
|
|
|
|
if [ -z "$URLS" ]; then
|
|
log "No dumps available"
|
|
exit 16
|
|
fi
|
|
|
|
# The subdir to store the latest dump in.
|
|
DOWNLOAD_DIR="$DUMP_DIR/$LATEST_DUMP"
|
|
if [ ! -e "$DOWNLOAD_DIR" ]; then
|
|
mkdir "$DOWNLOAD_DIR"
|
|
fi
|
|
|
|
log "Downloading available dumps"
|
|
if type wget2 > /dev/null; then
|
|
# shellcheck disable=SC2086 # URLS should be expanded on spaces.
|
|
wget2 --verbose --progress=bar --continue \
|
|
--user-agent "$(build_user_agent wget2)" \
|
|
--max-threads "${CONCURRENT_DOWNLOADS:-2}" \
|
|
--directory-prefix "$DOWNLOAD_DIR" \
|
|
$URLS
|
|
else
|
|
log "WARN: wget2 is not available, falling back to sequential downloads"
|
|
# shellcheck disable=SC2086 # URLS should be expanded on spaces.
|
|
wget --continue \
|
|
--user-agent "$(build_user_agent wget)" \
|
|
--directory-prefix "$DOWNLOAD_DIR" \
|
|
$URLS
|
|
fi
|
|
|
|
if [ $MISSING_DUMPS -gt 0 ]; then
|
|
log "$MISSING_DUMPS dumps not available yet"
|
|
exit 16
|
|
fi
|
|
|
|
log "Linking 'latest' to '$LATEST_DUMP'"
|
|
LATEST_LINK="$DUMP_DIR/latest"
|
|
ln -sf -T "$LATEST_DUMP" "$LATEST_LINK"
|
|
|
|
if [ "$DELETE_OLD_DUMPS" = true ]; then
|
|
# shellcheck disable=SC2010 # Only matching files with numeric names are used.
|
|
mapfile -t OLD_DUMPS < <(ls "$DUMP_DIR" | grep -P '^\d{8}$' | grep -vF "$LATEST_DUMP")
|
|
if [ "${#OLD_DUMPS[@]}" -gt 0 ]; then
|
|
log "Deleting old dumps" "${OLD_DUMPS[@]}"
|
|
for old_dump in "${OLD_DUMPS[@]}"; do
|
|
rm -r "${DUMP_DIR:?}/${old_dump:?}/"
|
|
done
|
|
else
|
|
log "No old dumps to delete"
|
|
fi
|
|
fi
|