From 28c17a28eb12a620585b4a3067e93cdb9189c0b1 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Tue, 18 Jul 2023 15:26:50 -0400 Subject: [PATCH 01/28] WIP download script Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 download.sh diff --git a/download.sh b/download.sh new file mode 100644 index 0000000..2c846ba --- /dev/null +++ b/download.sh @@ -0,0 +1,71 @@ +#! /usr/bin/env sh +# Download the latest Wikipedia Enterprise dumps. +# Exit codes: +# - No new dumps available +# - Dump not complete +USAGE="download.sh DOWNLOAD_DIR" + +set -eu +set -x + +if [ -z "${1}" ] +then + printf "Usage:\t%s\n" "$USAGE" >&2 + exit 1 +fi + +DOWNLOAD_DIR=$1 + + +# Write printf-style message to stderr with a timestamp and line ending. +log () { + printf "%s " "$(date '+%Y-%m-%dT%H:%M:%SZ')" >&2 + # shellcheck disable=2059 # format string is part of arguments + printf "$@" >&2 + printf "\n" >&2 +} + +# Ensure we're running in the directory of this script. +SCRIPT_PATH=$(dirname "$0") +cd "$SCRIPT_PATH" +SCRIPT_PATH=$(pwd) + +if [ -z "${LANGUAGES+}" ] +then + LANGUAGES=$(jq -r '(.sections_to_remove | keys)' article_processing_config.json) +fi +log "Selected languages: %s" "$LANGUAGES" + +TMP=$(mktemp -d wikiparser-download-XXXX) +trap 'rm -rf $TMP' EXIT INT HUP + +log "Fetching run index" +# Call wget outside of pipeline for errors to be caught by set -e. +wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP/runs.html" + +LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | sort | head -n1) +LATEST_DUMP="${LATEST_DUMP#/}" + +log "Fetching index for latest dump '%s'" "$LATEST_DUMP" +wget "https://dumps.wikimedia.org/other/enterprise_html/runs/$LATEST_DUMP" --no-verbose -O "$TMP/$LATEST_DUMP.html" + +for lang in $LANGUAGES +do + url="https://wikipedia.invalid/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" + if ! wget --no-verbose --method=HEAD "$url" + then + log "Dump for '%s' does not exist yet at '%s'" "$lang" "$url" + continue + fi + URLS="$URLS $url" +done + +if [ -z "$URLS" ] +then + log "No dumps available" + exit 1 +fi + +log "Downloading available dumps" +# shellcheck disable=SC2086 # URLS should be expanded on spaces +wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS -- 2.45.3 From 7254bc3ec8147325ac7ae724a5456b82f501f411 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Tue, 25 Jul 2023 12:07:09 -0400 Subject: [PATCH 02/28] Add requested changes Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/download.sh b/download.sh index 2c846ba..d4a7bf5 100644 --- a/download.sh +++ b/download.sh @@ -1,40 +1,32 @@ -#! /usr/bin/env sh +#! /usr/bin/env bash # Download the latest Wikipedia Enterprise dumps. # Exit codes: # - No new dumps available # - Dump not complete USAGE="download.sh DOWNLOAD_DIR" -set -eu +set -euo pipefail set -x -if [ -z "${1}" ] -then - printf "Usage:\t%s\n" "$USAGE" >&2 +if [ -z "${1}" ]; then + echo -e "Usage:\t$USAGE\n" >&2 exit 1 fi DOWNLOAD_DIR=$1 - -# Write printf-style message to stderr with a timestamp and line ending. -log () { - printf "%s " "$(date '+%Y-%m-%dT%H:%M:%SZ')" >&2 - # shellcheck disable=2059 # format string is part of arguments - printf "$@" >&2 - printf "\n" >&2 -} - # Ensure we're running in the directory of this script. SCRIPT_PATH=$(dirname "$0") cd "$SCRIPT_PATH" SCRIPT_PATH=$(pwd) -if [ -z "${LANGUAGES+}" ] -then +# only load library after changing to script directory +source lib.sh + +if [ -z "${LANGUAGES+}" ]; then LANGUAGES=$(jq -r '(.sections_to_remove | keys)' article_processing_config.json) fi -log "Selected languages: %s" "$LANGUAGES" +log "Selected languages: $LANGUAGES" TMP=$(mktemp -d wikiparser-download-XXXX) trap 'rm -rf $TMP' EXIT INT HUP @@ -46,22 +38,19 @@ wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | sort | head -n1) LATEST_DUMP="${LATEST_DUMP#/}" -log "Fetching index for latest dump '%s'" "$LATEST_DUMP" +log "Fetching index for latest dump '$LATEST_DUMP'" wget "https://dumps.wikimedia.org/other/enterprise_html/runs/$LATEST_DUMP" --no-verbose -O "$TMP/$LATEST_DUMP.html" -for lang in $LANGUAGES -do +for lang in $LANGUAGES; do url="https://wikipedia.invalid/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" - if ! wget --no-verbose --method=HEAD "$url" - then - log "Dump for '%s' does not exist yet at '%s'" "$lang" "$url" + if ! wget --no-verbose --method=HEAD "$url"; then + log "Dump for '$lang' does not exist yet at '$url'" continue fi URLS="$URLS $url" done -if [ -z "$URLS" ] -then +if [ -z "$URLS" ]; then log "No dumps available" exit 1 fi -- 2.45.3 From 3a4d1214dc858aeb32d128c0314e47018c2b004a Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:08:59 -0400 Subject: [PATCH 03/28] Canonicalize input paths Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index d4a7bf5..39ec1ad 100644 --- a/download.sh +++ b/download.sh @@ -13,7 +13,7 @@ if [ -z "${1}" ]; then exit 1 fi -DOWNLOAD_DIR=$1 +DOWNLOAD_DIR=$(readlink -f "$1") # Ensure we're running in the directory of this script. SCRIPT_PATH=$(dirname "$0") -- 2.45.3 From 9ee1e8d59456782282d3c358caedba8d8093e4f0 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:09:52 -0400 Subject: [PATCH 04/28] Fix check for uninitialized variable Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 39ec1ad..b3accf0 100644 --- a/download.sh +++ b/download.sh @@ -8,7 +8,7 @@ USAGE="download.sh DOWNLOAD_DIR" set -euo pipefail set -x -if [ -z "${1}" ]; then +if [ -z "${1:-}" ]; then echo -e "Usage:\t$USAGE\n" >&2 exit 1 fi @@ -23,7 +23,8 @@ SCRIPT_PATH=$(pwd) # only load library after changing to script directory source lib.sh -if [ -z "${LANGUAGES+}" ]; then +if [ -z "${LANGUAGES:-}" ]; then + # Load languages from config. LANGUAGES=$(jq -r '(.sections_to_remove | keys)' article_processing_config.json) fi log "Selected languages: $LANGUAGES" -- 2.45.3 From bae03b91c8012f447dd0c9f01462cb1a22c4cad8 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:10:35 -0400 Subject: [PATCH 05/28] Improve comments Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index b3accf0..f28033c 100644 --- a/download.sh +++ b/download.sh @@ -1,6 +1,7 @@ #! /usr/bin/env bash # Download the latest Wikipedia Enterprise dumps. # Exit codes: +# - 0: The lastest dumps are already present or were downloaded successfully. # - No new dumps available # - Dump not complete USAGE="download.sh DOWNLOAD_DIR" @@ -20,7 +21,7 @@ SCRIPT_PATH=$(dirname "$0") cd "$SCRIPT_PATH" SCRIPT_PATH=$(pwd) -# only load library after changing to script directory +# Only load library after changing to script directory. source lib.sh if [ -z "${LANGUAGES:-}" ]; then @@ -57,5 +58,5 @@ if [ -z "$URLS" ]; then fi log "Downloading available dumps" -# shellcheck disable=SC2086 # URLS should be expanded on spaces +# shellcheck disable=SC2086 # URLS should be expanded on spaces. wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS -- 2.45.3 From fe295b2379d45804f5f81eafa2d920eae63ea6f6 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:41:19 -0400 Subject: [PATCH 06/28] Fix jq list output Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index f28033c..f9c3d78 100644 --- a/download.sh +++ b/download.sh @@ -26,7 +26,7 @@ source lib.sh if [ -z "${LANGUAGES:-}" ]; then # Load languages from config. - LANGUAGES=$(jq -r '(.sections_to_remove | keys)' article_processing_config.json) + LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json) fi log "Selected languages: $LANGUAGES" -- 2.45.3 From 0a1e0592ffa93a1e537fed55e4d4ead1408f5d64 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:42:03 -0400 Subject: [PATCH 07/28] Use real enterprise dump url Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index f9c3d78..2a09743 100644 --- a/download.sh +++ b/download.sh @@ -44,7 +44,7 @@ log "Fetching index for latest dump '$LATEST_DUMP'" wget "https://dumps.wikimedia.org/other/enterprise_html/runs/$LATEST_DUMP" --no-verbose -O "$TMP/$LATEST_DUMP.html" for lang in $LANGUAGES; do - url="https://wikipedia.invalid/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" + url="https://dumps.wikimedia.org/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" if ! wget --no-verbose --method=HEAD "$url"; then log "Dump for '$lang' does not exist yet at '$url'" continue -- 2.45.3 From 27ff9cb4dc952f255848a29d7f6f1a850cc19b71 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 14:42:43 -0400 Subject: [PATCH 08/28] Track number of missing dumps Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/download.sh b/download.sh index 2a09743..93d80c4 100644 --- a/download.sh +++ b/download.sh @@ -43,10 +43,12 @@ LATEST_DUMP="${LATEST_DUMP#/}" log "Fetching index for latest dump '$LATEST_DUMP'" wget "https://dumps.wikimedia.org/other/enterprise_html/runs/$LATEST_DUMP" --no-verbose -O "$TMP/$LATEST_DUMP.html" +MISSING_DUMPS=0 for lang in $LANGUAGES; do url="https://dumps.wikimedia.org/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" if ! wget --no-verbose --method=HEAD "$url"; then - log "Dump for '$lang' does not exist yet at '$url'" + MISSING_DUMPS=$(( MISSING_DUMPS + 1 )) + log "Dump for '$lang' does not exist at '$url'" continue fi URLS="$URLS $url" @@ -60,3 +62,8 @@ fi log "Downloading available dumps" # shellcheck disable=SC2086 # URLS should be expanded on spaces. wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS + +if [ $MISSING_DUMPS -gt 0 ]; then + log "$MISSING_DUMPS dumps not available yet" + exit 1 +fi -- 2.45.3 From 54727b968d700e57fcf2da9016ac1c7d68a5bcbf Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 16:27:35 -0400 Subject: [PATCH 09/28] Working downloads Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) mode change 100644 => 100755 download.sh diff --git a/download.sh b/download.sh old mode 100644 new mode 100755 index 93d80c4..61d478b --- a/download.sh +++ b/download.sh @@ -7,7 +7,7 @@ USAGE="download.sh DOWNLOAD_DIR" set -euo pipefail -set -x +# set -x if [ -z "${1:-}" ]; then echo -e "Usage:\t$USAGE\n" >&2 @@ -28,21 +28,22 @@ if [ -z "${LANGUAGES:-}" ]; then # Load languages from config. LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json) fi -log "Selected languages: $LANGUAGES" +# shellcheck disable=SC2086 # LANGUAGES is intentionally expanded. +log "Selected languages:" $LANGUAGES -TMP=$(mktemp -d wikiparser-download-XXXX) +TMP=$(mktemp --tmpdir -d wikiparser-download-XXXX) trap 'rm -rf $TMP' EXIT INT HUP log "Fetching run index" # Call wget outside of pipeline for errors to be caught by set -e. wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP/runs.html" -LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | sort | head -n1) -LATEST_DUMP="${LATEST_DUMP#/}" +LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | grep -P '\d{8}' | sort -r | head -n1) +LATEST_DUMP="${LATEST_DUMP%/}" -log "Fetching index for latest dump '$LATEST_DUMP'" -wget "https://dumps.wikimedia.org/other/enterprise_html/runs/$LATEST_DUMP" --no-verbose -O "$TMP/$LATEST_DUMP.html" +log "Checking latest dump $LATEST_DUMP" +URLS= MISSING_DUMPS=0 for lang in $LANGUAGES; do url="https://dumps.wikimedia.org/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" -- 2.45.3 From bce44d1ab91aa6b6bef40e7b2a96368dc0bfc33f Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 16:44:23 -0400 Subject: [PATCH 10/28] Store in subdirs Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 61d478b..4d1ad79 100755 --- a/download.sh +++ b/download.sh @@ -4,7 +4,7 @@ # - 0: The lastest dumps are already present or were downloaded successfully. # - No new dumps available # - Dump not complete -USAGE="download.sh DOWNLOAD_DIR" +USAGE="download.sh DUMP_DIR" set -euo pipefail # set -x @@ -14,7 +14,9 @@ if [ -z "${1:-}" ]; then exit 1 fi -DOWNLOAD_DIR=$(readlink -f "$1") +# The parent directory to store groups of dumps in. +DUMP_DIR=$(readlink -f "$1") +shift # Ensure we're running in the directory of this script. SCRIPT_PATH=$(dirname "$0") @@ -38,6 +40,7 @@ log "Fetching run index" # Call wget outside of pipeline for errors to be caught by set -e. wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP/runs.html" +# The date of the latest dump, YYYYMMDD. LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | grep -P '\d{8}' | sort -r | head -n1) LATEST_DUMP="${LATEST_DUMP%/}" @@ -60,6 +63,12 @@ if [ -z "$URLS" ]; then exit 1 fi +# The subdir to store the latest dump in. +DOWNLOAD_DIR="$DUMP_DIR/$LATEST_DUMP" +if [ ! -e "$DOWNLOAD_DIR" ]; then + mkdir "$DOWNLOAD_DIR" +fi + log "Downloading available dumps" # shellcheck disable=SC2086 # URLS should be expanded on spaces. wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS @@ -68,3 +77,7 @@ if [ $MISSING_DUMPS -gt 0 ]; then log "$MISSING_DUMPS dumps not available yet" exit 1 fi + +log "Linking 'latest' to '$LATEST_DUMP'" +LATEST_LINK="$DUMP_DIR/latest" +ln -sf "$LATEST_DUMP" "$LATEST_LINK" -- 2.45.3 From 5077ed02f248590db8baf4c7e76de3fe87f208b8 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:09:43 -0400 Subject: [PATCH 11/28] Document usage Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/download.sh b/download.sh index 4d1ad79..caebe22 100755 --- a/download.sh +++ b/download.sh @@ -1,16 +1,32 @@ #! /usr/bin/env bash -# Download the latest Wikipedia Enterprise dumps. -# Exit codes: -# - 0: The lastest dumps are already present or were downloaded successfully. -# - No new dumps available -# - Dump not complete -USAGE="download.sh DUMP_DIR" +USAGE="Usage: ./download.sh + +Download the latest Wikipedia Enterprise HTML dumps. + +Arguments: + An existing directory to store dumps in. Dumps will be grouped + into subdirectories by date, and a link 'latest' will point to + the latest complete dump subdirectory, if it exists. + +Environment Variables: + LANGUAGES A space-separated list of wikipedia language codes to download + dumps of. + Defaults to the languages in 'article_processing_config.json'. + See . + +Exit codes: + 0 The lastest dumps are already present or were downloaded successfully. + 1 Argument error. + 16 Some of languages were not available to download. The latest dump may + be in progress, or some of the specified languages may not exist. + _ Subprocess error. +" set -euo pipefail # set -x if [ -z "${1:-}" ]; then - echo -e "Usage:\t$USAGE\n" >&2 + echo -n "$USAGE" >&2 exit 1 fi @@ -60,7 +76,7 @@ done if [ -z "$URLS" ]; then log "No dumps available" - exit 1 + exit 16 fi # The subdir to store the latest dump in. @@ -75,9 +91,11 @@ wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS if [ $MISSING_DUMPS -gt 0 ]; then log "$MISSING_DUMPS dumps not available yet" - exit 1 + exit 16 fi log "Linking 'latest' to '$LATEST_DUMP'" LATEST_LINK="$DUMP_DIR/latest" ln -sf "$LATEST_DUMP" "$LATEST_LINK" + +# TODO: Remove old dumps? -- 2.45.3 From 4c2c6e97ff03f0a17594ff25f52e64145c87f445 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:11:56 -0400 Subject: [PATCH 12/28] Rename temp dir Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/download.sh b/download.sh index caebe22..f161a94 100755 --- a/download.sh +++ b/download.sh @@ -49,15 +49,15 @@ fi # shellcheck disable=SC2086 # LANGUAGES is intentionally expanded. log "Selected languages:" $LANGUAGES -TMP=$(mktemp --tmpdir -d wikiparser-download-XXXX) -trap 'rm -rf $TMP' EXIT INT HUP +TMP_DIR=$(mktemp --tmpdir -d wikiparser-download-XXXX) +trap 'rm -rf $TMP_DIR' EXIT INT HUP log "Fetching run index" # Call wget outside of pipeline for errors to be caught by set -e. -wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP/runs.html" +wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP_DIR/runs.html" # The date of the latest dump, YYYYMMDD. -LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP/runs.html" | grep -P '\d{8}' | sort -r | head -n1) +LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP_DIR/runs.html" | grep -P '\d{8}' | sort -r | head -n1) LATEST_DUMP="${LATEST_DUMP%/}" log "Checking latest dump $LATEST_DUMP" -- 2.45.3 From af80f2ad757da52a1ca2005f4975b306f6e5db77 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:14:51 -0400 Subject: [PATCH 13/28] Check for DUMP_DIR existence Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/download.sh b/download.sh index f161a94..9a41689 100755 --- a/download.sh +++ b/download.sh @@ -34,6 +34,11 @@ fi DUMP_DIR=$(readlink -f "$1") shift +if [ ! -d "$DUMP_DIR" ]; then + echo "DUMP_DIR '$DUMP_DIR' does not exist" >&2 + exit 1 +fi + # Ensure we're running in the directory of this script. SCRIPT_PATH=$(dirname "$0") cd "$SCRIPT_PATH" -- 2.45.3 From 98d5a8a95fc8bb261e9e6af38835a2ad2075c011 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:55:42 -0400 Subject: [PATCH 14/28] Mention download.sh in README Signed-off-by: Evan Lloyd New-Schmidt --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index 7f2bd0d..4b4f973 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,32 @@ OpenStreetMap commonly stores these as [`wikipedia*=`](https://wiki.openstreetma [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language. It defines article sections that are not important for users and should be removed from the extracted HTML. +## Downloading Dumps + +[Enterprise HTML dumps, updated twice a month. are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). + +For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files. + +They are gzipped tar files containing a single file of newline-delimited JSON matching the [Wikimedia Enterprise API schema](https://enterprise.wikimedia.com/docs/data-dictionary/). + +The included [`download.sh`](./download.sh) script handles downloading the latest set of dumps in specific languages. +It maintains a directory with the following layout: +``` +/ +├── latest -> 20230701/ +├── 20230701/ +│ ├── dewiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz +│ ├── enwiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz +│ ├── eswiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz +│ ... +├── 20230620/ +│ ├── dewiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz +│ ├── enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz +│ ├── eswiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz +│ ... +... +``` + ## Usage To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation. -- 2.45.3 From 30b19caeefb96baea61722ae87fdec8721ca6d45 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:57:44 -0400 Subject: [PATCH 15/28] Fix typo Signed-off-by: Evan Lloyd New-Schmidt --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4b4f973..d82f89a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ It defines article sections that are not important for users and should be remov ## Downloading Dumps -[Enterprise HTML dumps, updated twice a month. are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). +[Enterprise HTML dumps, updated twice a month, are publicly accessible ](https://dumps.wikimedia.org/other/enterprise_html/). For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files. -- 2.45.3 From 187e294d99813a0cf25133d5f475c8c493691b90 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 17:59:23 -0400 Subject: [PATCH 16/28] Fix typo typo Signed-off-by: Evan Lloyd New-Schmidt --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d82f89a..8a5fbdc 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ It defines article sections that are not important for users and should be remov ## Downloading Dumps -[Enterprise HTML dumps, updated twice a month, are publicly accessible ](https://dumps.wikimedia.org/other/enterprise_html/). +[Enterprise HTML dumps, updated twice a month, are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files. -- 2.45.3 From b06167f9d92b83c5862d7d1b19cffca23ef4857a Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 20:08:01 -0400 Subject: [PATCH 17/28] Clarify LANGUAGES parsing Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 9a41689..47ba41e 100755 --- a/download.sh +++ b/download.sh @@ -9,8 +9,8 @@ Arguments: the latest complete dump subdirectory, if it exists. Environment Variables: - LANGUAGES A space-separated list of wikipedia language codes to download - dumps of. + LANGUAGES A whitespace-separated list of wikipedia language codes to + download dumps of. Defaults to the languages in 'article_processing_config.json'. See . -- 2.45.3 From 38faebbc541ac362b040d93d59454a6cea3261bd Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 20:11:02 -0400 Subject: [PATCH 18/28] Remove old workaround for lack of pipefail Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/download.sh b/download.sh index 47ba41e..c4d4bc6 100755 --- a/download.sh +++ b/download.sh @@ -54,15 +54,12 @@ fi # shellcheck disable=SC2086 # LANGUAGES is intentionally expanded. log "Selected languages:" $LANGUAGES -TMP_DIR=$(mktemp --tmpdir -d wikiparser-download-XXXX) -trap 'rm -rf $TMP_DIR' EXIT INT HUP - log "Fetching run index" -# Call wget outside of pipeline for errors to be caught by set -e. -wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O "$TMP_DIR/runs.html" + # The date of the latest dump, YYYYMMDD. -LATEST_DUMP=$(grep -Po '(?<=href=")[^"]*' "$TMP_DIR/runs.html" | grep -P '\d{8}' | sort -r | head -n1) +LATEST_DUMP=$(wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O - \ + | grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1) LATEST_DUMP="${LATEST_DUMP%/}" log "Checking latest dump $LATEST_DUMP" -- 2.45.3 From 82f2993b2158629f3a71fe0242efbbb95e20acdc Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 16 Aug 2023 20:12:05 -0400 Subject: [PATCH 19/28] Remove extra whitespace Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/download.sh b/download.sh index c4d4bc6..6ca0ead 100755 --- a/download.sh +++ b/download.sh @@ -55,8 +55,6 @@ fi log "Selected languages:" $LANGUAGES log "Fetching run index" - - # The date of the latest dump, YYYYMMDD. LATEST_DUMP=$(wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O - \ | grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1) -- 2.45.3 From 9d2d2e5f3966a1d9b0bf457fdcf860cd4a685b14 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Fri, 18 Aug 2023 13:51:29 -0400 Subject: [PATCH 20/28] Add option to delete old dumps Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 6ca0ead..de2b7e3 100755 --- a/download.sh +++ b/download.sh @@ -1,5 +1,5 @@ #! /usr/bin/env bash -USAGE="Usage: ./download.sh +USAGE="Usage: ./download.sh [-hD] Download the latest Wikipedia Enterprise HTML dumps. @@ -8,6 +8,10 @@ Arguments: into subdirectories by date, and a link 'latest' will point to the latest complete dump subdirectory, if it exists. +Options: + -h Print this help screen + -D Delete all old dump subdirectories if the latest is downloaded + Environment Variables: LANGUAGES A whitespace-separated list of wikipedia language codes to download dumps of. @@ -25,7 +29,20 @@ Exit codes: set -euo pipefail # set -x +# Parse options. +DELETE_OLD_DUMPS=false +while getopts "hD" opt +do + case $opt in + h) echo -n "$USAGE" >&2; exit 0;; + D) DELETE_OLD_DUMPS=true;; + ?) echo "$USAGE" | head -n1 >&2; exit 1;; + esac +done +shift $((OPTIND - 1)) + if [ -z "${1:-}" ]; then + echo "DUMP_DIR is required" >&2 echo -n "$USAGE" >&2 exit 1 fi @@ -34,6 +51,12 @@ fi DUMP_DIR=$(readlink -f "$1") shift +if [ -n "${1:-}" ]; then + echo "Unexpected extra argument: '$1'" >&2 + echo "$USAGE" | head -n1 >&2 + exit 1 +fi + if [ ! -d "$DUMP_DIR" ]; then echo "DUMP_DIR '$DUMP_DIR' does not exist" >&2 exit 1 @@ -98,4 +121,15 @@ log "Linking 'latest' to '$LATEST_DUMP'" LATEST_LINK="$DUMP_DIR/latest" ln -sf "$LATEST_DUMP" "$LATEST_LINK" -# TODO: Remove old dumps? +if [ "$DELETE_OLD_DUMPS" = true ]; then + # shellcheck disable=SC2010 # Only matching files with numeric names are used. + mapfile -t OLD_DUMPS < <(ls "$DUMP_DIR" | grep -P '^\d{8}$' | grep -vF "$LATEST_DUMP") + if [ "${#OLD_DUMPS[@]}" -gt 0 ]; then + log "Deleting old dumps" "${OLD_DUMPS[@]}" + for old_dump in "${OLD_DUMPS[@]}"; do + rm -r "${DUMP_DIR:?}/${old_dump:?}/" + done + else + log "No old dumps to delete" + fi +fi -- 2.45.3 From 4d9199235aa98165d8a46a57574aa019c8246f39 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Fri, 18 Aug 2023 14:08:11 -0400 Subject: [PATCH 21/28] Use wget2 by default Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index de2b7e3..e22fb90 100755 --- a/download.sh +++ b/download.sh @@ -109,8 +109,17 @@ if [ ! -e "$DOWNLOAD_DIR" ]; then fi log "Downloading available dumps" -# shellcheck disable=SC2086 # URLS should be expanded on spaces. -wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS +if type wget2 > /dev/null; then + # NOTE: Wikipedia requests no more than 2 concurrent downloads. + # See https://dumps.wikimedia.org/ for more info. + + # shellcheck disable=SC2086 # URLS should be expanded on spaces. + wget2 --max-threads 2 --verbose --progress=bar --directory-prefix "$DOWNLOAD_DIR" --continue $URLS +else + log "WARN: wget2 is not available, falling back to sequential downloads" + # shellcheck disable=SC2086 # URLS should be expanded on spaces. + wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS +fi if [ $MISSING_DUMPS -gt 0 ]; then log "$MISSING_DUMPS dumps not available yet" -- 2.45.3 From 5c8be743024db0f23fbc171496a6786171df3f1a Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Fri, 18 Aug 2023 14:19:33 -0400 Subject: [PATCH 22/28] Write requested help to stdout Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 +- run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index e22fb90..c1abdb0 100755 --- a/download.sh +++ b/download.sh @@ -34,7 +34,7 @@ DELETE_OLD_DUMPS=false while getopts "hD" opt do case $opt in - h) echo -n "$USAGE" >&2; exit 0;; + h) echo -n "$USAGE"; exit 0;; D) DELETE_OLD_DUMPS=true;; ?) echo "$USAGE" | head -n1 >&2; exit 1;; esac diff --git a/run.sh b/run.sh index dc29e9c..3345119 100755 --- a/run.sh +++ b/run.sh @@ -36,7 +36,7 @@ set -euo pipefail while getopts "h" opt do case $opt in - h) echo -n "$USAGE" >&2; exit 0;; + h) echo -n "$USAGE"; exit 0;; ?) echo "$USAGE" | head -n1 >&2; exit 1;; esac done -- 2.45.3 From e7b5c19426201be556550b3f5b1af23c8b531543 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Fri, 18 Aug 2023 14:24:52 -0400 Subject: [PATCH 23/28] Fix link replacement Without -T, ln interprets an existing LATEST_LINK as a directory to place the link in, instead of a link to replace. Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index c1abdb0..d27df47 100755 --- a/download.sh +++ b/download.sh @@ -128,7 +128,7 @@ fi log "Linking 'latest' to '$LATEST_DUMP'" LATEST_LINK="$DUMP_DIR/latest" -ln -sf "$LATEST_DUMP" "$LATEST_LINK" +ln -sf -T "$LATEST_DUMP" "$LATEST_LINK" if [ "$DELETE_OLD_DUMPS" = true ]; then # shellcheck disable=SC2010 # Only matching files with numeric names are used. -- 2.45.3 From cf1ac059be5a8d2663f9777891081e2d00511f1f Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 21 Aug 2023 15:58:40 -0400 Subject: [PATCH 24/28] Make base url configurable Signed-off-by: Evan Lloyd New-Schmidt --- README.md | 12 +++++++++++- download.sh | 19 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8a5fbdc..666597d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,17 @@ It defines article sections that are not important for users and should be remov ## Downloading Dumps -[Enterprise HTML dumps, updated twice a month, are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). +[Enterprise HTML dumps, updated twice a month, are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/). Please note that each language's dump is tens of gigabytes in size. + +Wikimedia requests no more than 2 concurrent downloads, which the included [`download.sh`](./download.sh) script respects: +> If you are reading this on Wikimedia servers, please note that we have rate limited downloaders and we are capping the number of per-ip connections to 2. +> This will help to ensure that everyone can access the files with reasonable download times. +> Clients that try to evade these limits may be blocked. +> Our mirror sites do not have this cap. + +See [the list of available mirrors](https://dumps.wikimedia.org/mirrors.html) for other options. Note that most of them do not include the enterprise dumps; check to see that the `other/enterprise_html/runs/` path includes subdirectories with files. The following two mirrors are known to include the enterprise html dumps as of August 2023: +- (US) https://dumps.wikimedia.your.org +- (Sweden) https://mirror.accum.se/mirror/wikimedia.org For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files. diff --git a/download.sh b/download.sh index d27df47..7a48ca4 100755 --- a/download.sh +++ b/download.sh @@ -17,12 +17,18 @@ Environment Variables: download dumps of. Defaults to the languages in 'article_processing_config.json'. See . + MIRROR A wikimedia dump mirror to use instead of the main wikimedia + server. See for a + list of available mirrors, note that many do not include the + required Enterprise HTML dumps. + For example: MIRROR=https://mirror.accum.se/mirror/wikimedia.org Exit codes: 0 The lastest dumps are already present or were downloaded successfully. 1 Argument error. 16 Some of languages were not available to download. The latest dump may - be in progress, or some of the specified languages may not exist. + be in progress, some of the specified languages may not exist, or the + chosen mirror may not host the files. _ Subprocess error. " @@ -70,6 +76,13 @@ SCRIPT_PATH=$(pwd) # Only load library after changing to script directory. source lib.sh +if [ -n "${MIRROR:-}" ]; then + log "Using mirror '$MIRROR'" + BASE_URL=$MIRROR +else + BASE_URL="https://dumps.wikimedia.org" +fi + if [ -z "${LANGUAGES:-}" ]; then # Load languages from config. LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json) @@ -79,7 +92,7 @@ log "Selected languages:" $LANGUAGES log "Fetching run index" # The date of the latest dump, YYYYMMDD. -LATEST_DUMP=$(wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O - \ +LATEST_DUMP=$(wget "$BASE_URL/other/enterprise_html/runs/" --no-verbose -O - \ | grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1) LATEST_DUMP="${LATEST_DUMP%/}" @@ -88,7 +101,7 @@ log "Checking latest dump $LATEST_DUMP" URLS= MISSING_DUMPS=0 for lang in $LANGUAGES; do - url="https://dumps.wikimedia.org/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" + url="$BASE_URL/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz" if ! wget --no-verbose --method=HEAD "$url"; then MISSING_DUMPS=$(( MISSING_DUMPS + 1 )) log "Dump for '$lang' does not exist at '$url'" -- 2.45.3 From 4be00cd39206ca75fd1f3b335eca221a7da7772c Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 21 Aug 2023 16:15:29 -0400 Subject: [PATCH 25/28] Make concurrent downloads configurable Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/download.sh b/download.sh index 7a48ca4..04aa5aa 100755 --- a/download.sh +++ b/download.sh @@ -1,5 +1,5 @@ #! /usr/bin/env bash -USAGE="Usage: ./download.sh [-hD] +USAGE="Usage: ./download.sh [-hD] [-c ] Download the latest Wikipedia Enterprise HTML dumps. @@ -9,8 +9,11 @@ Arguments: the latest complete dump subdirectory, if it exists. Options: - -h Print this help screen - -D Delete all old dump subdirectories if the latest is downloaded + -h Print this help screen + -D Delete all old dump subdirectories if the latest is downloaded + -c Number of concurrent downloads to allow. Requires MIRROR to be + set (Wikimedia servers ask for no more than 2). Requires wget2. + Defaults to 2. Environment Variables: LANGUAGES A whitespace-separated list of wikipedia language codes to @@ -37,11 +40,13 @@ set -euo pipefail # Parse options. DELETE_OLD_DUMPS=false -while getopts "hD" opt +CONCURRENT_DOWNLOADS= +while getopts "hDc:" opt do case $opt in h) echo -n "$USAGE"; exit 0;; D) DELETE_OLD_DUMPS=true;; + c) CONCURRENT_DOWNLOADS=$OPTARG;; ?) echo "$USAGE" | head -n1 >&2; exit 1;; esac done @@ -68,6 +73,18 @@ if [ ! -d "$DUMP_DIR" ]; then exit 1 fi +if [ -n "$CONCURRENT_DOWNLOADS" ]; then + if [ ! "$CONCURRENT_DOWNLOADS" -ge 1 ]; then + echo "Number of concurrent downloads (-n) must be >= 1" >&2 + echo "$USAGE" | head -n1 >&2 + exit 1 + fi + if [ -z "${MIRROR:-}" ]; then + echo "WARN: MIRROR is not set; ignoring -n" >&2 + CONCURRENT_DOWNLOADS= + fi +fi + # Ensure we're running in the directory of this script. SCRIPT_PATH=$(dirname "$0") cd "$SCRIPT_PATH" @@ -127,7 +144,7 @@ if type wget2 > /dev/null; then # See https://dumps.wikimedia.org/ for more info. # shellcheck disable=SC2086 # URLS should be expanded on spaces. - wget2 --max-threads 2 --verbose --progress=bar --directory-prefix "$DOWNLOAD_DIR" --continue $URLS + wget2 --max-threads "${CONCURRENT_DOWNLOADS:-2}" --verbose --progress=bar --directory-prefix "$DOWNLOAD_DIR" --continue $URLS else log "WARN: wget2 is not available, falling back to sequential downloads" # shellcheck disable=SC2086 # URLS should be expanded on spaces. -- 2.45.3 From f08fd7d479dfebb334c37628575e59a078ba8d29 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 21 Aug 2023 17:17:48 -0400 Subject: [PATCH 26/28] Clarify -c behavior Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/download.sh b/download.sh index 04aa5aa..bfad1c3 100755 --- a/download.sh +++ b/download.sh @@ -11,9 +11,8 @@ Arguments: Options: -h Print this help screen -D Delete all old dump subdirectories if the latest is downloaded - -c Number of concurrent downloads to allow. Requires MIRROR to be - set (Wikimedia servers ask for no more than 2). Requires wget2. - Defaults to 2. + -c Number of concurrent downloads to allow. Ignored if wget2 is not + present or MIRROR is not set. Defaults to 2. Environment Variables: LANGUAGES A whitespace-separated list of wikipedia language codes to -- 2.45.3 From b118724892afa6fee226df74c3b6924dfc27bcfe Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Tue, 22 Aug 2023 10:20:23 -0400 Subject: [PATCH 27/28] Use custom user agent with email Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/download.sh b/download.sh index bfad1c3..eab7e3a 100755 --- a/download.sh +++ b/download.sh @@ -37,6 +37,17 @@ Exit codes: set -euo pipefail # set -x +build_user_agent() { + # While the dump websites are not part of the API, it's still polite to identify yourself. + # See https://meta.wikimedia.org/wiki/User-Agent_policy + subcommand=$1 + name="OrganicMapsWikiparserDownloaderBot" + version="1.0" + url="https://github.com/organicmaps/wikiparser" + email="hello@organicmaps.app" + echo -n "$name/$version ($url; $email) $subcommand" +} + # Parse options. DELETE_OLD_DUMPS=false CONCURRENT_DOWNLOADS= @@ -79,6 +90,8 @@ if [ -n "$CONCURRENT_DOWNLOADS" ]; then exit 1 fi if [ -z "${MIRROR:-}" ]; then + # NOTE: Wikipedia requests no more than 2 concurrent downloads. + # See https://dumps.wikimedia.org/ for more info. echo "WARN: MIRROR is not set; ignoring -n" >&2 CONCURRENT_DOWNLOADS= fi @@ -92,6 +105,7 @@ SCRIPT_PATH=$(pwd) # Only load library after changing to script directory. source lib.sh + if [ -n "${MIRROR:-}" ]; then log "Using mirror '$MIRROR'" BASE_URL=$MIRROR @@ -139,15 +153,19 @@ fi log "Downloading available dumps" if type wget2 > /dev/null; then - # NOTE: Wikipedia requests no more than 2 concurrent downloads. - # See https://dumps.wikimedia.org/ for more info. - # shellcheck disable=SC2086 # URLS should be expanded on spaces. - wget2 --max-threads "${CONCURRENT_DOWNLOADS:-2}" --verbose --progress=bar --directory-prefix "$DOWNLOAD_DIR" --continue $URLS + wget2 --verbose --progress=bar --continue \ + --user-agent "$(build_user_agent wget2)" \ + --max-threads "${CONCURRENT_DOWNLOADS:-2}" \ + --directory-prefix "$DOWNLOAD_DIR" \ + $URLS else log "WARN: wget2 is not available, falling back to sequential downloads" # shellcheck disable=SC2086 # URLS should be expanded on spaces. - wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS + wget --continue \ + --user-agent "$(build_user_agent wget)" \ + --directory-prefix "$DOWNLOAD_DIR" \ + $URLS fi if [ $MISSING_DUMPS -gt 0 ]; then -- 2.45.3 From b21a999da7475be96e55e64a52624d8f6c044cc2 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Tue, 26 Sep 2023 11:38:15 -0400 Subject: [PATCH 28/28] Fix typos Signed-off-by: Evan Lloyd New-Schmidt --- download.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/download.sh b/download.sh index eab7e3a..70876e0 100755 --- a/download.sh +++ b/download.sh @@ -9,8 +9,8 @@ Arguments: the latest complete dump subdirectory, if it exists. Options: - -h Print this help screen - -D Delete all old dump subdirectories if the latest is downloaded + -h Print this help screen. + -D Delete all old dump subdirectories if the latest is downloaded. -c Number of concurrent downloads to allow. Ignored if wget2 is not present or MIRROR is not set. Defaults to 2. @@ -26,7 +26,7 @@ Environment Variables: For example: MIRROR=https://mirror.accum.se/mirror/wikimedia.org Exit codes: - 0 The lastest dumps are already present or were downloaded successfully. + 0 The latest dumps are already present or were downloaded successfully. 1 Argument error. 16 Some of languages were not available to download. The latest dump may be in progress, some of the specified languages may not exist, or the -- 2.45.3