[python][generator] Supported downloading of directories.

Maksim Andrianov 2020-05-22 12:26:45 +03:00 committed by Olga Khlopkova
parent cad12d7d59
commit feffe94dfc
5 changed files with 85 additions and 3 deletions


@@ -1,5 +1,5 @@
 htmlmin
 requests
-bs4
+beautifulsoup4==4.9.1
 wikidata
 wikipedia-api


@@ -24,6 +24,7 @@ from typing import Union
 from maps_generator.generator.status import Status
 from maps_generator.utils.file import download_files
+from maps_generator.utils.file import normalize_url_to_path_dict
 from maps_generator.utils.log import DummyObject
 from maps_generator.utils.log import create_file_logger
@@ -278,6 +279,7 @@ def depends_from_internal(*deps) -> Callable[[Type[Stage],], Type[Stage]]:
                     deps[d.url] = path

                 if deps:
+                    deps = normalize_url_to_path_dict(deps)
                     download_files(deps, env.force_download_files)

                 obj.depends_from_internal_downloaded = True
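Before anything is fetched, the new normalize_url_to_path_dict call expands every directory URL in the deps mapping into one entry per contained file. A minimal illustration of the transformation (the URL and local path are made up for the example, not taken from the repository):

    # One directory URL (note the trailing "/") mapped to a local directory:
    deps = {"https://example.com/dumps/": "dumps"}

    # After deps = normalize_url_to_path_dict(deps), each file found under
    # the directory gets its own url-to-path entry, roughly:
    # {
    #     "https://example.com/dumps/a.json": "dumps/a.json",
    #     "https://example.com/dumps/sub/b.json": "dumps/sub/b.json",
    # }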


@@ -1,3 +1,6 @@
 -r ../post_generation/requirements.txt
 -r ../descriptions/requirements.txt
 filelock==3.0.10
+beautifulsoup4==4.9.1
+requests==2.23.0
+requests_file==1.5.1


@@ -4,12 +4,19 @@ import glob
 import logging
 import os
 import shutil
-import urllib.request
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from typing import AnyStr
+from typing import Dict
+from typing import List
 from typing import Optional
+from urllib.parse import unquote
+from urllib.parse import urljoin
+from urllib.parse import urlparse

+import requests
+from bs4 import BeautifulSoup
+from requests_file import FileAdapter

 from maps_generator.utils.md5 import check_md5
 from maps_generator.utils.md5 import md5_ext
@@ -42,11 +49,79 @@ def download_file(url: AnyStr, name: AnyStr, download_if_exists: bool = True):
         return

     tmp_name = f"{name}__"
-    urllib.request.urlretrieve(url, tmp_name)
+    os.makedirs(os.path.dirname(tmp_name), exist_ok=True)
+    with requests.Session() as session:
+        session.mount("file://", FileAdapter())
+        response = session.get(url, stream=True)
+        with open(tmp_name, "wb") as handle:
+            for data in response.iter_content(chunk_size=4096):
+                handle.write(data)
+
     shutil.move(tmp_name, name)
     logger.info(f"File {name} was downloaded from {url}.")


+def is_dir(url) -> bool:
+    return url.endswith("/")
+
+
+def find_files(url) -> List[AnyStr]:
+    def files_list_file_scheme(path, results=None):
+        if results is None:
+            results = []
+        for p in os.listdir(path):
+            new_path = os.path.join(path, p)
+            if os.path.isdir(new_path):
+                files_list_file_scheme(new_path, results)
+            else:
+                results.append(new_path)
+        return results
+
+    def files_list_http_scheme(url, results=None):
+        if results is None:
+            results = []
+        page = requests.get(url).content
+        bs = BeautifulSoup(page, "html.parser")
+        links = bs.findAll("a", href=True)
+        for link in links:
+            href = link["href"]
+            if href == "./" or href == "../":
+                continue
+
+            new_url = urljoin(url, href)
+            if is_dir(new_url):
+                files_list_http_scheme(new_url, results)
+            else:
+                results.append(new_url)
+        return results
+
+    parse_result = urlparse(url)
+    if parse_result.scheme == "file":
+        return [
+            f.replace(parse_result.path, "")
+            for f in files_list_file_scheme(parse_result.path)
+        ]
+
+    if parse_result.scheme == "http" or parse_result.scheme == "https":
+        return [f.replace(url, "") for f in files_list_http_scheme(url)]
+
+    assert False, parse_result
+
+
+def normalize_url_to_path_dict(
+    url_to_path: Dict[AnyStr, AnyStr]
+) -> Dict[AnyStr, AnyStr]:
+    for url in list(url_to_path.keys()):
+        if is_dir(url):
+            path = url_to_path[url]
+            del url_to_path[url]
+            for rel_path in find_files(url):
+                abs_url = urljoin(url, rel_path)
+                url_to_path[abs_url] = unquote(os.path.join(path, rel_path))
+
+    return url_to_path
+
+
 def download_files(url_to_path: Dict[AnyStr, AnyStr], download_if_exists: bool = True):
     with ThreadPool() as pool:
         pool.starmap(
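Taken together, the helpers make a directory URL a drop-in replacement for a file URL. A sketch of how a caller might use them (the example URL and destination are hypothetical):

    from maps_generator.utils.file import download_files
    from maps_generator.utils.file import normalize_url_to_path_dict

    # The trailing "/" marks the URL as a directory; find_files() enumerates it
    # recursively, via os.listdir() for file:// and via the <a href> links of
    # the index page for http(s)://.
    urls = normalize_url_to_path_dict({"https://example.com/planet/": "planet"})

    # Each discovered file is then downloaded on a thread from the ThreadPool.
    download_files(urls)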


@@ -37,6 +37,8 @@ NEED_PLANET_UPDATE: 0
 [External]
+# Note: If you want to set a directory name you have to add "/" to the end of url.
+
 # The url to the planet file.
 # PLANET_URL:
 # The url to the file with md5 sum of the planet.
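The trailing-slash convention from the note would look like this in the config (the key and host are made up, shown purely to illustrate the convention):

    [External]
    # A single file:
    # SOME_RESOURCE_URL: https://example.com/resources/data.bin
    # A whole directory; every file found under it is downloaded:
    # SOME_RESOURCE_URL: https://example.com/resources/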