diff --git a/tools/python/descriptions/requirements.txt b/tools/python/descriptions/requirements.txt
index 4c5abf335f..884dba74a4 100644
--- a/tools/python/descriptions/requirements.txt
+++ b/tools/python/descriptions/requirements.txt
@@ -1,5 +1,5 @@
 htmlmin
 requests
-bs4
+beautifulsoup4==4.9.1
 wikidata
 wikipedia-api
\ No newline at end of file
diff --git a/tools/python/maps_generator/generator/stages.py b/tools/python/maps_generator/generator/stages.py
index 69d2836866..60c6f161ed 100644
--- a/tools/python/maps_generator/generator/stages.py
+++ b/tools/python/maps_generator/generator/stages.py
@@ -24,6 +24,7 @@ from typing import Union
 
 from maps_generator.generator.status import Status
 from maps_generator.utils.file import download_files
+from maps_generator.utils.file import normalize_url_to_path_dict
 from maps_generator.utils.log import DummyObject
 from maps_generator.utils.log import create_file_logger
 
@@ -278,6 +279,7 @@ def depends_from_internal(*deps) -> Callable[[Type[Stage],], Type[Stage]]:
                         deps[d.url] = path
 
             if deps:
+                deps = normalize_url_to_path_dict(deps)
                 download_files(deps, env.force_download_files)
 
             obj.depends_from_internal_downloaded = True
diff --git a/tools/python/maps_generator/requirements.txt b/tools/python/maps_generator/requirements.txt
index ef5098d175..77402d564a 100644
--- a/tools/python/maps_generator/requirements.txt
+++ b/tools/python/maps_generator/requirements.txt
@@ -1,3 +1,6 @@
 -r ../post_generation/requirements.txt
 -r ../descriptions/requirements.txt
 filelock==3.0.10
+beautifulsoup4==4.9.1
+requests==2.23.0
+requests_file==1.5.1
diff --git a/tools/python/maps_generator/utils/file.py b/tools/python/maps_generator/utils/file.py
index eec4fd4ced..84d8c14406 100644
--- a/tools/python/maps_generator/utils/file.py
+++ b/tools/python/maps_generator/utils/file.py
@@ -4,12 +4,19 @@ import glob
 import logging
 import os
 import shutil
-import urllib.request
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from typing import AnyStr
 from typing import Dict
+from typing import List
 from typing import Optional
+from urllib.parse import unquote
+from urllib.parse import urljoin
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+from requests_file import FileAdapter
 
 from maps_generator.utils.md5 import check_md5
 from maps_generator.utils.md5 import md5_ext
@@ -42,11 +49,79 @@ def download_file(url: AnyStr, name: AnyStr, download_if_exists: bool = True):
         return
 
     tmp_name = f"{name}__"
-    urllib.request.urlretrieve(url, tmp_name)
+    os.makedirs(os.path.dirname(tmp_name), exist_ok=True)
+    with requests.Session() as session:
+        session.mount("file://", FileAdapter())
+        response = session.get(url, stream=True)
+        with open(tmp_name, "wb") as handle:
+            for data in response.iter_content(chunk_size=4096):
+                handle.write(data)
+
     shutil.move(tmp_name, name)
     logger.info(f"File {name} was downloaded from {url}.")
 
+
+def is_dir(url) -> bool:
+    return url.endswith("/")
+
+
+def find_files(url) -> List[AnyStr]:
+    def files_list_file_scheme(path, results=None):
+        if results is None:
+            results = []
+
+        for p in os.listdir(path):
+            new_path = os.path.join(path, p)
+            if os.path.isdir(new_path):
+                files_list_file_scheme(new_path, results)
+            else:
+                results.append(new_path)
+        return results
+
+    def files_list_http_scheme(url, results=None):
+        if results is None:
+            results = []
+
+        page = requests.get(url).content
+        bs = BeautifulSoup(page, "html.parser")
+        links = bs.findAll("a", href=True)
+        for link in links:
+            href = link["href"]
+            if href == "./" or href == "../":
href == "./" or href == "../": + continue + + new_url = urljoin(url, href) + if is_dir(new_url): + files_list_http_scheme(new_url, results) + else: + results.append(new_url) + return results + + parse_result = urlparse(url) + if parse_result.scheme == "file": + return [ + f.replace(parse_result.path, "") + for f in files_list_file_scheme(parse_result.path) + ] + if parse_result.scheme == "http" or parse_result.scheme == "https": + return [f.replace(url, "") for f in files_list_http_scheme(url)] + + assert False, parse_result + + +def normalize_url_to_path_dict( + url_to_path: Dict[AnyStr, AnyStr] +) -> Dict[AnyStr, AnyStr]: + for url in list(url_to_path.keys()): + if is_dir(url): + path = url_to_path[url] + del url_to_path[url] + for rel_path in find_files(url): + abs_url = urljoin(url, rel_path) + url_to_path[abs_url] = unquote(os.path.join(path, rel_path)) + return url_to_path + + def download_files(url_to_path: Dict[AnyStr, AnyStr], download_if_exists: bool = True): with ThreadPool() as pool: pool.starmap( diff --git a/tools/python/maps_generator/var/etc/map_generator.ini.default b/tools/python/maps_generator/var/etc/map_generator.ini.default index 69a91eca7a..0a04b70006 100644 --- a/tools/python/maps_generator/var/etc/map_generator.ini.default +++ b/tools/python/maps_generator/var/etc/map_generator.ini.default @@ -37,6 +37,8 @@ NEED_PLANET_UPDATE: 0 [External] +# Note: If you want to set a directory name you have to add "/" to the end of url. + # The url to the planet file. # PLANET_URL: # The url to the file with md5 sum of the planet.