forked from organicmaps/organicmaps
Merge pull request #10051 from maksimandrianov/fixes
[generator] Remove remove_empty_sections, added popularity_checker
commit f07de9288b
1 changed file with 44 additions and 21 deletions
@@ -15,17 +15,39 @@ This script downloads Wikipedia pages for different languages.
 """
 log = logging.getLogger(__name__)
 
-WORKERS = 16
-CHUNK_SIZE = 64
+WORKERS = 80
+CHUNK_SIZE = 128
 
 HEADERS = {f"h{x}" for x in range(1,7)}
 BAD_SECTIONS = {
     "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
-    "ru": ["Литература", "Ссылки", "См. также"],
+    "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
     "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
 }
 
 
+def read_popularity(path):
+    """
+    :param path: path to a popularity file; each row is an '<id>,<rank>' pair.
+    :return: a set of popular object ids
+    """
+    ids = set()
+    for line in open(path):
+        try:
+            ident = int(line.split(",", maxsplit=1)[0])
+        except (AttributeError, IndexError):
+            continue
+        ids.add(ident)
+    return ids
+
+
+def should_download_wikipage(popularity_set):
+    @functools.wraps(popularity_set)
+    def wrapped(ident):
+        return popularity_set is None or ident in popularity_set
+    return wrapped
+
+
 def remove_bad_sections(soup, lang):
     if lang not in BAD_SECTIONS:
         return soup
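The two new helpers wire a popularity filter into the downloader. A minimal, self-contained sketch of their behavior (the ids are invented for illustration; note that a non-numeric id makes int() raise ValueError, which the except tuple above does not list, so this sketch catches it instead):

    def read_popularity(path):
        # Collect ids from '<id>,<rank>' rows, skipping malformed lines.
        ids = set()
        for line in open(path):
            try:
                ids.add(int(line.split(",", maxsplit=1)[0]))
            except ValueError:  # e.g. a header row or a garbled id
                continue
        return ids

    def should_download_wikipage(popularity_set):
        # None means "no popularity file was given": download everything.
        def wrapped(ident):
            return popularity_set is None or ident in popularity_set
        return wrapped

    checker = should_download_wikipage({42, 7})
    assert checker(42) and not checker(13)
    assert should_download_wikipage(None)(13)  # unfiltered mode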
@@ -48,25 +70,12 @@ def remove_bad_sections(soup, lang):
     return soup
 
 
-def remove_empty_sections(soup):
-    prev = None
-    for x in soup.find_all():
-        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
-            prev.extract()
-        prev = x
-
-    if prev is not None and prev.name in HEADERS:
-        prev.extract()
-    return soup
-
-
 def beautify_page(html, lang):
     soup = BeautifulSoup(html, "html")
     for x in soup.find_all():
         if len(x.text.strip()) == 0:
             x.extract()
 
-    soup = remove_empty_sections(soup)
     soup = remove_bad_sections(soup, lang)
     html = str(soup.prettify())
     html = htmlmin.minify(html, remove_empty_space=True)
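For reference, here is what the deleted remove_empty_sections() did: any header tag immediately followed by another header (or by nothing at all) introduces an empty section, so it was extracted. A runnable sketch with an illustrative snippet and an explicit parser choice:

    from bs4 import BeautifulSoup

    HEADERS = {f"h{x}" for x in range(1, 7)}
    soup = BeautifulSoup("<h2>Empty</h2><h2>Kept</h2><p>Body.</p><h3>Last</h3>",
                         "html.parser")

    prev = None
    for x in soup.find_all():  # find_all() returns a list, so extract() is safe here
        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
            prev.extract()  # header followed directly by a header: empty section
        prev = x
    if prev is not None and prev.name in HEADERS:
        prev.extract()  # trailing header with no body

    print(soup)  # <h2>Kept</h2><p>Body.</p>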
@@ -143,15 +152,21 @@ def download_all(path, url, langs):
         download(path, lang[1])
 
 
-def worker(output_dir, langs):
+def worker(output_dir, checker, langs):
     @functools.wraps(worker)
     def wrapped(line):
         if not line.strip():
             return
 
         try:
-            url = line.rsplit("\t", maxsplit=1)[-1]
+            (mwm_path, ident, url) = line.split("\t")
+            ident = int(ident)
+            if not checker(ident):
+                return
+            url = url.strip()
         except (AttributeError, IndexError):
             log.exception(f"{line} is incorrect.")
             return
-        url = url.strip()
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
         download_all(path, url, langs)
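The worker now expects three tab-separated columns per input line instead of reading only the trailing url field. A sketch of the new parsing with a hypothetical line (the .mwm name and id are invented); note that a wrong column count or a non-numeric id raises ValueError, which the except (AttributeError, IndexError) clause above would not catch:

    line = "Germany_Berlin.mwm\t123\thttps://en.wikipedia.org/wiki/Reichstag\n"

    (mwm_path, ident, url) = line.split("\t")  # ValueError if not exactly 3 fields
    ident = int(ident)                         # ValueError if the id is not numeric
    url = url.strip()                          # drops the trailing newline

    assert (mwm_path, ident, url) == (
        "Germany_Berlin.mwm", 123, "https://en.wikipedia.org/wiki/Reichstag")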
@@ -162,6 +177,10 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
     parser.add_argument("--o", metavar="PATH", type=str,
                         help="Output dir for saving pages")
+    parser.add_argument("--p", metavar="PATH", type=str,
+                        help="File with popular object ids for which we "
+                             "download wikipedia data. If not given, download "
+                             "for all objects.")
     parser.add_argument('--i', metavar="PATH", type=str, required=True,
                         help="Input file with wikipedia url.")
     parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
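With the optional --p flag in place, an invocation might look like this (script and file names are illustrative):

    python3 descriptions_downloader.py --i wiki_urls.txt --o ./descriptions --p popular_ids.csv --langs en ru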
@@ -177,13 +196,17 @@ def main():
     args = parse_args()
     input_file = args.i
     output_dir = args.o
+    popularity_file = args.p
     langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)
 
+    popularity_set = read_popularity(popularity_file) if popularity_file else None
+    if popularity_set:
+        log.info(f"Popularity set size: {len(popularity_set)}.")
+    checker = should_download_wikipage(popularity_set)
     with open(input_file) as file:
         _ = file.readline()
         pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, checker, langs), file, CHUNK_SIZE)
         pool.close()
         pool.join()
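worker() is called once up front and returns a closure, so pool.map can apply that one-argument callable to every input line. A toy demonstration of the wiring with stand-in functions (names and data are invented):

    from multiprocessing.pool import ThreadPool

    def worker(output_dir, checker, langs):
        def wrapped(line):
            ident = int(line.split("\t")[1])
            if checker(ident):
                print(f"would fetch id {ident} into {output_dir} for {langs}")
        return wrapped

    checker = lambda ident: ident in {1, 3}
    lines = ["a.mwm\t1\turl1\n", "b.mwm\t2\turl2\n", "c.mwm\t3\turl3\n"]
    pool = ThreadPool(processes=4)
    pool.map(worker("out", checker, ["en"]), lines, 1)  # third arg: chunk size, like CHUNK_SIZE
    pool.close()
    pool.join()
    # ids 1 and 3 pass the checker; id 2 is filtered out before any download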