[generator] Download Wikipedia articles' summaries only

Signed-off-by: Konstantin Pastbin <konstantin.pastbin@gmail.com>
Author: Konstantin Pastbin
Date:   2022-04-16 22:39:37 +03:00
parent 68c6da563f
commit a540b37979
2 changed files with 14 additions and 5 deletions


@@ -14,7 +14,11 @@ from descriptions.descriptions_downloader import log
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
     parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+        "--output_dir",
+        metavar="PATH",
+        type=str,
+        required=True,
+        help="Output dir for saving pages",
     )
     parser.add_argument(
         "--popularity",
@@ -32,7 +36,10 @@ def parse_args():
         help="Input file with wikipedia url.",
     )
     parser.add_argument(
-        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
+        "--wikidata",
+        metavar="PATH",
+        type=str,
+        help="Input file with wikidata ids.",
     )
     parser.add_argument(
         "--langs",
@@ -47,6 +54,7 @@ def parse_args():
 def main():
     logging.basicConfig(level=logging.WARNING)
     log.setLevel(logging.WARNING)
+    wikipediaapi.log.setLevel(logging.WARNING)
     args = parse_args()

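For context, a minimal standalone sketch of the two behaviours the script above now relies on (the /tmp/wiki path and the single-argument parser are illustrative, not from the repo):

import argparse
import logging

import wikipediaapi

logging.basicConfig(level=logging.WARNING)
# Wikipedia-API exposes a module-level logger (the diff references
# wikipediaapi.log); raising its level mutes per-request INFO/DEBUG
# chatter during bulk downloads.
wikipediaapi.log.setLevel(logging.WARNING)

parser = argparse.ArgumentParser(description="Download wiki pages.")
# With required=True, argparse exits with status 2 and
# "error: the following arguments are required: --output_dir"
# when the flag is omitted, instead of leaving args.output_dir as None.
parser.add_argument("--output_dir", metavar="PATH", type=str, required=True)

args = parser.parse_args(["--output_dir", "/tmp/wiki"])  # illustrative path
print(args.output_dir)  # -> /tmp/wiki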

@@ -138,7 +138,7 @@ def beautify_page(html, lang):
     for x in soup.find_all():
         if len(x.text.strip()) == 0:
             x.extract()
-    soup = remove_bad_sections(soup, lang)
+    # soup = remove_bad_sections(soup, lang)
     html = str(soup.prettify())
     html = htmlmin.minify(html, remove_empty_space=True)
     return html
@@ -181,7 +181,8 @@ def download(directory, url):
         return None
     page = get_wiki_page(lang, page_name)
     try:
-        text = try_get(page, "text")
+        # text = try_get(page, "text")
+        text = try_get(page, "summary")
     except GettingError:
         log.exception(f"Error: page is not downloaded {page_name}.")
         return None
@@ -236,7 +237,7 @@ def wikipedia_worker(output_dir, checker, langs):
             if not checker(ident):
                 return
             url = url.strip()
-        except (AttributeError, IndexError):
+        except (AttributeError, IndexError, ValueError):
             log.exception(f"{line} is incorrect.")
             return
         parsed = urllib.parse.urlparse(url)
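
A sketch of the core change in the downloader, using Wikipedia-API directly instead of the repo's try_get helper (the page title is illustrative, and the language-only Wikipedia() constructor matches library versions of this era; newer releases also expect a user_agent):

import wikipediaapi

wiki = wikipediaapi.Wikipedia("en")
page = wiki.page("Python (programming language)")

# page.text concatenates every section of the article, while
# page.summary is only the lead section, so switching try_get(page,
# "text") to try_get(page, "summary") shrinks both the API payload and
# each file written to disk.
print(len(page.summary), "<=", len(page.text))

# The broadened except in wikipedia_worker matters for input lines whose
# id field is present but not numeric: int() raises ValueError, which
# (AttributeError, IndexError) alone would not catch.
try:
    int("not-an-id")
except ValueError as err:
    print("caught:", err)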