[generator] Download Wikipedia articles' summaries only
Signed-off-by: Konstantin Pastbin <konstantin.pastbin@gmail.com>
parent 68c6da563f
commit a540b37979

2 changed files with 14 additions and 5 deletions
@@ -14,7 +14,11 @@ from descriptions.descriptions_downloader import log
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
     parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+        "--output_dir",
+        metavar="PATH",
+        type=str,
+        required=True,
+        help="Output dir for saving pages",
     )
     parser.add_argument(
         "--popularity",
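Note: the only behavioral change in this hunk is required=True; the rest reformats the call to one argument per line. With it, argparse aborts with a usage error when --output_dir is missing instead of handing None to downstream code. A standalone sketch of that behavior (not the project's code):

import argparse

parser = argparse.ArgumentParser(description="Download wiki pages.")
parser.add_argument(
    "--output_dir",
    metavar="PATH",
    type=str,
    required=True,
    help="Output dir for saving pages",
)

# parser.parse_args([]) now exits with:
#   error: the following arguments are required: --output_dir
args = parser.parse_args(["--output_dir", "descriptions"])
print(args.output_dir)  # -> descriptions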
@@ -32,7 +36,10 @@ def parse_args():
         help="Input file with wikipedia url.",
     )
     parser.add_argument(
-        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
+        "--wikidata",
+        metavar="PATH",
+        type=str,
+        help="Input file with wikidata ids.",
     )
     parser.add_argument(
         "--langs",
@@ -47,6 +54,7 @@ def parse_args():
 
 
 def main():
+    logging.basicConfig(level=logging.WARNING)
     log.setLevel(logging.WARNING)
     wikipediaapi.log.setLevel(logging.WARNING)
     args = parse_args()
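Note: setLevel only filters records on a logger; it does not attach a handler. The added logging.basicConfig(level=logging.WARNING) installs a root StreamHandler once, so WARNING-level records from log and wikipediaapi.log are reliably written out rather than depending on logging's last-resort handler. A minimal sketch, with an illustrative logger name:

import logging

logging.basicConfig(level=logging.WARNING)  # attach a root handler at startup
log = logging.getLogger("descriptions_downloader")  # name is illustrative
log.setLevel(logging.WARNING)

log.warning("emitted via the root handler")
log.info("filtered out: below WARNING")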
@@ -138,7 +138,7 @@ def beautify_page(html, lang):
     for x in soup.find_all():
         if len(x.text.strip()) == 0:
             x.extract()
-    soup = remove_bad_sections(soup, lang)
+    # soup = remove_bad_sections(soup, lang)
     html = str(soup.prettify())
     html = htmlmin.minify(html, remove_empty_space=True)
     return html
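Note: since the commit switches download() to summaries only (next hunk), there are no article sections left to prune, so remove_bad_sections is commented out while the generic cleanup (dropping empty tags, minifying) stays. A rough standalone sketch of that remaining pipeline, using bs4 and htmlmin as the original does; the function name is hypothetical:

import htmlmin
from bs4 import BeautifulSoup

def strip_and_minify(html):
    soup = BeautifulSoup(html, "html.parser")
    for x in soup.find_all():
        if len(x.text.strip()) == 0:
            x.extract()  # drop tags with no visible text
    html = str(soup.prettify())
    return htmlmin.minify(html, remove_empty_space=True)

print(strip_and_minify("<p>kept</p><p>   </p>"))  # empty <p> dropped, output minified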
@@ -181,7 +181,8 @@ def download(directory, url):
         return None
     page = get_wiki_page(lang, page_name)
     try:
-        text = try_get(page, "text")
+        # text = try_get(page, "text")
+        text = try_get(page, "summary")
     except GettingError:
         log.exception(f"Error: page is not downloaded {page_name}.")
         return None
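Note: this is the heart of the commit: try_get(page, "text") fetched the full article body, while try_get(page, "summary") keeps only the lead section, typically a small fraction of the page. Assuming try_get ultimately reads the attribute of that name from a wikipediaapi page object, a rough sketch of the difference:

import wikipediaapi

# wikipediaapi >= 0.6 requires a user_agent; older releases took the
# language as the first argument instead.
wiki = wikipediaapi.Wikipedia(
    user_agent="summary-size-demo/0.1 (example@example.com)",  # illustrative
    language="en",
    extract_format=wikipediaapi.ExtractFormat.HTML,
)
page = wiki.page("Python (programming language)")
print(len(page.summary))  # lead section only
print(len(page.text))     # full article body: much longer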
@@ -236,7 +237,7 @@ def wikipedia_worker(output_dir, checker, langs):
             if not checker(ident):
                 return
             url = url.strip()
-        except (AttributeError, IndexError):
+        except (AttributeError, IndexError, ValueError):
             log.exception(f"{line} is incorrect.")
             return
         parsed = urllib.parse.urlparse(url)
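Note: the broadened except presumably guards an int(...) conversion of the id column parsed from each tab-separated line; int() raises ValueError (not AttributeError or IndexError) on a malformed id. That conversion sits outside this hunk, so this reading is an inference. A sketch of the failure mode:

line = "https://en.wikipedia.org/wiki/Foo\tnot-a-number"
try:
    url, ident = line.split("\t")
    ident = int(ident)  # ValueError: invalid literal for int()
except (AttributeError, IndexError, ValueError):
    print(f"{line} is incorrect.")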