Download Wikipedia articles' summaries only #2410

Closed
pastk wants to merge 2 commits from pastk-scripts into master
4 changed files with 20 additions and 28 deletions

View file

@ -5,6 +5,7 @@ import linecache
import multiprocessing
import os
import re
import subprocess
import sys
from contextlib import contextmanager
from distutils import log
@ -409,12 +410,6 @@ class BuildOmimBindingCommand(build_ext, object):
self.cmake_pybindings()
super(BuildOmimBindingCommand, self).run()
VERSIONS_LOCATIONS = {
'xcode/common.xcconfig': 'CURRENT_PROJECT_VERSION',
'android/gradle.properties': 'propVersionName',
}
PYBINDINGS = {
'pygen': {
'path': 'generator/pygen',
@ -451,22 +446,10 @@ PYBINDINGS = {
def get_version():
versions = []
for path, varname in VERSIONS_LOCATIONS.items():
with open(os.path.join(OMIM_ROOT, os.path.normpath(path))) as f:
for line in f:
match = re.search(
r'^\s*{}\s*=\s*(?P<version>.*)'.format(varname),
line.strip(),
)
if match:
versions.append(LooseVersion(match.group('version')))
break
code_version = max(versions)
env_version_addendum = os.environ.get('OMIM_SCM_VERSION', '')
return "{}{}".format(code_version, env_version_addendum)
return subprocess.check_output(
[os.path.join(OMIM_ROOT, 'tools', 'unix', 'version.sh'), 'android_code'],
universal_newlines=True,
).strip(' \n\r')
def transform_omim_requirement(requirement, omim_package_version):

View file

@ -14,7 +14,11 @@ from descriptions.descriptions_downloader import log
def parse_args():
parser = argparse.ArgumentParser(description="Download wiki pages.")
parser.add_argument(
"--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
"--output_dir",
metavar="PATH",
type=str,
required=True,
help="Output dir for saving pages",
)
parser.add_argument(
"--popularity",
@ -32,7 +36,10 @@ def parse_args():
help="Input file with wikipedia url.",
)
parser.add_argument(
"--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
"--wikidata",
metavar="PATH",
type=str,
help="Input file with wikidata ids.",
)
parser.add_argument(
"--langs",
@ -47,6 +54,7 @@ def parse_args():
def main():
logging.basicConfig(level=logging.WARNING)
log.setLevel(logging.WARNING)
wikipediaapi.log.setLevel(logging.WARNING)
args = parse_args()

View file

@ -138,7 +138,7 @@ def beautify_page(html, lang):
for x in soup.find_all():
if len(x.text.strip()) == 0:
x.extract()
soup = remove_bad_sections(soup, lang)
# soup = remove_bad_sections(soup, lang)
html = str(soup.prettify())
Review

shall I remove all unused code or keep it commented in case we need it again?

shall I remove all unused code or keep it commented in case we need it again?
biodranik commented 2022-04-16 21:33:02 +00:00 (Migrated from github.com)
Review

Зачем выпиливать все секции? Я вот читаю статьи, вроде реально всё интересно и нужно. Не лучше ли явно выпиливать только что-то нерелевантное? Какую проблему-то решаем?

Зачем выпиливать все секции? Я вот читаю статьи, вроде реально всё интересно и нужно. Не лучше ли явно выпиливать только что-то нерелевантное? Какую проблему-то решаем?
Review

It leads to significant map size inflation (and it's enabled for 5 languages only so far!), though the added value for most users is small.
I doubt many users will read full wikipedia articles in a map app; it's not an offline wikipedia reader after all.

Many more users will read short descriptions of the POIs though (it's a kind of replacement for missing OSM descriptions), and given that they take much less space, the relative added value will be much higher.

It leads to significant map size inflation (and it's enabled for 5 languages only so far!), though the added value for most users is small. I doubt many users will read **full** wikipedia articles in a map app; it's not an offline wikipedia reader after all. Many more users will read short descriptions of the POIs though (it's a kind of replacement for missing OSM descriptions), and given that they take much less space, the relative added value will be much higher.
biodranik commented 2022-04-17 10:51:29 +00:00 (Migrated from github.com)
Review

All articles I saw on the map were already stripped and contained only the summary or summary plus one or two other interesting and useful sections. That's why I'm asking, how much and what exactly can we save with this patch.

If you have some "bad" examples that are too large and can be easily stripped, because sections are "unnecessary", let's check them and maybe add these sections to exceptions.

All articles I saw on the map were already stripped and contained only the summary or summary plus one or two other interesting and useful sections. That's why I'm asking, how much and what exactly can we save with this patch. If you have some "bad" examples that are too large and can be easily stripped, because sections are "unnecessary", let's check them and maybe add these sections to exceptions.
Review

Check any capital or a significant sight, e.g.
https://en.wikipedia.org/wiki/Eiffel_Tower
https://en.wikipedia.org/wiki/Tbilisi

Section names are non-standard, so it would be impossible to gain big savings by such filtering. E.g. I just checked some articles in a smaller Russian city and found many sections of very little (or very specific) interest:
https://ru.wikipedia.org/wiki/Марийский_государственный_университет
https://ru.wikipedia.org/wiki/Йошкар-Олинская_ТЭЦ-2
https://ru.wikipedia.org/wiki/Йошкар-олинский_троллейбус

Check any capital or a significant sight, e.g. https://en.wikipedia.org/wiki/Eiffel_Tower https://en.wikipedia.org/wiki/Tbilisi Section names are non-standard, so it would be impossible to gain big savings by such filtering. E.g. I just checked some articles in a smaller Russian city and found many sections of very little (or very specific) interest: https://ru.wikipedia.org/wiki/Марийский_государственный_университет https://ru.wikipedia.org/wiki/Йошкар-Олинская_ТЭЦ-2 https://ru.wikipedia.org/wiki/Йошкар-олинский_троллейбус
biodranik commented 2022-04-17 11:57:18 +00:00 (Migrated from github.com)
Review

Checked in the app. I see only a few relevant sections in all examples, except cities. We don't store wiki articles in the world map now, so it's impossible to check their sections.

Checked in the app. I see only a few relevant sections in all examples, except cities. We don't store wiki articles in the world map now, so it's impossible to check their sections.
Review

I dunno, what's useful in having sections with lists of universities' buildings and faculties, or technical specs of the machinery used at an electrical station...
I don't think it's worth spending storage space and bandwidth on this stuff of special interest.

I dunno, what's useful in having sections with lists of universities' buildings and faculties, or technical specs of the machinery used at an electrical station... I don't think it's worth spending storage space and bandwidth on this stuff of special interest.
Review

I don't worry about MWM size, to be honest. Current wikipedia content in the app is good.

I don't worry about MWM size, to be honest. Current wikipedia content in the app is good.
html = htmlmin.minify(html, remove_empty_space=True)
return html
@ -181,7 +181,8 @@ def download(directory, url):
return None
page = get_wiki_page(lang, page_name)
try:
text = try_get(page, "text")
# text = try_get(page, "text")
text = try_get(page, "summary")
except GettingError:
log.exception(f"Error: page is not downloaded {page_name}.")
return None
@ -236,7 +237,7 @@ def wikipedia_worker(output_dir, checker, langs):
if not checker(ident):
return
url = url.strip()
except (AttributeError, IndexError):
except (AttributeError, IndexError, ValueError):
log.exception(f"{line} is incorrect.")
return
parsed = urllib.parse.urlparse(url)

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import os
import sys