Download Wikipedia articles' summaries only #2410
@@ -5,6 +5,7 @@ import linecache
 import multiprocessing
 import os
 import re
+import subprocess
 import sys
 from contextlib import contextmanager
 from distutils import log
@@ -409,12 +410,6 @@ class BuildOmimBindingCommand(build_ext, object):
         self.cmake_pybindings()
         super(BuildOmimBindingCommand, self).run()

-
-VERSIONS_LOCATIONS = {
-    'xcode/common.xcconfig': 'CURRENT_PROJECT_VERSION',
-    'android/gradle.properties': 'propVersionName',
-}
-
 PYBINDINGS = {
     'pygen': {
         'path': 'generator/pygen',
@@ -451,22 +446,10 @@ PYBINDINGS = {


 def get_version():
-    versions = []
-    for path, varname in VERSIONS_LOCATIONS.items():
-        with open(os.path.join(OMIM_ROOT, os.path.normpath(path))) as f:
-            for line in f:
-                match = re.search(
-                    r'^\s*{}\s*=\s*(?P<version>.*)'.format(varname),
-                    line.strip(),
-                )
-                if match:
-                    versions.append(LooseVersion(match.group('version')))
-                    break
-    code_version = max(versions)
-
-    env_version_addendum = os.environ.get('OMIM_SCM_VERSION', '')
-
-    return "{}{}".format(code_version, env_version_addendum)
+    return subprocess.check_output(
+        [os.path.join(OMIM_ROOT, 'tools', 'unix', 'version.sh'), 'android_code'],
+        universal_newlines=True,
+    ).strip(' \n\r')


 def transform_omim_requirement(requirement, omim_package_version):
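One behavioral difference versus the removed regex scan is worth noting: `subprocess.check_output` raises `CalledProcessError` on a non-zero exit, so a broken `version.sh` fails the build loudly instead of producing an empty version. A self-contained sketch of the pattern (the script path and mode argument here are placeholders; `tools/unix/version.sh` with `android_code` is the real pair in the diff):

```python
# Sketch of the pattern the new get_version() uses: shell out to a
# script and treat its stdout as the version string.
import subprocess

def version_from_script(script="./version.sh", mode="android_code"):
    # check_output raises CalledProcessError on a non-zero exit code.
    return subprocess.check_output(
        [script, mode],
        universal_newlines=True,  # decode stdout to str instead of bytes
    ).strip(" \n\r")
```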
@@ -14,7 +14,11 @@ from descriptions.descriptions_downloader import log
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
     parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+        "--output_dir",
+        metavar="PATH",
+        type=str,
+        required=True,
+        help="Output dir for saving pages",
     )
     parser.add_argument(
         "--popularity",
@@ -32,7 +36,10 @@ def parse_args():
         help="Input file with wikipedia url.",
     )
     parser.add_argument(
-        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
+        "--wikidata",
+        metavar="PATH",
+        type=str,
+        help="Input file with wikidata ids.",
     )
     parser.add_argument(
         "--langs",
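For context on the `--output_dir` change: with `required=True`, argparse itself rejects a missing flag instead of handing `None` to the downloader. A hypothetical standalone demo (not part of the PR):

```python
# Standalone demo of the required=True behavior; the argument
# definition mirrors the diff above.
import argparse

parser = argparse.ArgumentParser(description="Download wiki pages.")
parser.add_argument(
    "--output_dir",
    metavar="PATH",
    type=str,
    required=True,
    help="Output dir for saving pages",
)

# parser.parse_args([]) would now print
# "error: the following arguments are required: --output_dir"
# and exit with status 2, instead of returning args.output_dir == None.
args = parser.parse_args(["--output_dir", "/tmp/wiki"])
print(args.output_dir)  # /tmp/wiki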
@@ -47,6 +54,7 @@ def parse_args():


 def main():
     logging.basicConfig(level=logging.WARNING)
     log.setLevel(logging.WARNING)
+    wikipediaapi.log.setLevel(logging.WARNING)
     args = parse_args()
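Assuming `wikipediaapi.log` is the Wikipedia-API package's module-level logger (it is defined there as `logging.getLogger(__name__)`), the added line is equivalent to the following, and just silences the package's INFO/DEBUG chatter during bulk downloads while keeping warnings visible:

```python
# Equivalent form of the added line, assuming wikipediaapi.log is the
# package's module-level logger.
import logging

logging.getLogger("wikipediaapi").setLevel(logging.WARNING)
```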
@@ -138,7 +138,7 @@ def beautify_page(html, lang):
     for x in soup.find_all():
         if len(x.text.strip()) == 0:
             x.extract()
-    soup = remove_bad_sections(soup, lang)
+    # soup = remove_bad_sections(soup, lang)
     html = str(soup.prettify())
Why cut out all the sections? I actually read these articles, and pretty much everything in them seems interesting and useful. Wouldn't it be better to explicitly cut out only the irrelevant parts? What problem are we solving here?
pastk
commented
It leads to significant map size inflation (and it's enabled for 5 languages only so far!), though the added value for most users is small.
I doubt many users will read **full** wikipedia articles in a map app; it's not an offline wikipedia reader, after all.
Many more users will read short descriptions of POIs though (it's a kind of replacement for missing OSM descriptions), and given they take much less space, the relative added value will be much higher.
All articles I saw on the map were already stripped and contained only the summary, or the summary plus one or two other interesting and useful sections. That's why I'm asking how much, and what exactly, we can save with this patch.
If you have some "bad" examples that are too large and can easily be stripped because their sections are "unnecessary", let's check them and maybe add those sections to the exceptions.
pastk
commented
Check any capital or a significant sight, e.g.
https://en.wikipedia.org/wiki/Eiffel_Tower
https://en.wikipedia.org/wiki/Tbilisi
Section names are non-standard, so it would be impossible to gain big savings by such filtering. I just checked some articles in a smaller Russian city and found many sections of very little (or very specific) interest:
https://ru.wikipedia.org/wiki/Марийский_государственный_университет
https://ru.wikipedia.org/wiki/Йошкар-Олинская_ТЭЦ-2
https://ru.wikipedia.org/wiki/Йошкар-олинский_троллейбус
Checked in the app. I see only a few relevant sections in all the examples, except the cities. We don't store wiki articles in the world map now, so it's impossible to check their sections.
pastk
commented
I dunno what's useful about having sections listing a university's buildings and faculties, or the technical specs of the machinery used at a power station...
I don't think it's worth spending storage space and bandwidth on this special-interest stuff.
rtsisyk
commented
I don't worry about MWM size, to be honest. The current wikipedia content in the app is good.
     html = htmlmin.minify(html, remove_empty_space=True)
     return html
@@ -181,7 +181,8 @@ def download(directory, url):
         return None
     page = get_wiki_page(lang, page_name)
     try:
-        text = try_get(page, "text")
+        # text = try_get(page, "text")
+        text = try_get(page, "summary")
     except GettingError:
         log.exception(f"Error: page is not downloaded {page_name}.")
         return None
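The switch from `"text"` to `"summary"` is the heart of the PR: assuming `get_wiki_page()` wraps the Wikipedia-API package, the page object exposes both attributes, and only the lead section is kept. A rough sketch of the difference (the user agent string and page choice are illustrative; newer Wikipedia-API versions require an explicit user agent, older ones accept just `language=`):

```python
# Rough illustration of text vs summary in the Wikipedia-API package,
# assumed to back get_wiki_page(). The user_agent value is a placeholder.
import wikipediaapi

wiki = wikipediaapi.Wikipedia(
    user_agent="descriptions-downloader-demo/0.1",  # placeholder UA
    language="en",
)
page = wiki.page("Eiffel Tower")
print(len(page.text))     # full article body, every section
print(len(page.summary))  # lead section only -- what gets stored now
```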
@@ -236,7 +237,7 @@ def wikipedia_worker(output_dir, checker, langs):
         if not checker(ident):
             return
         url = url.strip()
-    except (AttributeError, IndexError):
+    except (AttributeError, IndexError, ValueError):
         log.exception(f"{line} is incorrect.")
         return
     parsed = urllib.parse.urlparse(url)
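On broadening the `except` tuple: the line parsing above this hunk (not shown in the diff) presumably unpacks tab-separated fields, and an unpacking mismatch raises `ValueError`, which the old tuple let propagate and kill the worker. A hypothetical repro:

```python
# Hypothetical repro of the newly caught case: unpacking a malformed
# input line raises ValueError, not AttributeError or IndexError.
line = "Q243"  # missing the expected tab-separated url field
try:
    ident, url = line.split("\t")  # ValueError: not enough values to unpack
    url = url.strip()
except (AttributeError, IndexError, ValueError):
    print(f"{line} is incorrect.")
```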
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 import sys

Shall I remove all the unused code, or keep it commented out in case we need it again?