Initial commit, debug version

This commit is contained in:
Ilya Zverev 2016-02-25 09:52:41 +03:00
commit 2017f9a251
7 changed files with 352 additions and 0 deletions

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
*.pyc
mapsme-state.txt
mapsme-changes.db
mapsme-process.log

224
server/mapsme-process.py Executable file
View file

@ -0,0 +1,224 @@
#!/usr/bin/python
import sys, os, urllib2, re, gzip
import peewee, json
from lxml import etree
from StringIO import StringIO
from datetime import datetime
path = os.path.dirname(sys.argv[0]) if len(sys.argv) < 2 else sys.argv[1]
database = peewee.SqliteDatabase(os.path.join(path, 'mapsme-changes.db'))
STATE_FILENAME = os.path.join(path, 'mapsme-state.txt')
REPLICATION_BASE_URL = 'http://planet.openstreetmap.org/replication/changesets'
API_ENDPOINT = 'https://api.openstreetmap.org/api/0.6'
MAIN_TAGS = ('amenity', 'shop', 'tourism', 'historic', 'craft', 'emergency', 'barrier', 'highway', 'entrance', 'building')
INTERESTING_TAGS = list(MAIN_TAGS) + ['name']
class Change(peewee.Model):
"""A model for the change. Just a single table."""
changeset = peewee.IntegerField()
user = peewee.CharField(max_length=250, index=True)
version = peewee.CharField(max_length=250)
timestamp = peewee.DateTimeField(index=True)
action = peewee.FixedCharField(max_length=1) # c, d, m, a
obj_type = peewee.FixedCharField(max_length=1, null=True)
obj_id = peewee.IntegerField(null=True)
main_tag = peewee.CharField(max_length=100, null=True)
address = peewee.BooleanField(default=False)
changes = peewee.TextField()
class Meta:
database = database
db_table = 'mapsme_change'
def download_last_state():
"""Downloads last changeset replication sequence number from the planet website."""
state = urllib2.urlopen(REPLICATION_BASE_URL + '/state.yaml').read()
m = re.search(r'sequence:\s+(\d+)', state)
# Not checking to throw exception in case of an error
return int(m.group(1))
def read_last_state():
state = None
try:
with open(STATE_FILENAME, 'r') as f:
m = re.search(r'\d+', f.read())
state = int(m.group(0))
except:
pass
return state
def write_last_state(state):
with open(STATE_FILENAME, 'w') as f:
f.write(str(state))
def filter_changeset(changeset):
"""A changeset object is a dict of tags plus 'id', 'timestamp' and 'user' fields."""
return 'created_by' in changeset and 'maps.me' in changeset['created_by'].lower()
def download_replication(state):
"""Downloads replication archive for a given state, and returns a list of changeset data to process."""
changesets = []
url = '{0}/{1:03}/{2:03}/{3:03}.osm.gz'.format(REPLICATION_BASE_URL, int(state / 1000000), int(state / 1000) % 1000, state % 1000)
response = urllib2.urlopen(url)
data = response.read()
gz = gzip.GzipFile(fileobj=StringIO(data))
chdata = {}
for event, element in etree.iterparse(gz, events=('start', 'end')):
if event == 'start':
if element.tag == 'changeset':
chdata = {}
elif element.tag == 'tag':
chdata[element.get('k')] = element.get('v').encode('utf-8')
elif event == 'end' and element.tag == 'changeset':
chdata['id'] = int(element.get('id'))
chdata['user'] = element.get('user').encode('utf-8')
chdata['timestamp'] = element.get('created_at')
if filter_changeset(chdata):
changesets.append(chdata)
return changesets
def obj_to_dict(obj):
"""Converts XML object to an easy to use dict."""
if obj is None:
return None
res = {}
res['type'] = obj.tag
res['id'] = int(obj.get('id'))
res['version'] = int(obj.get('version'))
res['deleted'] = obj.get('visible') == 'false'
if obj.tag == 'node' and 'lon' in obj.keys() and 'lat' in obj.keys():
res['coords'] = (obj.get('lon'), obj.get('lat'))
res['tags'] = { tag.get('k') : tag.get('v') for tag in obj.iterchildren('tag')}
if obj.tag == 'way':
res['refs'] = [x.get('ref') for x in obj.iterchildren('nd')]
elif obj.tag == 'relation':
res['refs'] = [(x.get('type'), x.get('ref'), x.get('role')) for x in obj.iterchildren('member')]
return res
def create_change(changeset, obj):
"""Creates a Change object, ready to be populated with changes."""
# Find the main tag
main = None
for k in MAIN_TAGS:
if k in obj['tags']:
main = '{0}={1}'.format(k, obj['tags'][k])
break
if main is None:
return None
ch = Change()
ch.changeset = changeset['id']
ch.user = changeset['user']
ch.timestamp = datetime.strptime(changeset['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
ch.version = changeset['created_by']
ch.obj_type = obj['type'][:1]
ch.obj_id = obj['id']
ch.main_tag = main
return ch
def has_address_tags(tags):
for k in tags:
if k[:5] == 'addr:':
return True
return False
def record_obj_diff(changeset, obj, prev, anomalies):
ch = None
if prev is None or prev['deleted']:
if not obj['deleted']:
# Creation
ch = create_change(changeset, obj)
if ch is not None:
ch.action = 'c'
ch.address = has_address_tags(obj['tags'])
coords = None if 'coords' not in obj else obj['coords']
ch.changes = json.dumps((coords, obj['tags']), ensure_ascii=False)
else:
anomalies[obj['type'][0] + 'c'] += 1
elif obj['deleted']:
# Deletion
ch = create_change(changeset, prev)
if ch is not None:
ch.action = 'd'
coords = None if 'coords' not in prev else prev['coords']
ch.changes = json.dumps((coords, prev['tags']), ensure_ascii=False)
else:
anomalies[prev['type'][0] + 'd'] += 1
else:
# Both objects are present, compare them
if 'coords' not in obj or obj['coords'] == prev['coords']:
coords = None
else:
coords = (prev['coords'], obj['coords'])
tags = {}
for k in prev['tags']:
new_val = None if k not in obj['tags'] else obj['tags'][k]
if k in INTERESTING_TAGS or prev['tags'][k] != new_val:
tags[k] = (prev['tags'][k], new_val)
for k in obj['tags']:
if k not in prev['tags']:
tags[k] = (None, obj['tags'][k])
ch = create_change(changeset, obj)
if ch is not None:
ch.action = 'm'
ch.address = has_address_tags(tags)
ch.changes = json.dumps((coords, tags), ensure_ascii=False)
else:
anomalies[prev['type'][0] + 'm'] += 1
if 'refs' in obj and obj['refs'] != prev['refs']:
anomalies['way_ref' if obj['type'] == 'way' else 'rel_ref'] += 1
if ch is not None:
ch.save()
def record_changeset_diff(changeset):
"""Received changeset data dict, downloads individual object changes and store changes to a database."""
response = urllib2.urlopen('{0}/changeset/{1}/download'.format(API_ENDPOINT, changeset['id']))
root = etree.parse(response).getroot()
anomalies = {}
for k in ('way_ref', 'rel_ref', 'nc', 'wc', 'rc', 'nm', 'wm', 'rm', 'nd', 'wd', 'rd'):
anomalies[k] = 0
for action in root:
for obj_xml in action:
obj = obj_to_dict(obj_xml)
if obj['version'] == 1:
prev = None
else:
response2 = urllib2.urlopen('{0}/{1}/{2}/{3}'.format(API_ENDPOINT, obj['type'], obj['id'], obj['version'] - 1))
prev = obj_to_dict(etree.parse(response2).getroot()[0])
record_obj_diff(changeset, obj, prev, anomalies)
if sum(anomalies.itervalues()) > 0:
ch = Change()
ch.changeset = changeset['id']
ch.user = changeset['user']
ch.timestamp = datetime.strptime(changeset['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
ch.version = changeset['created_by']
ch.action = 'a'
ch.changes = json.dumps(anomalies)
ch.save()
if __name__ == '__main__':
try:
cur_state = download_last_state()
except Exception as e:
print 'Failed to download last state:', e
sys.exit(1)
state = read_last_state()
if state is None:
state = cur_state - 1
database.connect()
database.create_tables([Change], safe=True)
for i in range(state + 1, cur_state + 1):
print i
try:
changesets = download_replication(i)
for c in changesets:
print '-', c
record_changeset_diff(c)
except Exception as e:
print 'Failed to download and process replication {0}: {1}'.format(i, e)
raise e
break
write_last_state(i)

4
www/config.py Normal file
View file

@ -0,0 +1,4 @@
DATABASE_PATH = '../server'
DEBUG = True
PAGE_SIZE = 30
TOP = 10

69
www/mmwatch.py Executable file
View file

@ -0,0 +1,69 @@
#!/usr/bin/python
import os, json, peewee
from flask import Flask, request, render_template
from flask.ext.compress import Compress
import config
app = Flask(__name__)
app.debug = config.DEBUG
Compress(app)
database = peewee.SqliteDatabase(os.path.join(config.DATABASE_PATH, 'mapsme-changes.db'))
class Change(peewee.Model):
"""A model for the change. Just a single table."""
changeset = peewee.IntegerField()
user = peewee.CharField(max_length=250, index=True)
version = peewee.CharField(max_length=250)
timestamp = peewee.DateTimeField(index=True)
action = peewee.FixedCharField(max_length=1) # c, d, m, a
obj_type = peewee.FixedCharField(max_length=1, null=True)
obj_id = peewee.IntegerField(null=True)
main_tag = peewee.CharField(max_length=100, null=True)
address = peewee.BooleanField(default=False)
changes = peewee.TextField()
class Meta:
database = database
db_table = 'mapsme_change'
@app.before_request
def before_request():
database.connect()
@app.teardown_request
def teardown(exception):
if not database.is_closed():
database.close()
@app.route('/')
def the_one_and_only_page():
changes = Change.select().order_by(Change.id.desc()).paginate(1, config.PAGE_SIZE)
users = Change.select(Change.user, peewee.fn.Count(Change.id).alias('count')).group_by(Change.user).order_by(peewee.fn.Count(Change.id).desc()).limit(config.TOP)
tags = Change.select(Change.main_tag, peewee.fn.Count(Change.id).alias('count')).group_by(Change.main_tag).order_by(peewee.fn.Count(Change.id).desc()).limit(config.TOP)
versions = Change.select(Change.version, peewee.fn.Count(Change.id).alias('count')).group_by(Change.version).order_by(peewee.fn.Count(Change.id).desc()).limit(config.TOP)
stat_src = Change.select(Change.action, Change.obj_type, peewee.fn.Count(Change.id).alias('count')).group_by(Change.action, Change.obj_type).order_by(peewee.fn.Count(Change.id).desc()).limit(config.TOP)
stats = {}
stats['created'] = stats['deleted'] = stats['modified'] = stats['anomalies'] = 0
stats['nodes'] = stats['ways'] = stats['relations'] = stats['total'] = 0
for stat in stat_src:
stats['total'] += stat.count
if stat.action == 'c':
stats['created'] += stat.count
elif stat.action == 'd':
stats['deleted'] += stat.count
elif stat.action == 'm':
stats['modified'] += stat.count
elif stat.action == 'a':
stats['anomalies'] += stat.count
if stat.obj_type == 'n':
stats['nodes'] += stat.count
elif stat.obj_type == 'w':
stats['ways'] += stat.count
elif stat.obj_type == 'r':
stats['relations'] += stat.count
return render_template('index.html', stats=stats, changes=changes, users=users, tags=tags, versions=versions)
if __name__ == '__main__':
app.run(threaded=True)

1
www/mmwatch.wsgi Normal file
View file

@ -0,0 +1 @@
from mmwatch import app as application

0
www/static/style.css Normal file
View file

50
www/templates/index.html Normal file
View file

@ -0,0 +1,50 @@
<!doctype html>
<title>MAPS.ME OSM Changes Browser</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
<body>
<h1>OSM Edits Made With MAPS.ME</h1>
<div id="stats">
<h2>Statistics</h2>
<ul>
<li>Total: {{ stats.total }}</li>
<li>Created: {{ stats.created }}</li>
<li>Deleted: {{ stats.deleted }}</li>
<li>Modified: {{ stats.modified }}</li>
<li>Anomalies: {{ stats.anomalies }}</li>
<li>Nodes: {{ stats.nodes }}</li>
<li>Ways: {{ stats.ways }}</li>
<li>Relations: {{stats.relations }}</li>
</ul>
</div>
<div>
<h2>Top Users</h2>
<ol>
{% for user in users %}
<li>{{ user.user }} ({{user.count}})</li>
{% endfor %}
</ol>
</div>
<div>
<h2>Top Main Tags</h2>
<ol>
{% for tag in tags %}
<li>{{ tag.main_tag }} ({{tag.count}})</li>
{% endfor %}
</ol>
</div>
<div>
<h2>Versions</h2>
<ol>
{% for v in versions %}
<li>{{ v.version }} ({{v.count}})</li>
{% endfor %}
</ol>
</div>
<div>
<h2>Changes</h2>
{% for change in changes %}
<div>
<a href="https://www.openstreetmap.org/user/{{ change.user }}">{{ change.user }}</a> at {{ change.timestamp }} in <a href="https://www.openstreetmap.org/changeset/{{ change.changeset }}">{{ change.changeset }}</a>: {{ change.main_tag }}
</div>
{% endfor %}
</div>