Fix duplicates algorithm

Ilya Zverev 2018-03-20 17:26:21 +03:00
parent 87c6ce1a70
commit 284f952576
3 changed files with 45 additions and 12 deletions


@@ -2,6 +2,13 @@
 ## master branch
+## 1.3.1
+_Released 2018-03-20_
+* "Similar tags" now means at least a 66% match instead of 50%.
+* Instead of removing all duplicates, they are now conflated, and only the unmatched ones are removed.
 ## 1.3.0
 _Released 2018-03-15_
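The second changelog bullet is the heart of this commit: near-duplicate dataset points are no longer deleted up front. Instead they share an "exclusive group", all of them take part in matching, and only the group members that end up unmatched are dropped afterwards. Below is a minimal standalone sketch of that idea with simplified dict inputs; the real code works on SourcePoint objects inside OsmConflator, as the hunks further down show, so treat the names here as illustrative only.

```python
from collections import defaultdict

def drop_unmatched_duplicates(dataset, exclusive_group):
    """Sketch: `dataset` maps id -> point for points still UNMATCHED after
    conflation (matched points were already removed from it);
    `exclusive_group` maps id -> group number shared by near-duplicates."""
    groups = defaultdict(set)
    for pid, gid in exclusive_group.items():
        groups[gid].add(pid)

    removed = 0
    for ids in groups.values():
        # A group counts as matched if any member is gone from `dataset`.
        found = any(pid not in dataset for pid in ids)
        for pid in ids:
            if pid in dataset:
                if found:
                    del dataset[pid]  # drop the unmatched leftovers
                    removed += 1
                else:
                    found = True  # nothing matched: keep exactly one member
    return removed
```

So for a group {A, B} where A was matched to an OSM object, the leftover B is dropped; if neither matched, one of the two is kept and uploaded as a new point.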


@@ -10,6 +10,7 @@ import re
 import os
 import sys
 from io import BytesIO
+from collections import defaultdict
 try:
     from .version import __version__
 except ImportError:
@@ -41,6 +42,7 @@ class SourcePoint:
         self.category = category
         self.dist_offset = 0
         self.remarks = remarks
+        self.exclusive_group = None

     def distance(self, other):
         """Calculate distance in meters."""
@@ -774,7 +776,7 @@ class OsmConflator:
             self.register_match(dist[0][1], osm_point.id)
             osm_kd = osm_kd.remove(osm_point)
             del dist[0]
-            for i in range(len(dist)-1, -1, -1):
+            for i in reversed(range(len(dist))):
                 if dist[i][2] == osm_point:
                     nearest, distance = search_nn_fix(osm_kd, self.dataset[dist[i][1]])
                     if nearest and distance <= max_distance:
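The only change in this hunk swaps the explicit backwards index loop for the equivalent reversed(range(...)) spelling. Walking the list from the end matters because entries are deleted in place: removing index i only shifts higher, already-visited indices. A throwaway check of the equivalence (standalone toy data, not the conflator's structures):

```python
dist = ['a', 'b', 'c', 'd']

# Both spellings yield the same reverse index order.
assert list(range(len(dist) - 1, -1, -1)) == list(reversed(range(len(dist)))) == [3, 2, 1, 0]

for i in reversed(range(len(dist))):
    if dist[i] in ('b', 'd'):
        del dist[i]  # safe: only higher, already-visited indices shift
print(dist)  # ['a', 'c']
```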
@@ -811,9 +813,34 @@
         if count_created > 0:
             logging.info('Created %s audit-overridden dataset points', count_created)

+        # Prepare exclusive groups dict
+        exclusive_groups = defaultdict(set)
+        for p, v in self.dataset.items():
+            if v.exclusive_group is not None:
+                exclusive_groups[v.exclusive_group].add(p)
+
         # Then find matches for unmatched dataset points
         self.match_dataset_points_smart()

+        # Remove unmatched duplicates
+        count_duplicates = 0
+        for ids in exclusive_groups.values():
+            found = False
+            for p in ids:
+                if p not in self.dataset:
+                    found = True
+                    break
+            for p in ids:
+                if p in self.dataset:
+                    if found:
+                        count_duplicates += 1
+                        del self.dataset[p]
+                    else:
+                        # Keep one element when none matched
+                        found = True
+        if count_duplicates > 0:
+            logging.info('Removed %s unmatched duplicates', count_duplicates)
+
         # Add unmatched dataset points
         logging.info('Adding %s unmatched dataset points', len(self.dataset))
         for k in sorted(list(self.dataset.keys())):
@@ -1054,31 +1081,30 @@ def check_dataset_for_duplicates(profile, dataset, print_all=False):
     diff_tags = [k for k in tags if tags[k] == '---']
     kd = kdtree.create(list(dataset))
     duplicates = set()
+    group = 0
     for d in dataset:
         if d.id in duplicates:
             continue
+        group += 1
         for alt, _ in kd.search_knn(d, 3):  # The first one will be equal to d
-            if alt.data.id != d.id and alt.data.distance(d) < max_distance:
+            dist = alt.data.distance(d)
+            if alt.data.id != d.id and dist < max_distance:
                 tags_differ = 0
-                if alt.data.distance(d) > uncond_distance:
+                if dist > uncond_distance:
                     for k in diff_tags:
                         if alt.data.tags.get(k) != d.tags.get(k):
                             tags_differ += 1
-                if tags_differ <= len(diff_tags) / 2:
+                if tags_differ <= max(1, len(diff_tags) / 3):
                     duplicates.add(alt.data.id)
+                    d.exclusive_group = group
+                    alt.data.exclusive_group = group
                     if print_all or len(duplicates) <= 5:
                         is_duplicate = tags_differ <= 1
                         logging.error('Dataset points %s: %s and %s',
                                       'duplicate each other' if is_duplicate else 'are too similar',
                                       d.id, alt.data.id)
     if duplicates:
-        remove = profile.get('remove_duplicates', True)
-        if remove:
-            for i in reversed(range(len(dataset))):
-                if dataset[i].id in duplicates:
-                    del dataset[i]
-        logging.error('%s %s duplicates from the dataset',
-                      'Removed' if remove else 'Found', len(duplicates))
+        logging.error('Found %s duplicates in the dataset', len(duplicates))
     if found_duplicate_ids:
         raise KeyError('Cannot continue with duplicate ids')
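This threshold change is what the changelog's "66% instead of 50%" refers to: once two nearby points are farther apart than the unconditional-duplicate distance, their comparable tags are checked, and they now count as similar when at most max(1, len(diff_tags) / 3) of those tags differ, where the old rule allowed half. A small illustration with made-up tag names; the real check runs inside check_dataset_for_duplicates, shown above.

```python
def looks_similar(diff_tags, tags_a, tags_b):
    """Sketch of the new similarity rule."""
    tags_differ = sum(1 for k in diff_tags if tags_a.get(k) != tags_b.get(k))
    return tags_differ <= max(1, len(diff_tags) / 3)

# With three comparable tags, at most one may differ (roughly a 66% match).
diff_tags = ['name', 'operator', 'opening_hours']
a = {'name': 'Shop', 'operator': 'ACME', 'opening_hours': '24/7'}
b = {'name': 'Shop', 'operator': 'ACME', 'opening_hours': 'Mo-Fr 09:00-18:00'}
c = {'name': 'Kiosk', 'operator': 'Other', 'opening_hours': '24/7'}
print(looks_similar(diff_tags, a, b))  # True: 1 of 3 tags differs
print(looks_similar(diff_tags, a, c))  # False: 2 of 3 tags differ
```

Points that pass this check are also stamped with the same exclusive_group number, which is what the unmatched-duplicate removal step in OsmConflator (earlier hunk) keys on.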


@@ -1 +1 @@
-__version__ = '1.3.0'
+__version__ = '1.3.1'