From c1ca4c68b11863d2c348a346d6dbf0cbb326b751 Mon Sep 17 00:00:00 2001 From: Alexey Zakharenkov <35913079+alexey-zakharenkov@users.noreply.github.com> Date: Tue, 29 Dec 2020 12:32:33 +0300 Subject: [PATCH] More robust work with mwm size predictions, including prediction model limitations --- db/create_tables.sql | 3 +- web/app/auto_split.py | 18 ++-- web/app/borders_api.py | 8 +- web/app/borders_api_utils.py | 66 ++++++++------ web/app/config.py | 6 ++ web/app/mwm_size_predictor.py | 8 +- web/app/static/borders.js | 7 +- web/app/subregions.py | 156 ++++++++++++++++++---------------- 8 files changed, 158 insertions(+), 114 deletions(-) diff --git a/db/create_tables.sql b/db/create_tables.sql index b214ae7..3877c81 100644 --- a/db/create_tables.sql +++ b/db/create_tables.sql @@ -38,6 +38,7 @@ CREATE TABLE splitting ( subregion_ids BIGINT[] NOT NULL, mwm_size_est REAL NOT NULL, mwm_size_thr INTEGER NOT NULL, -- mwm size threshold in Kb, 4-bytes INTEGER is enough + next_level INTEGER NOT NULL, geom geometry NOT NULL ); -CREATE INDEX splitting_idx ON splitting (osm_border_id, mwm_size_thr); +CREATE INDEX splitting_idx ON splitting (osm_border_id, mwm_size_thr, next_level); diff --git a/web/app/auto_split.py b/web/app/auto_split.py index 2649663..56d2adb 100644 --- a/web/app/auto_split.py +++ b/web/app/auto_split.py @@ -12,9 +12,12 @@ from subregions import get_subregions_info class DisjointClusterUnion: """Disjoint set union implementation for administrative subregions.""" - def __init__(self, region_id, subregions, mwm_size_thr=None): + def __init__(self, region_id, subregions, next_level, mwm_size_thr=None): + assert all(s_data['mwm_size_est'] is not None + for s_data in subregions.values()) self.region_id = region_id self.subregions = subregions + self.next_level = next_level self.mwm_size_thr = mwm_size_thr or MWM_SIZE_THRESHOLD self.representatives = {sub_id: sub_id for sub_id in subregions} # A cluster is one or more subregions with common borders @@ -84,7 +87,8 @@ def get_best_cluster_to_join_with(small_cluster_id, for subregion_id in subregion_ids: for other_subregion_id, length in common_border_matrix[subregion_id].items(): other_cluster_id = dcu.find_cluster(other_subregion_id) - if other_cluster_id != small_cluster_id: + if (other_cluster_id != small_cluster_id and + not dcu.clusters[other_cluster_id]['finished']): common_borders[other_cluster_id] += length if not common_borders: return None @@ -144,8 +148,10 @@ def find_golden_splitting(conn, border_id, next_level, mwm_size_thr): next_level, need_cities=True) if not subregions: return + if any(s_data['mwm_size_est'] is None for s_data in subregions.values()): + return - dcu = DisjointClusterUnion(border_id, subregions, mwm_size_thr) + dcu = DisjointClusterUnion(border_id, subregions, next_level, mwm_size_thr) all_subregion_ids = dcu.get_all_subregion_ids() common_border_matrix = calculate_common_border_matrix(conn, all_subregion_ids) @@ -188,6 +194,7 @@ def save_splitting_to_db(conn, dcu: DisjointClusterUnion): DELETE FROM {autosplit_table} WHERE osm_border_id = {dcu.region_id} AND mwm_size_thr = {dcu.mwm_size_thr} + AND next_level = {dcu.next_level} """) for cluster_id, data in dcu.clusters.items(): subregion_ids = data['subregion_ids'] @@ -196,12 +203,13 @@ def save_splitting_to_db(conn, dcu: DisjointClusterUnion): ) cluster_geometry_sql = get_union_sql(subregion_ids) cursor.execute(f""" - INSERT INTO {autosplit_table} (osm_border_id, subregion_ids, - geom, mwm_size_thr, mwm_size_est) + INSERT INTO {autosplit_table} (osm_border_id, subregion_ids, geom, + next_level, mwm_size_thr, mwm_size_est) VALUES ( {dcu.region_id}, '{subregion_ids_array_str}', ({cluster_geometry_sql}), + {dcu.next_level}, {dcu.mwm_size_thr}, {data['mwm_size_est']} ) diff --git a/web/app/borders_api.py b/web/app/borders_api.py index ffe277b..90c09e1 100755 --- a/web/app/borders_api.py +++ b/web/app/borders_api.py @@ -413,11 +413,11 @@ def find_osm_borders(): def copy_from_osm(): osm_id = int(request.args.get('id')) name = request.args.get('name') - success = copy_region_from_osm(g.conn, osm_id, name) - if not success: - return jsonify(status=f"Region with id={osm_id} already exists") + errors, warnings = copy_region_from_osm(g.conn, osm_id, name) + if errors: + return jsonify(status='\n'.join(errors)) g.conn.commit() - return jsonify(status='ok') + return jsonify(status='ok', warnings=warnings) @app.route('/rename') diff --git a/web/app/borders_api_utils.py b/web/app/borders_api_utils.py index 0421b33..6e84a11 100644 --- a/web/app/borders_api_utils.py +++ b/web/app/borders_api_utils.py @@ -143,8 +143,9 @@ def get_clusters_for_preview_one(region_id, next_level, mwm_size_thr): where_clause = f""" osm_border_id = %s AND mwm_size_thr = %s + AND next_level = %s """ - splitting_sql_params = (region_id, mwm_size_thr) + splitting_sql_params = (region_id, mwm_size_thr, next_level) with g.conn.cursor() as cursor: cursor.execute(f""" SELECT 1 FROM {autosplit_table} @@ -231,9 +232,9 @@ def divide_region_into_subregions(conn, region_id, next_level): cursor.execute(f""" INSERT INTO {borders_table} (id, geom, name, parent_id, modified, count_k, mwm_size_est) - SELECT osm_id, way, name, {parent_id}, now(), -1, {mwm_size_est} + SELECT osm_id, way, name, {parent_id}, now(), -1, %s FROM {osm_table} - WHERE osm_id = %s""", (subregion_id,) + WHERE osm_id = %s""", (mwm_size_est, subregion_id,) ) if not is_admin_region: cursor.execute(f"DELETE FROM {borders_table} WHERE id = %s", (region_id,)) @@ -251,8 +252,9 @@ def divide_into_clusters(region_ids, next_level, mwm_size_thr): where_clause = f""" osm_border_id = %s AND mwm_size_thr = %s + AND next_level = %s """ - splitting_sql_params = (region_id, mwm_size_thr) + splitting_sql_params = (region_id, mwm_size_thr, next_level) cursor.execute(f""" SELECT 1 FROM {autosplit_table} WHERE {where_clause} @@ -269,24 +271,32 @@ def divide_into_clusters(region_ids, next_level, mwm_size_thr): """, splitting_sql_params ) if cursor.rowcount == 1: - continue - for rec in cursor: - subregion_ids = rec[0] - cluster_id = subregion_ids[0] - if len(subregion_ids) == 1: - subregion_id = cluster_id - name = get_osm_border_name_by_osm_id(g.conn, subregion_id) - else: - counter += 1 - free_id -= 1 - subregion_id = free_id - name = f"{base_name}_{counter}" insert_cursor.execute(f""" - INSERT INTO {borders_table} (id, name, parent_id, geom, modified, count_k, mwm_size_est) - SELECT {subregion_id}, %s, osm_border_id, geom, now(), -1, mwm_size_est - FROM {autosplit_table} WHERE subregion_ids[1] = %s AND {where_clause} - """, (name, cluster_id,) + splitting_sql_params - ) + UPDATE {borders_table} + SET modified = now(), + mwm_size_est = (SELECT mwm_size_est + FROM {autosplit_table} + WHERE {where_clause}) + WHERE id = {region_id} + """, splitting_sql_params) + else: + for rec in cursor: + subregion_ids = rec[0] + cluster_id = subregion_ids[0] + if len(subregion_ids) == 1: + subregion_id = cluster_id + name = get_osm_border_name_by_osm_id(g.conn, subregion_id) + else: + counter += 1 + free_id -= 1 + subregion_id = free_id + name = f"{base_name}_{counter}" + insert_cursor.execute(f""" + INSERT INTO {borders_table} (id, name, parent_id, geom, modified, count_k, mwm_size_est) + SELECT {subregion_id}, %s, osm_border_id, geom, now(), -1, mwm_size_est + FROM {autosplit_table} WHERE subregion_ids[1] = %s AND {where_clause} + """, (name, cluster_id,) + splitting_sql_params + ) g.conn.commit() return jsonify(status='ok') @@ -393,13 +403,16 @@ def find_potential_parents(region_id): def copy_region_from_osm(conn, region_id, name=None, parent_id='not_passed'): + errors, warnings = [], [] borders_table = main_borders_table with conn.cursor() as cursor: # Check if this id already in use - cursor.execute(f"SELECT id FROM {borders_table} WHERE id = %s", + cursor.execute(f"SELECT name FROM {borders_table} WHERE id = %s", (region_id,)) if cursor.rowcount > 0: - return False + name = cursor.fetchone()[0] + errors.append(f"Region with id={region_id} already exists under name '{name}'") + return errors, warnings name_expr = f"'{name}'" if name else "name" parent_id_expr = f"{parent_id}" if isinstance(parent_id, int) else "NULL" @@ -413,8 +426,11 @@ def copy_region_from_osm(conn, region_id, name=None, parent_id='not_passed'): ) if parent_id == 'not_passed': assign_region_to_lowest_parent(conn, region_id) - update_border_mwm_size_estimation(conn, region_id) - return True + try: + update_border_mwm_size_estimation(conn, region_id) + except Exception as e: + warnings.append(str(e)) + return errors, warnings def get_osm_border_name_by_osm_id(conn, osm_id): diff --git a/web/app/config.py b/web/app/config.py index 432f05b..7377eed 100644 --- a/web/app/config.py +++ b/web/app/config.py @@ -33,3 +33,9 @@ MWM_SIZE_THRESHOLD = 70*1024 # Estimated mwm size is predicted by the 'model.pkl' with 'scaler.pkl' for X MWM_SIZE_PREDICTION_MODEL_PATH = '/app/data/model.pkl' MWM_SIZE_PREDICTION_MODEL_SCALER_PATH = '/app/data/scaler.pkl' +MWM_SIZE_PREDICTION_MODEL_LIMITATIONS = { + 'area': 5500 * 1.5, + 'urban_pop': 3500000 * 1.5, + 'city_cnt': 32 * 1.5, + 'hamlet_cnt': 2120 * 1.5 +} diff --git a/web/app/mwm_size_predictor.py b/web/app/mwm_size_predictor.py index 4045635..abcd2f2 100644 --- a/web/app/mwm_size_predictor.py +++ b/web/app/mwm_size_predictor.py @@ -6,6 +6,8 @@ import config class MwmSizePredictor: + factors = ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt') + def __init__(self): with open(config.MWM_SIZE_PREDICTION_MODEL_PATH, 'rb') as f: self.model = pickle.load(f) @@ -20,9 +22,9 @@ class MwmSizePredictor: @classmethod def predict(cls, features_array): - """1D or 2D array of feature values for predictions. Features are - 'urban_pop', 'area', 'city_cnt', 'hamlet_cnt' as defined for the - prediction model. + """1D or 2D array of feature values for predictions. + Each feature is a list of values for factors + defined by 'cls.factors' sequence. """ X = np.array(features_array) one_prediction = (X.ndim == 1) diff --git a/web/app/static/borders.js b/web/app/static/borders.js index f407620..3ade574 100644 --- a/web/app/static/borders.js +++ b/web/app/static/borders.js @@ -316,8 +316,9 @@ function selectLayer(e) { $('#b_size').text( Math.round(props['count_k'] * BYTES_FOR_NODE / 1024 / 1024) + ' MB' ); - $('#pa_size').text(Math.round(props['mwm_size_est'] / 1024) + ' MB'); - //$('#b_nodes').text(borders[selectedId].layer.getLatLngs()[0].length); + var mwm_size_est = props['mwm_size_est']; + var mwm_size_est_text = mwm_size_est === null ? '-' : Math.round(props['mwm_size_est']/1024) + ' MB'; + $('#pa_size').text(mwm_size_est_text); $('#b_nodes').text(props['nodes']); $('#b_date').text(props['modified']); $('#b_area').text(L.Util.formatNum(props['area'] / 1000000, 2)); @@ -1114,7 +1115,7 @@ function bDivideDrawPreview(response) { var show_divide_button = (subregions.features.length > 1); if (clusters) { subregions_count_text += ', ' + clusters.features.length + ' кластеров'; - show_divide_button = (clusters.features.length > 1); + show_divide_button = (clusters.features.length > 0); } $('#d_count').text(subregions_count_text).show(); if (show_divide_button) diff --git a/web/app/subregions.py b/web/app/subregions.py index 0a281cf..317c25d 100644 --- a/web/app/subregions.py +++ b/web/app/subregions.py @@ -5,6 +5,8 @@ from config import ( BORDERS_TABLE as borders_table, OSM_TABLE as osm_table, OSM_PLACES_TABLE as osm_places_table, + + MWM_SIZE_PREDICTION_MODEL_LIMITATIONS, ) from mwm_size_predictor import MwmSizePredictor @@ -20,12 +22,11 @@ def get_subregions_info(conn, region_id, region_table, """ subregions = _get_subregions_basic_info(conn, region_id, region_table, next_level, need_cities) - _add_population_data(conn, subregions, need_cities) - _add_mwm_size_estimation(subregions) + _add_mwm_size_estimation(conn, subregions, need_cities) keys = ('name', 'mwm_size_est') if need_cities: keys = keys + ('cities',) - return {subregion_id: {k: subregion_data[k] for k in keys} + return {subregion_id: {k: subregion_data.get(k) for k in keys} for subregion_id, subregion_data in subregions.items() } @@ -51,100 +52,109 @@ def _get_subregions_basic_info(conn, region_id, region_table, 'osm_id': rec[0], 'name': rec[1], 'area': rec[2], - 'urban_pop': 0, - 'city_cnt': 0, - 'hamlet_cnt': 0 } - if need_cities: - subregion_data['cities'] = [] subregions[rec[0]] = subregion_data return subregions def _add_population_data(conn, subregions, need_cities): - if not subregions: + """Adds population data only for subregions that are suitable + for mwm size estimation. + """ + subregion_ids = [ + s_id for s_id, s_data in subregions.items() + if s_data['area'] <= MWM_SIZE_PREDICTION_MODEL_LIMITATIONS['area'] + ] + if not subregion_ids: return - cursor = conn.cursor() - subregion_ids = ','.join(str(x) for x in subregions.keys()) - cursor.execute(f""" - SELECT b.osm_id, p.name, coalesce(p.population, 0), p.place - FROM {osm_table} b, {osm_places_table} p - WHERE b.osm_id IN ({subregion_ids}) - AND ST_Contains(b.way, p.center) - """ - ) - for subregion_id, place_name, place_population, place_type in cursor: - subregion_data = subregions[subregion_id] - if place_type in ('city', 'town'): - subregion_data['city_cnt'] += 1 - subregion_data['urban_pop'] += place_population - if need_cities: - subregion_data['cities'].append({ - 'name': place_name, - 'population': place_population - }) - else: - subregion_data['hamlet_cnt'] += 1 + + for subregion_id, data in subregions.items(): + data.update({ + 'urban_pop': 0, + 'city_cnt': 0, + 'hamlet_cnt': 0 + }) + if need_cities: + data['cities'] = [] + + subregion_ids_str = ','.join(str(x) for x in subregion_ids) + with conn.cursor() as cursor: + cursor.execute(f""" + SELECT b.osm_id, p.name, coalesce(p.population, 0), p.place + FROM {osm_table} b, {osm_places_table} p + WHERE b.osm_id IN ({subregion_ids_str}) + AND ST_Contains(b.way, p.center) + """ + ) + for subregion_id, place_name, place_population, place_type in cursor: + subregion_data = subregions[subregion_id] + if place_type in ('city', 'town'): + subregion_data['city_cnt'] += 1 + subregion_data['urban_pop'] += place_population + if need_cities: + subregion_data['cities'].append({ + 'name': place_name, + 'population': place_population + }) + else: + subregion_data['hamlet_cnt'] += 1 -def _add_mwm_size_estimation(subregions): - if not subregions: - return - subregions_sorted = [ +def _add_mwm_size_estimation(conn, subregions, need_cities): + for subregion_data in subregions.values(): + subregion_data['mwm_size_est'] = None + + _add_population_data(conn, subregions, need_cities) + + subregions_to_predict = [ ( s_id, - [subregions[s_id][f] for f in - ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt')] + [subregions[s_id][f] for f in MwmSizePredictor.factors] ) for s_id in sorted(subregions.keys()) + if all(subregions[s_id].get(f) is not None and + subregions[s_id][f] <= + MWM_SIZE_PREDICTION_MODEL_LIMITATIONS[f] + for f in MwmSizePredictor.factors) ] - feature_array = [x[1] for x in subregions_sorted] + if not subregions_to_predict: + return + + feature_array = [x[1] for x in subregions_to_predict] predictions = MwmSizePredictor.predict(feature_array) for subregion_id, mwm_size_prediction in zip( - (x[0] for x in subregions_sorted), + (x[0] for x in subregions_to_predict), predictions ): subregions[subregion_id]['mwm_size_est'] = mwm_size_prediction def update_border_mwm_size_estimation(conn, border_id): - cursor = conn.cursor() - cursor.execute(f""" - SELECT name, ST_Area(geography(geom))/1.0E+6 area - FROM {borders_table} - WHERE id = %s""", (border_id, )) - name, area = cursor.fetchone() - if math.isnan(area): - raise Exception(f"Area is NaN for border '{name}' ({border_id})") - border_data = { - 'area': area, - 'urban_pop': 0, - 'city_cnt': 0, - 'hamlet_cnt': 0 - } - cursor.execute(f""" - SELECT coalesce(p.population, 0), p.place - FROM {borders_table} b, {osm_places_table} p - WHERE b.id = %s - AND ST_Contains(b.geom, p.center) - """, (border_id, )) - for place_population, place_type in cursor: - if place_type in ('city', 'town'): - border_data['city_cnt'] += 1 - border_data['urban_pop'] += place_population - else: - border_data['hamlet_cnt'] += 1 - - feature_array = [ - border_data[f] for f in - ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt') - ] - mwm_size_est = MwmSizePredictor.predict(feature_array) - cursor.execute(f"UPDATE {borders_table} SET mwm_size_est = %s WHERE id = %s", - (mwm_size_est, border_id)) - conn.commit() + with conn.cursor() as cursor: + cursor.execute(f""" + SELECT name, ST_Area(geography(geom))/1.0E+6 area + FROM {borders_table} + WHERE id = %s""", (border_id,)) + name, area = cursor.fetchone() + if math.isnan(area): + e = Exception(f"Area is NaN for border '{name}' ({border_id})") + raise e + border_data = { + 'area': area, + } + regions = {border_id: border_data} + _add_mwm_size_estimation(conn, regions, need_cities=False) + mwm_size_est = border_data.get('mwm_size_est') + # mwm_size_est may be None. Python's None is converted to NULL + # duging %s substitution in execute(). + cursor.execute(f""" + UPDATE {borders_table} + SET mwm_size_est = %s + WHERE id = %s + """, (mwm_size_est, border_id,)) + conn.commit() def is_administrative_region(conn, region_id):