More robust work with mwm size predictions, including prediction model limitations

2020-12-29 12:32:33 +03:00 · 2020-12-29 12:32:33 +03:00 · c1ca4c68b1
commit c1ca4c68b1
parent a944aee15c
8 changed files with 158 additions and 114 deletions
--- a/db/create_tables.sql
+++ b/db/create_tables.sql
@ -38,6 +38,7 @@ CREATE TABLE splitting (
    subregion_ids BIGINT[] NOT NULL,
    mwm_size_est REAL NOT NULL,
    mwm_size_thr INTEGER NOT NULL, -- mwm size threshold in Kb, 4-bytes INTEGER is enough
+    next_level INTEGER NOT NULL,
    geom geometry NOT NULL
 );
-CREATE INDEX splitting_idx ON splitting (osm_border_id, mwm_size_thr);
+CREATE INDEX splitting_idx ON splitting (osm_border_id, mwm_size_thr, next_level);
--- a/web/app/auto_split.py
+++ b/web/app/auto_split.py
@ -12,9 +12,12 @@ from subregions import get_subregions_info
 class DisjointClusterUnion:
    """Disjoint set union implementation for administrative subregions."""

-    def __init__(self, region_id, subregions, mwm_size_thr=None):
+    def __init__(self, region_id, subregions, next_level, mwm_size_thr=None):
+        assert all(s_data['mwm_size_est'] is not None
+                    for s_data in subregions.values())
        self.region_id = region_id
        self.subregions = subregions
+        self.next_level = next_level
        self.mwm_size_thr = mwm_size_thr or MWM_SIZE_THRESHOLD
        self.representatives = {sub_id: sub_id for sub_id in subregions}
        # A cluster is one or more subregions with common borders
@ -84,7 +87,8 @@ def get_best_cluster_to_join_with(small_cluster_id,
    for subregion_id in subregion_ids:
        for other_subregion_id, length in common_border_matrix[subregion_id].items():
            other_cluster_id = dcu.find_cluster(other_subregion_id)
-            if other_cluster_id != small_cluster_id:
+            if (other_cluster_id != small_cluster_id and
+                    not dcu.clusters[other_cluster_id]['finished']):
                common_borders[other_cluster_id] += length
    if not common_borders:
        return None
@ -144,8 +148,10 @@ def find_golden_splitting(conn, border_id, next_level, mwm_size_thr):
                                     next_level, need_cities=True)
    if not subregions:
        return
+    if any(s_data['mwm_size_est'] is None for s_data in subregions.values()):
+        return

-    dcu = DisjointClusterUnion(border_id, subregions, mwm_size_thr)
+    dcu = DisjointClusterUnion(border_id, subregions, next_level, mwm_size_thr)
    all_subregion_ids = dcu.get_all_subregion_ids()
    common_border_matrix = calculate_common_border_matrix(conn, all_subregion_ids)

@ -188,6 +194,7 @@ def save_splitting_to_db(conn, dcu: DisjointClusterUnion):
            DELETE FROM {autosplit_table}
            WHERE osm_border_id = {dcu.region_id}
              AND mwm_size_thr = {dcu.mwm_size_thr}
+              AND next_level = {dcu.next_level}
            """)
        for cluster_id, data in dcu.clusters.items():
            subregion_ids = data['subregion_ids']
@ -196,12 +203,13 @@ def save_splitting_to_db(conn, dcu: DisjointClusterUnion):
            )
            cluster_geometry_sql = get_union_sql(subregion_ids)
            cursor.execute(f"""
-                INSERT INTO {autosplit_table} (osm_border_id, subregion_ids,
-                                               geom, mwm_size_thr, mwm_size_est)
+                INSERT INTO {autosplit_table} (osm_border_id, subregion_ids, geom,
+                                               next_level, mwm_size_thr, mwm_size_est)
                  VALUES (
                    {dcu.region_id},
                    '{subregion_ids_array_str}',
                    ({cluster_geometry_sql}),
+                    {dcu.next_level},
                    {dcu.mwm_size_thr},
                    {data['mwm_size_est']}
                  )
--- a/web/app/borders_api.py
+++ b/web/app/borders_api.py
@ -413,11 +413,11 @@ def find_osm_borders():
 def copy_from_osm():
    osm_id = int(request.args.get('id'))
    name = request.args.get('name')
-    success = copy_region_from_osm(g.conn, osm_id, name)
-    if not success:
-        return jsonify(status=f"Region with id={osm_id} already exists")
+    errors, warnings = copy_region_from_osm(g.conn, osm_id, name)
+    if errors:
+        return jsonify(status='\n'.join(errors))
    g.conn.commit()
-    return jsonify(status='ok')
+    return jsonify(status='ok', warnings=warnings)


@app.route('/rename')
--- a/web/app/borders_api_utils.py
+++ b/web/app/borders_api_utils.py
@ -143,8 +143,9 @@ def get_clusters_for_preview_one(region_id, next_level, mwm_size_thr):
    where_clause = f"""
        osm_border_id = %s
        AND mwm_size_thr = %s
+        AND next_level = %s
        """
-    splitting_sql_params = (region_id, mwm_size_thr)
+    splitting_sql_params = (region_id, mwm_size_thr, next_level)
    with g.conn.cursor() as cursor:
        cursor.execute(f"""
            SELECT 1 FROM {autosplit_table}
@ -231,9 +232,9 @@ def divide_region_into_subregions(conn, region_id, next_level):
            cursor.execute(f"""
                INSERT INTO {borders_table}
                    (id, geom, name, parent_id, modified, count_k, mwm_size_est)
-                SELECT osm_id, way, name, {parent_id}, now(), -1, {mwm_size_est}
+                SELECT osm_id, way, name, {parent_id}, now(), -1, %s
                FROM {osm_table}
-                WHERE osm_id  = %s""", (subregion_id,)
+                WHERE osm_id  = %s""", (mwm_size_est, subregion_id,)
            )
        if not is_admin_region:
            cursor.execute(f"DELETE FROM {borders_table} WHERE id = %s", (region_id,))
@ -251,8 +252,9 @@ def divide_into_clusters(region_ids, next_level, mwm_size_thr):
        where_clause = f"""
            osm_border_id = %s
            AND mwm_size_thr = %s
+            AND next_level = %s
            """
-        splitting_sql_params = (region_id, mwm_size_thr)
+        splitting_sql_params = (region_id, mwm_size_thr, next_level)
        cursor.execute(f"""
            SELECT 1 FROM {autosplit_table}
            WHERE {where_clause}
@ -269,24 +271,32 @@ def divide_into_clusters(region_ids, next_level, mwm_size_thr):
            """, splitting_sql_params
        )
        if cursor.rowcount == 1:
-            continue
-        for rec in cursor:
-            subregion_ids = rec[0]
-            cluster_id = subregion_ids[0]
-            if len(subregion_ids) == 1:
-                subregion_id = cluster_id
-                name = get_osm_border_name_by_osm_id(g.conn, subregion_id)
-            else:
-                counter += 1
-                free_id -= 1
-                subregion_id = free_id
-                name = f"{base_name}_{counter}"
            insert_cursor.execute(f"""
-                INSERT INTO {borders_table} (id, name, parent_id, geom, modified, count_k, mwm_size_est)
-                SELECT {subregion_id}, %s, osm_border_id, geom, now(), -1, mwm_size_est
-                FROM {autosplit_table} WHERE subregion_ids[1] = %s AND {where_clause}
-                """, (name, cluster_id,) + splitting_sql_params
-            )
+                UPDATE {borders_table}
+                SET modified = now(),
+                    mwm_size_est = (SELECT mwm_size_est
+                                    FROM {autosplit_table}
+                                    WHERE {where_clause})
+                WHERE id = {region_id}
+                """, splitting_sql_params)
+        else:
+            for rec in cursor:
+                subregion_ids = rec[0]
+                cluster_id = subregion_ids[0]
+                if len(subregion_ids) == 1:
+                    subregion_id = cluster_id
+                    name = get_osm_border_name_by_osm_id(g.conn, subregion_id)
+                else:
+                    counter += 1
+                    free_id -= 1
+                    subregion_id = free_id
+                    name = f"{base_name}_{counter}"
+                insert_cursor.execute(f"""
+                    INSERT INTO {borders_table} (id, name, parent_id, geom, modified, count_k, mwm_size_est)
+                    SELECT {subregion_id}, %s, osm_border_id, geom, now(), -1, mwm_size_est
+                    FROM {autosplit_table} WHERE subregion_ids[1] = %s AND {where_clause}
+                    """, (name, cluster_id,) + splitting_sql_params
+                )
    g.conn.commit()
    return jsonify(status='ok')

@ -393,13 +403,16 @@ def find_potential_parents(region_id):


 def copy_region_from_osm(conn, region_id, name=None, parent_id='not_passed'):
+    errors, warnings = [], []
    borders_table = main_borders_table
    with conn.cursor() as cursor:
        # Check if this id already in use
-        cursor.execute(f"SELECT id FROM {borders_table} WHERE id = %s",
+        cursor.execute(f"SELECT name FROM {borders_table} WHERE id = %s",
                       (region_id,))
        if cursor.rowcount > 0:
-            return False
+            name = cursor.fetchone()[0]
+            errors.append(f"Region with id={region_id} already exists under name '{name}'")
+            return errors, warnings

        name_expr = f"'{name}'" if name else "name"
        parent_id_expr = f"{parent_id}" if isinstance(parent_id, int) else "NULL"
@ -413,8 +426,11 @@ def copy_region_from_osm(conn, region_id, name=None, parent_id='not_passed'):
        )
        if parent_id == 'not_passed':
            assign_region_to_lowest_parent(conn, region_id)
-        update_border_mwm_size_estimation(conn, region_id)
-        return True
+        try:
+            update_border_mwm_size_estimation(conn, region_id)
+        except Exception as e:
+            warnings.append(str(e))
+        return errors, warnings


 def get_osm_border_name_by_osm_id(conn, osm_id):
--- a/web/app/config.py
+++ b/web/app/config.py
@ -33,3 +33,9 @@ MWM_SIZE_THRESHOLD = 70*1024
 # Estimated mwm size is predicted by the 'model.pkl' with 'scaler.pkl' for X
 MWM_SIZE_PREDICTION_MODEL_PATH = '/app/data/model.pkl'
 MWM_SIZE_PREDICTION_MODEL_SCALER_PATH = '/app/data/scaler.pkl'
+MWM_SIZE_PREDICTION_MODEL_LIMITATIONS = {
+    'area': 5500 * 1.5,
+    'urban_pop': 3500000 * 1.5,
+    'city_cnt': 32 * 1.5,
+    'hamlet_cnt': 2120 * 1.5
+}
--- a/web/app/mwm_size_predictor.py
+++ b/web/app/mwm_size_predictor.py
@ -6,6 +6,8 @@ import config

 class MwmSizePredictor:

+    factors = ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt')
+
    def __init__(self):
        with open(config.MWM_SIZE_PREDICTION_MODEL_PATH, 'rb') as f:
            self.model = pickle.load(f)
@ -20,9 +22,9 @@ class MwmSizePredictor:

    @classmethod
    def predict(cls, features_array):
-        """1D or 2D array of feature values for predictions. Features are
-        'urban_pop', 'area', 'city_cnt', 'hamlet_cnt' as defined for the
-        prediction model.
+        """1D or 2D array of feature values for predictions.
+        Each feature is a list of values for factors
+        defined by 'cls.factors' sequence.
        """
        X = np.array(features_array)
        one_prediction = (X.ndim == 1)
--- a/web/app/static/borders.js
+++ b/web/app/static/borders.js
@ -316,8 +316,9 @@ function selectLayer(e) {
        $('#b_size').text(
            Math.round(props['count_k'] * BYTES_FOR_NODE / 1024 / 1024) + ' MB'
        );
-        $('#pa_size').text(Math.round(props['mwm_size_est'] / 1024) + ' MB');
-        //$('#b_nodes').text(borders[selectedId].layer.getLatLngs()[0].length);
+        var mwm_size_est = props['mwm_size_est'];
+        var mwm_size_est_text = mwm_size_est === null ? '-' : Math.round(props['mwm_size_est']/1024) + ' MB';
+        $('#pa_size').text(mwm_size_est_text);
        $('#b_nodes').text(props['nodes']);
        $('#b_date').text(props['modified']);
        $('#b_area').text(L.Util.formatNum(props['area'] / 1000000, 2));
@ -1114,7 +1115,7 @@ function bDivideDrawPreview(response) {
    var show_divide_button = (subregions.features.length > 1);
    if (clusters) {
        subregions_count_text += ', ' + clusters.features.length + ' кластеров';
-        show_divide_button = (clusters.features.length > 1);
+        show_divide_button = (clusters.features.length > 0);
    }
    $('#d_count').text(subregions_count_text).show();
    if (show_divide_button)
--- a/web/app/subregions.py
+++ b/web/app/subregions.py
@ -5,6 +5,8 @@ from config import (
    BORDERS_TABLE as borders_table,
    OSM_TABLE as osm_table,
    OSM_PLACES_TABLE as osm_places_table,
+
+    MWM_SIZE_PREDICTION_MODEL_LIMITATIONS,
 )
 from mwm_size_predictor import MwmSizePredictor

@ -20,12 +22,11 @@ def get_subregions_info(conn, region_id, region_table,
    """
    subregions = _get_subregions_basic_info(conn, region_id, region_table,
                                            next_level, need_cities)
-    _add_population_data(conn, subregions, need_cities)
-    _add_mwm_size_estimation(subregions)
+    _add_mwm_size_estimation(conn, subregions, need_cities)
    keys = ('name', 'mwm_size_est')
    if need_cities:
        keys = keys + ('cities',)
-    return {subregion_id: {k: subregion_data[k] for k in keys}
+    return {subregion_id: {k: subregion_data.get(k) for k in keys}
            for subregion_id, subregion_data in subregions.items()
    }

@ -51,100 +52,109 @@ def _get_subregions_basic_info(conn, region_id, region_table,
            'osm_id': rec[0],
            'name': rec[1],
            'area': rec[2],
-            'urban_pop': 0,
-            'city_cnt': 0,
-            'hamlet_cnt': 0
        }
-        if need_cities:
-            subregion_data['cities'] = []
        subregions[rec[0]] = subregion_data
    return subregions


 def _add_population_data(conn, subregions, need_cities):
-    if not subregions:
+    """Adds population data only for subregions that are suitable
+    for mwm size estimation.
+    """
+    subregion_ids = [
+        s_id for s_id, s_data in subregions.items()
+        if s_data['area'] <= MWM_SIZE_PREDICTION_MODEL_LIMITATIONS['area']
+    ]
+    if not subregion_ids:
        return
-    cursor = conn.cursor()
-    subregion_ids = ','.join(str(x) for x in subregions.keys())
-    cursor.execute(f"""
-        SELECT b.osm_id, p.name, coalesce(p.population, 0), p.place
-        FROM {osm_table} b, {osm_places_table} p
-        WHERE b.osm_id IN ({subregion_ids})
-            AND ST_Contains(b.way, p.center)
-        """
-    )
-    for subregion_id, place_name, place_population, place_type in cursor:
-        subregion_data = subregions[subregion_id]
-        if place_type in ('city', 'town'):
-            subregion_data['city_cnt'] += 1
-            subregion_data['urban_pop'] += place_population
-            if need_cities:
-                subregion_data['cities'].append({
-                    'name': place_name,
-                    'population': place_population
-                })
-        else:
-            subregion_data['hamlet_cnt'] += 1
+
+    for subregion_id, data in subregions.items():
+        data.update({
+            'urban_pop': 0,
+            'city_cnt': 0,
+            'hamlet_cnt': 0
+        })
+        if need_cities:
+            data['cities'] = []
+
+    subregion_ids_str = ','.join(str(x) for x in subregion_ids)
+    with conn.cursor() as cursor:
+        cursor.execute(f"""
+            SELECT b.osm_id, p.name, coalesce(p.population, 0), p.place
+            FROM {osm_table} b, {osm_places_table} p
+            WHERE b.osm_id IN ({subregion_ids_str})
+                AND ST_Contains(b.way, p.center)
+            """
+        )
+        for subregion_id, place_name, place_population, place_type in cursor:
+            subregion_data = subregions[subregion_id]
+            if place_type in ('city', 'town'):
+                subregion_data['city_cnt'] += 1
+                subregion_data['urban_pop'] += place_population
+                if need_cities:
+                    subregion_data['cities'].append({
+                        'name': place_name,
+                        'population': place_population
+                    })
+            else:
+                subregion_data['hamlet_cnt'] += 1


-def _add_mwm_size_estimation(subregions):
-    if not subregions:
-        return
-    subregions_sorted = [
+def _add_mwm_size_estimation(conn, subregions, need_cities):
+    for subregion_data in subregions.values():
+        subregion_data['mwm_size_est'] = None
+
+    _add_population_data(conn, subregions, need_cities)
+
+    subregions_to_predict = [
        (
            s_id,
-            [subregions[s_id][f] for f in
-                ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt')]
+            [subregions[s_id][f] for f in MwmSizePredictor.factors]
        )
        for s_id in sorted(subregions.keys())
+            if all(subregions[s_id].get(f) is not None and
+                   subregions[s_id][f] <=
+                        MWM_SIZE_PREDICTION_MODEL_LIMITATIONS[f]
+                   for f in MwmSizePredictor.factors)
    ]

-    feature_array = [x[1] for x in subregions_sorted]
+    if not subregions_to_predict:
+        return
+
+    feature_array = [x[1] for x in subregions_to_predict]
    predictions = MwmSizePredictor.predict(feature_array)

    for subregion_id, mwm_size_prediction in zip(
-        (x[0] for x in subregions_sorted),
+        (x[0] for x in subregions_to_predict),
        predictions
    ):
        subregions[subregion_id]['mwm_size_est'] = mwm_size_prediction


 def update_border_mwm_size_estimation(conn, border_id):
-    cursor = conn.cursor()
-    cursor.execute(f"""
-        SELECT name, ST_Area(geography(geom))/1.0E+6 area
-        FROM {borders_table}
-        WHERE id = %s""", (border_id, ))
-    name, area = cursor.fetchone()
-    if math.isnan(area):
-        raise Exception(f"Area is NaN for border '{name}' ({border_id})")
-    border_data = {
-        'area': area,
-        'urban_pop': 0,
-        'city_cnt': 0,
-        'hamlet_cnt': 0
-    }
-    cursor.execute(f"""
-        SELECT coalesce(p.population, 0), p.place
-        FROM {borders_table} b, {osm_places_table} p
-        WHERE b.id = %s
-            AND ST_Contains(b.geom, p.center)
-        """, (border_id, ))
-    for place_population, place_type in cursor:
-        if place_type in ('city', 'town'):
-            border_data['city_cnt'] += 1
-            border_data['urban_pop'] += place_population
-        else:
-            border_data['hamlet_cnt'] += 1
-
-    feature_array = [
-        border_data[f] for f in
-        ('urban_pop', 'area', 'city_cnt', 'hamlet_cnt')
-    ]
-    mwm_size_est = MwmSizePredictor.predict(feature_array)
-    cursor.execute(f"UPDATE {borders_table} SET mwm_size_est = %s WHERE id = %s",
-                   (mwm_size_est, border_id))
-    conn.commit()
+    with conn.cursor() as cursor:
+        cursor.execute(f"""
+            SELECT name, ST_Area(geography(geom))/1.0E+6 area
+            FROM {borders_table}
+            WHERE id = %s""", (border_id,))
+        name, area = cursor.fetchone()
+        if math.isnan(area):
+            e = Exception(f"Area is NaN for border '{name}' ({border_id})")
+            raise e
+        border_data = {
+            'area': area,
+        }
+        regions = {border_id: border_data}
+        _add_mwm_size_estimation(conn, regions, need_cities=False)
+        mwm_size_est = border_data.get('mwm_size_est')
+        # mwm_size_est may be None. Python's None is converted to NULL
+        # duging %s substitution in execute().
+        cursor.execute(f"""
+            UPDATE {borders_table}
+            SET mwm_size_est = %s
+            WHERE id = %s
+            """, (mwm_size_est, border_id,))
+        conn.commit()


 def is_administrative_region(conn, region_id):