forked from organicmaps/organicmaps
[search] Changed the name scoring scheme.
This commit is contained in:
parent
6e4b1c0be8
commit
83dd94fd2b
7 changed files with 54 additions and 51 deletions
|
@ -64,7 +64,8 @@ void ProcessMetadata(FeatureType const & ft, Result::Metadata & meta)
|
|||
meta.m_hotelRating = rating;
|
||||
|
||||
int pricing;
|
||||
strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing);
|
||||
if (!strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing))
|
||||
pricing = 0;
|
||||
string pricingStr;
|
||||
CHECK_GREATER_OR_EQUAL(pricing, 0, ("Pricing must be positive!"));
|
||||
for (auto i = 0; i < pricing; i++)
|
||||
|
@ -292,11 +293,12 @@ bool PreResult2::IsStreet() const
|
|||
string PreResult2::DebugPrint() const
|
||||
{
|
||||
stringstream ss;
|
||||
ss << "{ IntermediateResult: " <<
|
||||
"Name: " << m_str <<
|
||||
"; Type: " << GetBestType() <<
|
||||
"; Rank: " << static_cast<int>(m_info.m_rank) <<
|
||||
"; Distance: " << m_distance << " }";
|
||||
ss << "IntermediateResult [ "
|
||||
<< "Name: " << m_str
|
||||
<< "; Type: " << GetBestType()
|
||||
<< "; Ranking info: " << search::DebugPrint(m_info)
|
||||
<< "; Linear model rank: " << m_info.GetLinearModelRank()
|
||||
<< " ]";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ namespace
|
|||
{
|
||||
bool IsAlmostFullMatch(NameScore score)
|
||||
{
|
||||
return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH;
|
||||
return score == NAME_SCORE_PREFIX || score == NAME_SCORE_FULL_MATCH;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
|
|
@ -11,19 +11,26 @@ namespace
|
|||
{
|
||||
// See search/search_quality/scoring_model.py for details. In short,
|
||||
// these coeffs correspond to coeffs in a linear model.
|
||||
double const kDistanceToPivot = -1.0000000;
|
||||
double const kRank = 0.7165246;
|
||||
double const kFalseCats = -0.3833900;
|
||||
double const kDistanceToPivot = -0.37897824370302247;
|
||||
double const kRank = 1.0;
|
||||
double const kFalseCats = -0.05775625793967508;
|
||||
|
||||
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
|
||||
-0.1069757 /* Zero */, -0.0250079 /* Substring Prefix */, 0.0447104 /* Substring */,
|
||||
0.0872732 /* Full Match Prefix */, 0.0872732 /* Full Match */
|
||||
-0.11436302557264734 /* Zero */
|
||||
, 0.014295634567960331 /* Substring */
|
||||
, 0.046219090910780115 /* Prefix */
|
||||
, 0.05384830009390816 /* Full Match */
|
||||
};
|
||||
|
||||
double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
|
||||
-0.3884116 /* POI */, -0.3884116 /* Building */,
|
||||
-0.3214653 /* Street */, -0.3357469 /* Unclassified */,
|
||||
-0.4341714 /* Village */, 0.2721947 /* City */,
|
||||
0.4708555 /* State */, 0.7367450 /* Country */
|
||||
-0.09164609318265761 /* POI */
|
||||
, -0.09164609318265761 /* Building */
|
||||
, -0.0805969548653964 /* Street */
|
||||
, -0.030493728520630793 /* Unclassified */
|
||||
, -0.19242203325862917 /* Village */
|
||||
, -0.10945592241057521 /* City */
|
||||
, 0.19250143015921584 /* State */
|
||||
, 0.31211330207867427 /* Country */
|
||||
};
|
||||
|
||||
double TransformDistance(double distance)
|
||||
|
|
|
@ -65,9 +65,8 @@ string DebugPrint(NameScore score)
|
|||
switch (score)
|
||||
{
|
||||
case NAME_SCORE_ZERO: return "Zero";
|
||||
case NAME_SCORE_SUBSTRING_PREFIX: return "Substring Prefix";
|
||||
case NAME_SCORE_SUBSTRING: return "Substring";
|
||||
case NAME_SCORE_FULL_MATCH_PREFIX: return "Full Match Prefix";
|
||||
case NAME_SCORE_PREFIX: return "Prefix";
|
||||
case NAME_SCORE_FULL_MATCH: return "Full Match";
|
||||
case NAME_SCORE_COUNT: return "Count";
|
||||
}
|
||||
|
|
|
@ -30,10 +30,9 @@ bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & te
|
|||
enum NameScore
|
||||
{
|
||||
NAME_SCORE_ZERO = 0,
|
||||
NAME_SCORE_SUBSTRING_PREFIX = 1,
|
||||
NAME_SCORE_SUBSTRING = 2,
|
||||
NAME_SCORE_FULL_MATCH_PREFIX = 3,
|
||||
NAME_SCORE_FULL_MATCH = 4,
|
||||
NAME_SCORE_SUBSTRING = 1,
|
||||
NAME_SCORE_PREFIX = 2,
|
||||
NAME_SCORE_FULL_MATCH = 3,
|
||||
|
||||
NAME_SCORE_COUNT
|
||||
};
|
||||
|
@ -44,8 +43,8 @@ bool IsStopWord(strings::UniString const & s);
|
|||
// Normalizes, simplifies and splits string, removes stop-words.
|
||||
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
|
||||
|
||||
template <typename TSlice>
|
||||
NameScore GetNameScore(std::string const & name, TSlice const & slice)
|
||||
template <typename Slice>
|
||||
NameScore GetNameScore(std::string const & name, Slice const & slice)
|
||||
{
|
||||
if (slice.Empty())
|
||||
return NAME_SCORE_ZERO;
|
||||
|
@ -55,8 +54,8 @@ NameScore GetNameScore(std::string const & name, TSlice const & slice)
|
|||
return GetNameScore(tokens, slice);
|
||||
}
|
||||
|
||||
template <typename TSlice>
|
||||
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice const & slice)
|
||||
template <typename Slice>
|
||||
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, Slice const & slice)
|
||||
{
|
||||
if (slice.Empty())
|
||||
return NAME_SCORE_ZERO;
|
||||
|
@ -75,18 +74,19 @@ NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice co
|
|||
if (!match)
|
||||
continue;
|
||||
|
||||
if (impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]))
|
||||
{
|
||||
if (m == n)
|
||||
return NAME_SCORE_FULL_MATCH;
|
||||
score = max(score, NAME_SCORE_SUBSTRING);
|
||||
}
|
||||
if (lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]))
|
||||
{
|
||||
if (m == n)
|
||||
return NAME_SCORE_FULL_MATCH_PREFIX;
|
||||
score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
|
||||
}
|
||||
bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]);
|
||||
bool const prefixMatch =
|
||||
lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]);
|
||||
if (!fullMatch && !prefixMatch)
|
||||
continue;
|
||||
|
||||
if (m == n && fullMatch)
|
||||
return NAME_SCORE_FULL_MATCH;
|
||||
|
||||
if (offset == 0)
|
||||
score = max(score, NAME_SCORE_PREFIX);
|
||||
|
||||
score = max(score, NAME_SCORE_SUBSTRING);
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ import sys
|
|||
MAX_DISTANCE_METERS = 2e6
|
||||
MAX_RANK = 255
|
||||
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
|
||||
NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
|
||||
NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
|
||||
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
|
||||
|
||||
FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
|
||||
|
@ -25,8 +25,6 @@ FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
|
|||
def transform_name_score(value, categories_match):
|
||||
if categories_match == 1:
|
||||
return 'Zero'
|
||||
elif value == 'Full Match Prefix':
|
||||
return 'Full Match'
|
||||
else:
|
||||
return value
|
||||
|
||||
|
@ -40,10 +38,6 @@ def normalize_data(data):
|
|||
|
||||
cats = data['PureCats'].combine(data['FalseCats'], max)
|
||||
|
||||
# Full prefix match is unified with a full match as these features
|
||||
# are collinear. But we need both of them as they're also used in
|
||||
# locality sorting.
|
||||
#
|
||||
# TODO (@y, @m): do forward/backward/subset selection of features
|
||||
# instead of this merging. It would be great to conduct PCA on
|
||||
# the features too.
|
||||
|
@ -277,7 +271,6 @@ def main(args):
|
|||
|
||||
# Following code restores coeffs for merged features.
|
||||
ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
|
||||
ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
|
||||
|
||||
ndcgs = compute_ndcgs_for_ws(data, ws)
|
||||
|
||||
|
|
|
@ -45,12 +45,14 @@ UNIT_TEST(NameTest_Smoke)
|
|||
TEST_EQUAL(GetScore("New York", "Central Park, New York, US", TokenRange(2, 4)),
|
||||
NAME_SCORE_FULL_MATCH, ());
|
||||
TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
|
||||
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH_PREFIX,
|
||||
());
|
||||
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ());
|
||||
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH_PREFIX,
|
||||
());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
|
||||
}
|
||||
} // namespace
|
||||
|
|
Loading…
Add table
Reference in a new issue