[search] Changed the name scoring scheme.

This commit is contained in:
Maxim Pimenov 2017-05-18 18:39:50 +03:00
parent 6e4b1c0be8
commit 83dd94fd2b
7 changed files with 54 additions and 51 deletions

View file

@ -64,7 +64,8 @@ void ProcessMetadata(FeatureType const & ft, Result::Metadata & meta)
meta.m_hotelRating = rating;
int pricing;
strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing);
if (!strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing))
pricing = 0;
string pricingStr;
CHECK_GREATER_OR_EQUAL(pricing, 0, ("Pricing must be positive!"));
for (auto i = 0; i < pricing; i++)
@ -292,11 +293,12 @@ bool PreResult2::IsStreet() const
// Builds a human-readable, single-line description of this intermediate
// search result and returns it as a string.
//
// NOTE(review): this listing comes from a diff view in which removed and
// added lines are interleaved, so two stream statements appear below. The
// first appears to be the pre-change form and the second its replacement;
// only one of them should exist in the real file - confirm against the repo.
string PreResult2::DebugPrint() const
{
stringstream ss;
// Apparent pre-change form: prints the name, best type, m_info.m_rank (cast
// to int so it is streamed as a number) and m_distance.
ss << "{ IntermediateResult: " <<
"Name: " << m_str <<
"; Type: " << GetBestType() <<
"; Rank: " << static_cast<int>(m_info.m_rank) <<
"; Distance: " << m_distance << " }";
// Apparent post-change form: prints the name and best type, then the output
// of search::DebugPrint(m_info) and the value of m_info.GetLinearModelRank().
ss << "IntermediateResult [ "
<< "Name: " << m_str
<< "; Type: " << GetBestType()
<< "; Ranking info: " << search::DebugPrint(m_info)
<< "; Linear model rank: " << m_info.GetLinearModelRank()
<< " ]";
return ss.str();
}

View file

@ -19,7 +19,7 @@ namespace
{
// Returns true when |score| is NAME_SCORE_FULL_MATCH or the prefix score
// that is accepted alongside it.
//
// NOTE(review): the two return statements below are the removed and the added
// line of one diff hunk. The first (NAME_SCORE_FULL_MATCH_PREFIX) appears to
// be the pre-change line and the second (NAME_SCORE_PREFIX) its replacement;
// the real function should contain only one of them - confirm.
bool IsAlmostFullMatch(NameScore score)
{
return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH;
return score == NAME_SCORE_PREFIX || score == NAME_SCORE_FULL_MATCH;
}
} // namespace

View file

@ -11,19 +11,26 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these coeffs correspond to coeffs in a linear model.
//
// NOTE(review): this listing comes from a diff view, so every constant below
// is shown with both its removed and its added value. In each group the
// short literals appear to be the pre-change values and the long literals
// their replacements - confirm against the repo before relying on either.

// Scalar weights (apparent pre-change values).
double const kDistanceToPivot = -1.0000000;
double const kRank = 0.7165246;
double const kFalseCats = -0.3833900;
// Scalar weights (apparent post-change values).
double const kDistanceToPivot = -0.37897824370302247;
double const kRank = 1.0;
double const kFalseCats = -0.05775625793967508;
// One weight per NameScore enumerator; the array is sized by
// NameScore::NAME_SCORE_COUNT and the inline labels name the intended index
// of each entry.
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
// Apparent pre-change entries (five name scores).
-0.1069757 /* Zero */, -0.0250079 /* Substring Prefix */, 0.0447104 /* Substring */,
0.0872732 /* Full Match Prefix */, 0.0872732 /* Full Match */
// Apparent post-change entries (four name scores).
-0.11436302557264734 /* Zero */
, 0.014295634567960331 /* Substring */
, 0.046219090910780115 /* Prefix */
, 0.05384830009390816 /* Full Match */
};
// One weight per search type; the array is sized by
// SearchModel::SEARCH_TYPE_COUNT. POI and Building carry the same weight in
// both sets of values - presumably because the training script copies the
// POI weight onto Building; verify against scoring_model.py.
double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
// Apparent pre-change entries.
-0.3884116 /* POI */, -0.3884116 /* Building */,
-0.3214653 /* Street */, -0.3357469 /* Unclassified */,
-0.4341714 /* Village */, 0.2721947 /* City */,
0.4708555 /* State */, 0.7367450 /* Country */
// Apparent post-change entries.
-0.09164609318265761 /* POI */
, -0.09164609318265761 /* Building */
, -0.0805969548653964 /* Street */
, -0.030493728520630793 /* Unclassified */
, -0.19242203325862917 /* Village */
, -0.10945592241057521 /* City */
, 0.19250143015921584 /* State */
, 0.31211330207867427 /* Country */
};
double TransformDistance(double distance)

View file

@ -65,9 +65,8 @@ string DebugPrint(NameScore score)
switch (score)
{
case NAME_SCORE_ZERO: return "Zero";
case NAME_SCORE_SUBSTRING_PREFIX: return "Substring Prefix";
case NAME_SCORE_SUBSTRING: return "Substring";
case NAME_SCORE_FULL_MATCH_PREFIX: return "Full Match Prefix";
case NAME_SCORE_PREFIX: return "Prefix";
case NAME_SCORE_FULL_MATCH: return "Full Match";
case NAME_SCORE_COUNT: return "Count";
}

View file

@ -30,10 +30,9 @@ bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & te
// Grades how well a query slice matches a feature name. GetNameScore()
// combines candidate scores with max(), so a numerically larger enumerator
// is treated as a better match. NAME_SCORE_COUNT is not a score itself; it
// is used as the size of the kNameScore weight table.
//
// NOTE(review): this listing comes from a diff view, so two numberings are
// interleaved below. The five-value numbering appears to be the pre-change
// one and the four-value numbering its replacement; the real enum should
// contain only one of them - confirm.
enum NameScore
{
NAME_SCORE_ZERO = 0,
// Apparent pre-change enumerators.
NAME_SCORE_SUBSTRING_PREFIX = 1,
NAME_SCORE_SUBSTRING = 2,
NAME_SCORE_FULL_MATCH_PREFIX = 3,
NAME_SCORE_FULL_MATCH = 4,
// Apparent post-change enumerators.
NAME_SCORE_SUBSTRING = 1,
NAME_SCORE_PREFIX = 2,
NAME_SCORE_FULL_MATCH = 3,
NAME_SCORE_COUNT
};
@ -44,8 +43,8 @@ bool IsStopWord(strings::UniString const & s);
// Normalizes, simplifies and splits string, removes stop-words.
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
template <typename TSlice>
NameScore GetNameScore(std::string const & name, TSlice const & slice)
template <typename Slice>
NameScore GetNameScore(std::string const & name, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
@ -55,8 +54,8 @@ NameScore GetNameScore(std::string const & name, TSlice const & slice)
return GetNameScore(tokens, slice);
}
template <typename TSlice>
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice const & slice)
template <typename Slice>
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
@ -75,18 +74,19 @@ NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice co
if (!match)
continue;
if (impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]))
{
if (m == n)
return NAME_SCORE_FULL_MATCH;
score = max(score, NAME_SCORE_SUBSTRING);
}
if (lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]))
{
if (m == n)
return NAME_SCORE_FULL_MATCH_PREFIX;
score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
}
bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]);
bool const prefixMatch =
lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]);
if (!fullMatch && !prefixMatch)
continue;
if (m == n && fullMatch)
return NAME_SCORE_FULL_MATCH;
if (offset == 0)
score = max(score, NAME_SCORE_PREFIX);
score = max(score, NAME_SCORE_SUBSTRING);
}
return score;
}

View file

@ -16,7 +16,7 @@ import sys
# Module-level constants for the scoring-model script.
# NOTE(review): the units/ranges implied by the names below (metres, a
# 0..255 rank) are not shown in this excerpt - confirm where they are used.
MAX_DISTANCE_METERS = 2e6
MAX_RANK = 255
# Numeric value assigned to each relevance label.
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
# Name-score labels. The strings coincide with the ones returned by the C++
# DebugPrint(NameScore) - presumably they must stay in sync; verify.
# NOTE(review): this listing comes from a diff view, so NAME_SCORES is shown
# twice. The five-label list appears to be the pre-change assignment and the
# four-label list its replacement - confirm.
NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
# Full feature list: three scalar features followed by every name-score label
# and every search-type label. Weights are looked up by position with
# FEATURES.index(...), so the order of this list matters.
FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
@ -25,8 +25,6 @@ FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
# Maps a name-score label to the label that is actually used as a feature:
# when categories_match equals 1 the label is replaced by 'Zero'; otherwise
# the label is passed through.
# NOTE(review): this listing comes from a diff view with indentation stripped.
# The 'Full Match Prefix' -> 'Full Match' branch below appears to be the
# branch this commit deletes (the hunk shrinks from 8 lines to 6) - confirm.
def transform_name_score(value, categories_match):
if categories_match == 1:
return 'Zero'
elif value == 'Full Match Prefix':
return 'Full Match'
else:
return value
@ -40,10 +38,6 @@ def normalize_data(data):
cats = data['PureCats'].combine(data['FalseCats'], max)
# Full prefix match is unified with a full match as these features
# are collinear. But we need both of them as they're also used in
# locality sorting.
#
# TODO (@y, @m): do forward/backward/subset selection of features
# instead of this merging. It would be great to conduct PCA on
# the features too.
@ -277,7 +271,6 @@ def main(args):
# Following code restores coeffs for merged features.
ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
ndcgs = compute_ndcgs_for_ws(data, ws)

View file

@ -45,12 +45,14 @@ UNIT_TEST(NameTest_Smoke)
TEST_EQUAL(GetScore("New York", "Central Park, New York, US", TokenRange(2, 4)),
NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH_PREFIX,
());
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING_PREFIX, ());
TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH_PREFIX,
());
TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
}
} // namespace