diff --git a/qt/main.cpp b/qt/main.cpp
index e53f4e208c..9e089b1025 100644
--- a/qt/main.cpp
+++ b/qt/main.cpp
@@ -25,8 +25,10 @@
 #include
 #include

+DEFINE_string(data_path, "", "Path to data directory");
 DEFINE_string(log_abort_level, my::ToString(my::GetDefaultLogAbortLevel()),
               "Log messages severity that causes termination.");
+DEFINE_string(resources_path, "", "Path to resources directory");

 namespace
 {
@@ -94,6 +96,12 @@ int main(int argc, char * argv[])
   google::SetUsageMessage("Desktop application.");
   google::ParseCommandLineFlags(&argc, &argv, true);

+  Platform & platform = GetPlatform();
+  if (!FLAGS_resources_path.empty())
+    platform.SetResourceDir(FLAGS_resources_path);
+  if (!FLAGS_data_path.empty())
+    platform.SetWritableDirForTests(FLAGS_data_path);
+
   my::LogLevel level;
   CHECK(my::FromString(FLAGS_log_abort_level, level), ());
   my::g_LogAbortLevel = level;
@@ -115,7 +123,7 @@ int main(int argc, char * argv[])
   alohalytics::Stats::Instance().SetDebugMode(true);
 #endif

-  GetPlatform().SetupMeasurementSystem();
+  platform.SetupMeasurementSystem();

   // display EULA if needed
   char const * settingsEULA = "EulaAccepted";
@@ -127,7 +135,7 @@ int main(int argc, char * argv[])
     string buffer;
     {
-      ReaderPtr<Reader> reader = GetPlatform().GetReader("eula.html");
+      ReaderPtr<Reader> reader = platform.GetReader("eula.html");
       reader.ReadAsString(buffer);
     }
     qt::InfoDialog eulaDialog(qAppName() + QString(" End User Licensing Agreement"), buffer.c_str(), NULL, buttons);
@@ -146,7 +154,7 @@ int main(int argc, char * argv[])
     qt::MainWindow::SetDefaultSurfaceFormat(apiOpenGLES3);

 #ifdef BUILD_DESIGNER
-  if (argc >= 2 && GetPlatform().IsFileExistsByFullPath(argv[1]))
+  if (argc >= 2 && platform.IsFileExistsByFullPath(argv[1]))
     mapcssFilePath = argv[1];
   if (0 == mapcssFilePath.length())
   {
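Note (not part of the patch): the two new flags make the desktop build relocatable. --resources_path points the platform at the read-only resources directory (where, for example, eula.html is read via GetReader), while --data_path overrides the writable directory. A minimal sketch of driving them from a test script; the binary path and directories are placeholders, not something this patch defines:

    import subprocess

    # Hypothetical paths; gflags accepts both --flag=value and --flag value.
    subprocess.run(
        ["./build/MAPS.ME.desktop",          # placeholder binary name
         "--resources_path=/opt/omim/data",  # read-only resources (eula.html, ...)
         "--data_path=/tmp/omim-writable"],  # writable dir (settings, maps)
        check=True)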
diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp
index bd0a52230a..f6812b0daa 100644
--- a/search/ranking_info.cpp
+++ b/search/ranking_info.cpp
@@ -11,26 +11,25 @@ namespace
 {
 // See search/search_quality/scoring_model.py for details. In short,
 // these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = -0.37897824370302247;
-double const kRank = 1.0;
-double const kFalseCats = -0.05775625793967508;
-
+double const kDistanceToPivot = -1.0000000;
+double const kRank = 0.5238890;
+double const kFalseCats = -0.7319971;
+double const kErrorsMade = -0.0238639;
 double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
-    -0.11436302557264734 /* Zero */
-  , 0.014295634567960331 /* Substring */
-  , 0.046219090910780115 /* Prefix */
-  , 0.05384830009390816 /* Full Match */
+    -0.1683931 /* Zero */,
+    0.0268117 /* Substring */,
+    0.0599575 /* Prefix */,
+    0.0816240 /* Full Match */
 };
-
 double const kType[Model::TYPE_COUNT] = {
-    -0.09164609318265761 /* POI */
-  , -0.09164609318265761 /* Building */
-  , -0.0805969548653964 /* Street */
-  , -0.030493728520630793 /* Unclassified */
-  , -0.19242203325862917 /* Village */
-  , -0.10945592241057521 /* City */
-  , 0.19250143015921584 /* State */
-  , 0.31211330207867427 /* Country */
+    -0.4322325 /* POI */,
+    -0.4322325 /* Building */,
+    -0.3823704 /* Street */,
+    -0.3747346 /* Unclassified */,
+    -0.4453585 /* Village */,
+    0.3900264 /* City */,
+    0.5397572 /* State */,
+    0.7049124 /* Country */
 };

 double TransformDistance(double distance)
@@ -48,6 +47,7 @@ void RankingInfo::PrintCSVHeader(ostream & os)
   os << "DistanceToPivot"
      << ",Rank"
      << ",NameScore"
+     << ",ErrorsMade"
      << ",SearchType"
      << ",PureCats"
      << ",FalseCats";
@@ -71,8 +71,13 @@ string DebugPrint(RankingInfo const & info)
 void RankingInfo::ToCSV(ostream & os) const
 {
   os << fixed;
-  os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
-     << "," << DebugPrint(m_type) << "," << m_pureCats << "," << m_falseCats;
+  os << m_distanceToPivot << ",";
+  os << static_cast<int>(m_rank) << ",";
+  os << DebugPrint(m_nameScore) << ",";
+  os << GetErrorsMade() << ",";
+  os << DebugPrint(m_type) << ",";
+  os << m_pureCats << ",";
+  os << m_falseCats;
 }

 double RankingInfo::GetLinearModelRank() const
@@ -96,7 +101,12 @@ double RankingInfo::GetLinearModelRank() const
     nameScore = NAME_SCORE_ZERO;
   }

-  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] + kType[m_type] +
-         m_falseCats * kFalseCats;
+  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
+         kErrorsMade * GetErrorsMade() + kType[m_type] + m_falseCats * kFalseCats;
+}
+
+size_t RankingInfo::GetErrorsMade() const
+{
+  return m_errorsMade.IsValid() ? m_errorsMade.m_errorsMade : 0;
 }
 } // namespace search
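Note (not part of the patch): a small Python mirror of GetLinearModelRank, handy for sanity-checking the retrained coefficients above. It assumes distance and rank are already normalized to [0, 1], which the C++ code does via TransformDistance and division by MAX_RANK; the sample inputs are invented:

    # Coefficients lifted from the new ranking_info.cpp above.
    K_DISTANCE_TO_PIVOT = -1.0000000
    K_RANK = 0.5238890
    K_FALSE_CATS = -0.7319971
    K_ERRORS_MADE = -0.0238639
    K_NAME_SCORE = {'Zero': -0.1683931, 'Substring': 0.0268117,
                    'Prefix': 0.0599575, 'Full Match': 0.0816240}
    K_TYPE = {'POI': -0.4322325, 'Building': -0.4322325, 'Street': -0.3823704,
              'Unclassified': -0.3747346, 'Village': -0.4453585,
              'City': 0.3900264, 'State': 0.5397572, 'Country': 0.7049124}

    def linear_rank(distance, rank, name_score, result_type, errors_made, false_cats):
        # distance and rank are assumed pre-normalized to [0, 1], as the C++
        # code does via TransformDistance and division by MAX_RANK (255).
        return (K_DISTANCE_TO_PIVOT * distance + K_RANK * rank
                + K_NAME_SCORE[name_score] + K_ERRORS_MADE * errors_made
                + K_TYPE[result_type] + K_FALSE_CATS * false_cats)

    # The same hypothetical city match, without and with two typos:
    print(linear_rank(0.1, 0.5, 'Full Match', 'City', errors_made=0, false_cats=0))  # ~0.634
    print(linear_rank(0.1, 0.5, 'Full Match', 'City', errors_made=2, false_cats=0))  # ~0.586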
 double GetLinearModelRank() const;
+
+  size_t GetErrorsMade() const;
 };

 string DebugPrint(RankingInfo const & info);
diff --git a/search/search_quality/sample.cpp b/search/search_quality/sample.cpp
index 988149c270..51f1c51692 100644
--- a/search/search_quality/sample.cpp
+++ b/search/search_quality/sample.cpp
@@ -99,7 +99,7 @@ bool Sample::DeserializeFromJSON(string const & jsonStr)
   }
   catch (my::Json::Exception const & e)
   {
-    LOG(LDEBUG, ("Can't parse sample:", e.Msg(), jsonStr));
+    LOG(LWARNING, ("Can't parse sample:", e.Msg(), jsonStr));
   }
   return false;
 }
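Note (not part of the patch): background for the scoring_model.py changes below. As the removed plot_diagrams docstring states, training operates on pairwise differences between feature vectors, labeled by which result of the pair is more relevant, so ranking reduces to binary classification for LinearSVC. A simplified sketch of that transform; the repository's transform_data may differ in details such as per-sample normalization:

    import itertools
    import numpy as np

    def pairwise_transform(features, relevances):
        # For every pair of results for one query, emit the difference of
        # their feature vectors, labeled by which result is more relevant.
        # This reduces ranking to binary classification (the RankSVM trick).
        xs, ys = [], []
        for (f1, r1), (f2, r2) in itertools.combinations(zip(features, relevances), 2):
            if r1 == r2:
                continue  # equally relevant pairs carry no ordering signal
            xs.append(np.asarray(f1, dtype=float) - np.asarray(f2, dtype=float))
            ys.append(1 if r1 > r2 else -1)
        return np.array(xs), np.array(ys)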
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index 3a6b259e56..9e599092e7 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -2,11 +2,11 @@
 from math import exp, log
 from scipy.stats import pearsonr
-from sklearn import cross_validation, grid_search, svm
+from sklearn import svm
+from sklearn.model_selection import GridSearchCV, KFold
 import argparse
 import collections
 import itertools
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import random
@@ -18,8 +18,7 @@ MAX_RANK = 255
 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
 SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
-
-FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
+FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade'] + NAME_SCORES + SEARCH_TYPES


 def transform_name_score(value, categories_match):
@@ -157,29 +156,6 @@ def transform_data(data):
     return xs, ys


-def plot_diagrams(xs, ys, features):
-    """
-    For each feature, plots histagrams of x * sign(y), where x is a
-    slice on the feature of a list of pairwise differences between
-    input feature-vectors and y is a list of pairwise differences
-    between relevances of the input feature-vectors. Stong bias
-    toward positive or negative values in histograms indicates that
-    the current feature is important for ranking, as there is a
-    correlation between difference between features values and
-    relevancy.
-    """
-    for i, f in enumerate(features):
-        x = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
-
-        l, r = min(x), max(x)
-        d = max(abs(l), abs(r))
-
-        plt.subplot(4, 4, i + 1)
-        plt.hist(x, bins=8, range=(-d, d))
-        plt.title(f)
-    plt.show()
-
-
 def show_pearson_statistics(xs, ys, features):
     """
     Shows info about Pearson coefficient between features and
@@ -241,7 +217,7 @@ def cpp_output(features, ws):
         else:
             print_const(f, w)
     print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
-    print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st)
+    print_array('kType', 'Model::TYPE_COUNT', st)
@@ -249,20 +225,17 @@ def main(args):
     normalize_data(data)

     ndcgs = compute_ndcgs_without_ws(data);
-    print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('Current NDCG: {:.3f}, std: {:.3f}'.format(np.mean(ndcgs), np.std(ndcgs)))
     print()

     xs, ys = transform_data(data)

-    if args.plot:
-        plot_diagrams(xs, ys, FEATURES)
-
     clf = svm.LinearSVC(random_state=args.seed)
-    cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)
+    cv = KFold(n_splits=5, shuffle=True, random_state=args.seed)

     # "C" stands for the regularizer constant.
     grid = {'C': np.power(10.0, np.arange(-5, 6))}
-    gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
+    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
     gs.fit(xs, ys)

     ws = gs.best_estimator_.coef_[0]
@@ -274,8 +247,8 @@ def main(args):

     ndcgs = compute_ndcgs_for_ws(data, ws)

-    print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
-    print('Accuracy: {}'.format(gs.best_score_))
+    print('NDCG mean: {:.3f}, std: {:.3f}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('ROC AUC: {:.3f}'.format(gs.best_score_))

     if args.pearson:
         print()
@@ -292,7 +265,6 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--seed', help='random seed', type=int)
-    parser.add_argument('--plot', help='plot diagrams', action='store_true')
     parser.add_argument('--pearson', help='show pearson statistics', action='store_true')
     parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true')
     args = parser.parse_args()
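Note (not part of the patch): the edits above migrate to the scikit-learn 0.18+ API, where cross_validation and grid_search moved into sklearn.model_selection and KFold takes n_splits instead of the dataset length; the scorer also changes from accuracy to ROC AUC. A self-contained sketch of the migrated calls on synthetic data:

    import numpy as np
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV, KFold

    rng = np.random.RandomState(0)
    xs = rng.randn(200, 16)              # stand-in for pairwise feature differences
    ys = rng.choice([-1, 1], size=200)   # stand-in for pair orientation labels

    clf = svm.LinearSVC(random_state=0)
    cv = KFold(n_splits=5, shuffle=True, random_state=0)  # no len(ys) argument anymore
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
    gs.fit(xs, ys)
    print(gs.best_params_, gs.best_score_)
    print(gs.best_estimator_.coef_[0])   # the weights that cpp_output() dumps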