diff --git a/base/stl_add.hpp b/base/stl_add.hpp
index e252a8b53e..d3d403810d 100644
--- a/base/stl_add.hpp
+++ b/base/stl_add.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "../std/functional.hpp"
 #include "../std/iterator.hpp"
+#include "../std/map.hpp"
 
 template <class ContainerT> class BackInsertFunctor
 {
@@ -115,3 +116,9 @@ template <typename IterT> IterT PrevIterInCycle(IterT it, IterT beg, IterT end)
   return --it;
 }
 
+template <typename KeyT, typename ValueT>
+ValueT ValueForKey(map<KeyT, ValueT> const & m, KeyT key, ValueT defaultV)
+{
+  typename map<KeyT, ValueT>::const_iterator const it = m.find(key);
+  return (it == m.end() ? defaultV : it->second);
+}
diff --git a/publisher/main.cpp b/publisher/main.cpp
index ed33d92831..2204f4634f 100644
--- a/publisher/main.cpp
+++ b/publisher/main.cpp
@@ -6,52 +6,174 @@
 #include "../base/base.hpp"
 #include "../base/assert.hpp"
 #include "../base/logging.hpp"
+#include "../base/stl_add.hpp"
 
+#include "../std/algorithm.hpp"
 #include "../std/bind.hpp"
+#include "../std/fstream.hpp"
+#include "../std/iterator.hpp"
+#include "../std/string.hpp"
 
 #include "../3party/gflags/src/gflags/gflags.h"
+#include "../3party/jansson/myjansson.hpp"
 
 DEFINE_int32(max_uncompressed_article_chunk_size, 899950, "Max size of chunk of articles, uncompressed.");
 DEFINE_int32(compression_level, 9, "BZip2 compression level.");
 DEFINE_string(input, "", "Input file.");
 DEFINE_string(output, "", "Output dictionary file.");
+DEFINE_string(article_file_suffix, "", "Suffix of the article files.");
+DEFINE_string(redirects, "", "JSON file with redirects.");
+
+void IndexAard(sl::SlofIndexer & indexer)
+{
+  FileReader inputReader(FLAGS_input.c_str());
+  sl::AardDictionary inputDictionary(inputReader);
+
+  LOG(LINFO, ("Starting indexing, keys:", inputDictionary.KeyCount()));
+  for (uint32_t id = 0; id < inputDictionary.KeyCount(); ++id)
+  {
+    if ((id % 5000) == 0)
+      LOG(LINFO, (id, "done."));
+    // TODO: Handle redirects.
+    // TODO: Handle several keys for article?
+    string key, article;
+    inputDictionary.KeyById(id, key);
+    inputDictionary.ArticleById(id, article);
+    if (article.empty())
+    {
+      LOG(LWARNING, ("Skipping empty article for:", key));
+    }
+    else
+    {
+      uint64_t const articleId = indexer.AddArticle(article);
+      indexer.AddKey(key, articleId);
+    }
+  }
+  LOG(LINFO, ("Logging stats."));
+  indexer.LogStats();
+  LOG(LINFO, ("Finishing indexing."));
+}
+
+void IndexJson(sl::SlofIndexer & indexer)
+{
+  vector<vector<string> > articles;
+  LOG(LINFO, ("Reading list of articles."));
+  {
+    ifstream fin(FLAGS_input.c_str());
+    string line;
+    for (int i = 0; getline(fin, line); ++i)
+    {
+      if (line.empty())
+        continue;
+
+      my::Json root(line.c_str());
+      CHECK_EQUAL(json_typeof(root), JSON_ARRAY, (i, line));
+      CHECK_EQUAL(3, json_array_size(root), (i, line));
+      articles.push_back(vector<string>());
+      for (int j = 0; j < 3; ++j)
+      {
+        json_t * pJsonElement = json_array_get(root, j);
+        CHECK(pJsonElement, (i, line));
+        CHECK_EQUAL(json_typeof(pJsonElement), JSON_STRING, (i, j, line));
+        articles.back().push_back(json_string_value(pJsonElement));
+      }
+    }
+  }
+
+  LOG(LINFO, ("Sorting list of articles."));
+  sort(articles.begin(), articles.end());
+
+  LOG(LINFO, ("Adding articles."));
+  map<string, uint64_t> keysToArticles;
+  for (size_t i = 0; i < articles.size(); ++i)
+  {
+    string const url = articles[i][0];
+    string const title = articles[i][1];
+    string const fileName = articles[i][2] + FLAGS_article_file_suffix;
+    articles[i].clear();
+
+    FileReader articleReader(fileName);
+    string article(static_cast<size_t>(articleReader.Size()), 0);
+    articleReader.Read(0, &article[0], article.size());
+
+    uint64_t const articleId = indexer.AddArticle(article);
+    indexer.AddKey(title, articleId);
+    CHECK(keysToArticles.insert(make_pair(title, articleId)).second, (i));
+
+    if ((i & 127) == 0)
+      LOG(LINFO, ("Done:", i));
+  }
+  articles.clear();
+
+  LOG(LINFO, ("Adding redirects."));
+  map<string, string> redirects;
+  {
+    ifstream fin(FLAGS_redirects.c_str());
+    string line;
+    for (int i = 0; getline(fin, line); ++i)
+    {
+      if (line.empty())
+        continue;
+
+      my::Json root(line.c_str());
+      CHECK_EQUAL(json_typeof(root), JSON_ARRAY, (i, line));
+      CHECK_EQUAL(2, json_array_size(root), (i, line));
+      string s[2];
+      for (int j = 0; j < 2; ++j)
+      {
+        json_t * pJsonElement = json_array_get(root, j);
+        CHECK(pJsonElement, (i, j, line));
+        CHECK_EQUAL(json_typeof(pJsonElement), JSON_STRING, (i, line));
+        s[j] = json_string_value(pJsonElement);
+      }
+      CHECK(redirects.insert(make_pair(s[0], s[1])).second, (s[0], s[1]));
+    }
+  }
+  for (map<string, string>::const_iterator it = redirects.begin(); it != redirects.end(); ++it)
+  {
+    string const & src = it->first;
+    string dst = it->second;
+
+    if (keysToArticles.count(src))
+    {
+      LOG(LWARNING, ("Conflicting redirect", src, dst));
+      continue;
+    }
+
+    uint64_t articleId = -1;
+    for (size_t depth = 0; articleId == -1 && !dst.empty() && depth < 5; ++depth)
+    {
+      articleId = ValueForKey(keysToArticles, dst, uint64_t(-1));
+      dst = ValueForKey(redirects, dst, string());
+    }
+
+    if (articleId == -1)
+      LOG(LWARNING, ("Redirect not found", it->first, it->second));
+    else
+      indexer.AddKey(src, articleId);
+  }
+}
 
 int main(int argc, char ** argv)
 {
   google::ParseCommandLineFlags(&argc, &argv, true);
   CHECK(!FLAGS_input.empty(), ());
   CHECK(!FLAGS_output.empty(), ());
-  FileReader inputReader(FLAGS_input.c_str());
+
   FileWriter outputWriter(FLAGS_output.c_str());
   {
-    sl::AardDictionary inputDictionary(inputReader);
     sl::SlofIndexer indexer(outputWriter,
                             FLAGS_max_uncompressed_article_chunk_size,
                             bind(&CompressBZip2,
                                  FLAGS_compression_level, _1, _2, _3));
-    LOG(LINFO, ("Starting indexing, keys:", inputDictionary.KeyCount()));
-    for (uint32_t id = 0; id < inputDictionary.KeyCount(); ++id)
-    {
-      if ((id % 5000) == 0)
-        LOG(LINFO, (id, "done."));
-      // TODO: Handle redirects.
-      // TODO: Handle several keys for article?
-      string key, article;
-      inputDictionary.KeyById(id, key);
-      inputDictionary.ArticleById(id, article);
-      if (article.empty())
-      {
-        LOG(LWARNING, ("Skipping empty article for:", key));
-      }
-      else
-      {
-        uint64_t const articleId = indexer.AddArticle(article);
-        indexer.AddKey(key, articleId);
-      }
-    }
-    LOG(LINFO, ("Logging stats."));
-    indexer.LogStats();
+
+    size_t const & inputSize = FLAGS_input.size();
+    if (inputSize > 5 && FLAGS_input.substr(inputSize - 5) == ".aard")
+      IndexAard(indexer);
+    else if (inputSize > 5 && FLAGS_input.substr(inputSize - 5) == ".json")
+      IndexJson(indexer);
+    else
+      CHECK(false, (FLAGS_input));
+
     LOG(LINFO, ("Finishing indexing."));
   }
-  LOG(LINFO, ("Indexing done."));
-  LOG(LINFO, ("Input size:", inputReader.Size()));
   LOG(LINFO, ("Output size:", outputWriter.Pos()));
 }
diff --git a/std/string.hpp b/std/string.hpp
index d1c6ce6cdd..675fee5fe3 100644
--- a/std/string.hpp
+++ b/std/string.hpp
@@ -10,6 +10,7 @@
 
 using std::basic_string;
 using std::string;
+using std::getline;
 
 #ifdef OMIM_OS_BADA
 typedef std::basic_string<wchar_t> wstring;
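
For reference, a minimal standalone sketch of how the ValueForKey helper added in base/stl_add.hpp is used by the redirect resolution in publisher/main.cpp. This is a hypothetical example, not part of the patch; it pulls in the standard <map>/<string> headers directly instead of the project's std/ wrappers.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
using namespace std;

// Same helper as added in base/stl_add.hpp: returns the value stored for key,
// or defaultV if the key is not present in the map.
template <typename KeyT, typename ValueT>
ValueT ValueForKey(map<KeyT, ValueT> const & m, KeyT key, ValueT defaultV)
{
  typename map<KeyT, ValueT>::const_iterator const it = m.find(key);
  return (it == m.end() ? defaultV : it->second);
}

int main()
{
  map<string, uint64_t> keysToArticles;
  keysToArticles["Article"] = 42;

  // Known key: returns the stored article id (prints 42).
  cout << ValueForKey(keysToArticles, string("Article"), uint64_t(-1)) << endl;
  // Unknown key: returns the sentinel default, the same way the redirect-chasing
  // loop in IndexJson uses uint64_t(-1) to mean "no article found".
  cout << ValueForKey(keysToArticles, string("Missing"), uint64_t(-1)) << endl;
  return 0;
}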