diff --git a/lang_getter/debug/lang_getter b/lang_getter/debug/lang_getter
new file mode 100755
index 0000000000..95df683fa4
Binary files /dev/null and b/lang_getter/debug/lang_getter differ
diff --git a/lang_getter/lang_getter.pro b/lang_getter/lang_getter.pro
new file mode 100644
index 0000000000..e3dc4cc0af
--- /dev/null
+++ b/lang_getter/lang_getter.pro
@@ -0,0 +1,28 @@
+#-------------------------------------------------
+#
+# Project created by QtCreator 2011-11-18T08:50:14
+#
+#-------------------------------------------------
+
+QT += core network xml
+
+QT -= gui
+
+TARGET = lang_getter
+CONFIG += console
+CONFIG -= app_bundle
+
+TEMPLATE = app
+
+
+SOURCES += main.cpp \
+    pagedownloader.cpp \
+    logging.cpp \
+    mainmanager.cpp \
+    stringparser.cpp
+
+HEADERS += \
+    pagedownloader.h \
+    logging.h \
+    mainmanager.h \
+    stringparser.h
diff --git a/lang_getter/logging.cpp b/lang_getter/logging.cpp
new file mode 100644
index 0000000000..22250fca96
--- /dev/null
+++ b/lang_getter/logging.cpp
@@ -0,0 +1,32 @@
+#include "logging.h"
+
+#include <iostream>
+
+
+Logging::Logging()
+{
+}
+
+/// Prints "<status label> <msg>" as one line on stdout.
+void Logging::Print(STATUS s, QString const & msg)
+{
+  // std::endl also flushes the stream after every message.
+  std::cout << StatusToString(s).toStdString() << " " << msg.toStdString() << std::endl;
+}
+
+/// Progress hook; the body is empty, so progress is currently not reported.
+void Logging::Percent(qint64 /*curr*/, qint64 /*total*/)
+{
+}
+
+/// Returns the label for @p s: "INFO", "WARNING", "ERROR", or "NONE" for any other value.
+QString Logging::StatusToString(STATUS s) const
+{
+  switch (s)
+  {
+  case INFO: return "INFO";
+  case WARNING: return "WARNING";
+  case ERROR: return "ERROR";
+  default: return "NONE";
+  }
+}
diff --git a/lang_getter/logging.h b/lang_getter/logging.h
new file mode 100644
index 0000000000..86c1fcfc9e
--- /dev/null
+++ b/lang_getter/logging.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <QString>
+
+/// Minimal logger that writes status-tagged messages to stdout.
+class Logging
+{
+public:
+  Logging();
+
+  /// Severity label printed in front of each message.
+  enum STATUS { INFO, WARNING, ERROR };
+
+  /// Prints "<status label> <msg>" as one line on stdout.
+  void Print(STATUS s, QString const & msg);
+  /// Progress hook (current / total); the implementation is currently empty.
+  void Percent(qint64 curr, qint64 total);
+
+  /// Returns the label for @p s: "INFO", "WARNING", "ERROR", or "NONE" otherwise.
+  QString StatusToString(STATUS s) const;
+};
diff --git a/lang_getter/main.cpp b/lang_getter/main.cpp
new file mode
100644
index 0000000000..7aa0e9dfa8
--- /dev/null
+++ b/lang_getter/main.cpp
@@ -0,0 +1,28 @@
+#include <QCoreApplication>
+#include <QString>
+
+#include "mainmanager.h"
+
+/// Usage: lang_getter [output_dir [country_list_file]]
+///
+/// Both arguments are optional and default to the previously hard-coded paths.
+/// output_dir is used as a plain prefix for the result file names, so it must end
+/// with a path separator.
+int main(int argc, char *argv[])
+{
+  QCoreApplication a(argc, argv);
+
+  QString const outDir = (argc > 1)
+      ? QString::fromLocal8Bit(argv[1])
+      : QString("/Users/alena/omim/omim/data/metainfo/");
+  QString const listFile = (argc > 2)
+      ? QString::fromLocal8Bit(argv[2])
+      : QString("/Users/alena/omim/omim/data/polygons.lst");
+
+  MainManager manager(outDir);
+  // Without a country list nothing would ever stop the event loop, so bail out.
+  if (!manager.ProcessCountryList(listFile))
+    return 1;
+
+  return a.exec();
+}
diff --git a/lang_getter/mainmanager.cpp b/lang_getter/mainmanager.cpp
new file mode 100644
index 0000000000..5f8df4bbb9
--- /dev/null
+++ b/lang_getter/mainmanager.cpp
@@ -0,0 +1,343 @@
+#include "mainmanager.h"
+
+#include <QFile>
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iterator>
+#include <string>
+
+
+/// Remembers a language code for this country; duplicates are ignored.
+void MainManager::Country::AddCode(QString const & code)
+{
+  if (m_codes.end() == std::find(m_codes.begin(), m_codes.end(), code))
+    m_codes.push_back(code);
+}
+
+/// Remembers an index into MainManager::m_langUrls; duplicates are ignored.
+void MainManager::Country::AddUrl(size_t url)
+{
+  if (m_langUrls.end() == std::find(m_langUrls.begin(), m_langUrls.end(), url))
+    m_langUrls.push_back(url);
+}
+
+namespace
+{
+  /// Appends @p s to @p res, separating entries with '|'.
+  void append(QString & res, QString const & s)
+  {
+    if (!res.isEmpty())
+      res += "|";
+    res += s;
+  }
+}
+
+/// Builds the '|'-separated list of language codes for this country.
+/// @param res Receives the list; it is cleared first.
+/// @param m   Manager whose m_langUrls entries have been replaced by codes
+///            (or emptied) by the language pass.
+/// @return true if at least one code was collected.
+bool MainManager::Country::GetResult(QString & res, MainManager const & m) const
+{
+  // Start with the directly recognized codes and add the resolved ones, skipping
+  // a code that is already present so the output never contains "en|en".
+  std::vector<QString> codes(m_codes);
+  for (size_t i = 0; i < m_langUrls.size(); ++i)
+  {
+    QString const & code = m.m_langUrls[m_langUrls[i]];
+    if (!code.isEmpty() && codes.end() == std::find(codes.begin(), codes.end(), code))
+      codes.push_back(code);
+  }
+
+  res.clear();
+  for (size_t i = 0; i < codes.size(); ++i)
+    append(res, codes[i]);
+
+  return !res.isEmpty();
+}
+
+
+MainManager::MainManager(QString const & outDir)
+  : m_downloader(m_log), m_parser(m_log), m_outDir(outDir), m_index(0)
+{
+}
+
+/// Returns the two-letter code for the few languages recognized by name
+/// (case-insensitive substring match), or 0 if @p name matches none of them.
+char const * MainManager::LangNameToCode(QString const & name)
+{
+  if (name.contains("English", Qt::CaseInsensitive)) return "en";
+  if (name.contains("Spanish", Qt::CaseInsensitive)) return "es";
+  if (name.contains("French", Qt::CaseInsensitive)) return "fr";
+  if (name.contains("Mandarin", Qt::CaseInsensitive)) return "zh";
+  return 0;
+}
+
+/// Reads the country list (one name per line) and starts downloading the pages.
+/// @return false if @p file can't be opened; nothing is started in that case.
+bool MainManager::ProcessCountryList(QString const & file)
+{
+  std::ifstream s(QFile::encodeName(file).constData());
+  if (!s.is_open() || !s.good())
+  {
+    m_log.Print(Logging::ERROR, QString("Can't open file: ") + file);
+    return false;
+  }
+
+  // std::getline has no line length limit, unlike a fixed-size buffer which
+  // silently stops the whole loop at the first over-long line.
+  std::string line;
+  while (std::getline(s, line))
+  {
+    // Tolerate CRLF line endings.
+    if (!line.empty() && line[line.size() - 1] == '\r')
+      line.erase(line.size() - 1);
+
+    if (!line.empty())
+      m_countries.push_back(Country(line.c_str()));
+  }
+
+  m_downloader.ConnectFinished(this, SLOT(countryDownloaded(QString const &)));
+
+  m_index = 0;
+  ProcessNextCountry();
+  return true;
+}
+
+namespace
+{
+  /// Turns a name from the country list into a Wikipedia page name (in place).
+  void get_country_url(QString & name)
+  {
+    int const i = name.indexOf('_');
+    if (i != -1)
+      name = name.mid(0, i); // for regions return country name
+
+    name.replace(' ', '_'); // make correct wiki url
+  }
+}
+
+/// Downloads the page of the current country, or switches to the language
+/// pass once all countries have been processed.
+void MainManager::ProcessNextCountry()
+{
+  if (m_index >= m_countries.size())
+  {
+    m_downloader.ConnectFinished(this, SLOT(languageDownloaded(QString const &)));
+
+    m_index = 0;
+    ProcessNextLanguage();
+    return;
+  }
+
+  QString url = m_countries[m_index].m_name;
+  get_country_url(url);
+
+  m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + url);
+}
+
+namespace
+{
+  /// Functor for TokenizeString: stores the code of every recognized language name.
+  class append_result
+  {
+    MainManager & m_manager;
+  public:
+    append_result(MainManager & m) : m_manager(m) {}
+    void operator() (QString const & s)
+    {
+      char const * code = m_manager.LangNameToCode(s);
+      if (code)
+        m_manager.AppendResult(code);
+    }
+  };
+
+  /// Walks the language links of a table cell: either its direct <a> children
+  /// or, if there are none, the <li> items of its first <ul>.
+  class nodes_iterator
+  {
+    QDomElement m_node;
+    bool m_isList;
+
+  public:
+    nodes_iterator(QDomElement const & root) : m_isList(false)
+    {
+      // process single elements ...
+      m_node = root.firstChildElement("a");
+      if (m_node.isNull())
+      {
+        // ... or compound list
+        m_node = root.firstChildElement("ul");
+        if (!m_node.isNull())
+        {
+          m_node = m_node.firstChildElement("li");
+          m_isList = true;
+        }
+      }
+    }
+
+    bool valid() const { return !m_node.isNull(); }
+
+    QDomElement get() const
+    {
+      return (m_isList ? m_node.firstChildElement("a") : m_node);
+    }
+
+    void next()
+    {
+      m_node = m_node.nextSiblingElement(m_isList ? "li" : "a");
+    }
+  };
+}
+
+/// Extracts languages from the <td> that follows @p entry in @p xml: known
+/// names become codes right away, other links are queued for the language pass.
+void MainManager::ProcessLangEntry(QString const & xml, QString const & entry)
+{
+  if (m_parser.InitSubDOM(xml, entry, "td"))
+  {
+    nodes_iterator it(m_parser.Root());
+
+    if (!it.valid())
+    {
+      // try to get language from root node
+      TokenizeString(m_parser.Root().text(), ", ", append_result(*this));
+    }
+
+    // iterate through child nodes
+    while (it.valid())
+    {
+      QDomElement e = it.get();
+
+      char const * code = LangNameToCode(e.text());
+      if (code)
+      {
+        AppendResult(code);
+      }
+      else
+      {
+        QString const url = e.attribute("href");
+        if (!url.isEmpty())
+          AppendLangUrl(url);
+        else
+          m_log.Print(Logging::WARNING, QString("Undefined language without url: ") + e.text());
+      }
+
+      it.next();
+    }
+  }
+}
+
+void MainManager::countryDownloaded(QString const & s)
+{
+  ProcessLangEntry(s, "Official language(s)");
+  ProcessLangEntry(s, "National language");
+
+  ++m_index;
+  ProcessNextCountry();
+}
+
+void MainManager::AppendResult(QString const & code)
+{
+  m_countries[m_index].AddCode(code);
+}
+
+/// Registers the language page behind @p url (only the part after the last '/'
+/// is kept) and links it to the current country.
+void MainManager::AppendLangUrl(QString url)
+{
+  {
+    int const i = url.lastIndexOf("/");
+    if (i != -1)
+      url = url.mid(i+1);
+  }
+
+  size_t index;
+  {
+    std::vector<QString>::iterator i = std::find(m_langUrls.begin(), m_langUrls.end(), url);
+    if (i == m_langUrls.end())
+    {
+      m_langUrls.push_back(url);
+      index = m_langUrls.size()-1;
+    }
+    else
+      index = std::distance(m_langUrls.begin(), i);
+  }
+
+  m_countries[m_index].AddUrl(index);
+}
+
+/// Downloads the page of the current language, or writes the results and
+/// terminates the process once all language pages have been handled.
+void MainManager::ProcessNextLanguage()
+{
+  if (m_index >= m_langUrls.size())
+  {
+    CreateResultFiles();
+    m_log.Print(Logging::INFO, "Done!");
+
+    std::exit(0);
+  }
+
+  m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + m_langUrls[m_index]);
+}
+
+/// Looks for the <tt> text in the <td> that follows @p entry and, if found,
+/// stores it in place of the current language url.
+bool MainManager::ProcessCodeEntry(QString const & xml, QString const & entry)
+{
+  if (m_parser.InitSubDOM(xml, entry, "td"))
+  {
+    QDomElement e = m_parser.Root().firstChildElement("tt");
+    if (!e.isNull())
+    {
+      QString const name = e.text();
+      if (!name.isEmpty())
+      {
+        m_langUrls[m_index] = name;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void MainManager::languageDownloaded(QString const & s)
+{
+  // Try ISO 639-1 first, then 639-2, then 639-3; evaluation stops at the first hit.
+  bool const found = ProcessCodeEntry(s, "ISO 639-1") ||
+                     ProcessCodeEntry(s, "ISO 639-2") ||
+                     ProcessCodeEntry(s, "ISO 639-3");
+  if (!found)
+  {
+    m_log.Print(Logging::WARNING, QString("Can't find code for url: ") + m_langUrls[m_index]);
+    m_langUrls[m_index] = QString();
+  }
+
+  ++m_index;
+  ProcessNextLanguage();
+}
+
+/// Writes "<outDir><country>.meta" for every country that has at least one code.
+void MainManager::CreateResultFiles()
+{
+  m_log.Print(Logging::INFO, "Results:");
+
+  for (size_t i = 0; i < m_countries.size(); ++i)
+  {
+    QString s;
+    if (m_countries[i].GetResult(s, *this))
+    {
+      QFile f(m_outDir + m_countries[i].m_name + QString(".meta"));
+      if (f.open(QFile::WriteOnly))
+        f.write(s.toUtf8());
+      else
+        m_log.Print(Logging::ERROR, QString("Can't write file: ") + f.fileName());
+    }
+    else
+      m_log.Print(Logging::WARNING, QString("No languages for country: ") + m_countries[i].m_name);
+  }
+}
diff --git a/lang_getter/mainmanager.h b/lang_getter/mainmanager.h
new file mode 100644
index 0000000000..5c80403ff1
--- /dev/null
+++ b/lang_getter/mainmanager.h
@@ -0,0 +1,74 @@
+#pragma once
+#include "logging.h"
+#include "pagedownloader.h"
+#include "stringparser.h"
+
+#include <QObject>
+#include <QString>
+
+#include <cstddef>
+#include <vector>
+
+
+/// Drives the whole tool: downloads the Wikipedia page of every country from
+/// the list, extracts its languages, resolves language pages to ISO 639 codes
+/// and finally writes one ".meta" file per country.
+class MainManager : public QObject
+{
+  Q_OBJECT
+
+  Logging m_log;
+  PageDownloader m_downloader;
+  ContentParser m_parser;
+
+  QString m_outDir;
+
+  class Country
+  {
+    std::vector<QString> m_codes;     ///< codes recognized directly by language name
+    std::vector<size_t> m_langUrls;   ///< indexes into MainManager::m_langUrls
+
+  public:
+    QString m_name;
+
+    explicit Country(char const * name) : m_name(name) {}
+
+    void AddCode(QString const & code);
+    void AddUrl(size_t url);
+
+    bool GetResult(QString & res, MainManager const & m) const;
+  };
+
+  std::vector<Country> m_countries;
+  /// Language page names; each entry is replaced by its code (or emptied)
+  /// during the language pass.
+  std::vector<QString> m_langUrls;
+
+  /// Index of the country or language currently being downloaded.
+  size_t m_index;
+
+public:
+  explicit MainManager(QString const & outDir);
+
+  /// @return false if the list file can't be opened.
+  bool ProcessCountryList(QString const & file);
+
+protected:
+  void ProcessNextCountry();
+  void ProcessLangEntry(QString const & xml, QString const & entry);
+
+public: // need for functor
+  char const * LangNameToCode(QString const & name);
+  void AppendResult(QString const & code);
+protected:
+  void AppendLangUrl(QString url);
+
+  void ProcessNextLanguage();
+  bool ProcessCodeEntry(QString const & xml, QString const & entry);
+
+  void CreateResultFiles();
+
+private slots:
+  void countryDownloaded(QString const & s);
+  void languageDownloaded(QString const & s);
+};
diff --git a/lang_getter/pagedownloader.cpp b/lang_getter/pagedownloader.cpp
new file mode 100644
index 0000000000..87e089e13d
--- /dev/null
+++ b/lang_getter/pagedownloader.cpp
@@ -0,0 +1,77 @@
+#include "pagedownloader.h"
+#include "logging.h"
+
+#include <QNetworkReply>
+#include <QNetworkRequest>
+#include <QUrl>
+
+
+/// Makes @p slot of @p obj the only receiver of finished(QString const &).
+void PageDownloader::ConnectFinished(QObject * obj, char const * slot)
+{
+  disconnect(SIGNAL(finished(QString const &)));
+  connect(this, SIGNAL(finished(QString const &)), obj, slot);
+}
+
+/// Starts an asynchronous GET of @p url; finished() is emitted with the page text.
+void PageDownloader::Download(QUrl const & url)
+{
+  m_res.clear();
+
+  m_reply = m_manager.get(QNetworkRequest(url));
+  connect(m_reply, SIGNAL(finished()), this, SLOT(httpFinished()));
+  connect(m_reply, SIGNAL(readyRead()), this, SLOT(httpReadyRead()));
+  connect(m_reply, SIGNAL(downloadProgress(qint64,qint64)), this,
+          SLOT(updateDataReadProgress(qint64,qint64)));
+}
+
+void PageDownloader::Download(QString const & url)
+{
+  Download(QUrl(url));
+}
+
+/// Logs the outcome, releases the reply and emits finished() with the body
+/// decoded as UTF-8 (an empty string means the download failed).
+void PageDownloader::httpFinished()
+{
+  // Pass the size explicitly: the char-pointer-only overload stops at the first
+  // NUL byte and would silently truncate such a page.
+  QString const s = QString::fromUtf8(m_res.constData(), m_res.size());
+  QString const url = m_reply->url().toString();
+
+  if (m_reply->error() != QNetworkReply::NoError)
+  {
+    m_log.Print(Logging::WARNING, QString("Network error for ") +
+                                  url + QString(": ") + m_reply->errorString());
+  }
+
+  if (s.isEmpty())
+  {
+    m_log.Print(Logging::WARNING, QString("Downloading of ") +
+                                  url +
+                                  QString(" failed."));
+  }
+  else
+  {
+    m_log.Print(Logging::INFO, QString("Downloading of ") +
+                               url +
+                               QString(" finished successfully."));
+  }
+
+  // Reset m_reply before emitting: the connected slot normally starts the next
+  // download right away and assigns a new reply.
+  m_reply->deleteLater();
+  m_reply = 0;
+
+  emit finished(s);
+}
+
+void PageDownloader::httpReadyRead()
+{
+  m_res += m_reply->readAll();
+}
+
+void PageDownloader::updateDataReadProgress(qint64 read, qint64 total)
+{
+  m_log.Percent(read, total);
+}
diff --git a/lang_getter/pagedownloader.h b/lang_getter/pagedownloader.h
new file
mode 100644
index 0000000000..8884818c57
--- /dev/null
+++ b/lang_getter/pagedownloader.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <QByteArray>
+#include <QNetworkAccessManager>
+#include <QObject>
+#include <QString>
+
+
+class Logging;
+class QUrl;
+class QNetworkReply;
+
+/// Downloads one page at a time and reports its text through finished().
+class PageDownloader : public QObject
+{
+  Q_OBJECT
+
+  QNetworkAccessManager m_manager;
+  QNetworkReply * m_reply; ///< reply of the download in progress, 0 when idle
+
+  Logging & m_log;
+
+  QByteArray m_res; ///< bytes received so far for the current download
+
+public:
+  explicit PageDownloader(Logging & log) : m_reply(0), m_log(log) {}
+
+  /// Routes finished() to @p slot of @p obj, replacing any previous receiver.
+  void ConnectFinished(QObject * obj, char const * slot);
+
+  void Download(QUrl const & url);
+  void Download(QString const & url);
+
+signals:
+  void finished(QString const &);
+
+private slots:
+  void httpFinished();
+  void httpReadyRead();
+  void updateDataReadProgress(qint64 read, qint64 total);
+};
diff --git a/lang_getter/release/lang_getter b/lang_getter/release/lang_getter
new file mode 100755
index 0000000000..147f9c40a1
Binary files /dev/null and b/lang_getter/release/lang_getter differ
diff --git a/lang_getter/stringparser.cpp b/lang_getter/stringparser.cpp
new file mode 100644
index 0000000000..e74530dcdc
--- /dev/null
+++ b/lang_getter/stringparser.cpp
@@ -0,0 +1,49 @@
+#include "stringparser.h"
+#include "logging.h"
+
+
+/// Loads the first <tag ...> ... </tag> fragment that follows @p entry in @p xml.
+/// On success Root() returns the fragment's root element.
+/// Note: the fragment ends at the first closing tag, so nested elements of the
+/// same kind are not supported.
+/// @return false (after logging the reason) if the entry, the opening tag or the
+/// closing tag is missing, or if the fragment is not well-formed XML.
+bool ContentParser::InitSubDOM(QString const & xml, QString const & entry, QString const & tag)
+{
+  int const i = xml.indexOf(entry);
+  if (i == -1)
+  {
+    m_log.Print(Logging::INFO, QString("Can't find entry: ") + entry);
+    return false;
+  }
+
+  // indexOf() starts at i, so a hit can never be located before the entry.
+  int const beg = xml.indexOf(QString("<") + tag, i);
+  if (beg == -1)
+  {
+    m_log.Print(Logging::INFO, QString("Can't find tag: ") + tag);
+    return false;
+  }
+
+  QString const last = QString("/") + tag + QString(">");
+  int const end = xml.indexOf(last, beg);
+  if (end == -1)
+  {
+    // Checked at run time: Q_ASSERT is compiled out of release builds, and a
+    // truncated page would otherwise produce a bogus length for mid().
+    m_log.Print(Logging::WARNING, QString("Can't find closing tag: ") + tag);
+    return false;
+  }
+
+  if (!m_doc.setContent(xml.mid(beg, end - beg + last.length())))
+  {
+    m_log.Print(Logging::ERROR, QString("QDomDocument::setContent error"));
+    return false;
+  }
+
+  m_node = m_doc.documentElement();
+  Q_ASSERT ( !m_node.isNull() );
+  Q_ASSERT ( m_node.tagName() == tag );
+
+  return true;
+}
diff --git a/lang_getter/stringparser.h b/lang_getter/stringparser.h
new file mode 100644
index 0000000000..adbbb991de
--- /dev/null
+++ b/lang_getter/stringparser.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <QDomDocument>
+#include <QDomElement>
+#include <QString>
+
+
+class Logging;
+
+/// Extracts a small XML fragment from a downloaded page and exposes it as a DOM.
+class ContentParser
+{
+  Logging & m_log;
+
+  QDomDocument m_doc;
+  QDomElement m_node;
+
+public:
+  explicit ContentParser(Logging & log) : m_log(log) {}
+
+  bool InitSubDOM(QString const & xml, QString const & entry, QString const & tag);
+  QDomElement Root() const { return m_node; }
+};
+
+/// Splits @p s at every character contained in @p delim and calls @p toDo with
+/// each non-empty token, in order.
+template <class ToDo>
+void TokenizeString(QString const & s, QString const & delim, ToDo toDo)
+{
+  int beg = 0;
+  int i = 0;
+  for (; i < s.length(); ++i)
+  {
+    if (delim.indexOf(s[i]) != -1)
+    {
+      if (i > beg)
+        toDo(s.mid(beg, i-beg));
+      beg = i+1;
+    }
+  }
+
+  if (i > beg)
+    toDo(s.mid(beg, i-beg));
+}