Add a language getter utility that uses Wikipedia as its data source.

This commit is contained in:
vng 2011-11-18 21:22:44 +03:00 committed by Alex Zolotarev
parent 53322f4a86
commit ad54bdeecc
12 changed files with 629 additions and 0 deletions

BIN
lang_getter/debug/lang_getter Executable file

Binary file not shown.

View file

@ -0,0 +1,28 @@
#-------------------------------------------------
#
# Project created by QtCreator 2011-11-18T08:50:14
#
# qmake project for the lang_getter console utility.
#
#-------------------------------------------------
# Qt modules: core, network and xml are added; gui is removed.
QT += core network xml
QT -= gui
TARGET = lang_getter
# Console application; app_bundle is removed from CONFIG.
CONFIG += console
CONFIG -= app_bundle
TEMPLATE = app
# Translation units compiled into the target.
SOURCES += main.cpp \
pagedownloader.cpp \
logging.cpp \
mainmanager.cpp \
stringparser.cpp
# Headers belonging to the project.
HEADERS += \
pagedownloader.h \
logging.h \
mainmanager.h \
stringparser.h

30
lang_getter/logging.cpp Normal file
View file

@ -0,0 +1,30 @@
#include "logging.h"
#include <iostream>
using namespace std;
// Default constructor; the body is empty.
Logging::Logging()
{
}
// Writes "<status> <message>" as a single line to standard output and
// flushes the stream.
void Logging::Print(STATUS s, QString const & msg)
{
  std::string const label = StatusToString(s).toStdString();
  std::string const text = msg.toStdString();
  cout << label << " " << text << endl;
}
// Progress notification hook. The body is currently empty, so both
// arguments (curr, total) are ignored.
void Logging::Percent(qint64 curr, qint64 total)
{
}
// Maps a STATUS value to its printable name; any value outside the
// known enumerators yields "NONE".
QString Logging::StatusToString(STATUS s) const
{
  if (s == INFO)
    return "INFO";
  if (s == WARNING)
    return "WARNING";
  if (s == ERROR)
    return "ERROR";
  return "NONE";
}

16
lang_getter/logging.h Normal file
View file

@ -0,0 +1,16 @@
#pragma once
#include <QString>
/// Minimal logger: formats a status label plus a message.
class Logging
{
public:
  /// Severity label attached to a printed message.
  enum STATUS
  {
    INFO,
    WARNING,
    ERROR
  };

  Logging();

  /// Prints the textual form of @p s followed by @p msg.
  void Print(STATUS s, QString const & msg);

  /// Progress notification taking the current and total amounts.
  void Percent(qint64 curr, qint64 total);

  /// Returns the printable name of @p s.
  QString StatusToString(STATUS s) const;
};

13
lang_getter/main.cpp Normal file
View file

@ -0,0 +1,13 @@
#include <QtCore/QCoreApplication>
#include "mainmanager.h"
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
MainManager manager("/Users/alena/omim/omim/data/metainfo/");
manager.ProcessCountryList("/Users/alena/omim/omim/data/polygons.lst");
return a.exec();
}

304
lang_getter/mainmanager.cpp Normal file
View file

@ -0,0 +1,304 @@
#include "mainmanager.h"
#include <QFile>
#include <algorithm>
#include <fstream>
#include <string>
using namespace std;
// Records a language code for this country, ignoring duplicates.
void MainManager::Country::AddCode(QString const & code)
{
  bool const alreadyKnown =
      find(m_codes.begin(), m_codes.end(), code) != m_codes.end();
  if (alreadyKnown)
    return;

  m_codes.push_back(code);
}
// Records an index into the manager's language-url list, ignoring duplicates.
void MainManager::Country::AddUrl(size_t url)
{
  bool const alreadyKnown =
      find(m_langUrls.begin(), m_langUrls.end(), url) != m_langUrls.end();
  if (alreadyKnown)
    return;

  m_langUrls.push_back(url);
}
namespace
{
  // Appends s to res; entries after the first are separated by '|'.
  void append(QString & res, QString const & s)
  {
    if (!res.isEmpty())
    {
      res = res + "|" + s;
    }
    else
    {
      res = s;
    }
  }
}
// Builds the '|'-separated list of language codes for this country.
// Directly collected codes come first, followed by the non-empty entries
// of m.m_langUrls referenced by this country's url indices.
// Returns false when the resulting string is empty.
bool MainManager::Country::GetResult(QString & res, MainManager const & m) const
{
  res.clear();

  typedef std::vector<QString>::const_iterator CodeIter;
  for (CodeIter it = m_codes.begin(); it != m_codes.end(); ++it)
    append(res, *it);

  typedef std::vector<size_t>::const_iterator UrlIter;
  for (UrlIter it = m_langUrls.begin(); it != m_langUrls.end(); ++it)
  {
    QString const & resolved = m.m_langUrls[*it];
    if (resolved.isEmpty())
      continue; // empty entries are skipped
    append(res, resolved);
  }

  return !res.isEmpty();
}
// Hands the shared logger to the downloader and the parser and stores
// the output directory prefix.
MainManager::MainManager(QString const & outDir)
  : m_downloader(m_log)
  , m_parser(m_log)
  , m_outDir(outDir)
{
}
// Returns the code of the first known language whose name occurs in
// `name` (case-insensitive substring match), or 0 when none matches.
char const * MainManager::LangNameToCode(QString const & name)
{
  struct KnownLanguage
  {
    char const * m_title;
    char const * m_code;
  };

  // Order matters: the first matching row wins.
  static KnownLanguage const known[] =
  {
    { "English",  "en" },
    { "Spanish",  "es" },
    { "French",   "fr" },
    { "Mandarin", "zh" }
  };

  size_t const count = sizeof(known) / sizeof(known[0]);
  for (size_t i = 0; i < count; ++i)
  {
    if (name.contains(known[i].m_title, Qt::CaseInsensitive))
      return known[i].m_code;
  }
  return 0;
}
// Reads the list of countries (one name per line) from `file`, then starts
// the asynchronous per-country download chain.
//
// Empty lines are skipped. Lines of any length are accepted: reading goes
// through std::getline into a std::string, so an over-long line no longer
// puts the stream into a failed state and silently ends the list.
// A trailing '\r' (CRLF line endings) is removed from each line.
//
// If the file cannot be opened an error is logged and nothing is started.
void MainManager::ProcessCountryList(QString const & file)
{
  std::ifstream s(file.toStdString().c_str());
  if (!s.is_open() || !s.good())
  {
    m_log.Print(Logging::ERROR, QString("Can't open file: ") + file);
    return;
  }

  std::string line;
  while (std::getline(s, line))
  {
    if (!line.empty() && line[line.size() - 1] == '\r')
      line.erase(line.size() - 1);

    if (!line.empty())
      m_countries.push_back(line.c_str());
  }

  // Country pages are handled first; countryDownloaded advances the index.
  m_downloader.ConnectFinished(this, SLOT(countryDownloaded(QString const &)));
  m_index = 0;
  ProcessNextCountry();
}
namespace
{
  // Turns a name from the country list into a wiki page name, in place:
  // everything from the first '_' on is dropped (regions map to their
  // country), then spaces become underscores.
  void get_country_url(QString & name)
  {
    int const underscore = name.indexOf('_');
    if (underscore != -1)
      name = name.left(underscore);

    name.replace(' ', '_');
  }
}
void MainManager::ProcessNextCountry()
{
if (m_index >= m_countries.size())
{
m_downloader.ConnectFinished(this, SLOT(languageDownloaded(QString const &)));
m_index = 0;
ProcessNextLanguage();
return;
}
QString url = m_countries[m_index].m_name;
get_country_url(url);
m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + url);
}
namespace
{
class append_result
{
MainManager & m_manager;
public:
append_result(MainManager & m) : m_manager(m) {}
void operator() (QString const & s)
{
char const * code = m_manager.LangNameToCode(s);
if (code)
m_manager.AppendResult(code);
}
};
// Walks the language links found directly under a DOM element.
// Two layouts are recognised:
//  - direct <a> children of the root;
//  - a <ul> child whose <li> items each contribute their first <a>.
class nodes_iterator
{
  QDomElement m_node;
  bool m_isList;

public:
  nodes_iterator(QDomElement const & root)
    : m_node(root.firstChildElement("a")), m_isList(false)
  {
    if (!m_node.isNull())
      return; // plain links found

    // No direct link: look for a compound list instead.
    QDomElement const list = root.firstChildElement("ul");
    if (list.isNull())
      return; // m_node stays null, iterator is invalid

    m_node = list.firstChildElement("li");
    m_isList = true;
  }

  // True while there is a current node.
  bool valid() const { return !m_node.isNull(); }

  // Current link element; in list mode it is the first <a> of the
  // current <li>.
  QDomElement get() const
  {
    if (m_isList)
      return m_node.firstChildElement("a");
    return m_node;
  }

  // Moves to the next sibling of the same kind.
  void next()
  {
    if (m_isList)
      m_node = m_node.nextSiblingElement("li");
    else
      m_node = m_node.nextSiblingElement("a");
  }
};
}
// Extracts languages from the <td> that follows `entry` in the page text.
// When the cell has no link nodes its plain text is tokenized on ", " and
// every recognised name is added as a code. Otherwise each link is either
// mapped to a code by its text or remembered by its href for later lookup.
void MainManager::ProcessLangEntry(QString const & xml, QString const & entry)
{
  if (!m_parser.InitSubDOM(xml, entry, "td"))
    return;

  nodes_iterator nodes(m_parser.Root());
  if (!nodes.valid())
  {
    // No child links: fall back to the text of the root node.
    TokenizeString(m_parser.Root().text(), ", ", append_result(*this));
    return;
  }

  for (; nodes.valid(); nodes.next())
  {
    QDomElement const link = nodes.get();
    QString const title = link.text();

    if (char const * code = LangNameToCode(title))
    {
      AppendResult(code);
      continue;
    }

    QString const url = link.attribute("href");
    if (url.isEmpty())
      m_log.Print(Logging::WARNING, QString("Undefined language without url: ") + title);
    else
      AppendLangUrl(url);
  }
}
// Slot: receives the text of a downloaded country page, scans the known
// infobox entries for languages and moves on to the next country.
void MainManager::countryDownloaded(QString const & s)
{
  static char const * const entries[] =
  {
    "Official language(s)",
    "National language"
  };

  size_t const count = sizeof(entries) / sizeof(entries[0]);
  for (size_t i = 0; i < count; ++i)
    ProcessLangEntry(s, entries[i]);

  ++m_index;
  ProcessNextCountry();
}
// Adds a language code to the country currently being processed.
void MainManager::AppendResult(QString const & code)
{
  Country & current = m_countries[m_index];
  current.AddCode(code);
}
// Registers a language page for the current country. Only the part of
// `url` after the last '/' is kept; it is stored once in m_langUrls and
// the country remembers its index.
void MainManager::AppendLangUrl(QString url)
{
  int const slash = url.lastIndexOf("/");
  if (slash != -1)
    url = url.mid(slash + 1);

  vector<QString>::iterator const pos = find(m_langUrls.begin(), m_langUrls.end(), url);

  // For a new entry this equals the old size, i.e. the index it gets below.
  // Computed before push_back because that may invalidate `pos`.
  size_t const index = std::distance(m_langUrls.begin(), pos);
  if (pos == m_langUrls.end())
    m_langUrls.push_back(url);

  m_countries[m_index].AddUrl(index);
}
// Starts the download of the language page at m_index. When every
// language url has been handled, writes the result files, logs completion
// and terminates the process.
void MainManager::ProcessNextLanguage()
{
  if (m_index < m_langUrls.size())
  {
    m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + m_langUrls[m_index]);
    return;
  }

  CreateResultFiles();
  m_log.Print(Logging::INFO, "Done!");
  exit(0);
}
// Looks for the <td> following `entry` and, if it has a non-empty <tt>
// child, replaces the current language url with that text.
// Returns true only when a replacement was made.
bool MainManager::ProcessCodeEntry(QString const & xml, QString const & entry)
{
  if (!m_parser.InitSubDOM(xml, entry, "td"))
    return false;

  QDomElement const codeNode = m_parser.Root().firstChildElement("tt");
  if (codeNode.isNull())
    return false;

  QString const code = codeNode.text();
  if (code.isEmpty())
    return false;

  m_langUrls[m_index] = code;
  return true;
}
// Slot: receives the text of a downloaded language page. Tries the ISO
// 639-1, 639-2 and 639-3 entries in that order; if none yields a code the
// url entry is cleared so it is ignored later. Then moves on to the next
// language.
void MainManager::languageDownloaded(QString const & s)
{
  bool const resolved = ProcessCodeEntry(s, "ISO 639-1") ||
                        ProcessCodeEntry(s, "ISO 639-2") ||
                        ProcessCodeEntry(s, "ISO 639-3");
  if (!resolved)
  {
    m_log.Print(Logging::WARNING, QString("Can't find code for url: ") + m_langUrls[m_index]);
    m_langUrls[m_index] = QString();
  }

  ++m_index;
  ProcessNextLanguage();
}
void MainManager::CreateResultFiles()
{
m_log.Print(Logging::INFO, "Results:");
for (size_t i = 0; i < m_countries.size(); ++i)
{
QString s;
if (m_countries[i].GetResult(s, *this))
{
QFile f(m_outDir + m_countries[i].m_name + QString(".meta"));
f.open(QFile::WriteOnly);
f.write(s.toStdString().c_str());
}
else
m_log.Print(Logging::WARNING, QString("No languages for country: ") + m_countries[i].m_name);
}
}

64
lang_getter/mainmanager.h Normal file
View file

@ -0,0 +1,64 @@
#pragma once
#include "logging.h"
#include "pagedownloader.h"
#include "stringparser.h"
#include <QObject>
#include <vector>
// Drives the whole run: reads the country list, downloads one page per
// country to collect language codes or language page names, downloads one
// page per collected language name to resolve it to a code, and finally
// writes one result file per country.
class MainManager : public QObject
{
Q_OBJECT
// Logger shared (by reference) with the downloader and the parser, so it
// is declared before them.
Logging m_log;
PageDownloader m_downloader;
ContentParser m_parser;
// Prefix that result file names are appended to.
QString m_outDir;
// Per-country accumulation of results.
class Country
{
// Language codes found directly on the country page (no duplicates).
std::vector<QString> m_codes;
// Indices into MainManager::m_langUrls (no duplicates).
std::vector<size_t> m_langUrls;
public:
// Name exactly as read from the country list.
QString m_name;
Country(char const * name) : m_name(name) {}
void AddCode(QString const & code);
void AddUrl(size_t url);
// Fills res with the '|'-separated codes; returns false if there are none.
bool GetResult(QString & res, MainManager const & m) const;
};
std::vector<Country> m_countries;
// Language page names; each entry is later replaced by the resolved code,
// or by an empty string when no code could be found.
std::vector<QString> m_langUrls;
// Index of the country or language entry currently being processed.
size_t m_index;
public:
MainManager(QString const & outDir);
// Loads the country list from file and starts processing.
void ProcessCountryList(QString const & file);
protected:
void ProcessNextCountry();
void ProcessLangEntry(QString const & xml, QString const & entry);
public: // needed by the append_result functor
// Returns a language code for a recognised language name, otherwise 0.
char const * LangNameToCode(QString const & name);
void AppendResult(QString const & code);
protected:
void AppendLangUrl(QString url);
void ProcessNextLanguage();
bool ProcessCodeEntry(QString const & xml, QString const & entry);
void CreateResultFiles();
private slots:
// Connected to the downloader's finished signal while countries are processed.
void countryDownloaded(QString const & s);
// Connected to the downloader's finished signal while languages are processed.
void languageDownloaded(QString const & s);
};

View file

@ -0,0 +1,62 @@
#include "pagedownloader.h"
#include "logging.h"
#include <QUrl>
#include <QNetworkReply>
// Makes `slot` of `obj` the only receiver of this object's finished
// signal: existing connections of that signal are dropped first.
void PageDownloader::ConnectFinished(QObject * obj, char const * slot)
{
  // Drop whatever was listening before ...
  this->disconnect(SIGNAL(finished(QString const &)));

  // ... and attach the new receiver.
  QObject::connect(this, SIGNAL(finished(QString const &)), obj, slot);
}
// Starts an asynchronous GET for `url`. The receive buffer is reset and
// the reply's signals are routed to this object's private slots.
void PageDownloader::Download(QUrl const & url)
{
  m_res.clear();

  QNetworkRequest const request(url);
  m_reply = m_manager.get(request);

  QObject::connect(m_reply, SIGNAL(finished()),
                   this, SLOT(httpFinished()));
  QObject::connect(m_reply, SIGNAL(readyRead()),
                   this, SLOT(httpReadyRead()));
  QObject::connect(m_reply, SIGNAL(downloadProgress(qint64,qint64)),
                   this, SLOT(updateDataReadProgress(qint64,qint64)));
}
// Convenience overload: converts the string to a QUrl and forwards it.
void PageDownloader::Download(QString const & url)
{
  QUrl const target(url);
  Download(target);
}
void PageDownloader::httpFinished()
{
QString const s = QString::fromUtf8(m_res.constData());
QString const url = m_reply->url().toString();
if (s.isEmpty())
{
m_log.Print(Logging::WARNING, QString("Downloading of ") +
url +
QString(" failed."));
}
else
{
m_log.Print(Logging::INFO, QString("Downloading of ") +
url +
QString(" finished successfully."));
}
m_reply->deleteLater();
m_reply = 0;
emit finished(s);
}
void PageDownloader::httpReadyRead()
{
m_res += m_reply->readAll();
}
// Slot: forwards the reply's download progress (bytes read, total) to
// the logger.
void PageDownloader::updateDataReadProgress(qint64 read, qint64 total)
{
m_log.Percent(read, total);
}

View file

@ -0,0 +1,37 @@
#pragma once
#include <QObject>
#include <QNetworkAccessManager>
class Logging;
class QUrl;
class QNetworkReply;
/// Downloads one page at a time and reports its text through the
/// finished() signal.
class PageDownloader : public QObject
{
  Q_OBJECT

  QNetworkAccessManager m_manager;
  /// Reply of the download in progress; 0 when there is none.
  QNetworkReply * m_reply;
  Logging & m_log;
  /// Bytes received so far for the current download.
  QByteArray m_res;

public:
  /// @param log logger used for status and progress messages; it is held
  ///            by reference.
  /// m_reply is initialised to 0 so it never holds an indeterminate value
  /// before the first Download() call.
  explicit PageDownloader(Logging & log) : m_reply(0), m_log(log) {}

  /// Makes @p slot of @p obj the sole receiver of finished().
  void ConnectFinished(QObject * obj, char const * slot);

  /// Starts an asynchronous download of @p url.
  void Download(QUrl const & url);
  void Download(QString const & url);

signals:
  /// Emitted when a download ends, carrying the page text.
  void finished(QString const &);

private slots:
  void httpFinished();
  void httpReadyRead();
  void updateDataReadProgress(qint64 read, qint64 total);
};

BIN
lang_getter/release/lang_getter Executable file

Binary file not shown.

View file

@ -0,0 +1,36 @@
#include "stringparser.h"
#include "logging.h"
// Locates `entry` in `xml`, takes the first "<tag ... /tag>" span that
// follows it and parses that span into the internal DOM document.
//
// Returns true on success; Root() then refers to the parsed element.
// Returns false (after logging) when the entry, the opening tag or the
// closing tag cannot be found, or when the span is not well-formed.
//
// The missing-closing-tag case is checked at run time rather than only
// asserted, so release builds do not call mid() with a bogus length.
bool ContentParser::InitSubDOM(QString const & xml, QString const & entry, QString const & tag)
{
  int const i = xml.indexOf(entry);
  if (i == -1)
  {
    m_log.Print(Logging::INFO, QString("Can't find entry: ") + entry);
    return false;
  }

  // indexOf searches forward from i, so a hit is never before the entry.
  int const beg = xml.indexOf(QString("<") + tag, i);
  if (beg == -1)
  {
    m_log.Print(Logging::INFO, QString("Can't find tag: ") + tag);
    return false;
  }

  QString const last = QString("/") + tag + QString(">");
  int const end = xml.indexOf(last, beg);
  if (end == -1)
  {
    m_log.Print(Logging::WARNING, QString("Can't find closing tag: ") + tag);
    return false;
  }

  if (!m_doc.setContent(xml.mid(beg, end - beg + last.length())))
  {
    m_log.Print(Logging::ERROR, QString("QDomDocument::setContent error"));
    return false;
  }

  m_node = m_doc.documentElement();
  Q_ASSERT ( !m_node.isNull() );
  Q_ASSERT ( m_node.tagName() == tag );
  return true;
}

View file

@ -0,0 +1,39 @@
#pragma once
#include <QDomDocument>
class Logging;
/// Extracts a single element from page text and exposes it as a DOM node.
class ContentParser
{
  Logging & m_log;
  QDomDocument m_doc;
  QDomElement m_node;

public:
  /// @param log logger used for diagnostics; it is held by reference.
  /// The constructor is explicit so a Logging object is never silently
  /// converted into a parser.
  explicit ContentParser(Logging & log) : m_log(log) {}

  /// Parses the first @p tag element that follows @p entry in @p xml.
  /// @return true on success; Root() is meaningful only after success.
  bool InitSubDOM(QString const & xml, QString const & entry, QString const & tag);

  /// Element produced by the last successful InitSubDOM() call.
  QDomElement Root() const { return m_node; }
};
/// Splits @p s at every character contained in @p delim and calls
/// @p toDo with each non-empty piece, in order of appearance.
template <class ToDo>
void TokenizeString(QString const & s, QString const & delim, ToDo toDo)
{
  int const len = s.length();
  int tokenStart = 0;

  // pos == len acts as a virtual delimiter that flushes the last token.
  for (int pos = 0; pos <= len; ++pos)
  {
    bool const atBoundary = (pos == len) || delim.contains(s[pos]);
    if (!atBoundary)
      continue;

    if (pos > tokenStart)
      toDo(s.mid(tokenStart, pos - tokenStart));
    tokenStart = pos + 1;
  }
}