From 70b65dd3baaee4018d793048dc81a665f477683b Mon Sep 17 00:00:00 2001 From: Alex Zolotarev Date: Thu, 26 May 2011 05:04:43 +0200 Subject: [PATCH] Updated unique char counter tool --- tools/osm_unique_char_counter/main.cpp | 140 +++++++----------- .../osm_unique_char_counter.pro | 45 ++---- 2 files changed, 66 insertions(+), 119 deletions(-) diff --git a/tools/osm_unique_char_counter/main.cpp b/tools/osm_unique_char_counter/main.cpp index 960c9a9b0e..91d1a0a7e4 100644 --- a/tools/osm_unique_char_counter/main.cpp +++ b/tools/osm_unique_char_counter/main.cpp @@ -1,17 +1,29 @@ -#include "../../indexer/xmlparser.h" -#include -#include -#include +#include "../../base/string_utils.hpp" -#include +#include "../../coding/parse_xml.hpp" + +#include "../../std/iostream.hpp" +#include "../../std/fstream.hpp" +#include "../../std/unordered_map.hpp" +#include "../../std/vector.hpp" + +#include using namespace std; template class XMLDispatcher { + bool m_fire; + bool m_k_ok; + string m_v; + TDoClass & m_doClass; + public: XMLDispatcher(TDoClass & doClass) : m_fire(false), m_k_ok(false), m_doClass(doClass) {} + + void CharData(string const &) {} + bool Push(string const & element) { if (element == "tag") @@ -28,8 +40,7 @@ public: m_v = value; } } - void Process() {} - void Pop() + void Pop(string const &) { if (m_fire) { @@ -42,110 +53,69 @@ public: m_fire = false; } } - bool m_fire; - bool m_k_ok; - string m_v; - TDoClass & m_doClass; }; -static int const KMaxXMLFileBufferSize = 65536; +typedef unordered_map CountContT; +typedef pair ElemT; -static size_t gLobalCounter = 0; - -template -bool ParseXML(XMLDispatcherT & dispatcher) +bool SortFunc(ElemT const & e1, ElemT const & e2) { - // Create the parser - XmlParser parser(dispatcher); - if (!parser.Create()) return false; - - double progress = 0.; - int const multiplier = 100; - double const step = multiplier * 1024 * 1024; - double next_progress = progress + step; - size_t mb = 0; - while (!cin.eof()) - { - char * buffer = static_cast(parser.GetBuffer(KMaxXMLFileBufferSize)); - if (buffer == 0) - return false; - - cin.read(buffer, KMaxXMLFileBufferSize); - progress += KMaxXMLFileBufferSize; - - if (progress > next_progress) - { - mb += multiplier; - cout << mb << " Mb (" << gLobalCounter << ")" << endl; - next_progress += step; - } - - if (!parser.ParseBuffer(cin.gcount(), cin.eof())) - return false; - } - - return true; + return e1.second > e2.second; } struct Counter { - Counter() + CountContT m_counter; + + void operator()(string const & utf8s) { - m_size = pow(double(2.), int(sizeof(ushort)*8)); - m_array = new long double[m_size]; - fill(m_array, m_array + m_size, (long double)(0.)); - } - ~Counter() - { - delete[] m_array; - } - void operator()(string const & utf8) - { - QString s(QString::fromUtf8(utf8.c_str(), utf8.size())); - for (int i = 0; i < s.size(); ++i) + strings::UniString us; + utf8::unchecked::utf8to32(utf8s.begin(), utf8s.end(), back_inserter(us)); + for (strings::UniString::iterator it = us.begin(); it != us.end(); ++it) { - ushort code = s[i].unicode(); - if (m_array[code] == 0) - { - ++gLobalCounter; - cout << code << " (" << gLobalCounter << ")" << endl; - } - m_array[code] += 1.; + pair found = m_counter.insert( + make_pair(*it, 1)); + if (!found.second) + ++found.first->second; } } void PrintResult() { - ofstream file("results.txt"); + // sort + typedef vector SortVecT; + SortVecT v(m_counter.begin(), m_counter.end()); + sort(v.begin(), v.end(), SortFunc); - cout << endl << "RESULTS:" << endl; - cout << "Code" << "\t" << "Count" << endl; - file << "Code\tCount\t#\tSymbol" << endl; - cout << "=========================================================" << endl; - file << "=========================================================" << endl; - for (size_t i = 0; i < m_size; ++i) + locale loc("en_US.UTF-8"); + cout.imbue(loc); + + string c; + c.resize(10); + for (size_t i = 0; i < v.size(); ++i) { - if (m_array[i] != 0.) - { - cout << i << "\t" << m_array[i] << endl; - file << i << "\t" << m_array[i] << endl; - } + c.clear(); + utf8::unchecked::append(v[i].first, back_inserter(c)); + cout << v[i].second << " " << hex << v[i].first << " " << c << endl; } - - cout << endl << "Total symbols: " << gLobalCounter << endl; - file << endl << "Total symbols: " << gLobalCounter << endl; } +}; - long double * m_array; - size_t m_size; +struct StdinReader +{ + size_t Read(char * buffer, size_t bufferSize) + { + return fread(buffer, sizeof(char), bufferSize, stdin); + } }; int main(int argc, char *argv[]) { Counter c; XMLDispatcher dispatcher(c); - ParseXML(dispatcher); + StdinReader reader; + ParseXML(reader, dispatcher); c.PrintResult(); - + return 0; } diff --git a/tools/osm_unique_char_counter/osm_unique_char_counter.pro b/tools/osm_unique_char_counter/osm_unique_char_counter.pro index e74bfb98a4..15c00e3207 100644 --- a/tools/osm_unique_char_counter/osm_unique_char_counter.pro +++ b/tools/osm_unique_char_counter/osm_unique_char_counter.pro @@ -1,46 +1,23 @@ # ----------------------------------------------------- # Project created by Alex Zolotarev 2010-01-21T13:23:29 # ----------------------------------------------------- -include(../../common.pro.include) - -QT -= gui -TARGET = osm_unique_char_counter +QT -= gui core +TARGET = osm_unique_char_counter CONFIG += console CONFIG -= app_bundle -TEMPLATE = app +TEMPLATE = app + +ROOT_DIR = ../.. +DEPENDENCIES = coding base expat + +include($$ROOT_DIR/common.pri) + # Additional include directories INCLUDEPATH *= ../../3party/expat/lib \ ../../3party/boost -# Configure intermediate and output directories -CONFIG(release, debug|release) { - BINARIES_PATH = ../../out/release - TEMP_PATH = ../../out/release/tmp/$$TARGET -} -else { - BINARIES_PATH = ../../out/debug - TEMP_PATH = ../../out/debug/tmp/$$TARGET -} -DESTDIR = $$BINARIES_PATH -OBJECTS_DIR = $$TEMP_PATH -RCC_DIR = $$TEMP_PATH -MOC_DIR = $$TEMP_PATH -UI_DIR = $$TEMP_PATH +HEADERS += ../../coding/parse_xml.hpp \ + ../../base/string_utils.hpp \ -# Configure some specific compiler options -win32-msvc2008 { - QMAKE_CFLAGS_DEBUG += /Fd$${DESTDIR}/$${TARGET}.pdb - QMAKE_CXXFLAGS_DEBUG += /Fd$${DESTDIR}/$${TARGET}.pdb - QMAKE_LFLAGS += /PDB:$${DESTDIR}/$${TARGET}.pdb -} - -# Configure library dependencies for all libraries in LIBS -PRE_TARGETDEPS = $$BINARIES_PATH/$${LIB_PREFIX}expat$$LIB_EXT - -LIBS += -L$$BINARIES_PATH \ - -lexpat - -HEADERS += ../../indexer/xmlparser.h - SOURCES += main.cpp