From e761162e1b714a0d0bf89236efbd7a4eef8912c7 Mon Sep 17 00:00:00 2001
From: Sergey Yershov
Date: Tue, 23 Dec 2014 18:44:11 +0300
Subject: [PATCH] Tools for calculate statistic about downloaded countries

---
 tools/download_statistic/aggregator       | 65 +++++++++++++++++++++++
 tools/download_statistic/calc_statistic   |  1 +
 tools/download_statistic/columnizer       |  1 +
 tools/download_statistic/linezier         |  1 +
 tools/download_statistic/remover          |  1 +
 tools/download_statistic/requirements.txt |  8 +++
 tools/download_statistic/resolver         | 42 +++++++++++++++
 7 files changed, 119 insertions(+)
 create mode 100755 tools/download_statistic/aggregator
 create mode 100755 tools/download_statistic/calc_statistic
 create mode 100755 tools/download_statistic/columnizer
 create mode 100755 tools/download_statistic/linezier
 create mode 100755 tools/download_statistic/remover
 create mode 100644 tools/download_statistic/requirements.txt
 create mode 100755 tools/download_statistic/resolver

diff --git a/tools/download_statistic/aggregator b/tools/download_statistic/aggregator
new file mode 100755
index 0000000000..7b23b7b089
--- /dev/null
+++ b/tools/download_statistic/aggregator
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+#coding: utf-8
+"""Aggregate resolved download records into monthly unique-user counts.
+
+Reads '|'-separated records from stdin, as produced by ./resolver.  Fields
+used: [0] country the request came from, [2] timestamp in
+'%d/%b/%Y:%H:%M:%S' format, [3] user id, [4] requested country (text before
+the first '_'), [5] '.routing' for routing requests.
+
+Prints one line per bucket:
+
+    year;month;from_country;req_country;req_type;unique_users
+
+req_type is 'R' for six-field records ending in '.routing', else 'M'.
+"""
+
+from collections import defaultdict
+import sys
+import datetime
+
+# result[date_key][from_country][req_country] is the set of user keys seen
+# in that bucket; date_key is '<year>_<month>_<req_type>'.
+result = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
+
+
+def print_result():
+    """Print the number of distinct user keys collected in every bucket."""
+    for date_key, by_origin in result.items():
+        year, month, req_type = date_key.split('_')
+        for from_country, by_request in by_origin.items():
+            for req_country, users in by_request.items():
+                print('{};{:02d};{};{};{};{}'.format(
+                    year, int(month), from_country, req_country, req_type,
+                    len(users)))
+
+
+try:
+    with sys.stdin as records:
+        for rec in records:
+            try:
+                parts = rec.strip().split('|')
+                is_routing = len(parts) == 6 and parts[5] == '.routing'
+                req_type = 'R' if is_routing else 'M'
+                from_country = parts[0]
+                date = datetime.datetime.strptime(parts[2], '%d/%b/%Y:%H:%M:%S')
+                user_id = parts[3]
+                req_country = parts[4].split('_')[0]
+                date_key = '{}_{}_{}'.format(date.year, date.month, req_type)
+                user_key = '{}_{}'.format(user_id, req_country)
+                result[date_key][from_country][req_country].add(user_key)
+            except Exception:
+                # Best effort: skip records that cannot be parsed.  This is
+                # deliberately not a bare `except:`, which would also swallow
+                # KeyboardInterrupt and keep the handler below from running.
+                pass
+except KeyboardInterrupt:
+    # Interrupted by the user: report what has been aggregated so far.
+    print_result()
+    sys.exit(0)
+except:
+    # Any other failure: still emit the partial result, then propagate.
+    print_result()
+    raise
+
+print_result()
diff --git a/tools/download_statistic/calc_statistic b/tools/download_statistic/calc_statistic
new file mode 100755
index 0000000000..9775e3dc1f
--- /dev/null
+++ b/tools/download_statistic/calc_statistic
@@ -0,0 +1 @@
+./linezier | ./remover | ./columnizer | ./resolver | ./aggregator
\ No newline at end of file
diff --git a/tools/download_statistic/columnizer b/tools/download_statistic/columnizer
new file mode 100755
index 0000000000..3710833576
--- /dev/null
+++ b/tools/download_statistic/columnizer
@@ -0,0 +1 @@
+sed 's/ \[/|/;s/\] "/|/;s/" /|/;s/\.mwm/|/' | awk '!x[$0]++'
\ No newline at end of file
diff --git a/tools/download_statistic/linezier b/tools/download_statistic/linezier
new file mode 100755
index 0000000000..7503bc7e6e
--- /dev/null
+++ b/tools/download_statistic/linezier
@@ -0,0 +1 @@
+sed -e :a -e '$!N;s/\n\t/ | /;ta;' -e 'P;D' | grep -e 'COUNTRY:'
\ No newline at end of file
diff --git a/tools/download_statistic/remover b/tools/download_statistic/remover
new file mode 100755
index 0000000000..66f60e1f39
--- /dev/null
+++ b/tools/download_statistic/remover
@@ -0,0 +1 @@
+sed 's/ "[a-zA-Z0-9/._ ]*" / /1;s/ - - / /;s/ -0.00//;s/ ... ... -/ /;s/| \(.*\) COUNTRY://;s/ |\(.*\)$//'
\ No newline at end of file
diff --git a/tools/download_statistic/requirements.txt b/tools/download_statistic/requirements.txt
new file mode 100644
index 0000000000..12e35115a1
--- /dev/null
+++ b/tools/download_statistic/requirements.txt
@@ -0,0 +1,8 @@
+Required python 2.7 and additional modules:
+geoip2
+ipaddr
+maxminddb
+
+also required IP to geo database from http://dev.maxmind.com/geoip/legacy/geolite/
+
+GeoLite2-Country.mmdb
diff --git a/tools/download_statistic/resolver b/tools/download_statistic/resolver
new file mode 100755
index 0000000000..57e2e37835
--- /dev/null
+++ b/tools/download_statistic/resolver
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+#coding: utf-8
+"""Prefix every stdin record with the country its IP address resolves to.
+
+Input records are '|'-separated with the client IP in the first field.
+Each record is written to stdout as 'country|<original fields>'.  Addresses
+missing from the database, and records without a country name, are reported
+as 'Unknown'.
+"""
+
+import geoip2.database
+import geoip2.errors
+import sys
+from collections import defaultdict
+
+reader = geoip2.database.Reader('./GeoLite2-Country.mmdb')
+
+try:
+    with sys.stdin as records:
+        for rec in records:
+            try:
+                parts = rec.strip().split('|')
+                ip = parts[0]
+                try:
+                    from_country = reader.country(ip).country.name
+                except geoip2.errors.AddressNotFoundError:
+                    from_country = None
+                if from_country is None:
+                    from_country = 'Unknown'
+                elif not isinstance(from_country, str):
+                    # Python 2 yields unicode names; encode them explicitly,
+                    # otherwise str.format() fails on non-ASCII names and the
+                    # record is silently dropped by the handler below.
+                    from_country = from_country.encode('utf-8')
+
+                print('{}|{}'.format(from_country, '|'.join(parts)))
+            except Exception:
+                # Best effort: skip records that cannot be processed, but let
+                # KeyboardInterrupt through to the handler below.
+                pass
+except KeyboardInterrupt:
+    sys.exit(0)