Tools for calculate statistic about downloaded countries

This commit is contained in:
Sergey Yershov 2014-12-23 18:44:11 +03:00 committed by Alex Zolotarev
parent ac622b6dc3
commit e761162e1b
7 changed files with 86 additions and 0 deletions

View file

@ -0,0 +1,43 @@
#!/usr/bin/env python
#coding: utf-8
from collections import defaultdict
import sys
import datetime
# Distinct user keys, bucketed as
# result['<year>_<month>_<type>'][from_country][req_country] -> set of keys.
result = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))


def print_result():
    """Print one semicolon-separated line per aggregated bucket.

    Each line holds: year, zero-padded month, source country, requested
    country, request type and the number of distinct user keys collected.
    """
    for bucket in result:
        year, month, req_type = bucket.split('_')
        for from_country, requested in result[bucket].items():
            for req_country, users in requested.items():
                row = (year, int(month), from_country, req_country,
                       req_type, len(users))
                print('{};{:02d};{};{};{};{}'.format(*row))
def parse_record(rec):
    """Turn one pipe-separated input line into its aggregation keys.

    The line is stripped and split on '|'.  Field 0 is the source country,
    field 2 a timestamp such as ``23/Dec/2014:18:44:11``, field 3 the user
    id and field 4 the requested item, of which only the text before the
    first '_' is kept.

    Returns a tuple ``(date_key, from_country, req_country, user_key)``
    where ``date_key`` is ``'<year>_<month>_<type>'`` and ``user_key`` is
    ``'<user_id>_<req_country>'``.

    Raises IndexError when the line has fewer than five fields and
    ValueError when the timestamp does not match the expected format.
    """
    parts = rec.strip().split('|')
    # Exactly six fields with '.routing' in the last one is typed 'R';
    # every other record is typed 'M'.
    req_type = 'R' if len(parts) == 6 and parts[5] == '.routing' else 'M'
    from_country = parts[0]
    date = datetime.datetime.strptime(parts[2], '%d/%b/%Y:%H:%M:%S')
    user_id = parts[3]
    req_country = parts[4].split('_')[0]
    date_key = '{}_{}_{}'.format(date.year, date.month, req_type)
    user_key = '{}_{}'.format(user_id, req_country)
    return date_key, from_country, req_country, user_key


def main():
    """Aggregate every record read from stdin, then print the totals.

    Malformed lines are skipped.  The totals gathered so far are printed
    however the loop ends: normally, on Ctrl-C (which then exits with
    status 0), or on an unexpected error (which still propagates).
    """
    try:
        with sys.stdin as lines:
            for rec in lines:
                try:
                    date_key, from_country, req_country, user_key = \
                        parse_record(rec)
                except (IndexError, ValueError):
                    # Only parse failures are ignored; a bare except here
                    # would also swallow KeyboardInterrupt and SystemExit.
                    continue
                result[date_key][from_country][req_country].add(user_key)
    except KeyboardInterrupt:
        # Ctrl-C is an accepted way to stop: report below and end normally.
        pass
    finally:
        print_result()


if __name__ == '__main__':
    main()

View file

@ -0,0 +1 @@
# Run the whole pipeline: stdin is fed to the first stage, each stage's
# output becomes the next stage's input, and the last stage writes to stdout.
./linezier |
    ./remover |
    ./columnizer |
    ./resolver |
    ./aggregator

View file

@ -0,0 +1 @@
# Turn each input line into '|'-separated columns, then drop repeated lines.
# sed replaces the first occurrence per line of ' [', of '] "', of '" ' and
# of '.mwm' with '|' (in that order).
# awk prints a line only the first time that exact line is seen.
sed 's/ \[/|/;s/\] "/|/;s/" /|/;s/\.mwm/|/' | awk '!x[$0]++'

View file

@ -0,0 +1 @@
# Fold tab-indented continuation lines into the line before them, then keep
# only the lines that contain 'COUNTRY:'.
# sed: append the next input line (N, skipped on the last line); if the
# pattern space holds a newline followed by a tab, replace that pair with
# ' | ' and loop back to :a; otherwise print and delete the first line (P;D).
sed -e :a -e '$!N;s/\n\t/ | /;ta;' -e 'P;D' | grep -e 'COUNTRY:'

View file

@ -0,0 +1 @@
# Strip unwanted fragments from each line; every substitution applies to the
# first match on the line, in this order:
#   1. ' "<letters, digits, '/', '.', '_' or spaces>" '  -> single space
#   2. ' - - '                                            -> single space
#   3. ' -0' + any one character + '00'                   -> removed
#      (the '.' is unescaped, so it matches any character)
#   4. ' ' + 3 chars + ' ' + 3 chars + ' -'               -> single space
#   5. from the first '| ' through the last ' COUNTRY:'   -> removed
#   6. from the first ' |' to the end of the line         -> removed
sed 's/ "[a-zA-Z0-9/._ ]*" / /1;s/ - - / /;s/ -0.00//;s/ ... ... -/ /;s/| \(.*\) COUNTRY://;s/ |\(.*\)$//'

View file

@ -0,0 +1,8 @@
Requires Python 2.7 and the following additional modules:
geoip2
ipaddr
maxminddb
Also required: the IP-to-geo database from http://dev.maxmind.com/geoip/legacy/geolite/
GeoLite2-Country.mmdb

View file

@ -0,0 +1,31 @@
#!/usr/bin/env python
#coding: utf-8
import geoip2.database
import sys
from collections import defaultdict
def resolve_record(reader, rec):
    """Prefix one pipe-separated record with the country of its IP address.

    ``rec`` is stripped and split on '|'; the first field is looked up with
    ``reader.country(ip)``.  The returned string is the country name, a '|',
    and the original fields joined with '|'.

    The name is 'Unknown' when the reader raises AddressNotFoundError or
    reports no country name (previously the literal text 'None' was
    emitted in the latter case).  Any other error raised by the reader
    propagates to the caller.
    """
    fields = rec.strip().split('|')
    ip = fields[0]
    try:
        from_country = reader.country(ip).country.name
    except geoip2.errors.AddressNotFoundError:
        from_country = None
    if from_country is None:
        from_country = 'Unknown'
    elif not isinstance(from_country, str):
        # NOTE(review): under Python 2 the library presumably returns a
        # unicode object; formatting a non-ASCII one into a byte string
        # raises, which used to drop the record silently.  Encode it first.
        from_country = from_country.encode('utf-8')
    return '{}|{}'.format(from_country, '|'.join(fields))


def main():
    """Read records from stdin and print each one with its country prefix.

    Records whose lookup fails are skipped.  Ctrl-C exits with status 0.
    """
    reader = geoip2.database.Reader('./GeoLite2-Country.mmdb')
    try:
        with sys.stdin as lines:
            for rec in lines:
                try:
                    line = resolve_record(reader, rec)
                except Exception:
                    # Best effort per record: the lookup's full error set is
                    # defined by the library, so skip whatever it raises.
                    # Unlike a bare except, this lets KeyboardInterrupt and
                    # SystemExit through.
                    continue
                # Printing sits outside the try so that output errors (for
                # example a closed pipe) are not silently ignored.
                print(line)
    except KeyboardInterrupt:
        sys.exit(0)


if __name__ == '__main__':
    main()