I wanted to share a cool code mashup I put together to show some statistics about Err usage around the world.
Here is the result, in blue, obtained by some rough and unreliable homebrew data mining of the Apache logs:
 
I applied this tip
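As dependencies, you are going to need: matplotlib with the Basemap toolkit, numpy, and the GeoIP Python bindings together with the GeoLiteCity.dat database.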
Here is the full snippet:
#! /usr/bin/python
import matplotlib
matplotlib.use('Qt4Agg')
from datetime import datetime
import os
# Directory holding the Apache access logs to mine.
DIR = 'data/'
dirList=os.listdir(DIR)
# Transposes a sequence of tuples: [(a, 1), (b, 2)] -> ((a, b), (1, 2)).
unzip = lambda l:tuple(zip(*l))

# Maps the date of each log file (a datetime) to a dict of
# {client address: number of 'err/version' requests} found in that file.
all_dates = {}
for fname in dirList:
    # Only the access logs are of interest; ignore anything else in DIR.
    if not fname.startswith('gootz-access'):
        continue
    counts = {}
    # The date is the second dot-separated component of the file name and
    # must be formatted as YYYY-MM-DD (strptime raises ValueError otherwise).
    d = datetime.strptime(fname.split('.')[1], '%Y-%m-%d')
    with open(os.path.join(DIR, fname)) as f:
        # Iterate the file lazily rather than loading every line up front.
        for line in f:
            if 'err/version' not in line:
                continue
            # Everything before the first '-' is taken as the client address.
            # NOTE(review): a client logged as a hyphenated hostname would be
            # truncated here -- confirm the logs only contain IP addresses.
            addr = line.split('-')[0].strip()
            # dict.get replaces the has_key() test, which Python 3 removed.
            counts[addr] = counts.get(addr, 0) + 1
    all_dates[d] = counts
sorted_dates = sorted(all_dates)
import GeoIP
# City-level GeoIP database used to turn client addresses into coordinates.
gi = GeoIP.open('GeoLiteCity.dat',GeoIP.GEOIP_STANDARD)

# Both dicts map a (longitude, latitude) pair to the number of per-day
# address entries located at that position.  An address seen more than
# 5 times within one log file is counted in devs_pos, the others in
# insts_pos.
insts_pos = {}
devs_pos = {}
for date in sorted_dates:
    # items() instead of iteritems(): the latter does not exist in Python 3.
    for ip, count in all_dates[date].items():
        record = gi.record_by_name(ip)
        # Skip addresses without a record or with a falsy latitude.
        # NOTE(review): a latitude of exactly 0 is skipped as well --
        # presumably that marks an unknown location; confirm.
        if not record or not record['latitude']:
            continue
        lon = record['longitude']
        lat = record['latitude']
        toinc = devs_pos if count > 5 else insts_pos
        # dict.get replaces the has_key() test, which Python 3 removed.
        toinc[(lon, lat)] = toinc.get((lon, lat), 0) + 1
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
# Robinson projection with a central longitude of 0; resolution 'c'
# selects the crude coastline data set.
m = Basemap(resolution='c', projection='robin', lon_0=0)

# Colours and line width shared by the drawing calls below.
water_colour = '#85A6D9'
outline_colour = '#6D5F47'
grid_colour = '#bbbbbb'
outline_width = .4

# Background first, then the continents (lakes use the water colour).
m.drawmapboundary(fill_color=water_colour)
m.fillcontinents(lake_color=water_colour, color='white')

# Coastlines, then country boundaries, drawn with the same thin outline.
for draw_outline in (m.drawcoastlines, m.drawcountries):
    draw_outline(linewidth=outline_width, color=outline_colour)

# Latitude/longitude grid with one line every 30 degrees.
grid_step = 30
m.drawmeridians(np.arange(-180, 180, grid_step), color=grid_colour)
m.drawparallels(np.arange(-90, 90, grid_step), color=grid_colour)
def _plot_usage(basemap, positions, colour, zorder):
    """Draw one labelled bubble per location on the map.

    basemap   -- the Basemap instance to draw on
    positions -- dict mapping (longitude, latitude) to a count
    colour    -- colour used for both the bubbles and their labels
    zorder    -- zorder of the bubbles; labels are drawn at zorder + 1
    """
    if not positions:
        # Nothing to project or draw for an empty data set.
        return

    # Take a single snapshot of the dict so that coordinates and counts
    # are guaranteed to line up.
    entries = list(positions.items())
    lngs = [pos[0] for pos, _ in entries]
    lats = [pos[1] for pos, _ in entries]
    counts = [count for _, count in entries]

    # Calling the Basemap instance converts lon/lat to map coordinates.
    xs, ys = basemap(lngs, lats)

    basemap.scatter(
        xs,
        ys,
        s=[count * count for count in counts],  # marker size: count squared
        c=colour,
        marker='o',
        alpha=0.25,  # transparency
        zorder=zorder,
    )

    # Write the count itself in the middle of each bubble.
    for count, x, y in zip(counts, xs, ys):
        plt.text(
            x,
            y,
            str(count),
            color=colour,
            size='small',
            horizontalalignment='center',
            verticalalignment='center',
            zorder=zorder + 1,
        )


# Blue bubbles for insts_pos first, red bubbles for devs_pos on top.
_plot_usage(m, insts_pos, 'blue', 2)
_plot_usage(m, devs_pos, 'red', 4)

# Add a title and display the map on screen.
plt.title('From where Err is used.')
plt.show()
 
No comments:
Post a Comment