Supybot-plugins/Glob2Chan/pygeoip.py

464 lines
14 KiB
Python

#!/usr/bin/env python2.5
'''
Pure Python reader for GeoIP Country Edition databases.
'''
__author__ = 'David Wilson <dw@botanicus.net>'
import os
import sys
import struct
if sys.version_info[0] >= 3:
from io import BytesIO
else:
from cBytesIO import BytesIO as BytesIO
#
# Constants.
#
# From GeoIP.h.
SEGMENT_RECORD_LENGTH = 3
STANDARD_RECORD_LENGTH = 3
ORG_RECORD_LENGTH = 4
MAX_RECORD_LENGTH = 4
FULL_RECORD_LENGTH = 50
NUM_DB_TYPES = 20
GEOIP_COUNTRY_EDITION = 1
GEOIP_REGION_EDITION_REV0 = 7
GEOIP_CITY_EDITION_REV0 = 6
GEOIP_ORG_EDITION = 5
GEOIP_ISP_EDITION = 4
GEOIP_CITY_EDITION_REV1 = 2
GEOIP_REGION_EDITION_REV1 = 3
GEOIP_PROXY_EDITION = 8
GEOIP_ASNUM_EDITION = 9
GEOIP_NETSPEED_EDITION = 10
GEOIP_DOMAIN_EDITION = 11
GEOIP_COUNTRY_EDITION_V6 = 12
COUNTRY_BEGIN = 16776960
STATE_BEGIN_REV0 = 16700000
STATE_BEGIN_REV1 = 16000000
STRUCTURE_INFO_MAX_SIZE = 20
DATABASE_INFO_MAX_SIZE = 100
GeoIP_country_code = '''
AP EU AD AE AF AG AI AL AM AN AO AQ AR AS AT AU AW AZ BA BB BD BE BF BG BH
BI BJ BM BN BO BR BS BT BV BW BY BZ CA CC CD CF CG CH CI CK CL CM CN CO CR
CU CV CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER ES ET FI FJ FK FM FO FR FX
GA GB GD GE GF GH GI GL GM GN GP GQ GR GS GT GU GW GY HK HM HN HR HT HU ID
IE IL IN IO IQ IR IS IT JM JO JP KE KG KH KI KM KN KP KR KW KY KZ LA LB LC
LI LK LR LS LT LU LV LY MA MC MD MG MH MK ML MM MN MO MP MQ MR MS MT MU MV
MW MX MY MZ NA NC NE NF NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM
PN PR PS PT PW PY QA RE RO RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO
SR ST SV SY SZ TC TD TF TG TH TJ TK TM TN TO TL TR TT TV TW TZ UA UG UM US
UY UZ VA VC VE VG VI VN VU WF WS YE YT RS ZA ZM ME ZW A1 A2 O1 AX GG IM JE
BL MF
'''.split()
GeoIP_country_continent = '''
AS EU EU AS AS SA SA EU AS SA AF AN SA OC EU OC SA AS EU SA AS EU AF EU AS
AF AF SA AS SA SA SA AS AF AF EU SA NA AS AF AF AF EU AF OC SA AF AS SA SA
SA AF AS AS EU EU AF EU SA SA AF SA EU AF AF AF EU AF EU OC SA OC EU EU EU
AF EU SA AS SA AF EU SA AF AF SA AF EU SA SA OC AF SA AS AF SA EU SA EU AS
EU AS AS AS AS AS EU EU SA AS AS AF AS AS OC AF SA AS AS AS SA AS AS AS SA
EU AS AF AF EU EU EU AF AF EU EU AF OC EU AF AS AS AS OC SA AF SA EU AF AS
AF NA AS AF AF OC AF OC AF SA EU EU AS OC OC OC AS SA SA OC OC AS AS EU SA
OC SA AS EU OC SA AS AF EU AS AF AS OC AF AF EU AS AF EU EU EU AF EU AF AF
SA AF SA AS AF SA AF AF AF AS AS OC AS AF OC AS AS SA OC AS AF EU AF OC NA
SA AS EU SA SA SA SA AS OC OC OC AS AF EU AF AF EU AF -- -- -- EU EU EU EU
SA SA
'''.split()
#
# Helper functions.
#
def addr_to_num(ip):
'''
Convert an IPv4 address from a string to its integer representation.
@param[in] ip IPv4 address as a string.
@returns Address as an integer.
'''
try:
w, x, y, z = map(int, ip.split('.'))
if w>255 or x>255 or y>255 or z>255:
raise ValueError()
except (ValueError, TypeError):
raise ValueError('%r is not an IPv4 address.' % (ip,))
return (w << 24) | (x << 16) | (y << 8) | z
def num_to_addr(num):
'''
Convert an IPv4 address from its integer representation to a string.
@param[in] num Address as an integer.
@returns IPv4 address as a string.
'''
return '%d.%d.%d.%d' % ((num >> 24) & 0xff,
(num >> 16) & 0xff,
(num >> 8) & 0xff,
(num & 0xff))
def latin1_to_utf8(string):
return string.decode('latin-1').encode('utf-8')
def safe_lookup(lst, idx):
if idx is None:
return None
return lst[idx]
#
# Classes.
#
class ReadBuffer(object):
'''
Utility to read data more easily.
'''
buffer = None
def __init__(self, source, size, seek_offset=None, seek_whence=os.SEEK_SET):
fp = BytesIO(source)
if seek_offset is not None:
fp.seek(seek_offset, seek_whence)
self.buffer = fp.read(size)
def read_string(self):
'''
Read a null-terminated string.
@returns Result as a string.
'''
result, self.buffer = self.buffer.split('\0', 1)
return result
def read_int(self, size):
'''
Read a multibyte integer.
@param[in] size Number of bytes to read as an integer.
@returns Result as an integer.
'''
result = sum(ord(self.buffer[i]) << (8*i) for i in range(size))
self.buffer = self.buffer[size:]
return result
class AddressInfo(object):
'''
Representation of a database lookup result.
'''
__slots__ = [ 'ip', 'ipnum', 'prefix', 'country', 'continent' ]
def __init__(self, ip=None, ipnum=None, prefix=None, country_id=None):
self.ip = ip
self.ipnum = ipnum
self.prefix = prefix
self.country = safe_lookup(GeoIP_country_code, country_id)
self.continent = safe_lookup(GeoIP_country_continent, country_id)
network = property(lambda self:
num_to_addr(self.ipnum & ~((32-self.prefix)**2-1)))
def __str__(self):
return '[%s of network %s/%d in country %s]' %\
(self.ip, self.network, self.prefix, self.country)
class BigAddressInfo(AddressInfo):
'''
Representation of a database lookup result with more info in it.
'''
# __slots__ is inherited and appended to.
__slots__ = [ 'city', 'region', 'postal_code', 'metro_code', 'area_code', 'longitude', 'latitude' ]
def __init__(self, ip=None, ipnum=None, prefix=None, country_id=None,
city=None, region=None, postal_code=None, metro_code=None, area_code=None,
longitude=None, latitude=None):
AddressInfo.__init__(self, ip, ipnum, prefix, country_id)
self.city = city or None
self.region = region or None
self.postal_code = postal_code or None
self.metro_code = metro_code
self.area_code = area_code
self.longitude = longitude
self.latitude = latitude
def __str__(self):
return '[%s of network %s/%d in city %s, %s]' %\
(self.ip, self.network, self.prefix, self.city, self.country)
class Database(object):
'''
GeoIP database reader implementation. Currently only supports country
edition.
'''
def __init__(self, filename):
'''
Initialize a new GeoIP reader instance.
@param[in] filename Path to GeoIP.dat as a string.
'''
self.filename = filename
self.cache = open(filename, 'rb').read()
self._setup_segments()
if self.db_type not in (GEOIP_COUNTRY_EDITION,
GEOIP_CITY_EDITION_REV0,
GEOIP_CITY_EDITION_REV1):
raise NotImplementedError('Database edition is not supported yet; '
'Please use a Country or City database.')
def _setup_segments(self):
self.segments = None
# default to GeoIP Country Edition
self.db_type = GEOIP_COUNTRY_EDITION
self.record_length = STANDARD_RECORD_LENGTH
fp = BytesIO(self.cache)
fp.seek(-3, os.SEEK_END)
for i in range(STRUCTURE_INFO_MAX_SIZE):
delim = fp.read(3)
if delim != '\xFF\xFF\xFF':
fp.seek(-4, os.SEEK_CUR)
continue
self.db_type = ord(fp.read(1))
# Region Edition, pre June 2003.
if self.db_type == GEOIP_REGION_EDITION_REV0:
self.segments = [STATE_BEGIN_REV0]
# Region Edition, post June 2003.
elif self.db_type == GEOIP_REGION_EDITION_REV1:
self.segments = [STATE_BEGIN_REV1]
# City/Org Editions have two segments, read offset of second segment
elif self.db_type in (GEOIP_CITY_EDITION_REV0,
GEOIP_CITY_EDITION_REV1,
GEOIP_ORG_EDITION, GEOIP_ISP_EDITION,
GEOIP_ASNUM_EDITION):
self.segments = [0]
for idx, ch in enumerate(fp.read(SEGMENT_RECORD_LENGTH)):
self.segments[0] += ord(ch) << (idx * 8)
if self.db_type in (GEOIP_ORG_EDITION, GEOIP_ISP_EDITION):
self.record_length = ORG_RECORD_LENGTH
break
if self.db_type in (GEOIP_COUNTRY_EDITION, GEOIP_PROXY_EDITION,
GEOIP_NETSPEED_EDITION, GEOIP_COUNTRY_EDITION_V6):
self.segments = [COUNTRY_BEGIN]
def info(self):
'''
Return a string describing the loaded database version.
@returns English text string, or None if database is ancient.
'''
fp = BytesIO(self.cache)
fp.seek(-3, os.SEEK_END)
hasStructureInfo = False
# first get past the database structure information
for i in range(STRUCTURE_INFO_MAX_SIZE):
if fp.read(3) == '\xFF\xFF\xFF':
hasStructureInfo = True
break
fp.seek(-4, os.SEEK_CUR)
if hasStructureInfo:
fp.seek(-6, os.SEEK_CUR)
else:
# no structure info, must be pre Sep 2002 database, go back to end.
fp.seek(-3, os.SEEK_END)
for i in range(DATABASE_INFO_MAX_SIZE):
if fp.read(3) == '\0\0\0':
return fp.read(i)
fp.seek(-4, os.SEEK_CUR)
def _decode(self, buf, branch):
'''
@param[in] buf Record buffer.
@param[in] branch 1 for left, 2 for right.
@returns X.
'''
offset = 3 * branch
if self.record_length == 3:
return buf[offset] | (buf[offset+1] << 8) | (buf[offset+2] << 16)
# General case.
end = branch * self.record_length
x = 0
for j in range(self.record_length):
x = (x << 8) | buf[end - j]
return x
def _seek_record(self, ipnum):
fp = BytesIO(self.cache)
offset = 0
for depth in range(31, -1, -1):
fp.seek(self.record_length * 2 * offset)
buf = fp.read(self.record_length * 2)
x = self._decode(buf, int(bool(ipnum & (1 << depth))))
if x >= self.segments[0]:
return 32 - depth, x
offset = x
assert False, \
"Error Traversing Database for ipnum = %lu: "\
"Perhaps database is corrupt?" % ipnum
def _lookup_country(self, ip):
"Lookup a country db entry."
ipnum = addr_to_num(ip)
prefix, num = self._seek_record(ipnum)
num -= COUNTRY_BEGIN
if num:
country_id = num - 1
else:
country_id = None
return AddressInfo(country_id=country_id, ip=ip, ipnum=ipnum, prefix=prefix)
def _lookup_city(self, ip):
"Look up a city db entry."
ipnum = addr_to_num(ip)
prefix, num = self._seek_record(ipnum)
record, next_record_ptr = self._extract_record(num, None)
return BigAddressInfo(ip=ip, ipnum=ipnum, prefix=prefix, **record)
def _extract_record(self, seek_record, next_record_ptr):
if seek_record == self.segments[0]:
return {'country_id': None}, next_record_ptr
seek_offset = seek_record + (2 * self.record_length - 1) * self.segments[0]
record_buf = ReadBuffer(self.cache, FULL_RECORD_LENGTH, seek_offset)
record = {}
# get country
record['country_id'] = record_buf.read_int(1) - 1
# get region
record['region'] = record_buf.read_string()
# get city
record['city'] = latin1_to_utf8(record_buf.read_string())
# get postal code
record['postal_code'] = record_buf.read_string()
# get latitude
record['latitude'] = record_buf.read_int(3) / 10000.0 - 180
# get longitude
record['longitude'] = record_buf.read_int(3) / 10000.0 - 180
# get area code and metro code for post April 2002 databases and for US locations
if (self.db_type == GEOIP_CITY_EDITION_REV1) and (GeoIP_country_code[record['country_id']] == 'US'):
metro_area_combo = record_buf.read_int(3)
record['metro_code'] = metro_area_combo / 1000
record['area_code'] = metro_area_combo % 1000
# Used for GeoIP_next_record (which this code doesn't have.)
if next_record_ptr is not None:
next_record_ptr = seek_record - len(record_buf)
return record, next_record_ptr
def lookup(self, ip):
'''
Lookup an IP address returning an AddressInfo (or BigAddressInfo)
instance describing its location.
@param[in] ip IPv4 address as a string.
@returns AddressInfo (or BigAddressInfo) instance.
'''
if self.db_type in (GEOIP_COUNTRY_EDITION, GEOIP_PROXY_EDITION, GEOIP_NETSPEED_EDITION):
return self._lookup_country(ip)
elif self.db_type in (GEOIP_CITY_EDITION_REV0, GEOIP_CITY_EDITION_REV1):
return self._lookup_city(ip)
if __name__ == '__main__':
import time, sys
dbfile = 'GeoIP.dat'
if len(sys.argv) > 1:
dbfile = sys.argv[1]
t1 = time.time()
db = Database(dbfile)
t2 = time.time()
print(db.info())
t3 = time.time()
tests = '''
127.0.0.1
83.198.135.28
83.126.35.59
192.168.1.1
194.168.1.255
196.25.210.14
64.22.109.113
'''.split()
for test in tests:
addr_info = db.lookup(test)
print(addr_info)
if isinstance(addr_info, BigAddressInfo):
print(" ", dict((key, getattr(addr_info, key)) for key in dir(addr_info) if not key.startswith('_')))
t4 = time.time()
print("Open: %dms" % ((t2-t1) * 1000,))
print("Info: %dms" % ((t3-t2) * 1000,))
print("Lookup: %dms" % ((t4-t3) * 1000,))