├── requirements.txt ├── .gitignore ├── LICENSE ├── ipinfo.py ├── README.md └── csv2dat.py /requirements.txt: -------------------------------------------------------------------------------- 1 | ipaddr 2 | pygeoip 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Mark Teodoro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
"""ipinfo.py - convenience wrapper around the MaxMind legacy GeoIPCity and
GeoIPASNum databases.

Works with either the C GeoIP bindings (preferred when importable) or the
pure-python pygeoip module. Use init_geo()/init_asn() to override the
default database locations and open flags; get_geo()/get_asn() lazily open
the defaults on first use.

NOTE: Python 2 code (iteritems, str.decode); kept that way to match the
rest of the project.
"""
import os
import socket
import struct

try:
    from collections import namedtuple
except ImportError:
    #http://code.activestate.com/recipes/500261-named-tuples/
    from namedtuple import namedtuple

try:
    #prefer the C GeoIP bindings when available; fall back to pygeoip,
    #normalizing the small API differences behind _geo_open and the flags
    import GeoIP as geoip
    GEOIP_MEMORY_CACHE = geoip.GEOIP_MEMORY_CACHE
    GEOIP_STANDARD = geoip.GEOIP_STANDARD
    _geo_open = geoip.open
except ImportError:
    import pygeoip as geoip
    GEOIP_MEMORY_CACHE = geoip.MEMORY_CACHE
    GEOIP_STANDARD = geoip.STANDARD
    _geo_open = geoip.GeoIP

__all__ = ['init_geo', 'init_asn', 'get_geo', 'get_asn']


#directories probed, in order, for the .dat files
search_paths = ['.', '/usr/local/share/GeoIP', '/usr/share/GeoIP']

def _init(db, flags):
    """Open database file `db`, trying each directory in search_paths.

    Raises IOError when the database is not found anywhere. (Previously
    this silently returned None, which later surfaced as an opaque
    AttributeError in get_geo - or worse, was swallowed by get_asn's
    broad except, silently returning default values forever.)
    """
    for path in search_paths:
        dbpath = os.path.join(path, db)
        if os.path.exists(dbpath):
            return _geo_open(dbpath, flags)
    raise IOError('%s not found in: %s' % (db, ', '.join(search_paths)))

_gic = None #GeoIP city database handle, opened lazily
def init_geo(db='GeoIPCity.dat', flags=GEOIP_MEMORY_CACHE):
    """Open the city database; call explicitly to override db/flags."""
    global _gic
    _gic = _init(db, flags)

_gia = None #GeoIP ASN database handle, opened lazily
def init_asn(db='GeoIPASNum.dat', flags=GEOIP_MEMORY_CACHE):
    """Open the ASN database; call explicitly to override db/flags."""
    global _gia
    _gia = _init(db, flags)


#default (empty) value for every field a city lookup can return
_geo_default = {
    'area_code': 0,
    'city': u'',
    'continent': u'',
    'country_code': u'',
    'country_code3': u'',
    'country_name': u'',
    'dma_code': 0,
    'latitude': 0.0,
    'longitude': 0.0,
    'metro_code': 0,
    'postal_code': u'',
    'region': u'',
    'region_code': u'',
    'time_zone': u''
}
#string-valued fields that need latin1 -> unicode decoding after lookup
_geo_str_keys = set(k for k, v in _geo_default.iteritems() if v == u'')

IpGeo = namedtuple('IpGeo', sorted(_geo_default))
ipgeo_default = IpGeo(**_geo_default)

def get_geo(ip):
    """Return an IpGeo namedtuple for dotted-quad `ip`.

    Returns ipgeo_default (all-empty fields) when the address has no
    record in the database.
    """
    if not _gic: init_geo()

    rec = _gic.record_by_addr(ip)
    if not rec:
        return ipgeo_default

    #missing/None string fields become u''; present ones are decoded
    for k in _geo_str_keys:
        v = rec.get(k) or ''
        rec[k] = v.decode('latin1')

    #fixup - pygeoip exposes region as region_code
    if not rec['region']:
        rec['region'] = rec['region_code']
    return IpGeo(**rec)


IpAsn = namedtuple('IpAsn', ['asn', 'asname'])
ipasn_default = IpAsn(0, u'')

def get_asn(ip):
    """Return an IpAsn namedtuple for dotted-quad `ip`.

    Returns IpAsn(0, u'') when the address has no record (or the record
    cannot be parsed).
    """
    if not _gia: init_asn()

    try:
        #org_by_addr returns e.g. 'AS15169 Google Inc.'
        rec = _gia.org_by_addr(ip)
        asn, asname = rec.split(' ', 1)
        return IpAsn(int(asn[2:]), asname.decode('latin1'))
    except Exception: #was 'except Exception, e' with e unused
        return ipasn_default


def ip2int(ip):
    """dotted-quad string -> unsigned 32-bit int"""
    return struct.unpack("!I", socket.inet_aton(ip))[0]


def int2ip(ip):
    """unsigned 32-bit int -> dotted-quad string"""
    return socket.inet_ntoa(struct.pack("!I", ip))
10 | 11 | Example: 12 | 13 | >>> import ipinfo 14 | >>> ipinfo.get_geo('8.8.8.8') 15 | IpGeo(area_code=650, city=u'Mountain View', country_code=u'US', country_code3=u'USA', country_name=u'United States', dma_code=807, latitude=37.419200897216797, longitude=-122.05740356445312, metro_code=807, postal_code=u'94043', region=u'CA', region_name=u'California', time_zone=u'America/Los_Angeles') 16 | >>> ipinfo.get_asn('8.8.8.8') 17 | IpAsn(asn=15169, asname=u'Google Inc.') 18 | 19 | 20 | **csv2dat.py**: Converts MaxMind CSV files to .dat files. Useful for augmenting MaxMind data. Currently supports GeoIP City and ASN database types. Note: runs 4-5x faster under pypy. 21 | 22 | Examples: 23 | 24 | Convert MaxMind ASN CSV to .dat format: 25 | 26 | $ csv2dat.py -w mmasn.dat mmasn GeoIPASNum2.csv 27 | wrote 356311-node trie with 285539 networks (42664 distinct labels) in 3 seconds 28 | 29 | Test mmasn.dat file against list of IPs, one per line: 30 | 31 | $ csv2dat.py test GeoIPASNum.dat mmasn.dat ips.txt 32 | ok: 670135 bad: 0 33 | 34 | Convert MaxMind City files to .dat format: 35 | 36 | $ csv2dat.py -w mmcity.dat -l GeoLiteCity-Location.csv mmcity GeoLiteCity-Blocks.csv 37 | wrote 2943570-node trie with 2939800 networks (109370 distinct labels) in 36 seconds 38 | 39 | Test mmcity.dat file against list of IPs, one per line: 40 | 41 | $ csv2dat.py test GeoLiteCity.dat mmcity.dat ips.txt 42 | ok: 670135 bad: 0 43 | 44 | Flatten MaxMind City CSVs into one file (for easier editing): 45 | 46 | $ csv2dat.py -l GeoLiteCity-Location.csv flat GeoLiteCity-Blocks.csv > mmcity_flat.csv 47 | 48 | Convert flattened MaxMind City files to .dat format: 49 | 50 | $ csv2dat.py -w mmcity.dat mmcity flatcity.csv 51 | wrote 2943570-node trie with 2939800 networks (109370 distinct labels) in 36 seconds 52 | 53 | Convert MaxMind ASN v6 CSV to .dat format: 54 | 55 | $ csv2dat.py -w mmasn6.dat mmasn6 GeoIPASNum2v6.csv 56 | wrote 63125-node trie with 35983 networks (6737 distinct labels) in 2 seconds 
57 | 58 | Convert MaxMind City v6 CSV to .dat format: 59 | 60 | $ csv2dat.py -w mmcity6.dat mmcity6 GeoLiteCityv6.csv 61 | wrote 80637-node trie with 13074 networks (205 distinct labels) in 2 seconds 62 | 63 | Convert MaxMind Country CSV to .dat format: 64 | 65 | $ csv2dat.py -w mmcountry.dat mmcountry GeoIPCountryWhois.csv 66 | wrote 136109-node trie with 133498 networks (250 distinct labels) in 8 seconds 67 | 68 | Convert MaxMind Country v6 CSV to .dat format: 69 | 70 | $ csv2dat.py -w mmcountry6.dat mmcountry6 GeoIPv6.csv 71 | wrote 102601-node trie with 17580 networks (215 distinct labels) in 3 seconds 72 | 73 | Convert MaxMind ISP CSV to .dat format: 74 | 75 | $ csv2dat.py -w mmisp.dat mmisp GeoIPISP.csv 76 | wrote 378619-node trie with 303605 networks (45963 distinct labels) in 19 seconds 77 | 78 | Convert MaxMind Org CSV to .dat format: 79 | 80 | $ csv2dat.py -w mmorg.dat mmorg GeoIPOrg.csv 81 | wrote 378619-node trie with 303605 networks (45963 distinct labels) in 19 seconds 82 | 83 | -------------------------------------------------------------------------------- /csv2dat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import logging 4 | import logging.handlers 5 | import optparse 6 | 7 | import csv 8 | import fileinput 9 | import itertools 10 | import struct 11 | import time 12 | 13 | from functools import partial 14 | 15 | import ipaddr 16 | import pygeoip 17 | 18 | cc_idx = dict((cc.lower(), i) for i,cc in enumerate(pygeoip.const.COUNTRY_CODES)) 19 | cc_idx['--'] = cc_idx[''] 20 | cc_idx['cw'] = cc_idx['an'] #netherlands antilles / curacao 21 | cc_idx['uk'] = cc_idx['gb'] #uk / great britain 22 | cc_idx['sx'] = cc_idx['fx'] #st. martin? 
23 | 24 | def init_logger(opts): 25 | level = logging.INFO 26 | handler = logging.StreamHandler() 27 | #handler = logging.handlers.SysLogHandler(address='/dev/log') 28 | if opts.debug: 29 | level = logging.DEBUG 30 | handler = logging.StreamHandler() 31 | root = logging.getLogger() 32 | root.setLevel(level) 33 | root.addHandler(handler) 34 | 35 | def parse_args(argv): 36 | if argv is None: 37 | argv = sys.argv[1:] 38 | p = optparse.OptionParser() 39 | 40 | cmdlist = [] 41 | for cmd, (f, usage) in sorted(cmds.iteritems()): 42 | cmdlist.append('%-8s\t%%prog %s' % (cmd, usage)) 43 | cmdlist = '\n '.join(cmdlist) 44 | 45 | p.usage = '%%prog [options] +\n\nExamples:\n %s' % cmdlist 46 | 47 | p.add_option('-d', '--debug', action='store_true', 48 | default=False, help="debug mode") 49 | p.add_option('-g', '--geoip', action='store_true', 50 | default=False, help='test with C GeoIP module') 51 | p.add_option('-w', '--write-dat', help='write filename.dat') 52 | p.add_option('-l', '--locations', help='city locations csv') 53 | opts, args = p.parse_args(argv) 54 | 55 | #sanity check 56 | if not args or args[0] not in cmds: 57 | p.error('missing command. 
choose from: %s' % ' '.join(sorted(cmds))) 58 | 59 | return opts, args 60 | 61 | 62 | def test_dbs(opts, args): 63 | """test reference.dat and test.dat against a list of IPs and print any differences""" 64 | ref_file, tst_file = args[:2] 65 | gi_ref = pygeoip.GeoIP(ref_file, pygeoip.MEMORY_CACHE) 66 | gi_tst = pygeoip.GeoIP(tst_file, pygeoip.MEMORY_CACHE) 67 | dbtype = gi_ref._databaseType 68 | if gi_ref._databaseType != gi_tst._databaseType: 69 | print "error: database types don't match" 70 | exit(1) 71 | 72 | if opts.geoip: 73 | import GeoIP 74 | logging.debug('using GeoIP module') 75 | gi_ref = GeoIP.open(ref_file, pygeoip.MEMORY_CACHE) 76 | gi_tst = GeoIP.open(test_file, pygeoip.MEMORY_CACHE) 77 | else: 78 | logging.debug('using pygeoip module') 79 | 80 | isequal = lambda lhs, rhs: lhs == rhs 81 | if dbtype in (pygeoip.const.ASNUM_EDITION, pygeoip.const.ASNUM_EDITION_V6, 82 | pygeoip.const.ISP_EDITION, pygeoip.const.ORG_EDITION): 83 | get_ref = gi_ref.org_by_addr 84 | get_tst = gi_tst.org_by_addr 85 | elif dbtype in (pygeoip.const.CITY_EDITION_REV1, pygeoip.const.CITY_EDITION_REV1_V6): 86 | get_ref = gi_ref.record_by_addr 87 | get_tst = gi_tst.record_by_addr 88 | def isequal(lhs, rhs): 89 | if lhs and rhs: 90 | #Python's float rounding makes these unpredictable, 91 | #so just stomp them to ints as a sanity check. 
92 | for k in ('latitude', 'longitude'): 93 | lhs[k] = int(lhs[k]) 94 | rhs[k] = int(rhs[k]) 95 | return lhs == rhs 96 | elif dbtype in (pygeoip.const.COUNTRY_EDITION, pygeoip.const.COUNTRY_EDITION_V6): 97 | get_ref = gi_ref.country_code_by_addr 98 | get_tst = gi_tst.country_code_by_addr 99 | else: 100 | print "error: unknown database type" 101 | exit(1) 102 | 103 | ok = bad = 0 104 | for ip in fileinput.input(args[2:]): 105 | ip = ip.strip() 106 | ref = get_ref(ip) 107 | tst = get_tst(ip) 108 | if not isequal(ref, tst): 109 | print ip, ref, tst 110 | bad += 1 111 | else: 112 | ok += 1 113 | print 'ok:', ok, 'bad:', bad 114 | test_dbs.usage = 'test reference.dat test.dat ips.txt' 115 | 116 | 117 | def gen_csv(f): 118 | """peek at rows from a csv and start yielding when we get past the comments 119 | to a row that starts with an int""" 120 | def startswith_int(row): 121 | try: 122 | int(row[0][0]) 123 | return True 124 | except (ValueError, IndexError): 125 | return False 126 | 127 | cr = csv.reader(f) 128 | return itertools.dropwhile(lambda x: not startswith_int(x), cr) 129 | 130 | 131 | def flatten_city(opts, args): 132 | """flatten MM blocks and locations CSVs into one file for easier editing""" 133 | id_loc = dict((row[0], row[1:]) for row in gen_csv(open(opts.locations))) 134 | cw = csv.writer(sys.stdout, lineterminator='\n') 135 | for row in gen_csv(fileinput.input(args)): 136 | row[-1:] = id_loc[row[-1]] 137 | cw.writerow(row) 138 | flatten_city.usage = '-l GeoLiteCity-Location.csv flat GeoLiteCity-Blocks.csv > flatcity.csv' 139 | 140 | 141 | class RadixTreeNode(object): 142 | __slots__ = ['segment', 'lhs', 'rhs'] 143 | def __init__(self, segment): 144 | self.segment = segment 145 | self.lhs = None 146 | self.rhs = None 147 | 148 | 149 | class RadixTree(object): 150 | def __init__(self, debug=False): 151 | self.debug = debug 152 | 153 | self.netcount = 0 154 | self.segments = [RadixTreeNode(0)] 155 | self.data_offsets = {} 156 | self.data_segments = [] 157 | 
self.cur_offset = 1 158 | 159 | def __setitem__(self, net, data): 160 | self.netcount += 1 161 | inet = int(net) 162 | node = self.segments[0] 163 | for depth in range(self.seek_depth, self.seek_depth - (net.prefixlen-1), -1): 164 | if inet & (1 << depth): 165 | if not node.rhs: 166 | node.rhs = RadixTreeNode(len(self.segments)) 167 | self.segments.append(node.rhs) 168 | node = node.rhs 169 | else: 170 | if not node.lhs: 171 | node.lhs = RadixTreeNode(len(self.segments)) 172 | self.segments.append(node.lhs) 173 | node = node.lhs 174 | 175 | if not data in self.data_offsets: 176 | self.data_offsets[data] = self.cur_offset 177 | enc_data = self.encode(*data) 178 | self.data_segments.append(enc_data) 179 | self.cur_offset += (len(enc_data)) 180 | 181 | if self.debug: 182 | #store net after data for easier debugging 183 | data = data, net 184 | 185 | if inet & (1 << self.seek_depth - (net.prefixlen-1)): 186 | node.rhs = data 187 | else: 188 | node.lhs = data 189 | 190 | def gen_nets(self, opts, args): 191 | raise NotImplementedError 192 | 193 | def load(self, opts, args): 194 | for nets, data in self.gen_nets(opts, args): 195 | for net in nets: 196 | self[net] = data 197 | 198 | def dump_node(self, node): 199 | if not node: 200 | #empty leaf 201 | return '--' 202 | elif isinstance(node, RadixTreeNode): 203 | #internal node 204 | return node.segment 205 | else: 206 | #data leaf 207 | data = node[0] if self.debug else node 208 | return '%d %s' % (len(self.segments) + self.data_offsets[data], node) 209 | 210 | def dump(self): 211 | for node in self.segments: 212 | print node.segment, [self.dump_node(node.lhs), self.dump_node(node.rhs)] 213 | 214 | def encode(self, *args): 215 | raise NotImplementedError 216 | 217 | def encode_rec(self, rec, reclen): 218 | """encode rec as 4-byte little-endian int, then truncate it to reclen""" 219 | assert(reclen <= 4) 220 | return struct.pack('= 2 ** (8 * self.segreclen): 237 | logging.warning('too many segments for final segment record 
size!') 238 | 239 | for node in self.segments: 240 | f.write(self.serialize_node(node.lhs)) 241 | f.write(self.serialize_node(node.rhs)) 242 | 243 | f.write(chr(42)) #So long, and thanks for all the fish! 244 | f.write(''.join(self.data_segments)) 245 | 246 | f.write('csv2dat.py') #.dat file comment - can be anything 247 | f.write(chr(0xFF) * 3) 248 | f.write(chr(self.edition)) 249 | f.write(self.encode_rec(len(self.segments), self.segreclen)) 250 | 251 | 252 | class ASNRadixTree(RadixTree): 253 | usage = '-w mmasn.dat mmasn GeoIPASNum2.csv' 254 | cmd = 'mmasn' 255 | seek_depth = 31 256 | edition = pygeoip.const.ASNUM_EDITION 257 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 258 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 259 | 260 | def gen_nets(self, opts, args): 261 | for lo, hi, asn in gen_csv(fileinput.input(args)): 262 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 263 | nets = ipaddr.summarize_address_range(lo, hi) 264 | yield nets, (asn,) 265 | 266 | def encode(self, data): 267 | return data + '\0' 268 | 269 | 270 | class ASNv6RadixTree(ASNRadixTree): 271 | usage = '-w mmasn6.dat mmasn6 GeoIPASNum2v6.csv' 272 | cmd = 'mmasn6' 273 | seek_depth = 127 274 | edition = pygeoip.const.ASNUM_EDITION_V6 275 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 276 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 277 | 278 | def gen_nets(self, opts, args): 279 | for _, _, lo, hi, asn in gen_csv(fileinput.input(args)): 280 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 281 | nets = ipaddr.summarize_address_range(lo, hi) 282 | yield nets, (asn,) 283 | 284 | 285 | class ISPRadixTree(ASNRadixTree): 286 | usage = '-w mmisp.dat mmisp GeoIPISP.csv' 287 | cmd = 'mmisp' 288 | seek_depth = 31 289 | edition = pygeoip.const.ISP_EDITION 290 | reclen = pygeoip.const.ORG_RECORD_LENGTH 291 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 292 | 293 | 294 | class OrgRadixTree(ASNRadixTree): 295 | usage = '-w mmorg.dat mmorg GeoIPOrg.csv' 296 | cmd 
= 'mmorg' 297 | seek_depth = 31 298 | edition = pygeoip.const.ORG_EDITION 299 | reclen = pygeoip.const.ORG_RECORD_LENGTH 300 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 301 | 302 | 303 | class CityRev1RadixTree(RadixTree): 304 | usage = '-w mmcity.dat [-l GeoLiteCity-Location.csv] mmcity GeoLiteCity-Blocks.csv' 305 | cmd = 'mmcity' 306 | seek_depth = 31 307 | edition = pygeoip.const.CITY_EDITION_REV1 308 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 309 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 310 | 311 | def gen_nets(self, opts, args): 312 | id_loc = None 313 | if opts.locations: 314 | id_loc = dict((row[0], row[1:]) for row in gen_csv(open(opts.locations))) 315 | 316 | for row in gen_csv(fileinput.input(args)): 317 | lo, hi = row[:2] 318 | loc = row[2:] 319 | if id_loc: 320 | loc = id_loc[loc[0]] 321 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 322 | nets = ipaddr.summarize_address_range(lo, hi) 323 | yield nets, tuple(loc) 324 | 325 | def encode(self, country, region, city, postal_code, lat, lon, metro_code, area_code): 326 | def str2num(num, ntype): 327 | return ntype(num) if num else ntype(0) 328 | 329 | country = country.lower() 330 | lat, lon = round(str2num(lat, float), 4), round(str2num(lon, float), 4) 331 | metro_code, area_code = str2num(metro_code, int), str2num(area_code, int) 332 | 333 | buf = [] 334 | try: 335 | buf.append(chr(cc_idx[country])) 336 | except KeyError: 337 | logging.warning("'%s': missing country. 
update pygeoip.const.COUNTRY_CODES?", country) 338 | buf.append(chr(cc_idx[''])) 339 | buf.append('\0'.join((region, city, postal_code))) 340 | buf.append('\0') 341 | buf.append(self.encode_rec(int((lat + 180) * 10000), 3)) 342 | buf.append(self.encode_rec(int((lon + 180) * 10000), 3)) 343 | if (metro_code or area_code) and country == 'us': 344 | buf.append(self.encode_rec(metro_code * 1000 + area_code, 3)) 345 | else: 346 | buf.append('\0\0\0') 347 | return ''.join(buf) 348 | 349 | 350 | class CityRev1v6RadixTree(CityRev1RadixTree): 351 | usage = '-w mmcity6.dat mmcity6 GeoLiteCityv6.csv' 352 | cmd = 'mmcity6' 353 | seek_depth = 127 354 | edition = pygeoip.const.CITY_EDITION_REV1 355 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 356 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 357 | 358 | def gen_nets(self, opts, args): 359 | for row in gen_csv(fileinput.input(args)): 360 | lo, hi = row[2:4] 361 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 362 | nets = ipaddr.summarize_address_range(lo, hi) 363 | #v6 postal_code is after lat/lon instead of before like v4 364 | country, region, city, lat, lon, postal_code, metro_code, area_code = row[4:] 365 | yield nets, (country, region, city, postal_code, lat, lon, metro_code, area_code) 366 | 367 | 368 | class CountryRadixTree(RadixTree): 369 | usage = '-w mmcountry.dat mmcountry GeoIPCountryWhois.csv' 370 | cmd = 'mmcountry' 371 | seek_depth = 31 372 | edition = pygeoip.const.COUNTRY_EDITION 373 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 374 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 375 | 376 | def gen_nets(self, opts, args): 377 | for _, _, lo, hi, cc, _ in gen_csv(fileinput.input(args)): 378 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 379 | nets = ipaddr.summarize_address_range(lo, hi) 380 | yield nets, (cc,) 381 | 382 | def encode(self, cc): 383 | #unused 384 | return '' 385 | 386 | def serialize_node(self, node): 387 | if not node: 388 | #empty leaf 389 | rec = 
pygeoip.const.COUNTRY_BEGIN 390 | elif isinstance(node, RadixTreeNode): 391 | #internal node 392 | rec = node.segment 393 | else: 394 | #data leaf 395 | data = node[0] if self.debug else node 396 | cc = data[0] 397 | try: 398 | offset = cc_idx[cc.lower()] 399 | except KeyError: 400 | logging.warning("'%s': missing country. update pygeoip.const.COUNTRY_CODES?", cc) 401 | offset = 0 402 | #data leaves directly encode cc index as an offset 403 | rec = pygeoip.const.COUNTRY_BEGIN + offset 404 | return self.encode_rec(rec, self.reclen) 405 | 406 | def serialize(self, f): 407 | for node in self.segments: 408 | f.write(self.serialize_node(node.lhs)) 409 | f.write(self.serialize_node(node.rhs)) 410 | 411 | f.write(chr(0x00) * 3) 412 | f.write('csv2dat.py') #.dat file comment - can be anything 413 | f.write(chr(0xFF) * 3) 414 | f.write(chr(self.edition)) 415 | f.write(self.encode_rec(len(self.segments), self.segreclen)) 416 | 417 | 418 | class Countryv6RadixTree(CountryRadixTree): 419 | usage = '-w mmcountry6.dat mmcountry6 GeoIPv6.csv' 420 | cmd = 'mmcountry6' 421 | seek_depth = 127 422 | edition = pygeoip.const.COUNTRY_EDITION_V6 423 | reclen = pygeoip.const.STANDARD_RECORD_LENGTH 424 | segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH 425 | 426 | def gen_nets(self, opts, args): 427 | for row in gen_csv(fileinput.input(args)): 428 | #handle weird space before quote problems 429 | lo, hi, cc = [x.strip(' "') for x in row[2:5]] 430 | lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi)) 431 | nets = ipaddr.summarize_address_range(lo, hi) 432 | yield nets, (cc,) 433 | 434 | 435 | def build_dat(RTree, opts, args): 436 | tstart = time.time() 437 | r = RTree(debug=opts.debug) 438 | 439 | r.load(opts, args) 440 | 441 | if opts.debug: 442 | r.dump() 443 | 444 | with open(opts.write_dat, 'wb') as f: 445 | r.serialize(f) 446 | 447 | tstop = time.time() 448 | print 'wrote %d-node trie with %d networks (%d distinct labels) in %d seconds' % ( 449 | len(r.segments), r.netcount, 
len(r.data_offsets), tstop - tstart) 450 | 451 | 452 | rtrees = [ 453 | ASNRadixTree, ASNv6RadixTree, 454 | CityRev1RadixTree, CityRev1v6RadixTree, 455 | CountryRadixTree, Countryv6RadixTree, 456 | ISPRadixTree, OrgRadixTree, 457 | ] 458 | cmds = dict((rtree.cmd, (partial(build_dat, rtree), rtree.usage)) for rtree in rtrees) 459 | cmds['flat'] = (flatten_city, flatten_city.usage) 460 | cmds['test'] = (test_dbs, test_dbs.usage) 461 | 462 | def main(argv=None): 463 | global opts 464 | opts, args = parse_args(argv) 465 | init_logger(opts) 466 | logging.debug(opts) 467 | logging.debug(args) 468 | 469 | cmd = args.pop(0) 470 | cmd, usage = cmds[cmd] 471 | return cmd(opts, args) 472 | 473 | 474 | if __name__ == '__main__': 475 | rval = main() 476 | logging.shutdown() 477 | sys.exit(rval) 478 | 479 | --------------------------------------------------------------------------------