├── requirements.txt
├── .gitignore
├── LICENSE
├── ipinfo.py
├── README.md
└── csv2dat.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | ipaddr
2 | pygeoip
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Mark Teodoro
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ipinfo.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import socket
  3 | import struct
  4 | 
  5 | try:
  6 |     from collections import namedtuple
  7 | except ImportError:
  8 |     #http://code.activestate.com/recipes/500261-named-tuples/
  9 |     from namedtuple import namedtuple
 10 | 
 11 | try:
 12 |     import GeoIP as geoip
 13 |     GEOIP_MEMORY_CACHE = geoip.GEOIP_MEMORY_CACHE
 14 |     GEOIP_STANDARD = geoip.GEOIP_STANDARD
 15 |     _geo_open = geoip.open
 16 | except ImportError:
 17 |     import pygeoip as geoip
 18 |     GEOIP_MEMORY_CACHE = geoip.MEMORY_CACHE
 19 |     GEOIP_STANDARD = geoip.STANDARD
 20 |     _geo_open = geoip.GeoIP
 21 | 
 22 | __all__ = ['init_geo', 'init_asn', 'get_geo', 'get_asn']
 23 | 
 24 | 
 25 | search_paths = ['.', '/usr/local/share/GeoIP', '/usr/share/GeoIP']
 26 | def _init(db, flags):
 27 |     for path in search_paths:
 28 |         dbpath = os.path.join(path, db)
 29 |         if os.path.exists(dbpath):
 30 |             return _geo_open(dbpath, flags)
 31 | 
 32 | _gic = None
 33 | def init_geo(db='GeoIPCity.dat', flags=GEOIP_MEMORY_CACHE):
 34 |     global _gic
 35 |     _gic = _init(db, flags)
 36 | 
 37 | _gia = None
 38 | def init_asn(db='GeoIPASNum.dat', flags=GEOIP_MEMORY_CACHE):
 39 |     global _gia
 40 |     _gia = _init(db, flags)
 41 | 
 42 | 
 43 | _geo_default = {
 44 |     'area_code': 0,
 45 |     'city': u'',
 46 |     'continent': u'',
 47 |     'country_code': u'',
 48 |     'country_code3': u'',
 49 |     'country_name': u'',
 50 |     'dma_code': 0,
 51 |     'latitude': 0.0,
 52 |     'longitude': 0.0,
 53 |     'metro_code': 0,
 54 |     'postal_code': u'',
 55 |     'region': u'',
 56 |     'region_code': u'',
 57 |     'time_zone': u''
 58 | }
 59 | _geo_str_keys = set(k for k,v in _geo_default.iteritems() if v == u'')
 60 | 
 61 | IpGeo = namedtuple('IpGeo', sorted(_geo_default))
 62 | ipgeo_default = IpGeo(**_geo_default)
 63 | 
 64 | def get_geo(ip):
 65 |     if not _gic: init_geo()
 66 | 
 67 |     rec = _gic.record_by_addr(ip)
 68 |     if not rec:
 69 |         return ipgeo_default
 70 | 
 71 |     for k in _geo_str_keys:
 72 |         v = rec.get(k) or ''
 73 |         rec[k] = v.decode('latin1')
 74 | 
 75 |     #fixup - pygeoip exposes region as region_code
 76 |     if not rec['region']:
 77 |         rec['region'] = rec['region_code']
 78 |     return IpGeo(**rec)
 79 | 
 80 | 
 81 | IpAsn = namedtuple('IpAsn', ['asn', 'asname'])
 82 | ipasn_default = IpAsn(0, u'')
 83 | 
 84 | def get_asn(ip):
 85 |     if not _gia: init_asn()
 86 | 
 87 |     try:
 88 |         rec = _gia.org_by_addr(ip)
 89 |         asn, asname = rec.split(' ', 1)
 90 |         return IpAsn(int(asn[2:]), asname.decode('latin1'))
 91 |     except Exception, e:
 92 |         return ipasn_default
 93 | 
 94 | 
 95 | def ip2int(ip):
 96 |     return struct.unpack("!I", socket.inet_aton(ip))[0]
 97 | 
 98 | 
 99 | def int2ip(ip):
100 |     return socket.inet_ntoa(struct.pack("!I", ip))
101 | 
102 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | mmutils
 2 | =======
 3 | 
 4 | **Note: These tools work with MaxMind Legacy format databases.  For information on the new GeoIP2 databases, including a writer, see: http://maxmind.github.io/MaxMind-DB/**
 5 | 
 6 | Tools for working with MaxMind GeoIP csv and dat files
 7 | ------------------------------------------------------
 8 | 
 9 | **ipinfo.py**: Convenience wrapper for GeoIPCity and GeoIPASNum databases.  Use init\_geo or init\_asn to override default db locations and flags.
10 | 
11 | Example:
12 | 
13 |     >>> import ipinfo
14 |     >>> ipinfo.get_geo('8.8.8.8')
15 |     IpGeo(area_code=650, city=u'Mountain View', country_code=u'US', country_code3=u'USA', country_name=u'United States', dma_code=807, latitude=37.419200897216797, longitude=-122.05740356445312, metro_code=807, postal_code=u'94043', region=u'CA', region_name=u'California', time_zone=u'America/Los_Angeles')
16 |     >>> ipinfo.get_asn('8.8.8.8')
17 |     IpAsn(asn=15169, asname=u'Google Inc.')
18 | 
19 | 
20 | **csv2dat.py**: Converts MaxMind CSV files to .dat files.  Useful for augmenting MaxMind data.  Currently supports GeoIP City and ASN database types.  Note: runs 4-5x faster under pypy.
21 | 
22 | Examples:
23 | 
24 | Convert MaxMind ASN CSV to .dat format:
25 | 
26 |     $ csv2dat.py -w mmasn.dat mmasn GeoIPASNum2.csv
27 |     wrote 356311-node trie with 285539 networks (42664 distinct labels) in 3 seconds
28 | 
29 | Test mmasn.dat file against list of IPs, one per line:
30 | 
31 |     $ csv2dat.py test GeoIPASNum.dat mmasn.dat ips.txt
32 |     ok: 670135 bad: 0
33 | 
34 | Convert MaxMind City files to .dat format:
35 | 
36 |     $ csv2dat.py -w mmcity.dat -l GeoLiteCity-Location.csv mmcity GeoLiteCity-Blocks.csv
37 |     wrote 2943570-node trie with 2939800 networks (109370 distinct labels) in 36 seconds
38 | 
39 | Test mmcity.dat file against list of IPs, one per line:
40 | 
41 |     $ csv2dat.py test GeoLiteCity.dat mmcity.dat ips.txt
42 |     ok: 670135 bad: 0
43 | 
44 | Flatten MaxMind City CSVs into one file (for easier editing):
45 | 
46 |     $ csv2dat.py -l GeoLiteCity-Location.csv flat GeoLiteCity-Blocks.csv > mmcity_flat.csv
47 | 
48 | Convert flattened MaxMind City files to .dat format:
49 | 
50 |     $ csv2dat.py -w mmcity.dat mmcity flatcity.csv
51 |     wrote 2943570-node trie with 2939800 networks (109370 distinct labels) in 36 seconds
52 | 
53 | Convert MaxMind ASN v6 CSV to .dat format:
54 | 
55 |     $ csv2dat.py -w mmasn6.dat mmasn6 GeoIPASNum2v6.csv
56 |     wrote 63125-node trie with 35983 networks (6737 distinct labels) in 2 seconds
57 | 
58 | Convert MaxMind City v6 CSV to .dat format:
59 | 
60 |     $ csv2dat.py -w mmcity6.dat mmcity6 GeoLiteCityv6.csv
61 |     wrote 80637-node trie with 13074 networks (205 distinct labels) in 2 seconds
62 | 
63 | Convert MaxMind Country CSV to .dat format:
64 | 
65 |     $ csv2dat.py -w mmcountry.dat mmcountry GeoIPCountryWhois.csv
66 |     wrote 136109-node trie with 133498 networks (250 distinct labels) in 8 seconds
67 | 
68 | Convert MaxMind Country v6 CSV to .dat format:
69 | 
70 |     $ csv2dat.py -w mmcountry6.dat mmcountry6 GeoIPv6.csv
71 |     wrote 102601-node trie with 17580 networks (215 distinct labels) in 3 seconds
72 | 
73 | Convert MaxMind ISP CSV to .dat format:
74 | 
75 |     $ csv2dat.py -w mmisp.dat mmisp GeoIPISP.csv
76 |     wrote 378619-node trie with 303605 networks (45963 distinct labels) in 19 seconds
77 | 
78 | Convert MaxMind Org CSV to .dat format:
79 | 
80 |     $ csv2dat.py -w mmorg.dat mmorg GeoIPOrg.csv
81 |     wrote 378619-node trie with 303605 networks (45963 distinct labels) in 19 seconds
82 | 
83 | 


--------------------------------------------------------------------------------
/csv2dat.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | import logging
  4 | import logging.handlers
  5 | import optparse
  6 | 
  7 | import csv
  8 | import fileinput
  9 | import itertools
 10 | import struct
 11 | import time
 12 | 
 13 | from functools import partial
 14 | 
 15 | import ipaddr
 16 | import pygeoip
 17 | 
 18 | cc_idx = dict((cc.lower(), i) for i,cc in enumerate(pygeoip.const.COUNTRY_CODES))
 19 | cc_idx['--'] = cc_idx['']
 20 | cc_idx['cw'] = cc_idx['an'] #netherlands antilles / curacao
 21 | cc_idx['uk'] = cc_idx['gb'] #uk / great britain
 22 | cc_idx['sx'] = cc_idx['fx'] #st. martin?
 23 | 
 24 | def init_logger(opts):
 25 |     level = logging.INFO
 26 |     handler = logging.StreamHandler()
 27 |     #handler = logging.handlers.SysLogHandler(address='/dev/log')
 28 |     if opts.debug:
 29 |         level = logging.DEBUG
 30 |         handler = logging.StreamHandler()
 31 |     root = logging.getLogger()
 32 |     root.setLevel(level)
 33 |     root.addHandler(handler)
 34 | 
 35 | def parse_args(argv):
 36 |     if argv is None:
 37 |         argv = sys.argv[1:]
 38 |     p = optparse.OptionParser()
 39 | 
 40 |     cmdlist = []
 41 |     for cmd, (f, usage) in sorted(cmds.iteritems()):
 42 |         cmdlist.append('%-8s\t%%prog %s' % (cmd, usage))
 43 |     cmdlist = '\n  '.join(cmdlist)
 44 | 
 45 |     p.usage = '%%prog [options] <cmd> <arg>+\n\nExamples:\n  %s' % cmdlist
 46 | 
 47 |     p.add_option('-d', '--debug', action='store_true',
 48 |             default=False, help="debug mode")
 49 |     p.add_option('-g', '--geoip', action='store_true',
 50 |             default=False, help='test with C GeoIP module')
 51 |     p.add_option('-w', '--write-dat', help='write filename.dat')
 52 |     p.add_option('-l', '--locations', help='city locations csv')
 53 |     opts, args = p.parse_args(argv)
 54 | 
 55 |     #sanity check
 56 |     if not args or args[0] not in cmds:
 57 |         p.error('missing command. choose from: %s' % ' '.join(sorted(cmds)))
 58 | 
 59 |     return opts, args
 60 | 
 61 | 
 62 | def test_dbs(opts, args):
 63 |     """test reference.dat and test.dat against a list of IPs and print any differences"""
 64 |     ref_file, tst_file = args[:2]
 65 |     gi_ref = pygeoip.GeoIP(ref_file, pygeoip.MEMORY_CACHE)
 66 |     gi_tst = pygeoip.GeoIP(tst_file, pygeoip.MEMORY_CACHE)
 67 |     dbtype = gi_ref._databaseType
 68 |     if gi_ref._databaseType != gi_tst._databaseType:
 69 |         print "error: database types don't match"
 70 |         exit(1)
 71 | 
 72 |     if opts.geoip:
 73 |         import GeoIP
 74 |         logging.debug('using GeoIP module')
 75 |         gi_ref = GeoIP.open(ref_file, pygeoip.MEMORY_CACHE)
 76 |         gi_tst = GeoIP.open(test_file, pygeoip.MEMORY_CACHE)
 77 |     else:
 78 |         logging.debug('using pygeoip module')
 79 | 
 80 |     isequal = lambda lhs, rhs: lhs == rhs
 81 |     if dbtype in (pygeoip.const.ASNUM_EDITION, pygeoip.const.ASNUM_EDITION_V6,
 82 |             pygeoip.const.ISP_EDITION, pygeoip.const.ORG_EDITION):
 83 |         get_ref = gi_ref.org_by_addr
 84 |         get_tst = gi_tst.org_by_addr
 85 |     elif dbtype in (pygeoip.const.CITY_EDITION_REV1, pygeoip.const.CITY_EDITION_REV1_V6):
 86 |         get_ref = gi_ref.record_by_addr
 87 |         get_tst = gi_tst.record_by_addr
 88 |         def isequal(lhs, rhs):
 89 |             if lhs and rhs:
 90 |                 #Python's float rounding makes these unpredictable,
 91 |                 #so just stomp them to ints as a sanity check.
 92 |                 for k in ('latitude', 'longitude'):
 93 |                     lhs[k] = int(lhs[k])
 94 |                     rhs[k] = int(rhs[k])
 95 |             return lhs == rhs
 96 |     elif dbtype in (pygeoip.const.COUNTRY_EDITION, pygeoip.const.COUNTRY_EDITION_V6):
 97 |         get_ref = gi_ref.country_code_by_addr
 98 |         get_tst = gi_tst.country_code_by_addr
 99 |     else:
100 |         print "error: unknown database type"
101 |         exit(1)
102 | 
103 |     ok = bad = 0
104 |     for ip in fileinput.input(args[2:]):
105 |         ip = ip.strip()
106 |         ref = get_ref(ip)
107 |         tst = get_tst(ip)
108 |         if not isequal(ref, tst):
109 |             print ip, ref, tst
110 |             bad += 1
111 |         else:
112 |             ok += 1
113 |     print 'ok:', ok, 'bad:', bad
114 | test_dbs.usage = 'test reference.dat test.dat ips.txt'
115 | 
116 | 
117 | def gen_csv(f):
118 |     """peek at rows from a csv and start yielding when we get past the comments
119 |     to a row that starts with an int"""
120 |     def startswith_int(row):
121 |         try:
122 |             int(row[0][0])
123 |             return True
124 |         except (ValueError, IndexError):
125 |             return False
126 | 
127 |     cr = csv.reader(f)
128 |     return itertools.dropwhile(lambda x: not startswith_int(x), cr)
129 | 
130 | 
131 | def flatten_city(opts, args):
132 |     """flatten MM blocks and locations CSVs into one file for easier editing"""
133 |     id_loc = dict((row[0], row[1:]) for row in gen_csv(open(opts.locations)))
134 |     cw = csv.writer(sys.stdout, lineterminator='\n')
135 |     for row in gen_csv(fileinput.input(args)):
136 |         row[-1:] = id_loc[row[-1]]
137 |         cw.writerow(row)
138 | flatten_city.usage = '-l GeoLiteCity-Location.csv flat GeoLiteCity-Blocks.csv > flatcity.csv'
139 | 
140 | 
141 | class RadixTreeNode(object):
142 |     __slots__ = ['segment', 'lhs', 'rhs']
143 |     def __init__(self, segment):
144 |         self.segment = segment
145 |         self.lhs = None
146 |         self.rhs = None
147 | 
148 | 
149 | class RadixTree(object):
150 |     def __init__(self, debug=False):
151 |         self.debug = debug
152 | 
153 |         self.netcount = 0
154 |         self.segments = [RadixTreeNode(0)]
155 |         self.data_offsets = {}
156 |         self.data_segments = []
157 |         self.cur_offset = 1
158 | 
159 |     def __setitem__(self, net, data):
160 |         self.netcount += 1
161 |         inet = int(net)
162 |         node = self.segments[0]
163 |         for depth in range(self.seek_depth, self.seek_depth - (net.prefixlen-1), -1):
164 |             if inet & (1 << depth):
165 |                 if not node.rhs:
166 |                     node.rhs = RadixTreeNode(len(self.segments))
167 |                     self.segments.append(node.rhs)
168 |                 node = node.rhs
169 |             else:
170 |                 if not node.lhs:
171 |                     node.lhs = RadixTreeNode(len(self.segments))
172 |                     self.segments.append(node.lhs)
173 |                 node = node.lhs
174 | 
175 |         if not data in self.data_offsets:
176 |             self.data_offsets[data] = self.cur_offset
177 |             enc_data = self.encode(*data)
178 |             self.data_segments.append(enc_data)
179 |             self.cur_offset += (len(enc_data))
180 | 
181 |         if self.debug:
182 |             #store net after data for easier debugging
183 |             data = data, net
184 | 
185 |         if inet & (1 << self.seek_depth - (net.prefixlen-1)):
186 |             node.rhs = data
187 |         else:
188 |             node.lhs = data
189 | 
190 |     def gen_nets(self, opts, args):
191 |         raise NotImplementedError
192 | 
193 |     def load(self, opts, args):
194 |         for nets, data in self.gen_nets(opts, args):
195 |             for net in nets:
196 |                 self[net] = data
197 | 
198 |     def dump_node(self, node):
199 |         if not node:
200 |             #empty leaf
201 |             return '--'
202 |         elif isinstance(node, RadixTreeNode):
203 |             #internal node
204 |             return node.segment
205 |         else:
206 |             #data leaf
207 |             data = node[0] if self.debug else node
208 |             return '%d %s' % (len(self.segments) + self.data_offsets[data], node)
209 | 
210 |     def dump(self):
211 |         for node in self.segments:
212 |             print node.segment, [self.dump_node(node.lhs), self.dump_node(node.rhs)]
213 | 
214 |     def encode(self, *args):
215 |         raise NotImplementedError
216 | 
217 |     def encode_rec(self, rec, reclen):
218 |         """encode rec as 4-byte little-endian int, then truncate it to reclen"""
219 |         assert(reclen <= 4)
220 |         return struct.pack('<I', rec)[:reclen]
221 | 
222 |     def serialize_node(self, node):
223 |         if not node:
224 |             #empty leaf
225 |             rec = len(self.segments)
226 |         elif isinstance(node, RadixTreeNode):
227 |             #internal node
228 |             rec = node.segment
229 |         else:
230 |             #data leaf
231 |             data = node[0] if self.debug else node
232 |             rec = len(self.segments) + self.data_offsets[data]
233 |         return self.encode_rec(rec, self.reclen)
234 | 
235 |     def serialize(self, f):
236 |         if len(self.segments) >= 2 ** (8 * self.segreclen):
237 |             logging.warning('too many segments for final segment record size!')
238 | 
239 |         for node in self.segments:
240 |             f.write(self.serialize_node(node.lhs))
241 |             f.write(self.serialize_node(node.rhs))
242 | 
243 |         f.write(chr(42)) #So long, and thanks for all the fish!
244 |         f.write(''.join(self.data_segments))
245 | 
246 |         f.write('csv2dat.py') #.dat file comment - can be anything
247 |         f.write(chr(0xFF) * 3)
248 |         f.write(chr(self.edition))
249 |         f.write(self.encode_rec(len(self.segments), self.segreclen))
250 | 
251 | 
252 | class ASNRadixTree(RadixTree):
253 |     usage = '-w mmasn.dat mmasn GeoIPASNum2.csv'
254 |     cmd = 'mmasn'
255 |     seek_depth = 31
256 |     edition = pygeoip.const.ASNUM_EDITION
257 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
258 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
259 | 
260 |     def gen_nets(self, opts, args):
261 |         for lo, hi, asn in gen_csv(fileinput.input(args)):
262 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
263 |             nets = ipaddr.summarize_address_range(lo, hi)
264 |             yield nets, (asn,)
265 | 
266 |     def encode(self, data):
267 |         return data + '\0'
268 | 
269 | 
270 | class ASNv6RadixTree(ASNRadixTree):
271 |     usage = '-w mmasn6.dat mmasn6 GeoIPASNum2v6.csv'
272 |     cmd = 'mmasn6'
273 |     seek_depth = 127
274 |     edition = pygeoip.const.ASNUM_EDITION_V6
275 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
276 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
277 | 
278 |     def gen_nets(self, opts, args):
279 |         for _, _, lo, hi, asn in gen_csv(fileinput.input(args)):
280 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
281 |             nets = ipaddr.summarize_address_range(lo, hi)
282 |             yield nets, (asn,)
283 | 
284 | 
285 | class ISPRadixTree(ASNRadixTree):
286 |     usage = '-w mmisp.dat mmisp GeoIPISP.csv'
287 |     cmd = 'mmisp'
288 |     seek_depth = 31
289 |     edition = pygeoip.const.ISP_EDITION
290 |     reclen = pygeoip.const.ORG_RECORD_LENGTH
291 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
292 | 
293 | 
294 | class OrgRadixTree(ASNRadixTree):
295 |     usage = '-w mmorg.dat mmorg GeoIPOrg.csv'
296 |     cmd = 'mmorg'
297 |     seek_depth = 31
298 |     edition = pygeoip.const.ORG_EDITION
299 |     reclen = pygeoip.const.ORG_RECORD_LENGTH
300 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
301 | 
302 | 
303 | class CityRev1RadixTree(RadixTree):
304 |     usage = '-w mmcity.dat [-l GeoLiteCity-Location.csv] mmcity GeoLiteCity-Blocks.csv'
305 |     cmd = 'mmcity'
306 |     seek_depth = 31
307 |     edition = pygeoip.const.CITY_EDITION_REV1
308 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
309 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
310 | 
311 |     def gen_nets(self, opts, args):
312 |         id_loc = None
313 |         if opts.locations:
314 |             id_loc = dict((row[0], row[1:]) for row in gen_csv(open(opts.locations)))
315 | 
316 |         for row in gen_csv(fileinput.input(args)):
317 |             lo, hi = row[:2]
318 |             loc = row[2:]
319 |             if id_loc:
320 |                 loc = id_loc[loc[0]]
321 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
322 |             nets = ipaddr.summarize_address_range(lo, hi)
323 |             yield nets, tuple(loc)
324 | 
325 |     def encode(self, country, region, city, postal_code, lat, lon, metro_code, area_code):
326 |         def str2num(num, ntype):
327 |             return ntype(num) if num else ntype(0)
328 | 
329 |         country = country.lower()
330 |         lat, lon = round(str2num(lat, float), 4), round(str2num(lon, float), 4)
331 |         metro_code, area_code = str2num(metro_code, int), str2num(area_code, int)
332 | 
333 |         buf = []
334 |         try:
335 |             buf.append(chr(cc_idx[country]))
336 |         except KeyError:
337 |             logging.warning("'%s': missing country. update pygeoip.const.COUNTRY_CODES?", country)
338 |             buf.append(chr(cc_idx['']))
339 |         buf.append('\0'.join((region, city, postal_code)))
340 |         buf.append('\0')
341 |         buf.append(self.encode_rec(int((lat + 180) * 10000), 3))
342 |         buf.append(self.encode_rec(int((lon + 180) * 10000), 3))
343 |         if (metro_code or area_code) and country == 'us':
344 |             buf.append(self.encode_rec(metro_code * 1000 + area_code, 3))
345 |         else:
346 |             buf.append('\0\0\0')
347 |         return ''.join(buf)
348 | 
349 | 
350 | class CityRev1v6RadixTree(CityRev1RadixTree):
351 |     usage = '-w mmcity6.dat mmcity6 GeoLiteCityv6.csv'
352 |     cmd = 'mmcity6'
353 |     seek_depth = 127
354 |     edition = pygeoip.const.CITY_EDITION_REV1
355 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
356 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
357 | 
358 |     def gen_nets(self, opts, args):
359 |         for row in gen_csv(fileinput.input(args)):
360 |             lo, hi = row[2:4]
361 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
362 |             nets = ipaddr.summarize_address_range(lo, hi)
363 |             #v6 postal_code is after lat/lon instead of before like v4
364 |             country, region, city, lat, lon, postal_code, metro_code, area_code = row[4:]
365 |             yield nets, (country, region, city, postal_code, lat, lon, metro_code, area_code)
366 | 
367 | 
368 | class CountryRadixTree(RadixTree):
369 |     usage = '-w mmcountry.dat mmcountry GeoIPCountryWhois.csv'
370 |     cmd = 'mmcountry'
371 |     seek_depth = 31
372 |     edition = pygeoip.const.COUNTRY_EDITION
373 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
374 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
375 | 
376 |     def gen_nets(self, opts, args):
377 |         for _, _, lo, hi, cc, _ in gen_csv(fileinput.input(args)):
378 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
379 |             nets = ipaddr.summarize_address_range(lo, hi)
380 |             yield nets, (cc,)
381 | 
382 |     def encode(self, cc):
383 |         #unused
384 |         return ''
385 | 
386 |     def serialize_node(self, node):
387 |         if not node:
388 |             #empty leaf
389 |             rec = pygeoip.const.COUNTRY_BEGIN
390 |         elif isinstance(node, RadixTreeNode):
391 |             #internal node
392 |             rec = node.segment
393 |         else:
394 |             #data leaf
395 |             data = node[0] if self.debug else node
396 |             cc = data[0]
397 |             try:
398 |                 offset = cc_idx[cc.lower()]
399 |             except KeyError:
400 |                 logging.warning("'%s': missing country. update pygeoip.const.COUNTRY_CODES?", cc)
401 |                 offset = 0
402 |             #data leaves directly encode cc index as an offset
403 |             rec = pygeoip.const.COUNTRY_BEGIN + offset
404 |         return self.encode_rec(rec, self.reclen)
405 | 
406 |     def serialize(self, f):
407 |         for node in self.segments:
408 |             f.write(self.serialize_node(node.lhs))
409 |             f.write(self.serialize_node(node.rhs))
410 | 
411 |         f.write(chr(0x00) * 3)
412 |         f.write('csv2dat.py') #.dat file comment - can be anything
413 |         f.write(chr(0xFF) * 3)
414 |         f.write(chr(self.edition))
415 |         f.write(self.encode_rec(len(self.segments), self.segreclen))
416 | 
417 | 
418 | class Countryv6RadixTree(CountryRadixTree):
419 |     usage = '-w mmcountry6.dat mmcountry6 GeoIPv6.csv'
420 |     cmd = 'mmcountry6'
421 |     seek_depth = 127
422 |     edition = pygeoip.const.COUNTRY_EDITION_V6
423 |     reclen = pygeoip.const.STANDARD_RECORD_LENGTH
424 |     segreclen = pygeoip.const.SEGMENT_RECORD_LENGTH
425 | 
426 |     def gen_nets(self, opts, args):
427 |         for row in gen_csv(fileinput.input(args)):
428 |             #handle weird space before quote problems
429 |             lo, hi, cc = [x.strip(' "') for x in row[2:5]]
430 |             lo, hi = ipaddr.IPAddress(int(lo)), ipaddr.IPAddress(int(hi))
431 |             nets = ipaddr.summarize_address_range(lo, hi)
432 |             yield nets, (cc,)
433 | 
434 | 
435 | def build_dat(RTree, opts, args):
436 |     tstart = time.time()
437 |     r = RTree(debug=opts.debug)
438 | 
439 |     r.load(opts, args)
440 | 
441 |     if opts.debug:
442 |         r.dump()
443 | 
444 |     with open(opts.write_dat, 'wb') as f:
445 |         r.serialize(f)
446 | 
447 |     tstop = time.time()
448 |     print 'wrote %d-node trie with %d networks (%d distinct labels) in %d seconds' % (
449 |             len(r.segments), r.netcount, len(r.data_offsets), tstop - tstart)
450 | 
451 | 
452 | rtrees = [
453 |     ASNRadixTree, ASNv6RadixTree,
454 |     CityRev1RadixTree, CityRev1v6RadixTree,
455 |     CountryRadixTree, Countryv6RadixTree,
456 |     ISPRadixTree, OrgRadixTree,
457 | ]
458 | cmds = dict((rtree.cmd, (partial(build_dat, rtree), rtree.usage)) for rtree in rtrees)
459 | cmds['flat'] = (flatten_city, flatten_city.usage)
460 | cmds['test'] = (test_dbs, test_dbs.usage)
461 | 
462 | def main(argv=None):
463 |     global opts
464 |     opts, args = parse_args(argv)
465 |     init_logger(opts)
466 |     logging.debug(opts)
467 |     logging.debug(args)
468 | 
469 |     cmd = args.pop(0)
470 |     cmd, usage = cmds[cmd]
471 |     return cmd(opts, args)
472 | 
473 | 
474 | if __name__ == '__main__':
475 |     rval = main()
476 |     logging.shutdown()
477 |     sys.exit(rval)
478 | 
479 | 


--------------------------------------------------------------------------------