├── DomainClassifier
    ├── __init__.py
    ├── domainclassifier.py
    └── test.py
├── README.md
├── doc
    ├── domainclassifier-flow.dot
    └── domainclassifier-flow.png
├── requirements.txt
└── setup.py


/DomainClassifier/__init__.py:
--------------------------------------------------------------------------------
# Version string exposed by the package.
# NOTE(review): setup.py declares version "1.4" while this says "1.1" —
# confirm which one is authoritative and align them.
__version__ = "1.1"
2 | 


--------------------------------------------------------------------------------
/DomainClassifier/domainclassifier.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """DomainClassifier is a simple Python library to extract and classify Internet
  3 | domains from raw text files following their existence, localization or
  4 | attributes.
  5 | """
  6 | 
  7 | import re
  8 | import dns.resolver
  9 | import IPy
 10 | import redis
 11 | import socket
 12 | import time
 13 | from datetime import date, timedelta
 14 | import os
 15 | import sys
 16 | from uuid import uuid4
 17 | 
 18 | from multiprocessing import Process as Proc
 19 | 
 20 | try:
 21 |     # python 3
 22 |     import urllib.request as urllib
 23 | except:
 24 |     # python 2
 25 |     import urllib2 as urllib
 26 | 
 27 | try:
 28 |     from pybgpranking import BGPRanking
 29 | except:
 30 |     print("pybgpranking is not installed - ranking of ASN values won't be possible")
# Module metadata.
__author__ = "Alexandre Dulaunoy"
__copyright__ = "Copyright 2012-2024, Alexandre Dulaunoy"
__license__ = "AGPL version 3"
# NOTE(review): setup.py declares version "1.4" while this says "1.1" —
# confirm which one is authoritative and align them.
__version__ = "1.1"
 35 | 
 36 | 
 37 | class Extract:
 38 | 
 39 |     """DomainClassifier Extract class is the base class for extracting domains
 40 |     from a rawtext stream. When call, the rawtext parameter is a string
 41 |     containing the raw data to be process."""
 42 | 
 43 |     def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53, redis_host='', redis_port=6379, redis_db=0, expire_time=3600, re_timeout=-1):
 44 |         self.rawtext = rawtext
 45 |         self.presolver = dns.resolver.Resolver()
 46 |         self.presolver.nameservers = nameservers
 47 |         self.presolver.port = port
 48 |         self.presolver.lifetime = 1.0
 49 |         self.bgprankingserver = 'pdns.circl.lu'
 50 |         self.vdomain = []
 51 |         self.listtld = []
 52 | 
 53 |         self.re_domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
 54 | 
 55 |         if redis_host and redis_port:
 56 |             self.redis = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True)
 57 |             self.uuid = str(uuid4())
 58 |             self.re_timeout = re_timeout
 59 |         else:
 60 |             self.redis = None
 61 |         self.expire_time = expire_time
 62 | 
 63 |         self.domain = self.potentialdomain()
 64 | 
 65 |     """__origin is a private function to the ASN lookup for an IP address via
 66 |     the Team Cymru DNS interface. ipadd is a string contain the IP address in a
 67 |     decimal form."""
 68 | 
 69 |     def __origin(self, ipaddr=None):
 70 | 
 71 |         if ipaddr:
 72 |             clook = (
 73 |                 IPy.IP(str(ipaddr))
 74 |                 .reverseName()
 75 |                 .replace('.in-addr.arpa.', '.origin.asn.cymru.com')
 76 |             )
 77 |             try:
 78 |                 a = self.presolver.query(clook, 'TXT')
 79 |             except dns.resolver.NXDOMAIN:
 80 |                 return None
 81 |             except dns.exception.Timeout:
 82 |                 return None
 83 |         if a:
 84 |             x = str(a[0]).split("|")
 85 |             # why so many spaces?
 86 |             x = list(map(lambda t: t.replace("\"", "").strip(), x))
 87 |             return (x[0], x[2], a[0])
 88 |         else:
 89 |             return None
 90 | 
 91 |     """__bgpanking return the ranking the float value of an ASN.
 92 |     """
 93 | 
 94 |     def __bgpranking(self, asn=None):
 95 |         if asn:
 96 |             bgpranking = BGPRanking()
 97 |             value = bgpranking.query(
 98 |                 asn, date=(date.today() - timedelta(1)).isoformat()
 99 |             )
100 |             return value['response']['ranking']['rank']
101 | 
102 |     def __updatelisttld(self, force=False):
103 |         ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
104 |         userdir = os.path.expanduser("~")
105 |         cachedir = os.path.join(userdir, ".DomainClassifier")
106 |         if not os.path.exists(cachedir):
107 |             os.mkdir(cachedir)
108 |         tldcache = os.path.join(cachedir, "tlds")
109 |         if not os.path.exists(tldcache):
110 |             print(tldcache)
111 |             req = urllib.Request(ianatldlist)
112 |             req.add_header(
113 |                 'User-Agent',
114 |                 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
115 |             )
116 |             tlds = (urllib.urlopen(req).read()).decode('utf8')
117 |             f = open(tldcache, "wb")
118 |             f.write(tlds.encode("utf-8"))
119 |             f.close()
120 | 
121 |         f = open(tldcache, "r")
122 |         tlds = f.read()
123 |         f.close()
124 |         tlds = tlds.split("\n")
125 |         for tld in tlds:
126 |             if tld:
127 |                 self.listtld.append(tld.lower())
128 | 
129 |     def __listtld(self):
130 |         if not self.listtld:
131 |             self.__updatelisttld()
132 |         self.cleandomain = []
133 |         if self.domain is None:
134 |             return False
135 |         for domain in self.domain:
136 |             lastpart = domain.rsplit(".")[-1:][0]
137 |             for tld in self.listtld:
138 |                 if lastpart == tld:
139 |                     self.cleandomain.append(domain)
140 | 
141 |         return self.cleandomain
142 | 
143 |     def __re_findall(self, rawtext):
144 |         for x in re.findall(self.re_domain, rawtext):
145 |             if x[0]:
146 |                 self.redis.sadd('cache:regex:{}'.format(self.uuid), x[0])
147 |         self.redis.expire('cache:regex:{}'.format(self.uuid), 360)
148 | 
149 |     def __regex_findall(self, rawtext, timeout):
150 |         proc = Proc(target=self.__re_findall, args=(rawtext,))
151 |         try:
152 |             proc.start()
153 |             proc.join(timeout)
154 |             if proc.is_alive():
155 |                 proc.terminate()
156 |                 print('regex: processing timeout')
157 |                 return []
158 |             else:
159 |                 domains = self.redis.smembers('cache:regex:{}'.format(self.uuid))
160 |                 self.redis.delete('cache:regex:{}'.format(self.uuid))
161 |                 proc.terminate()
162 |                 return domains
163 |         except KeyboardInterrupt:
164 |             print("Caught KeyboardInterrupt, terminating workers")
165 |             proc.terminate()
166 |             sys.exit(0)
167 | 
168 |     def text(self, rawtext=''):
169 |         if rawtext:
170 |             self.rawtext = rawtext
171 |             self.domain = self.potentialdomain()
172 |             self.vdomain = []
173 |             return True
174 |         return False
175 | 
176 |     """potentialdomain method extracts potential domains matching any
177 |     string that is a serie of string with maximum 63 character separated by a
178 |     dot. The method used the rawtext defined at the instantiation of the class.
179 |     This return a list of a potential domain."""
180 | 
181 |     def potentialdomain(self, validTLD=True):
182 |         self.domain = []
183 |         if self.re_timeout > 0 and self.redis:
184 |             self.domain = list(self.__regex_findall(self.rawtext, self.re_timeout))
185 |         else:
186 |             domains = self.re_domain.findall(self.rawtext)
187 |             for x in domains:
188 |                 if x[0]:
189 |                     self.domain.append(x[0])
190 |         if validTLD:
191 |             self.domain = self.__listtld()
192 |         return self.domain
193 | 
194 |     """validdomain method used the extracted domains from the domain method to
195 |     generate a list of valid domain (at least existing in the authoritative DNS
196 |     server". The records type used are A, AAAA, SOA, MX and CNAME records. This
197 |     returns a list of existing domain. If the extended flag is true, a set is
198 |     return with the associated DNS resources found."""
199 | 
200 |     def validdomain(
201 |         self,
202 |         rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
203 |         extended=True,
204 |         passive_dns=False,
205 |     ):
206 |         if extended is False:
207 |             self.vdomain = set()
208 |         else:
209 |             self.vdomain = []
210 | 
211 |         for domain in self.domain:
212 |             if self.redis:
213 |                 if self.redis.exists('dom_class:cache:{}'.format(domain)):
214 |                     passive_dns_out = self.redis.smembers('dom_class:cache:{}'.format(domain))
215 |                     for out in passive_dns_out:
216 |                         if extended:
217 |                             out = tuple(out.split('[^]', 2))
218 |                             self.vdomain.append(out)
219 |                         else:
220 |                             self.vdomain.add(out)
221 |                     continue
222 | 
223 |             for dnstype in rtype:
224 |                 try:
225 |                     answers = self.presolver.query(domain, dnstype)
226 |                 except:
227 |                     pass
228 |                 else:
229 |                     # Passive DNS output
230 |                     # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count
231 |                     if passive_dns:
232 |                         rrset = answers.rrset.to_text().splitlines()
233 |                         for dns_resp in rrset:
234 |                             dns_resp = dns_resp.split()
235 |                             passive_dns_out = (
236 |                                 '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
237 |                                     time.time(),
238 |                                     self.presolver.nameservers[0],
239 |                                     dns_resp[2],
240 |                                     domain,
241 |                                     dnstype,
242 |                                     dns_resp[4],
243 |                                     answers.ttl,
244 |                                 )
245 |                             )
246 |                             self.vdomain.add((passive_dns_out))
247 |                             if self.redis:
248 |                                 self.redis.sadd('dom_class:cache:{}'.format(domain), passive_dns_out)
249 |                                 self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
250 |                     elif extended:
251 |                         self.vdomain.append((domain, dnstype, answers[0]))
252 |                         if self.redis:
253 |                             self.redis.sadd('dom_class:cache:{}'.format(domain), '{}[^]{}[^]{}'.format(domain, dnstype, answers[0]))
254 |                             self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
255 |                     else:
256 |                         self.vdomain.add((domain))
257 |                         if self.redis:
258 |                             self.redis.sadd('dom_class:cache:{}'.format(domain), domain)
259 |                             self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
260 |         return self.vdomain
261 | 
262 |     """ipaddress method extracts from the domain list the valid IPv4 addresses"""
263 | 
264 |     def ipaddress(self, extended=False):
265 | 
266 |         if extended is False:
267 |             self.ipaddresses = []
268 |         else:
269 |             self.ipaddresses = set()
270 | 
271 |         for d in self.domain:
272 |             try:
273 |                 ip = socket.gethostbyname(d)
274 |             except:
275 |                 continue
276 | 
277 |             if extended is False:
278 |                 self.ipaddresses.append((ip))
279 |             else:
280 |                 orig = self.__origin(ipaddr=ip)
281 |                 self.ipaddresses.add((ip, str(orig)))
282 | 
283 |         return self.ipaddresses
284 | 
285 |     """localizedomain method use the validdomain list (in extended format) to
286 |     localize per country code the associated resources. The cc argument specifies the
287 |     country code in ISO 3166-1 alpha-2 format to check for."""
288 | 
289 |     def localizedomain(self, cc=None):
290 |         self.localdom = []
291 | 
292 |         for dom in self.vdomain:
293 |             if dom[1] == 'A':
294 |                 ip = dom[2]
295 |                 try:
296 |                     orig = self.__origin(ipaddr=dom[2])[1]
297 |                 except:
298 |                     continue
299 |                 if orig == cc:
300 |                     self.localdom.append(dom)
301 |             elif dom[1] == 'CNAME':
302 |                 cname = str(dom[2])
303 |                 ip = socket.gethostbyname(cname)
304 |                 try:
305 |                     orig = self.__origin(ipaddr=ip)[1]
306 |                 except:
307 |                     continue
308 |                 if orig == cc:
309 |                     self.localdom.append(dom)
310 |         return self.localdom
311 | 
312 |     """rankdomain method use the validdomain list (in extended format to rank
313 |     each domain with an IP address. Return a sorted list of tuples (ranking,
314 |     domain).
315 |     """
316 | 
317 |     def rankdomain(self):
318 |         self.rankdom = []
319 | 
320 |         if self.vdomain:
321 |             for dom in self.vdomain:
322 |                 rank = None
323 |                 asn = None
324 |                 if dom[1] == 'A':
325 |                     ip = dom[2]
326 |                     o = self.__origin(ipaddr=dom[2])
327 |                     if o:
328 |                         asn = o[0]
329 |                     rank = self.__bgpranking(asn)
330 |                     t = (rank, dom[0])
331 |                     self.rankdom.append(t)
332 |                 elif dom[1] == 'CNAME':
333 |                     cname = str(dom[2])
334 |                     try:
335 |                         ip = socket.gethostbyname(cname)
336 |                     except:
337 |                         continue
338 |                     try:
339 |                         asn = self.__origin(ipaddr=ip)[0]
340 |                     except TypeError:
341 |                         continue
342 |                     rank = self.__bgpranking(asn)
343 |                     t = (rank, dom[0])
344 |                     self.rankdom.append(t)
345 |             return sorted(self.rankdom, key=lambda d: d[0])
346 | 
347 |     """exclude domains from a regular expression. If validdomain was called,
348 |     it's only on the valid domain list."""
349 | 
350 |     """exclude domains from a regular expression. If validdomain was called,
351 |     it's only on the valid domain list."""
352 | 
353 |     def exclude(self, expression=None):
354 |         self.cleandomain = []
355 | 
356 |         excludefilter = re.compile(expression)
357 | 
358 |         if not self.vdomain:
359 |             domains = self.domain
360 |         else:
361 |             domains = self.vdomain
362 | 
363 |         for dom in domains:
364 |             if type(dom) == tuple:
365 |                 dom = dom[0]
366 | 
367 |             if excludefilter.search(dom):
368 |                 pass
369 |             else:
370 |                 self.cleandomain.append(dom)
371 |         return self.cleandomain
372 | 
373 |     def include(self, expression=None):
374 |         self.cleandomain = []
375 | 
376 |         includefilter = re.compile(expression)
377 | 
378 |         if not self.vdomain:
379 |             domains = self.domain
380 |         else:
381 |             domains = self.vdomain
382 | 
383 |         for dom in domains:
384 |             if type(dom) == tuple:
385 |                 dom = dom[0]
386 |             if includefilter.search(dom):
387 |                 self.cleandomain.append(dom)
388 | 
389 |         return set(self.cleandomain)
390 | 
391 | 
if __name__ == "__main__":
    # Demonstration run: extract, validate, localize, rank and filter the
    # domains found in a few sample texts (requires network access).
    first_sample = "www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
    second_sample = "www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"

    c = Extract(rawtext=first_sample)
    c.text(rawtext=second_sample)

    # Potential domains, first with the default filter, then explicitly.
    for options in ({}, {'validTLD': True}):
        print(c.potentialdomain(**options))
    print(c.validdomain(extended=True))

    # Valid domains per country code.
    for label, country in (("US:", 'US'), ("LU:", 'LU'), ("BE:", 'BE')):
        print(label)
        print(c.localizedomain(cc=country))

    print("Ranking:")
    print(c.rankdomain())
    print("List of ip addresses:")
    print(c.ipaddress(extended=False))

    # Regular-expression based filtering.
    lu_only = r'\.lu$'
    print("Include dot.lu:")
    print(c.include(expression=lu_only))
    print("Exclude dot.lu:")
    print(c.exclude(expression=lu_only))

    # Second text: default validation, then passive DNS output.
    c.text(rawtext="www.lwn.net www.undeadly.org")
    print(c.potentialdomain(validTLD=True))
    c.validdomain()
    print(c.localizedomain(cc='US'))
    print(c.validdomain(extended=False, passive_dns=True))
421 | 


--------------------------------------------------------------------------------
/DomainClassifier/test.py:
--------------------------------------------------------------------------------
 1 | import domainclassifier
 2 | 
 3 | c = domainclassifier.Extract( rawtext = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1", nameservers = ['8.8.8.8'])
 4 | #print c.potentialdomain()
 5 | print(c.validdomain(extended=True))
 6 | print("US:")
 7 | print(c.localizedomain(cc='US'))
 8 | print("LU:")
 9 | print(c.localizedomain(cc='LU'))
10 | print("BE:")
11 | print(c.localizedomain(cc='BE'))
12 | print("Ranking:")
13 | print(c.rankdomain())
14 | print("List of ip addresses:")
15 | print(c.ipaddress(extended=True))
16 | print("Include dot.lu:")
17 | print(c.include(expression=r'\.lu$'))
18 | print("Exclude dot.lu:")
19 | print(c.exclude(expression=r'\.lu$'))
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | DomainClassifier
  2 | ================
  3 | 
  4 | DomainClassifier is a simple Python library to extract and classify Internet domains/hostnames/IP addresses from raw unstructured text files following their existence, localization or attributes.
  5 | 
  6 | DomainClassifier can be used to extract Internet hosts from any free texts or collected unstructured information. A passive dns output is also available.
  7 | 
  8 | ![An overview of the DomainClassifier methods](https://raw.github.com/adulau/DomainClassifier/master/doc/domainclassifier-flow.png)
  9 | 
 10 | Install
 11 | -------
 12 | 
 13 | [DomainClassifier](https://pypi.python.org/pypi/DomainClassifier/) is available on PyPI and can be installed with pip:
 14 | 
 15 | `pip install DomainClassifier`
 16 | 
 17 | ```python
 18 | 
 19 | In [11]: c = DomainClassifier.domainclassifier.Extract(rawtext="www.google.com foo.bar ppp.ppp")
 20 | 
 21 | In [12]: c.potentialdomain()
 22 | Out[12]: ['www.google.com', 'foo.bar']
 23 | ```
 24 | 
 25 | How To Use It
 26 | -------------
 27 | 
 28 | 
 29 | ```python
 30 | import DomainClassifier.domainclassifier
 31 | 
 32 | c = DomainClassifier.domainclassifier.Extract( rawtext = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1")
 35 | 
 36 | # extracting potentially valid domains from rawtext
 37 | print(c.potentialdomain())
 38 | 
 39 | # reduce set of potentially valid domains to existing domains
 40 | # (based on SOA,A,AAAA,CNAME,MX records)
 41 | print(c.validdomain(extended=True))
 42 | 
 43 | # reduce set of valid domains with DNS records associated to a
 44 | # specified country
 45 | print("US:")
 46 | print(c.localizedomain(cc='US'))
 47 | print("LU:")
 48 | print(c.localizedomain(cc='LU'))
 49 | print("BE:")
 50 | print(c.localizedomain(cc='BE'))
 51 | print("Ranking:")
 52 | print(c.rankdomain())
 53 | 
 54 | # extract valid IPv4 addresses (using the potential list of valid domains)
 55 | print("List of ip addresses:")
 56 | print(c.ipaddress(extended=True))
 57 | 
 58 | # some more filtering
 59 | print("Include dot.lu:")
 60 | print(c.include(expression=r'\.lu$'))
 61 | print("Exclude dot.lu:")
 62 | print(c.exclude(expression=r'\.lu$'))
 63 | ```
 64 | 
 65 | ### Sample output
 66 | 
 67 | ```python
 68 | ['www.xxx.com', 'foo.lu', 'abc.lu', 'a.b.c.d.e', '1.2.3.4', 'foo.be', 'www.belnet.be', 'www.cert.be', 'www.public.lu', 'www.allo.lu', 'www.eurodns.com', 'something-broken-www.google.com', 'www.google.lu', 'www.facebook.com', 'www.nic.ru', 'www.youporn.com', '8.8.8.8', '201.1.1.1']
 69 | [('www.xxx.com', 'A', <DNS IN A rdata: 67.23.112.226>), ('abc.lu', 'SOA', <DNS IN SOA rdata: neptun.vo.lu. Administrator.vo.lu. 2006063001 86400 7200 2419200 3600>), ('abc.lu', 'MX', <DNS IN MX rdata: 10 proteus.vo.lu.>), ('foo.be', 'A', <DNS IN A rdata: 188.65.217.78>), ('foo.be', 'AAAA', <DNS IN AAAA rdata: 2001:6f8:202:2df::2>), ('foo.be', 'SOA', <DNS IN SOA rdata: ka.quuxlabs.com. adulau.foo.be. 2010121901 21600 3600 604800 86400>), ('foo.be', 'MX', <DNS IN MX rdata: 10 mail.foo.be.>), ('www.belnet.be', 'A', <DNS IN A rdata: 193.190.130.15>), ('www.belnet.be', 'AAAA', <DNS IN AAAA rdata: 2001:6a8:3c80:8300::15>), ('www.belnet.be', 'CNAME', <DNS IN CNAME rdata: fiorano.belnet.be.>), ('www.cert.be', 'A', <DNS IN A rdata: 193.190.198.61>), ('www.cert.be', 'AAAA', <DNS IN AAAA rdata: 2001:6a8:3c80::61>), ('www.cert.be', 'SOA', <DNS IN SOA rdata: ns.belnet.be. hostmaster.belnet.be. 2013053039 360 180 1209600 3600>), ('www.cert.be', 'MX', <DNS IN MX rdata: 10 asp-mxa.belnet.be.>), ('www.cert.be', 'CNAME', <DNS IN CNAME rdata: cert.be.>), ('www.public.lu', 'A', <DNS IN A rdata: 194.154.200.74>), ('www.allo.lu', 'A', <DNS IN A rdata: 80.90.47.69>), ('www.eurodns.com', 'A', <DNS IN A rdata: 80.92.65.165>), ('www.google.lu', 'A', <DNS IN A rdata: 173.194.66.94>), ('www.google.lu', 'AAAA', <DNS IN AAAA rdata: 2a00:1450:400c:c03::5e>), ('www.facebook.com', 'A', <DNS IN A rdata: 31.13.64.1>), ('www.facebook.com', 'AAAA', <DNS IN AAAA rdata: 2a03:2880:10:8f07:face:b00c::1>), ('www.facebook.com', 'MX', <DNS IN MX rdata: 10 msgin.t.facebook.com.>), ('www.facebook.com', 'CNAME', <DNS IN CNAME rdata: star.c10r.facebook.com.>), ('www.nic.ru', 'A', <DNS IN A rdata: 194.85.61.42>), ('www.nic.ru', 'MX', <DNS IN MX rdata: 0 nomail.nic.ru.>), ('www.youporn.com', 'A', <DNS IN A rdata: 31.192.116.24>), ('www.youporn.com', 'SOA', <DNS IN SOA rdata: pdns1.ultradns.net. dns.manwin.com. 
2012041840 86400 86400 86400 86400>), ('www.youporn.com', 'MX', <DNS IN MX rdata: 20 smtp-scan01.mx.reflected.net.>), ('www.youporn.com', 'CNAME', <DNS IN CNAME rdata: youporn.com.>)]
 70 | US:
 71 | [('www.xxx.com', 'A', <DNS IN A rdata: 67.23.112.226>), ('www.google.lu', 'A', <DNS IN A rdata: 173.194.66.94>)]
 72 | LU:
 73 | [('www.public.lu', 'A', <DNS IN A rdata: 194.154.200.74>), ('www.allo.lu', 'A', <DNS IN A rdata: 80.90.47.69>), ('www.eurodns.com', 'A', <DNS IN A rdata: 80.92.65.165>)]
 74 | BE:
 75 | [('foo.be', 'A', <DNS IN A rdata: 188.65.217.78>), ('www.belnet.be', 'A', <DNS IN A rdata: 193.190.130.15>), ('www.belnet.be', 'CNAME', <DNS IN CNAME rdata: fiorano.belnet.be.>), ('www.cert.be', 'A', <DNS IN A rdata: 193.190.198.61>), ('www.cert.be', 'CNAME', <DNS IN CNAME rdata: cert.be.>)]
 76 | Ranking:
 77 | [(1.0, 'www.youporn.com'), (1.0, 'www.youporn.com'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.cert.be'), (1.0000120563271599, 'www.cert.be'), (1.0000372023809501, 'foo.be'), (1.0001395089285701, 'www.public.lu'), (1.00015419407895, 'www.allo.lu'), (1.0003662109375, 'www.eurodns.com'), (1.0004111842105301, 'www.xxx.com'), (1.0005944293478299, 'www.nic.ru'), (1.0024646577381, 'www.facebook.com'), (1.0024646577381, 'www.facebook.com'), (1.002635288165, 'www.google.lu')]
 78 | List of ip addresses:
 79 | ('15169', 'AU', <DNS IN TXT rdata: "15169 | 1.2.3.0/24 | AU | apnic | 2011-08-11">)
 80 | ('15169', 'US', <DNS IN TXT rdata: "15169 | 8.8.8.0/24 | US | arin | 1992-12-01">)
 81 | ('27699', 'BR', <DNS IN TXT rdata: "27699 | 201.1.0.0/17 | BR | lacnic | 2003-12-08">)
 82 | set([('201.1.1.1', '(\'27699\', \'BR\', <DNS IN TXT rdata: "27699 | 201.1.0.0/17 | BR | lacnic | 2003-12-08">)'), ('8.8.8.8', '(\'15169\', \'US\', <DNS IN TXT rdata: "15169 | 8.8.8.0/24 | US | arin | 1992-12-01">)'), ('1.2.3.4', '(\'15169\', \'AU\', <DNS IN TXT rdata: "15169 | 1.2.3.0/24 | AU | apnic | 2011-08-11">)')])
 83 | Include dot.lu:
 84 | ['abc.lu', 'abc.lu', 'www.public.lu', 'www.allo.lu', 'www.google.lu', 'www.google.lu']
 85 | Exclude dot.lu:
 86 | ['www.xxx.com', 'foo.be', 'foo.be', 'foo.be', 'foo.be', 'www.belnet.be', 'www.belnet.be', 'www.belnet.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.eurodns.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.nic.ru', 'www.nic.ru', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com']
 87 | ```
 88 | 
 89 | ### Software Required
 90 | 
 91 | * Python (tested successfully on version 2.6, 2.7 and 3.5)
 92 | * dnspython library - http://www.dnspython.org/
 93 | * IPy and redis (Python client) libraries
 94 | * [pybgpranking](https://github.com/D4-project/BGP-Ranking/tree/master/client) to get malicious ranking of BGP AS number via [BGP Ranking](https://github.com/D4-project/BGP-Ranking)
 95 | 
 96 | ### Software using DomainClassifier
 97 | 
 98 | * [AIL framework - Analysis Information Leak framework](https://github.com/ail-project/ail-framework)
 99 | 
100 | ### License
101 | 
102 | ~~~~
103 | Copyright (C) 2012-2023 Alexandre Dulaunoy - a(at)foo.be
104 | Copyright (C) 2021 Aurelien Thirion
105 | 
106 | This program is free software: you can redistribute it and/or modify
107 | it under the terms of the GNU Affero General Public License as
108 | published by the Free Software Foundation, either version 3 of the
109 | License, or (at your option) any later version.
110 | 
111 | This program is distributed in the hope that it will be useful,
112 | but WITHOUT ANY WARRANTY; without even the implied warranty of
113 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
114 | GNU Affero General Public License for more details.
115 | 
116 | You should have received a copy of the GNU Affero General Public License
117 | along with this program.  If not, see <http://www.gnu.org/licenses/>.
118 | ~~~~
119 | 


--------------------------------------------------------------------------------
/doc/domainclassifier-flow.dot:
--------------------------------------------------------------------------------
// Flow of the DomainClassifier methods, from the raw text to the resulting
// set of hostnames. Filled boxes are methods; edges give the call order.
digraph g{
  // Input: the raw text handed to the library.
  z [label="raw text including probable hostnames"];
  a -> b;
  // Extraction step.
  a [label=".potentialdomain()",shape=box,fillcolor="palegreen",style="filled"];
  z -> a [label=" extracting"];
  // Extended validation feeds the localization and ranking branch.
  b [label=".validdomain(extended=True)",shape=box,fillcolor="palegreen",style="filled"];
  c [label=".localizedomain(cc='country code')",shape=box,fillcolor="palegreen",style="filled"];
 b->c;
  // Non-extended validation goes straight to the filters.
  e [label=".validdomain(extended=False)",shape=box,fillcolor="palegreen",style="filled"];
a->e;
f [label=".rankdomain()",shape=box,fillcolor="palegreen",style="filled"];
c->f;
// Regular-expression filters applied before the final result.
g [label=".include(expression=regexp)",shape=box,fillcolor="palegreen",style="filled" ];
h [label=".exclude(expression=regexp)",shape=box,fillcolor="palegreen",style="filled" ];
f->g;
g->h;
e->g;
// Output node.
i [label="set of hostnames", fillcolor="palegreen",style="filled"];
h->i;
}
21 | 


--------------------------------------------------------------------------------
/doc/domainclassifier-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adulau/DomainClassifier/d3f8129399b9030bbb298412b458c0d4e35114c6/doc/domainclassifier-flow.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | IPy
2 | dnspython
3 | git+https://github.com/D4-project/BGP-Ranking.git/#egg=pybgpranking&subdirectory=client
4 | redis
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from pathlib import Path
 3 | this_directory = Path(__file__).parent
 4 | long_description = (this_directory / "README.md").read_text()
 5 | 
 6 | setup(
 7 |     name="DomainClassifier",
 8 |     version="1.4",
 9 |     packages=find_packages(),
10 |     install_requires=['dnspython', 'IPy', 'pybgpranking'],
11 |     author="Alexandre Dulaunoy",
12 |     author_email="a@foo.be",
13 |     description="DomainClassifier is a Python library to extract and classify Internet domains/hostnames/IP addresses from raw unstructured text files following their existence, localization or attributes.",
14 |     long_description=long_description,
15 |     long_description_content_type='text/markdown',
16 |     license="AGPL",
17 |     keywords="internet mining domain resolver geolocalisation",
18 |     url="http://github.com/adulau/DomainClassifier"
19 | )
20 | 


--------------------------------------------------------------------------------