├── DomainClassifier ├── __init__.py ├── domainclassifier.py └── test.py ├── README.md ├── doc ├── domainclassifier-flow.dot └── domainclassifier-flow.png ├── requirements.txt └── setup.py /DomainClassifier/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1" 2 | -------------------------------------------------------------------------------- /DomainClassifier/domainclassifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """DomainClassifier is a simple Python library to extract and classify Internet 3 | domains from raw text files following their existence, localization or 4 | attributes. 5 | """ 6 | 7 | import re 8 | import dns.resolver 9 | import IPy 10 | import redis 11 | import socket 12 | import time 13 | from datetime import date, timedelta 14 | import os 15 | import sys 16 | from uuid import uuid4 17 | 18 | from multiprocessing import Process as Proc 19 | 20 | try: 21 | # python 3 22 | import urllib.request as urllib 23 | except: 24 | # python 2 25 | import urllib2 as urllib 26 | 27 | try: 28 | from pybgpranking import BGPRanking 29 | except: 30 | print("pybgpranking is not installed - ranking of ASN values won't be possible") 31 | __author__ = "Alexandre Dulaunoy" 32 | __copyright__ = "Copyright 2012-2024, Alexandre Dulaunoy" 33 | __license__ = "AGPL version 3" 34 | __version__ = "1.1" 35 | 36 | 37 | class Extract: 38 | 39 | """DomainClassifier Extract class is the base class for extracting domains 40 | from a rawtext stream. When call, the rawtext parameter is a string 41 | containing the raw data to be process.""" 42 | 43 | def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53, redis_host='', redis_port=6379, redis_db=0, expire_time=3600, re_timeout=-1): 44 | self.rawtext = rawtext 45 | self.presolver = dns.resolver.Resolver() 46 | self.presolver.nameservers = nameservers 47 | self.presolver.port = port 48 | self.presolver.lifetime = 1.0 49 | self.bgprankingserver = 'pdns.circl.lu' 50 | self.vdomain = [] 51 | self.listtld = [] 52 | 53 | self.re_domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') 54 | 55 | if redis_host and redis_port: 56 | self.redis = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True) 57 | self.uuid = str(uuid4()) 58 | self.re_timeout = re_timeout 59 | else: 60 | self.redis = None 61 | self.expire_time = expire_time 62 | 63 | self.domain = self.potentialdomain() 64 | 65 | """__origin is a private function to the ASN lookup for an IP address via 66 | the Team Cymru DNS interface. ipadd is a string contain the IP address in a 67 | decimal form.""" 68 | 69 | def __origin(self, ipaddr=None): 70 | 71 | if ipaddr: 72 | clook = ( 73 | IPy.IP(str(ipaddr)) 74 | .reverseName() 75 | .replace('.in-addr.arpa.', '.origin.asn.cymru.com') 76 | ) 77 | try: 78 | a = self.presolver.query(clook, 'TXT') 79 | except dns.resolver.NXDOMAIN: 80 | return None 81 | except dns.exception.Timeout: 82 | return None 83 | if a: 84 | x = str(a[0]).split("|") 85 | # why so many spaces? 86 | x = list(map(lambda t: t.replace("\"", "").strip(), x)) 87 | return (x[0], x[2], a[0]) 88 | else: 89 | return None 90 | 91 | """__bgpanking return the ranking the float value of an ASN. 92 | """ 93 | 94 | def __bgpranking(self, asn=None): 95 | if asn: 96 | bgpranking = BGPRanking() 97 | value = bgpranking.query( 98 | asn, date=(date.today() - timedelta(1)).isoformat() 99 | ) 100 | return value['response']['ranking']['rank'] 101 | 102 | def __updatelisttld(self, force=False): 103 | ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" 104 | userdir = os.path.expanduser("~") 105 | cachedir = os.path.join(userdir, ".DomainClassifier") 106 | if not os.path.exists(cachedir): 107 | os.mkdir(cachedir) 108 | tldcache = os.path.join(cachedir, "tlds") 109 | if not os.path.exists(tldcache): 110 | print(tldcache) 111 | req = urllib.Request(ianatldlist) 112 | req.add_header( 113 | 'User-Agent', 114 | 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0', 115 | ) 116 | tlds = (urllib.urlopen(req).read()).decode('utf8') 117 | f = open(tldcache, "wb") 118 | f.write(tlds.encode("utf-8")) 119 | f.close() 120 | 121 | f = open(tldcache, "r") 122 | tlds = f.read() 123 | f.close() 124 | tlds = tlds.split("\n") 125 | for tld in tlds: 126 | if tld: 127 | self.listtld.append(tld.lower()) 128 | 129 | def __listtld(self): 130 | if not self.listtld: 131 | self.__updatelisttld() 132 | self.cleandomain = [] 133 | if self.domain is None: 134 | return False 135 | for domain in self.domain: 136 | lastpart = domain.rsplit(".")[-1:][0] 137 | for tld in self.listtld: 138 | if lastpart == tld: 139 | self.cleandomain.append(domain) 140 | 141 | return self.cleandomain 142 | 143 | def __re_findall(self, rawtext): 144 | for x in re.findall(self.re_domain, rawtext): 145 | if x[0]: 146 | self.redis.sadd('cache:regex:{}'.format(self.uuid), x[0]) 147 | self.redis.expire('cache:regex:{}'.format(self.uuid), 360) 148 | 149 | def __regex_findall(self, rawtext, timeout): 150 | proc = Proc(target=self.__re_findall, args=(rawtext,)) 151 | try: 152 | proc.start() 153 | proc.join(timeout) 154 | if proc.is_alive(): 155 | proc.terminate() 156 | print('regex: processing timeout') 157 | return [] 158 | else: 159 | domains = self.redis.smembers('cache:regex:{}'.format(self.uuid)) 160 | self.redis.delete('cache:regex:{}'.format(self.uuid)) 161 | proc.terminate() 162 | return domains 163 | except KeyboardInterrupt: 164 | print("Caught KeyboardInterrupt, terminating workers") 165 | proc.terminate() 166 | sys.exit(0) 167 | 168 | def text(self, rawtext=''): 169 | if rawtext: 170 | self.rawtext = rawtext 171 | self.domain = self.potentialdomain() 172 | self.vdomain = [] 173 | return True 174 | return False 175 | 176 | """potentialdomain method extracts potential domains matching any 177 | string that is a serie of string with maximum 63 character separated by a 178 | dot. The method used the rawtext defined at the instantiation of the class. 179 | This return a list of a potential domain.""" 180 | 181 | def potentialdomain(self, validTLD=True): 182 | self.domain = [] 183 | if self.re_timeout > 0 and self.redis: 184 | self.domain = list(self.__regex_findall(self.rawtext, self.re_timeout)) 185 | else: 186 | domains = self.re_domain.findall(self.rawtext) 187 | for x in domains: 188 | if x[0]: 189 | self.domain.append(x[0]) 190 | if validTLD: 191 | self.domain = self.__listtld() 192 | return self.domain 193 | 194 | """validdomain method used the extracted domains from the domain method to 195 | generate a list of valid domain (at least existing in the authoritative DNS 196 | server". The records type used are A, AAAA, SOA, MX and CNAME records. This 197 | returns a list of existing domain. If the extended flag is true, a set is 198 | return with the associated DNS resources found.""" 199 | 200 | def validdomain( 201 | self, 202 | rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], 203 | extended=True, 204 | passive_dns=False, 205 | ): 206 | if extended is False: 207 | self.vdomain = set() 208 | else: 209 | self.vdomain = [] 210 | 211 | for domain in self.domain: 212 | if self.redis: 213 | if self.redis.exists('dom_class:cache:{}'.format(domain)): 214 | passive_dns_out = self.redis.smembers('dom_class:cache:{}'.format(domain)) 215 | for out in passive_dns_out: 216 | if extended: 217 | out = tuple(out.split('[^]', 2)) 218 | self.vdomain.append(out) 219 | else: 220 | self.vdomain.add(out) 221 | continue 222 | 223 | for dnstype in rtype: 224 | try: 225 | answers = self.presolver.query(domain, dnstype) 226 | except: 227 | pass 228 | else: 229 | # Passive DNS output 230 | # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count 231 | if passive_dns: 232 | rrset = answers.rrset.to_text().splitlines() 233 | for dns_resp in rrset: 234 | dns_resp = dns_resp.split() 235 | passive_dns_out = ( 236 | '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format( 237 | time.time(), 238 | self.presolver.nameservers[0], 239 | dns_resp[2], 240 | domain, 241 | dnstype, 242 | dns_resp[4], 243 | answers.ttl, 244 | ) 245 | ) 246 | self.vdomain.add((passive_dns_out)) 247 | if self.redis: 248 | self.redis.sadd('dom_class:cache:{}'.format(domain), passive_dns_out) 249 | self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) 250 | elif extended: 251 | self.vdomain.append((domain, dnstype, answers[0])) 252 | if self.redis: 253 | self.redis.sadd('dom_class:cache:{}'.format(domain), '{}[^]{}[^]{}'.format(domain, dnstype, answers[0])) 254 | self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) 255 | else: 256 | self.vdomain.add((domain)) 257 | if self.redis: 258 | self.redis.sadd('dom_class:cache:{}'.format(domain), domain) 259 | self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) 260 | return self.vdomain 261 | 262 | """ipaddress method extracts from the domain list the valid IPv4 addresses""" 263 | 264 | def ipaddress(self, extended=False): 265 | 266 | if extended is False: 267 | self.ipaddresses = [] 268 | else: 269 | self.ipaddresses = set() 270 | 271 | for d in self.domain: 272 | try: 273 | ip = socket.gethostbyname(d) 274 | except: 275 | continue 276 | 277 | if extended is False: 278 | self.ipaddresses.append((ip)) 279 | else: 280 | orig = self.__origin(ipaddr=ip) 281 | self.ipaddresses.add((ip, str(orig))) 282 | 283 | return self.ipaddresses 284 | 285 | """localizedomain method use the validdomain list (in extended format) to 286 | localize per country code the associated resources. The cc argument specifies the 287 | country code in ISO 3166-1 alpha-2 format to check for.""" 288 | 289 | def localizedomain(self, cc=None): 290 | self.localdom = [] 291 | 292 | for dom in self.vdomain: 293 | if dom[1] == 'A': 294 | ip = dom[2] 295 | try: 296 | orig = self.__origin(ipaddr=dom[2])[1] 297 | except: 298 | continue 299 | if orig == cc: 300 | self.localdom.append(dom) 301 | elif dom[1] == 'CNAME': 302 | cname = str(dom[2]) 303 | ip = socket.gethostbyname(cname) 304 | try: 305 | orig = self.__origin(ipaddr=ip)[1] 306 | except: 307 | continue 308 | if orig == cc: 309 | self.localdom.append(dom) 310 | return self.localdom 311 | 312 | """rankdomain method use the validdomain list (in extended format to rank 313 | each domain with an IP address. Return a sorted list of tuples (ranking, 314 | domain). 315 | """ 316 | 317 | def rankdomain(self): 318 | self.rankdom = [] 319 | 320 | if self.vdomain: 321 | for dom in self.vdomain: 322 | rank = None 323 | asn = None 324 | if dom[1] == 'A': 325 | ip = dom[2] 326 | o = self.__origin(ipaddr=dom[2]) 327 | if o: 328 | asn = o[0] 329 | rank = self.__bgpranking(asn) 330 | t = (rank, dom[0]) 331 | self.rankdom.append(t) 332 | elif dom[1] == 'CNAME': 333 | cname = str(dom[2]) 334 | try: 335 | ip = socket.gethostbyname(cname) 336 | except: 337 | continue 338 | try: 339 | asn = self.__origin(ipaddr=ip)[0] 340 | except TypeError: 341 | continue 342 | rank = self.__bgpranking(asn) 343 | t = (rank, dom[0]) 344 | self.rankdom.append(t) 345 | return sorted(self.rankdom, key=lambda d: d[0]) 346 | 347 | """exclude domains from a regular expression. If validdomain was called, 348 | it's only on the valid domain list.""" 349 | 350 | """exclude domains from a regular expression. If validdomain was called, 351 | it's only on the valid domain list.""" 352 | 353 | def exclude(self, expression=None): 354 | self.cleandomain = [] 355 | 356 | excludefilter = re.compile(expression) 357 | 358 | if not self.vdomain: 359 | domains = self.domain 360 | else: 361 | domains = self.vdomain 362 | 363 | for dom in domains: 364 | if type(dom) == tuple: 365 | dom = dom[0] 366 | 367 | if excludefilter.search(dom): 368 | pass 369 | else: 370 | self.cleandomain.append(dom) 371 | return self.cleandomain 372 | 373 | def include(self, expression=None): 374 | self.cleandomain = [] 375 | 376 | includefilter = re.compile(expression) 377 | 378 | if not self.vdomain: 379 | domains = self.domain 380 | else: 381 | domains = self.vdomain 382 | 383 | for dom in domains: 384 | if type(dom) == tuple: 385 | dom = dom[0] 386 | if includefilter.search(dom): 387 | self.cleandomain.append(dom) 388 | 389 | return set(self.cleandomain) 390 | 391 | 392 | if __name__ == "__main__": 393 | c = Extract( 394 | rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist" 395 | ) 396 | c.text( 397 | rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be" 398 | ) 399 | print(c.potentialdomain()) 400 | print(c.potentialdomain(validTLD=True)) 401 | print(c.validdomain(extended=True)) 402 | print("US:") 403 | print(c.localizedomain(cc='US')) 404 | print("LU:") 405 | print(c.localizedomain(cc='LU')) 406 | print("BE:") 407 | print(c.localizedomain(cc='BE')) 408 | print("Ranking:") 409 | print(c.rankdomain()) 410 | print("List of ip addresses:") 411 | print(c.ipaddress(extended=False)) 412 | print("Include dot.lu:") 413 | print(c.include(expression=r'\.lu$')) 414 | print("Exclude dot.lu:") 415 | print(c.exclude(expression=r'\.lu$')) 416 | c.text(rawtext="www.lwn.net www.undeadly.org") 417 | print(c.potentialdomain(validTLD=True)) 418 | c.validdomain() 419 | print(c.localizedomain(cc='US')) 420 | print(c.validdomain(extended=False, passive_dns=True)) 421 | -------------------------------------------------------------------------------- /DomainClassifier/test.py: -------------------------------------------------------------------------------- 1 | import domainclassifier 2 | 3 | c = domainclassifier.Extract( rawtext = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1", nameservers = ['8.8.8.8']) 4 | #print c.potentialdomain() 5 | print(c.validdomain(extended=True)) 6 | print("US:") 7 | print(c.localizedomain(cc='US')) 8 | print("LU:") 9 | print(c.localizedomain(cc='LU')) 10 | print("BE:") 11 | print(c.localizedomain(cc='BE')) 12 | print("Ranking:") 13 | print(c.rankdomain()) 14 | print("List of ip addresses:") 15 | print(c.ipaddress(extended=True)) 16 | print("Include dot.lu:") 17 | print(c.include(expression=r'\.lu$')) 18 | print("Exclude dot.lu:") 19 | print(c.exclude(expression=r'\.lu$')) 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DomainClassifier 2 | ================ 3 | 4 | DomainClassifier is a simple Python library to extract and classify Internet domains/hostnames/IP addresses from raw unstructured text files following their existence, localization or attributes. 5 | 6 | DomainClassifier can be used to extract Internet hosts from any free texts or collected unstructured information. A passive dns output is also available. 7 | 8 | ![An overview of the DomainClassifier methods](https://raw.github.com/adulau/DomainClassifier/master/doc/domainclassifier-flow.png) 9 | 10 | Install 11 | ------- 12 | 13 | [DomainClassifier](https://pypi.python.org/pypi/DomainClassifier/) is part of the pypi package. It can be installed using the pip command: 14 | 15 | `pip install DomainClassifier` 16 | 17 | ```python 18 | 19 | In [11]: c = DomainClassifier.domainclassifier.Extract(rawtext="www.google.com foo.bar ppp.ppp") 20 | 21 | In [12]: c.potentialdomain() 22 | Out[12]: ['www.google.com', 'foo.bar'] 23 | ``` 24 | 25 | How To Use It 26 | ------------- 27 | 28 | 29 | ```python 30 | import DomainClassifier.domainclassifier 31 | 32 | c = DomainClassifier.domainclassifier.Extract( rawtext = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be ht 33 | tp://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8. 34 | 8 201.1.1.1") 35 | 36 | # extracting potentially valid domains from rawtext 37 | print(c.potentialdomain()) 38 | 39 | # reduce set of potentially valid domains to existing domains 40 | # (based on SOA,A,AAAA,CNAME,MX records) 41 | print(c.validdomain(extended=True)) 42 | 43 | # reduce set of valid domains with DNS records associated to a 44 | # specified country 45 | print("US:") 46 | print(c.localizedomain(cc='US')) 47 | print("LU:") 48 | print(c.localizedomain(cc='LU')) 49 | print("BE:") 50 | print(c.localizedomain(cc='BE')) 51 | print("Ranking:") 52 | print(c.rankdomain()) 53 | 54 | # extract valid IPv4 addresses (using the potential list of valid domains) 55 | print("List of ip addresses:") 56 | print(c.ipaddress(extended=True)) 57 | 58 | # some more filtering 59 | print("Include dot.lu:") 60 | print(c.include(expression=r'\.lu$')) 61 | print("Exclude dot.lu:") 62 | print(c.exclude(expression=r'\.lu$')) 63 | ``` 64 | 65 | ### Sample output 66 | 67 | ```python 68 | ['www.xxx.com', 'foo.lu', 'abc.lu', 'a.b.c.d.e', '1.2.3.4', 'foo.be', 'www.belnet.be', 'www.cert.be', 'www.public.lu', 'www.allo.lu', 'www.eurodns.com', 'something-broken-www.google.com', 'www.google.lu', 'www.facebook.com', 'www.nic.ru', 'www.youporn.com', '8.8.8.8', '201.1.1.1'] 69 | [('www.xxx.com', 'A', ), ('abc.lu', 'SOA', ), ('abc.lu', 'MX', ), ('foo.be', 'A', ), ('foo.be', 'AAAA', ), ('foo.be', 'SOA', ), ('foo.be', 'MX', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'AAAA', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'AAAA', ), ('www.cert.be', 'SOA', ), ('www.cert.be', 'MX', ), ('www.cert.be', 'CNAME', ), ('www.public.lu', 'A', ), ('www.allo.lu', 'A', ), ('www.eurodns.com', 'A', ), ('www.google.lu', 'A', ), ('www.google.lu', 'AAAA', ), ('www.facebook.com', 'A', ), ('www.facebook.com', 'AAAA', ), ('www.facebook.com', 'MX', ), ('www.facebook.com', 'CNAME', ), ('www.nic.ru', 'A', ), ('www.nic.ru', 'MX', ), ('www.youporn.com', 'A', ), ('www.youporn.com', 'SOA', ), ('www.youporn.com', 'MX', ), ('www.youporn.com', 'CNAME', )] 70 | US: 71 | [('www.xxx.com', 'A', ), ('www.google.lu', 'A', )] 72 | LU: 73 | [('www.public.lu', 'A', ), ('www.allo.lu', 'A', ), ('www.eurodns.com', 'A', )] 74 | BE: 75 | [('foo.be', 'A', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'CNAME', )] 76 | Ranking: 77 | [(1.0, 'www.youporn.com'), (1.0, 'www.youporn.com'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.cert.be'), (1.0000120563271599, 'www.cert.be'), (1.0000372023809501, 'foo.be'), (1.0001395089285701, 'www.public.lu'), (1.00015419407895, 'www.allo.lu'), (1.0003662109375, 'www.eurodns.com'), (1.0004111842105301, 'www.xxx.com'), (1.0005944293478299, 'www.nic.ru'), (1.0024646577381, 'www.facebook.com'), (1.0024646577381, 'www.facebook.com'), (1.002635288165, 'www.google.lu')] 78 | List of ip addresses: 79 | ('15169', 'AU', ) 80 | ('15169', 'US', ) 81 | ('27699', 'BR', ) 82 | set([('201.1.1.1', '(\'27699\', \'BR\', )'), ('8.8.8.8', '(\'15169\', \'US\', )'), ('1.2.3.4', '(\'15169\', \'AU\', )')]) 83 | Include dot.lu: 84 | ['abc.lu', 'abc.lu', 'www.public.lu', 'www.allo.lu', 'www.google.lu', 'www.google.lu'] 85 | Exclude dot.lu: 86 | ['www.xxx.com', 'foo.be', 'foo.be', 'foo.be', 'foo.be', 'www.belnet.be', 'www.belnet.be', 'www.belnet.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.eurodns.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.nic.ru', 'www.nic.ru', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com'] 87 | ``` 88 | 89 | ### Software Required 90 | 91 | * Python (tested successfully on version 2.6, 2.7 and 3.5) 92 | * dnspython library - http://www.dnspython.org/ 93 | * IPy library 94 | * [pybgpranking](https://github.com/D4-project/BGP-Ranking/tree/master/client) to get malicious ranking of BGP AS number via [BGP Ranking](https://github.com/D4-project/BGP-Ranking) 95 | 96 | ### Software using DomainClassifier 97 | 98 | * [AIL framework - Analysis Information Leak framework](https://github.com/ail-project/ail-framework) 99 | 100 | ### License 101 | 102 | ~~~~ 103 | Copyright (C) 2012-2023 Alexandre Dulaunoy - a(at)foo.be 104 | Copyright (C) 2021 Aurelien Thirion 105 | 106 | This program is free software: you can redistribute it and/or modify 107 | it under the terms of the GNU Affero General Public License as 108 | published by the Free Software Foundation, either version 3 of the 109 | License, or (at your option) any later version. 110 | 111 | This program is distributed in the hope that it will be useful, 112 | but WITHOUT ANY WARRANTY; without even the implied warranty of 113 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 114 | GNU Affero General Public License for more details. 115 | 116 | You should have received a copy of the GNU Affero General Public License 117 | along with this program. If not, see . 118 | ~~~~ 119 | -------------------------------------------------------------------------------- /doc/domainclassifier-flow.dot: -------------------------------------------------------------------------------- 1 | digraph g{ 2 | z [label="raw text including probable hostnames"]; 3 | a -> b; 4 | a [label=".potentialdomain()",shape=box,fillcolor="palegreen",style="filled"]; 5 | z -> a [label=" extracting"]; 6 | b [label=".validdomain(extended=True)",shape=box,fillcolor="palegreen",style="filled"]; 7 | c [label=".localizedomain(cc='country code')",shape=box,fillcolor="palegreen",style="filled"]; 8 | b->c; 9 | e [label=".validdomain(extended=False)",shape=box,fillcolor="palegreen",style="filled"]; 10 | a->e; 11 | f [label=".rankdomain()",shape=box,fillcolor="palegreen",style="filled"]; 12 | c->f; 13 | g [label=".include(expression=regexp)",shape=box,fillcolor="palegreen",style="filled" ]; 14 | h [label=".exclude(expression=regexp)",shape=box,fillcolor="palegreen",style="filled" ]; 15 | f->g; 16 | g->h; 17 | e->g; 18 | i [label="set of hostnames", fillcolor="palegreen",style="filled"]; 19 | h->i; 20 | } 21 | -------------------------------------------------------------------------------- /doc/domainclassifier-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adulau/DomainClassifier/d3f8129399b9030bbb298412b458c0d4e35114c6/doc/domainclassifier-flow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | IPy 2 | dnspython 3 | git+https://github.com/D4-project/BGP-Ranking.git/#egg=pybgpranking&subdirectory=client 4 | redis 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | this_directory = Path(__file__).parent 4 | long_description = (this_directory / "README.md").read_text() 5 | 6 | setup( 7 | name="DomainClassifier", 8 | version="1.4", 9 | packages=find_packages(), 10 | install_requires=['dnspython', 'IPy', 'pybgpranking'], 11 | author="Alexandre Dulaunoy", 12 | author_email="a@foo.be", 13 | description="DomainClassifier is a Python library to extract and classify Internet domains/hostnames/IP addresses from raw unstructured text files following their existence, localization or attributes.", 14 | long_description=long_description, 15 | long_description_content_type='text/markdown', 16 | license="AGPL", 17 | keywords="internet mining domain resolver geolocalisation", 18 | url="http://github.com/adulau/DomainClassifier" 19 | ) 20 | --------------------------------------------------------------------------------