├── pythonwhois ├── shared.py ├── states_au.dat ├── states_ca.dat ├── __init__.py ├── states_us.dat ├── net.py ├── countries.dat ├── countries3.dat └── parse.py ├── .gitignore ├── lib ├── files │ ├── GeoLite2-ASN.mmdb │ ├── GeoLite2-Country.mmdb │ ├── extensions.txt │ ├── shorteners.txt │ └── tlds.txt ├── blacklists.py ├── spf.py └── functions.py ├── urls └── national │ └── .urls-benign.csv.swp ├── config.ini ├── requirements.txt ├── get_database_phishtank.py ├── run.py ├── README.md └── extract.py /pythonwhois/shared.py: -------------------------------------------------------------------------------- 1 | class WhoisException(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .env/ 4 | database_phishtank.json 5 | -------------------------------------------------------------------------------- /lib/files/GeoLite2-ASN.mmdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/lib/files/GeoLite2-ASN.mmdb -------------------------------------------------------------------------------- /lib/files/GeoLite2-Country.mmdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/lib/files/GeoLite2-Country.mmdb -------------------------------------------------------------------------------- /urls/national/.urls-benign.csv.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/urls/national/.urls-benign.csv.swp -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [phishtank] 2 | 
api_key = 3 | 4 | [safebrowsing] 5 | client_id = 6 | api_key = 7 | version = 1.5.2 8 | 9 | [wot] 10 | api_key = 11 | -------------------------------------------------------------------------------- /pythonwhois/states_au.dat: -------------------------------------------------------------------------------- 1 | NSW,"New South Wales" 2 | QLD,"Queensland" 3 | SA,"South Australia" 4 | TAS,"Tasmania" 5 | VIC,"Victoria" 6 | WA,"Western Australia" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | bs4==0.0.1 3 | configparser==3.5.0 4 | dnspython==1.15.0 5 | geoip2==2.5.0 6 | IPy==0.83 7 | maxminddb==1.3.0 8 | rblwatch==0.3.0 9 | requests==2.22.0 10 | urllib3==1.24.2 11 | -------------------------------------------------------------------------------- /pythonwhois/states_ca.dat: -------------------------------------------------------------------------------- 1 | id,name,abbreviation 2 | "60","Alberta","AB" 3 | "61","British Columbia","BC" 4 | "62","Manitoba","MB" 5 | "63","New Brunswick","NB" 6 | "64","Newfoundland and Labrador","NL" 7 | "65","Nova Scotia","NS" 8 | "66","Ontario","ON" 9 | "67","Prince Edward Island","PE" 10 | "68","Quebec","QC" 11 | "69","Saskatchewan","SK" 12 | "70","Northwest Territories","NT" 13 | "71","Nunavut","NU" 14 | "72","Yukon Territory","YT" 15 | -------------------------------------------------------------------------------- /get_database_phishtank.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import configparser 3 | import json 4 | import bz2 5 | 6 | config = configparser.ConfigParser() 7 | config.read('config.ini') 8 | 9 | 10 | def update_db(): 11 | """Download the PhishTank URLs database in lib/files/database_phishtank.json.""" 12 | api_key = config.get('phishtank', 'api_key') 13 | api_url = 
'http://data.phishtank.com/data/%s/online-valid.json.bz2' % (api_key) 14 | compraw = urllib.request.urlopen(api_url).read() 15 | rawdecomp = bz2.decompress(compraw) 16 | database = json.loads(rawdecomp.decode('utf-8')) 17 | with open('lib/files/database_phishtank.json', 'w') as outfile: 18 | json.dump(database, outfile) 19 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from get_database_phishtank import update_db 2 | import argparse 3 | import extract 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input", help="File of URLs to be analyzed") 9 | parser.add_argument("output", help="Output File") 10 | args = parser.parse_args() 11 | 12 | if args.input and args.output: 13 | # Update phishtank database 14 | print('Download and update phishtank database...') 15 | update_db() 16 | # Starts extraction 17 | print('Starts extraction...') 18 | extract.main(args.input, args.output) 19 | print(''' 20 | ####################################### 21 | # Dataset generated successfully! # 22 | ####################################### 23 | ''') 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /pythonwhois/__init__.py: -------------------------------------------------------------------------------- 1 | from . import net, parse 2 | 3 | def get_whois(domain, normalized=[]): 4 | try: 5 | raw_data, server_list = net.get_whois_raw(domain, with_server_list=True) 6 | except: 7 | return False 8 | # Unlisted handles will be looked up on the last WHOIS server that was queried. This may be changed to also query 9 | # other servers in the future, if it turns out that there are cases where the last WHOIS server in the chain doesn't 10 | # actually hold the handle contact details, but another WHOIS server in the chain does. 
11 | return parse.parse_raw_whois(raw_data, normalized=normalized, never_query_handles=False, handle_server=server_list[-1]) 12 | 13 | def whois(*args, **kwargs): 14 | raise Exception("The whois() method has been replaced by a different method (with a different API), since pythonwhois 2.0. Either install the older pythonwhois 1.2.3, or change your code to use the new API.") 15 | -------------------------------------------------------------------------------- /pythonwhois/states_us.dat: -------------------------------------------------------------------------------- 1 | id,name,abbreviation 2 | "1","Alabama","AL" 3 | "2","Alaska","AK" 4 | "3","Arizona","AZ" 5 | "4","Arkansas","AR" 6 | "5","California","CA" 7 | "6","Colorado","CO" 8 | "7","Connecticut","CT" 9 | "8","Delaware","DE" 10 | "9","Florida","FL" 11 | "10","Georgia","GA" 12 | "11","Hawaii","HI" 13 | "12","Idaho","ID" 14 | "13","Illinois","IL" 15 | "14","Indiana","IN" 16 | "15","Iowa","IA" 17 | "16","Kansas","KS" 18 | "17","Kentucky","KY" 19 | "18","Louisiana","LA" 20 | "19","Maine","ME" 21 | "20","Maryland","MD" 22 | "21","Massachusetts","MA" 23 | "22","Michigan","MI" 24 | "23","Minnesota","MN" 25 | "24","Mississippi","MS" 26 | "25","Missouri","MO" 27 | "26","Montana","MT" 28 | "27","Nebraska","NE" 29 | "28","Nevada","NV" 30 | "29","New Hampshire","NH" 31 | "30","New Jersey","NJ" 32 | "31","New Mexico","NM" 33 | "32","New York","NY" 34 | "33","North Carolina","NC" 35 | "34","North Dakota","ND" 36 | "35","Ohio","OH" 37 | "36","Oklahoma","OK" 38 | "37","Oregon","OR" 39 | "38","Pennsylvania","PA" 40 | "39","Rhode Island","RI" 41 | "40","South Carolina","SC" 42 | "41","South Dakota","SD" 43 | "42","Tennessee","TN" 44 | "43","Texas","TX" 45 | "44","Utah","UT" 46 | "45","Vermont","VT" 47 | "46","Virginia","VA" 48 | "47","Washington","WA" 49 | "48","West Virginia","WV" 50 | "49","Wisconsin","WI" 51 | "50","Wyoming","WY" 52 | "52","Puerto Rico","PR" 53 | "53","U.S. 
Virgin Islands","VI" 54 | "54","American Samoa","AS" 55 | "55","Guam","GU" 56 | "56","Northern Mariana Islands","MP" -------------------------------------------------------------------------------- /lib/files/extensions.txt: -------------------------------------------------------------------------------- 1 | .3dm 2 | .3ds 3 | .3g2 4 | .3gp 5 | .7z 6 | .accdb 7 | .ai 8 | .aif 9 | .apk 10 | .app 11 | .asf 12 | .asp 13 | .aspx 14 | .avi 15 | .bak 16 | .bat 17 | .bin 18 | .bmp 19 | .c 20 | .cab 21 | .cbr 22 | .cer 23 | .cfg 24 | .cfm 25 | .cgi 26 | .class 27 | .com 28 | .cpl 29 | .cpp 30 | .crdownload 31 | .crx 32 | .cs 33 | .csr 34 | .css 35 | .csv 36 | .cue 37 | .cur 38 | .dat 39 | .db 40 | .dbf 41 | .dds 42 | .deb 43 | .dem 44 | .deskthemepack 45 | .dll 46 | .dmg 47 | .dmp 48 | .doc 49 | .docx 50 | .drv 51 | .dtd 52 | .dwg 53 | .dxf 54 | .eps 55 | .exe 56 | .fla 57 | .flv 58 | .fnt 59 | .fon 60 | .gadget 61 | .gam 62 | .ged 63 | .gif 64 | .gpx 65 | .gz 66 | .h 67 | .hqx 68 | .htm 69 | .html 70 | .icns 71 | .ico 72 | .ics 73 | .iff 74 | .indd 75 | .ini 76 | .iso 77 | .jar 78 | .java 79 | .jpg 80 | .js 81 | .jsp 82 | .key 83 | .keychain 84 | .kml 85 | .kmz 86 | .lnk 87 | .log 88 | .lua 89 | .m 90 | .m3u 91 | .m4a 92 | .m4v 93 | .max 94 | .mdb 95 | .mdf 96 | .mid 97 | .mim 98 | .mov 99 | .mp3 100 | .mp4 101 | .mpa 102 | .mpg 103 | .msg 104 | .msi 105 | .nes 106 | .obj 107 | .odt 108 | .otf 109 | .pages 110 | .part 111 | .pct 112 | .pdb 113 | .pdf 114 | .php 115 | .pkg 116 | .pl 117 | .plugin 118 | .png 119 | .pps 120 | .ppt 121 | .pptx 122 | .prf 123 | .ps 124 | .psd 125 | .pspimage 126 | .py 127 | .rar 128 | .rm 129 | .rom 130 | .rpm 131 | .rss 132 | .rtf 133 | .sav 134 | .sdf 135 | .sh 136 | .sitx 137 | .sln 138 | .sql 139 | .srt 140 | .svg 141 | .swf 142 | .swift 143 | .sys 144 | .tar 145 | .tar.gz 146 | .tax2016 147 | .tex 148 | .tga 149 | .thm 150 | .tif 151 | .tiff 152 | .tmp 153 | .toast 154 | .torrent 155 | .ttf 156 | .txt 157 | .uue 158 | .vb 159 | .vcd 160 | 
.vcf 161 | .vcxproj 162 | .vob 163 | .wav 164 | .wma 165 | .wmv 166 | .wpd 167 | .wps 168 | .wsf 169 | .xcodeproj 170 | .xhtml 171 | .xlr 172 | .xls 173 | .xlsx 174 | .xml 175 | .yuv 176 | .zip 177 | .zipx 178 | -------------------------------------------------------------------------------- /lib/blacklists.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import requests 3 | import json 4 | 5 | config = configparser.ConfigParser() 6 | config.read('config.ini') 7 | 8 | 9 | def google_safebrowsing(url): 10 | client_id = config.get('safebrowsing', 'client_id') 11 | version = config.get('safebrowsing', 'version') 12 | api_key = config.get('safebrowsing', 'api_key') 13 | platform_types = ['ANY_PLATFORM'] 14 | threat_types = ['THREAT_TYPE_UNSPECIFIED', 15 | 'MALWARE', 'SOCIAL_ENGINEERING', 16 | 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'] 17 | threat_entry_types = ['URL'] 18 | api_url = 'https://safebrowsing.googleapis.com/v4/threatMatches:find?key=%s' % (api_key) 19 | threat_entries = [{'url': url}] 20 | payload = { 21 | 'client': { 22 | 'clientId': client_id, 23 | 'clientVersion': version 24 | }, 25 | 'threatInfo': { 26 | 'threatTypes': threat_types, 27 | 'platformTypes': platform_types, 28 | 'threatEntryTypes': threat_entry_types, 29 | 'threatEntries': threat_entries 30 | } 31 | } 32 | headers = {'content-type': 'application/json'} 33 | try: 34 | response = requests.post(api_url, headers=headers, json=payload).json().get('matches', None) 35 | if response is not None: 36 | return True 37 | else: 38 | return False 39 | except Exception: 40 | return '?' 
41 | 42 | 43 | def phishtank(url): 44 | with open('lib/files/database_phishtank.json') as db: 45 | data = json.load(db) 46 | for d in data: 47 | if (url == d['url']): 48 | return True 49 | return False 50 | 51 | 52 | def wot(url): 53 | api_key = config.get('wot', 'api_key') 54 | api_url = 'http://api.mywot.com/0.4/public_link_json2' 55 | try: 56 | response = requests.get(api_url, params={'hosts': url, 'key': api_key}).json() 57 | return any('blacklists' in val for val in response.values()) 58 | except Exception: 59 | return False 60 | -------------------------------------------------------------------------------- /lib/spf.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import dns.name 3 | from urllib import parse 4 | 5 | 6 | class SPFRecord(object): 7 | 8 | def __init__(self, domain): 9 | self.version = None 10 | self.includes = [] 11 | self.ip4 = [] 12 | self.ip6 = [] 13 | try: 14 | self._dns_response = dns.resolver.query(domain, 'TXT') 15 | except Exception: 16 | return False 17 | self.txt_records = [txt.to_text() for txt in self._dns_response] 18 | for txt in self.txt_records: 19 | self._parse_txt(txt) 20 | 21 | def _parse_txt(self, txt): 22 | for entry in txt.split(' '): 23 | if entry.startswith('v') and '=' in entry: 24 | self._add_version(entry) 25 | elif entry.startswith('include') and ':' in entry: 26 | self._add_include(entry) 27 | elif entry.startswith('ip4') and ':' in entry: 28 | self._add_ip4(entry) 29 | elif entry.startswith('ip6') and ':' in entry: 30 | self._add_ip6(entry) 31 | 32 | @property 33 | def ips(self): 34 | return self.ip4 + self.ip6 35 | 36 | def _add_version(self, entry): 37 | self.version = entry.split('=')[1] 38 | 39 | def _add_include(self, entry): 40 | self.includes.append(entry.split(':')[1]) 41 | 42 | def _add_ip4(self, entry): 43 | ip = entry.split(':')[1] 44 | self.ip4.append(ip) 45 | 46 | def _add_ip6(self, entry): 47 | ip = entry.split(':')[1] 48 | 
self.ip6.append(ip) 49 | 50 | 51 | def is_expired(domain): 52 | try: 53 | dns.resolver.query(domain) 54 | return False 55 | except dns.resolver.NXDOMAIN: 56 | return True 57 | except Exception: 58 | return False 59 | 60 | 61 | def get_spf_record(domain): 62 | if is_expired(domain): 63 | return None 64 | try: 65 | return SPFRecord(domain) 66 | except Exception: 67 | return None 68 | 69 | 70 | def check_spf(spf, domain): 71 | for inc_domain in spf.includes: 72 | try: 73 | url = parse.urlparse("mail://%s" % inc_domain).netloc 74 | parent = '.'.join(url.split('.')[-2:]) 75 | if is_expired(parent): 76 | return False 77 | else: 78 | return True 79 | except Exception: 80 | return False 81 | return '?' 82 | -------------------------------------------------------------------------------- /pythonwhois/net.py: -------------------------------------------------------------------------------- 1 | import socket, re, sys 2 | from codecs import encode, decode 3 | from . import shared 4 | 5 | def get_whois_raw(domain, server="", previous=None, rfc3490=True, never_cut=False, with_server_list=False, server_list=None): 6 | previous = previous or [] 7 | server_list = server_list or [] 8 | # Sometimes IANA simply won't give us the right root WHOIS server 9 | exceptions = { 10 | ".ac.uk": "whois.ja.net", 11 | ".ps": "whois.pnina.ps", 12 | ".buzz": "whois.nic.buzz", 13 | ".moe": "whois.nic.moe", 14 | # The following is a bit hacky, but IANA won't return the right answer for example.com because it's a direct registration. 
15 | "example.com": "whois.verisign-grs.com" 16 | } 17 | 18 | if rfc3490: 19 | if sys.version_info < (3, 0): 20 | domain = encode( domain if type(domain) is unicode else decode(domain, "utf8"), "idna" ) 21 | else: 22 | domain = encode(domain, "idna").decode("ascii") 23 | 24 | if len(previous) == 0 and server == "": 25 | # Root query 26 | is_exception = False 27 | for exception, exc_serv in exceptions.items(): 28 | if domain.endswith(exception): 29 | is_exception = True 30 | target_server = exc_serv 31 | break 32 | if is_exception == False: 33 | target_server = get_root_server(domain) 34 | else: 35 | target_server = server 36 | if target_server == "whois.jprs.jp": 37 | request_domain = "%s/e" % domain # Suppress Japanese output 38 | elif domain.endswith(".de") and ( target_server == "whois.denic.de" or target_server == "de.whois-servers.net" ): 39 | request_domain = "-T dn,ace %s" % domain # regional specific stuff 40 | elif target_server == "whois.verisign-grs.com": 41 | request_domain = "=%s" % domain # Avoid partial matches 42 | elif target_server is False: 43 | return False 44 | else: 45 | request_domain = domain 46 | response = whois_request(request_domain, target_server) 47 | if never_cut: 48 | # If the caller has requested to 'never cut' responses, he will get the original response from the server (this is 49 | # useful for callers that are only interested in the raw data). Otherwise, if the target is verisign-grs, we will 50 | # select the data relevant to the requested domain, and discard the rest, so that in a multiple-option response the 51 | # parsing code will only touch the information relevant to the requested domain. The side-effect of this is that 52 | # when `never_cut` is set to False, any verisign-grs responses in the raw data will be missing header, footer, and 53 | # alternative domain options (this is handled a few lines below, after the verisign-grs processing). 
54 | new_list = [response] + previous 55 | if target_server == "whois.verisign-grs.com": 56 | # VeriSign is a little... special. As it may return multiple full records and there's no way to do an exact query, 57 | # we need to actually find the correct record in the list. 58 | for record in response.split("\n\n"): 59 | if re.search("Domain Name: %s\n" % domain.upper(), record): 60 | response = record 61 | break 62 | if never_cut == False: 63 | new_list = [response] + previous 64 | server_list.append(target_server) 65 | for line in [x.strip() for x in response.splitlines()]: 66 | match = re.match("(refer|whois server|referral url|whois server|registrar whois):\s*([^\s]+\.[^\s]+)", line, re.IGNORECASE) 67 | if match is not None: 68 | referal_server = match.group(2) 69 | if referal_server != server and "://" not in referal_server: # We want to ignore anything non-WHOIS (eg. HTTP) for now. 70 | # Referal to another WHOIS server... 71 | return get_whois_raw(domain, referal_server, new_list, server_list=server_list, with_server_list=with_server_list) 72 | if with_server_list: 73 | return (new_list, server_list) 74 | else: 75 | return new_list 76 | 77 | def get_root_server(domain): 78 | data = whois_request(domain, "whois.iana.org") 79 | for line in [x.strip() for x in data.splitlines()]: 80 | match = re.match("refer:\s*([^\s]+)", line) 81 | if match is None: 82 | continue 83 | return match.group(1) 84 | # raise shared.WhoisException("No root WHOIS server found for domain.") 85 | return False 86 | 87 | def whois_request(domain, server, port=43): 88 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 89 | sock.connect((server, port)) 90 | sock.send(("%s\r\n" % domain).encode("utf-8")) 91 | buff = b"" 92 | while True: 93 | data = sock.recv(1024) 94 | if len(data) == 0: 95 | break 96 | buff += data 97 | return buff.decode("utf-8", errors="ignore") 98 | -------------------------------------------------------------------------------- /pythonwhois/countries.dat: 
-------------------------------------------------------------------------------- 1 | iso,name 2 | AF,Afghanistan 3 | AL,Albania 4 | DZ,Algeria 5 | AS,"American Samoa" 6 | AD,Andorra 7 | AO,Angola 8 | AI,Anguilla 9 | AQ,Antarctica 10 | AG,"Antigua and Barbuda" 11 | AR,Argentina 12 | AM,Armenia 13 | AW,Aruba 14 | AU,Australia 15 | AT,Austria 16 | AZ,Azerbaijan 17 | BS,Bahamas 18 | BH,Bahrain 19 | BD,Bangladesh 20 | BB,Barbados 21 | BY,Belarus 22 | BE,Belgium 23 | BZ,Belize 24 | BJ,Benin 25 | BM,Bermuda 26 | BT,Bhutan 27 | BO,Bolivia 28 | BA,"Bosnia and Herzegovina" 29 | BW,Botswana 30 | BV,"Bouvet Island" 31 | BR,Brazil 32 | BQ,"British Antarctic Territory" 33 | IO,"British Indian Ocean Territory" 34 | VG,"British Virgin Islands" 35 | BN,Brunei 36 | BG,Bulgaria 37 | BF,"Burkina Faso" 38 | BI,Burundi 39 | KH,Cambodia 40 | CM,Cameroon 41 | CA,Canada 42 | CT,"Canton and Enderbury Islands" 43 | CV,"Cape Verde" 44 | KY,"Cayman Islands" 45 | CF,"Central African Republic" 46 | TD,Chad 47 | CL,Chile 48 | CN,China 49 | CX,"Christmas Island" 50 | CC,"Cocos [Keeling] Islands" 51 | CO,Colombia 52 | KM,Comoros 53 | CG,"Congo - Brazzaville" 54 | CD,"Congo - Kinshasa" 55 | CK,"Cook Islands" 56 | CR,"Costa Rica" 57 | HR,Croatia 58 | CU,Cuba 59 | CY,Cyprus 60 | CZ,"Czech Republic" 61 | CI,"Côte d’Ivoire" 62 | DK,Denmark 63 | DJ,Djibouti 64 | DM,Dominica 65 | DO,"Dominican Republic" 66 | NQ,"Dronning Maud Land" 67 | DD,"East Germany" 68 | EC,Ecuador 69 | EG,Egypt 70 | SV,"El Salvador" 71 | GQ,"Equatorial Guinea" 72 | ER,Eritrea 73 | EE,Estonia 74 | ET,Ethiopia 75 | FK,"Falkland Islands" 76 | FO,"Faroe Islands" 77 | FJ,Fiji 78 | FI,Finland 79 | FR,France 80 | GF,"French Guiana" 81 | PF,"French Polynesia" 82 | TF,"French Southern Territories" 83 | FQ,"French Southern and Antarctic Territories" 84 | GA,Gabon 85 | GM,Gambia 86 | GE,Georgia 87 | DE,Germany 88 | GH,Ghana 89 | GI,Gibraltar 90 | GR,Greece 91 | GL,Greenland 92 | GD,Grenada 93 | GP,Guadeloupe 94 | GU,Guam 95 | GT,Guatemala 96 | 
GG,Guernsey 97 | GN,Guinea 98 | GW,Guinea-Bissau 99 | GY,Guyana 100 | HT,Haiti 101 | HM,"Heard Island and McDonald Islands" 102 | HN,Honduras 103 | HK,"Hong Kong" 104 | HU,Hungary 105 | IS,Iceland 106 | IN,India 107 | ID,Indonesia 108 | IR,Iran 109 | IQ,Iraq 110 | IE,Ireland 111 | IM,"Isle of Man" 112 | IL,Israel 113 | IT,Italy 114 | JM,Jamaica 115 | JP,Japan 116 | JE,Jersey 117 | JT,"Johnston Island" 118 | JO,Jordan 119 | KZ,Kazakhstan 120 | KE,Kenya 121 | KI,Kiribati 122 | KW,Kuwait 123 | KG,Kyrgyzstan 124 | LA,Laos 125 | LV,Latvia 126 | LB,Lebanon 127 | LS,Lesotho 128 | LR,Liberia 129 | LY,Libya 130 | LI,Liechtenstein 131 | LT,Lithuania 132 | LU,Luxembourg 133 | MO,"Macau SAR China" 134 | MK,Macedonia 135 | MG,Madagascar 136 | MW,Malawi 137 | MY,Malaysia 138 | MV,Maldives 139 | ML,Mali 140 | MT,Malta 141 | MH,"Marshall Islands" 142 | MQ,Martinique 143 | MR,Mauritania 144 | MU,Mauritius 145 | YT,Mayotte 146 | FX,"Metropolitan France" 147 | MX,Mexico 148 | FM,Micronesia 149 | MI,"Midway Islands" 150 | MD,Moldova 151 | MC,Monaco 152 | MN,Mongolia 153 | ME,Montenegro 154 | MS,Montserrat 155 | MA,Morocco 156 | MZ,Mozambique 157 | MM,"Myanmar [Burma]" 158 | NA,Namibia 159 | NR,Nauru 160 | NP,Nepal 161 | NL,Netherlands 162 | AN,"Netherlands Antilles" 163 | NT,"Neutral Zone" 164 | NC,"New Caledonia" 165 | NZ,"New Zealand" 166 | NI,Nicaragua 167 | NE,Niger 168 | NG,Nigeria 169 | NU,Niue 170 | NF,"Norfolk Island" 171 | KP,"North Korea" 172 | VD,"North Vietnam" 173 | MP,"Northern Mariana Islands" 174 | NO,Norway 175 | OM,Oman 176 | PC,"Pacific Islands Trust Territory" 177 | PK,Pakistan 178 | PW,Palau 179 | PS,"Palestinian Territories" 180 | PA,Panama 181 | PZ,"Panama Canal Zone" 182 | PG,"Papua New Guinea" 183 | PY,Paraguay 184 | YD,"People's Democratic Republic of Yemen" 185 | PE,Peru 186 | PH,Philippines 187 | PN,"Pitcairn Islands" 188 | PL,Poland 189 | PT,Portugal 190 | PR,"Puerto Rico" 191 | QA,Qatar 192 | RO,Romania 193 | RU,Russia 194 | RW,Rwanda 195 | RE,Réunion 196 
| BL,"Saint Barthélemy" 197 | SH,"Saint Helena" 198 | KN,"Saint Kitts and Nevis" 199 | LC,"Saint Lucia" 200 | MF,"Saint Martin" 201 | PM,"Saint Pierre and Miquelon" 202 | VC,"Saint Vincent and the Grenadines" 203 | WS,Samoa 204 | SM,"San Marino" 205 | SA,"Saudi Arabia" 206 | SN,Senegal 207 | RS,Serbia 208 | CS,"Serbia and Montenegro" 209 | SC,Seychelles 210 | SL,"Sierra Leone" 211 | SG,Singapore 212 | SK,Slovakia 213 | SI,Slovenia 214 | SB,"Solomon Islands" 215 | SO,Somalia 216 | ZA,"South Africa" 217 | GS,"South Georgia and the South Sandwich Islands" 218 | KR,"South Korea" 219 | ES,Spain 220 | LK,"Sri Lanka" 221 | SD,Sudan 222 | SR,Suriname 223 | SJ,"Svalbard and Jan Mayen" 224 | SZ,Swaziland 225 | SE,Sweden 226 | CH,Switzerland 227 | SY,Syria 228 | ST,"São Tomé and Príncipe" 229 | TW,Taiwan 230 | TJ,Tajikistan 231 | TZ,Tanzania 232 | TH,Thailand 233 | TL,Timor-Leste 234 | TG,Togo 235 | TK,Tokelau 236 | TO,Tonga 237 | TT,"Trinidad and Tobago" 238 | TN,Tunisia 239 | TR,Turkey 240 | TM,Turkmenistan 241 | TC,"Turks and Caicos Islands" 242 | TV,Tuvalu 243 | UM,"U.S. Minor Outlying Islands" 244 | PU,"U.S. Miscellaneous Pacific Islands" 245 | VI,"U.S. 
Virgin Islands" 246 | UG,Uganda 247 | UA,Ukraine 248 | SU,"Union of Soviet Socialist Republics" 249 | AE,"United Arab Emirates" 250 | GB,"United Kingdom" 251 | US,"United States" 252 | ZZ,"Unknown or Invalid Region" 253 | UY,Uruguay 254 | UZ,Uzbekistan 255 | VU,Vanuatu 256 | VA,"Vatican City" 257 | VE,Venezuela 258 | VN,Vietnam 259 | WK,"Wake Island" 260 | WF,"Wallis and Futuna" 261 | EH,"Western Sahara" 262 | YE,Yemen 263 | ZM,Zambia 264 | ZW,Zimbabwe 265 | AX,"Åland Islands" -------------------------------------------------------------------------------- /lib/files/shorteners.txt: -------------------------------------------------------------------------------- 1 | 0rz.tw 2 | 1-url.net 3 | 126.am 4 | 1tk.us 5 | 1un.fr 6 | 1url.com 7 | 1url.cz 8 | 1wb2.net 9 | 2.gp 10 | 2.ht 11 | 2ad.in 12 | 2doc.net 13 | 2fear.com 14 | 2tu.us 15 | 2ty.in 16 | 2u.xf.cz 17 | 3ra.be 18 | 3x.si 19 | 4i.ae 20 | 4ks.net 21 | 4view.me 22 | 5em.cz 23 | 5url.net 24 | 5z8.info 25 | 6fr.ru 26 | 6g6.eu 27 | 7.ly 28 | 76.gd 29 | 77.ai 30 | 7fth.cc 31 | 7li.in 32 | 7vd.cn 33 | 8u.cz 34 | 944.la 35 | 98.to 36 | L9.fr 37 | Lvvk.com 38 | To8.cc 39 | a0.fr 40 | abbr.sk 41 | ad-med.cz 42 | ad5.eu 43 | ad7.biz 44 | adb.ug 45 | adf.ly 46 | adfa.st 47 | adfly.fr 48 | adli.pw 49 | adv.li 50 | ajn.me 51 | aka.gr 52 | alil.in 53 | amzn.to 54 | any.gs 55 | aqva.pl 56 | ares.tl 57 | asso.in 58 | au.ms 59 | ayt.fr 60 | azali.fr 61 | b00.fr 62 | b23.ru 63 | b54.in 64 | baid.us 65 | bc.vc 66 | beam.to 67 | bee4.biz 68 | bim.im 69 | bit.do 70 | bit.ly 71 | bitly.com 72 | bitw.in 73 | blap.net 74 | ble.pl 75 | blip.tv 76 | boi.re 77 | bote.me 78 | bougn.at 79 | br4.in 80 | brk.to 81 | brzu.net 82 | bul.lu 83 | bxl.me 84 | bzh.me 85 | cachor.ro 86 | captur.in 87 | cashfly.com 88 | cbs.so 89 | cbug.cc 90 | cc.cc 91 | ccj.im 92 | cf.ly 93 | cf2.me 94 | cf6.co 95 | chilp.it 96 | cjb.net 97 | cli.gs 98 | clikk.in 99 | clk.im 100 | cn86.org 101 | couic.fr 102 | cr.tl 103 | cudder.it 104 | cur.lv 105 | curl.im 106 | 
curte.me 107 | cut.pe 108 | cut.sk 109 | cutt.eu 110 | cutt.us 111 | cutu.me 112 | cybr.fr 113 | cyonix.to 114 | d75.eu 115 | daa.pl 116 | dai.ly 117 | decenturl.com 118 | dd.ma 119 | ddp.net 120 | dft.ba 121 | digbig.com 122 | doiop.com 123 | dolp.cc 124 | dopice.sk 125 | droid.ws 126 | dv.gd 127 | dyo.gs 128 | e37.eu 129 | easyurl.net 130 | ecra.se 131 | ely.re 132 | encurtador.com.br 133 | erax.cz 134 | erw.cz 135 | esy.es 136 | ex9.co 137 | ezurl.cc 138 | fff.re 139 | fff.to 140 | fff.wf 141 | filz.fr 142 | fnk.es 143 | foe.hn 144 | folu.me 145 | freze.it 146 | fur.ly 147 | fwdurl.net 148 | g00.me 149 | gca.sh 150 | gg.gg 151 | goo.gl 152 | goo.lu 153 | grem.io 154 | guiama.is 155 | hadej.co 156 | hide.my 157 | hjkl.fr 158 | hops.me 159 | href.li 160 | ht.ly 161 | i-2.co 162 | i99.cz 163 | icit.fr 164 | ick.li 165 | icks.ro 166 | iiiii.in 167 | iky.fr 168 | ilix.in 169 | info.ms 170 | is.gd 171 | isra.li 172 | itm.im 173 | ity.im 174 | ix.sk 175 | j.gs 176 | j.mp 177 | jdem.cz 178 | jieb.be 179 | jp22.net 180 | jqw.de 181 | kask.us 182 | kd2.org 183 | kfd.pl 184 | korta.nu 185 | kr3w.de 186 | krat.si 187 | kratsi.cz 188 | krod.cz 189 | kuc.cz 190 | kxb.me 191 | l-k.be 192 | lc-s.co 193 | lc.cx 194 | lcut.in 195 | letop10. 
196 | libero.it 197 | lick.my 198 | lien.li 199 | lien.pl 200 | lin.io 201 | linkn.co 202 | linkbucks.com 203 | llu.ch 204 | lnk.co 205 | lnk.ly 206 | lnk.sk 207 | lnks.fr 208 | lnky.fr 209 | lnp.sn 210 | lp25.fr 211 | m1p.fr 212 | m3mi.com 213 | make.my 214 | mcaf.ee 215 | mdl29.net 216 | mic.fr 217 | migre.me 218 | minu.me 219 | moourl.com 220 | more.sh 221 | mut.lu 222 | myurl.in 223 | net.ms 224 | net46.net 225 | nicou.ch 226 | nig.gr 227 | notlong.com 228 | nov.io 229 | nq.st 230 | nutshellurl.com 231 | nxy.in 232 | o-x.fr 233 | okok.fr 234 | onl.li 235 | ou.af 236 | ou.gd 237 | oua.be 238 | ouo.io 239 | ow.ly 240 | p.pw 241 | parky.tv 242 | past.is 243 | pdh.co 244 | ph.ly 245 | pich.in 246 | pin.st 247 | plots.fr 248 | plots.fr 249 | pm.wu.cz 250 | po.st 251 | ppfr.it 252 | ppst.me 253 | ppt.cc 254 | ppt.li 255 | pqn.bz 256 | prejit.cz 257 | ptab.it 258 | ptm.ro 259 | pw2.ro 260 | py6.ru 261 | q.gs 262 | qbn.ru 263 | qqc.co 264 | qr.net 265 | qrtag.fr 266 | qxp.cz 267 | qxp.sk 268 | r.cont.us 269 | rb6.co 270 | rcknr.io 271 | rdz.me 272 | redir.ec 273 | redir.fr 274 | redu.it 275 | ref.so 276 | reise.lc 277 | relink.fr 278 | repla.cr 279 | ri.ms 280 | riz.cz 281 | rod.gs 282 | roflc.at 283 | rt.se 284 | s-url.fr 285 | safe.mn 286 | sagyap.tk 287 | sdu.sk 288 | seeme.at 289 | segue.se 290 | sh.st 291 | sh.st 292 | shar.as 293 | shrinkurl.us 294 | shorl.com 295 | short.cc 296 | short.ie 297 | short.pk 298 | shorte.st 299 | shrt.in 300 | shy.si 301 | smu.sh 302 | sicax.net 303 | sina.lt 304 | sk.gy 305 | skr.sk 306 | skroc.pl 307 | smll.co 308 | sn.im 309 | snipurl.com 310 | snsw.us 311 | snurl.com 312 | soo.gd 313 | sort3.me 314 | spn.sr 315 | sq6.ru 316 | ssl.gs 317 | su.pr 318 | surl.me 319 | sux.cz 320 | sy.pe 321 | t.cn 322 | t.co 323 | ta.gd 324 | tabzi.com 325 | tau.pe 326 | tdjt.cz 327 | thesa.us 328 | tighturl.com 329 | tin.li 330 | tini.cc 331 | tiny.cc 332 | tiny.lt 333 | tiny.ms 334 | tiny.pl 335 | tinyurl.com 336 | tinyurl.hu 337 | tixsu.com 338 | 
tldr.sk 339 | tllg.net 340 | tnij.org 341 | tny.cz 342 | to.ly 343 | tohle.de 344 | tpmr.com 345 | tr.im 346 | tr5.in 347 | trck.me 348 | trick.ly 349 | trkr.ws 350 | trunc.it 351 | twet.fr 352 | twi.im 353 | twlr.me 354 | twurl.nl 355 | u.to 356 | uby.es 357 | ucam.me 358 | ug.cz 359 | ulmt.in 360 | unlc.us 361 | upzat.com 362 | ur1.ca 363 | url2.fr 364 | url5.org 365 | url.ie 366 | url.likedeck.com 367 | url.lotpatrol.com 368 | urlcut.com 369 | urlin.it 370 | urls.fr 371 | urltea.com 372 | urlz.fr 373 | urub.us 374 | utfg.sk 375 | v.gd 376 | v.ht 377 | v5.gd 378 | vaaa.fr 379 | valv.im 380 | vaza.me 381 | vbly.us 382 | vd55.com 383 | verd.in 384 | vgn.me 385 | vov.li 386 | vsll.eu 387 | vt802.us 388 | vur.me 389 | vv.vg 390 | w1p.fr 391 | waa.ai 392 | wapurl.co.uk 393 | wb1.eu 394 | web99.eu 395 | wed.li 396 | wideo.fr 397 | wp.me 398 | wtc.la 399 | wu.cz 400 | ww7.fr 401 | wwy.me 402 | x.co 403 | x.nu 404 | x10.mx 405 | x2c.eu 406 | x2c.eumx 407 | xav.cc 408 | xgd.in 409 | xib.me 410 | xl8.eu 411 | xoe.cz 412 | xrl.us 413 | xt3.me 414 | xua.me 415 | xub.me 416 | xurls.co 417 | yagoa.fr 418 | yagoa.me 419 | yatuc.com 420 | yau.sh 421 | yeca.eu 422 | yect.com 423 | yep.it 424 | yogh.me 425 | yon.ir 426 | youfap.me 427 | youtu.be 428 | ysear.ch 429 | yyv.co 430 | z9.fr 431 | zSMS.net 432 | zapit.nu 433 | zeek.ir 434 | zip.net 435 | zkr.cz 436 | zkrat.me 437 | zkrt.cz 438 | zoodl.com 439 | zpag.es 440 | zti.me 441 | zxq.net 442 | zyva.org 443 | zzb.bz 444 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # URL Feature Extractor 2 | 3 | Extracting features from URLs to build a data set for machine learning. The purpose is to find a machine learning model to predict phishing URLs, which are targeted to the Brazilian population. 
4 | 5 | This repo includes the implementation of our paper: 6 | 7 | Lucas Dantas Gama Ayres, Italo Valcy S Brito and Rodrigo Rocha Gomes e Souza. Using Machine Learning to Automatically Detect Malicious URLs in Brazil. In Simpósio Brasileiro de Redes de Computadores e Sistemas Distribuídos (SBRC 2019) - 2019, Gramado - RS - Brazil. 8 | 9 | The paper is available here: https://sol.sbc.org.br/index.php/sbrc/article/view/7416 10 | 11 | DOI: https://doi.org/10.5753/sbrc.2019.7416 12 | 13 | ## Install 14 | 15 | ```bash 16 | $ sudo apt-get update && sudo apt-get upgrade 17 | $ sudo apt-get install virtualenv python3 python3-dev python-dev gcc libpq-dev libssl-dev libffi-dev build-essential 18 | $ virtualenv -p /usr/bin/python3 .env 19 | $ source .env/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | ## How to use 24 | 25 | Before running the software, add the API keys for Google Safe Browsing, PhishTank, and MyWOT to the ```config.ini``` file. 26 | 27 | Now, run: 28 | 29 | ```bash 30 | $ python run.py 31 | ``` 32 | 33 | ## Features implemented 34 | 35 | 36 | 37 |
38 | LEXICAL 39 |
Count (.) in URLCount (-) in URLCount (_) in URLCount (/) in URL
Count (?) in URLCount (=) in URLCount (@) in URLCount (&) in URL
Count (!) in URLCount ( ) in URLCount (~) in URLCount (,) in URL
Count (+) in URLCount (*) in URLCount (#) in URLCount ($) in URL
Count (%) in URLURL LengthLTLD amount in URLCount (.) in Domain
Count (-) in DomainCount (_) in DomainCount (/) in DomainCount (?) in Domain
Count (=) in DomainCount (@) in DomainCount (&) in DomainCount (!) in Domain
Count ( ) in DomainCount (~) in DomainCount (,) in DomainCount (+) in Domain
Count (*) in DomainCount (#) in DomainCount ($) in DomainCount (%) in Domain
Domain LengthQuantidade de vogais in DomainURL domain in IP address formatDomain contains the key words "server" or "client"
Count (.) in DirectoryCount (-) in DirectoryCount (_) in DirectoryCount (/) in Directory
Count (?) in DirectoryCount (=) in DirectoryCount (@) in DirectoryCount (&) in Directory
Count (!) in DirectoryCount ( ) in DirectoryCount (~) in DirectoryCount (,) in Directory
Count (+) in DirectoryCount (*) in DirectoryCount (#) in DirectoryCount ($) in Directory
Count (%) in DirectoryDirectory LengthCount (.) in fileCount (-) in file
Count (_) in fileCount (/) in fileCount (?) in fileCount (=) in file
Count (@) in fileCount (&) in fileCount (!) in fileCount ( ) in file
Count (~) in fileCount (,) in fileCount (+) in fileCount (*) in file
Count (#) in fileCount ($) in fileCount (%) in fileFile length
Count (.) in parametersCount (-) in parametersCount (_) in parametersCount (/) in parameters
Count (?) in parametersCount (=) in parametersCount (@) in parametersCount (&) in parameters
Count (!) in parametersCount ( ) in parametersCount (~) in parametersCount (,) in parameters
Count (+) in parametersCount (*) in parametersCount (#) in parametersCount ($) in parameters
Count (%) in parametersLength of parametersTLD presence in argumentsNumber of parameters
Email present at URLFile extension
190 | 191 | 192 | 193 | 196 | 197 | 198 | 199 | 200 | 201 | 202 |
194 | BLACKLIST 195 |
Presence of the URL in blacklistsPresence of the IP Address in blacklistsPresence of the domain in Blacklists
203 | 204 | 205 | 206 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 |
207 | HOST 208 |
Presence of the domain in RBL (Real-time Blackhole List)Search time (response) domain (lookup)Domain has SPF?Geographical location of IP
AS Number (or ASN)PTR of IPTime (in days) of domain activationTime (in days) of domain expiration
Number of resolved IPsNumber of resolved name servers (NameServers - NS)Number of MX ServersTime-to-live (TTL) value associated with hostname
229 | 230 | 231 | 232 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 |
233 | OTHERS 234 |
Valid TLS / SSL CertificateNumber of redirectsCheck if URL is indexed on GoogleCheck if domain is indexed on Google
Uses URL shortener service
246 | 247 | ## Contributing 248 | 249 | Any contribution is appreciated. 250 | 251 | #### Submitting a Pull Request (PR) 252 | 253 | 1. Clone the project: 254 | ``` 255 | $ git clone https://github.com/lucasayres/url-feature-extractor.git 256 | ``` 257 | 258 | 2. Make your changes in a new git branch: 259 | ``` 260 | $ git checkout -b my-branch master 261 | ``` 262 | 263 | 3. Add your changes. 264 | 265 | 4. Push your branch to Github. 266 | 267 | 5. Create a PR to master. 268 | -------------------------------------------------------------------------------- /pythonwhois/countries3.dat: -------------------------------------------------------------------------------- 1 | "name","iso_name","iso2","iso3","numcode" 2 | "Antigua and Barbuda","ANTIGUA AND BARBUDA","AG","ATG",28 3 | "Bosnia and Herzegovina","BOSNIA AND HERZEGOVINA","BA","BIH",70 4 | "Cocos (Keeling) Islands","COCOS (KEELING) ISLANDS","CC","\N","\N" 5 | "Congo, the Democratic Republic of the","CONGO, THE DEMOCRATIC REPUBLIC OF THE","CD","COD",180 6 | "Cote D'Ivoire","COTE D'IVOIRE","CI","CIV",384 7 | "Fiji","FIJI","FJ","FJI",242 8 | "French Southern Territories","FRENCH SOUTHERN TERRITORIES","TF","\N","\N" 9 | "Heard Island and Mcdonald Islands","HEARD ISLAND AND MCDONALD ISLANDS","HM","\N","\N" 10 | "Holy See (Vatican City State)","HOLY SEE (VATICAN CITY STATE)","VA","VAT",336 11 | "Iran, Islamic Republic of","IRAN, ISLAMIC REPUBLIC OF","IR","IRN",364 12 | "Korea, Democratic People's Republic of","KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF","KP","PRK",408 13 | "Korea, Republic of","KOREA, REPUBLIC OF","KR","KOR",410 14 | "Belarus","BELARUS","BY","BLR",112 15 | "Lao People's Democratic Republic","LAO PEOPLE'S DEMOCRATIC REPUBLIC","LA","LAO",418 16 | "Macedonia, the Former Yugoslav Republic of","MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF","MK","MKD",807 17 | "United States","UNITED STATES","US","USA",840 18 | "Micronesia, Federated States of","MICRONESIA, FEDERATED STATES OF","FM","FSM",583 19 | "Moldova, 
Republic of","MOLDOVA, REPUBLIC OF","MD","MDA",498 20 | "Palestinian Territory, Occupied","PALESTINIAN TERRITORY, OCCUPIED","PS","\N","\N" 21 | "Pitcairn","PITCAIRN","PN","PCN",612 22 | "Reunion","REUNION","RE","REU",638 23 | "Saint Helena","SAINT HELENA","SH","SHN",654 24 | "Saint Kitts and Nevis","SAINT KITTS AND NEVIS","KN","KNA",659 25 | "Saint Pierre and Miquelon","SAINT PIERRE AND MIQUELON","PM","SPM",666 26 | "Sao Tome and Principe","SAO TOME AND PRINCIPE","ST","STP",678 27 | "Serbia and Montenegro","SERBIA AND MONTENEGRO","CS","\N","\N" 28 | "South Georgia and the South Sandwich Islands","SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS","GS","\N","\N" 29 | "Svalbard and Jan Mayen","SVALBARD AND JAN MAYEN","SJ","SJM",744 30 | "Syrian Arab Republic","SYRIAN ARAB REPUBLIC","SY","SYR",760 31 | "Taiwan, Province of China","TAIWAN, PROVINCE OF CHINA","TW","TWN",158 32 | "Tanzania, United Republic of","TANZANIA, UNITED REPUBLIC OF","TZ","TZA",834 33 | "Timor-Leste","TIMOR-LESTE","TL","\N","\N" 34 | "Trinidad and Tobago","TRINIDAD AND TOBAGO","TT","TTO",780 35 | "Mexico","MEXICO","MX","MEX",484 36 | "Myanmar","MYANMAR","MM","MMR",104 37 | "Virgin Islands, British","VIRGIN ISLANDS, BRITISH","VG","VGB",92 38 | "Virgin Islands, U.s.","VIRGIN ISLANDS, U.S.","VI","VIR",850 39 | "Wallis and Futuna","WALLIS AND FUTUNA","WF","WLF",876 40 | "Albania","ALBANIA","AL","ALB",8 41 | "Algeria","ALGERIA","DZ","DZA",12 42 | "American Samoa","AMERICAN SAMOA","AS","ASM",16 43 | "Vanuatu","VANUATU","VU","VUT",548 44 | "Yemen","YEMEN","YE","YEM",887 45 | "Andorra","ANDORRA","AD","AND",20 46 | "Angola","ANGOLA","AO","AGO",24 47 | "Anguilla","ANGUILLA","AI","AIA",660 48 | "Argentina","ARGENTINA","AR","ARG",32 49 | "Armenia","ARMENIA","AM","ARM",51 50 | "Aruba","ARUBA","AW","ABW",533 51 | "Australia","AUSTRALIA","AU","AUS",36 52 | "Austria","AUSTRIA","AT","AUT",40 53 | "Azerbaijan","AZERBAIJAN","AZ","AZE",31 54 | "Bahamas","BAHAMAS","BS","BHS",44 55 | "Bahrain","BAHRAIN","BH","BHR",48 56 | 
"Bangladesh","BANGLADESH","BD","BGD",50 57 | "Barbados","BARBADOS","BB","BRB",52 58 | "Belgium","BELGIUM","BE","BEL",56 59 | "Benin","BENIN","BJ","BEN",204 60 | "Bermuda","BERMUDA","BM","BMU",60 61 | "Bhutan","BHUTAN","BT","BTN",64 62 | "Bolivia","BOLIVIA","BO","BOL",68 63 | "Botswana","BOTSWANA","BW","BWA",72 64 | "Bouvet Island","BOUVET ISLAND","BV","\N","\N" 65 | "Brazil","BRAZIL","BR","BRA",76 66 | "British Indian Ocean Territory","BRITISH INDIAN OCEAN TERRITORY","IO","\N","\N" 67 | "Brunei Darussalam","BRUNEI DARUSSALAM","BN","BRN",96 68 | "Bulgaria","BULGARIA","BG","BGR",100 69 | "Burkina Faso","BURKINA FASO","BF","BFA",854 70 | "Burundi","BURUNDI","BI","BDI",108 71 | "Cambodia","CAMBODIA","KH","KHM",116 72 | "Cameroon","CAMEROON","CM","CMR",120 73 | "Canada","CANADA","CA","CAN",124 74 | "Cape Verde","CAPE VERDE","CV","CPV",132 75 | "Malta","MALTA","MT","MLT",470 76 | "Cayman Islands","CAYMAN ISLANDS","KY","CYM",136 77 | "Chad","CHAD","TD","TCD",148 78 | "Chile","CHILE","CL","CHL",152 79 | "China","CHINA","CN","CHN",156 80 | "Christmas Island","CHRISTMAS ISLAND","CX","\N","\N" 81 | "Colombia","COLOMBIA","CO","COL",170 82 | "Comoros","COMOROS","KM","COM",174 83 | "Cook Islands","COOK ISLANDS","CK","COK",184 84 | "Costa Rica","COSTA RICA","CR","CRI",188 85 | "Croatia","CROATIA","HR","HRV",191 86 | "Cuba","CUBA","CU","CUB",192 87 | "Cyprus","CYPRUS","CY","CYP",196 88 | "Czech Republic","CZECH REPUBLIC","CZ","CZE",203 89 | "Denmark","DENMARK","DK","DNK",208 90 | "Djibouti","DJIBOUTI","DJ","DJI",262 91 | "Dominica","DOMINICA","DM","DMA",212 92 | "Dominican Republic","DOMINICAN REPUBLIC","DO","DOM",214 93 | "Ecuador","ECUADOR","EC","ECU",218 94 | "Egypt","EGYPT","EG","EGY",818 95 | "Equatorial Guinea","EQUATORIAL GUINEA","GQ","GNQ",226 96 | "Eritrea","ERITREA","ER","ERI",232 97 | "Estonia","ESTONIA","EE","EST",233 98 | "Ethiopia","ETHIOPIA","ET","ETH",231 99 | "Faroe Islands","FAROE ISLANDS","FO","FRO",234 100 | "Finland","FINLAND","FI","FIN",246 101 | 
"France","FRANCE","FR","FRA",250 102 | "French Guiana","FRENCH GUIANA","GF","GUF",254 103 | "French Polynesia","FRENCH POLYNESIA","PF","PYF",258 104 | "Gabon","GABON","GA","GAB",266 105 | "Gambia","GAMBIA","GM","GMB",270 106 | "Georgia","GEORGIA","GE","GEO",268 107 | "Germany","GERMANY","DE","DEU",276 108 | "Ghana","GHANA","GH","GHA",288 109 | "Gibraltar","GIBRALTAR","GI","GIB",292 110 | "Greece","GREECE","GR","GRC",300 111 | "Greenland","GREENLAND","GL","GRL",304 112 | "Grenada","GRENADA","GD","GRD",308 113 | "Guadeloupe","GUADELOUPE","GP","GLP",312 114 | "Guam","GUAM","GU","GUM",316 115 | "Guatemala","GUATEMALA","GT","GTM",320 116 | "Guinea","GUINEA","GN","GIN",324 117 | "Guinea-Bissau","GUINEA-BISSAU","GW","GNB",624 118 | "Guyana","GUYANA","GY","GUY",328 119 | "Haiti","HAITI","HT","HTI",332 120 | "Honduras","HONDURAS","HN","HND",340 121 | "Hong Kong","HONG KONG","HK","HKG",344 122 | "Hungary","HUNGARY","HU","HUN",348 123 | "Iceland","ICELAND","IS","ISL",352 124 | "India","INDIA","IN","IND",356 125 | "Indonesia","INDONESIA","ID","IDN",360 126 | "Iraq","IRAQ","IQ","IRQ",368 127 | "Israel","ISRAEL","IL","ISR",376 128 | "Italy","ITALY","IT","ITA",380 129 | "Jamaica","JAMAICA","JM","JAM",388 130 | "Japan","JAPAN","JP","JPN",392 131 | "Jordan","JORDAN","JO","JOR",400 132 | "Kazakhstan","KAZAKHSTAN","KZ","KAZ",398 133 | "Kenya","KENYA","KE","KEN",404 134 | "Kiribati","KIRIBATI","KI","KIR",296 135 | "Kuwait","KUWAIT","KW","KWT",414 136 | "Kyrgyzstan","KYRGYZSTAN","KG","KGZ",417 137 | "Latvia","LATVIA","LV","LVA",428 138 | "Lesotho","LESOTHO","LS","LSO",426 139 | "Liberia","LIBERIA","LR","LBR",430 140 | "Libyan Arab Jamahiriya","LIBYAN ARAB JAMAHIRIYA","LY","LBY",434 141 | "Lithuania","LITHUANIA","LT","LTU",440 142 | "Luxembourg","LUXEMBOURG","LU","LUX",442 143 | "Macao","MACAO","MO","MAC",446 144 | "Madagascar","MADAGASCAR","MG","MDG",450 145 | "Malawi","MALAWI","MW","MWI",454 146 | "Malaysia","MALAYSIA","MY","MYS",458 147 | "Maldives","MALDIVES","MV","MDV",462 148 | 
"Mali","MALI","ML","MLI",466 149 | "Marshall Islands","MARSHALL ISLANDS","MH","MHL",584 150 | "Martinique","MARTINIQUE","MQ","MTQ",474 151 | "Mauritius","MAURITIUS","MU","MUS",480 152 | "Mayotte","MAYOTTE","YT","\N","\N" 153 | "Monaco","MONACO","MC","MCO",492 154 | "Mongolia","MONGOLIA","MN","MNG",496 155 | "Montserrat","MONTSERRAT","MS","MSR",500 156 | "Morocco","MOROCCO","MA","MAR",504 157 | "Mozambique","MOZAMBIQUE","MZ","MOZ",508 158 | "Namibia","NAMIBIA","NA","NAM",516 159 | "Nauru","NAURU","NR","NRU",520 160 | "Nepal","NEPAL","NP","NPL",524 161 | "Netherlands","NETHERLANDS","NL","NLD",528 162 | "New Caledonia","NEW CALEDONIA","NC","NCL",540 163 | "New Zealand","NEW ZEALAND","NZ","NZL",554 164 | "Nicaragua","NICARAGUA","NI","NIC",558 165 | "Niger","NIGER","NE","NER",562 166 | "Nigeria","NIGERIA","NG","NGA",566 167 | "Niue","NIUE","NU","NIU",570 168 | "Norfolk Island","NORFOLK ISLAND","NF","NFK",574 169 | "Northern Mariana Islands","NORTHERN MARIANA ISLANDS","MP","MNP",580 170 | "Norway","NORWAY","NO","NOR",578 171 | "Oman","OMAN","OM","OMN",512 172 | "Pakistan","PAKISTAN","PK","PAK",586 173 | "Palau","PALAU","PW","PLW",585 174 | "Panama","PANAMA","PA","PAN",591 175 | "Paraguay","PARAGUAY","PY","PRY",600 176 | "Peru","PERU","PE","PER",604 177 | "Philippines","PHILIPPINES","PH","PHL",608 178 | "Poland","POLAND","PL","POL",616 179 | "Portugal","PORTUGAL","PT","PRT",620 180 | "Puerto Rico","PUERTO RICO","PR","PRI",630 181 | "Qatar","QATAR","QA","QAT",634 182 | "Romania","ROMANIA","RO","ROM",642 183 | "Russian Federation","RUSSIAN FEDERATION","RU","RUS",643 184 | "Rwanda","RWANDA","RW","RWA",646 185 | "Saint Lucia","SAINT LUCIA","LC","LCA",662 186 | "Saint Vincent and the Grenadines","SAINT VINCENT AND THE GRENADINES","VC","VCT",670 187 | "Samoa","SAMOA","WS","WSM",882 188 | "San Marino","SAN MARINO","SM","SMR",674 189 | "Senegal","SENEGAL","SN","SEN",686 190 | "Seychelles","SEYCHELLES","SC","SYC",690 191 | "Sierra Leone","SIERRA LEONE","SL","SLE",694 192 | 
"Singapore","SINGAPORE","SG","SGP",702 193 | "Slovakia","SLOVAKIA","SK","SVK",703 194 | "Slovenia","SLOVENIA","SI","SVN",705 195 | "Solomon Islands","SOLOMON ISLANDS","SB","SLB",90 196 | "Somalia","SOMALIA","SO","SOM",706 197 | "South Africa","SOUTH AFRICA","ZA","ZAF",710 198 | "Spain","SPAIN","ES","ESP",724 199 | "Sri Lanka","SRI LANKA","LK","LKA",144 200 | "Sudan","SUDAN","SD","SDN",736 201 | "Suriname","SURINAME","SR","SUR",740 202 | "Swaziland","SWAZILAND","SZ","SWZ",748 203 | "Sweden","SWEDEN","SE","SWE",752 204 | "Switzerland","SWITZERLAND","CH","CHE",756 205 | "Thailand","THAILAND","TH","THA",764 206 | "Togo","TOGO","TG","TGO",768 207 | "Tokelau","TOKELAU","TK","TKL",772 208 | "Tonga","TONGA","TO","TON",776 209 | "Tunisia","TUNISIA","TN","TUN",788 210 | "Turkey","TURKEY","TR","TUR",792 211 | "Turkmenistan","TURKMENISTAN","TM","TKM",795 212 | "Turks and Caicos Islands","TURKS AND CAICOS ISLANDS","TC","TCA",796 213 | "Tuvalu","TUVALU","TV","TUV",798 214 | "Uganda","UGANDA","UG","UGA",800 215 | "Ukraine","UKRAINE","UA","UKR",804 216 | "United Kingdom","UNITED KINGDOM","GB","GBR",826 217 | "United States Minor Outlying Islands","UNITED STATES MINOR OUTLYING ISLANDS","UM","\N","\N" 218 | "Uruguay","URUGUAY","UY","URY",858 219 | "Uzbekistan","UZBEKISTAN","UZ","UZB",860 220 | "Venezuela","VENEZUELA","VE","VEN",862 221 | "Viet Nam","VIET NAM","VN","VNM",704 222 | "Western Sahara","WESTERN SAHARA","EH","ESH",732 223 | "Zambia","ZAMBIA","ZM","ZMB",894 224 | "Zimbabwe","ZIMBABWE","ZW","ZWE",716 225 | "Antarctica","ANTARCTICA","AQ","\N","\N" 226 | "Belize","BELIZE","BZ","BLZ",84 227 | "Central African Republic","CENTRAL AFRICAN REPUBLIC","CF","CAF",140 228 | "El Salvador","EL SALVADOR","SV","SLV",222 229 | "Ireland","IRELAND","IE","IRL",372 230 | "Lebanon","LEBANON","LB","LBN",422 231 | "Liechtenstein","LIECHTENSTEIN","LI","LIE",438 232 | "Mauritania","MAURITANIA","MR","MRT",478 233 | "Afghanistan","AFGHANISTAN","AF","AFG",4 234 | "Falkland Islands (Malvinas)","FALKLAND 
ISLANDS (MALVINAS)","FK","FLK",238 235 | "Netherlands Antilles","NETHERLANDS ANTILLES","AN","ANT",530 236 | "Congo","CONGO","CG","COG",178 237 | "Papua New Guinea","PAPUA NEW GUINEA","PG","PNG",598 238 | "Saudi Arabia","SAUDI ARABIA","SA","SAU",682 239 | "Tajikistan","TAJIKISTAN","TJ","TJK",762 240 | "United Arab Emirates","UNITED ARAB EMIRATES","AE","ARE",784 -------------------------------------------------------------------------------- /lib/functions.py: -------------------------------------------------------------------------------- 1 | from urllib import parse 2 | from dns import resolver, reversename 3 | from datetime import datetime 4 | from bs4 import BeautifulSoup 5 | from rblwatch import RBLSearch 6 | from .spf import get_spf_record, check_spf 7 | from .blacklists import google_safebrowsing, phishtank, wot 8 | import re 9 | import pythonwhois 10 | import ipaddress 11 | import requests 12 | import geoip2.database 13 | 14 | PATH = 'lib/files/' 15 | 16 | 17 | def start_url(url): 18 | """Split URL into: protocol, host, path, params, query and fragment.""" 19 | if not parse.urlparse(url.strip()).scheme: 20 | url = 'http://' + url 21 | protocol, host, path, params, query, fragment = parse.urlparse(url.strip()) 22 | 23 | result = { 24 | 'url': host + path + params + query + fragment, 25 | 'protocol': protocol, 26 | 'host': host, 27 | 'path': path, 28 | 'params': params, 29 | 'query': query, 30 | 'fragment': fragment 31 | } 32 | return result 33 | 34 | 35 | def count(text, character): 36 | """Return the amount of certain character in the text.""" 37 | return text.count(character) 38 | 39 | 40 | def count_vowels(text): 41 | """Return the number of vowels.""" 42 | vowels = ['a', 'e', 'i', 'o', 'u'] 43 | count = 0 44 | for i in vowels: 45 | count += text.lower().count(i) 46 | return count 47 | 48 | 49 | def length(text): 50 | """Return the length of a string.""" 51 | return len(text) 52 | 53 | 54 | def valid_ip(host): 55 | """Return if the domain has a valid IP 
format (IPv4 or IPv6).""" 56 | try: 57 | ipaddress.ip_address(host) 58 | return True 59 | except Exception: 60 | return False 61 | 62 | 63 | def valid_email(text): 64 | """Return if there is an email in the text.""" 65 | if re.findall(r'[\w\.-]+@[\w\.-]+', text): 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def check_shortener(url): 72 | """Check if the domain is a shortener.""" 73 | file = open(PATH + 'shorteners.txt', 'r') 74 | for line in file: 75 | with_www = "www." + line.strip() 76 | if line.strip() == url['host'].lower() or with_www == url['host'].lower(): 77 | file.close() 78 | return True 79 | file.close() 80 | return False 81 | 82 | 83 | def check_tld(text): 84 | """Check for presence of Top-Level Domains (TLD).""" 85 | file = open(PATH + 'tlds.txt', 'r') 86 | pattern = re.compile("[a-zA-Z0-9.]") 87 | for line in file: 88 | i = (text.lower().strip()).find(line.strip()) 89 | while i > -1: 90 | if ((i + len(line) - 1) >= len(text)) or not pattern.match(text[i + len(line) - 1]): 91 | file.close() 92 | return True 93 | i = text.find(line.strip(), i + 1) 94 | file.close() 95 | return False 96 | 97 | 98 | def count_tld(text): 99 | """Return amount of Top-Level Domains (TLD) present in the URL.""" 100 | file = open(PATH + 'tlds.txt', 'r') 101 | count = 0 102 | pattern = re.compile("[a-zA-Z0-9.]") 103 | for line in file: 104 | i = (text.lower().strip()).find(line.strip()) 105 | while i > -1: 106 | if ((i + len(line) - 1) >= len(text)) or not pattern.match(text[i + len(line) - 1]): 107 | count += 1 108 | i = text.find(line.strip(), i + 1) 109 | file.close() 110 | return count 111 | 112 | 113 | def count_params(text): 114 | """Return number of parameters.""" 115 | return len(parse.parse_qs(text)) 116 | 117 | 118 | def check_word_server_client(text): 119 | """Return whether the "server" or "client" keywords exist in the domain.""" 120 | if "server" in text.lower() or "client" in text.lower(): 121 | return True 122 | return False 123 | 124 | 125 | def 
count_ips(url): 126 | """Return the number of resolved IPs (IPv4).""" 127 | if valid_ip(url['host']): 128 | return 1 129 | 130 | try: 131 | answers = resolver.query(url['host'], 'A') 132 | return len(answers) 133 | except Exception: 134 | return '?' 135 | 136 | 137 | def count_name_servers(url): 138 | """Return number of NameServers (NS) resolved.""" 139 | count = 0 140 | if count_ips(url): 141 | try: 142 | answers = resolver.query(url['host'], 'NS') 143 | return len(answers) 144 | except (resolver.NoAnswer, resolver.NXDOMAIN): 145 | split_host = url['host'].split('.') 146 | while len(split_host) > 0: 147 | split_host.pop(0) 148 | supposed_domain = '.'.join(split_host) 149 | try: 150 | answers = resolver.query(supposed_domain, 'NS') 151 | count = len(answers) 152 | break 153 | except Exception: 154 | count = 0 155 | except Exception: 156 | count = 0 157 | return count 158 | 159 | 160 | def count_mx_servers(url): 161 | """Return Number of Resolved MX Servers.""" 162 | count = 0 163 | if count_ips(url): 164 | try: 165 | answers = resolver.query(url['host'], 'MX') 166 | return len(answers) 167 | except (resolver.NoAnswer, resolver.NXDOMAIN): 168 | split_host = url['host'].split('.') 169 | while len(split_host) > 0: 170 | split_host.pop(0) 171 | supposed_domain = '.'.join(split_host) 172 | try: 173 | answers = resolver.query(supposed_domain, 'MX') 174 | count = len(answers) 175 | break 176 | except Exception: 177 | count = 0 178 | except Exception: 179 | count = 0 180 | return count 181 | 182 | 183 | def extract_ttl(url): 184 | """Return Time-to-live (TTL) value associated with hostname.""" 185 | try: 186 | ttl = resolver.query(url['host']).rrset.ttl 187 | return ttl 188 | except Exception: 189 | return '?' 
190 | 191 | 192 | def time_activation_domain(url): 193 | """Return time (in days) of domain activation.""" 194 | if url['host'].startswith("www."): 195 | url['host'] = url['host'][4:] 196 | 197 | pythonwhois.net.socket.setdefaulttimeout(3.0) 198 | try: 199 | result_whois = pythonwhois.get_whois(url['host'].lower()) 200 | if not result_whois: 201 | return '?' 202 | creation_date = str(result_whois['creation_date'][0]) 203 | formated_date = " ".join(creation_date.split()[:1]) 204 | d1 = datetime.strptime(formated_date, "%Y-%m-%d") 205 | d2 = datetime.now() 206 | return abs((d2 - d1).days) 207 | except Exception: 208 | return '?' 209 | 210 | 211 | def expiration_date_register(url): 212 | """Retorna time (in days) for register expiration.""" 213 | if url['host'].startswith("www."): 214 | url['host'] = url['host'][4:] 215 | 216 | pythonwhois.net.socket.setdefaulttimeout(3.0) 217 | try: 218 | result_whois = pythonwhois.get_whois(url['host'].lower()) 219 | if not result_whois: 220 | return '?' 221 | expiration_date = str(result_whois['expiration_date'][0]) 222 | formated_date = " ".join(expiration_date.split()[:1]) 223 | d1 = datetime.strptime(formated_date, "%Y-%m-%d") 224 | d2 = datetime.now() 225 | return abs((d1 - d2).days) 226 | except Exception: 227 | return '?' 228 | 229 | 230 | def extract_extension(text): 231 | """Return file extension name.""" 232 | file = open(PATH + 'extensions.txt', 'r') 233 | pattern = re.compile("[a-zA-Z0-9.]") 234 | for extension in file: 235 | i = (text.lower().strip()).find(extension.strip()) 236 | while i > -1: 237 | if ((i + len(extension) - 1) >= len(text)) or not pattern.match(text[i + len(extension) - 1]): 238 | file.close() 239 | return extension.rstrip().split('.')[-1] 240 | i = text.find(extension.strip(), i + 1) 241 | file.close() 242 | return '?' 
def check_ssl(url):
    """Check if the SSL/TLS certificate is valid (GET with verify=True succeeds)."""
    try:
        requests.get(url, verify=True, timeout=3)
        return True
    except Exception:
        return False


def count_redirects(url):
    """Return the number of redirects in a URL, or '?' on request failure."""
    try:
        history = requests.get(url, timeout=3).history
        return len(history) if history else 0
    except Exception:
        return '?'


def _host_ip(url):
    """Resolve url['host'] to an IPv4 address string.

    Returns the host unchanged when it already is an IP literal.
    Shared by get_asn_number/get_country/get_ptr, which previously
    triplicated this lookup.  Raises on resolution failure (callers catch).
    """
    if valid_ip(url['host']):
        return url['host']
    return resolver.query(url['host'], 'A')[0].to_text()


def get_asn_number(url):
    """Return the ASN number associated with the IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        with geoip2.database.Reader(PATH + 'GeoLite2-ASN.mmdb') as reader:
            return reader.asn(ip).autonomous_system_number
    except Exception:
        return '?'


def get_country(url):
    """Return the country (ISO code) associated with IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        # Context manager closes the mmdb reader; the old code leaked it.
        with geoip2.database.Reader(PATH + 'GeoLite2-Country.mmdb') as reader:
            return reader.country(ip).country.iso_code
    except Exception:
        return '?'


def get_ptr(url):
    """Return the PTR record associated with the IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        return resolver.query(reversename.from_address(ip), 'PTR')[0].to_text()
    except Exception:
        return '?'
def google_search(url):
    """Check if the url is indexed in google."""
    agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')
    search_url = "https://www.google.com/search?" + parse.urlencode({'q': 'info:' + url})
    try:
        response = requests.get(search_url, headers={'User-Agent': agent})
    except Exception:
        return '?'
    response.encoding = 'ISO-8859-1'
    soup = BeautifulSoup(str(response.content), "html.parser")
    try:
        # A result anchor under #rso means the URL is indexed.
        soup.find(id="rso").find("div").find("div").find("h3").find("a")['href']
        return True
    except AttributeError:
        return False


def valid_spf(domain):
    """Check if within the registered domain has SPF and if it is valid."""
    record = get_spf_record(domain)
    return check_spf(record, domain) if record is not None else False


def check_blacklists(url):
    """Check if the URL or Domain is malicious through Google Safebrowsing, Phishtank, and WOT."""
    return bool(google_safebrowsing(url) or phishtank(url) or wot(url))


def check_blacklists_ip(url):
    """Check if the IP is malicious through Google Safebrowsing, Phishtank and WOT."""
    try:
        if valid_ip(url['host']):
            address = url['host']
        else:
            address = resolver.query(url['host'], 'A')[0].to_text()
        if not address:
            return '?'
        return bool(google_safebrowsing(address) or phishtank(address) or wot(address))
    except Exception:
        return '?'
376 | 377 | 378 | def check_rbl(domain): 379 | """Check domain presence on RBL (Real-time Blackhole List).""" 380 | searcher = RBLSearch(domain) 381 | try: 382 | listed = searcher.listed 383 | except Exception: 384 | return False 385 | for key in listed: 386 | if key == 'SEARCH_HOST': 387 | pass 388 | elif listed[key]['LISTED']: 389 | return True 390 | return False 391 | 392 | 393 | def check_time_response(domain): 394 | """Return the response time in seconds.""" 395 | try: 396 | latency = requests.get(domain, headers={'Cache-Control': 'no-cache'}).elapsed.total_seconds() 397 | return latency 398 | except Exception: 399 | return '?' 400 | 401 | 402 | def read_file(archive): 403 | """Read the file with the URLs.""" 404 | with open(archive, 'r') as f: 405 | urls = ([line.rstrip() for line in f]) 406 | return urls 407 | -------------------------------------------------------------------------------- /extract.py: -------------------------------------------------------------------------------- 1 | from lib.functions import * 2 | import posixpath 3 | import csv 4 | 5 | 6 | def attributes(): 7 | """Output file attributes.""" 8 | lexical = [ 9 | 'qtd_ponto_url', 'qtd_hifen_url', 'qtd_underline_url', 10 | 'qtd_barra_url', 'qtd_interrogacao_url', 'qtd_igual_url', 11 | 'qtd_arroba_url', 'qtd_comercial_url', 'qtd_exclamacao_url', 12 | 'qtd_espaco_url', 'qtd_til_url', 'qtd_virgula_url', 13 | 'qtd_mais_url', 'qtd_asterisco_url', 'qtd_hashtag_url', 14 | 'qtd_cifrao_url', 'qtd_porcento_url', 'qtd_tld_url', 15 | 'comprimento_url', 'qtd_ponto_dominio', 'qtd_hifen_dominio', 16 | 'qtd_underline_dominio', 'qtd_barra_dominio', 'qtd_interrogacao_dominio', 17 | 'qtd_igual_dominio', 'qtd_arroba_dominio', 'qtd_comercial_dominio', 18 | 'qtd_exclamacao_dominio', 'qtd_espaco_dominio', 'qtd_til_dominio', 19 | 'qtd_virgula_dominio', 'qtd_mais_dominio', 'qtd_asterisco_dominio', 20 | 'qtd_hashtag_dominio', 'qtd_cifrao_dominio', 'qtd_porcento_dominio', 21 | 'qtd_vogais_dominio', 
'comprimento_dominio', 'formato_ip_dominio', 22 | 'server_client_dominio', 'qtd_ponto_diretorio', 'qtd_hifen_diretorio', 23 | 'qtd_underline_diretorio', 'qtd_barra_diretorio', 'qtd_interrogacao_diretorio', 24 | 'qtd_igual_diretorio', 'qtd_arroba_diretorio', 'qtd_comercial_diretorio', 25 | 'qtd_exclamacao_diretorio', 'qtd_espaco_diretorio', 'qtd_til_diretorio', 26 | 'qtd_virgula_diretorio', 'qtd_mais_diretorio', 'qtd_asterisco_diretorio', 27 | 'qtd_hashtag_diretorio', 'qtd_cifrao_diretorio', 'qtd_porcento_diretorio', 28 | 'comprimento_diretorio', 'qtd_ponto_arquivo', 'qtd_hifen_arquivo', 29 | 'qtd_underline_arquivo', 'qtd_barra_arquivo', 'qtd_interrogacao_arquivo', 30 | 'qtd_igual_arquivo', 'qtd_arroba_arquivo', 'qtd_comercial_arquivo', 31 | 'qtd_exclamacao_arquivo', 'qtd_espaco_arquivo', 'qtd_til_arquivo', 32 | 'qtd_virgula_arquivo', 'qtd_mais_arquivo', 'qtd_asterisco_arquivo', 33 | 'qtd_hashtag_arquivo', 'qtd_cifrao_arquivo', 'qtd_porcento_arquivo', 34 | 'comprimento_arquivo', 'qtd_ponto_parametros', 'qtd_hifen_parametros', 35 | 'qtd_underline_parametros', 'qtd_barra_parametros', 'qtd_interrogacao_parametros', 36 | 'qtd_igual_parametros', 'qtd_arroba_parametros', 'qtd_comercial_parametros', 37 | 'qtd_exclamacao_parametros', 'qtd_espaco_parametros', 'qtd_til_parametros', 38 | 'qtd_virgula_parametros', 'qtd_mais_parametros', 'qtd_asterisco_parametros', 39 | 'qtd_hashtag_parametros', 'qtd_cifrao_parametros', 'qtd_porcento_parametros', 40 | 'comprimento_parametros', 'presenca_tld_argumentos', 'qtd_parametros', 41 | 'email_na_url', 'extensao_arquivo' 42 | ] 43 | 44 | blacklist = ['url_presente_em_blacklists', 'presenca_ip_blacklists', 'dominio_presente_em_blacklists'] 45 | 46 | host = ['dominio_presente_em_rbl', 'tempo_resposta', 'possui_spf', 'localizacao_geografica_ip', 47 | 'numero_as_ip', 'ptr_ip', 'tempo_ativacao_dominio', 'tempo_expiracao_dominio', 48 | 'qtd_ip_resolvido', 'qtd_nameservers', 'qtd_servidores_mx', 'valor_ttl_associado'] 49 | 50 | others = 
# Special characters counted in every URL component, in the fixed order
# expected by the attributes() CSV header (qtd_ponto_*, qtd_hifen_*, ...).
_SYMBOLS = ['.', '-', '_', '/', '?', '=', '@', '&', '!', ' ',
            '~', ',', '+', '*', '#', '$', '%']


def _symbol_counts(text):
    """Return one stringified count per tracked special character in *text*."""
    return [str(count(text, symbol)) for symbol in _SYMBOLS]


def _missing(n):
    """Return *n* Weka-style '?' markers for an absent URL component."""
    return ['?'] * n


def main(urls, dataset):
    """Extract lexical, blacklist, host and misc features for every URL
    listed in the file *urls* and write one CSV row per URL to *dataset*.

    The column order must match attributes(); the trailing empty field is
    the (unlabelled) 'phishing' class column, filled in by a later step.
    """
    with open(dataset, "w") as output:
        writer = csv.writer(output)
        writer.writerow(attributes())
        for url in read_file(urls):
            print(url)
            dict_url = start_url(url)

            # LEXICAL -- whole URL: symbol counts, TLD count, length.
            url_feats = _symbol_counts(dict_url['url'])
            url_feats.append(str(count_tld(dict_url['url'])))
            url_feats.append(str(length(dict_url['url'])))
            # Emitted near the end of the lexical section, per the header.
            email_exist = str(valid_email(dict_url['url']))

            # LEXICAL -- domain (host): symbol counts plus host-only features.
            host_feats = _symbol_counts(dict_url['host'])
            host_feats.extend([
                str(count_vowels(dict_url['host'])),
                str(length(dict_url['host'])),
                str(valid_ip(dict_url['host'])),
                str(check_word_server_client(dict_url['host'])),
            ])

            # LEXICAL -- directory (path); '?' markers when the URL has none.
            if dict_url['path']:
                path_feats = _symbol_counts(dict_url['path'])
                path_feats.append(str(length(dict_url['path'])))
            else:
                path_feats = _missing(18)  # 17 symbol counts + length

            # LEXICAL -- file name (last path segment).
            if dict_url['path']:
                base = posixpath.basename(dict_url['path'])
                file_feats = _symbol_counts(base)
                file_feats.append(str(length(base)))
                extension = str(extract_extension(base))
            else:
                file_feats = _missing(18)
                extension = '?'  # emitted as the last lexical column

            # LEXICAL -- query-string parameters.
            if dict_url['query']:
                params_feats = _symbol_counts(dict_url['query'])
                params_feats.extend([
                    str(length(dict_url['query'])),
                    str(check_tld(dict_url['query'])),
                    str(count_params(dict_url['query'])),
                ])
            else:
                params_feats = _missing(20)  # 17 counts + length + tld + nparams

            # Full URL / host strings reused by several network checks below.
            full_url = dict_url['protocol'] + '://' + dict_url['url']
            full_host = dict_url['protocol'] + '://' + dict_url['host']

            # BLACKLIST -- URL, resolved IP, and bare domain lookups.
            blacklist_feats = [
                str(check_blacklists(full_url)),
                str(check_blacklists_ip(dict_url)),
                str(check_blacklists(full_host)),
            ]

            # HOST -- DNS / WHOIS / network-level features (header order).
            host_net_feats = [
                str(check_rbl(dict_url['host'])),
                str(check_time_response(full_host)),
                str(valid_spf(dict_url['host'])),
                str(get_country(dict_url)),
                str(get_asn_number(dict_url)),
                str(get_ptr(dict_url)),
                str(time_activation_domain(dict_url)),
                str(expiration_date_register(dict_url)),
                str(count_ips(dict_url)),
                str(count_name_servers(dict_url)),
                str(count_mx_servers(dict_url)),
                str(extract_ttl(dict_url)),
            ]

            # OTHERS -- TLS, redirects, search-engine indexing, shorteners.
            other_feats = [
                str(check_ssl('https://' + dict_url['url'])),
                str(count_redirects(full_url)),
                str(google_search(dict_url['url'])),
                str(google_search(dict_url['host'])),
                str(check_shortener(dict_url)),
            ]

            row = url_feats + host_feats + path_feats + file_feats + params_feats
            row.extend([email_exist, extension])
            row.extend(blacklist_feats)
            row.extend(host_net_feats)
            row.extend(other_feats)
            row.append('')  # empty 'phishing' label column
            writer.writerow(row)
.alibaba 44 | .alipay 45 | .allfinanz 46 | .allstate 47 | .ally 48 | .alsace 49 | .alstom 50 | .am 51 | .americanexpress 52 | .americanfamily 53 | .amex 54 | .amfam 55 | .amica 56 | .amsterdam 57 | .analytics 58 | .android 59 | .anquan 60 | .anz 61 | .ao 62 | .aol 63 | .apartments 64 | .app 65 | .apple 66 | .aq 67 | .aquarelle 68 | .ar 69 | .aramco 70 | .archi 71 | .army 72 | .arpa 73 | .art 74 | .arte 75 | .as 76 | .asda 77 | .asia 78 | .associates 79 | .at 80 | .athleta 81 | .attorney 82 | .au 83 | .auction 84 | .audi 85 | .audible 86 | .audio 87 | .auspost 88 | .author 89 | .auto 90 | .autos 91 | .avianca 92 | .aw 93 | .aws 94 | .ax 95 | .axa 96 | .az 97 | .azure 98 | .ba 99 | .baby 100 | .baidu 101 | .banamex 102 | .bananarepublic 103 | .band 104 | .bank 105 | .bar 106 | .barcelona 107 | .barclaycard 108 | .barclays 109 | .barefoot 110 | .bargains 111 | .baseball 112 | .basketball 113 | .bauhaus 114 | .bayern 115 | .bb 116 | .bbc 117 | .bbt 118 | .bbva 119 | .bcg 120 | .bcn 121 | .bd 122 | .be 123 | .beats 124 | .beauty 125 | .beer 126 | .bentley 127 | .berlin 128 | .best 129 | .bestbuy 130 | .bet 131 | .bf 132 | .bg 133 | .bh 134 | .bharti 135 | .bi 136 | .bible 137 | .bid 138 | .bike 139 | .bing 140 | .bingo 141 | .bio 142 | .biz 143 | .bj 144 | .black 145 | .blackfriday 146 | .blanco 147 | .blockbuster 148 | .blog 149 | .bloomberg 150 | .blue 151 | .bm 152 | .bms 153 | .bmw 154 | .bn 155 | .bnl 156 | .bnpparibas 157 | .bo 158 | .boats 159 | .boehringer 160 | .bofa 161 | .bom 162 | .bond 163 | .boo 164 | .book 165 | .booking 166 | .boots 167 | .bosch 168 | .bostik 169 | .boston 170 | .bot 171 | .boutique 172 | .box 173 | .br 174 | .bradesco 175 | .bridgestone 176 | .broadway 177 | .broker 178 | .brother 179 | .brussels 180 | .bs 181 | .bt 182 | .budapest 183 | .bugatti 184 | .build 185 | .builders 186 | .business 187 | .buy 188 | .buzz 189 | .bv 190 | .bw 191 | .by 192 | .bz 193 | .bzh 194 | .ca 195 | .cab 196 | .cafe 197 | .cal 198 | .call 199 | .calvinklein 
200 | .cam 201 | .camera 202 | .camp 203 | .cancerresearch 204 | .canon 205 | .capetown 206 | .capital 207 | .capitalone 208 | .car 209 | .caravan 210 | .cards 211 | .care 212 | .career 213 | .careers 214 | .cars 215 | .cartier 216 | .casa 217 | .case 218 | .caseih 219 | .cash 220 | .casino 221 | .cat 222 | .catering 223 | .catholic 224 | .cba 225 | .cbn 226 | .cbre 227 | .cbs 228 | .cc 229 | .cd 230 | .ceb 231 | .center 232 | .ceo 233 | .cern 234 | .cf 235 | .cfa 236 | .cfd 237 | .cg 238 | .ch 239 | .chanel 240 | .channel 241 | .chase 242 | .chat 243 | .cheap 244 | .chintai 245 | .chloe 246 | .christmas 247 | .chrome 248 | .chrysler 249 | .church 250 | .ci 251 | .cipriani 252 | .circle 253 | .cisco 254 | .citadel 255 | .citi 256 | .citic 257 | .city 258 | .cityeats 259 | .ck 260 | .cl 261 | .claims 262 | .cleaning 263 | .click 264 | .clinic 265 | .clinique 266 | .clothing 267 | .cloud 268 | .club 269 | .clubmed 270 | .cm 271 | .cn 272 | .co 273 | .coach 274 | .codes 275 | .coffee 276 | .college 277 | .cologne 278 | .com 279 | .comcast 280 | .commbank 281 | .community 282 | .company 283 | .compare 284 | .computer 285 | .comsec 286 | .condos 287 | .construction 288 | .consulting 289 | .contact 290 | .contractors 291 | .cooking 292 | .cookingchannel 293 | .cool 294 | .coop 295 | .corsica 296 | .country 297 | .coupon 298 | .coupons 299 | .courses 300 | .cr 301 | .credit 302 | .creditcard 303 | .creditunion 304 | .cricket 305 | .crown 306 | .crs 307 | .cruise 308 | .cruises 309 | .csc 310 | .cu 311 | .cuisinella 312 | .cv 313 | .cw 314 | .cx 315 | .cy 316 | .cymru 317 | .cyou 318 | .cz 319 | .dabur 320 | .dad 321 | .dance 322 | .data 323 | .date 324 | .dating 325 | .datsun 326 | .day 327 | .dclk 328 | .dds 329 | .de 330 | .deal 331 | .dealer 332 | .deals 333 | .degree 334 | .delivery 335 | .dell 336 | .deloitte 337 | .delta 338 | .democrat 339 | .dental 340 | .dentist 341 | .desi 342 | .design 343 | .dev 344 | .dhl 345 | .diamonds 346 | .diet 347 | .digital 348 | 
.direct 349 | .directory 350 | .discount 351 | .discover 352 | .dish 353 | .diy 354 | .dj 355 | .dk 356 | .dm 357 | .dnp 358 | .do 359 | .docs 360 | .doctor 361 | .dodge 362 | .dog 363 | .doha 364 | .domains 365 | .dot 366 | .download 367 | .drive 368 | .dtv 369 | .dubai 370 | .duck 371 | .dunlop 372 | .duns 373 | .dupont 374 | .durban 375 | .dvag 376 | .dvr 377 | .dz 378 | .earth 379 | .eat 380 | .ec 381 | .eco 382 | .edeka 383 | .edu 384 | .education 385 | .ee 386 | .eg 387 | .email 388 | .emerck 389 | .energy 390 | .engineer 391 | .engineering 392 | .enterprises 393 | .epost 394 | .epson 395 | .equipment 396 | .er 397 | .ericsson 398 | .erni 399 | .es 400 | .esq 401 | .estate 402 | .esurance 403 | .et 404 | .eu 405 | .eurovision 406 | .eus 407 | .events 408 | .everbank 409 | .exchange 410 | .expert 411 | .exposed 412 | .express 413 | .extraspace 414 | .fage 415 | .fail 416 | .fairwinds 417 | .faith 418 | .family 419 | .fan 420 | .fans 421 | .farm 422 | .farmers 423 | .fashion 424 | .fast 425 | .fedex 426 | .feedback 427 | .ferrari 428 | .ferrero 429 | .fi 430 | .fiat 431 | .fidelity 432 | .fido 433 | .film 434 | .final 435 | .finance 436 | .financial 437 | .fire 438 | .firestone 439 | .firmdale 440 | .fish 441 | .fishing 442 | .fit 443 | .fitness 444 | .fj 445 | .fk 446 | .flickr 447 | .flights 448 | .flir 449 | .florist 450 | .flowers 451 | .fly 452 | .fm 453 | .fo 454 | .foo 455 | .food 456 | .foodnetwork 457 | .football 458 | .ford 459 | .forex 460 | .forsale 461 | .forum 462 | .foundation 463 | .fox 464 | .fr 465 | .free 466 | .fresenius 467 | .frl 468 | .frogans 469 | .frontdoor 470 | .frontier 471 | .ftr 472 | .fujitsu 473 | .fujixerox 474 | .fun 475 | .fund 476 | .furniture 477 | .futbol 478 | .fyi 479 | .ga 480 | .gal 481 | .gallery 482 | .gallo 483 | .gallup 484 | .game 485 | .games 486 | .gap 487 | .garden 488 | .gb 489 | .gbiz 490 | .gd 491 | .gdn 492 | .ge 493 | .gea 494 | .gent 495 | .genting 496 | .george 497 | .gf 498 | .gg 499 | .ggee 500 | .gh 
501 | .gi 502 | .gift 503 | .gifts 504 | .gives 505 | .giving 506 | .gl 507 | .glade 508 | .glass 509 | .gle 510 | .global 511 | .globo 512 | .gm 513 | .gmail 514 | .gmbh 515 | .gmo 516 | .gmx 517 | .gn 518 | .godaddy 519 | .gold 520 | .goldpoint 521 | .golf 522 | .goo 523 | .goodhands 524 | .goodyear 525 | .goog 526 | .google 527 | .gop 528 | .got 529 | .gov 530 | .gp 531 | .gq 532 | .gr 533 | .grainger 534 | .graphics 535 | .gratis 536 | .green 537 | .gripe 538 | .group 539 | .gs 540 | .gt 541 | .gu 542 | .guardian 543 | .gucci 544 | .guge 545 | .guide 546 | .guitars 547 | .guru 548 | .gw 549 | .gy 550 | .hair 551 | .hamburg 552 | .hangout 553 | .haus 554 | .hbo 555 | .hdfc 556 | .hdfcbank 557 | .health 558 | .healthcare 559 | .help 560 | .helsinki 561 | .here 562 | .hermes 563 | .hgtv 564 | .hiphop 565 | .hisamitsu 566 | .hitachi 567 | .hiv 568 | .hk 569 | .hkt 570 | .hm 571 | .hn 572 | .hockey 573 | .holdings 574 | .holiday 575 | .homedepot 576 | .homegoods 577 | .homes 578 | .homesense 579 | .honda 580 | .honeywell 581 | .horse 582 | .hospital 583 | .host 584 | .hosting 585 | .hot 586 | .hoteles 587 | .hotmail 588 | .house 589 | .how 590 | .hr 591 | .hsbc 592 | .ht 593 | .htc 594 | .hu 595 | .hughes 596 | .hyatt 597 | .hyundai 598 | .ibm 599 | .icbc 600 | .ice 601 | .icu 602 | .id 603 | .ie 604 | .ieee 605 | .ifm 606 | .ikano 607 | .il 608 | .im 609 | .imamat 610 | .imdb 611 | .immo 612 | .immobilien 613 | .in 614 | .industries 615 | .infiniti 616 | .info 617 | .ing 618 | .ink 619 | .institute 620 | .insurance 621 | .insure 622 | .int 623 | .intel 624 | .international 625 | .intuit 626 | .investments 627 | .io 628 | .ipiranga 629 | .iq 630 | .ir 631 | .irish 632 | .is 633 | .iselect 634 | .ismaili 635 | .ist 636 | .istanbul 637 | .it 638 | .itau 639 | .itv 640 | .iveco 641 | .iwc 642 | .jaguar 643 | .java 644 | .jcb 645 | .jcp 646 | .je 647 | .jeep 648 | .jetzt 649 | .jewelry 650 | .jio 651 | .jlc 652 | .jll 653 | .jm 654 | .jmp 655 | .jnj 656 | .jo 657 | 
.jobs 658 | .joburg 659 | .jot 660 | .joy 661 | .jp 662 | .jpmorgan 663 | .jprs 664 | .juegos 665 | .juniper 666 | .kaufen 667 | .kddi 668 | .ke 669 | .kerryhotels 670 | .kerrylogistics 671 | .kerryproperties 672 | .kfh 673 | .kg 674 | .kh 675 | .ki 676 | .kia 677 | .kim 678 | .kinder 679 | .kindle 680 | .kitchen 681 | .kiwi 682 | .km 683 | .kn 684 | .koeln 685 | .komatsu 686 | .kosher 687 | .kp 688 | .kpmg 689 | .kpn 690 | .kr 691 | .krd 692 | .kred 693 | .kuokgroup 694 | .kw 695 | .ky 696 | .kyoto 697 | .kz 698 | .la 699 | .lacaixa 700 | .ladbrokes 701 | .lamborghini 702 | .lamer 703 | .lancaster 704 | .lancia 705 | .lancome 706 | .land 707 | .landrover 708 | .lanxess 709 | .lasalle 710 | .lat 711 | .latino 712 | .latrobe 713 | .law 714 | .lawyer 715 | .lb 716 | .lc 717 | .lds 718 | .lease 719 | .leclerc 720 | .lefrak 721 | .legal 722 | .lego 723 | .lexus 724 | .lgbt 725 | .li 726 | .liaison 727 | .lidl 728 | .life 729 | .lifeinsurance 730 | .lifestyle 731 | .lighting 732 | .like 733 | .lilly 734 | .limited 735 | .limo 736 | .lincoln 737 | .linde 738 | .link 739 | .lipsy 740 | .live 741 | .living 742 | .lixil 743 | .lk 744 | .loan 745 | .loans 746 | .locker 747 | .locus 748 | .loft 749 | .lol 750 | .london 751 | .lotte 752 | .lotto 753 | .love 754 | .lpl 755 | .lplfinancial 756 | .lr 757 | .ls 758 | .lt 759 | .ltd 760 | .ltda 761 | .lu 762 | .lundbeck 763 | .lupin 764 | .luxe 765 | .luxury 766 | .lv 767 | .ly 768 | .ma 769 | .macys 770 | .madrid 771 | .maif 772 | .maison 773 | .makeup 774 | .man 775 | .management 776 | .mango 777 | .market 778 | .marketing 779 | .markets 780 | .marriott 781 | .marshalls 782 | .maserati 783 | .mattel 784 | .mba 785 | .mc 786 | .mcd 787 | .mcdonalds 788 | .mckinsey 789 | .md 790 | .me 791 | .med 792 | .media 793 | .meet 794 | .melbourne 795 | .meme 796 | .memorial 797 | .men 798 | .menu 799 | .meo 800 | .metlife 801 | .mg 802 | .mh 803 | .miami 804 | .microsoft 805 | .mil 806 | .mini 807 | .mint 808 | .mit 809 | .mitsubishi 810 | 
.mk 811 | .ml 812 | .mlb 813 | .mls 814 | .mm 815 | .mma 816 | .mn 817 | .mo 818 | .mobi 819 | .mobile 820 | .mobily 821 | .moda 822 | .moe 823 | .moi 824 | .mom 825 | .monash 826 | .money 827 | .monster 828 | .montblanc 829 | .mopar 830 | .mormon 831 | .mortgage 832 | .moscow 833 | .moto 834 | .motorcycles 835 | .mov 836 | .movie 837 | .movistar 838 | .mp 839 | .mq 840 | .mr 841 | .ms 842 | .msd 843 | .mt 844 | .mtn 845 | .mtpc 846 | .mtr 847 | .mu 848 | .museum 849 | .mutual 850 | .mv 851 | .mw 852 | .mx 853 | .my 854 | .mz 855 | .na 856 | .nab 857 | .nadex 858 | .nagoya 859 | .name 860 | .nationwide 861 | .natura 862 | .navy 863 | .nba 864 | .nc 865 | .ne 866 | .nec 867 | .net 868 | .netbank 869 | .netflix 870 | .network 871 | .neustar 872 | .new 873 | .newholland 874 | .news 875 | .next 876 | .nextdirect 877 | .nexus 878 | .nf 879 | .nfl 880 | .ng 881 | .ngo 882 | .nhk 883 | .ni 884 | .nico 885 | .nike 886 | .nikon 887 | .ninja 888 | .nissan 889 | .nissay 890 | .nl 891 | .no 892 | .nokia 893 | .northwesternmutual 894 | .norton 895 | .now 896 | .nowruz 897 | .nowtv 898 | .np 899 | .nr 900 | .nra 901 | .nrw 902 | .ntt 903 | .nu 904 | .nyc 905 | .nz 906 | .obi 907 | .observer 908 | .off 909 | .office 910 | .okinawa 911 | .olayan 912 | .olayangroup 913 | .oldnavy 914 | .ollo 915 | .om 916 | .omega 917 | .one 918 | .ong 919 | .onl 920 | .online 921 | .onyourside 922 | .ooo 923 | .open 924 | .oracle 925 | .orange 926 | .org 927 | .organic 928 | .orientexpress 929 | .origins 930 | .osaka 931 | .otsuka 932 | .ott 933 | .ovh 934 | .pa 935 | .page 936 | .pamperedchef 937 | .panasonic 938 | .panerai 939 | .paris 940 | .pars 941 | .partners 942 | .parts 943 | .party 944 | .passagens 945 | .pay 946 | .pccw 947 | .pe 948 | .pet 949 | .pf 950 | .pfizer 951 | .pg 952 | .ph 953 | .pharmacy 954 | .philips 955 | .phone 956 | .photo 957 | .photography 958 | .photos 959 | .physio 960 | .piaget 961 | .pics 962 | .pictet 963 | .pictures 964 | .pid 965 | .pin 966 | .ping 967 | .pink 
968 | .pioneer 969 | .pizza 970 | .pk 971 | .pl 972 | .place 973 | .play 974 | .playstation 975 | .plumbing 976 | .plus 977 | .pm 978 | .pn 979 | .pnc 980 | .pohl 981 | .poker 982 | .politie 983 | .porn 984 | .post 985 | .pr 986 | .pramerica 987 | .praxi 988 | .press 989 | .prime 990 | .pro 991 | .prod 992 | .productions 993 | .prof 994 | .progressive 995 | .promo 996 | .properties 997 | .property 998 | .protection 999 | .pru 1000 | .prudential 1001 | .ps 1002 | .pt 1003 | .pub 1004 | .pw 1005 | .pwc 1006 | .py 1007 | .qa 1008 | .qpon 1009 | .quebec 1010 | .quest 1011 | .qvc 1012 | .racing 1013 | .radio 1014 | .raid 1015 | .re 1016 | .read 1017 | .realestate 1018 | .realtor 1019 | .realty 1020 | .recipes 1021 | .red 1022 | .redstone 1023 | .redumbrella 1024 | .rehab 1025 | .reise 1026 | .reisen 1027 | .reit 1028 | .reliance 1029 | .ren 1030 | .rent 1031 | .rentals 1032 | .repair 1033 | .report 1034 | .republican 1035 | .rest 1036 | .restaurant 1037 | .review 1038 | .reviews 1039 | .rexroth 1040 | .rich 1041 | .richardli 1042 | .ricoh 1043 | .rightathome 1044 | .ril 1045 | .rio 1046 | .rip 1047 | .rmit 1048 | .ro 1049 | .rocher 1050 | .rocks 1051 | .rodeo 1052 | .rogers 1053 | .room 1054 | .rs 1055 | .rsvp 1056 | .ru 1057 | .ruhr 1058 | .run 1059 | .rw 1060 | .rwe 1061 | .ryukyu 1062 | .sa 1063 | .saarland 1064 | .safe 1065 | .safety 1066 | .sakura 1067 | .sale 1068 | .salon 1069 | .samsclub 1070 | .samsung 1071 | .sandvik 1072 | .sandvikcoromant 1073 | .sanofi 1074 | .sap 1075 | .sapo 1076 | .sarl 1077 | .sas 1078 | .save 1079 | .saxo 1080 | .sb 1081 | .sbi 1082 | .sbs 1083 | .sc 1084 | .sca 1085 | .scb 1086 | .schaeffler 1087 | .schmidt 1088 | .scholarships 1089 | .school 1090 | .schule 1091 | .schwarz 1092 | .science 1093 | .scjohnson 1094 | .scor 1095 | .scot 1096 | .sd 1097 | .se 1098 | .seat 1099 | .secure 1100 | .security 1101 | .seek 1102 | .select 1103 | .sener 1104 | .services 1105 | .ses 1106 | .seven 1107 | .sew 1108 | .sex 1109 | .sexy 1110 | .sfr 1111 
| .sg 1112 | .sh 1113 | .shangrila 1114 | .sharp 1115 | .shaw 1116 | .shell 1117 | .shia 1118 | .shiksha 1119 | .shoes 1120 | .shop 1121 | .shopping 1122 | .shouji 1123 | .show 1124 | .showtime 1125 | .shriram 1126 | .si 1127 | .silk 1128 | .sina 1129 | .singles 1130 | .site 1131 | .sj 1132 | .sk 1133 | .ski 1134 | .skin 1135 | .sky 1136 | .skype 1137 | .sl 1138 | .sling 1139 | .sm 1140 | .smart 1141 | .smile 1142 | .sn 1143 | .sncf 1144 | .so 1145 | .soccer 1146 | .social 1147 | .softbank 1148 | .software 1149 | .sohu 1150 | .solar 1151 | .solutions 1152 | .song 1153 | .sony 1154 | .soy 1155 | .space 1156 | .spiegel 1157 | .spot 1158 | .spreadbetting 1159 | .sr 1160 | .srl 1161 | .srt 1162 | .st 1163 | .stada 1164 | .staples 1165 | .star 1166 | .starhub 1167 | .statebank 1168 | .statefarm 1169 | .statoil 1170 | .stc 1171 | .stcgroup 1172 | .stockholm 1173 | .storage 1174 | .store 1175 | .stream 1176 | .studio 1177 | .study 1178 | .style 1179 | .su 1180 | .sucks 1181 | .supplies 1182 | .supply 1183 | .support 1184 | .surf 1185 | .surgery 1186 | .suzuki 1187 | .sv 1188 | .swatch 1189 | .swiftcover 1190 | .swiss 1191 | .sx 1192 | .sy 1193 | .sydney 1194 | .symantec 1195 | .systems 1196 | .sz 1197 | .tab 1198 | .taipei 1199 | .talk 1200 | .taobao 1201 | .target 1202 | .tatamotors 1203 | .tatar 1204 | .tattoo 1205 | .tax 1206 | .taxi 1207 | .tc 1208 | .tci 1209 | .td 1210 | .tdk 1211 | .team 1212 | .tech 1213 | .technology 1214 | .tel 1215 | .telecity 1216 | .telefonica 1217 | .temasek 1218 | .tennis 1219 | .teva 1220 | .tf 1221 | .tg 1222 | .th 1223 | .thd 1224 | .theater 1225 | .theatre 1226 | .tiaa 1227 | .tickets 1228 | .tienda 1229 | .tiffany 1230 | .tips 1231 | .tires 1232 | .tirol 1233 | .tj 1234 | .tjmaxx 1235 | .tjx 1236 | .tk 1237 | .tkmaxx 1238 | .tl 1239 | .tm 1240 | .tmall 1241 | .tn 1242 | .to 1243 | .today 1244 | .tokyo 1245 | .tools 1246 | .top 1247 | .toray 1248 | .toshiba 1249 | .total 1250 | .tours 1251 | .town 1252 | .toyota 1253 | .toys 1254 | .tr 
1255 | .trade 1256 | .trading 1257 | .training 1258 | .travel 1259 | .travelchannel 1260 | .travelers 1261 | .travelersinsurance 1262 | .trust 1263 | .trv 1264 | .tt 1265 | .tube 1266 | .tui 1267 | .tunes 1268 | .tushu 1269 | .tv 1270 | .tvs 1271 | .tw 1272 | .tz 1273 | .ua 1274 | .ubank 1275 | .ubs 1276 | .uconnect 1277 | .ug 1278 | .uk 1279 | .unicom 1280 | .university 1281 | .uno 1282 | .uol 1283 | .ups 1284 | .us 1285 | .uy 1286 | .uz 1287 | .va 1288 | .vacations 1289 | .vana 1290 | .vanguard 1291 | .vc 1292 | .ve 1293 | .vegas 1294 | .ventures 1295 | .verisign 1296 | .versicherung 1297 | .vet 1298 | .vg 1299 | .vi 1300 | .viajes 1301 | .video 1302 | .vig 1303 | .viking 1304 | .villas 1305 | .vin 1306 | .vip 1307 | .virgin 1308 | .visa 1309 | .vision 1310 | .vista 1311 | .vistaprint 1312 | .viva 1313 | .vivo 1314 | .vlaanderen 1315 | .vn 1316 | .vodka 1317 | .volkswagen 1318 | .volvo 1319 | .vote 1320 | .voting 1321 | .voto 1322 | .voyage 1323 | .vu 1324 | .vuelos 1325 | .wales 1326 | .walmart 1327 | .walter 1328 | .wang 1329 | .wanggou 1330 | .warman 1331 | .watch 1332 | .watches 1333 | .weather 1334 | .weatherchannel 1335 | .webcam 1336 | .weber 1337 | .website 1338 | .wed 1339 | .wedding 1340 | .weibo 1341 | .weir 1342 | .wf 1343 | .whoswho 1344 | .wien 1345 | .wiki 1346 | .williamhill 1347 | .win 1348 | .windows 1349 | .wine 1350 | .winners 1351 | .wme 1352 | .wolterskluwer 1353 | .woodside 1354 | .work 1355 | .works 1356 | .world 1357 | .wow 1358 | .ws 1359 | .wtc 1360 | .wtf 1361 | .xbox 1362 | .xerox 1363 | .xfinity 1364 | .xihuan 1365 | .xin 1366 | .xperia 1367 | .xxx 1368 | .xyz 1369 | .yachts 1370 | .yahoo 1371 | .yamaxun 1372 | .yandex 1373 | .ye 1374 | .yodobashi 1375 | .yoga 1376 | .yokohama 1377 | .you 1378 | .youtube 1379 | .yt 1380 | .yun 1381 | .za 1382 | .zappos 1383 | .zara 1384 | .zero 1385 | .zip 1386 | .zippo 1387 | .zm 1388 | .zone 1389 | .zuerich 1390 | .zw 1391 | 
-------------------------------------------------------------------------------- /pythonwhois/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re, sys, datetime, csv, pkgutil 3 | from . import net, shared 4 | 5 | try: 6 | from io import StringIO 7 | except ImportError: 8 | from cStringIO import StringIO 9 | 10 | def pkgdata(name): 11 | data = pkgutil.get_data("pythonwhois", name) 12 | if sys.version_info < (3, 0): 13 | return data 14 | else: 15 | return data.decode("utf-8") 16 | 17 | def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False): 18 | try: 19 | if is_dict: 20 | reader = csv.DictReader(pkgdata(filename).splitlines()) 21 | else: 22 | reader = csv.reader(pkgdata(filename).splitlines()) 23 | 24 | for line in reader: 25 | destination[line[abbrev_key]] = line[name_key] 26 | except IOError as e: 27 | pass 28 | 29 | airports = {} 30 | countries = {} 31 | states_au = {} 32 | states_us = {} 33 | states_ca = {} 34 | 35 | try: 36 | reader = csv.reader(pkgdata("airports.dat").splitlines()) 37 | 38 | for line in reader: 39 | airports[line[4]] = line[2] 40 | airports[line[5]] = line[2] 41 | except IOError as e: 42 | # The distributor likely removed airports.dat for licensing reasons. We'll just leave an empty dict. 
43 | pass 44 | 45 | read_dataset("countries.dat", countries, "iso", "name", is_dict=True) 46 | read_dataset("countries3.dat", countries, "iso3", "name", is_dict=True) 47 | read_dataset("states_au.dat", states_au, 0, 1) 48 | read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True) 49 | read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True) 50 | 51 | def precompile_regexes(source, flags=0): 52 | return [re.compile(regex, flags) for regex in source] 53 | 54 | grammar = { 55 | "_data": { 56 | 'id': ['Domain ID:[ ]*(?P.+)'], 57 | 'status': ['\[Status\]\s*(?P.+)', 58 | 'Status\s*:\s?(?P.+)', 59 | '\[State\]\s*(?P.+)', 60 | '^state:\s*(?P.+)'], 61 | 'creation_date': ['\[Created on\]\s*(?P.+)', 62 | 'Created on[.]*: [a-zA-Z]+, (?P.+)', 63 | 'Creation Date:\s?(?P.+)', 64 | 'Creation date\s*:\s?(?P.+)', 65 | 'Registration Date:\s?(?P.+)', 66 | 'Created Date:\s?(?P.+)', 67 | 'Created on:\s?(?P.+)', 68 | 'Created on\s?[.]*:\s?(?P.+)\.', 69 | 'Date Registered\s?[.]*:\s?(?P.+)', 70 | 'Domain Created\s?[.]*:\s?(?P.+)', 71 | 'Domain registered\s?[.]*:\s?(?P.+)', 72 | 'Domain record activated\s?[.]*:\s*?(?P.+)', 73 | 'Record created on\s?[.]*:?\s*?(?P.+)', 74 | 'Record created\s?[.]*:?\s*?(?P.+)', 75 | 'Created\s?[.]*:?\s*?(?P.+)', 76 | 'Registered on\s?[.]*:?\s*?(?P.+)', 77 | 'Registered\s?[.]*:?\s*?(?P.+)', 78 | 'Domain Create Date\s?[.]*:?\s*?(?P.+)', 79 | 'Domain Registration Date\s?[.]*:?\s*?(?P.+)', 80 | 'created:\s*(?P.+)', 81 | '\[Registered Date\]\s*(?P.+)', 82 | 'created-date:\s*(?P.+)', 83 | 'Domain Name Commencement Date: (?P.+)', 84 | 'registered:\s*(?P.+)', 85 | 'registration:\s*(?P.+)'], 86 | 'expiration_date': ['\[Expires on\]\s*(?P.+)', 87 | 'Registrar Registration Expiration Date:[ ]*(?P.+)-[0-9]{4}', 88 | 'Expires on[.]*: [a-zA-Z]+, (?P.+)', 89 | 'Expiration Date:\s?(?P.+)', 90 | 'Expiration date\s*:\s?(?P.+)', 91 | 'Expires on:\s?(?P.+)', 92 | 'Expires on\s?[.]*:\s?(?P.+)\.', 93 | 'Exp(?:iry)? 
Date\s?[.]*:\s?(?P.+)', 94 | 'Expiry\s*:\s?(?P.+)', 95 | 'Domain Currently Expires\s?[.]*:\s?(?P.+)', 96 | 'Record will expire on\s?[.]*:\s?(?P.+)', 97 | 'Domain expires\s?[.]*:\s*?(?P.+)', 98 | 'Record expires on\s?[.]*:?\s*?(?P.+)', 99 | 'Record expires\s?[.]*:?\s*?(?P.+)', 100 | 'Expires\s?[.]*:?\s*?(?P.+)', 101 | 'Expire Date\s?[.]*:?\s*?(?P.+)', 102 | 'Expired\s?[.]*:?\s*?(?P.+)', 103 | 'Domain Expiration Date\s?[.]*:?\s*?(?P.+)', 104 | 'paid-till:\s*(?P.+)', 105 | 'expiration_date:\s*(?P.+)', 106 | 'expire-date:\s*(?P.+)', 107 | 'renewal:\s*(?P.+)', 108 | 'expires:\s*(?P.+)', 109 | 'expire:\s*(?P.+)'], 110 | 'updated_date': ['\[Last Updated\]\s*(?P.+)', 111 | 'Record modified on[.]*: (?P.+) [a-zA-Z]+', 112 | 'Record last updated on[.]*: [a-zA-Z]+, (?P.+)', 113 | 'Updated Date:\s?(?P.+)', 114 | 'Updated date\s*:\s?(?P.+)', 115 | #'Database last updated on\s?[.]*:?\s*?(?P.+)\s[a-z]+\.?', 116 | 'Record last updated on\s?[.]*:?\s?(?P.+)\.', 117 | 'Domain record last updated\s?[.]*:\s*?(?P.+)', 118 | 'Domain Last Updated\s?[.]*:\s*?(?P.+)', 119 | 'Last updated on:\s?(?P.+)', 120 | 'Date Modified\s?[.]*:\s?(?P.+)', 121 | 'Last Modified\s?[.]*:\s?(?P.+)', 122 | 'Domain Last Updated Date\s?[.]*:\s?(?P.+)', 123 | 'Record last updated\s?[.]*:\s?(?P.+)', 124 | 'Modified\s?[.]*:\s?(?P.+)', 125 | '(C|c)hanged:\s*(?P.+)', 126 | 'last_update:\s*(?P.+)', 127 | 'Last Update\s?[.]*:\s?(?P.+)', 128 | 'Last updated on (?P.+) [a-z]{3,4}', 129 | 'Last updated:\s*(?P.+)', 130 | 'last-updated:\s*(?P.+)', 131 | '\[Last Update\]\s*(?P.+) \([A-Z]+\)', 132 | 'Last update of whois database:\s?[a-z]{3}, (?P.+) [a-z]{3,4}'], 133 | 'registrar': ['registrar:\s*(?P.+)', 134 | 'Registrar:\s*(?P.+)', 135 | 'Sponsoring Registrar Organization:\s*(?P.+)', 136 | 'Registered through:\s?(?P.+)', 137 | 'Registrar Name[.]*:\s?(?P.+)', 138 | 'Record maintained by:\s?(?P.+)', 139 | 'Registration Service Provided By:\s?(?P.+)', 140 | 'Registrar of Record:\s?(?P.+)', 141 | 'Domain Registrar :\s?(?P.+)', 
142 | 'Registration Service Provider: (?P.+)', 143 | '\tName:\t\s(?P.+)'], 144 | 'whois_server': ['Whois Server:\s?(?P.+)', 145 | 'Registrar Whois:\s?(?P.+)'], 146 | 'nameservers': ['Name Server:[ ]*(?P[^ ]+)', 147 | 'Nameservers:[ ]*(?P[^ ]+)', 148 | '(?<=[ .]{2})(?P([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', 149 | 'nameserver:\s*(?P.+)', 150 | 'nserver:\s*(?P[^[\s]+)', 151 | 'Name Server[.]+ (?P[^[\s]+)', 152 | 'Hostname:\s*(?P[^\s]+)', 153 | 'DNS[0-9]+:\s*(?P.+)', 154 | ' DNS:\s*(?P.+)', 155 | 'ns[0-9]+:\s*(?P.+)', 156 | 'NS [0-9]+\s*:\s*(?P.+)', 157 | '\[Name Server\]\s*(?P.+)', 158 | '(?<=[ .]{2})(?P[a-z0-9-]+\.d?ns[0-9]*\.([a-z0-9-]+\.)+[a-z0-9]+)', 159 | '(?<=[ .]{2})(?P([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', 160 | '(?<=[ .]{2})[^a-z0-9.-](?Pd?ns\.([a-z0-9-]+\.)+[a-z0-9]+)', 161 | 'Nserver:\s*(?P.+)'], 162 | 'emails': ['(?P[\w.-]+@[\w.-]+\.[\w]{2,6})', # Really need to fix this, much longer TLDs now exist... 163 | '(?P[\w.-]+\sAT\s[\w.-]+\sDOT\s[\w]{2,6})'] 164 | }, 165 | "_dateformats": ( 166 | '(?P[0-9]{1,2})[./ -](?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{4}|[0-9]{2})' 167 | '(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?', 168 | '[a-z]{3}\s(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{1,2})(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?\s[a-z]{3}\s(?P[0-9]{4}|[0-9]{2})', 169 | '[a-zA-Z]+\s(?P[0-9]{1,2})(?:st|nd|rd|th)\s(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s(?P[0-9]{4})', 170 | '(?P[0-9]{4})[./-]?(?P[0-9]{2})[./-]?(?P[0-9]{2})(\s|T|/)((?P[0-9]{1,2})[:.-](?P[0-9]{1,2})[:.-](?P[0-9]{1,2}))', 171 | '(?P[0-9]{4})[./-](?P[0-9]{1,2})[./-](?P[0-9]{1,2})', 172 | '(?P[0-9]{4})(?P[0-9]{1,2})(?P[0-9]{1,2})', 173 | '(?P[0-9]{1,2})[./ -](?P[0-9]{1,2})[./ -](?P[0-9]{4}|[0-9]{2})', 174 | '(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (?P[0-9]{1,2}),? 
(?P[0-9]{4})', 175 | '(?P[0-9]{1,2})-(?PJanuary|February|March|April|May|June|July|August|September|October|November|December)-(?P[0-9]{4})', 176 | ), 177 | "_months": { 178 | 'jan': 1, 179 | 'january': 1, 180 | 'feb': 2, 181 | 'february': 2, 182 | 'mar': 3, 183 | 'march': 3, 184 | 'apr': 4, 185 | 'april': 4, 186 | 'may': 5, 187 | 'jun': 6, 188 | 'june': 6, 189 | 'jul': 7, 190 | 'july': 7, 191 | 'aug': 8, 192 | 'august': 8, 193 | 'sep': 9, 194 | 'sept': 9, 195 | 'september': 9, 196 | 'oct': 10, 197 | 'october': 10, 198 | 'nov': 11, 199 | 'november': 11, 200 | 'dec': 12, 201 | 'december': 12 202 | } 203 | } 204 | 205 | def preprocess_regex(regex): 206 | # Fix for #2; prevents a ridiculous amount of varying size permutations. 207 | regex = re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\s*(?P<\1>\S.*)", regex) 208 | # Experimental fix for #18; removes unnecessary variable-size whitespace 209 | # matching, since we're stripping results anyway. 210 | regex = re.sub(r"\[ \]\*\(\?P<([^>]+)>\.\*\)", r"(?P<\1>.*)", regex) 211 | return regex 212 | 213 | registrant_regexes = [ 214 | " Registrant:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 215 | "Registrant:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 216 | "(?:Registrant ID:(?P.+)\n)?Registrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Street1?:(?P.*)\n(?:Registrant Street2:(?P.*)\n)?(?:Registrant Street3:(?P.*)\n)?Registrant City:(?P.*)\nRegistrant State/Province:(?P.*)\nRegistrant Postal Code:(?P.*)\nRegistrant Country:(?P.*)\nRegistrant Phone:(?P.*)\n(?:Registrant Phone Ext.:(?P.*)\n)?(?:Registrant FAX:(?P.*)\n)?(?:Registrant FAX Ext.:(?P.*)\n)?Registrant Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 217 | "Registrant ID:(?P.+)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Address1?:(?P.*)\n(?:Registrant Address2:(?P.*)\n)?(?:Registrant Address3:(?P.*)\n)?Registrant City:(?P.*)\nRegistrant State/Province:(?P.*)\nRegistrant Country/Economy:(?P.*)\nRegistrant Postal Code:(?P.*)\nRegistrant Phone:(?P.*)\n(?:Registrant Phone Ext.:(?P.*)\n)?(?:Registrant FAX:(?P.*)\n)?(?:Registrant FAX Ext.:(?P.*)\n)?Registrant E-mail:(?P.*)", # .ME, DotAsia 218 | "Registrant ID:\s*(?P.+)\nRegistrant Name:\s*(?P.+)\nRegistrant Organization:\s*(?P.*)\nRegistrant Address1:\s*(?P.+)\nRegistrant Address2:\s*(?P.*)\nRegistrant City:\s*(?P.+)\nRegistrant State/Province:\s*(?P.+)\nRegistrant Postal Code:\s*(?P.+)\nRegistrant Country:\s*(?P.+)\nRegistrant Country Code:\s*(?P.+)\nRegistrant Phone Number:\s*(?P.+)\nRegistrant Email:\s*(?P.+)\n", # .CO Internet 219 | "Registrant Contact: (?P.+)\nRegistrant Organization: (?P.+)\nRegistrant Name: (?P.+)\nRegistrant Street: (?P.+)\nRegistrant City: (?P.+)\nRegistrant Postal Code: (?P.+)\nRegistrant State: (?P.+)\nRegistrant Country: (?P.+)\nRegistrant Phone: (?P.*)\nRegistrant Phone Ext: (?P.*)\nRegistrant Fax: (?P.*)\nRegistrant Fax Ext: (?P.*)\nRegistrant Email: (?P.*)\n", # Key-Systems GmbH 220 | "(?:Registrant ID:[ ]*(?P.*)\n)?Registrant Name:[ ]*(?P.*)\n(?:Registrant Organization:[ ]*(?P.*)\n)?Registrant Street:[ ]*(?P.+)\n(?:Registrant Street:[ 
]*(?P.+)\n)?(?:Registrant Street:[ ]*(?P.+)\n)?Registrant City:[ ]*(?P.+)\nRegistrant State(?:\/Province)?:[ ]*(?P.*)\nRegistrant Postal Code:[ ]*(?P.+)\nRegistrant Country:[ ]*(?P.+)\n(?:Registrant Phone:[ ]*(?P.*)\n)?(?:Registrant Phone Ext:[ ]*(?P.*)\n)?(?:Registrant Fax:[ ]*(?P.*)\n)?(?:Registrant Fax Ext:[ ]*(?P.*)\n)?(?:Registrant Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 221 | "Registrant\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 222 | " Registrant Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 223 | "owner-id:[ ]*(?P.*)\n(?:owner-organization:[ ]*(?P.*)\n)?owner-name:[ ]*(?P.*)\nowner-street:[ ]*(?P.*)\nowner-city:[ ]*(?P.*)\nowner-zip:[ ]*(?P.*)\nowner-country:[ ]*(?P.*)\n(?:owner-phone:[ ]*(?P.*)\n)?(?:owner-fax:[ ]*(?P.*)\n)?owner-email:[ ]*(?P.*)", # InterNetworX 224 | "Registrant:\n registrant_org: (?P.*)\n registrant_name: (?P.*)\n registrant_email: (?P.*)\n registrant_address: (?P
.*)\n registrant_city: (?P.*)\n registrant_state: (?P.*)\n registrant_zip: (?P.*)\n registrant_country: (?P.*)\n registrant_phone: (?P.*)", # Bellnames 225 | "Holder of domain name:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\nContractual Language", # nic.ch 226 | "\n\n(?:Owner)?\s+: (?P.*)\n(?:\s+: (?P.*)\n)?\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n", # nic.io 227 | "Contact Information:\n\[Name\]\s*(?P.*)\n\[Email\]\s*(?P.*)\n\[Web Page\]\s*(?P.*)\n\[Postal code\]\s*(?P.*)\n\[Postal Address\]\s*(?P.*)\n(?:\s+(?P.*)\n)?(?:\s+(?P.*)\n)?\[Phone\]\s*(?P.*)\n\[Fax\]\s*(?P.*)\n", # jprs.jp 228 | "g\. \[Organization\] (?P.+)\n", # .co.jp registrations at jprs.jp 229 | "Registrant ID:(?P.*)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Address1:(?P.*)\n(?:Registrant Address2:(?P.*)\n)?(?:Registrant Address3:(?P.*)\n)?Registrant City:(?P.*)\n(?:Registrant State/Province:(?P.*)\n)?(?:Registrant Postal Code:(?P.*)\n)?Registrant Country:(?P.*)\nRegistrant Country Code:.*\nRegistrant Phone Number:(?P.*)\n(?:Registrant Facsimile Number:(?P.*)\n)?Registrant Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 230 | "Registrant\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it 231 | " Organisation Name[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n(?: Organisation Address[.]* (?P.*)\n)? Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)", # Melbourne IT (what a horrid format...) 
232 | "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Name:[ ]*(?P.+)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n", # .au business 233 | "Eligibility Type:[ ]*Citizen\/Resident\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au individual 234 | "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Type:[ ]*(Higher Education Institution|Company|Incorporated Association|Other)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au educational, company, 'incorporated association' (non-profit?), other (spotted for linux.conf.au, unsure if also for others) 235 | " Registrant:\n (?P.+)\n\n Registrant type:\n .*\n\n Registrant's address:\n The registrant .* opted to have", # Nominet (.uk) with hidden address 236 | " Registrant:\n (?P.+)\n\n[\s\S]* Registrant type:\n .*\n\n Registrant's address:\n (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)??)?? (?P[^0-9\n]+)\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n\n", # Nominet (.uk) with visible address 237 | "Domain Owner:\n\t(?P.+)\n\n[\s\S]*?(?:Registrant Contact:\n\t(?P.+))?\n\nRegistrant(?:'s)? (?:a|A)ddress:(?:\n\t(?P.+)\n(?:\t(?P.+)\n)?(?:\t(?P.+)\n)?\t(?P.+)\n\t(?P.+))?\n\t(?P.+)(?:\n\t(?P.+) \(Phone\)\n\t(?P.+) \(FAX\)\n\t(?P.+))?\n\n", # .ac.uk - what a mess... 
238 | "Registrant ID: (?P.+)\nRegistrant: (?P.+)\nRegistrant Contact Email: (?P.+)", # .cn (CNNIC) 239 | "Registrant contact:\n (?P.+)\n (?P.*)\n (?P.+), (?P.+) (?P.+) (?P.+)\n\n", # Fabulous.com 240 | "registrant-name:\s*(?P.+)\nregistrant-type:\s*(?P.+)\nregistrant-address:\s*(?P.+)\nregistrant-postcode:\s*(?P.+)\nregistrant-city:\s*(?P.+)\nregistrant-country:\s*(?P.+)\n(?:registrant-phone:\s*(?P.+)\n)?(?:registrant-email:\s*(?P.+)\n)?", # Hetzner 241 | "Registrant Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 242 | "Contact Information : For Customer # [0-9]+[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication alternative (private WHOIS) format? 243 | "Registrant:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 244 | " Registrant:\n (?P.+)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)", # .am 245 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P[^.,]+), (?P.+), (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 1 246 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 2 247 | "Domain Holder: (?P.+)\n(?P.+)\n(?:(?P.+)\n)?(?:(?P.+)\n)?.+?, (?P.+)\n(?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 3 248 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+),? 
(?P[A-Z]{2,3})(?: [A-Z0-9]+)?\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 4 249 | " Registrant:\n (?P.+)\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.+), (?P[^,\n]*)\n (?P.+)\n", # .com.tw (Western registrars) 250 | "Registrant:\n(?P.+)\n(?P.+)\n(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?))?)?)?)?)?)?,(?P.+),(?P.+)\n\n Contact:\n (?P.+) (?P.+)\n TEL: (?P.+?)(?:(?:#|ext.?)(?P.+))?\n FAX: (?P.+)(?:(?:#|ext.?)(?P.+))?\n", # .com.tw (TWNIC/SEEDNET, Taiwanese companies only?) 251 | "Registrant Contact Information:\n\nCompany English Name \(It should be the same as the registered/corporation name on your Business Register Certificate or relevant documents\):(?P.+)\nCompany Chinese name:(?P.+)\nAddress: (?P.+)\nCountry: (?P.+)\nEmail: (?P.+)\n", # HKDNR (.hk) 252 | "Registrant ID:(?P.+)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Street1:(?P.+?)\n(?:Registrant Street2:(?P.+?)\n(?:Registrant Street3:(?P.+?)\n)?)?Registrant City:(?P.+)\nRegistrant State:(?P.*)\nRegistrant Postal Code:(?P.+)\nRegistrant Country:(?P[A-Z]+)\nRegistrant Phone:(?P.*?)\nRegistrant Fax:(?P.*)\nRegistrant Email:(?P.+)\n", # Realtime Register 253 | "owner:\s+(?P.+)", # .br 254 | "person:\s+(?P.+)", # nic.ru (person) 255 | "org:\s+(?P.+)", # nic.ru (organization) 256 | ] 257 | 258 | tech_contact_regexes = [ 259 | " Technical Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 260 | "Technical Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 261 | "(?:Tech ID:(?P.+)\n)?Tech Name:(?P.*)\n(:?Tech Organization:(?P.*)\n)?Tech Street1?:(?P.*)\n(?:Tech Street2:(?P.*)\n)?(?:Tech Street3:(?P.*)\n)?Tech City:(?P.*)\nTech State/Province:(?P.*)\nTech Postal Code:(?P.*)\nTech Country:(?P.*)\nTech Phone:(?P.*)\n(?:Tech Phone Ext.:(?P.*)\n)?(?:Tech FAX:(?P.*)\n)?(?:Tech FAX Ext.:(?P.*)\n)?Tech Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 262 | "Tech(?:nical)? ID:(?P.+)\nTech(?:nical)? Name:(?P.*)\n(?:Tech(?:nical)? Organization:(?P.*)\n)?Tech(?:nical)? Address1?:(?P.*)\n(?:Tech(?:nical)? Address2:(?P.*)\n)?(?:Tech(?:nical)? Address3:(?P.*)\n)?Tech(?:nical)? City:(?P.*)\nTech(?:nical)? State/Province:(?P.*)\nTech(?:nical)? Country/Economy:(?P.*)\nTech(?:nical)? Postal Code:(?P.*)\nTech(?:nical)? Phone:(?P.*)\n(?:Tech(?:nical)? Phone Ext.:(?P.*)\n)?(?:Tech(?:nical)? FAX:(?P.*)\n)?(?:Tech(?:nical)? FAX Ext.:(?P.*)\n)?Tech(?:nical)? E-mail:(?P.*)", # .ME, DotAsia 263 | "Technical Contact ID:\s*(?P.+)\nTechnical Contact Name:\s*(?P.+)\nTechnical Contact Organization:\s*(?P.*)\nTechnical Contact Address1:\s*(?P.+)\nTechnical Contact Address2:\s*(?P.*)\nTechnical Contact City:\s*(?P.+)\nTechnical Contact State/Province:\s*(?P.+)\nTechnical Contact Postal Code:\s*(?P.+)\nTechnical Contact Country:\s*(?P.+)\nTechnical Contact Country Code:\s*(?P.+)\nTechnical Contact Phone Number:\s*(?P.+)\nTechnical Contact Email:\s*(?P.+)\n", # .CO Internet 264 | "Tech Contact: (?P.+)\nTech Organization: (?P.+)\nTech Name: (?P.+)\nTech Street: (?P.+)\nTech City: (?P.+)\nTech Postal Code: (?P.+)\nTech State: (?P.+)\nTech Country: (?P.+)\nTech Phone: (?P.*)\nTech Phone Ext: (?P.*)\nTech Fax: (?P.*)\nTech Fax Ext: (?P.*)\nTech Email: (?P.*)\n", # Key-Systems GmbH 265 | "(?:Tech ID:[ ]*(?P.*)\n)?Tech[ ]*Name:[ ]*(?P.*)\n(?:Tech[ ]*Organization:[ ]*(?P.*)\n)?Tech[ ]*Street:[ ]*(?P.+)\n(?:Tech[ ]*Street:[ ]*(?P.+)\n)?(?:Tech[ ]*Street:[ ]*(?P.+)\n)?Tech[ ]*City:[ 
]*(?P.+)\nTech[ ]*State(?:\/Province)?:[ ]*(?P.*)\nTech[ ]*Postal[ ]*Code:[ ]*(?P.+)\nTech[ ]*Country:[ ]*(?P.+)\n(?:Tech[ ]*Phone:[ ]*(?P.*)\n)?(?:Tech[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Tech[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 266 | "Technical Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 267 | " Technical Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 268 | "tech-id:[ ]*(?P.*)\n(?:tech-organization:[ ]*(?P.*)\n)?tech-name:[ ]*(?P.*)\ntech-street:[ ]*(?P.*)\ntech-city:[ ]*(?P.*)\ntech-zip:[ ]*(?P.*)\ntech-country:[ ]*(?P.*)\n(?:tech-phone:[ ]*(?P.*)\n)?(?:tech-fax:[ ]*(?P.*)\n)?tech-email:[ ]*(?P.*)", # InterNetworX 269 | "Technical Contact:\n tech_org: (?P.*)\n tech_name: (?P.*)\n tech_email: (?P.*)\n tech_address: (?P
.*)\n tech_city: (?P.*)\n tech_state: (?P.*)\n tech_zip: (?P.*)\n tech_country: (?P.*)\n tech_phone: (?P.*)", # Bellnames 270 | "Technical contact:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\n\n", # nic.ch 271 | "Tech Contact ID:[ ]*(?P.+)\nTech Contact Name:[ ]*(?P.+)", # .au 272 | "Technical Contact ID:(?P.*)\nTechnical Contact Name:(?P.*)\n(?:Technical Contact Organization:(?P.*)\n)?Technical Contact Address1:(?P.*)\n(?:Technical Contact Address2:(?P.*)\n)?(?:Technical Contact Address3:(?P.*)\n)?Technical Contact City:(?P.*)\n(?:Technical Contact State/Province:(?P.*)\n)?(?:Technical Contact Postal Code:(?P.*)\n)?Technical Contact Country:(?P.*)\nTechnical Contact Country Code:.*\nTechnical Contact Phone Number:(?P.*)\n(?:Technical Contact Facsimile Number:(?P.*)\n)?Technical Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 273 | "Technical Contacts\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it // NOTE: Why does this say 'Contacts'? Can it have multiple? 274 | "Tech Name[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n(?: Tech Address[.]* (?P.*)\n)? Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Email[.]* (?P.*)\n Tech Phone[.]* (?P.*)\n Tech Fax[.]* (?P.*)", # Melbourne IT 275 | "Technical contact:\n(?: (?P.+)\n)? 
(?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 276 | "tech-c-name:\s*(?P.+)\ntech-c-type:\s*(?P.+)\ntech-c-address:\s*(?P.+)\ntech-c-postcode:\s*(?P.+)\ntech-c-city:\s*(?P.+)\ntech-c-country:\s*(?P.+)\n(?:tech-c-phone:\s*(?P.+)\n)?(?:tech-c-email:\s*(?P.+)\n)?", # Hetzner 277 | "Admin Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 278 | " Technical contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am 279 | "Technical:\n\s*Name:\s*(?P.*)\n\s*Organisation:\s*(?P.*)\n\s*Language:.*\n\s*Phone:\s*(?P.*)\n\s*Fax:\s*(?P.*)\n\s*Email:\s*(?P.*)\n", # EURid 280 | "\[Zone-C\]\nType: (?P.+)\nName: (?P.+)\n(Organisation: (?P.+)\n){0,1}(Address: (?P.+)\n){1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}PostalCode: (?P.+)\nCity: (?P.+)\nCountryCode: (?P[A-Za-z]{2})\nPhone: (?P.+)\nFax: (?P.+)\nEmail: (?P.+)\n(Remarks: (?P.+)\n){0,1}Changed: (?P.+)", # DeNIC 281 | "Technical Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 282 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+),? 
(?P[A-Z]{2,3})(?: [A-Z0-9]+)?\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 1 283 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+), (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 2 284 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 3 285 | "Tech Contact: (?P.+)\n(?P.+) (?P[^\s]+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 4 286 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+)\n(?P.+) (?P[^\s]+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 5 287 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+)\n(?P.+)\n(?:(?P.+)\n)?(?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 6 288 | " Technical Contact:\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n", # .com.tw (Western registrars) 289 | "Technical Contact Information:\n\n(?:Given name: (?P.+)\n)?(?:Family name: (?P.+)\n)?(?:Company name: (?P.+)\n)?Address: (?P.+)\nCountry: (?P.+)\nPhone: (?P.*)\nFax: (?P.*)\nEmail: (?P.+)\n(?:Account Name: (?P.+)\n)?", # HKDNR (.hk) 290 | "TECH ID:(?P.+)\nTECH Name:(?P.*)\n(?:TECH Organization:(?P.*)\n)?TECH Street1:(?P.+?)\n(?:TECH Street2:(?P.+?)\n(?:TECH Street3:(?P.+?)\n)?)?TECH City:(?P.+)\nTECH State:(?P.*)\nTECH Postal Code:(?P.+)\nTECH Country:(?P[A-Z]+)\nTECH Phone:(?P.*?)\nTECH Fax:(?P.*)\nTECH Email:(?P.+)\n", # Realtime Register 291 | ] 292 | 293 | admin_contact_regexes = [ 294 | " Administrative Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 295 | "Administrative Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 296 | "(?:Admin ID:(?P.+)\n)?Admin Name:(?P.*)\n(?:Admin Organization:(?P.*)\n)?Admin Street1?:(?P.*)\n(?:Admin Street2:(?P.*)\n)?(?:Admin Street3:(?P.*)\n)?Admin City:(?P.*)\nAdmin State/Province:(?P.*)\nAdmin Postal Code:(?P.*)\nAdmin Country:(?P.*)\nAdmin Phone:(?P.*)\n(?:Admin Phone Ext.:(?P.*)\n)?(?:Admin FAX:(?P.*)\n)?(?:Admin FAX Ext.:(?P.*)\n)?Admin Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 297 | "Admin(?:istrative)? ID:(?P.+)\nAdmin(?:istrative)? Name:(?P.*)\n(?:Admin(?:istrative)? Organization:(?P.*)\n)?Admin(?:istrative)? Address1?:(?P.*)\n(?:Admin(?:istrative)? Address2:(?P.*)\n)?(?:Admin(?:istrative)? Address3:(?P.*)\n)?Admin(?:istrative)? City:(?P.*)\nAdmin(?:istrative)? State/Province:(?P.*)\nAdmin(?:istrative)? Country/Economy:(?P.*)\nAdmin(?:istrative)? Postal Code:(?P.*)\nAdmin(?:istrative)? Phone:(?P.*)\n(?:Admin(?:istrative)? Phone Ext.:(?P.*)\n)?(?:Admin(?:istrative)? FAX:(?P.*)\n)?(?:Admin(?:istrative)? FAX Ext.:(?P.*)\n)?Admin(?:istrative)? 
E-mail:(?P.*)", # .ME, DotAsia 298 | "Administrative Contact ID:\s*(?P.+)\nAdministrative Contact Name:\s*(?P.+)\nAdministrative Contact Organization:\s*(?P.*)\nAdministrative Contact Address1:\s*(?P.+)\nAdministrative Contact Address2:\s*(?P.*)\nAdministrative Contact City:\s*(?P.+)\nAdministrative Contact State/Province:\s*(?P.+)\nAdministrative Contact Postal Code:\s*(?P.+)\nAdministrative Contact Country:\s*(?P.+)\nAdministrative Contact Country Code:\s*(?P.+)\nAdministrative Contact Phone Number:\s*(?P.+)\nAdministrative Contact Email:\s*(?P.+)\n", # .CO Internet 299 | "Admin Contact: (?P.+)\nAdmin Organization: (?P.+)\nAdmin Name: (?P.+)\nAdmin Street: (?P.+)\nAdmin City: (?P.+)\nAdmin State: (?P.+)\nAdmin Postal Code: (?P.+)\nAdmin Country: (?P.+)\nAdmin Phone: (?P.*)\nAdmin Phone Ext: (?P.*)\nAdmin Fax: (?P.*)\nAdmin Fax Ext: (?P.*)\nAdmin Email: (?P.*)\n", # Key-Systems GmbH 300 | "(?:Admin ID:[ ]*(?P.*)\n)?Admin[ ]*Name:[ ]*(?P.*)\n(?:Admin[ ]*Organization:[ ]*(?P.*)\n)?Admin[ ]*Street:[ ]*(?P.+)\n(?:Admin[ ]*Street:[ ]*(?P.+)\n)?(?:Admin[ ]*Street:[ ]*(?P.+)\n)?Admin[ ]*City:[ ]*(?P.+)\nAdmin[ ]*State(?:\/Province)?:[ ]*(?P.*)\nAdmin[ ]*Postal[ ]*Code:[ ]*(?P.+)\nAdmin[ ]*Country:[ ]*(?P.+)\n(?:Admin[ ]*Phone:[ ]*(?P.*)\n)?(?:Admin[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Admin[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 301 | "Administrative Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 302 | " Administrative Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. 
(?P.*)", # Whois.com 303 | "admin-id:[ ]*(?P.*)\n(?:admin-organization:[ ]*(?P.*)\n)?admin-name:[ ]*(?P.*)\nadmin-street:[ ]*(?P.*)\nadmin-city:[ ]*(?P.*)\nadmin-zip:[ ]*(?P.*)\nadmin-country:[ ]*(?P.*)\n(?:admin-phone:[ ]*(?P.*)\n)?(?:admin-fax:[ ]*(?P.*)\n)?admin-email:[ ]*(?P.*)", # InterNetworX 304 | "Administrative Contact:\n admin_org: (?P.*)\n admin_name: (?P.*)\n admin_email: (?P.*)\n admin_address: (?P
.*)\n admin_city: (?P.*)\n admin_state: (?P.*)\n admin_zip: (?P.*)\n admin_country: (?P.*)\n admin_phone: (?P.*)", # Bellnames 305 | "Administrative Contact ID:(?P.*)\nAdministrative Contact Name:(?P.*)\n(?:Administrative Contact Organization:(?P.*)\n)?Administrative Contact Address1:(?P.*)\n(?:Administrative Contact Address2:(?P.*)\n)?(?:Administrative Contact Address3:(?P.*)\n)?Administrative Contact City:(?P.*)\n(?:Administrative Contact State/Province:(?P.*)\n)?(?:Administrative Contact Postal Code:(?P.*)\n)?Administrative Contact Country:(?P.*)\nAdministrative Contact Country Code:.*\nAdministrative Contact Phone Number:(?P.*)\n(?:Administrative Contact Facsimile Number:(?P.*)\n)?Administrative Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 306 | "Admin Contact\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it 307 | "Admin Name[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n(?: Admin Address[.]* (?P.*)\n)? Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Email[.]* (?P.*)\n Admin Phone[.]* (?P.*)\n Admin Fax[.]* (?P.*)", # Melbourne IT 308 | "Administrative contact:\n(?: (?P.+)\n)? 
(?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 309 | "admin-c-name:\s*(?P.+)\nadmin-c-type:\s*(?P.+)\nadmin-c-address:\s*(?P.+)\nadmin-c-postcode:\s*(?P.+)\nadmin-c-city:\s*(?P.+)\nadmin-c-country:\s*(?P.+)\n(?:admin-c-phone:\s*(?P.+)\n)?(?:admin-c-email:\s*(?P.+)\n)?", # Hetzner 310 | "Tech Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 311 | " Administrative contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am 312 | "Administrative Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 313 | "\[Tech-C\]\nType: (?P.+)\nName: (?P.+)\n(Organisation: (?P.+)\n){0,1}(Address: (?P.+)\n){1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}PostalCode: (?P.+)\nCity: (?P.+)\nCountryCode: (?P[A-Za-z]{2})\nPhone: (?P.+)\nFax: (?P.+)\nEmail: (?P.+)\n(Remarks: (?P.+)\n){0,1}Changed: (?P.+)", # DeNIC 314 | " Administrative Contact:\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n", # .com.tw (Western registrars) 315 | "Administrative Contact Information:\n\n(?:Given name: (?P.+)\n)?(?:Family name: (?P.+)\n)?(?:Company name: (?P.+)\n)?Address: (?P.+)\nCountry: (?P.+)\nPhone: (?P.*)\nFax: (?P.*)\nEmail: (?P.+)\n(?:Account Name: (?P.+)\n)?", # HKDNR (.hk) 316 | "ADMIN ID:(?P.+)\nADMIN Name:(?P.*)\n(?:ADMIN Organization:(?P.*)\n)?ADMIN Street1:(?P.+?)\n(?:ADMIN Street2:(?P.+?)\n(?:ADMIN Street3:(?P.+?)\n)?)?ADMIN City:(?P.+)\nADMIN State:(?P.*)\nADMIN Postal Code:(?P.+)\nADMIN Country:(?P[A-Z]+)\nADMIN Phone:(?P.*?)\nADMIN Fax:(?P.*)\nADMIN Email:(?P.+)\n", # Realtime Register 317 | ] 318 | 319 | billing_contact_regexes = [ 320 | "(?:Billing ID:(?P.+)\n)?Billing Name:(?P.*)\nBilling Organization:(?P.*)\nBilling Street1:(?P.*)\n(?:Billing Street2:(?P.*)\n)?(?:Billing Street3:(?P.*)\n)?Billing City:(?P.*)\nBilling 
State/Province:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Country:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing Email:(?P.*)", # nic.pw, No-IP.com 321 | "Billing ID:(?P.+)\nBilling Name:(?P.*)\n(?:Billing Organization:(?P.*)\n)?Billing Address1?:(?P.*)\n(?:Billing Address2:(?P.*)\n)?(?:Billing Address3:(?P.*)\n)?Billing City:(?P.*)\nBilling State/Province:(?P.*)\nBilling Country/Economy:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing E-mail:(?P.*)", # DotAsia 322 | "Billing Contact ID:\s*(?P.+)\nBilling Contact Name:\s*(?P.+)\nBilling Contact Organization:\s*(?P.*)\nBilling Contact Address1:\s*(?P.+)\nBilling Contact Address2:\s*(?P.*)\nBilling Contact City:\s*(?P.+)\nBilling Contact State/Province:\s*(?P.+)\nBilling Contact Postal Code:\s*(?P.+)\nBilling Contact Country:\s*(?P.+)\nBilling Contact Country Code:\s*(?P.+)\nBilling Contact Phone Number:\s*(?P.+)\nBilling Contact Email:\s*(?P.+)\n", # .CO Internet 323 | "Billing Contact: (?P.+)\nBilling Organization: (?P.+)\nBilling Name: (?P.+)\nBilling Street: (?P.+)\nBilling City: (?P.+)\nBilling Postal Code: (?P.+)\nBilling State: (?P.+)\nBilling Country: (?P.+)\nBilling Phone: (?P.*)\nBilling Phone Ext: (?P.*)\nBilling Fax: (?P.*)\nBilling Fax Ext: (?P.*)\nBilling Email: (?P.*)\n", # Key-Systems GmbH 324 | "(?:Billing ID:[ ]*(?P.*)\n)?Billing[ ]*Name:[ ]*(?P.*)\n(?:Billing[ ]*Organization:[ ]*(?P.*)\n)?Billing[ ]*Street:[ ]*(?P.+)\n(?:Billing[ ]*Street:[ ]*(?P.+)\n)?Billing[ ]*City:[ ]*(?P.+)\nBilling[ ]*State\/Province:[ ]*(?P.+)\nBilling[ ]*Postal[ ]*Code:[ ]*(?P.+)\nBilling[ ]*Country:[ ]*(?P.+)\n(?:Billing[ ]*Phone:[ ]*(?P.*)\n)?(?:Billing[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Billing[ ]*Email:[ ]*(?P.+)\n)?", # Musedoma (.museum) 325 | "Billing Contact:\n 
(?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 326 | " Billing Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 327 | "billing-id:[ ]*(?P.*)\n(?:billing-organization:[ ]*(?P.*)\n)?billing-name:[ ]*(?P.*)\nbilling-street:[ ]*(?P.*)\nbilling-city:[ ]*(?P.*)\nbilling-zip:[ ]*(?P.*)\nbilling-country:[ ]*(?P.*)\n(?:billing-phone:[ ]*(?P.*)\n)?(?:billing-fax:[ ]*(?P.*)\n)?billing-email:[ ]*(?P.*)", # InterNetworX 328 | "Billing Contact:\n bill_org: (?P.*)\n bill_name: (?P.*)\n bill_email: (?P.*)\n bill_address: (?P
.*)\n bill_city: (?P.*)\n bill_state: (?P.*)\n bill_zip: (?P.*)\n bill_country: (?P.*)\n bill_phone: (?P.*)", # Bellnames 329 | "Billing Contact ID:(?P.*)\nBilling Contact Name:(?P.*)\n(?:Billing Contact Organization:(?P.*)\n)?Billing Contact Address1:(?P.*)\n(?:Billing Contact Address2:(?P.*)\n)?(?:Billing Contact Address3:(?P.*)\n)?Billing Contact City:(?P.*)\n(?:Billing Contact State/Province:(?P.*)\n)?(?:Billing Contact Postal Code:(?P.*)\n)?Billing Contact Country:(?P.*)\nBilling Contact Country Code:.*\nBilling Contact Phone Number:(?P.*)\n(?:Billing Contact Facsimile Number:(?P.*)\n)?Billing Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 330 | "Billing contact:\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 331 | "Billing Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 332 | "Billing Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 333 | "BILLING ID:(?P.+)\nBILLING Name:(?P.*)\n(?:BILLING Organization:(?P.*)\n)?BILLING Street1:(?P.+?)\n(?:BILLING Street2:(?P.+?)\n(?:BILLING Street3:(?P.+?)\n)?)?BILLING City:(?P.+)\nBILLING State:(?P.*)\nBILLING Postal Code:(?P.+)\nBILLING Country:(?P[A-Z]+)\nBILLING Phone:(?P.*?)\nBILLING Fax:(?P.*)\nBILLING Email:(?P.+)\n", # Realtime Register 334 | ] 335 | 336 | # Some registries use NIC handle references instead of directly listing contacts... 
337 | nic_contact_references = { 338 | "registrant": [ 339 | "registrant:\s*(?P.+)", # nic.at 340 | "owner-contact:\s*(?P.+)", # LCN.com 341 | "holder-c:\s*(?P.+)", # AFNIC 342 | "holder:\s*(?P.+)", # iis.se (they apparently want to be difficult, and won't give you contact info for the handle over their WHOIS service) 343 | ], 344 | "tech": [ 345 | "tech-c:\s*(?P.+)", # nic.at, AFNIC, iis.se 346 | "technical-contact:\s*(?P.+)", # LCN.com 347 | "n\. \[Technical Contact\] (?P.+)\n", #.co.jp 348 | ], 349 | "admin": [ 350 | "admin-c:\s*(?P.+)", # nic.at, AFNIC, iis.se 351 | "admin-contact:\s*(?P.+)", # LCN.com 352 | "m\. \[Administrative Contact\] (?P.+)\n", # .co.jp 353 | ], 354 | "billing": [ 355 | "billing-c:\s*(?P.+)", # iis.se 356 | "billing-contact:\s*(?P.+)", # LCN.com 357 | ] 358 | } 359 | 360 | # Why do the below? The below is meant to handle with an edge case (issue #2) where a partial match followed 361 | # by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose hunt for 362 | # matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the regexes 363 | # consistent and compact, it's more practical to do this (predictable) conversion on runtime. 364 | # FIXME: This breaks on NIC contact regex for nic.at. Why? 
# Apply the runtime "\s*.+" -> "\S.*" conversion described in the comment block
# above to every contact regex list, so partial matches cannot trigger
# pathological backtracking (issue #2).
registrant_regexes = [preprocess_regex(regex) for regex in registrant_regexes]
tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes]
admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes]
billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes]

# Patterns that parse a full NIC handle contact record, for registries that
# publish handle references instead of inline contact data. Consumed by
# parse_nic_contact() further down in this file.
# NOTE(review): the first two entries contain bare "(?P.+)" fragments - the
# "<name>" portion of the named groups appears to have been stripped when this
# file was copied (later entries keep names such as (?P<phone>...) intact).
# As written those two patterns are invalid regexes; restore the group names
# from upstream pythonwhois before relying on them.
nic_contact_regexes = [
    "personname:\s*(?P.+)\norganization:\s*(?P.+)\nstreet address:\s*(?P.+)\npostal code:\s*(?P.+)\ncity:\s*(?P.+)\ncountry:\s*(?P.+)\n(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:e-mail:\s*(?P.+)\n)?nic-hdl:\s*(?P.+)\nchanged:\s*(?P.+)", # nic.at
    "contact-handle:[ ]*(?P.+)\ncontact:[ ]*(?P.+)\n(?:organisation:[ ]*(?P.+)\n)?address:[ ]*(?P.+)\n(?:address:[ ]*(?P.+)\n)?(?:address:[ ]*(?P.+)\n)?(?:address:[ ]*(?P.+)\n)?address:[ ]*(?P.+)\naddress:[ ]*(?P.+)\naddress:[ ]*(?P.+)\naddress:[ ]*(?P.+)\n(?:phone:[ ]*(?P.+)\n)?(?:fax:[ ]*(?P.+)\n)?(?:email:[ ]*(?P.+)\n)?", # LCN.com
    "Contact Information:\na\. \[JPNIC Handle\] (?P.+)\nc\. \[Last, First\] (?P.+), (?P.+)\nd\. \[E-Mail\] (?P.+)\ng\. \[Organization\] (?P.+)\nl\. \[Division\] (?P.+)\nn\. \[Title\] (?P.+)\no\. \[TEL\] (?P<phone>.+)\np\. \[FAX\] (?P<fax>.+)\ny\. \[Reply Mail\] .*\n\[Last Update\] (?P<changedate>.+) \(JST\)\n", # JPRS .co.jp contact handle lookup
    "person:\s*(?P<name>.+)\nnic-hdl:\s*(?P<handle>.+)\n", # .ie
    "nic-hdl:\s+(?P<handle>.+)\nperson:\s+(?P<name>.+)\n(?:e-mail:\s+(?P<email>.+)\n)?(?:address:\s+(?P<street1>.+?)(?:,+ (?P<street2>.+?)(?:,+ (?P<street3>.+?)(?:,+ (?P<street4>.+?)(?:,+ (?P<street5>.+?)(?:,+ (?P<street6>.+?)(?:,+ (?P<street7>.+?))?)?)?)?)?)?, (?P<city>.+), (?P<state>.+), (?P<country>.+)\n)?(?:phone:\s+(?P<phone>.+)\n)?(?:fax-no:\s+(?P<fax>.+)\n)?", # nic.ir, individual - this is a nasty one.
    "nic-hdl:\s+(?P<handle>.+)\norg:\s+(?P<organization>.+)\n(?:e-mail:\s+(?P<email>.+)\n)?(?:address:\s+(?P<street1>.+?)(?:,+ (?P<street2>.+?)(?:,+ (?P<street3>.+?)(?:,+ (?P<street4>.+?)(?:,+ (?P<street5>.+?)(?:,+ (?P<street6>.+?)(?:,+ (?P<street7>.+?))?)?)?)?)?)?, (?P<city>.+), (?P<state>.+), (?P<country>.+)\n)?(?:phone:\s+(?P<phone>.+)\n)?(?:fax-no:\s+(?P<fax>.+)\n)?", # nic.ir, organization
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\naddress:\s*(?P<street2>.+)\naddress:\s*(?P<street3>.+)\naddress:\s*(?P<country>.+)\n)?(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness without country field
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\n)?(?:address:\s*(?P<street2>.+)\n)?(?:address:\s*(?P<street3>.+)\n)?(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness any country -at all-
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\n)?(?:address:\s*(?P<street2>.+)\n)?(?:address:\s*(?P<street3>.+)\n)?(?:address:\s*(?P<street4>.+)\n)?country:\s*(?P<country>.+)\n(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness with country field
]

# Company-suffix patterns (Ltd, Inc, GmbH, ...). normalize_data() uses these to
# detect lines that are organization names rather than person names.
organization_regexes = (
    r"\sltd\.?($|\s)",
    r"\sco\.?($|\s)",
    r"\scorp\.?($|\s)",
    r"\sinc\.?($|\s)",
    r"\ss\.?p\.?a\.?($|\s)",
    r"\ss\.?(c\.?)?r\.?l\.?($|\s)",
    r"\ss\.?a\.?s\.?($|\s)",
    r"\sa\.?g\.?($|\s)",
    r"\sn\.?v\.?($|\s)",
    r"\sb\.?v\.?($|\s)",
    r"\sp\.?t\.?y\.?($|\s)",
    r"\sp\.?l\.?c\.?($|\s)",
    r"\sv\.?o\.?f\.?($|\s)",
    r"\sb\.?v\.?b\.?a\.?($|\s)",
    r"\sg\.?m\.?b\.?h\.?($|\s)",
    r"\ss\.?a\.?r\.?l\.?($|\s)",
)

# Compile everything once at import time; the simple grammar fields and the
# organization suffixes are matched case-insensitively, the contact record
# regexes are case-sensitive.
grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE)
grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)

grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE)

registrant_regexes = precompile_regexes(registrant_regexes)
tech_contact_regexes = precompile_regexes(tech_contact_regexes)
billing_contact_regexes = precompile_regexes(billing_contact_regexes)
admin_contact_regexes = precompile_regexes(admin_contact_regexes)
nic_contact_regexes = precompile_regexes(nic_contact_regexes)
organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)

nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"])
nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])

if sys.version_info < (3, 0):
    def is_string(data):
        """Test for string with support for python 2."""
        return isinstance(data, basestring)
else:
    def is_string(data):
        """Test for string with support for python 3."""
        return isinstance(data, str)


def parse_raw_whois(raw_data, normalized=None, never_query_handles=True, handle_server=""):
    """Parse raw WHOIS response segments into a structured dict.

    raw_data            -- list of raw WHOIS response strings (one per queried server)
    normalized          -- list of keys to normalize, or True for all (see normalize_data)
    never_query_handles -- if False, unresolved NIC handles may be looked up over the network
    handle_server       -- WHOIS server to use for such handle lookups

    Returns a dict with any of: id, status, creation_date, expiration_date,
    updated_date, registrar, whois_server, nameservers, emails, plus a
    "contacts" dict (registrant/tech/admin/billing) and the "raw" input.
    """
    normalized = normalized or []
    data = {}

    raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil

    for segment in raw_data:
        # Generic line-by-line extraction: first segment that yields a value
        # for a key wins; later segments can't add to an already-filled key.
        for rule_key, rule_regexes in grammar['_data'].items():
            if rule_key not in data:
                for line in segment.splitlines():
                    for regex in rule_regexes:
                        result = re.search(regex, line)

                        if result is not None:
                            val = result.group("val").strip()
                            if val != "":
                                try:
                                    data[rule_key].append(val)
                                except KeyError as e:
                                    data[rule_key] = [val]

        # Registry-specific special cases follow. NOTE(review): several literal
        # patterns below depend on exact run-lengths of spaces in registry
        # output; verify the space counts against upstream pythonwhois, as
        # whitespace runs may have been collapsed when this file was copied.
        # Whois.com is a bit special... Fabulous.com also seems to use this format. As do some others.
        match = re.search("^\s?Name\s?[Ss]ervers:?\s*\n((?:\s*.+\n)+?\s?)\n", segment, re.MULTILINE)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("[ ]*(.+)\n", chunk):
                if match.strip() != "":
                    if not re.match("^[a-zA-Z]+:", match):
                        try:
                            data["nameservers"].append(match.strip())
                        except KeyError as e:
                            data["nameservers"] = [match.strip()]
        # Nominet also needs some special attention
        match = re.search(" Registrar:\n (.+)\n", segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        match = re.search(" Registration status:\n (.+)\n", segment)
        if match is not None:
            data["status"] = [match.group(1).strip()]
        match = re.search(" Name servers:\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]  # Drop trailing glue-record IPs
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # janet (.ac.uk) is kinda like Nominet, but also kinda not
        match = re.search("Registered By:\n\t(.+)\n", segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        match = re.search("Entry created:\n\t(.+)\n", segment)
        if match is not None:
            data["creation_date"] = [match.group(1).strip()]
        match = re.search("Renewal date:\n\t(.+)\n", segment)
        if match is not None:
            data["expiration_date"] = [match.group(1).strip()]
        match = re.search("Entry updated:\n\t(.+)\n", segment)
        if match is not None:
            data["updated_date"] = [match.group(1).strip()]
        match = re.search("Servers:([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("\t(.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # .am plays the same game
        match = re.search(" DNS servers:([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # SIDN isn't very standard either. And EURid uses a similar format.
        match = re.search("Registrar:\n\s+(?:Name:\s*)?(\S.*)", segment)
        if match is not None:
            data.setdefault("registrar", []).insert(0, match.group(1).strip())
        match = re.search("(?:Domain nameservers|Name servers):([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("\s+?(.+)\n", chunk):
                match = match.split()[0]
                # Prevent nameserver aliases from being picked up.
                if not match.startswith("[") and not match.endswith("]"):
                    try:
                        data["nameservers"].append(match.strip())
                    except KeyError as e:
                        data["nameservers"] = [match.strip()]
        # The .ie WHOIS server puts ambiguous status information in an unhelpful order
        match = re.search('ren-status:\s*(.+)', segment)
        if match is not None:
            # setdefault guards against a KeyError when no status was parsed
            # yet (same pattern as the SIDN registrar branch above).
            data.setdefault("status", []).insert(0, match.group(1).strip())
        # nic.it gives us the registrar in a multi-line format...
        match = re.search('Registrar\n Organization: (.+)\n', segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        # HKDNR (.hk) provides a weird nameserver format with too much whitespace
        match = re.search("Name Servers Information:\n\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("(.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # ... and again for TWNIC.
        match = re.search(" Domain servers in listed order:\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]


    data["contacts"] = parse_registrants(raw_data, never_query_handles, handle_server)

    # Parse dates
    try:
        data['expiration_date'] = remove_duplicates(data['expiration_date'])
        data['expiration_date'] = parse_dates(data['expiration_date'])
    except KeyError as e:
        pass # Not present
    try:
        data['creation_date'] = remove_duplicates(data['creation_date'])
        data['creation_date'] = parse_dates(data['creation_date'])
    except KeyError as e:
        pass # Not present

    try:
        data['updated_date'] = remove_duplicates(data['updated_date'])
        data['updated_date'] = parse_dates(data['updated_date'])
    except KeyError as e:
        pass # Not present

    try:
        data['nameservers'] = remove_suffixes(data['nameservers'])
        data['nameservers'] = remove_duplicates([ns.rstrip(".") for ns in data['nameservers']])
    except KeyError as e:
        pass # Not present

    try:
        data['emails'] = remove_duplicates(data['emails'])
    except KeyError as e:
        pass # Not present

    try:
        data['registrar'] = remove_duplicates(data['registrar'])
    except KeyError as e:
        pass # Not present

    # Remove e-mail addresses if they are already listed for any of the contacts
    known_emails = []
    for contact in ("registrant", "tech", "admin", "billing"):
        if data["contacts"][contact] is not None:
            try:
                known_emails.append(data["contacts"][contact]["email"])
            except KeyError as e:
                pass # No e-mail recorded for this contact...
    try:
        data['emails'] = [email for email in data["emails"] if email not in known_emails]
    except KeyError as e:
        pass # Not present

    # Drop empty keys so callers can rely on "key present => data present".
    for key in list(data.keys()):
        if data[key] is None or len(data[key]) == 0:
            del data[key]

    data["raw"] = raw_data

    if normalized != []:
        data = normalize_data(data, normalized)

    return data

def normalize_data(data, normalized):
    """Normalize casing/formatting of selected fields in-place and return data.

    normalized is either True (normalize everything) or a list of key names to
    normalize. Also maps country/airport-city codes and US/AU/CA state
    abbreviations to full names, and splits organization names out of
    name/street fields.
    """
    for key in ("nameservers", "emails", "whois_server"):
        if key in data and data[key] is not None and (normalized == True or key in normalized):
            if is_string(data[key]):
                data[key] = data[key].lower()
            else:
                data[key] = [item.lower() for item in data[key]]

    for key, threshold in (("registrar", 4), ("status", 3)):
        if key == "registrar":
            ignore_nic = True  # Registrar names containing 'NIC' stay uppercase
        else:
            ignore_nic = False
        if key in data and data[key] is not None and (normalized == True or key in normalized):
            if is_string(data[key]):
                data[key] = normalize_name(data[key], abbreviation_threshold=threshold, length_threshold=1, ignore_nic=ignore_nic)
            else:
                data[key] = [normalize_name(item, abbreviation_threshold=threshold, length_threshold=1, ignore_nic=ignore_nic) for item in data[key]]

    for contact_type, contact in data['contacts'].items():
        if contact is not None:
            # Expand country codes / airport city codes / state abbreviations.
            if 'country' in contact and contact['country'] in countries:
                contact['country'] = countries[contact['country']]
            if 'city' in contact and contact['city'] in airports:
                contact['city'] = airports[contact['city']]
            if 'country' in contact and 'state' in contact:
                for country, source in (("united states", states_us), ("australia", states_au), ("canada", states_ca)):
                    if country in contact["country"].lower() and contact["state"] in source:
                        contact["state"] = source[contact["state"]]

            for key in ("email",):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    if is_string(contact[key]):
                        contact[key] = contact[key].lower()
                    else:
                        contact[key] = [item.lower() for item in contact[key]]

            for key in ("name", "street"):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    contact[key] = normalize_name(contact[key], abbreviation_threshold=3)

            for key in ("city", "organization", "state", "country"):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3)

            # Move lines that look like company names (per organization_regexes)
            # from the "name" field into "organization".
            if "name" in contact and "organization" not in contact:
                lines = [x.strip() for x in contact["name"].splitlines()]
                new_lines = []
                for i, line in enumerate(lines):
                    for regex in organization_regexes:
                        if re.search(regex, line):
                            new_lines.append(line)
                            del lines[i]
                            break
                if len(lines) > 0:
                    contact["name"] = "\n".join(lines)
                else:
                    del contact["name"]

                if len(new_lines) > 0:
                    contact["organization"] = "\n".join(new_lines)

            # Same idea for a company name that ended up as the first street line.
            if "street" in contact and "organization" not in contact:
                lines = [x.strip() for x in contact["street"].splitlines()]
                if len(lines) > 1:
                    for regex in organization_regexes:
                        if re.search(regex, lines[0]):
                            contact["organization"] = lines[0]
                            contact["street"] = "\n".join(lines[1:])
                            break

            # Strip filler punctuation and drop placeholder values.
            for key in list(contact.keys()):
                try:
                    contact[key] = contact[key].strip(", ")
                    if contact[key] == "-" or contact[key].lower() == "n/a":
                        del contact[key]
                except AttributeError as e:
                    pass # Not a string
    return data

def normalize_name(value, abbreviation_threshold=4, length_threshold=8, lowercase_domains=True, ignore_nic=False):
    """Fix casing of an all-upper/all-lowercase multi-line name.

    Words shorter than abbreviation_threshold (or containing a dot) are treated
    as abbreviations/domains and left alone; lines shorter than
    length_threshold are left untouched entirely.
    """
    normalized_lines = []
    for line in value.split("\n"):
        line = line.strip(",") # Get rid of useless comma's
        if (line.isupper() or line.islower()) and len(line) >= length_threshold:
            # This line is likely not capitalized properly
            if ignore_nic == True and "nic" in line.lower():
                # This is a registrar name containing 'NIC' - it should probably be all-uppercase.
                line = line.upper()
            else:
                words = line.split()
                normalized_words = []
                if len(words) >= 1:
                    # First word
                    if len(words[0]) >= abbreviation_threshold and "." not in words[0]:
                        normalized_words.append(words[0].capitalize())
                    elif lowercase_domains and "." in words[0] and not words[0].endswith(".") and not words[0].startswith("."):
                        normalized_words.append(words[0].lower())
                    else:
                        # Probably an abbreviation or domain, leave it alone
                        normalized_words.append(words[0])
                if len(words) >= 3:
                    # Words between the first and last
                    for word in words[1:-1]:
                        if len(word) >= abbreviation_threshold and "." not in word:
                            normalized_words.append(word.capitalize())
                        elif lowercase_domains and "." in word and not word.endswith(".") and not word.startswith("."):
                            normalized_words.append(word.lower())
                        else:
                            # Probably an abbreviation or domain, leave it alone
                            normalized_words.append(word)
                if len(words) >= 2:
                    # Last word
                    if len(words[-1]) >= abbreviation_threshold and "." not in words[-1]:
                        normalized_words.append(words[-1].capitalize())
                    elif lowercase_domains and "." in words[-1] and not words[-1].endswith(".") and not words[-1].startswith("."):
                        normalized_words.append(words[-1].lower())
                    else:
                        # Probably an abbreviation or domain, leave it alone
                        normalized_words.append(words[-1])
                line = " ".join(normalized_words)
        normalized_lines.append(line)
    return "\n".join(normalized_lines)

def parse_dates(dates):
    """Parse a list of date strings against grammar['_dateformats'].

    Returns a list of datetime.datetime objects, or None if nothing parsed.
    Two-digit years are expanded (<60 -> 20xx, else 19xx); month names are
    resolved through grammar['_months'].
    """
    global grammar
    parsed_dates = []
    for date in dates:
        for rule in grammar['_dateformats']:
            result = re.match(rule, date)
            if result is not None:
                try:
                    # These are always numeric. If they fail, there is no valid date present.
                    year = int(result.group("year"))
                    day = int(result.group("day"))
                    # Detect and correct shorthand year notation
                    if year < 60:
                        year += 2000
                    elif year < 100:
                        year += 1900

                    # This will require some more guesswork - some WHOIS servers present the name of the month
                    try:
                        month = int(result.group("month"))
                    except ValueError as e:
                        # Apparently not a number. Look up the corresponding number.
                        try:
                            month = grammar['_months'][result.group("month").lower()]
                        except KeyError as e:
                            # Unknown month name, default to 0
                            month = 0

                    # Time-of-day groups are optional in many formats; missing
                    # (IndexError) or unmatched (TypeError: None) groups -> 0.
                    try:
                        hour = int(result.group("hour"))
                    except IndexError as e:
                        hour = 0
                    except TypeError as e:
                        hour = 0

                    try:
                        minute = int(result.group("minute"))
                    except IndexError as e:
                        minute = 0
                    except TypeError as e:
                        minute = 0

                    try:
                        second = int(result.group("second"))
                    except IndexError as e:
                        second = 0
                    except TypeError as e:
                        second = 0

                    break
                except ValueError as e:
                    # Something went horribly wrong, maybe there is no valid date present?
                    year = 0
                    month = 0
                    day = 0
                    hour = 0
                    minute = 0
                    second = 0
                    # Fixed: exceptions have no .message attribute on Python 3
                    # (it would raise AttributeError here); print the exception
                    # itself instead. FIXME: This should have proper logging of some sort...?
                    print(e)
        try:
            if year > 0:
                try:
                    parsed_dates.append(datetime.datetime(year, month, day, hour, minute, second))
                except ValueError as e:
                    # We might have gotten the day and month the wrong way around, let's try it the other way around
                    # If you're not using an ISO-standard date format, you're an evil registrar!
                    parsed_dates.append(datetime.datetime(year, day, month, hour, minute, second))
        except UnboundLocalError as e:
            pass # No date rule matched anything for this entry

    if len(parsed_dates) > 0:
        return parsed_dates
    else:
        return None

def remove_duplicates(data):
    """Return a copy of data with duplicates removed, preserving order."""
    cleaned_list = []

    for entry in data:
        if entry not in cleaned_list:
            cleaned_list.append(entry)

    return cleaned_list

def remove_suffixes(data):
    """Keep only the first whitespace-delimited token of each entry.

    Used to get rid of IP suffixes for nameservers.
    """
    cleaned_list = []

    for entry in data:
        cleaned_list.append(re.search("([^\s]+)\s*[\s]*", entry).group(1).lstrip())

    return cleaned_list

def parse_registrants(data, never_query_handles=True, handle_server=""):
    """Extract registrant/tech/admin/billing contacts from raw WHOIS segments.

    Tries the inline contact regexes first, then resolves NIC handle
    references (optionally querying handle_server when
    never_query_handles=False). Returns a dict with those four keys; each
    value is a dict of contact fields or None.
    """
    registrant = None
    tech_contact = None
    billing_contact = None
    admin_contact = None

    for segment in data:
        for regex in registrant_regexes:
            match = re.search(regex, segment)
            if match is not None:
                registrant = match.groupdict()
                break

    for segment in data:
        for regex in tech_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                tech_contact = match.groupdict()
                break

    for segment in data:
        for regex in admin_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                admin_contact = match.groupdict()
                break

    for segment in data:
        for regex in billing_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                billing_contact = match.groupdict()
                break

    # Find NIC handle contact definitions
    handle_contacts = parse_nic_contact(data)

    # Find NIC handle references and process them
    for category in nic_contact_references:
        for regex in nic_contact_references[category]:
            for segment in data:
                match = re.search(regex, segment)
                if match is not None:
                    data_reference = match.groupdict()
                    if data_reference["handle"] == "-" or re.match("https?:\/\/", data_reference["handle"]) is not None:
                        pass # Reference was either blank or a URL; the latter is to deal with false positives for nic.ru
                    else:
                        found = False
                        for contact in handle_contacts:
                            if contact["handle"] == data_reference["handle"]:
                                found = True
                                data_reference.update(contact)
                        if found == False:
                            # The contact definition was not found in the supplied raw WHOIS data. If the
                            # method has been called with never_query_handles=False, we can use the supplied
                            # WHOIS server for looking up the handle information separately.
                            if never_query_handles == False:
                                try:
                                    contact = fetch_nic_contact(data_reference["handle"], handle_server)
                                    data_reference.update(contact)
                                except shared.WhoisException as e:
                                    pass # No data found. TODO: Log error?
                            else:
                                pass # TODO: Log warning?
                        if category == "registrant":
                            registrant = data_reference
                        elif category == "tech":
                            tech_contact = data_reference
                        elif category == "billing":
                            billing_contact = data_reference
                        elif category == "admin":
                            admin_contact = data_reference
                    break

    # Post-processing
    for obj in (registrant, tech_contact, billing_contact, admin_contact):
        if obj is not None:
            for key in list(obj.keys()):
                if obj[key] is None or obj[key].strip() == "": # Just chomp all surrounding whitespace
                    del obj[key]
                else:
                    obj[key] = obj[key].strip()
            if "phone_ext" in obj:
                if "phone" in obj:
                    obj["phone"] += " ext. %s" % obj["phone_ext"]
                    del obj["phone_ext"]
            # Collapse street1..streetN into a single newline-joined "street".
            if "street1" in obj:
                street_items = []
                i = 1
                while True:
                    try:
                        street_items.append(obj["street%d" % i])
                        del obj["street%d" % i]
                    except KeyError as e:
                        break
                    i += 1
                obj["street"] = "\n".join(street_items)
            if "organization1" in obj: # This is to deal with eg. HKDNR, who allow organization names in multiple languages.
                organization_items = []
                i = 1
                while True:
                    try:
                        if obj["organization%d" % i].strip() != "":
                            organization_items.append(obj["organization%d" % i])
                        del obj["organization%d" % i]
                    except KeyError as e:
                        break
                    i += 1
                obj["organization"] = "\n".join(organization_items)
            if 'changedate' in obj:
                obj['changedate'] = parse_dates([obj['changedate']])[0]
            if 'creationdate' in obj:
                obj['creationdate'] = parse_dates([obj['creationdate']])[0]
            if 'street' in obj and "\n" in obj["street"] and 'postalcode' not in obj:
                # Deal with certain mad WHOIS servers that don't properly delimit address data... (yes, AFNIC, looking at you)
                lines = [x.strip() for x in obj["street"].splitlines()]
                if " " in lines[-1]:
                    postal_code, city = lines[-1].split(" ", 1)
                    if "." not in lines[-1] and re.match("[0-9]", postal_code) and len(postal_code) >= 3:
                        obj["postalcode"] = postal_code
                        obj["city"] = city
                        obj["street"] = "\n".join(lines[:-1])
            if 'firstname' in obj or 'lastname' in obj:
                elements = []
                if 'firstname' in obj:
                    elements.append(obj["firstname"])
                if 'lastname' in obj:
                    elements.append(obj["lastname"])
                obj["name"] = " ".join(elements)
            if 'country' in obj and 'city' in obj and (re.match("^R\.?O\.?C\.?$", obj["country"], re.IGNORECASE) or obj["country"].lower() == "republic of china") and obj["city"].lower() == "taiwan":
                # There's an edge case where some registrants append ", Republic of China" after "Taiwan", and this is mis-parsed
                # as Taiwan being the city. This is meant to correct that.
                obj["country"] = "%s, %s" % (obj["city"], obj["country"])
                lines = [x.strip() for x in obj["street"].splitlines()]
                obj["city"] = lines[-1]
                obj["street"] = "\n".join(lines[:-1])

    return {
        "registrant": registrant,
        "tech": tech_contact,
        "admin": admin_contact,
        "billing": billing_contact,
    }

def fetch_nic_contact(handle, lookup_server):
    """Look up a NIC handle on lookup_server and return the parsed contact.

    Raises shared.WhoisException when the response contains no contact data.
    """
    response = net.get_whois_raw(handle, lookup_server)
    response = [segment.replace("\r", "") for segment in response] # Carriage returns are the devil
    results = parse_nic_contact(response)

    if len(results) > 0:
        return results[0]
    else:
        raise shared.WhoisException("No contact data found in the response.")

def parse_nic_contact(data):
    """Return all NIC handle contact records found in the raw segments."""
    handle_contacts = []
    for regex in nic_contact_regexes:
        for segment in data:
            matches = re.finditer(regex, segment)
            for match in matches:
                handle_contacts.append(match.groupdict())

    return handle_contacts