├── pythonwhois ├── shared.py ├── states_au.dat ├── states_ca.dat ├── __init__.py ├── states_us.dat ├── net.py ├── countries.dat ├── countries3.dat └── parse.py ├── .gitignore ├── lib ├── files │ ├── GeoLite2-ASN.mmdb │ ├── GeoLite2-Country.mmdb │ ├── extensions.txt │ ├── shorteners.txt │ └── tlds.txt ├── blacklists.py ├── spf.py └── functions.py ├── urls └── national │ └── .urls-benign.csv.swp ├── config.ini ├── requirements.txt ├── get_database_phishtank.py ├── run.py ├── README.md └── extract.py /pythonwhois/shared.py: -------------------------------------------------------------------------------- 1 | class WhoisException(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .env/ 4 | database_phishtank.json 5 | -------------------------------------------------------------------------------- /lib/files/GeoLite2-ASN.mmdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/lib/files/GeoLite2-ASN.mmdb -------------------------------------------------------------------------------- /lib/files/GeoLite2-Country.mmdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/lib/files/GeoLite2-Country.mmdb -------------------------------------------------------------------------------- /urls/national/.urls-benign.csv.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasayres/url-feature-extractor/HEAD/urls/national/.urls-benign.csv.swp -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [phishtank] 2 | 
api_key = 3 | 4 | [safebrowsing] 5 | client_id = 6 | api_key = 7 | version = 1.5.2 8 | 9 | [wot] 10 | api_key = 11 | -------------------------------------------------------------------------------- /pythonwhois/states_au.dat: -------------------------------------------------------------------------------- 1 | NSW,"New South Wales" 2 | QLD,"Queensland" 3 | SA,"South Australia" 4 | TAS,"Tasmania" 5 | VIC,"Victoria" 6 | WA,"Western Australia" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | bs4==0.0.1 3 | configparser==3.5.0 4 | dnspython==1.15.0 5 | geoip2==2.5.0 6 | IPy==0.83 7 | maxminddb==1.3.0 8 | rblwatch==0.3.0 9 | requests==2.22.0 10 | urllib3==1.24.2 11 | -------------------------------------------------------------------------------- /pythonwhois/states_ca.dat: -------------------------------------------------------------------------------- 1 | id,name,abbreviation 2 | "60","Alberta","AB" 3 | "61","British Columbia","BC" 4 | "62","Manitoba","MB" 5 | "63","New Brunswick","NB" 6 | "64","Newfoundland and Labrador","NL" 7 | "65","Nova Scotia","NS" 8 | "66","Ontario","ON" 9 | "67","Prince Edward Island","PE" 10 | "68","Quebec","QC" 11 | "69","Saskatchewan","SK" 12 | "70","Northwest Territories","NT" 13 | "71","Nunavut","NU" 14 | "72","Yukon Territory","YT" 15 | -------------------------------------------------------------------------------- /get_database_phishtank.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import configparser 3 | import json 4 | import bz2 5 | 6 | config = configparser.ConfigParser() 7 | config.read('config.ini') 8 | 9 | 10 | def update_db(): 11 | """Download the PhishTank URLs database in lib/files/database_phishtank.json.""" 12 | api_key = config.get('phishtank', 'api_key') 13 | api_url = 
'http://data.phishtank.com/data/%s/online-valid.json.bz2' % (api_key) 14 | compraw = urllib.request.urlopen(api_url).read() 15 | rawdecomp = bz2.decompress(compraw) 16 | database = json.loads(rawdecomp.decode('utf-8')) 17 | with open('lib/files/database_phishtank.json', 'w') as outfile: 18 | json.dump(database, outfile) 19 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from get_database_phishtank import update_db 2 | import argparse 3 | import extract 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input", help="File of URLs to be analyzed") 9 | parser.add_argument("output", help="Output File") 10 | args = parser.parse_args() 11 | 12 | if args.input and args.output: 13 | # Update phishtank database 14 | print('Download and update phishtank database...') 15 | update_db() 16 | # Starts extraction 17 | print('Starts extraction...') 18 | extract.main(args.input, args.output) 19 | print(''' 20 | ####################################### 21 | # Dataset generated successfully! # 22 | ####################################### 23 | ''') 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /pythonwhois/__init__.py: -------------------------------------------------------------------------------- 1 | from . import net, parse 2 | 3 | def get_whois(domain, normalized=[]): 4 | try: 5 | raw_data, server_list = net.get_whois_raw(domain, with_server_list=True) 6 | except: 7 | return False 8 | # Unlisted handles will be looked up on the last WHOIS server that was queried. This may be changed to also query 9 | # other servers in the future, if it turns out that there are cases where the last WHOIS server in the chain doesn't 10 | # actually hold the handle contact details, but another WHOIS server in the chain does. 
11 | return parse.parse_raw_whois(raw_data, normalized=normalized, never_query_handles=False, handle_server=server_list[-1]) 12 | 13 | def whois(*args, **kwargs): 14 | raise Exception("The whois() method has been replaced by a different method (with a different API), since pythonwhois 2.0. Either install the older pythonwhois 1.2.3, or change your code to use the new API.") 15 | -------------------------------------------------------------------------------- /pythonwhois/states_us.dat: -------------------------------------------------------------------------------- 1 | id,name,abbreviation 2 | "1","Alabama","AL" 3 | "2","Alaska","AK" 4 | "3","Arizona","AZ" 5 | "4","Arkansas","AR" 6 | "5","California","CA" 7 | "6","Colorado","CO" 8 | "7","Connecticut","CT" 9 | "8","Delaware","DE" 10 | "9","Florida","FL" 11 | "10","Georgia","GA" 12 | "11","Hawaii","HI" 13 | "12","Idaho","ID" 14 | "13","Illinois","IL" 15 | "14","Indiana","IN" 16 | "15","Iowa","IA" 17 | "16","Kansas","KS" 18 | "17","Kentucky","KY" 19 | "18","Louisiana","LA" 20 | "19","Maine","ME" 21 | "20","Maryland","MD" 22 | "21","Massachusetts","MA" 23 | "22","Michigan","MI" 24 | "23","Minnesota","MN" 25 | "24","Mississippi","MS" 26 | "25","Missouri","MO" 27 | "26","Montana","MT" 28 | "27","Nebraska","NE" 29 | "28","Nevada","NV" 30 | "29","New Hampshire","NH" 31 | "30","New Jersey","NJ" 32 | "31","New Mexico","NM" 33 | "32","New York","NY" 34 | "33","North Carolina","NC" 35 | "34","North Dakota","ND" 36 | "35","Ohio","OH" 37 | "36","Oklahoma","OK" 38 | "37","Oregon","OR" 39 | "38","Pennsylvania","PA" 40 | "39","Rhode Island","RI" 41 | "40","South Carolina","SC" 42 | "41","South Dakota","SD" 43 | "42","Tennessee","TN" 44 | "43","Texas","TX" 45 | "44","Utah","UT" 46 | "45","Vermont","VT" 47 | "46","Virginia","VA" 48 | "47","Washington","WA" 49 | "48","West Virginia","WV" 50 | "49","Wisconsin","WI" 51 | "50","Wyoming","WY" 52 | "52","Puerto Rico","PR" 53 | "53","U.S. 
Virgin Islands","VI" 54 | "54","American Samoa","AS" 55 | "55","Guam","GU" 56 | "56","Northern Mariana Islands","MP" -------------------------------------------------------------------------------- /lib/files/extensions.txt: -------------------------------------------------------------------------------- 1 | .3dm 2 | .3ds 3 | .3g2 4 | .3gp 5 | .7z 6 | .accdb 7 | .ai 8 | .aif 9 | .apk 10 | .app 11 | .asf 12 | .asp 13 | .aspx 14 | .avi 15 | .bak 16 | .bat 17 | .bin 18 | .bmp 19 | .c 20 | .cab 21 | .cbr 22 | .cer 23 | .cfg 24 | .cfm 25 | .cgi 26 | .class 27 | .com 28 | .cpl 29 | .cpp 30 | .crdownload 31 | .crx 32 | .cs 33 | .csr 34 | .css 35 | .csv 36 | .cue 37 | .cur 38 | .dat 39 | .db 40 | .dbf 41 | .dds 42 | .deb 43 | .dem 44 | .deskthemepack 45 | .dll 46 | .dmg 47 | .dmp 48 | .doc 49 | .docx 50 | .drv 51 | .dtd 52 | .dwg 53 | .dxf 54 | .eps 55 | .exe 56 | .fla 57 | .flv 58 | .fnt 59 | .fon 60 | .gadget 61 | .gam 62 | .ged 63 | .gif 64 | .gpx 65 | .gz 66 | .h 67 | .hqx 68 | .htm 69 | .html 70 | .icns 71 | .ico 72 | .ics 73 | .iff 74 | .indd 75 | .ini 76 | .iso 77 | .jar 78 | .java 79 | .jpg 80 | .js 81 | .jsp 82 | .key 83 | .keychain 84 | .kml 85 | .kmz 86 | .lnk 87 | .log 88 | .lua 89 | .m 90 | .m3u 91 | .m4a 92 | .m4v 93 | .max 94 | .mdb 95 | .mdf 96 | .mid 97 | .mim 98 | .mov 99 | .mp3 100 | .mp4 101 | .mpa 102 | .mpg 103 | .msg 104 | .msi 105 | .nes 106 | .obj 107 | .odt 108 | .otf 109 | .pages 110 | .part 111 | .pct 112 | .pdb 113 | .pdf 114 | .php 115 | .pkg 116 | .pl 117 | .plugin 118 | .png 119 | .pps 120 | .ppt 121 | .pptx 122 | .prf 123 | .ps 124 | .psd 125 | .pspimage 126 | .py 127 | .rar 128 | .rm 129 | .rom 130 | .rpm 131 | .rss 132 | .rtf 133 | .sav 134 | .sdf 135 | .sh 136 | .sitx 137 | .sln 138 | .sql 139 | .srt 140 | .svg 141 | .swf 142 | .swift 143 | .sys 144 | .tar 145 | .tar.gz 146 | .tax2016 147 | .tex 148 | .tga 149 | .thm 150 | .tif 151 | .tiff 152 | .tmp 153 | .toast 154 | .torrent 155 | .ttf 156 | .txt 157 | .uue 158 | .vb 159 | .vcd 160 | 
.vcf 161 | .vcxproj 162 | .vob 163 | .wav 164 | .wma 165 | .wmv 166 | .wpd 167 | .wps 168 | .wsf 169 | .xcodeproj 170 | .xhtml 171 | .xlr 172 | .xls 173 | .xlsx 174 | .xml 175 | .yuv 176 | .zip 177 | .zipx 178 | -------------------------------------------------------------------------------- /lib/blacklists.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import requests 3 | import json 4 | 5 | config = configparser.ConfigParser() 6 | config.read('config.ini') 7 | 8 | 9 | def google_safebrowsing(url): 10 | client_id = config.get('safebrowsing', 'client_id') 11 | version = config.get('safebrowsing', 'version') 12 | api_key = config.get('safebrowsing', 'api_key') 13 | platform_types = ['ANY_PLATFORM'] 14 | threat_types = ['THREAT_TYPE_UNSPECIFIED', 15 | 'MALWARE', 'SOCIAL_ENGINEERING', 16 | 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'] 17 | threat_entry_types = ['URL'] 18 | api_url = 'https://safebrowsing.googleapis.com/v4/threatMatches:find?key=%s' % (api_key) 19 | threat_entries = [{'url': url}] 20 | payload = { 21 | 'client': { 22 | 'clientId': client_id, 23 | 'clientVersion': version 24 | }, 25 | 'threatInfo': { 26 | 'threatTypes': threat_types, 27 | 'platformTypes': platform_types, 28 | 'threatEntryTypes': threat_entry_types, 29 | 'threatEntries': threat_entries 30 | } 31 | } 32 | headers = {'content-type': 'application/json'} 33 | try: 34 | response = requests.post(api_url, headers=headers, json=payload).json().get('matches', None) 35 | if response is not None: 36 | return True 37 | else: 38 | return False 39 | except Exception: 40 | return '?' 
41 | 42 | 43 | def phishtank(url): 44 | with open('lib/files/database_phishtank.json') as db: 45 | data = json.load(db) 46 | for d in data: 47 | if (url == d['url']): 48 | return True 49 | return False 50 | 51 | 52 | def wot(url): 53 | api_key = config.get('wot', 'api_key') 54 | api_url = 'http://api.mywot.com/0.4/public_link_json2' 55 | try: 56 | response = requests.get(api_url, params={'hosts': url, 'key': api_key}).json() 57 | return any('blacklists' in val for val in response.values()) 58 | except Exception: 59 | return False 60 | -------------------------------------------------------------------------------- /lib/spf.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import dns.name 3 | from urllib import parse 4 | 5 | 6 | class SPFRecord(object): 7 | 8 | def __init__(self, domain): 9 | self.version = None 10 | self.includes = [] 11 | self.ip4 = [] 12 | self.ip6 = [] 13 | try: 14 | self._dns_response = dns.resolver.query(domain, 'TXT') 15 | except Exception: 16 | return False 17 | self.txt_records = [txt.to_text() for txt in self._dns_response] 18 | for txt in self.txt_records: 19 | self._parse_txt(txt) 20 | 21 | def _parse_txt(self, txt): 22 | for entry in txt.split(' '): 23 | if entry.startswith('v') and '=' in entry: 24 | self._add_version(entry) 25 | elif entry.startswith('include') and ':' in entry: 26 | self._add_include(entry) 27 | elif entry.startswith('ip4') and ':' in entry: 28 | self._add_ip4(entry) 29 | elif entry.startswith('ip6') and ':' in entry: 30 | self._add_ip6(entry) 31 | 32 | @property 33 | def ips(self): 34 | return self.ip4 + self.ip6 35 | 36 | def _add_version(self, entry): 37 | self.version = entry.split('=')[1] 38 | 39 | def _add_include(self, entry): 40 | self.includes.append(entry.split(':')[1]) 41 | 42 | def _add_ip4(self, entry): 43 | ip = entry.split(':')[1] 44 | self.ip4.append(ip) 45 | 46 | def _add_ip6(self, entry): 47 | ip = entry.split(':')[1] 48 | 
self.ip6.append(ip) 49 | 50 | 51 | def is_expired(domain): 52 | try: 53 | dns.resolver.query(domain) 54 | return False 55 | except dns.resolver.NXDOMAIN: 56 | return True 57 | except Exception: 58 | return False 59 | 60 | 61 | def get_spf_record(domain): 62 | if is_expired(domain): 63 | return None 64 | try: 65 | return SPFRecord(domain) 66 | except Exception: 67 | return None 68 | 69 | 70 | def check_spf(spf, domain): 71 | for inc_domain in spf.includes: 72 | try: 73 | url = parse.urlparse("mail://%s" % inc_domain).netloc 74 | parent = '.'.join(url.split('.')[-2:]) 75 | if is_expired(parent): 76 | return False 77 | else: 78 | return True 79 | except Exception: 80 | return False 81 | return '?' 82 | -------------------------------------------------------------------------------- /pythonwhois/net.py: -------------------------------------------------------------------------------- 1 | import socket, re, sys 2 | from codecs import encode, decode 3 | from . import shared 4 | 5 | def get_whois_raw(domain, server="", previous=None, rfc3490=True, never_cut=False, with_server_list=False, server_list=None): 6 | previous = previous or [] 7 | server_list = server_list or [] 8 | # Sometimes IANA simply won't give us the right root WHOIS server 9 | exceptions = { 10 | ".ac.uk": "whois.ja.net", 11 | ".ps": "whois.pnina.ps", 12 | ".buzz": "whois.nic.buzz", 13 | ".moe": "whois.nic.moe", 14 | # The following is a bit hacky, but IANA won't return the right answer for example.com because it's a direct registration. 
15 | "example.com": "whois.verisign-grs.com" 16 | } 17 | 18 | if rfc3490: 19 | if sys.version_info < (3, 0): 20 | domain = encode( domain if type(domain) is unicode else decode(domain, "utf8"), "idna" ) 21 | else: 22 | domain = encode(domain, "idna").decode("ascii") 23 | 24 | if len(previous) == 0 and server == "": 25 | # Root query 26 | is_exception = False 27 | for exception, exc_serv in exceptions.items(): 28 | if domain.endswith(exception): 29 | is_exception = True 30 | target_server = exc_serv 31 | break 32 | if is_exception == False: 33 | target_server = get_root_server(domain) 34 | else: 35 | target_server = server 36 | if target_server == "whois.jprs.jp": 37 | request_domain = "%s/e" % domain # Suppress Japanese output 38 | elif domain.endswith(".de") and ( target_server == "whois.denic.de" or target_server == "de.whois-servers.net" ): 39 | request_domain = "-T dn,ace %s" % domain # regional specific stuff 40 | elif target_server == "whois.verisign-grs.com": 41 | request_domain = "=%s" % domain # Avoid partial matches 42 | elif target_server is False: 43 | return False 44 | else: 45 | request_domain = domain 46 | response = whois_request(request_domain, target_server) 47 | if never_cut: 48 | # If the caller has requested to 'never cut' responses, he will get the original response from the server (this is 49 | # useful for callers that are only interested in the raw data). Otherwise, if the target is verisign-grs, we will 50 | # select the data relevant to the requested domain, and discard the rest, so that in a multiple-option response the 51 | # parsing code will only touch the information relevant to the requested domain. The side-effect of this is that 52 | # when `never_cut` is set to False, any verisign-grs responses in the raw data will be missing header, footer, and 53 | # alternative domain options (this is handled a few lines below, after the verisign-grs processing). 
54 | new_list = [response] + previous 55 | if target_server == "whois.verisign-grs.com": 56 | # VeriSign is a little... special. As it may return multiple full records and there's no way to do an exact query, 57 | # we need to actually find the correct record in the list. 58 | for record in response.split("\n\n"): 59 | if re.search("Domain Name: %s\n" % domain.upper(), record): 60 | response = record 61 | break 62 | if never_cut == False: 63 | new_list = [response] + previous 64 | server_list.append(target_server) 65 | for line in [x.strip() for x in response.splitlines()]: 66 | match = re.match("(refer|whois server|referral url|whois server|registrar whois):\s*([^\s]+\.[^\s]+)", line, re.IGNORECASE) 67 | if match is not None: 68 | referal_server = match.group(2) 69 | if referal_server != server and "://" not in referal_server: # We want to ignore anything non-WHOIS (eg. HTTP) for now. 70 | # Referal to another WHOIS server... 71 | return get_whois_raw(domain, referal_server, new_list, server_list=server_list, with_server_list=with_server_list) 72 | if with_server_list: 73 | return (new_list, server_list) 74 | else: 75 | return new_list 76 | 77 | def get_root_server(domain): 78 | data = whois_request(domain, "whois.iana.org") 79 | for line in [x.strip() for x in data.splitlines()]: 80 | match = re.match("refer:\s*([^\s]+)", line) 81 | if match is None: 82 | continue 83 | return match.group(1) 84 | # raise shared.WhoisException("No root WHOIS server found for domain.") 85 | return False 86 | 87 | def whois_request(domain, server, port=43): 88 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 89 | sock.connect((server, port)) 90 | sock.send(("%s\r\n" % domain).encode("utf-8")) 91 | buff = b"" 92 | while True: 93 | data = sock.recv(1024) 94 | if len(data) == 0: 95 | break 96 | buff += data 97 | return buff.decode("utf-8", errors="ignore") 98 | -------------------------------------------------------------------------------- /pythonwhois/countries.dat: 
-------------------------------------------------------------------------------- 1 | iso,name 2 | AF,Afghanistan 3 | AL,Albania 4 | DZ,Algeria 5 | AS,"American Samoa" 6 | AD,Andorra 7 | AO,Angola 8 | AI,Anguilla 9 | AQ,Antarctica 10 | AG,"Antigua and Barbuda" 11 | AR,Argentina 12 | AM,Armenia 13 | AW,Aruba 14 | AU,Australia 15 | AT,Austria 16 | AZ,Azerbaijan 17 | BS,Bahamas 18 | BH,Bahrain 19 | BD,Bangladesh 20 | BB,Barbados 21 | BY,Belarus 22 | BE,Belgium 23 | BZ,Belize 24 | BJ,Benin 25 | BM,Bermuda 26 | BT,Bhutan 27 | BO,Bolivia 28 | BA,"Bosnia and Herzegovina" 29 | BW,Botswana 30 | BV,"Bouvet Island" 31 | BR,Brazil 32 | BQ,"British Antarctic Territory" 33 | IO,"British Indian Ocean Territory" 34 | VG,"British Virgin Islands" 35 | BN,Brunei 36 | BG,Bulgaria 37 | BF,"Burkina Faso" 38 | BI,Burundi 39 | KH,Cambodia 40 | CM,Cameroon 41 | CA,Canada 42 | CT,"Canton and Enderbury Islands" 43 | CV,"Cape Verde" 44 | KY,"Cayman Islands" 45 | CF,"Central African Republic" 46 | TD,Chad 47 | CL,Chile 48 | CN,China 49 | CX,"Christmas Island" 50 | CC,"Cocos [Keeling] Islands" 51 | CO,Colombia 52 | KM,Comoros 53 | CG,"Congo - Brazzaville" 54 | CD,"Congo - Kinshasa" 55 | CK,"Cook Islands" 56 | CR,"Costa Rica" 57 | HR,Croatia 58 | CU,Cuba 59 | CY,Cyprus 60 | CZ,"Czech Republic" 61 | CI,"Côte d’Ivoire" 62 | DK,Denmark 63 | DJ,Djibouti 64 | DM,Dominica 65 | DO,"Dominican Republic" 66 | NQ,"Dronning Maud Land" 67 | DD,"East Germany" 68 | EC,Ecuador 69 | EG,Egypt 70 | SV,"El Salvador" 71 | GQ,"Equatorial Guinea" 72 | ER,Eritrea 73 | EE,Estonia 74 | ET,Ethiopia 75 | FK,"Falkland Islands" 76 | FO,"Faroe Islands" 77 | FJ,Fiji 78 | FI,Finland 79 | FR,France 80 | GF,"French Guiana" 81 | PF,"French Polynesia" 82 | TF,"French Southern Territories" 83 | FQ,"French Southern and Antarctic Territories" 84 | GA,Gabon 85 | GM,Gambia 86 | GE,Georgia 87 | DE,Germany 88 | GH,Ghana 89 | GI,Gibraltar 90 | GR,Greece 91 | GL,Greenland 92 | GD,Grenada 93 | GP,Guadeloupe 94 | GU,Guam 95 | GT,Guatemala 96 | 
GG,Guernsey 97 | GN,Guinea 98 | GW,Guinea-Bissau 99 | GY,Guyana 100 | HT,Haiti 101 | HM,"Heard Island and McDonald Islands" 102 | HN,Honduras 103 | HK,"Hong Kong" 104 | HU,Hungary 105 | IS,Iceland 106 | IN,India 107 | ID,Indonesia 108 | IR,Iran 109 | IQ,Iraq 110 | IE,Ireland 111 | IM,"Isle of Man" 112 | IL,Israel 113 | IT,Italy 114 | JM,Jamaica 115 | JP,Japan 116 | JE,Jersey 117 | JT,"Johnston Island" 118 | JO,Jordan 119 | KZ,Kazakhstan 120 | KE,Kenya 121 | KI,Kiribati 122 | KW,Kuwait 123 | KG,Kyrgyzstan 124 | LA,Laos 125 | LV,Latvia 126 | LB,Lebanon 127 | LS,Lesotho 128 | LR,Liberia 129 | LY,Libya 130 | LI,Liechtenstein 131 | LT,Lithuania 132 | LU,Luxembourg 133 | MO,"Macau SAR China" 134 | MK,Macedonia 135 | MG,Madagascar 136 | MW,Malawi 137 | MY,Malaysia 138 | MV,Maldives 139 | ML,Mali 140 | MT,Malta 141 | MH,"Marshall Islands" 142 | MQ,Martinique 143 | MR,Mauritania 144 | MU,Mauritius 145 | YT,Mayotte 146 | FX,"Metropolitan France" 147 | MX,Mexico 148 | FM,Micronesia 149 | MI,"Midway Islands" 150 | MD,Moldova 151 | MC,Monaco 152 | MN,Mongolia 153 | ME,Montenegro 154 | MS,Montserrat 155 | MA,Morocco 156 | MZ,Mozambique 157 | MM,"Myanmar [Burma]" 158 | NA,Namibia 159 | NR,Nauru 160 | NP,Nepal 161 | NL,Netherlands 162 | AN,"Netherlands Antilles" 163 | NT,"Neutral Zone" 164 | NC,"New Caledonia" 165 | NZ,"New Zealand" 166 | NI,Nicaragua 167 | NE,Niger 168 | NG,Nigeria 169 | NU,Niue 170 | NF,"Norfolk Island" 171 | KP,"North Korea" 172 | VD,"North Vietnam" 173 | MP,"Northern Mariana Islands" 174 | NO,Norway 175 | OM,Oman 176 | PC,"Pacific Islands Trust Territory" 177 | PK,Pakistan 178 | PW,Palau 179 | PS,"Palestinian Territories" 180 | PA,Panama 181 | PZ,"Panama Canal Zone" 182 | PG,"Papua New Guinea" 183 | PY,Paraguay 184 | YD,"People's Democratic Republic of Yemen" 185 | PE,Peru 186 | PH,Philippines 187 | PN,"Pitcairn Islands" 188 | PL,Poland 189 | PT,Portugal 190 | PR,"Puerto Rico" 191 | QA,Qatar 192 | RO,Romania 193 | RU,Russia 194 | RW,Rwanda 195 | RE,Réunion 196 
| BL,"Saint Barthélemy" 197 | SH,"Saint Helena" 198 | KN,"Saint Kitts and Nevis" 199 | LC,"Saint Lucia" 200 | MF,"Saint Martin" 201 | PM,"Saint Pierre and Miquelon" 202 | VC,"Saint Vincent and the Grenadines" 203 | WS,Samoa 204 | SM,"San Marino" 205 | SA,"Saudi Arabia" 206 | SN,Senegal 207 | RS,Serbia 208 | CS,"Serbia and Montenegro" 209 | SC,Seychelles 210 | SL,"Sierra Leone" 211 | SG,Singapore 212 | SK,Slovakia 213 | SI,Slovenia 214 | SB,"Solomon Islands" 215 | SO,Somalia 216 | ZA,"South Africa" 217 | GS,"South Georgia and the South Sandwich Islands" 218 | KR,"South Korea" 219 | ES,Spain 220 | LK,"Sri Lanka" 221 | SD,Sudan 222 | SR,Suriname 223 | SJ,"Svalbard and Jan Mayen" 224 | SZ,Swaziland 225 | SE,Sweden 226 | CH,Switzerland 227 | SY,Syria 228 | ST,"São Tomé and Príncipe" 229 | TW,Taiwan 230 | TJ,Tajikistan 231 | TZ,Tanzania 232 | TH,Thailand 233 | TL,Timor-Leste 234 | TG,Togo 235 | TK,Tokelau 236 | TO,Tonga 237 | TT,"Trinidad and Tobago" 238 | TN,Tunisia 239 | TR,Turkey 240 | TM,Turkmenistan 241 | TC,"Turks and Caicos Islands" 242 | TV,Tuvalu 243 | UM,"U.S. Minor Outlying Islands" 244 | PU,"U.S. Miscellaneous Pacific Islands" 245 | VI,"U.S. 
Virgin Islands" 246 | UG,Uganda 247 | UA,Ukraine 248 | SU,"Union of Soviet Socialist Republics" 249 | AE,"United Arab Emirates" 250 | GB,"United Kingdom" 251 | US,"United States" 252 | ZZ,"Unknown or Invalid Region" 253 | UY,Uruguay 254 | UZ,Uzbekistan 255 | VU,Vanuatu 256 | VA,"Vatican City" 257 | VE,Venezuela 258 | VN,Vietnam 259 | WK,"Wake Island" 260 | WF,"Wallis and Futuna" 261 | EH,"Western Sahara" 262 | YE,Yemen 263 | ZM,Zambia 264 | ZW,Zimbabwe 265 | AX,"Åland Islands" -------------------------------------------------------------------------------- /lib/files/shorteners.txt: -------------------------------------------------------------------------------- 1 | 0rz.tw 2 | 1-url.net 3 | 126.am 4 | 1tk.us 5 | 1un.fr 6 | 1url.com 7 | 1url.cz 8 | 1wb2.net 9 | 2.gp 10 | 2.ht 11 | 2ad.in 12 | 2doc.net 13 | 2fear.com 14 | 2tu.us 15 | 2ty.in 16 | 2u.xf.cz 17 | 3ra.be 18 | 3x.si 19 | 4i.ae 20 | 4ks.net 21 | 4view.me 22 | 5em.cz 23 | 5url.net 24 | 5z8.info 25 | 6fr.ru 26 | 6g6.eu 27 | 7.ly 28 | 76.gd 29 | 77.ai 30 | 7fth.cc 31 | 7li.in 32 | 7vd.cn 33 | 8u.cz 34 | 944.la 35 | 98.to 36 | L9.fr 37 | Lvvk.com 38 | To8.cc 39 | a0.fr 40 | abbr.sk 41 | ad-med.cz 42 | ad5.eu 43 | ad7.biz 44 | adb.ug 45 | adf.ly 46 | adfa.st 47 | adfly.fr 48 | adli.pw 49 | adv.li 50 | ajn.me 51 | aka.gr 52 | alil.in 53 | amzn.to 54 | any.gs 55 | aqva.pl 56 | ares.tl 57 | asso.in 58 | au.ms 59 | ayt.fr 60 | azali.fr 61 | b00.fr 62 | b23.ru 63 | b54.in 64 | baid.us 65 | bc.vc 66 | beam.to 67 | bee4.biz 68 | bim.im 69 | bit.do 70 | bit.ly 71 | bitly.com 72 | bitw.in 73 | blap.net 74 | ble.pl 75 | blip.tv 76 | boi.re 77 | bote.me 78 | bougn.at 79 | br4.in 80 | brk.to 81 | brzu.net 82 | bul.lu 83 | bxl.me 84 | bzh.me 85 | cachor.ro 86 | captur.in 87 | cashfly.com 88 | cbs.so 89 | cbug.cc 90 | cc.cc 91 | ccj.im 92 | cf.ly 93 | cf2.me 94 | cf6.co 95 | chilp.it 96 | cjb.net 97 | cli.gs 98 | clikk.in 99 | clk.im 100 | cn86.org 101 | couic.fr 102 | cr.tl 103 | cudder.it 104 | cur.lv 105 | curl.im 106 | 
curte.me 107 | cut.pe 108 | cut.sk 109 | cutt.eu 110 | cutt.us 111 | cutu.me 112 | cybr.fr 113 | cyonix.to 114 | d75.eu 115 | daa.pl 116 | dai.ly 117 | decenturl.com 118 | dd.ma 119 | ddp.net 120 | dft.ba 121 | digbig.com 122 | doiop.com 123 | dolp.cc 124 | dopice.sk 125 | droid.ws 126 | dv.gd 127 | dyo.gs 128 | e37.eu 129 | easyurl.net 130 | ecra.se 131 | ely.re 132 | encurtador.com.br 133 | erax.cz 134 | erw.cz 135 | esy.es 136 | ex9.co 137 | ezurl.cc 138 | fff.re 139 | fff.to 140 | fff.wf 141 | filz.fr 142 | fnk.es 143 | foe.hn 144 | folu.me 145 | freze.it 146 | fur.ly 147 | fwdurl.net 148 | g00.me 149 | gca.sh 150 | gg.gg 151 | goo.gl 152 | goo.lu 153 | grem.io 154 | guiama.is 155 | hadej.co 156 | hide.my 157 | hjkl.fr 158 | hops.me 159 | href.li 160 | ht.ly 161 | i-2.co 162 | i99.cz 163 | icit.fr 164 | ick.li 165 | icks.ro 166 | iiiii.in 167 | iky.fr 168 | ilix.in 169 | info.ms 170 | is.gd 171 | isra.li 172 | itm.im 173 | ity.im 174 | ix.sk 175 | j.gs 176 | j.mp 177 | jdem.cz 178 | jieb.be 179 | jp22.net 180 | jqw.de 181 | kask.us 182 | kd2.org 183 | kfd.pl 184 | korta.nu 185 | kr3w.de 186 | krat.si 187 | kratsi.cz 188 | krod.cz 189 | kuc.cz 190 | kxb.me 191 | l-k.be 192 | lc-s.co 193 | lc.cx 194 | lcut.in 195 | letop10. 
196 | libero.it 197 | lick.my 198 | lien.li 199 | lien.pl 200 | lin.io 201 | linkn.co 202 | linkbucks.com 203 | llu.ch 204 | lnk.co 205 | lnk.ly 206 | lnk.sk 207 | lnks.fr 208 | lnky.fr 209 | lnp.sn 210 | lp25.fr 211 | m1p.fr 212 | m3mi.com 213 | make.my 214 | mcaf.ee 215 | mdl29.net 216 | mic.fr 217 | migre.me 218 | minu.me 219 | moourl.com 220 | more.sh 221 | mut.lu 222 | myurl.in 223 | net.ms 224 | net46.net 225 | nicou.ch 226 | nig.gr 227 | notlong.com 228 | nov.io 229 | nq.st 230 | nutshellurl.com 231 | nxy.in 232 | o-x.fr 233 | okok.fr 234 | onl.li 235 | ou.af 236 | ou.gd 237 | oua.be 238 | ouo.io 239 | ow.ly 240 | p.pw 241 | parky.tv 242 | past.is 243 | pdh.co 244 | ph.ly 245 | pich.in 246 | pin.st 247 | plots.fr 248 | plots.fr 249 | pm.wu.cz 250 | po.st 251 | ppfr.it 252 | ppst.me 253 | ppt.cc 254 | ppt.li 255 | pqn.bz 256 | prejit.cz 257 | ptab.it 258 | ptm.ro 259 | pw2.ro 260 | py6.ru 261 | q.gs 262 | qbn.ru 263 | qqc.co 264 | qr.net 265 | qrtag.fr 266 | qxp.cz 267 | qxp.sk 268 | r.cont.us 269 | rb6.co 270 | rcknr.io 271 | rdz.me 272 | redir.ec 273 | redir.fr 274 | redu.it 275 | ref.so 276 | reise.lc 277 | relink.fr 278 | repla.cr 279 | ri.ms 280 | riz.cz 281 | rod.gs 282 | roflc.at 283 | rt.se 284 | s-url.fr 285 | safe.mn 286 | sagyap.tk 287 | sdu.sk 288 | seeme.at 289 | segue.se 290 | sh.st 291 | sh.st 292 | shar.as 293 | shrinkurl.us 294 | shorl.com 295 | short.cc 296 | short.ie 297 | short.pk 298 | shorte.st 299 | shrt.in 300 | shy.si 301 | smu.sh 302 | sicax.net 303 | sina.lt 304 | sk.gy 305 | skr.sk 306 | skroc.pl 307 | smll.co 308 | sn.im 309 | snipurl.com 310 | snsw.us 311 | snurl.com 312 | soo.gd 313 | sort3.me 314 | spn.sr 315 | sq6.ru 316 | ssl.gs 317 | su.pr 318 | surl.me 319 | sux.cz 320 | sy.pe 321 | t.cn 322 | t.co 323 | ta.gd 324 | tabzi.com 325 | tau.pe 326 | tdjt.cz 327 | thesa.us 328 | tighturl.com 329 | tin.li 330 | tini.cc 331 | tiny.cc 332 | tiny.lt 333 | tiny.ms 334 | tiny.pl 335 | tinyurl.com 336 | tinyurl.hu 337 | tixsu.com 338 | 
tldr.sk 339 | tllg.net 340 | tnij.org 341 | tny.cz 342 | to.ly 343 | tohle.de 344 | tpmr.com 345 | tr.im 346 | tr5.in 347 | trck.me 348 | trick.ly 349 | trkr.ws 350 | trunc.it 351 | twet.fr 352 | twi.im 353 | twlr.me 354 | twurl.nl 355 | u.to 356 | uby.es 357 | ucam.me 358 | ug.cz 359 | ulmt.in 360 | unlc.us 361 | upzat.com 362 | ur1.ca 363 | url2.fr 364 | url5.org 365 | url.ie 366 | url.likedeck.com 367 | url.lotpatrol.com 368 | urlcut.com 369 | urlin.it 370 | urls.fr 371 | urltea.com 372 | urlz.fr 373 | urub.us 374 | utfg.sk 375 | v.gd 376 | v.ht 377 | v5.gd 378 | vaaa.fr 379 | valv.im 380 | vaza.me 381 | vbly.us 382 | vd55.com 383 | verd.in 384 | vgn.me 385 | vov.li 386 | vsll.eu 387 | vt802.us 388 | vur.me 389 | vv.vg 390 | w1p.fr 391 | waa.ai 392 | wapurl.co.uk 393 | wb1.eu 394 | web99.eu 395 | wed.li 396 | wideo.fr 397 | wp.me 398 | wtc.la 399 | wu.cz 400 | ww7.fr 401 | wwy.me 402 | x.co 403 | x.nu 404 | x10.mx 405 | x2c.eu 406 | x2c.eumx 407 | xav.cc 408 | xgd.in 409 | xib.me 410 | xl8.eu 411 | xoe.cz 412 | xrl.us 413 | xt3.me 414 | xua.me 415 | xub.me 416 | xurls.co 417 | yagoa.fr 418 | yagoa.me 419 | yatuc.com 420 | yau.sh 421 | yeca.eu 422 | yect.com 423 | yep.it 424 | yogh.me 425 | yon.ir 426 | youfap.me 427 | youtu.be 428 | ysear.ch 429 | yyv.co 430 | z9.fr 431 | zSMS.net 432 | zapit.nu 433 | zeek.ir 434 | zip.net 435 | zkr.cz 436 | zkrat.me 437 | zkrt.cz 438 | zoodl.com 439 | zpag.es 440 | zti.me 441 | zxq.net 442 | zyva.org 443 | zzb.bz 444 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # URL Feature Extractor 2 | 3 | Extracting features from URLs to build a data set for machine learning. The purpose is to find a machine learning model to predict phishing URLs, which are targeted to the Brazilian population. 
4 | 5 | This repo includes the implementation of our paper: 6 | 7 | Lucas Dantas Gama Ayres, Italo Valcy S Brito and Rodrigo Rocha Gomes e Souza. Using Machine Learning to Automatically Detect Malicious URLs in Brazil. In Simpósio Brasileiro de Redes de Computadores e Sistemas Distribuídos (SBRC 2019) - 2019, Gramado - RS - Brazil. 8 | 9 | The paper is available here: https://sol.sbc.org.br/index.php/sbrc/article/view/7416 10 | 11 | DOI: https://doi.org/10.5753/sbrc.2019.7416 12 | 13 | ## Install 14 | 15 | ```bash 16 | $ sudo apt-get update && sudo apt-get upgrade 17 | $ sudo apt-get install virtualenv python3 python3-dev python-dev gcc libpq-dev libssl-dev libffi-dev build-essential 18 | $ virtualenv -p /usr/bin/python3 .env 19 | $ source .env/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | ## How to use 24 | 25 | Before running the software, add the API keys for Google Safe Browsing, PhishTank, and MyWOT to the ```config.ini``` file. 26 | 27 | Now, run: 28 | 29 | ```bash 30 | $ python run.py 31 | ``` 32 | 33 | ## Features implemented 34 | 35 | 36 | 37 |
38 | LEXICAL 39 |
Count (.) in URLCount (-) in URLCount (_) in URLCount (/) in URL
Count (?) in URLCount (=) in URLCount (@) in URLCount (&) in URL
Count (!) in URLCount ( ) in URLCount (~) in URLCount (,) in URL
Count (+) in URLCount (*) in URLCount (#) in URLCount ($) in URL
Count (%) in URLURL LengthLTLD amount in URLCount (.) in Domain
Count (-) in DomainCount (_) in DomainCount (/) in DomainCount (?) in Domain
Count (=) in DomainCount (@) in DomainCount (&) in DomainCount (!) in Domain
Count ( ) in DomainCount (~) in DomainCount (,) in DomainCount (+) in Domain
Count (*) in DomainCount (#) in DomainCount ($) in DomainCount (%) in Domain
Domain LengthQuantidade de vogais in DomainURL domain in IP address formatDomain contains the key words "server" or "client"
Count (.) in DirectoryCount (-) in DirectoryCount (_) in DirectoryCount (/) in Directory
Count (?) in DirectoryCount (=) in DirectoryCount (@) in DirectoryCount (&) in Directory
Count (!) in DirectoryCount ( ) in DirectoryCount (~) in DirectoryCount (,) in Directory
Count (+) in DirectoryCount (*) in DirectoryCount (#) in DirectoryCount ($) in Directory
Count (%) in DirectoryDirectory LengthCount (.) in fileCount (-) in file
Count (_) in fileCount (/) in fileCount (?) in fileCount (=) in file
Count (@) in fileCount (&) in fileCount (!) in fileCount ( ) in file
Count (~) in fileCount (,) in fileCount (+) in fileCount (*) in file
Count (#) in fileCount ($) in fileCount (%) in fileFile length
Count (.) in parametersCount (-) in parametersCount (_) in parametersCount (/) in parameters
Count (?) in parametersCount (=) in parametersCount (@) in parametersCount (&) in parameters
Count (!) in parametersCount ( ) in parametersCount (~) in parametersCount (,) in parameters
Count (+) in parametersCount (*) in parametersCount (#) in parametersCount ($) in parameters
Count (%) in parametersLength of parametersTLD presence in argumentsNumber of parameters
Email present at URLFile extension
190 | 191 | 192 | 193 | 196 | 197 | 198 | 199 | 200 | 201 | 202 |
194 | BLACKLIST 195 |
Presence of the URL in blacklistsPresence of the IP Address in blacklistsPresence of the domain in Blacklists
203 | 204 | 205 | 206 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 |
207 | HOST 208 |
Presence of the domain in RBL (Real-time Blackhole List)Search time (response) domain (lookup)Domain has SPF?Geographical location of IP
AS Number (or ASN)PTR of IPTime (in days) of domain activationTime (in days) of domain expiration
Number of resolved IPsNumber of resolved name servers (NameServers - NS)Number of MX ServersTime-to-live (TTL) value associated with hostname
229 | 230 | 231 | 232 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 |
233 | OTHERS 234 |
Valid TLS / SSL CertificateNumber of redirectsCheck if URL is indexed on GoogleCheck if domain is indexed on Google
Uses URL shortener service
246 | 247 | ## Contributing 248 | 249 | Any contribution is appreciated. 250 | 251 | #### Submitting a Pull Request (PR) 252 | 253 | 1. Clone the project: 254 | ``` 255 | $ git clone https://github.com/lucasayres/url-feature-extractor.git 256 | ``` 257 | 258 | 2. Make your changes in a new git branch: 259 | ``` 260 | $ git checkout -b my-branch master 261 | ``` 262 | 263 | 3. Add your changes. 264 | 265 | 4. Push your branch to Github. 266 | 267 | 5. Create a PR to master. 268 | -------------------------------------------------------------------------------- /pythonwhois/countries3.dat: -------------------------------------------------------------------------------- 1 | "name","iso_name","iso2","iso3","numcode" 2 | "Antigua and Barbuda","ANTIGUA AND BARBUDA","AG","ATG",28 3 | "Bosnia and Herzegovina","BOSNIA AND HERZEGOVINA","BA","BIH",70 4 | "Cocos (Keeling) Islands","COCOS (KEELING) ISLANDS","CC","\N","\N" 5 | "Congo, the Democratic Republic of the","CONGO, THE DEMOCRATIC REPUBLIC OF THE","CD","COD",180 6 | "Cote D'Ivoire","COTE D'IVOIRE","CI","CIV",384 7 | "Fiji","FIJI","FJ","FJI",242 8 | "French Southern Territories","FRENCH SOUTHERN TERRITORIES","TF","\N","\N" 9 | "Heard Island and Mcdonald Islands","HEARD ISLAND AND MCDONALD ISLANDS","HM","\N","\N" 10 | "Holy See (Vatican City State)","HOLY SEE (VATICAN CITY STATE)","VA","VAT",336 11 | "Iran, Islamic Republic of","IRAN, ISLAMIC REPUBLIC OF","IR","IRN",364 12 | "Korea, Democratic People's Republic of","KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF","KP","PRK",408 13 | "Korea, Republic of","KOREA, REPUBLIC OF","KR","KOR",410 14 | "Belarus","BELARUS","BY","BLR",112 15 | "Lao People's Democratic Republic","LAO PEOPLE'S DEMOCRATIC REPUBLIC","LA","LAO",418 16 | "Macedonia, the Former Yugoslav Republic of","MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF","MK","MKD",807 17 | "United States","UNITED STATES","US","USA",840 18 | "Micronesia, Federated States of","MICRONESIA, FEDERATED STATES OF","FM","FSM",583 19 | "Moldova, 
Republic of","MOLDOVA, REPUBLIC OF","MD","MDA",498 20 | "Palestinian Territory, Occupied","PALESTINIAN TERRITORY, OCCUPIED","PS","\N","\N" 21 | "Pitcairn","PITCAIRN","PN","PCN",612 22 | "Reunion","REUNION","RE","REU",638 23 | "Saint Helena","SAINT HELENA","SH","SHN",654 24 | "Saint Kitts and Nevis","SAINT KITTS AND NEVIS","KN","KNA",659 25 | "Saint Pierre and Miquelon","SAINT PIERRE AND MIQUELON","PM","SPM",666 26 | "Sao Tome and Principe","SAO TOME AND PRINCIPE","ST","STP",678 27 | "Serbia and Montenegro","SERBIA AND MONTENEGRO","CS","\N","\N" 28 | "South Georgia and the South Sandwich Islands","SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS","GS","\N","\N" 29 | "Svalbard and Jan Mayen","SVALBARD AND JAN MAYEN","SJ","SJM",744 30 | "Syrian Arab Republic","SYRIAN ARAB REPUBLIC","SY","SYR",760 31 | "Taiwan, Province of China","TAIWAN, PROVINCE OF CHINA","TW","TWN",158 32 | "Tanzania, United Republic of","TANZANIA, UNITED REPUBLIC OF","TZ","TZA",834 33 | "Timor-Leste","TIMOR-LESTE","TL","\N","\N" 34 | "Trinidad and Tobago","TRINIDAD AND TOBAGO","TT","TTO",780 35 | "Mexico","MEXICO","MX","MEX",484 36 | "Myanmar","MYANMAR","MM","MMR",104 37 | "Virgin Islands, British","VIRGIN ISLANDS, BRITISH","VG","VGB",92 38 | "Virgin Islands, U.s.","VIRGIN ISLANDS, U.S.","VI","VIR",850 39 | "Wallis and Futuna","WALLIS AND FUTUNA","WF","WLF",876 40 | "Albania","ALBANIA","AL","ALB",8 41 | "Algeria","ALGERIA","DZ","DZA",12 42 | "American Samoa","AMERICAN SAMOA","AS","ASM",16 43 | "Vanuatu","VANUATU","VU","VUT",548 44 | "Yemen","YEMEN","YE","YEM",887 45 | "Andorra","ANDORRA","AD","AND",20 46 | "Angola","ANGOLA","AO","AGO",24 47 | "Anguilla","ANGUILLA","AI","AIA",660 48 | "Argentina","ARGENTINA","AR","ARG",32 49 | "Armenia","ARMENIA","AM","ARM",51 50 | "Aruba","ARUBA","AW","ABW",533 51 | "Australia","AUSTRALIA","AU","AUS",36 52 | "Austria","AUSTRIA","AT","AUT",40 53 | "Azerbaijan","AZERBAIJAN","AZ","AZE",31 54 | "Bahamas","BAHAMAS","BS","BHS",44 55 | "Bahrain","BAHRAIN","BH","BHR",48 56 | 
"Bangladesh","BANGLADESH","BD","BGD",50 57 | "Barbados","BARBADOS","BB","BRB",52 58 | "Belgium","BELGIUM","BE","BEL",56 59 | "Benin","BENIN","BJ","BEN",204 60 | "Bermuda","BERMUDA","BM","BMU",60 61 | "Bhutan","BHUTAN","BT","BTN",64 62 | "Bolivia","BOLIVIA","BO","BOL",68 63 | "Botswana","BOTSWANA","BW","BWA",72 64 | "Bouvet Island","BOUVET ISLAND","BV","\N","\N" 65 | "Brazil","BRAZIL","BR","BRA",76 66 | "British Indian Ocean Territory","BRITISH INDIAN OCEAN TERRITORY","IO","\N","\N" 67 | "Brunei Darussalam","BRUNEI DARUSSALAM","BN","BRN",96 68 | "Bulgaria","BULGARIA","BG","BGR",100 69 | "Burkina Faso","BURKINA FASO","BF","BFA",854 70 | "Burundi","BURUNDI","BI","BDI",108 71 | "Cambodia","CAMBODIA","KH","KHM",116 72 | "Cameroon","CAMEROON","CM","CMR",120 73 | "Canada","CANADA","CA","CAN",124 74 | "Cape Verde","CAPE VERDE","CV","CPV",132 75 | "Malta","MALTA","MT","MLT",470 76 | "Cayman Islands","CAYMAN ISLANDS","KY","CYM",136 77 | "Chad","CHAD","TD","TCD",148 78 | "Chile","CHILE","CL","CHL",152 79 | "China","CHINA","CN","CHN",156 80 | "Christmas Island","CHRISTMAS ISLAND","CX","\N","\N" 81 | "Colombia","COLOMBIA","CO","COL",170 82 | "Comoros","COMOROS","KM","COM",174 83 | "Cook Islands","COOK ISLANDS","CK","COK",184 84 | "Costa Rica","COSTA RICA","CR","CRI",188 85 | "Croatia","CROATIA","HR","HRV",191 86 | "Cuba","CUBA","CU","CUB",192 87 | "Cyprus","CYPRUS","CY","CYP",196 88 | "Czech Republic","CZECH REPUBLIC","CZ","CZE",203 89 | "Denmark","DENMARK","DK","DNK",208 90 | "Djibouti","DJIBOUTI","DJ","DJI",262 91 | "Dominica","DOMINICA","DM","DMA",212 92 | "Dominican Republic","DOMINICAN REPUBLIC","DO","DOM",214 93 | "Ecuador","ECUADOR","EC","ECU",218 94 | "Egypt","EGYPT","EG","EGY",818 95 | "Equatorial Guinea","EQUATORIAL GUINEA","GQ","GNQ",226 96 | "Eritrea","ERITREA","ER","ERI",232 97 | "Estonia","ESTONIA","EE","EST",233 98 | "Ethiopia","ETHIOPIA","ET","ETH",231 99 | "Faroe Islands","FAROE ISLANDS","FO","FRO",234 100 | "Finland","FINLAND","FI","FIN",246 101 | 
"France","FRANCE","FR","FRA",250 102 | "French Guiana","FRENCH GUIANA","GF","GUF",254 103 | "French Polynesia","FRENCH POLYNESIA","PF","PYF",258 104 | "Gabon","GABON","GA","GAB",266 105 | "Gambia","GAMBIA","GM","GMB",270 106 | "Georgia","GEORGIA","GE","GEO",268 107 | "Germany","GERMANY","DE","DEU",276 108 | "Ghana","GHANA","GH","GHA",288 109 | "Gibraltar","GIBRALTAR","GI","GIB",292 110 | "Greece","GREECE","GR","GRC",300 111 | "Greenland","GREENLAND","GL","GRL",304 112 | "Grenada","GRENADA","GD","GRD",308 113 | "Guadeloupe","GUADELOUPE","GP","GLP",312 114 | "Guam","GUAM","GU","GUM",316 115 | "Guatemala","GUATEMALA","GT","GTM",320 116 | "Guinea","GUINEA","GN","GIN",324 117 | "Guinea-Bissau","GUINEA-BISSAU","GW","GNB",624 118 | "Guyana","GUYANA","GY","GUY",328 119 | "Haiti","HAITI","HT","HTI",332 120 | "Honduras","HONDURAS","HN","HND",340 121 | "Hong Kong","HONG KONG","HK","HKG",344 122 | "Hungary","HUNGARY","HU","HUN",348 123 | "Iceland","ICELAND","IS","ISL",352 124 | "India","INDIA","IN","IND",356 125 | "Indonesia","INDONESIA","ID","IDN",360 126 | "Iraq","IRAQ","IQ","IRQ",368 127 | "Israel","ISRAEL","IL","ISR",376 128 | "Italy","ITALY","IT","ITA",380 129 | "Jamaica","JAMAICA","JM","JAM",388 130 | "Japan","JAPAN","JP","JPN",392 131 | "Jordan","JORDAN","JO","JOR",400 132 | "Kazakhstan","KAZAKHSTAN","KZ","KAZ",398 133 | "Kenya","KENYA","KE","KEN",404 134 | "Kiribati","KIRIBATI","KI","KIR",296 135 | "Kuwait","KUWAIT","KW","KWT",414 136 | "Kyrgyzstan","KYRGYZSTAN","KG","KGZ",417 137 | "Latvia","LATVIA","LV","LVA",428 138 | "Lesotho","LESOTHO","LS","LSO",426 139 | "Liberia","LIBERIA","LR","LBR",430 140 | "Libyan Arab Jamahiriya","LIBYAN ARAB JAMAHIRIYA","LY","LBY",434 141 | "Lithuania","LITHUANIA","LT","LTU",440 142 | "Luxembourg","LUXEMBOURG","LU","LUX",442 143 | "Macao","MACAO","MO","MAC",446 144 | "Madagascar","MADAGASCAR","MG","MDG",450 145 | "Malawi","MALAWI","MW","MWI",454 146 | "Malaysia","MALAYSIA","MY","MYS",458 147 | "Maldives","MALDIVES","MV","MDV",462 148 | 
"Mali","MALI","ML","MLI",466 149 | "Marshall Islands","MARSHALL ISLANDS","MH","MHL",584 150 | "Martinique","MARTINIQUE","MQ","MTQ",474 151 | "Mauritius","MAURITIUS","MU","MUS",480 152 | "Mayotte","MAYOTTE","YT","\N","\N" 153 | "Monaco","MONACO","MC","MCO",492 154 | "Mongolia","MONGOLIA","MN","MNG",496 155 | "Montserrat","MONTSERRAT","MS","MSR",500 156 | "Morocco","MOROCCO","MA","MAR",504 157 | "Mozambique","MOZAMBIQUE","MZ","MOZ",508 158 | "Namibia","NAMIBIA","NA","NAM",516 159 | "Nauru","NAURU","NR","NRU",520 160 | "Nepal","NEPAL","NP","NPL",524 161 | "Netherlands","NETHERLANDS","NL","NLD",528 162 | "New Caledonia","NEW CALEDONIA","NC","NCL",540 163 | "New Zealand","NEW ZEALAND","NZ","NZL",554 164 | "Nicaragua","NICARAGUA","NI","NIC",558 165 | "Niger","NIGER","NE","NER",562 166 | "Nigeria","NIGERIA","NG","NGA",566 167 | "Niue","NIUE","NU","NIU",570 168 | "Norfolk Island","NORFOLK ISLAND","NF","NFK",574 169 | "Northern Mariana Islands","NORTHERN MARIANA ISLANDS","MP","MNP",580 170 | "Norway","NORWAY","NO","NOR",578 171 | "Oman","OMAN","OM","OMN",512 172 | "Pakistan","PAKISTAN","PK","PAK",586 173 | "Palau","PALAU","PW","PLW",585 174 | "Panama","PANAMA","PA","PAN",591 175 | "Paraguay","PARAGUAY","PY","PRY",600 176 | "Peru","PERU","PE","PER",604 177 | "Philippines","PHILIPPINES","PH","PHL",608 178 | "Poland","POLAND","PL","POL",616 179 | "Portugal","PORTUGAL","PT","PRT",620 180 | "Puerto Rico","PUERTO RICO","PR","PRI",630 181 | "Qatar","QATAR","QA","QAT",634 182 | "Romania","ROMANIA","RO","ROM",642 183 | "Russian Federation","RUSSIAN FEDERATION","RU","RUS",643 184 | "Rwanda","RWANDA","RW","RWA",646 185 | "Saint Lucia","SAINT LUCIA","LC","LCA",662 186 | "Saint Vincent and the Grenadines","SAINT VINCENT AND THE GRENADINES","VC","VCT",670 187 | "Samoa","SAMOA","WS","WSM",882 188 | "San Marino","SAN MARINO","SM","SMR",674 189 | "Senegal","SENEGAL","SN","SEN",686 190 | "Seychelles","SEYCHELLES","SC","SYC",690 191 | "Sierra Leone","SIERRA LEONE","SL","SLE",694 192 | 
"Singapore","SINGAPORE","SG","SGP",702 193 | "Slovakia","SLOVAKIA","SK","SVK",703 194 | "Slovenia","SLOVENIA","SI","SVN",705 195 | "Solomon Islands","SOLOMON ISLANDS","SB","SLB",90 196 | "Somalia","SOMALIA","SO","SOM",706 197 | "South Africa","SOUTH AFRICA","ZA","ZAF",710 198 | "Spain","SPAIN","ES","ESP",724 199 | "Sri Lanka","SRI LANKA","LK","LKA",144 200 | "Sudan","SUDAN","SD","SDN",736 201 | "Suriname","SURINAME","SR","SUR",740 202 | "Swaziland","SWAZILAND","SZ","SWZ",748 203 | "Sweden","SWEDEN","SE","SWE",752 204 | "Switzerland","SWITZERLAND","CH","CHE",756 205 | "Thailand","THAILAND","TH","THA",764 206 | "Togo","TOGO","TG","TGO",768 207 | "Tokelau","TOKELAU","TK","TKL",772 208 | "Tonga","TONGA","TO","TON",776 209 | "Tunisia","TUNISIA","TN","TUN",788 210 | "Turkey","TURKEY","TR","TUR",792 211 | "Turkmenistan","TURKMENISTAN","TM","TKM",795 212 | "Turks and Caicos Islands","TURKS AND CAICOS ISLANDS","TC","TCA",796 213 | "Tuvalu","TUVALU","TV","TUV",798 214 | "Uganda","UGANDA","UG","UGA",800 215 | "Ukraine","UKRAINE","UA","UKR",804 216 | "United Kingdom","UNITED KINGDOM","GB","GBR",826 217 | "United States Minor Outlying Islands","UNITED STATES MINOR OUTLYING ISLANDS","UM","\N","\N" 218 | "Uruguay","URUGUAY","UY","URY",858 219 | "Uzbekistan","UZBEKISTAN","UZ","UZB",860 220 | "Venezuela","VENEZUELA","VE","VEN",862 221 | "Viet Nam","VIET NAM","VN","VNM",704 222 | "Western Sahara","WESTERN SAHARA","EH","ESH",732 223 | "Zambia","ZAMBIA","ZM","ZMB",894 224 | "Zimbabwe","ZIMBABWE","ZW","ZWE",716 225 | "Antarctica","ANTARCTICA","AQ","\N","\N" 226 | "Belize","BELIZE","BZ","BLZ",84 227 | "Central African Republic","CENTRAL AFRICAN REPUBLIC","CF","CAF",140 228 | "El Salvador","EL SALVADOR","SV","SLV",222 229 | "Ireland","IRELAND","IE","IRL",372 230 | "Lebanon","LEBANON","LB","LBN",422 231 | "Liechtenstein","LIECHTENSTEIN","LI","LIE",438 232 | "Mauritania","MAURITANIA","MR","MRT",478 233 | "Afghanistan","AFGHANISTAN","AF","AFG",4 234 | "Falkland Islands (Malvinas)","FALKLAND 
ISLANDS (MALVINAS)","FK","FLK",238 235 | "Netherlands Antilles","NETHERLANDS ANTILLES","AN","ANT",530 236 | "Congo","CONGO","CG","COG",178 237 | "Papua New Guinea","PAPUA NEW GUINEA","PG","PNG",598 238 | "Saudi Arabia","SAUDI ARABIA","SA","SAU",682 239 | "Tajikistan","TAJIKISTAN","TJ","TJK",762 240 | "United Arab Emirates","UNITED ARAB EMIRATES","AE","ARE",784 -------------------------------------------------------------------------------- /lib/functions.py: -------------------------------------------------------------------------------- 1 | from urllib import parse 2 | from dns import resolver, reversename 3 | from datetime import datetime 4 | from bs4 import BeautifulSoup 5 | from rblwatch import RBLSearch 6 | from .spf import get_spf_record, check_spf 7 | from .blacklists import google_safebrowsing, phishtank, wot 8 | import re 9 | import pythonwhois 10 | import ipaddress 11 | import requests 12 | import geoip2.database 13 | 14 | PATH = 'lib/files/' 15 | 16 | 17 | def start_url(url): 18 | """Split URL into: protocol, host, path, params, query and fragment.""" 19 | if not parse.urlparse(url.strip()).scheme: 20 | url = 'http://' + url 21 | protocol, host, path, params, query, fragment = parse.urlparse(url.strip()) 22 | 23 | result = { 24 | 'url': host + path + params + query + fragment, 25 | 'protocol': protocol, 26 | 'host': host, 27 | 'path': path, 28 | 'params': params, 29 | 'query': query, 30 | 'fragment': fragment 31 | } 32 | return result 33 | 34 | 35 | def count(text, character): 36 | """Return the amount of certain character in the text.""" 37 | return text.count(character) 38 | 39 | 40 | def count_vowels(text): 41 | """Return the number of vowels.""" 42 | vowels = ['a', 'e', 'i', 'o', 'u'] 43 | count = 0 44 | for i in vowels: 45 | count += text.lower().count(i) 46 | return count 47 | 48 | 49 | def length(text): 50 | """Return the length of a string.""" 51 | return len(text) 52 | 53 | 54 | def valid_ip(host): 55 | """Return if the domain has a valid IP 
format (IPv4 or IPv6).""" 56 | try: 57 | ipaddress.ip_address(host) 58 | return True 59 | except Exception: 60 | return False 61 | 62 | 63 | def valid_email(text): 64 | """Return if there is an email in the text.""" 65 | if re.findall(r'[\w\.-]+@[\w\.-]+', text): 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def check_shortener(url): 72 | """Check if the domain is a shortener.""" 73 | file = open(PATH + 'shorteners.txt', 'r') 74 | for line in file: 75 | with_www = "www." + line.strip() 76 | if line.strip() == url['host'].lower() or with_www == url['host'].lower(): 77 | file.close() 78 | return True 79 | file.close() 80 | return False 81 | 82 | 83 | def check_tld(text): 84 | """Check for presence of Top-Level Domains (TLD).""" 85 | file = open(PATH + 'tlds.txt', 'r') 86 | pattern = re.compile("[a-zA-Z0-9.]") 87 | for line in file: 88 | i = (text.lower().strip()).find(line.strip()) 89 | while i > -1: 90 | if ((i + len(line) - 1) >= len(text)) or not pattern.match(text[i + len(line) - 1]): 91 | file.close() 92 | return True 93 | i = text.find(line.strip(), i + 1) 94 | file.close() 95 | return False 96 | 97 | 98 | def count_tld(text): 99 | """Return amount of Top-Level Domains (TLD) present in the URL.""" 100 | file = open(PATH + 'tlds.txt', 'r') 101 | count = 0 102 | pattern = re.compile("[a-zA-Z0-9.]") 103 | for line in file: 104 | i = (text.lower().strip()).find(line.strip()) 105 | while i > -1: 106 | if ((i + len(line) - 1) >= len(text)) or not pattern.match(text[i + len(line) - 1]): 107 | count += 1 108 | i = text.find(line.strip(), i + 1) 109 | file.close() 110 | return count 111 | 112 | 113 | def count_params(text): 114 | """Return number of parameters.""" 115 | return len(parse.parse_qs(text)) 116 | 117 | 118 | def check_word_server_client(text): 119 | """Return whether the "server" or "client" keywords exist in the domain.""" 120 | if "server" in text.lower() or "client" in text.lower(): 121 | return True 122 | return False 123 | 124 | 125 | def 
count_ips(url): 126 | """Return the number of resolved IPs (IPv4).""" 127 | if valid_ip(url['host']): 128 | return 1 129 | 130 | try: 131 | answers = resolver.query(url['host'], 'A') 132 | return len(answers) 133 | except Exception: 134 | return '?' 135 | 136 | 137 | def count_name_servers(url): 138 | """Return number of NameServers (NS) resolved.""" 139 | count = 0 140 | if count_ips(url): 141 | try: 142 | answers = resolver.query(url['host'], 'NS') 143 | return len(answers) 144 | except (resolver.NoAnswer, resolver.NXDOMAIN): 145 | split_host = url['host'].split('.') 146 | while len(split_host) > 0: 147 | split_host.pop(0) 148 | supposed_domain = '.'.join(split_host) 149 | try: 150 | answers = resolver.query(supposed_domain, 'NS') 151 | count = len(answers) 152 | break 153 | except Exception: 154 | count = 0 155 | except Exception: 156 | count = 0 157 | return count 158 | 159 | 160 | def count_mx_servers(url): 161 | """Return Number of Resolved MX Servers.""" 162 | count = 0 163 | if count_ips(url): 164 | try: 165 | answers = resolver.query(url['host'], 'MX') 166 | return len(answers) 167 | except (resolver.NoAnswer, resolver.NXDOMAIN): 168 | split_host = url['host'].split('.') 169 | while len(split_host) > 0: 170 | split_host.pop(0) 171 | supposed_domain = '.'.join(split_host) 172 | try: 173 | answers = resolver.query(supposed_domain, 'MX') 174 | count = len(answers) 175 | break 176 | except Exception: 177 | count = 0 178 | except Exception: 179 | count = 0 180 | return count 181 | 182 | 183 | def extract_ttl(url): 184 | """Return Time-to-live (TTL) value associated with hostname.""" 185 | try: 186 | ttl = resolver.query(url['host']).rrset.ttl 187 | return ttl 188 | except Exception: 189 | return '?' 
190 | 191 | 192 | def time_activation_domain(url): 193 | """Return time (in days) of domain activation.""" 194 | if url['host'].startswith("www."): 195 | url['host'] = url['host'][4:] 196 | 197 | pythonwhois.net.socket.setdefaulttimeout(3.0) 198 | try: 199 | result_whois = pythonwhois.get_whois(url['host'].lower()) 200 | if not result_whois: 201 | return '?' 202 | creation_date = str(result_whois['creation_date'][0]) 203 | formated_date = " ".join(creation_date.split()[:1]) 204 | d1 = datetime.strptime(formated_date, "%Y-%m-%d") 205 | d2 = datetime.now() 206 | return abs((d2 - d1).days) 207 | except Exception: 208 | return '?' 209 | 210 | 211 | def expiration_date_register(url): 212 | """Retorna time (in days) for register expiration.""" 213 | if url['host'].startswith("www."): 214 | url['host'] = url['host'][4:] 215 | 216 | pythonwhois.net.socket.setdefaulttimeout(3.0) 217 | try: 218 | result_whois = pythonwhois.get_whois(url['host'].lower()) 219 | if not result_whois: 220 | return '?' 221 | expiration_date = str(result_whois['expiration_date'][0]) 222 | formated_date = " ".join(expiration_date.split()[:1]) 223 | d1 = datetime.strptime(formated_date, "%Y-%m-%d") 224 | d2 = datetime.now() 225 | return abs((d1 - d2).days) 226 | except Exception: 227 | return '?' 228 | 229 | 230 | def extract_extension(text): 231 | """Return file extension name.""" 232 | file = open(PATH + 'extensions.txt', 'r') 233 | pattern = re.compile("[a-zA-Z0-9.]") 234 | for extension in file: 235 | i = (text.lower().strip()).find(extension.strip()) 236 | while i > -1: 237 | if ((i + len(extension) - 1) >= len(text)) or not pattern.match(text[i + len(extension) - 1]): 238 | file.close() 239 | return extension.rstrip().split('.')[-1] 240 | i = text.find(extension.strip(), i + 1) 241 | file.close() 242 | return '?' 
def check_ssl(url):
    """Check if the SSL/TLS certificate is valid (GET with verify=True succeeds)."""
    try:
        requests.get(url, verify=True, timeout=3)
        return True
    except Exception:
        return False


def count_redirects(url):
    """Return the number of redirects in a URL, or '?' on request failure."""
    try:
        history = requests.get(url, timeout=3).history
        return len(history) if history else 0
    except Exception:
        return '?'


def _host_ip(url):
    """Resolve url['host'] to an IPv4 address string.

    Returns the host unchanged when it already is an IP literal.
    Shared by get_asn_number/get_country/get_ptr, which previously
    triplicated this lookup.  Raises on resolution failure (callers catch).
    """
    if valid_ip(url['host']):
        return url['host']
    return resolver.query(url['host'], 'A')[0].to_text()


def get_asn_number(url):
    """Return the ASN number associated with the IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        with geoip2.database.Reader(PATH + 'GeoLite2-ASN.mmdb') as reader:
            return reader.asn(ip).autonomous_system_number
    except Exception:
        return '?'


def get_country(url):
    """Return the country (ISO code) associated with IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        # Context manager closes the mmdb reader; the old code leaked it.
        with geoip2.database.Reader(PATH + 'GeoLite2-Country.mmdb') as reader:
            return reader.country(ip).country.iso_code
    except Exception:
        return '?'


def get_ptr(url):
    """Return the PTR record associated with the IP, or '?' on failure."""
    try:
        ip = _host_ip(url)
        if not ip:
            return '?'
        return resolver.query(reversename.from_address(ip), 'PTR')[0].to_text()
    except Exception:
        return '?'
def google_search(url):
    """Check if the url is indexed in google."""
    agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')
    search_url = "https://www.google.com/search?" + parse.urlencode({'q': 'info:' + url})
    try:
        response = requests.get(search_url, headers={'User-Agent': agent})
    except Exception:
        return '?'
    response.encoding = 'ISO-8859-1'
    soup = BeautifulSoup(str(response.content), "html.parser")
    try:
        # A result anchor under #rso means the URL is indexed.
        soup.find(id="rso").find("div").find("div").find("h3").find("a")['href']
        return True
    except AttributeError:
        return False


def valid_spf(domain):
    """Check if within the registered domain has SPF and if it is valid."""
    record = get_spf_record(domain)
    return check_spf(record, domain) if record is not None else False


def check_blacklists(url):
    """Check if the URL or Domain is malicious through Google Safebrowsing, Phishtank, and WOT."""
    return bool(google_safebrowsing(url) or phishtank(url) or wot(url))


def check_blacklists_ip(url):
    """Check if the IP is malicious through Google Safebrowsing, Phishtank and WOT."""
    try:
        if valid_ip(url['host']):
            address = url['host']
        else:
            address = resolver.query(url['host'], 'A')[0].to_text()
        if not address:
            return '?'
        return bool(google_safebrowsing(address) or phishtank(address) or wot(address))
    except Exception:
        return '?'
376 | 377 | 378 | def check_rbl(domain): 379 | """Check domain presence on RBL (Real-time Blackhole List).""" 380 | searcher = RBLSearch(domain) 381 | try: 382 | listed = searcher.listed 383 | except Exception: 384 | return False 385 | for key in listed: 386 | if key == 'SEARCH_HOST': 387 | pass 388 | elif listed[key]['LISTED']: 389 | return True 390 | return False 391 | 392 | 393 | def check_time_response(domain): 394 | """Return the response time in seconds.""" 395 | try: 396 | latency = requests.get(domain, headers={'Cache-Control': 'no-cache'}).elapsed.total_seconds() 397 | return latency 398 | except Exception: 399 | return '?' 400 | 401 | 402 | def read_file(archive): 403 | """Read the file with the URLs.""" 404 | with open(archive, 'r') as f: 405 | urls = ([line.rstrip() for line in f]) 406 | return urls 407 | -------------------------------------------------------------------------------- /extract.py: -------------------------------------------------------------------------------- 1 | from lib.functions import * 2 | import posixpath 3 | import csv 4 | 5 | 6 | def attributes(): 7 | """Output file attributes.""" 8 | lexical = [ 9 | 'qtd_ponto_url', 'qtd_hifen_url', 'qtd_underline_url', 10 | 'qtd_barra_url', 'qtd_interrogacao_url', 'qtd_igual_url', 11 | 'qtd_arroba_url', 'qtd_comercial_url', 'qtd_exclamacao_url', 12 | 'qtd_espaco_url', 'qtd_til_url', 'qtd_virgula_url', 13 | 'qtd_mais_url', 'qtd_asterisco_url', 'qtd_hashtag_url', 14 | 'qtd_cifrao_url', 'qtd_porcento_url', 'qtd_tld_url', 15 | 'comprimento_url', 'qtd_ponto_dominio', 'qtd_hifen_dominio', 16 | 'qtd_underline_dominio', 'qtd_barra_dominio', 'qtd_interrogacao_dominio', 17 | 'qtd_igual_dominio', 'qtd_arroba_dominio', 'qtd_comercial_dominio', 18 | 'qtd_exclamacao_dominio', 'qtd_espaco_dominio', 'qtd_til_dominio', 19 | 'qtd_virgula_dominio', 'qtd_mais_dominio', 'qtd_asterisco_dominio', 20 | 'qtd_hashtag_dominio', 'qtd_cifrao_dominio', 'qtd_porcento_dominio', 21 | 'qtd_vogais_dominio', 
'comprimento_dominio', 'formato_ip_dominio', 22 | 'server_client_dominio', 'qtd_ponto_diretorio', 'qtd_hifen_diretorio', 23 | 'qtd_underline_diretorio', 'qtd_barra_diretorio', 'qtd_interrogacao_diretorio', 24 | 'qtd_igual_diretorio', 'qtd_arroba_diretorio', 'qtd_comercial_diretorio', 25 | 'qtd_exclamacao_diretorio', 'qtd_espaco_diretorio', 'qtd_til_diretorio', 26 | 'qtd_virgula_diretorio', 'qtd_mais_diretorio', 'qtd_asterisco_diretorio', 27 | 'qtd_hashtag_diretorio', 'qtd_cifrao_diretorio', 'qtd_porcento_diretorio', 28 | 'comprimento_diretorio', 'qtd_ponto_arquivo', 'qtd_hifen_arquivo', 29 | 'qtd_underline_arquivo', 'qtd_barra_arquivo', 'qtd_interrogacao_arquivo', 30 | 'qtd_igual_arquivo', 'qtd_arroba_arquivo', 'qtd_comercial_arquivo', 31 | 'qtd_exclamacao_arquivo', 'qtd_espaco_arquivo', 'qtd_til_arquivo', 32 | 'qtd_virgula_arquivo', 'qtd_mais_arquivo', 'qtd_asterisco_arquivo', 33 | 'qtd_hashtag_arquivo', 'qtd_cifrao_arquivo', 'qtd_porcento_arquivo', 34 | 'comprimento_arquivo', 'qtd_ponto_parametros', 'qtd_hifen_parametros', 35 | 'qtd_underline_parametros', 'qtd_barra_parametros', 'qtd_interrogacao_parametros', 36 | 'qtd_igual_parametros', 'qtd_arroba_parametros', 'qtd_comercial_parametros', 37 | 'qtd_exclamacao_parametros', 'qtd_espaco_parametros', 'qtd_til_parametros', 38 | 'qtd_virgula_parametros', 'qtd_mais_parametros', 'qtd_asterisco_parametros', 39 | 'qtd_hashtag_parametros', 'qtd_cifrao_parametros', 'qtd_porcento_parametros', 40 | 'comprimento_parametros', 'presenca_tld_argumentos', 'qtd_parametros', 41 | 'email_na_url', 'extensao_arquivo' 42 | ] 43 | 44 | blacklist = ['url_presente_em_blacklists', 'presenca_ip_blacklists', 'dominio_presente_em_blacklists'] 45 | 46 | host = ['dominio_presente_em_rbl', 'tempo_resposta', 'possui_spf', 'localizacao_geografica_ip', 47 | 'numero_as_ip', 'ptr_ip', 'tempo_ativacao_dominio', 'tempo_expiracao_dominio', 48 | 'qtd_ip_resolvido', 'qtd_nameservers', 'qtd_servidores_mx', 'valor_ttl_associado'] 49 | 50 | others = 
# Special characters counted in every URL component, in the fixed order
# expected by the attributes() CSV header (qtd_ponto_*, qtd_hifen_*, ...).
_SYMBOLS = ['.', '-', '_', '/', '?', '=', '@', '&', '!', ' ',
            '~', ',', '+', '*', '#', '$', '%']


def _symbol_counts(text):
    """Return one stringified count per tracked special character in *text*."""
    return [str(count(text, symbol)) for symbol in _SYMBOLS]


def _missing(n):
    """Return *n* Weka-style '?' markers for an absent URL component."""
    return ['?'] * n


def main(urls, dataset):
    """Extract lexical, blacklist, host and misc features for every URL
    listed in the file *urls* and write one CSV row per URL to *dataset*.

    The column order must match attributes(); the trailing empty field is
    the (unlabelled) 'phishing' class column, filled in by a later step.
    """
    with open(dataset, "w") as output:
        writer = csv.writer(output)
        writer.writerow(attributes())
        for url in read_file(urls):
            print(url)
            dict_url = start_url(url)

            # LEXICAL -- whole URL: symbol counts, TLD count, length.
            url_feats = _symbol_counts(dict_url['url'])
            url_feats.append(str(count_tld(dict_url['url'])))
            url_feats.append(str(length(dict_url['url'])))
            # Emitted near the end of the lexical section, per the header.
            email_exist = str(valid_email(dict_url['url']))

            # LEXICAL -- domain (host): symbol counts plus host-only features.
            host_feats = _symbol_counts(dict_url['host'])
            host_feats.extend([
                str(count_vowels(dict_url['host'])),
                str(length(dict_url['host'])),
                str(valid_ip(dict_url['host'])),
                str(check_word_server_client(dict_url['host'])),
            ])

            # LEXICAL -- directory (path); '?' markers when the URL has none.
            if dict_url['path']:
                path_feats = _symbol_counts(dict_url['path'])
                path_feats.append(str(length(dict_url['path'])))
            else:
                path_feats = _missing(18)  # 17 symbol counts + length

            # LEXICAL -- file name (last path segment).
            if dict_url['path']:
                base = posixpath.basename(dict_url['path'])
                file_feats = _symbol_counts(base)
                file_feats.append(str(length(base)))
                extension = str(extract_extension(base))
            else:
                file_feats = _missing(18)
                extension = '?'  # emitted as the last lexical column

            # LEXICAL -- query-string parameters.
            if dict_url['query']:
                params_feats = _symbol_counts(dict_url['query'])
                params_feats.extend([
                    str(length(dict_url['query'])),
                    str(check_tld(dict_url['query'])),
                    str(count_params(dict_url['query'])),
                ])
            else:
                params_feats = _missing(20)  # 17 counts + length + tld + nparams

            # Full URL / host strings reused by several network checks below.
            full_url = dict_url['protocol'] + '://' + dict_url['url']
            full_host = dict_url['protocol'] + '://' + dict_url['host']

            # BLACKLIST -- URL, resolved IP, and bare domain lookups.
            blacklist_feats = [
                str(check_blacklists(full_url)),
                str(check_blacklists_ip(dict_url)),
                str(check_blacklists(full_host)),
            ]

            # HOST -- DNS / WHOIS / network-level features (header order).
            host_net_feats = [
                str(check_rbl(dict_url['host'])),
                str(check_time_response(full_host)),
                str(valid_spf(dict_url['host'])),
                str(get_country(dict_url)),
                str(get_asn_number(dict_url)),
                str(get_ptr(dict_url)),
                str(time_activation_domain(dict_url)),
                str(expiration_date_register(dict_url)),
                str(count_ips(dict_url)),
                str(count_name_servers(dict_url)),
                str(count_mx_servers(dict_url)),
                str(extract_ttl(dict_url)),
            ]

            # OTHERS -- TLS, redirects, search-engine indexing, shorteners.
            other_feats = [
                str(check_ssl('https://' + dict_url['url'])),
                str(count_redirects(full_url)),
                str(google_search(dict_url['url'])),
                str(google_search(dict_url['host'])),
                str(check_shortener(dict_url)),
            ]

            row = url_feats + host_feats + path_feats + file_feats + params_feats
            row.extend([email_exist, extension])
            row.extend(blacklist_feats)
            row.extend(host_net_feats)
            row.extend(other_feats)
            row.append('')  # empty 'phishing' label column
            writer.writerow(row)
.alibaba 44 | .alipay 45 | .allfinanz 46 | .allstate 47 | .ally 48 | .alsace 49 | .alstom 50 | .am 51 | .americanexpress 52 | .americanfamily 53 | .amex 54 | .amfam 55 | .amica 56 | .amsterdam 57 | .analytics 58 | .android 59 | .anquan 60 | .anz 61 | .ao 62 | .aol 63 | .apartments 64 | .app 65 | .apple 66 | .aq 67 | .aquarelle 68 | .ar 69 | .aramco 70 | .archi 71 | .army 72 | .arpa 73 | .art 74 | .arte 75 | .as 76 | .asda 77 | .asia 78 | .associates 79 | .at 80 | .athleta 81 | .attorney 82 | .au 83 | .auction 84 | .audi 85 | .audible 86 | .audio 87 | .auspost 88 | .author 89 | .auto 90 | .autos 91 | .avianca 92 | .aw 93 | .aws 94 | .ax 95 | .axa 96 | .az 97 | .azure 98 | .ba 99 | .baby 100 | .baidu 101 | .banamex 102 | .bananarepublic 103 | .band 104 | .bank 105 | .bar 106 | .barcelona 107 | .barclaycard 108 | .barclays 109 | .barefoot 110 | .bargains 111 | .baseball 112 | .basketball 113 | .bauhaus 114 | .bayern 115 | .bb 116 | .bbc 117 | .bbt 118 | .bbva 119 | .bcg 120 | .bcn 121 | .bd 122 | .be 123 | .beats 124 | .beauty 125 | .beer 126 | .bentley 127 | .berlin 128 | .best 129 | .bestbuy 130 | .bet 131 | .bf 132 | .bg 133 | .bh 134 | .bharti 135 | .bi 136 | .bible 137 | .bid 138 | .bike 139 | .bing 140 | .bingo 141 | .bio 142 | .biz 143 | .bj 144 | .black 145 | .blackfriday 146 | .blanco 147 | .blockbuster 148 | .blog 149 | .bloomberg 150 | .blue 151 | .bm 152 | .bms 153 | .bmw 154 | .bn 155 | .bnl 156 | .bnpparibas 157 | .bo 158 | .boats 159 | .boehringer 160 | .bofa 161 | .bom 162 | .bond 163 | .boo 164 | .book 165 | .booking 166 | .boots 167 | .bosch 168 | .bostik 169 | .boston 170 | .bot 171 | .boutique 172 | .box 173 | .br 174 | .bradesco 175 | .bridgestone 176 | .broadway 177 | .broker 178 | .brother 179 | .brussels 180 | .bs 181 | .bt 182 | .budapest 183 | .bugatti 184 | .build 185 | .builders 186 | .business 187 | .buy 188 | .buzz 189 | .bv 190 | .bw 191 | .by 192 | .bz 193 | .bzh 194 | .ca 195 | .cab 196 | .cafe 197 | .cal 198 | .call 199 | .calvinklein 
200 | .cam 201 | .camera 202 | .camp 203 | .cancerresearch 204 | .canon 205 | .capetown 206 | .capital 207 | .capitalone 208 | .car 209 | .caravan 210 | .cards 211 | .care 212 | .career 213 | .careers 214 | .cars 215 | .cartier 216 | .casa 217 | .case 218 | .caseih 219 | .cash 220 | .casino 221 | .cat 222 | .catering 223 | .catholic 224 | .cba 225 | .cbn 226 | .cbre 227 | .cbs 228 | .cc 229 | .cd 230 | .ceb 231 | .center 232 | .ceo 233 | .cern 234 | .cf 235 | .cfa 236 | .cfd 237 | .cg 238 | .ch 239 | .chanel 240 | .channel 241 | .chase 242 | .chat 243 | .cheap 244 | .chintai 245 | .chloe 246 | .christmas 247 | .chrome 248 | .chrysler 249 | .church 250 | .ci 251 | .cipriani 252 | .circle 253 | .cisco 254 | .citadel 255 | .citi 256 | .citic 257 | .city 258 | .cityeats 259 | .ck 260 | .cl 261 | .claims 262 | .cleaning 263 | .click 264 | .clinic 265 | .clinique 266 | .clothing 267 | .cloud 268 | .club 269 | .clubmed 270 | .cm 271 | .cn 272 | .co 273 | .coach 274 | .codes 275 | .coffee 276 | .college 277 | .cologne 278 | .com 279 | .comcast 280 | .commbank 281 | .community 282 | .company 283 | .compare 284 | .computer 285 | .comsec 286 | .condos 287 | .construction 288 | .consulting 289 | .contact 290 | .contractors 291 | .cooking 292 | .cookingchannel 293 | .cool 294 | .coop 295 | .corsica 296 | .country 297 | .coupon 298 | .coupons 299 | .courses 300 | .cr 301 | .credit 302 | .creditcard 303 | .creditunion 304 | .cricket 305 | .crown 306 | .crs 307 | .cruise 308 | .cruises 309 | .csc 310 | .cu 311 | .cuisinella 312 | .cv 313 | .cw 314 | .cx 315 | .cy 316 | .cymru 317 | .cyou 318 | .cz 319 | .dabur 320 | .dad 321 | .dance 322 | .data 323 | .date 324 | .dating 325 | .datsun 326 | .day 327 | .dclk 328 | .dds 329 | .de 330 | .deal 331 | .dealer 332 | .deals 333 | .degree 334 | .delivery 335 | .dell 336 | .deloitte 337 | .delta 338 | .democrat 339 | .dental 340 | .dentist 341 | .desi 342 | .design 343 | .dev 344 | .dhl 345 | .diamonds 346 | .diet 347 | .digital 348 | 
.direct 349 | .directory 350 | .discount 351 | .discover 352 | .dish 353 | .diy 354 | .dj 355 | .dk 356 | .dm 357 | .dnp 358 | .do 359 | .docs 360 | .doctor 361 | .dodge 362 | .dog 363 | .doha 364 | .domains 365 | .dot 366 | .download 367 | .drive 368 | .dtv 369 | .dubai 370 | .duck 371 | .dunlop 372 | .duns 373 | .dupont 374 | .durban 375 | .dvag 376 | .dvr 377 | .dz 378 | .earth 379 | .eat 380 | .ec 381 | .eco 382 | .edeka 383 | .edu 384 | .education 385 | .ee 386 | .eg 387 | .email 388 | .emerck 389 | .energy 390 | .engineer 391 | .engineering 392 | .enterprises 393 | .epost 394 | .epson 395 | .equipment 396 | .er 397 | .ericsson 398 | .erni 399 | .es 400 | .esq 401 | .estate 402 | .esurance 403 | .et 404 | .eu 405 | .eurovision 406 | .eus 407 | .events 408 | .everbank 409 | .exchange 410 | .expert 411 | .exposed 412 | .express 413 | .extraspace 414 | .fage 415 | .fail 416 | .fairwinds 417 | .faith 418 | .family 419 | .fan 420 | .fans 421 | .farm 422 | .farmers 423 | .fashion 424 | .fast 425 | .fedex 426 | .feedback 427 | .ferrari 428 | .ferrero 429 | .fi 430 | .fiat 431 | .fidelity 432 | .fido 433 | .film 434 | .final 435 | .finance 436 | .financial 437 | .fire 438 | .firestone 439 | .firmdale 440 | .fish 441 | .fishing 442 | .fit 443 | .fitness 444 | .fj 445 | .fk 446 | .flickr 447 | .flights 448 | .flir 449 | .florist 450 | .flowers 451 | .fly 452 | .fm 453 | .fo 454 | .foo 455 | .food 456 | .foodnetwork 457 | .football 458 | .ford 459 | .forex 460 | .forsale 461 | .forum 462 | .foundation 463 | .fox 464 | .fr 465 | .free 466 | .fresenius 467 | .frl 468 | .frogans 469 | .frontdoor 470 | .frontier 471 | .ftr 472 | .fujitsu 473 | .fujixerox 474 | .fun 475 | .fund 476 | .furniture 477 | .futbol 478 | .fyi 479 | .ga 480 | .gal 481 | .gallery 482 | .gallo 483 | .gallup 484 | .game 485 | .games 486 | .gap 487 | .garden 488 | .gb 489 | .gbiz 490 | .gd 491 | .gdn 492 | .ge 493 | .gea 494 | .gent 495 | .genting 496 | .george 497 | .gf 498 | .gg 499 | .ggee 500 | .gh 
501 | .gi 502 | .gift 503 | .gifts 504 | .gives 505 | .giving 506 | .gl 507 | .glade 508 | .glass 509 | .gle 510 | .global 511 | .globo 512 | .gm 513 | .gmail 514 | .gmbh 515 | .gmo 516 | .gmx 517 | .gn 518 | .godaddy 519 | .gold 520 | .goldpoint 521 | .golf 522 | .goo 523 | .goodhands 524 | .goodyear 525 | .goog 526 | .google 527 | .gop 528 | .got 529 | .gov 530 | .gp 531 | .gq 532 | .gr 533 | .grainger 534 | .graphics 535 | .gratis 536 | .green 537 | .gripe 538 | .group 539 | .gs 540 | .gt 541 | .gu 542 | .guardian 543 | .gucci 544 | .guge 545 | .guide 546 | .guitars 547 | .guru 548 | .gw 549 | .gy 550 | .hair 551 | .hamburg 552 | .hangout 553 | .haus 554 | .hbo 555 | .hdfc 556 | .hdfcbank 557 | .health 558 | .healthcare 559 | .help 560 | .helsinki 561 | .here 562 | .hermes 563 | .hgtv 564 | .hiphop 565 | .hisamitsu 566 | .hitachi 567 | .hiv 568 | .hk 569 | .hkt 570 | .hm 571 | .hn 572 | .hockey 573 | .holdings 574 | .holiday 575 | .homedepot 576 | .homegoods 577 | .homes 578 | .homesense 579 | .honda 580 | .honeywell 581 | .horse 582 | .hospital 583 | .host 584 | .hosting 585 | .hot 586 | .hoteles 587 | .hotmail 588 | .house 589 | .how 590 | .hr 591 | .hsbc 592 | .ht 593 | .htc 594 | .hu 595 | .hughes 596 | .hyatt 597 | .hyundai 598 | .ibm 599 | .icbc 600 | .ice 601 | .icu 602 | .id 603 | .ie 604 | .ieee 605 | .ifm 606 | .ikano 607 | .il 608 | .im 609 | .imamat 610 | .imdb 611 | .immo 612 | .immobilien 613 | .in 614 | .industries 615 | .infiniti 616 | .info 617 | .ing 618 | .ink 619 | .institute 620 | .insurance 621 | .insure 622 | .int 623 | .intel 624 | .international 625 | .intuit 626 | .investments 627 | .io 628 | .ipiranga 629 | .iq 630 | .ir 631 | .irish 632 | .is 633 | .iselect 634 | .ismaili 635 | .ist 636 | .istanbul 637 | .it 638 | .itau 639 | .itv 640 | .iveco 641 | .iwc 642 | .jaguar 643 | .java 644 | .jcb 645 | .jcp 646 | .je 647 | .jeep 648 | .jetzt 649 | .jewelry 650 | .jio 651 | .jlc 652 | .jll 653 | .jm 654 | .jmp 655 | .jnj 656 | .jo 657 | 
.jobs 658 | .joburg 659 | .jot 660 | .joy 661 | .jp 662 | .jpmorgan 663 | .jprs 664 | .juegos 665 | .juniper 666 | .kaufen 667 | .kddi 668 | .ke 669 | .kerryhotels 670 | .kerrylogistics 671 | .kerryproperties 672 | .kfh 673 | .kg 674 | .kh 675 | .ki 676 | .kia 677 | .kim 678 | .kinder 679 | .kindle 680 | .kitchen 681 | .kiwi 682 | .km 683 | .kn 684 | .koeln 685 | .komatsu 686 | .kosher 687 | .kp 688 | .kpmg 689 | .kpn 690 | .kr 691 | .krd 692 | .kred 693 | .kuokgroup 694 | .kw 695 | .ky 696 | .kyoto 697 | .kz 698 | .la 699 | .lacaixa 700 | .ladbrokes 701 | .lamborghini 702 | .lamer 703 | .lancaster 704 | .lancia 705 | .lancome 706 | .land 707 | .landrover 708 | .lanxess 709 | .lasalle 710 | .lat 711 | .latino 712 | .latrobe 713 | .law 714 | .lawyer 715 | .lb 716 | .lc 717 | .lds 718 | .lease 719 | .leclerc 720 | .lefrak 721 | .legal 722 | .lego 723 | .lexus 724 | .lgbt 725 | .li 726 | .liaison 727 | .lidl 728 | .life 729 | .lifeinsurance 730 | .lifestyle 731 | .lighting 732 | .like 733 | .lilly 734 | .limited 735 | .limo 736 | .lincoln 737 | .linde 738 | .link 739 | .lipsy 740 | .live 741 | .living 742 | .lixil 743 | .lk 744 | .loan 745 | .loans 746 | .locker 747 | .locus 748 | .loft 749 | .lol 750 | .london 751 | .lotte 752 | .lotto 753 | .love 754 | .lpl 755 | .lplfinancial 756 | .lr 757 | .ls 758 | .lt 759 | .ltd 760 | .ltda 761 | .lu 762 | .lundbeck 763 | .lupin 764 | .luxe 765 | .luxury 766 | .lv 767 | .ly 768 | .ma 769 | .macys 770 | .madrid 771 | .maif 772 | .maison 773 | .makeup 774 | .man 775 | .management 776 | .mango 777 | .market 778 | .marketing 779 | .markets 780 | .marriott 781 | .marshalls 782 | .maserati 783 | .mattel 784 | .mba 785 | .mc 786 | .mcd 787 | .mcdonalds 788 | .mckinsey 789 | .md 790 | .me 791 | .med 792 | .media 793 | .meet 794 | .melbourne 795 | .meme 796 | .memorial 797 | .men 798 | .menu 799 | .meo 800 | .metlife 801 | .mg 802 | .mh 803 | .miami 804 | .microsoft 805 | .mil 806 | .mini 807 | .mint 808 | .mit 809 | .mitsubishi 810 | 
.mk 811 | .ml 812 | .mlb 813 | .mls 814 | .mm 815 | .mma 816 | .mn 817 | .mo 818 | .mobi 819 | .mobile 820 | .mobily 821 | .moda 822 | .moe 823 | .moi 824 | .mom 825 | .monash 826 | .money 827 | .monster 828 | .montblanc 829 | .mopar 830 | .mormon 831 | .mortgage 832 | .moscow 833 | .moto 834 | .motorcycles 835 | .mov 836 | .movie 837 | .movistar 838 | .mp 839 | .mq 840 | .mr 841 | .ms 842 | .msd 843 | .mt 844 | .mtn 845 | .mtpc 846 | .mtr 847 | .mu 848 | .museum 849 | .mutual 850 | .mv 851 | .mw 852 | .mx 853 | .my 854 | .mz 855 | .na 856 | .nab 857 | .nadex 858 | .nagoya 859 | .name 860 | .nationwide 861 | .natura 862 | .navy 863 | .nba 864 | .nc 865 | .ne 866 | .nec 867 | .net 868 | .netbank 869 | .netflix 870 | .network 871 | .neustar 872 | .new 873 | .newholland 874 | .news 875 | .next 876 | .nextdirect 877 | .nexus 878 | .nf 879 | .nfl 880 | .ng 881 | .ngo 882 | .nhk 883 | .ni 884 | .nico 885 | .nike 886 | .nikon 887 | .ninja 888 | .nissan 889 | .nissay 890 | .nl 891 | .no 892 | .nokia 893 | .northwesternmutual 894 | .norton 895 | .now 896 | .nowruz 897 | .nowtv 898 | .np 899 | .nr 900 | .nra 901 | .nrw 902 | .ntt 903 | .nu 904 | .nyc 905 | .nz 906 | .obi 907 | .observer 908 | .off 909 | .office 910 | .okinawa 911 | .olayan 912 | .olayangroup 913 | .oldnavy 914 | .ollo 915 | .om 916 | .omega 917 | .one 918 | .ong 919 | .onl 920 | .online 921 | .onyourside 922 | .ooo 923 | .open 924 | .oracle 925 | .orange 926 | .org 927 | .organic 928 | .orientexpress 929 | .origins 930 | .osaka 931 | .otsuka 932 | .ott 933 | .ovh 934 | .pa 935 | .page 936 | .pamperedchef 937 | .panasonic 938 | .panerai 939 | .paris 940 | .pars 941 | .partners 942 | .parts 943 | .party 944 | .passagens 945 | .pay 946 | .pccw 947 | .pe 948 | .pet 949 | .pf 950 | .pfizer 951 | .pg 952 | .ph 953 | .pharmacy 954 | .philips 955 | .phone 956 | .photo 957 | .photography 958 | .photos 959 | .physio 960 | .piaget 961 | .pics 962 | .pictet 963 | .pictures 964 | .pid 965 | .pin 966 | .ping 967 | .pink 
968 | .pioneer 969 | .pizza 970 | .pk 971 | .pl 972 | .place 973 | .play 974 | .playstation 975 | .plumbing 976 | .plus 977 | .pm 978 | .pn 979 | .pnc 980 | .pohl 981 | .poker 982 | .politie 983 | .porn 984 | .post 985 | .pr 986 | .pramerica 987 | .praxi 988 | .press 989 | .prime 990 | .pro 991 | .prod 992 | .productions 993 | .prof 994 | .progressive 995 | .promo 996 | .properties 997 | .property 998 | .protection 999 | .pru 1000 | .prudential 1001 | .ps 1002 | .pt 1003 | .pub 1004 | .pw 1005 | .pwc 1006 | .py 1007 | .qa 1008 | .qpon 1009 | .quebec 1010 | .quest 1011 | .qvc 1012 | .racing 1013 | .radio 1014 | .raid 1015 | .re 1016 | .read 1017 | .realestate 1018 | .realtor 1019 | .realty 1020 | .recipes 1021 | .red 1022 | .redstone 1023 | .redumbrella 1024 | .rehab 1025 | .reise 1026 | .reisen 1027 | .reit 1028 | .reliance 1029 | .ren 1030 | .rent 1031 | .rentals 1032 | .repair 1033 | .report 1034 | .republican 1035 | .rest 1036 | .restaurant 1037 | .review 1038 | .reviews 1039 | .rexroth 1040 | .rich 1041 | .richardli 1042 | .ricoh 1043 | .rightathome 1044 | .ril 1045 | .rio 1046 | .rip 1047 | .rmit 1048 | .ro 1049 | .rocher 1050 | .rocks 1051 | .rodeo 1052 | .rogers 1053 | .room 1054 | .rs 1055 | .rsvp 1056 | .ru 1057 | .ruhr 1058 | .run 1059 | .rw 1060 | .rwe 1061 | .ryukyu 1062 | .sa 1063 | .saarland 1064 | .safe 1065 | .safety 1066 | .sakura 1067 | .sale 1068 | .salon 1069 | .samsclub 1070 | .samsung 1071 | .sandvik 1072 | .sandvikcoromant 1073 | .sanofi 1074 | .sap 1075 | .sapo 1076 | .sarl 1077 | .sas 1078 | .save 1079 | .saxo 1080 | .sb 1081 | .sbi 1082 | .sbs 1083 | .sc 1084 | .sca 1085 | .scb 1086 | .schaeffler 1087 | .schmidt 1088 | .scholarships 1089 | .school 1090 | .schule 1091 | .schwarz 1092 | .science 1093 | .scjohnson 1094 | .scor 1095 | .scot 1096 | .sd 1097 | .se 1098 | .seat 1099 | .secure 1100 | .security 1101 | .seek 1102 | .select 1103 | .sener 1104 | .services 1105 | .ses 1106 | .seven 1107 | .sew 1108 | .sex 1109 | .sexy 1110 | .sfr 1111 
| .sg 1112 | .sh 1113 | .shangrila 1114 | .sharp 1115 | .shaw 1116 | .shell 1117 | .shia 1118 | .shiksha 1119 | .shoes 1120 | .shop 1121 | .shopping 1122 | .shouji 1123 | .show 1124 | .showtime 1125 | .shriram 1126 | .si 1127 | .silk 1128 | .sina 1129 | .singles 1130 | .site 1131 | .sj 1132 | .sk 1133 | .ski 1134 | .skin 1135 | .sky 1136 | .skype 1137 | .sl 1138 | .sling 1139 | .sm 1140 | .smart 1141 | .smile 1142 | .sn 1143 | .sncf 1144 | .so 1145 | .soccer 1146 | .social 1147 | .softbank 1148 | .software 1149 | .sohu 1150 | .solar 1151 | .solutions 1152 | .song 1153 | .sony 1154 | .soy 1155 | .space 1156 | .spiegel 1157 | .spot 1158 | .spreadbetting 1159 | .sr 1160 | .srl 1161 | .srt 1162 | .st 1163 | .stada 1164 | .staples 1165 | .star 1166 | .starhub 1167 | .statebank 1168 | .statefarm 1169 | .statoil 1170 | .stc 1171 | .stcgroup 1172 | .stockholm 1173 | .storage 1174 | .store 1175 | .stream 1176 | .studio 1177 | .study 1178 | .style 1179 | .su 1180 | .sucks 1181 | .supplies 1182 | .supply 1183 | .support 1184 | .surf 1185 | .surgery 1186 | .suzuki 1187 | .sv 1188 | .swatch 1189 | .swiftcover 1190 | .swiss 1191 | .sx 1192 | .sy 1193 | .sydney 1194 | .symantec 1195 | .systems 1196 | .sz 1197 | .tab 1198 | .taipei 1199 | .talk 1200 | .taobao 1201 | .target 1202 | .tatamotors 1203 | .tatar 1204 | .tattoo 1205 | .tax 1206 | .taxi 1207 | .tc 1208 | .tci 1209 | .td 1210 | .tdk 1211 | .team 1212 | .tech 1213 | .technology 1214 | .tel 1215 | .telecity 1216 | .telefonica 1217 | .temasek 1218 | .tennis 1219 | .teva 1220 | .tf 1221 | .tg 1222 | .th 1223 | .thd 1224 | .theater 1225 | .theatre 1226 | .tiaa 1227 | .tickets 1228 | .tienda 1229 | .tiffany 1230 | .tips 1231 | .tires 1232 | .tirol 1233 | .tj 1234 | .tjmaxx 1235 | .tjx 1236 | .tk 1237 | .tkmaxx 1238 | .tl 1239 | .tm 1240 | .tmall 1241 | .tn 1242 | .to 1243 | .today 1244 | .tokyo 1245 | .tools 1246 | .top 1247 | .toray 1248 | .toshiba 1249 | .total 1250 | .tours 1251 | .town 1252 | .toyota 1253 | .toys 1254 | .tr 
1255 | .trade 1256 | .trading 1257 | .training 1258 | .travel 1259 | .travelchannel 1260 | .travelers 1261 | .travelersinsurance 1262 | .trust 1263 | .trv 1264 | .tt 1265 | .tube 1266 | .tui 1267 | .tunes 1268 | .tushu 1269 | .tv 1270 | .tvs 1271 | .tw 1272 | .tz 1273 | .ua 1274 | .ubank 1275 | .ubs 1276 | .uconnect 1277 | .ug 1278 | .uk 1279 | .unicom 1280 | .university 1281 | .uno 1282 | .uol 1283 | .ups 1284 | .us 1285 | .uy 1286 | .uz 1287 | .va 1288 | .vacations 1289 | .vana 1290 | .vanguard 1291 | .vc 1292 | .ve 1293 | .vegas 1294 | .ventures 1295 | .verisign 1296 | .versicherung 1297 | .vet 1298 | .vg 1299 | .vi 1300 | .viajes 1301 | .video 1302 | .vig 1303 | .viking 1304 | .villas 1305 | .vin 1306 | .vip 1307 | .virgin 1308 | .visa 1309 | .vision 1310 | .vista 1311 | .vistaprint 1312 | .viva 1313 | .vivo 1314 | .vlaanderen 1315 | .vn 1316 | .vodka 1317 | .volkswagen 1318 | .volvo 1319 | .vote 1320 | .voting 1321 | .voto 1322 | .voyage 1323 | .vu 1324 | .vuelos 1325 | .wales 1326 | .walmart 1327 | .walter 1328 | .wang 1329 | .wanggou 1330 | .warman 1331 | .watch 1332 | .watches 1333 | .weather 1334 | .weatherchannel 1335 | .webcam 1336 | .weber 1337 | .website 1338 | .wed 1339 | .wedding 1340 | .weibo 1341 | .weir 1342 | .wf 1343 | .whoswho 1344 | .wien 1345 | .wiki 1346 | .williamhill 1347 | .win 1348 | .windows 1349 | .wine 1350 | .winners 1351 | .wme 1352 | .wolterskluwer 1353 | .woodside 1354 | .work 1355 | .works 1356 | .world 1357 | .wow 1358 | .ws 1359 | .wtc 1360 | .wtf 1361 | .xbox 1362 | .xerox 1363 | .xfinity 1364 | .xihuan 1365 | .xin 1366 | .xperia 1367 | .xxx 1368 | .xyz 1369 | .yachts 1370 | .yahoo 1371 | .yamaxun 1372 | .yandex 1373 | .ye 1374 | .yodobashi 1375 | .yoga 1376 | .yokohama 1377 | .you 1378 | .youtube 1379 | .yt 1380 | .yun 1381 | .za 1382 | .zappos 1383 | .zara 1384 | .zero 1385 | .zip 1386 | .zippo 1387 | .zm 1388 | .zone 1389 | .zuerich 1390 | .zw 1391 | 
-------------------------------------------------------------------------------- /pythonwhois/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re, sys, datetime, csv, pkgutil 3 | from . import net, shared 4 | 5 | try: 6 | from io import StringIO 7 | except ImportError: 8 | from cStringIO import StringIO 9 | 10 | def pkgdata(name): 11 | data = pkgutil.get_data("pythonwhois", name) 12 | if sys.version_info < (3, 0): 13 | return data 14 | else: 15 | return data.decode("utf-8") 16 | 17 | def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False): 18 | try: 19 | if is_dict: 20 | reader = csv.DictReader(pkgdata(filename).splitlines()) 21 | else: 22 | reader = csv.reader(pkgdata(filename).splitlines()) 23 | 24 | for line in reader: 25 | destination[line[abbrev_key]] = line[name_key] 26 | except IOError as e: 27 | pass 28 | 29 | airports = {} 30 | countries = {} 31 | states_au = {} 32 | states_us = {} 33 | states_ca = {} 34 | 35 | try: 36 | reader = csv.reader(pkgdata("airports.dat").splitlines()) 37 | 38 | for line in reader: 39 | airports[line[4]] = line[2] 40 | airports[line[5]] = line[2] 41 | except IOError as e: 42 | # The distributor likely removed airports.dat for licensing reasons. We'll just leave an empty dict. 
43 | pass 44 | 45 | read_dataset("countries.dat", countries, "iso", "name", is_dict=True) 46 | read_dataset("countries3.dat", countries, "iso3", "name", is_dict=True) 47 | read_dataset("states_au.dat", states_au, 0, 1) 48 | read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True) 49 | read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True) 50 | 51 | def precompile_regexes(source, flags=0): 52 | return [re.compile(regex, flags) for regex in source] 53 | 54 | grammar = { 55 | "_data": { 56 | 'id': ['Domain ID:[ ]*(?P.+)'], 57 | 'status': ['\[Status\]\s*(?P.+)', 58 | 'Status\s*:\s?(?P.+)', 59 | '\[State\]\s*(?P.+)', 60 | '^state:\s*(?P.+)'], 61 | 'creation_date': ['\[Created on\]\s*(?P.+)', 62 | 'Created on[.]*: [a-zA-Z]+, (?P.+)', 63 | 'Creation Date:\s?(?P.+)', 64 | 'Creation date\s*:\s?(?P.+)', 65 | 'Registration Date:\s?(?P.+)', 66 | 'Created Date:\s?(?P.+)', 67 | 'Created on:\s?(?P.+)', 68 | 'Created on\s?[.]*:\s?(?P.+)\.', 69 | 'Date Registered\s?[.]*:\s?(?P.+)', 70 | 'Domain Created\s?[.]*:\s?(?P.+)', 71 | 'Domain registered\s?[.]*:\s?(?P.+)', 72 | 'Domain record activated\s?[.]*:\s*?(?P.+)', 73 | 'Record created on\s?[.]*:?\s*?(?P.+)', 74 | 'Record created\s?[.]*:?\s*?(?P.+)', 75 | 'Created\s?[.]*:?\s*?(?P.+)', 76 | 'Registered on\s?[.]*:?\s*?(?P.+)', 77 | 'Registered\s?[.]*:?\s*?(?P.+)', 78 | 'Domain Create Date\s?[.]*:?\s*?(?P.+)', 79 | 'Domain Registration Date\s?[.]*:?\s*?(?P.+)', 80 | 'created:\s*(?P.+)', 81 | '\[Registered Date\]\s*(?P.+)', 82 | 'created-date:\s*(?P.+)', 83 | 'Domain Name Commencement Date: (?P.+)', 84 | 'registered:\s*(?P.+)', 85 | 'registration:\s*(?P.+)'], 86 | 'expiration_date': ['\[Expires on\]\s*(?P.+)', 87 | 'Registrar Registration Expiration Date:[ ]*(?P.+)-[0-9]{4}', 88 | 'Expires on[.]*: [a-zA-Z]+, (?P.+)', 89 | 'Expiration Date:\s?(?P.+)', 90 | 'Expiration date\s*:\s?(?P.+)', 91 | 'Expires on:\s?(?P.+)', 92 | 'Expires on\s?[.]*:\s?(?P.+)\.', 93 | 'Exp(?:iry)? 
Date\s?[.]*:\s?(?P.+)', 94 | 'Expiry\s*:\s?(?P.+)', 95 | 'Domain Currently Expires\s?[.]*:\s?(?P.+)', 96 | 'Record will expire on\s?[.]*:\s?(?P.+)', 97 | 'Domain expires\s?[.]*:\s*?(?P.+)', 98 | 'Record expires on\s?[.]*:?\s*?(?P.+)', 99 | 'Record expires\s?[.]*:?\s*?(?P.+)', 100 | 'Expires\s?[.]*:?\s*?(?P.+)', 101 | 'Expire Date\s?[.]*:?\s*?(?P.+)', 102 | 'Expired\s?[.]*:?\s*?(?P.+)', 103 | 'Domain Expiration Date\s?[.]*:?\s*?(?P.+)', 104 | 'paid-till:\s*(?P.+)', 105 | 'expiration_date:\s*(?P.+)', 106 | 'expire-date:\s*(?P.+)', 107 | 'renewal:\s*(?P.+)', 108 | 'expires:\s*(?P.+)', 109 | 'expire:\s*(?P.+)'], 110 | 'updated_date': ['\[Last Updated\]\s*(?P.+)', 111 | 'Record modified on[.]*: (?P.+) [a-zA-Z]+', 112 | 'Record last updated on[.]*: [a-zA-Z]+, (?P.+)', 113 | 'Updated Date:\s?(?P.+)', 114 | 'Updated date\s*:\s?(?P.+)', 115 | #'Database last updated on\s?[.]*:?\s*?(?P.+)\s[a-z]+\.?', 116 | 'Record last updated on\s?[.]*:?\s?(?P.+)\.', 117 | 'Domain record last updated\s?[.]*:\s*?(?P.+)', 118 | 'Domain Last Updated\s?[.]*:\s*?(?P.+)', 119 | 'Last updated on:\s?(?P.+)', 120 | 'Date Modified\s?[.]*:\s?(?P.+)', 121 | 'Last Modified\s?[.]*:\s?(?P.+)', 122 | 'Domain Last Updated Date\s?[.]*:\s?(?P.+)', 123 | 'Record last updated\s?[.]*:\s?(?P.+)', 124 | 'Modified\s?[.]*:\s?(?P.+)', 125 | '(C|c)hanged:\s*(?P.+)', 126 | 'last_update:\s*(?P.+)', 127 | 'Last Update\s?[.]*:\s?(?P.+)', 128 | 'Last updated on (?P.+) [a-z]{3,4}', 129 | 'Last updated:\s*(?P.+)', 130 | 'last-updated:\s*(?P.+)', 131 | '\[Last Update\]\s*(?P.+) \([A-Z]+\)', 132 | 'Last update of whois database:\s?[a-z]{3}, (?P.+) [a-z]{3,4}'], 133 | 'registrar': ['registrar:\s*(?P.+)', 134 | 'Registrar:\s*(?P.+)', 135 | 'Sponsoring Registrar Organization:\s*(?P.+)', 136 | 'Registered through:\s?(?P.+)', 137 | 'Registrar Name[.]*:\s?(?P.+)', 138 | 'Record maintained by:\s?(?P.+)', 139 | 'Registration Service Provided By:\s?(?P.+)', 140 | 'Registrar of Record:\s?(?P.+)', 141 | 'Domain Registrar :\s?(?P.+)', 
142 | 'Registration Service Provider: (?P.+)', 143 | '\tName:\t\s(?P.+)'], 144 | 'whois_server': ['Whois Server:\s?(?P.+)', 145 | 'Registrar Whois:\s?(?P.+)'], 146 | 'nameservers': ['Name Server:[ ]*(?P[^ ]+)', 147 | 'Nameservers:[ ]*(?P[^ ]+)', 148 | '(?<=[ .]{2})(?P([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', 149 | 'nameserver:\s*(?P.+)', 150 | 'nserver:\s*(?P[^[\s]+)', 151 | 'Name Server[.]+ (?P[^[\s]+)', 152 | 'Hostname:\s*(?P[^\s]+)', 153 | 'DNS[0-9]+:\s*(?P.+)', 154 | ' DNS:\s*(?P.+)', 155 | 'ns[0-9]+:\s*(?P.+)', 156 | 'NS [0-9]+\s*:\s*(?P.+)', 157 | '\[Name Server\]\s*(?P.+)', 158 | '(?<=[ .]{2})(?P[a-z0-9-]+\.d?ns[0-9]*\.([a-z0-9-]+\.)+[a-z0-9]+)', 159 | '(?<=[ .]{2})(?P([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', 160 | '(?<=[ .]{2})[^a-z0-9.-](?Pd?ns\.([a-z0-9-]+\.)+[a-z0-9]+)', 161 | 'Nserver:\s*(?P.+)'], 162 | 'emails': ['(?P[\w.-]+@[\w.-]+\.[\w]{2,6})', # Really need to fix this, much longer TLDs now exist... 163 | '(?P[\w.-]+\sAT\s[\w.-]+\sDOT\s[\w]{2,6})'] 164 | }, 165 | "_dateformats": ( 166 | '(?P[0-9]{1,2})[./ -](?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{4}|[0-9]{2})' 167 | '(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?', 168 | '[a-z]{3}\s(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{1,2})(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?\s[a-z]{3}\s(?P[0-9]{4}|[0-9]{2})', 169 | '[a-zA-Z]+\s(?P[0-9]{1,2})(?:st|nd|rd|th)\s(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s(?P[0-9]{4})', 170 | '(?P[0-9]{4})[./-]?(?P[0-9]{2})[./-]?(?P[0-9]{2})(\s|T|/)((?P[0-9]{1,2})[:.-](?P[0-9]{1,2})[:.-](?P[0-9]{1,2}))', 171 | '(?P[0-9]{4})[./-](?P[0-9]{1,2})[./-](?P[0-9]{1,2})', 172 | '(?P[0-9]{4})(?P[0-9]{1,2})(?P[0-9]{1,2})', 173 | '(?P[0-9]{1,2})[./ -](?P[0-9]{1,2})[./ -](?P[0-9]{4}|[0-9]{2})', 174 | '(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (?P[0-9]{1,2}),? 
(?P[0-9]{4})', 175 | '(?P[0-9]{1,2})-(?PJanuary|February|March|April|May|June|July|August|September|October|November|December)-(?P[0-9]{4})', 176 | ), 177 | "_months": { 178 | 'jan': 1, 179 | 'january': 1, 180 | 'feb': 2, 181 | 'february': 2, 182 | 'mar': 3, 183 | 'march': 3, 184 | 'apr': 4, 185 | 'april': 4, 186 | 'may': 5, 187 | 'jun': 6, 188 | 'june': 6, 189 | 'jul': 7, 190 | 'july': 7, 191 | 'aug': 8, 192 | 'august': 8, 193 | 'sep': 9, 194 | 'sept': 9, 195 | 'september': 9, 196 | 'oct': 10, 197 | 'october': 10, 198 | 'nov': 11, 199 | 'november': 11, 200 | 'dec': 12, 201 | 'december': 12 202 | } 203 | } 204 | 205 | def preprocess_regex(regex): 206 | # Fix for #2; prevents a ridiculous amount of varying size permutations. 207 | regex = re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\s*(?P<\1>\S.*)", regex) 208 | # Experimental fix for #18; removes unnecessary variable-size whitespace 209 | # matching, since we're stripping results anyway. 210 | regex = re.sub(r"\[ \]\*\(\?P<([^>]+)>\.\*\)", r"(?P<\1>.*)", regex) 211 | return regex 212 | 213 | registrant_regexes = [ 214 | " Registrant:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 215 | "Registrant:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 216 | "(?:Registrant ID:(?P.+)\n)?Registrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Street1?:(?P.*)\n(?:Registrant Street2:(?P.*)\n)?(?:Registrant Street3:(?P.*)\n)?Registrant City:(?P.*)\nRegistrant State/Province:(?P.*)\nRegistrant Postal Code:(?P.*)\nRegistrant Country:(?P.*)\nRegistrant Phone:(?P.*)\n(?:Registrant Phone Ext.:(?P.*)\n)?(?:Registrant FAX:(?P.*)\n)?(?:Registrant FAX Ext.:(?P.*)\n)?Registrant Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 217 | "Registrant ID:(?P.+)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Address1?:(?P.*)\n(?:Registrant Address2:(?P.*)\n)?(?:Registrant Address3:(?P.*)\n)?Registrant City:(?P.*)\nRegistrant State/Province:(?P.*)\nRegistrant Country/Economy:(?P.*)\nRegistrant Postal Code:(?P.*)\nRegistrant Phone:(?P.*)\n(?:Registrant Phone Ext.:(?P.*)\n)?(?:Registrant FAX:(?P.*)\n)?(?:Registrant FAX Ext.:(?P.*)\n)?Registrant E-mail:(?P.*)", # .ME, DotAsia 218 | "Registrant ID:\s*(?P.+)\nRegistrant Name:\s*(?P.+)\nRegistrant Organization:\s*(?P.*)\nRegistrant Address1:\s*(?P.+)\nRegistrant Address2:\s*(?P.*)\nRegistrant City:\s*(?P.+)\nRegistrant State/Province:\s*(?P.+)\nRegistrant Postal Code:\s*(?P.+)\nRegistrant Country:\s*(?P.+)\nRegistrant Country Code:\s*(?P.+)\nRegistrant Phone Number:\s*(?P.+)\nRegistrant Email:\s*(?P.+)\n", # .CO Internet 219 | "Registrant Contact: (?P.+)\nRegistrant Organization: (?P.+)\nRegistrant Name: (?P.+)\nRegistrant Street: (?P.+)\nRegistrant City: (?P.+)\nRegistrant Postal Code: (?P.+)\nRegistrant State: (?P.+)\nRegistrant Country: (?P.+)\nRegistrant Phone: (?P.*)\nRegistrant Phone Ext: (?P.*)\nRegistrant Fax: (?P.*)\nRegistrant Fax Ext: (?P.*)\nRegistrant Email: (?P.*)\n", # Key-Systems GmbH 220 | "(?:Registrant ID:[ ]*(?P.*)\n)?Registrant Name:[ ]*(?P.*)\n(?:Registrant Organization:[ ]*(?P.*)\n)?Registrant Street:[ ]*(?P.+)\n(?:Registrant Street:[ 
]*(?P.+)\n)?(?:Registrant Street:[ ]*(?P.+)\n)?Registrant City:[ ]*(?P.+)\nRegistrant State(?:\/Province)?:[ ]*(?P.*)\nRegistrant Postal Code:[ ]*(?P.+)\nRegistrant Country:[ ]*(?P.+)\n(?:Registrant Phone:[ ]*(?P.*)\n)?(?:Registrant Phone Ext:[ ]*(?P.*)\n)?(?:Registrant Fax:[ ]*(?P.*)\n)?(?:Registrant Fax Ext:[ ]*(?P.*)\n)?(?:Registrant Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 221 | "Registrant\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 222 | " Registrant Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 223 | "owner-id:[ ]*(?P.*)\n(?:owner-organization:[ ]*(?P.*)\n)?owner-name:[ ]*(?P.*)\nowner-street:[ ]*(?P.*)\nowner-city:[ ]*(?P.*)\nowner-zip:[ ]*(?P.*)\nowner-country:[ ]*(?P.*)\n(?:owner-phone:[ ]*(?P.*)\n)?(?:owner-fax:[ ]*(?P.*)\n)?owner-email:[ ]*(?P.*)", # InterNetworX 224 | "Registrant:\n registrant_org: (?P.*)\n registrant_name: (?P.*)\n registrant_email: (?P.*)\n registrant_address: (?P
.*)\n registrant_city: (?P.*)\n registrant_state: (?P.*)\n registrant_zip: (?P.*)\n registrant_country: (?P.*)\n registrant_phone: (?P.*)", # Bellnames 225 | "Holder of domain name:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\nContractual Language", # nic.ch 226 | "\n\n(?:Owner)?\s+: (?P.*)\n(?:\s+: (?P.*)\n)?\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n", # nic.io 227 | "Contact Information:\n\[Name\]\s*(?P.*)\n\[Email\]\s*(?P.*)\n\[Web Page\]\s*(?P.*)\n\[Postal code\]\s*(?P.*)\n\[Postal Address\]\s*(?P.*)\n(?:\s+(?P.*)\n)?(?:\s+(?P.*)\n)?\[Phone\]\s*(?P.*)\n\[Fax\]\s*(?P.*)\n", # jprs.jp 228 | "g\. \[Organization\] (?P.+)\n", # .co.jp registrations at jprs.jp 229 | "Registrant ID:(?P.*)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Address1:(?P.*)\n(?:Registrant Address2:(?P.*)\n)?(?:Registrant Address3:(?P.*)\n)?Registrant City:(?P.*)\n(?:Registrant State/Province:(?P.*)\n)?(?:Registrant Postal Code:(?P.*)\n)?Registrant Country:(?P.*)\nRegistrant Country Code:.*\nRegistrant Phone Number:(?P.*)\n(?:Registrant Facsimile Number:(?P.*)\n)?Registrant Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 230 | "Registrant\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it 231 | " Organisation Name[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n(?: Organisation Address[.]* (?P.*)\n)? Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)", # Melbourne IT (what a horrid format...) 
232 | "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Name:[ ]*(?P.+)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n", # .au business 233 | "Eligibility Type:[ ]*Citizen\/Resident\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au individual 234 | "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Type:[ ]*(Higher Education Institution|Company|Incorporated Association|Other)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au educational, company, 'incorporated association' (non-profit?), other (spotted for linux.conf.au, unsure if also for others) 235 | " Registrant:\n (?P.+)\n\n Registrant type:\n .*\n\n Registrant's address:\n The registrant .* opted to have", # Nominet (.uk) with hidden address 236 | " Registrant:\n (?P.+)\n\n[\s\S]* Registrant type:\n .*\n\n Registrant's address:\n (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)??)?? (?P[^0-9\n]+)\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n\n", # Nominet (.uk) with visible address 237 | "Domain Owner:\n\t(?P.+)\n\n[\s\S]*?(?:Registrant Contact:\n\t(?P.+))?\n\nRegistrant(?:'s)? (?:a|A)ddress:(?:\n\t(?P.+)\n(?:\t(?P.+)\n)?(?:\t(?P.+)\n)?\t(?P.+)\n\t(?P.+))?\n\t(?P.+)(?:\n\t(?P.+) \(Phone\)\n\t(?P.+) \(FAX\)\n\t(?P.+))?\n\n", # .ac.uk - what a mess... 
238 | "Registrant ID: (?P.+)\nRegistrant: (?P.+)\nRegistrant Contact Email: (?P.+)", # .cn (CNNIC) 239 | "Registrant contact:\n (?P.+)\n (?P.*)\n (?P.+), (?P.+) (?P.+) (?P.+)\n\n", # Fabulous.com 240 | "registrant-name:\s*(?P.+)\nregistrant-type:\s*(?P.+)\nregistrant-address:\s*(?P.+)\nregistrant-postcode:\s*(?P.+)\nregistrant-city:\s*(?P.+)\nregistrant-country:\s*(?P.+)\n(?:registrant-phone:\s*(?P.+)\n)?(?:registrant-email:\s*(?P.+)\n)?", # Hetzner 241 | "Registrant Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 242 | "Contact Information : For Customer # [0-9]+[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication alternative (private WHOIS) format? 243 | "Registrant:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 244 | " Registrant:\n (?P.+)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)", # .am 245 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P[^.,]+), (?P.+), (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 1 246 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 2 247 | "Domain Holder: (?P.+)\n(?P.+)\n(?:(?P.+)\n)?(?:(?P.+)\n)?.+?, (?P.+)\n(?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 3 248 | "Domain Holder: (?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+),? 
(?P[A-Z]{2,3})(?: [A-Z0-9]+)?\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 4 249 | " Registrant:\n (?P.+)\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.+), (?P[^,\n]*)\n (?P.+)\n", # .com.tw (Western registrars) 250 | "Registrant:\n(?P.+)\n(?P.+)\n(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?)(?:,+(?P.+?))?)?)?)?)?)?,(?P.+),(?P.+)\n\n Contact:\n (?P.+) (?P.+)\n TEL: (?P.+?)(?:(?:#|ext.?)(?P.+))?\n FAX: (?P.+)(?:(?:#|ext.?)(?P.+))?\n", # .com.tw (TWNIC/SEEDNET, Taiwanese companies only?) 251 | "Registrant Contact Information:\n\nCompany English Name \(It should be the same as the registered/corporation name on your Business Register Certificate or relevant documents\):(?P.+)\nCompany Chinese name:(?P.+)\nAddress: (?P.+)\nCountry: (?P.+)\nEmail: (?P.+)\n", # HKDNR (.hk) 252 | "Registrant ID:(?P.+)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Street1:(?P.+?)\n(?:Registrant Street2:(?P.+?)\n(?:Registrant Street3:(?P.+?)\n)?)?Registrant City:(?P.+)\nRegistrant State:(?P.*)\nRegistrant Postal Code:(?P.+)\nRegistrant Country:(?P[A-Z]+)\nRegistrant Phone:(?P.*?)\nRegistrant Fax:(?P.*)\nRegistrant Email:(?P.+)\n", # Realtime Register 253 | "owner:\s+(?P.+)", # .br 254 | "person:\s+(?P.+)", # nic.ru (person) 255 | "org:\s+(?P.+)", # nic.ru (organization) 256 | ] 257 | 258 | tech_contact_regexes = [ 259 | " Technical Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 260 | "Technical Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 261 | "(?:Tech ID:(?P.+)\n)?Tech Name:(?P.*)\n(:?Tech Organization:(?P.*)\n)?Tech Street1?:(?P.*)\n(?:Tech Street2:(?P.*)\n)?(?:Tech Street3:(?P.*)\n)?Tech City:(?P.*)\nTech State/Province:(?P.*)\nTech Postal Code:(?P.*)\nTech Country:(?P.*)\nTech Phone:(?P.*)\n(?:Tech Phone Ext.:(?P.*)\n)?(?:Tech FAX:(?P.*)\n)?(?:Tech FAX Ext.:(?P.*)\n)?Tech Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 262 | "Tech(?:nical)? ID:(?P.+)\nTech(?:nical)? Name:(?P.*)\n(?:Tech(?:nical)? Organization:(?P.*)\n)?Tech(?:nical)? Address1?:(?P.*)\n(?:Tech(?:nical)? Address2:(?P.*)\n)?(?:Tech(?:nical)? Address3:(?P.*)\n)?Tech(?:nical)? City:(?P.*)\nTech(?:nical)? State/Province:(?P.*)\nTech(?:nical)? Country/Economy:(?P.*)\nTech(?:nical)? Postal Code:(?P.*)\nTech(?:nical)? Phone:(?P.*)\n(?:Tech(?:nical)? Phone Ext.:(?P.*)\n)?(?:Tech(?:nical)? FAX:(?P.*)\n)?(?:Tech(?:nical)? FAX Ext.:(?P.*)\n)?Tech(?:nical)? E-mail:(?P.*)", # .ME, DotAsia 263 | "Technical Contact ID:\s*(?P.+)\nTechnical Contact Name:\s*(?P.+)\nTechnical Contact Organization:\s*(?P.*)\nTechnical Contact Address1:\s*(?P.+)\nTechnical Contact Address2:\s*(?P.*)\nTechnical Contact City:\s*(?P.+)\nTechnical Contact State/Province:\s*(?P.+)\nTechnical Contact Postal Code:\s*(?P.+)\nTechnical Contact Country:\s*(?P.+)\nTechnical Contact Country Code:\s*(?P.+)\nTechnical Contact Phone Number:\s*(?P.+)\nTechnical Contact Email:\s*(?P.+)\n", # .CO Internet 264 | "Tech Contact: (?P.+)\nTech Organization: (?P.+)\nTech Name: (?P.+)\nTech Street: (?P.+)\nTech City: (?P.+)\nTech Postal Code: (?P.+)\nTech State: (?P.+)\nTech Country: (?P.+)\nTech Phone: (?P.*)\nTech Phone Ext: (?P.*)\nTech Fax: (?P.*)\nTech Fax Ext: (?P.*)\nTech Email: (?P.*)\n", # Key-Systems GmbH 265 | "(?:Tech ID:[ ]*(?P.*)\n)?Tech[ ]*Name:[ ]*(?P.*)\n(?:Tech[ ]*Organization:[ ]*(?P.*)\n)?Tech[ ]*Street:[ ]*(?P.+)\n(?:Tech[ ]*Street:[ ]*(?P.+)\n)?(?:Tech[ ]*Street:[ ]*(?P.+)\n)?Tech[ ]*City:[ 
]*(?P.+)\nTech[ ]*State(?:\/Province)?:[ ]*(?P.*)\nTech[ ]*Postal[ ]*Code:[ ]*(?P.+)\nTech[ ]*Country:[ ]*(?P.+)\n(?:Tech[ ]*Phone:[ ]*(?P.*)\n)?(?:Tech[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Tech[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 266 | "Technical Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 267 | " Technical Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 268 | "tech-id:[ ]*(?P.*)\n(?:tech-organization:[ ]*(?P.*)\n)?tech-name:[ ]*(?P.*)\ntech-street:[ ]*(?P.*)\ntech-city:[ ]*(?P.*)\ntech-zip:[ ]*(?P.*)\ntech-country:[ ]*(?P.*)\n(?:tech-phone:[ ]*(?P.*)\n)?(?:tech-fax:[ ]*(?P.*)\n)?tech-email:[ ]*(?P.*)", # InterNetworX 269 | "Technical Contact:\n tech_org: (?P.*)\n tech_name: (?P.*)\n tech_email: (?P.*)\n tech_address: (?P
.*)\n tech_city: (?P.*)\n tech_state: (?P.*)\n tech_zip: (?P.*)\n tech_country: (?P.*)\n tech_phone: (?P.*)", # Bellnames 270 | "Technical contact:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\n\n", # nic.ch 271 | "Tech Contact ID:[ ]*(?P.+)\nTech Contact Name:[ ]*(?P.+)", # .au 272 | "Technical Contact ID:(?P.*)\nTechnical Contact Name:(?P.*)\n(?:Technical Contact Organization:(?P.*)\n)?Technical Contact Address1:(?P.*)\n(?:Technical Contact Address2:(?P.*)\n)?(?:Technical Contact Address3:(?P.*)\n)?Technical Contact City:(?P.*)\n(?:Technical Contact State/Province:(?P.*)\n)?(?:Technical Contact Postal Code:(?P.*)\n)?Technical Contact Country:(?P.*)\nTechnical Contact Country Code:.*\nTechnical Contact Phone Number:(?P.*)\n(?:Technical Contact Facsimile Number:(?P.*)\n)?Technical Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 273 | "Technical Contacts\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it // NOTE: Why does this say 'Contacts'? Can it have multiple? 274 | "Tech Name[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n(?: Tech Address[.]* (?P.*)\n)? Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Email[.]* (?P.*)\n Tech Phone[.]* (?P.*)\n Tech Fax[.]* (?P.*)", # Melbourne IT 275 | "Technical contact:\n(?: (?P.+)\n)? 
(?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 276 | "tech-c-name:\s*(?P.+)\ntech-c-type:\s*(?P.+)\ntech-c-address:\s*(?P.+)\ntech-c-postcode:\s*(?P.+)\ntech-c-city:\s*(?P.+)\ntech-c-country:\s*(?P.+)\n(?:tech-c-phone:\s*(?P.+)\n)?(?:tech-c-email:\s*(?P.+)\n)?", # Hetzner 277 | "Admin Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 278 | " Technical contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am 279 | "Technical:\n\s*Name:\s*(?P.*)\n\s*Organisation:\s*(?P.*)\n\s*Language:.*\n\s*Phone:\s*(?P.*)\n\s*Fax:\s*(?P.*)\n\s*Email:\s*(?P.*)\n", # EURid 280 | "\[Zone-C\]\nType: (?P.+)\nName: (?P.+)\n(Organisation: (?P.+)\n){0,1}(Address: (?P.+)\n){1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}PostalCode: (?P.+)\nCity: (?P.+)\nCountryCode: (?P[A-Za-z]{2})\nPhone: (?P.+)\nFax: (?P.+)\nEmail: (?P.+)\n(Remarks: (?P.+)\n){0,1}Changed: (?P.+)", # DeNIC 281 | "Technical Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 282 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+),? 
(?P[A-Z]{2,3})(?: [A-Z0-9]+)?\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 1 283 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?\n(?P.+), (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 2 284 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?)(?:,+ (?P.+?))?)?)?)?)?)?, (?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 3 285 | "Tech Contact: (?P.+)\n(?P.+) (?P[^\s]+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 4 286 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+)\n(?P.+) (?P[^\s]+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 5 287 | "Tech Contact: (?P.+)\n(?P.+)\n(?P.+)\n(?P.+)\n(?:(?P.+)\n)?(?P.+)\n(?P.+)\n(?P[A-Z]+)\n", # .co.th, format 6 288 | " Technical Contact:\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n", # .com.tw (Western registrars) 289 | "Technical Contact Information:\n\n(?:Given name: (?P.+)\n)?(?:Family name: (?P.+)\n)?(?:Company name: (?P.+)\n)?Address: (?P.+)\nCountry: (?P.+)\nPhone: (?P.*)\nFax: (?P.*)\nEmail: (?P.+)\n(?:Account Name: (?P.+)\n)?", # HKDNR (.hk) 290 | "TECH ID:(?P.+)\nTECH Name:(?P.*)\n(?:TECH Organization:(?P.*)\n)?TECH Street1:(?P.+?)\n(?:TECH Street2:(?P.+?)\n(?:TECH Street3:(?P.+?)\n)?)?TECH City:(?P.+)\nTECH State:(?P.*)\nTECH Postal Code:(?P.+)\nTECH Country:(?P[A-Z]+)\nTECH Phone:(?P.*?)\nTECH Fax:(?P.*)\nTECH Email:(?P.+)\n", # Realtime Register 291 | ] 292 | 293 | admin_contact_regexes = [ 294 | " Administrative Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. 295 | "Administrative Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? 
(?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 296 | "(?:Admin ID:(?P.+)\n)?Admin Name:(?P.*)\n(?:Admin Organization:(?P.*)\n)?Admin Street1?:(?P.*)\n(?:Admin Street2:(?P.*)\n)?(?:Admin Street3:(?P.*)\n)?Admin City:(?P.*)\nAdmin State/Province:(?P.*)\nAdmin Postal Code:(?P.*)\nAdmin Country:(?P.*)\nAdmin Phone:(?P.*)\n(?:Admin Phone Ext.:(?P.*)\n)?(?:Admin FAX:(?P.*)\n)?(?:Admin FAX Ext.:(?P.*)\n)?Admin Email:(?P.*)", # Public Interest Registry (.org), nic.pw, No-IP.com 297 | "Admin(?:istrative)? ID:(?P.+)\nAdmin(?:istrative)? Name:(?P.*)\n(?:Admin(?:istrative)? Organization:(?P.*)\n)?Admin(?:istrative)? Address1?:(?P.*)\n(?:Admin(?:istrative)? Address2:(?P.*)\n)?(?:Admin(?:istrative)? Address3:(?P.*)\n)?Admin(?:istrative)? City:(?P.*)\nAdmin(?:istrative)? State/Province:(?P.*)\nAdmin(?:istrative)? Country/Economy:(?P.*)\nAdmin(?:istrative)? Postal Code:(?P.*)\nAdmin(?:istrative)? Phone:(?P.*)\n(?:Admin(?:istrative)? Phone Ext.:(?P.*)\n)?(?:Admin(?:istrative)? FAX:(?P.*)\n)?(?:Admin(?:istrative)? FAX Ext.:(?P.*)\n)?Admin(?:istrative)? 
E-mail:(?P.*)", # .ME, DotAsia 298 | "Administrative Contact ID:\s*(?P.+)\nAdministrative Contact Name:\s*(?P.+)\nAdministrative Contact Organization:\s*(?P.*)\nAdministrative Contact Address1:\s*(?P.+)\nAdministrative Contact Address2:\s*(?P.*)\nAdministrative Contact City:\s*(?P.+)\nAdministrative Contact State/Province:\s*(?P.+)\nAdministrative Contact Postal Code:\s*(?P.+)\nAdministrative Contact Country:\s*(?P.+)\nAdministrative Contact Country Code:\s*(?P.+)\nAdministrative Contact Phone Number:\s*(?P.+)\nAdministrative Contact Email:\s*(?P.+)\n", # .CO Internet 299 | "Admin Contact: (?P.+)\nAdmin Organization: (?P.+)\nAdmin Name: (?P.+)\nAdmin Street: (?P.+)\nAdmin City: (?P.+)\nAdmin State: (?P.+)\nAdmin Postal Code: (?P.+)\nAdmin Country: (?P.+)\nAdmin Phone: (?P.*)\nAdmin Phone Ext: (?P.*)\nAdmin Fax: (?P.*)\nAdmin Fax Ext: (?P.*)\nAdmin Email: (?P.*)\n", # Key-Systems GmbH 300 | "(?:Admin ID:[ ]*(?P.*)\n)?Admin[ ]*Name:[ ]*(?P.*)\n(?:Admin[ ]*Organization:[ ]*(?P.*)\n)?Admin[ ]*Street:[ ]*(?P.+)\n(?:Admin[ ]*Street:[ ]*(?P.+)\n)?(?:Admin[ ]*Street:[ ]*(?P.+)\n)?Admin[ ]*City:[ ]*(?P.+)\nAdmin[ ]*State(?:\/Province)?:[ ]*(?P.*)\nAdmin[ ]*Postal[ ]*Code:[ ]*(?P.+)\nAdmin[ ]*Country:[ ]*(?P.+)\n(?:Admin[ ]*Phone:[ ]*(?P.*)\n)?(?:Admin[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Admin[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum), EuroDNS, nic.ps 301 | "Administrative Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # internet.bs 302 | " Administrative Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. 
(?P.*)", # Whois.com 303 | "admin-id:[ ]*(?P.*)\n(?:admin-organization:[ ]*(?P.*)\n)?admin-name:[ ]*(?P.*)\nadmin-street:[ ]*(?P.*)\nadmin-city:[ ]*(?P.*)\nadmin-zip:[ ]*(?P.*)\nadmin-country:[ ]*(?P.*)\n(?:admin-phone:[ ]*(?P.*)\n)?(?:admin-fax:[ ]*(?P.*)\n)?admin-email:[ ]*(?P.*)", # InterNetworX 304 | "Administrative Contact:\n admin_org: (?P.*)\n admin_name: (?P.*)\n admin_email: (?P.*)\n admin_address: (?P
.*)\n admin_city: (?P.*)\n admin_state: (?P.*)\n admin_zip: (?P.*)\n admin_country: (?P.*)\n admin_phone: (?P.*)", # Bellnames 305 | "Administrative Contact ID:(?P.*)\nAdministrative Contact Name:(?P.*)\n(?:Administrative Contact Organization:(?P.*)\n)?Administrative Contact Address1:(?P.*)\n(?:Administrative Contact Address2:(?P.*)\n)?(?:Administrative Contact Address3:(?P.*)\n)?Administrative Contact City:(?P.*)\n(?:Administrative Contact State/Province:(?P.*)\n)?(?:Administrative Contact Postal Code:(?P.*)\n)?Administrative Contact Country:(?P.*)\nAdministrative Contact Country Code:.*\nAdministrative Contact Phone Number:(?P.*)\n(?:Administrative Contact Facsimile Number:(?P.*)\n)?Administrative Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 306 | "Admin Contact\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it 307 | "Admin Name[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n(?: Admin Address[.]* (?P.*)\n)? Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Email[.]* (?P.*)\n Admin Phone[.]* (?P.*)\n Admin Fax[.]* (?P.*)", # Melbourne IT 308 | "Administrative contact:\n(?: (?P.+)\n)? 
(?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 309 | "admin-c-name:\s*(?P.+)\nadmin-c-type:\s*(?P.+)\nadmin-c-address:\s*(?P.+)\nadmin-c-postcode:\s*(?P.+)\nadmin-c-city:\s*(?P.+)\nadmin-c-country:\s*(?P.+)\n(?:admin-c-phone:\s*(?P.+)\n)?(?:admin-c-email:\s*(?P.+)\n)?", # Hetzner 310 | "Tech Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 311 | " Administrative contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am 312 | "Administrative Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 313 | "\[Tech-C\]\nType: (?P.+)\nName: (?P.+)\n(Organisation: (?P.+)\n){0,1}(Address: (?P.+)\n){1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}(Address: (?P.+)\n){0,1}PostalCode: (?P.+)\nCity: (?P.+)\nCountryCode: (?P[A-Za-z]{2})\nPhone: (?P.+)\nFax: (?P.+)\nEmail: (?P.+)\n(Remarks: (?P.+)\n){0,1}Changed: (?P.+)", # DeNIC 314 | " Administrative Contact:\n (?P.+) (?P.+)\n (?P.*)\n (?P.*)\n", # .com.tw (Western registrars) 315 | "Administrative Contact Information:\n\n(?:Given name: (?P.+)\n)?(?:Family name: (?P.+)\n)?(?:Company name: (?P.+)\n)?Address: (?P.+)\nCountry: (?P.+)\nPhone: (?P.*)\nFax: (?P.*)\nEmail: (?P.+)\n(?:Account Name: (?P.+)\n)?", # HKDNR (.hk) 316 | "ADMIN ID:(?P.+)\nADMIN Name:(?P.*)\n(?:ADMIN Organization:(?P.*)\n)?ADMIN Street1:(?P.+?)\n(?:ADMIN Street2:(?P.+?)\n(?:ADMIN Street3:(?P.+?)\n)?)?ADMIN City:(?P.+)\nADMIN State:(?P.*)\nADMIN Postal Code:(?P.+)\nADMIN Country:(?P[A-Z]+)\nADMIN Phone:(?P.*?)\nADMIN Fax:(?P.*)\nADMIN Email:(?P.+)\n", # Realtime Register 317 | ] 318 | 319 | billing_contact_regexes = [ 320 | "(?:Billing ID:(?P.+)\n)?Billing Name:(?P.*)\nBilling Organization:(?P.*)\nBilling Street1:(?P.*)\n(?:Billing Street2:(?P.*)\n)?(?:Billing Street3:(?P.*)\n)?Billing City:(?P.*)\nBilling 
State/Province:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Country:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing Email:(?P.*)", # nic.pw, No-IP.com 321 | "Billing ID:(?P.+)\nBilling Name:(?P.*)\n(?:Billing Organization:(?P.*)\n)?Billing Address1?:(?P.*)\n(?:Billing Address2:(?P.*)\n)?(?:Billing Address3:(?P.*)\n)?Billing City:(?P.*)\nBilling State/Province:(?P.*)\nBilling Country/Economy:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing E-mail:(?P.*)", # DotAsia 322 | "Billing Contact ID:\s*(?P.+)\nBilling Contact Name:\s*(?P.+)\nBilling Contact Organization:\s*(?P.*)\nBilling Contact Address1:\s*(?P.+)\nBilling Contact Address2:\s*(?P.*)\nBilling Contact City:\s*(?P.+)\nBilling Contact State/Province:\s*(?P.+)\nBilling Contact Postal Code:\s*(?P.+)\nBilling Contact Country:\s*(?P.+)\nBilling Contact Country Code:\s*(?P.+)\nBilling Contact Phone Number:\s*(?P.+)\nBilling Contact Email:\s*(?P.+)\n", # .CO Internet 323 | "Billing Contact: (?P.+)\nBilling Organization: (?P.+)\nBilling Name: (?P.+)\nBilling Street: (?P.+)\nBilling City: (?P.+)\nBilling Postal Code: (?P.+)\nBilling State: (?P.+)\nBilling Country: (?P.+)\nBilling Phone: (?P.*)\nBilling Phone Ext: (?P.*)\nBilling Fax: (?P.*)\nBilling Fax Ext: (?P.*)\nBilling Email: (?P.*)\n", # Key-Systems GmbH 324 | "(?:Billing ID:[ ]*(?P.*)\n)?Billing[ ]*Name:[ ]*(?P.*)\n(?:Billing[ ]*Organization:[ ]*(?P.*)\n)?Billing[ ]*Street:[ ]*(?P.+)\n(?:Billing[ ]*Street:[ ]*(?P.+)\n)?Billing[ ]*City:[ ]*(?P.+)\nBilling[ ]*State\/Province:[ ]*(?P.+)\nBilling[ ]*Postal[ ]*Code:[ ]*(?P.+)\nBilling[ ]*Country:[ ]*(?P.+)\n(?:Billing[ ]*Phone:[ ]*(?P.*)\n)?(?:Billing[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Billing[ ]*Email:[ ]*(?P.+)\n)?", # Musedoma (.museum) 325 | "Billing Contact:\n 
(?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH 326 | " Billing Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # Whois.com 327 | "billing-id:[ ]*(?P.*)\n(?:billing-organization:[ ]*(?P.*)\n)?billing-name:[ ]*(?P.*)\nbilling-street:[ ]*(?P.*)\nbilling-city:[ ]*(?P.*)\nbilling-zip:[ ]*(?P.*)\nbilling-country:[ ]*(?P.*)\n(?:billing-phone:[ ]*(?P.*)\n)?(?:billing-fax:[ ]*(?P.*)\n)?billing-email:[ ]*(?P.*)", # InterNetworX 328 | "Billing Contact:\n bill_org: (?P.*)\n bill_name: (?P.*)\n bill_email: (?P.*)\n bill_address: (?P
.*)\n bill_city: (?P.*)\n bill_state: (?P.*)\n bill_zip: (?P.*)\n bill_country: (?P.*)\n bill_phone: (?P.*)", # Bellnames 329 | "Billing Contact ID:(?P.*)\nBilling Contact Name:(?P.*)\n(?:Billing Contact Organization:(?P.*)\n)?Billing Contact Address1:(?P.*)\n(?:Billing Contact Address2:(?P.*)\n)?(?:Billing Contact Address3:(?P.*)\n)?Billing Contact City:(?P.*)\n(?:Billing Contact State/Province:(?P.*)\n)?(?:Billing Contact Postal Code:(?P.*)\n)?Billing Contact Country:(?P.*)\nBilling Contact Country Code:.*\nBilling Contact Phone Number:(?P.*)\n(?:Billing Contact Facsimile Number:(?P.*)\n)?Billing Contact Email:(?P.*)", # .US, .biz (NeuStar), .buzz, .moe (Interlink Co. Ltd.) 330 | "Billing contact:\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com 331 | "Billing Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication 332 | "Billing Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) 333 | "BILLING ID:(?P.+)\nBILLING Name:(?P.*)\n(?:BILLING Organization:(?P.*)\n)?BILLING Street1:(?P.+?)\n(?:BILLING Street2:(?P.+?)\n(?:BILLING Street3:(?P.+?)\n)?)?BILLING City:(?P.+)\nBILLING State:(?P.*)\nBILLING Postal Code:(?P.+)\nBILLING Country:(?P[A-Z]+)\nBILLING Phone:(?P.*?)\nBILLING Fax:(?P.*)\nBILLING Email:(?P.+)\n", # Realtime Register 334 | ] 335 | 336 | # Some registries use NIC handle references instead of directly listing contacts... 
337 | nic_contact_references = { 338 | "registrant": [ 339 | "registrant:\s*(?P.+)", # nic.at 340 | "owner-contact:\s*(?P.+)", # LCN.com 341 | "holder-c:\s*(?P.+)", # AFNIC 342 | "holder:\s*(?P.+)", # iis.se (they apparently want to be difficult, and won't give you contact info for the handle over their WHOIS service) 343 | ], 344 | "tech": [ 345 | "tech-c:\s*(?P.+)", # nic.at, AFNIC, iis.se 346 | "technical-contact:\s*(?P.+)", # LCN.com 347 | "n\. \[Technical Contact\] (?P.+)\n", #.co.jp 348 | ], 349 | "admin": [ 350 | "admin-c:\s*(?P.+)", # nic.at, AFNIC, iis.se 351 | "admin-contact:\s*(?P.+)", # LCN.com 352 | "m\. \[Administrative Contact\] (?P.+)\n", # .co.jp 353 | ], 354 | "billing": [ 355 | "billing-c:\s*(?P.+)", # iis.se 356 | "billing-contact:\s*(?P.+)", # LCN.com 357 | ] 358 | } 359 | 360 | # Why do the below? The below is meant to handle with an edge case (issue #2) where a partial match followed 361 | # by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose hunt for 362 | # matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the regexes 363 | # consistent and compact, it's more practical to do this (predictable) conversion on runtime. 364 | # FIXME: This breaks on NIC contact regex for nic.at. Why? 
# Apply the runtime "\s*.+" -> "\S.*" conversion described in the comment block
# above to every contact regex list, so partial matches cannot trigger
# pathological backtracking (issue #2).
registrant_regexes = [preprocess_regex(regex) for regex in registrant_regexes]
tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes]
admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes]
billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes]

# Patterns that parse a full NIC handle contact record, for registries that
# publish handle references instead of inline contact data. Consumed by
# parse_nic_contact() further down in this file.
# NOTE(review): the first two entries contain bare "(?P.+)" fragments - the
# "<name>" portion of the named groups appears to have been stripped when this
# file was copied (later entries keep names such as (?P<phone>...) intact).
# As written those two patterns are invalid regexes; restore the group names
# from upstream pythonwhois before relying on them.
nic_contact_regexes = [
    "personname:\s*(?P.+)\norganization:\s*(?P.+)\nstreet address:\s*(?P.+)\npostal code:\s*(?P.+)\ncity:\s*(?P.+)\ncountry:\s*(?P.+)\n(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:e-mail:\s*(?P.+)\n)?nic-hdl:\s*(?P.+)\nchanged:\s*(?P.+)", # nic.at
    "contact-handle:[ ]*(?P.+)\ncontact:[ ]*(?P.+)\n(?:organisation:[ ]*(?P.+)\n)?address:[ ]*(?P.+)\n(?:address:[ ]*(?P.+)\n)?(?:address:[ ]*(?P.+)\n)?(?:address:[ ]*(?P.+)\n)?address:[ ]*(?P.+)\naddress:[ ]*(?P.+)\naddress:[ ]*(?P.+)\naddress:[ ]*(?P.+)\n(?:phone:[ ]*(?P.+)\n)?(?:fax:[ ]*(?P.+)\n)?(?:email:[ ]*(?P.+)\n)?", # LCN.com
    "Contact Information:\na\. \[JPNIC Handle\] (?P.+)\nc\. \[Last, First\] (?P.+), (?P.+)\nd\. \[E-Mail\] (?P.+)\ng\. \[Organization\] (?P.+)\nl\. \[Division\] (?P.+)\nn\. \[Title\] (?P.+)\no\. \[TEL\] (?P<phone>.+)\np\. \[FAX\] (?P<fax>.+)\ny\. \[Reply Mail\] .*\n\[Last Update\] (?P<changedate>.+) \(JST\)\n", # JPRS .co.jp contact handle lookup
    "person:\s*(?P<name>.+)\nnic-hdl:\s*(?P<handle>.+)\n", # .ie
    "nic-hdl:\s+(?P<handle>.+)\nperson:\s+(?P<name>.+)\n(?:e-mail:\s+(?P<email>.+)\n)?(?:address:\s+(?P<street1>.+?)(?:,+ (?P<street2>.+?)(?:,+ (?P<street3>.+?)(?:,+ (?P<street4>.+?)(?:,+ (?P<street5>.+?)(?:,+ (?P<street6>.+?)(?:,+ (?P<street7>.+?))?)?)?)?)?)?, (?P<city>.+), (?P<state>.+), (?P<country>.+)\n)?(?:phone:\s+(?P<phone>.+)\n)?(?:fax-no:\s+(?P<fax>.+)\n)?", # nic.ir, individual - this is a nasty one.
    "nic-hdl:\s+(?P<handle>.+)\norg:\s+(?P<organization>.+)\n(?:e-mail:\s+(?P<email>.+)\n)?(?:address:\s+(?P<street1>.+?)(?:,+ (?P<street2>.+?)(?:,+ (?P<street3>.+?)(?:,+ (?P<street4>.+?)(?:,+ (?P<street5>.+?)(?:,+ (?P<street6>.+?)(?:,+ (?P<street7>.+?))?)?)?)?)?)?, (?P<city>.+), (?P<state>.+), (?P<country>.+)\n)?(?:phone:\s+(?P<phone>.+)\n)?(?:fax-no:\s+(?P<fax>.+)\n)?", # nic.ir, organization
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\naddress:\s*(?P<street2>.+)\naddress:\s*(?P<street3>.+)\naddress:\s*(?P<country>.+)\n)?(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness without country field
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\n)?(?:address:\s*(?P<street2>.+)\n)?(?:address:\s*(?P<street3>.+)\n)?(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness any country -at all-
    "nic-hdl:\s*(?P<handle>.+)\ntype:\s*(?P<type>.+)\ncontact:\s*(?P<name>.+)\n(?:.+\n)*?(?:address:\s*(?P<street1>.+)\n)?(?:address:\s*(?P<street2>.+)\n)?(?:address:\s*(?P<street3>.+)\n)?(?:address:\s*(?P<street4>.+)\n)?country:\s*(?P<country>.+)\n(?:phone:\s*(?P<phone>.+)\n)?(?:fax-no:\s*(?P<fax>.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P<email>.+)\n)?(?:.+\n)*?changed:\s*(?P<changedate>[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness with country field
]

# Company-suffix patterns (Ltd, Inc, GmbH, ...). normalize_data() uses these to
# detect lines that are organization names rather than person names.
organization_regexes = (
    r"\sltd\.?($|\s)",
    r"\sco\.?($|\s)",
    r"\scorp\.?($|\s)",
    r"\sinc\.?($|\s)",
    r"\ss\.?p\.?a\.?($|\s)",
    r"\ss\.?(c\.?)?r\.?l\.?($|\s)",
    r"\ss\.?a\.?s\.?($|\s)",
    r"\sa\.?g\.?($|\s)",
    r"\sn\.?v\.?($|\s)",
    r"\sb\.?v\.?($|\s)",
    r"\sp\.?t\.?y\.?($|\s)",
    r"\sp\.?l\.?c\.?($|\s)",
    r"\sv\.?o\.?f\.?($|\s)",
    r"\sb\.?v\.?b\.?a\.?($|\s)",
    r"\sg\.?m\.?b\.?h\.?($|\s)",
    r"\ss\.?a\.?r\.?l\.?($|\s)",
)

# Compile everything once at import time; the simple grammar fields and the
# organization suffixes are matched case-insensitively, the contact record
# regexes are case-sensitive.
grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE)
grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)

grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE)

registrant_regexes = precompile_regexes(registrant_regexes)
tech_contact_regexes = precompile_regexes(tech_contact_regexes)
billing_contact_regexes = precompile_regexes(billing_contact_regexes)
admin_contact_regexes = precompile_regexes(admin_contact_regexes)
nic_contact_regexes = precompile_regexes(nic_contact_regexes)
organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)

nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"])
nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])

if sys.version_info < (3, 0):
    def is_string(data):
        """Test for string with support for python 2."""
        return isinstance(data, basestring)
else:
    def is_string(data):
        """Test for string with support for python 3."""
        return isinstance(data, str)


def parse_raw_whois(raw_data, normalized=None, never_query_handles=True, handle_server=""):
    """Parse raw WHOIS response segments into a structured dict.

    raw_data            -- list of raw WHOIS response strings (one per queried server)
    normalized          -- list of keys to normalize, or True for all (see normalize_data)
    never_query_handles -- if False, unresolved NIC handles may be looked up over the network
    handle_server       -- WHOIS server to use for such handle lookups

    Returns a dict with any of: id, status, creation_date, expiration_date,
    updated_date, registrar, whois_server, nameservers, emails, plus a
    "contacts" dict (registrant/tech/admin/billing) and the "raw" input.
    """
    normalized = normalized or []
    data = {}

    raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil

    for segment in raw_data:
        # Generic line-by-line extraction: first segment that yields a value
        # for a key wins; later segments can't add to an already-filled key.
        for rule_key, rule_regexes in grammar['_data'].items():
            if rule_key not in data:
                for line in segment.splitlines():
                    for regex in rule_regexes:
                        result = re.search(regex, line)

                        if result is not None:
                            val = result.group("val").strip()
                            if val != "":
                                try:
                                    data[rule_key].append(val)
                                except KeyError as e:
                                    data[rule_key] = [val]

        # Registry-specific special cases follow. NOTE(review): several literal
        # patterns below depend on exact run-lengths of spaces in registry
        # output; verify the space counts against upstream pythonwhois, as
        # whitespace runs may have been collapsed when this file was copied.
        # Whois.com is a bit special... Fabulous.com also seems to use this format. As do some others.
        match = re.search("^\s?Name\s?[Ss]ervers:?\s*\n((?:\s*.+\n)+?\s?)\n", segment, re.MULTILINE)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("[ ]*(.+)\n", chunk):
                if match.strip() != "":
                    if not re.match("^[a-zA-Z]+:", match):
                        try:
                            data["nameservers"].append(match.strip())
                        except KeyError as e:
                            data["nameservers"] = [match.strip()]
        # Nominet also needs some special attention
        match = re.search(" Registrar:\n (.+)\n", segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        match = re.search(" Registration status:\n (.+)\n", segment)
        if match is not None:
            data["status"] = [match.group(1).strip()]
        match = re.search(" Name servers:\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]  # Drop trailing glue-record IPs
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # janet (.ac.uk) is kinda like Nominet, but also kinda not
        match = re.search("Registered By:\n\t(.+)\n", segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        match = re.search("Entry created:\n\t(.+)\n", segment)
        if match is not None:
            data["creation_date"] = [match.group(1).strip()]
        match = re.search("Renewal date:\n\t(.+)\n", segment)
        if match is not None:
            data["expiration_date"] = [match.group(1).strip()]
        match = re.search("Entry updated:\n\t(.+)\n", segment)
        if match is not None:
            data["updated_date"] = [match.group(1).strip()]
        match = re.search("Servers:([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("\t(.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # .am plays the same game
        match = re.search(" DNS servers:([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # SIDN isn't very standard either. And EURid uses a similar format.
        match = re.search("Registrar:\n\s+(?:Name:\s*)?(\S.*)", segment)
        if match is not None:
            data.setdefault("registrar", []).insert(0, match.group(1).strip())
        match = re.search("(?:Domain nameservers|Name servers):([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("\s+?(.+)\n", chunk):
                match = match.split()[0]
                # Prevent nameserver aliases from being picked up.
                if not match.startswith("[") and not match.endswith("]"):
                    try:
                        data["nameservers"].append(match.strip())
                    except KeyError as e:
                        data["nameservers"] = [match.strip()]
        # The .ie WHOIS server puts ambiguous status information in an unhelpful order
        match = re.search('ren-status:\s*(.+)', segment)
        if match is not None:
            # setdefault guards against a KeyError when no status was parsed
            # yet (same pattern as the SIDN registrar branch above).
            data.setdefault("status", []).insert(0, match.group(1).strip())
        # nic.it gives us the registrar in a multi-line format...
        match = re.search('Registrar\n Organization: (.+)\n', segment)
        if match is not None:
            data["registrar"] = [match.group(1).strip()]
        # HKDNR (.hk) provides a weird nameserver format with too much whitespace
        match = re.search("Name Servers Information:\n\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall("(.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]
        # ... and again for TWNIC.
        match = re.search(" Domain servers in listed order:\n([\s\S]*?\n)\n", segment)
        if match is not None:
            chunk = match.group(1)
            for match in re.findall(" (.+)\n", chunk):
                match = match.split()[0]
                try:
                    data["nameservers"].append(match.strip())
                except KeyError as e:
                    data["nameservers"] = [match.strip()]


    data["contacts"] = parse_registrants(raw_data, never_query_handles, handle_server)

    # Parse dates
    try:
        data['expiration_date'] = remove_duplicates(data['expiration_date'])
        data['expiration_date'] = parse_dates(data['expiration_date'])
    except KeyError as e:
        pass # Not present
    try:
        data['creation_date'] = remove_duplicates(data['creation_date'])
        data['creation_date'] = parse_dates(data['creation_date'])
    except KeyError as e:
        pass # Not present

    try:
        data['updated_date'] = remove_duplicates(data['updated_date'])
        data['updated_date'] = parse_dates(data['updated_date'])
    except KeyError as e:
        pass # Not present

    try:
        data['nameservers'] = remove_suffixes(data['nameservers'])
        data['nameservers'] = remove_duplicates([ns.rstrip(".") for ns in data['nameservers']])
    except KeyError as e:
        pass # Not present

    try:
        data['emails'] = remove_duplicates(data['emails'])
    except KeyError as e:
        pass # Not present

    try:
        data['registrar'] = remove_duplicates(data['registrar'])
    except KeyError as e:
        pass # Not present

    # Remove e-mail addresses if they are already listed for any of the contacts
    known_emails = []
    for contact in ("registrant", "tech", "admin", "billing"):
        if data["contacts"][contact] is not None:
            try:
                known_emails.append(data["contacts"][contact]["email"])
            except KeyError as e:
                pass # No e-mail recorded for this contact...
    try:
        data['emails'] = [email for email in data["emails"] if email not in known_emails]
    except KeyError as e:
        pass # Not present

    # Drop empty keys so callers can rely on "key present => data present".
    for key in list(data.keys()):
        if data[key] is None or len(data[key]) == 0:
            del data[key]

    data["raw"] = raw_data

    if normalized != []:
        data = normalize_data(data, normalized)

    return data

def normalize_data(data, normalized):
    """Normalize casing/formatting of selected fields in-place and return data.

    normalized is either True (normalize everything) or a list of key names to
    normalize. Also maps country/airport-city codes and US/AU/CA state
    abbreviations to full names, and splits organization names out of
    name/street fields.
    """
    for key in ("nameservers", "emails", "whois_server"):
        if key in data and data[key] is not None and (normalized == True or key in normalized):
            if is_string(data[key]):
                data[key] = data[key].lower()
            else:
                data[key] = [item.lower() for item in data[key]]

    for key, threshold in (("registrar", 4), ("status", 3)):
        if key == "registrar":
            ignore_nic = True  # Registrar names containing 'NIC' stay uppercase
        else:
            ignore_nic = False
        if key in data and data[key] is not None and (normalized == True or key in normalized):
            if is_string(data[key]):
                data[key] = normalize_name(data[key], abbreviation_threshold=threshold, length_threshold=1, ignore_nic=ignore_nic)
            else:
                data[key] = [normalize_name(item, abbreviation_threshold=threshold, length_threshold=1, ignore_nic=ignore_nic) for item in data[key]]

    for contact_type, contact in data['contacts'].items():
        if contact is not None:
            # Expand country codes / airport city codes / state abbreviations.
            if 'country' in contact and contact['country'] in countries:
                contact['country'] = countries[contact['country']]
            if 'city' in contact and contact['city'] in airports:
                contact['city'] = airports[contact['city']]
            if 'country' in contact and 'state' in contact:
                for country, source in (("united states", states_us), ("australia", states_au), ("canada", states_ca)):
                    if country in contact["country"].lower() and contact["state"] in source:
                        contact["state"] = source[contact["state"]]

            for key in ("email",):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    if is_string(contact[key]):
                        contact[key] = contact[key].lower()
                    else:
                        contact[key] = [item.lower() for item in contact[key]]

            for key in ("name", "street"):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    contact[key] = normalize_name(contact[key], abbreviation_threshold=3)

            for key in ("city", "organization", "state", "country"):
                if key in contact and contact[key] is not None and (normalized == True or key in normalized):
                    contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3)

            # Move lines that look like company names (per organization_regexes)
            # from the "name" field into "organization".
            if "name" in contact and "organization" not in contact:
                lines = [x.strip() for x in contact["name"].splitlines()]
                new_lines = []
                for i, line in enumerate(lines):
                    for regex in organization_regexes:
                        if re.search(regex, line):
                            new_lines.append(line)
                            del lines[i]
                            break
                if len(lines) > 0:
                    contact["name"] = "\n".join(lines)
                else:
                    del contact["name"]

                if len(new_lines) > 0:
                    contact["organization"] = "\n".join(new_lines)

            # Same idea for a company name that ended up as the first street line.
            if "street" in contact and "organization" not in contact:
                lines = [x.strip() for x in contact["street"].splitlines()]
                if len(lines) > 1:
                    for regex in organization_regexes:
                        if re.search(regex, lines[0]):
                            contact["organization"] = lines[0]
                            contact["street"] = "\n".join(lines[1:])
                            break

            # Strip filler punctuation and drop placeholder values.
            for key in list(contact.keys()):
                try:
                    contact[key] = contact[key].strip(", ")
                    if contact[key] == "-" or contact[key].lower() == "n/a":
                        del contact[key]
                except AttributeError as e:
                    pass # Not a string
    return data

def normalize_name(value, abbreviation_threshold=4, length_threshold=8, lowercase_domains=True, ignore_nic=False):
    """Fix casing of an all-upper/all-lowercase multi-line name.

    Words shorter than abbreviation_threshold (or containing a dot) are treated
    as abbreviations/domains and left alone; lines shorter than
    length_threshold are left untouched entirely.
    """
    normalized_lines = []
    for line in value.split("\n"):
        line = line.strip(",") # Get rid of useless comma's
        if (line.isupper() or line.islower()) and len(line) >= length_threshold:
            # This line is likely not capitalized properly
            if ignore_nic == True and "nic" in line.lower():
                # This is a registrar name containing 'NIC' - it should probably be all-uppercase.
                line = line.upper()
            else:
                words = line.split()
                normalized_words = []
                if len(words) >= 1:
                    # First word
                    if len(words[0]) >= abbreviation_threshold and "." not in words[0]:
                        normalized_words.append(words[0].capitalize())
                    elif lowercase_domains and "." in words[0] and not words[0].endswith(".") and not words[0].startswith("."):
                        normalized_words.append(words[0].lower())
                    else:
                        # Probably an abbreviation or domain, leave it alone
                        normalized_words.append(words[0])
                if len(words) >= 3:
                    # Words between the first and last
                    for word in words[1:-1]:
                        if len(word) >= abbreviation_threshold and "." not in word:
                            normalized_words.append(word.capitalize())
                        elif lowercase_domains and "." in word and not word.endswith(".") and not word.startswith("."):
                            normalized_words.append(word.lower())
                        else:
                            # Probably an abbreviation or domain, leave it alone
                            normalized_words.append(word)
                if len(words) >= 2:
                    # Last word
                    if len(words[-1]) >= abbreviation_threshold and "." not in words[-1]:
                        normalized_words.append(words[-1].capitalize())
                    elif lowercase_domains and "." in words[-1] and not words[-1].endswith(".") and not words[-1].startswith("."):
                        normalized_words.append(words[-1].lower())
                    else:
                        # Probably an abbreviation or domain, leave it alone
                        normalized_words.append(words[-1])
                line = " ".join(normalized_words)
        normalized_lines.append(line)
    return "\n".join(normalized_lines)

def parse_dates(dates):
    """Parse a list of date strings against grammar['_dateformats'].

    Returns a list of datetime.datetime objects, or None if nothing parsed.
    Two-digit years are expanded (<60 -> 20xx, else 19xx); month names are
    resolved through grammar['_months'].
    """
    global grammar
    parsed_dates = []
    for date in dates:
        for rule in grammar['_dateformats']:
            result = re.match(rule, date)
            if result is not None:
                try:
                    # These are always numeric. If they fail, there is no valid date present.
                    year = int(result.group("year"))
                    day = int(result.group("day"))
                    # Detect and correct shorthand year notation
                    if year < 60:
                        year += 2000
                    elif year < 100:
                        year += 1900

                    # This will require some more guesswork - some WHOIS servers present the name of the month
                    try:
                        month = int(result.group("month"))
                    except ValueError as e:
                        # Apparently not a number. Look up the corresponding number.
                        try:
                            month = grammar['_months'][result.group("month").lower()]
                        except KeyError as e:
                            # Unknown month name, default to 0
                            month = 0

                    # Time-of-day groups are optional in many formats; missing
                    # (IndexError) or unmatched (TypeError: None) groups -> 0.
                    try:
                        hour = int(result.group("hour"))
                    except IndexError as e:
                        hour = 0
                    except TypeError as e:
                        hour = 0

                    try:
                        minute = int(result.group("minute"))
                    except IndexError as e:
                        minute = 0
                    except TypeError as e:
                        minute = 0

                    try:
                        second = int(result.group("second"))
                    except IndexError as e:
                        second = 0
                    except TypeError as e:
                        second = 0

                    break
                except ValueError as e:
                    # Something went horribly wrong, maybe there is no valid date present?
                    year = 0
                    month = 0
                    day = 0
                    hour = 0
                    minute = 0
                    second = 0
                    # Fixed: exceptions have no .message attribute on Python 3
                    # (it would raise AttributeError here); print the exception
                    # itself instead. FIXME: This should have proper logging of some sort...?
                    print(e)
        try:
            if year > 0:
                try:
                    parsed_dates.append(datetime.datetime(year, month, day, hour, minute, second))
                except ValueError as e:
                    # We might have gotten the day and month the wrong way around, let's try it the other way around
                    # If you're not using an ISO-standard date format, you're an evil registrar!
                    parsed_dates.append(datetime.datetime(year, day, month, hour, minute, second))
        except UnboundLocalError as e:
            pass # No date rule matched anything for this entry

    if len(parsed_dates) > 0:
        return parsed_dates
    else:
        return None

def remove_duplicates(data):
    """Return a copy of data with duplicates removed, preserving order."""
    cleaned_list = []

    for entry in data:
        if entry not in cleaned_list:
            cleaned_list.append(entry)

    return cleaned_list

def remove_suffixes(data):
    """Keep only the first whitespace-delimited token of each entry.

    Used to get rid of IP suffixes for nameservers.
    """
    cleaned_list = []

    for entry in data:
        cleaned_list.append(re.search("([^\s]+)\s*[\s]*", entry).group(1).lstrip())

    return cleaned_list

def parse_registrants(data, never_query_handles=True, handle_server=""):
    """Extract registrant/tech/admin/billing contacts from raw WHOIS segments.

    Tries the inline contact regexes first, then resolves NIC handle
    references (optionally querying handle_server when
    never_query_handles=False). Returns a dict with those four keys; each
    value is a dict of contact fields or None.
    """
    registrant = None
    tech_contact = None
    billing_contact = None
    admin_contact = None

    for segment in data:
        for regex in registrant_regexes:
            match = re.search(regex, segment)
            if match is not None:
                registrant = match.groupdict()
                break

    for segment in data:
        for regex in tech_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                tech_contact = match.groupdict()
                break

    for segment in data:
        for regex in admin_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                admin_contact = match.groupdict()
                break

    for segment in data:
        for regex in billing_contact_regexes:
            match = re.search(regex, segment)
            if match is not None:
                billing_contact = match.groupdict()
                break

    # Find NIC handle contact definitions
    handle_contacts = parse_nic_contact(data)

    # Find NIC handle references and process them
    for category in nic_contact_references:
        for regex in nic_contact_references[category]:
            for segment in data:
                match = re.search(regex, segment)
                if match is not None:
                    data_reference = match.groupdict()
                    if data_reference["handle"] == "-" or re.match("https?:\/\/", data_reference["handle"]) is not None:
                        pass # Reference was either blank or a URL; the latter is to deal with false positives for nic.ru
                    else:
                        found = False
                        for contact in handle_contacts:
                            if contact["handle"] == data_reference["handle"]:
                                found = True
                                data_reference.update(contact)
                        if found == False:
                            # The contact definition was not found in the supplied raw WHOIS data. If the
                            # method has been called with never_query_handles=False, we can use the supplied
                            # WHOIS server for looking up the handle information separately.
                            if never_query_handles == False:
                                try:
                                    contact = fetch_nic_contact(data_reference["handle"], handle_server)
                                    data_reference.update(contact)
                                except shared.WhoisException as e:
                                    pass # No data found. TODO: Log error?
                            else:
                                pass # TODO: Log warning?
                        if category == "registrant":
                            registrant = data_reference
                        elif category == "tech":
                            tech_contact = data_reference
                        elif category == "billing":
                            billing_contact = data_reference
                        elif category == "admin":
                            admin_contact = data_reference
                    break

    # Post-processing
    for obj in (registrant, tech_contact, billing_contact, admin_contact):
        if obj is not None:
            for key in list(obj.keys()):
                if obj[key] is None or obj[key].strip() == "": # Just chomp all surrounding whitespace
                    del obj[key]
                else:
                    obj[key] = obj[key].strip()
            if "phone_ext" in obj:
                if "phone" in obj:
                    obj["phone"] += " ext. %s" % obj["phone_ext"]
                    del obj["phone_ext"]
            # Collapse street1..streetN into a single newline-joined "street".
            if "street1" in obj:
                street_items = []
                i = 1
                while True:
                    try:
                        street_items.append(obj["street%d" % i])
                        del obj["street%d" % i]
                    except KeyError as e:
                        break
                    i += 1
                obj["street"] = "\n".join(street_items)
            if "organization1" in obj: # This is to deal with eg. HKDNR, who allow organization names in multiple languages.
                organization_items = []
                i = 1
                while True:
                    try:
                        if obj["organization%d" % i].strip() != "":
                            organization_items.append(obj["organization%d" % i])
                        del obj["organization%d" % i]
                    except KeyError as e:
                        break
                    i += 1
                obj["organization"] = "\n".join(organization_items)
            if 'changedate' in obj:
                obj['changedate'] = parse_dates([obj['changedate']])[0]
            if 'creationdate' in obj:
                obj['creationdate'] = parse_dates([obj['creationdate']])[0]
            if 'street' in obj and "\n" in obj["street"] and 'postalcode' not in obj:
                # Deal with certain mad WHOIS servers that don't properly delimit address data... (yes, AFNIC, looking at you)
                lines = [x.strip() for x in obj["street"].splitlines()]
                if " " in lines[-1]:
                    postal_code, city = lines[-1].split(" ", 1)
                    if "." not in lines[-1] and re.match("[0-9]", postal_code) and len(postal_code) >= 3:
                        obj["postalcode"] = postal_code
                        obj["city"] = city
                        obj["street"] = "\n".join(lines[:-1])
            if 'firstname' in obj or 'lastname' in obj:
                elements = []
                if 'firstname' in obj:
                    elements.append(obj["firstname"])
                if 'lastname' in obj:
                    elements.append(obj["lastname"])
                obj["name"] = " ".join(elements)
            if 'country' in obj and 'city' in obj and (re.match("^R\.?O\.?C\.?$", obj["country"], re.IGNORECASE) or obj["country"].lower() == "republic of china") and obj["city"].lower() == "taiwan":
                # There's an edge case where some registrants append ", Republic of China" after "Taiwan", and this is mis-parsed
                # as Taiwan being the city. This is meant to correct that.
                obj["country"] = "%s, %s" % (obj["city"], obj["country"])
                lines = [x.strip() for x in obj["street"].splitlines()]
                obj["city"] = lines[-1]
                obj["street"] = "\n".join(lines[:-1])

    return {
        "registrant": registrant,
        "tech": tech_contact,
        "admin": admin_contact,
        "billing": billing_contact,
    }

def fetch_nic_contact(handle, lookup_server):
    """Look up a NIC handle on lookup_server and return the parsed contact.

    Raises shared.WhoisException when the response contains no contact data.
    """
    response = net.get_whois_raw(handle, lookup_server)
    response = [segment.replace("\r", "") for segment in response] # Carriage returns are the devil
    results = parse_nic_contact(response)

    if len(results) > 0:
        return results[0]
    else:
        raise shared.WhoisException("No contact data found in the response.")

def parse_nic_contact(data):
    """Return all NIC handle contact records found in the raw segments."""
    handle_contacts = []
    for regex in nic_contact_regexes:
        for segment in data:
            matches = re.finditer(regex, segment)
            for match in matches:
                handle_contacts.append(match.groupdict())

    return handle_contacts