├── .gitignore ├── LICENSE ├── README.md ├── SpiderWho.py ├── config.py ├── helperThreads.py ├── proxywhois.py ├── socks.py └── whoisThread.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | GNU GENERAL PUBLIC LICENSE 3 | Version 2, June 1991 4 | 5 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., [http://fsf.org/] 6 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 7 | Everyone is permitted to copy and distribute verbatim copies 8 | of this license document, but changing it is not allowed. 9 | 10 | Preamble 11 | 12 | The licenses for most software are designed to take away your 13 | freedom to share and change it. By contrast, the GNU General Public 14 | License is intended to guarantee your freedom to share and change free 15 | software--to make sure the software is free for all its users. This 16 | General Public License applies to most of the Free Software 17 | Foundation's software and to any other program whose authors commit to 18 | using it. (Some other Free Software Foundation software is covered by 19 | the GNU Lesser General Public License instead.) You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | this service if you wish), that you receive source code or can get it 26 | if you want it, that you can change the software or use pieces of it 27 | in new free programs; and that you know you can do these things. 28 | 29 | To protect your rights, we need to make restrictions that forbid 30 | anyone to deny you these rights or to ask you to surrender the rights. 31 | These restrictions translate to certain responsibilities for you if you 32 | distribute copies of the software, or if you modify it. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must give the recipients all the rights that 36 | you have. You must make sure that they, too, receive or can get the 37 | source code. And you must show them these terms so they know their 38 | rights. 39 | 40 | We protect your rights with two steps: (1) copyright the software, and 41 | (2) offer you this license which gives you legal permission to copy, 42 | distribute and/or modify the software. 43 | 44 | Also, for each author's protection and ours, we want to make certain 45 | that everyone understands that there is no warranty for this free 46 | software. If the software is modified by someone else and passed on, we 47 | want its recipients to know that what they have is not the original, so 48 | that any problems introduced by others will not reflect on the original 49 | authors' reputations. 50 | 51 | Finally, any free program is threatened constantly by software 52 | patents. We wish to avoid the danger that redistributors of a free 53 | program will individually obtain patent licenses, in effect making the 54 | program proprietary. To prevent this, we have made it clear that any 55 | patent must be licensed for everyone's free use or not licensed at all. 56 | 57 | The precise terms and conditions for copying, distribution and 58 | modification follow. 59 | 60 | GNU GENERAL PUBLIC LICENSE 61 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 62 | 63 | 0. This License applies to any program or other work which contains 64 | a notice placed by the copyright holder saying it may be distributed 65 | under the terms of this General Public License. The "Program", below, 66 | refers to any such program or work, and a "work based on the Program" 67 | means either the Program or any derivative work under copyright law: 68 | that is to say, a work containing the Program or a portion of it, 69 | either verbatim or with modifications and/or translated into another 70 | language. (Hereinafter, translation is included without limitation in 71 | the term "modification".) Each licensee is addressed as "you". 72 | 73 | Activities other than copying, distribution and modification are not 74 | covered by this License; they are outside its scope. The act of 75 | running the Program is not restricted, and the output from the Program 76 | is covered only if its contents constitute a work based on the 77 | Program (independent of having been made by running the Program). 78 | Whether that is true depends on what the Program does. 79 | 80 | 1. You may copy and distribute verbatim copies of the Program's 81 | source code as you receive it, in any medium, provided that you 82 | conspicuously and appropriately publish on each copy an appropriate 83 | copyright notice and disclaimer of warranty; keep intact all the 84 | notices that refer to this License and to the absence of any warranty; 85 | and give any other recipients of the Program a copy of this License 86 | along with the Program. 87 | 88 | You may charge a fee for the physical act of transferring a copy, and 89 | you may at your option offer warranty protection in exchange for a fee. 90 | 91 | 2. You may modify your copy or copies of the Program or any portion 92 | of it, thus forming a work based on the Program, and copy and 93 | distribute such modifications or work under the terms of Section 1 94 | above, provided that you also meet all of these conditions: 95 | 96 | a) You must cause the modified files to carry prominent notices 97 | stating that you changed the files and the date of any change. 98 | 99 | b) You must cause any work that you distribute or publish, that in 100 | whole or in part contains or is derived from the Program or any 101 | part thereof, to be licensed as a whole at no charge to all third 102 | parties under the terms of this License. 103 | 104 | c) If the modified program normally reads commands interactively 105 | when run, you must cause it, when started running for such 106 | interactive use in the most ordinary way, to print or display an 107 | announcement including an appropriate copyright notice and a 108 | notice that there is no warranty (or else, saying that you provide 109 | a warranty) and that users may redistribute the program under 110 | these conditions, and telling the user how to view a copy of this 111 | License. (Exception: if the Program itself is interactive but 112 | does not normally print such an announcement, your work based on 113 | the Program is not required to print an announcement.) 114 | 115 | These requirements apply to the modified work as a whole. If 116 | identifiable sections of that work are not derived from the Program, 117 | and can be reasonably considered independent and separate works in 118 | themselves, then this License, and its terms, do not apply to those 119 | sections when you distribute them as separate works. But when you 120 | distribute the same sections as part of a whole which is a work based 121 | on the Program, the distribution of the whole must be on the terms of 122 | this License, whose permissions for other licensees extend to the 123 | entire whole, and thus to each and every part regardless of who wrote it. 124 | 125 | Thus, it is not the intent of this section to claim rights or contest 126 | your rights to work written entirely by you; rather, the intent is to 127 | exercise the right to control the distribution of derivative or 128 | collective works based on the Program. 129 | 130 | In addition, mere aggregation of another work not based on the Program 131 | with the Program (or with a work based on the Program) on a volume of 132 | a storage or distribution medium does not bring the other work under 133 | the scope of this License. 134 | 135 | 3. You may copy and distribute the Program (or a work based on it, 136 | under Section 2) in object code or executable form under the terms of 137 | Sections 1 and 2 above provided that you also do one of the following: 138 | 139 | a) Accompany it with the complete corresponding machine-readable 140 | source code, which must be distributed under the terms of Sections 141 | 1 and 2 above on a medium customarily used for software interchange; or, 142 | 143 | b) Accompany it with a written offer, valid for at least three 144 | years, to give any third party, for a charge no more than your 145 | cost of physically performing source distribution, a complete 146 | machine-readable copy of the corresponding source code, to be 147 | distributed under the terms of Sections 1 and 2 above on a medium 148 | customarily used for software interchange; or, 149 | 150 | c) Accompany it with the information you received as to the offer 151 | to distribute corresponding source code. (This alternative is 152 | allowed only for noncommercial distribution and only if you 153 | received the program in object code or executable form with such 154 | an offer, in accord with Subsection b above.) 155 | 156 | The source code for a work means the preferred form of the work for 157 | making modifications to it. For an executable work, complete source 158 | code means all the source code for all modules it contains, plus any 159 | associated interface definition files, plus the scripts used to 160 | control compilation and installation of the executable. However, as a 161 | special exception, the source code distributed need not include 162 | anything that is normally distributed (in either source or binary 163 | form) with the major components (compiler, kernel, and so on) of the 164 | operating system on which the executable runs, unless that component 165 | itself accompanies the executable. 166 | 167 | If distribution of executable or object code is made by offering 168 | access to copy from a designated place, then offering equivalent 169 | access to copy the source code from the same place counts as 170 | distribution of the source code, even though third parties are not 171 | compelled to copy the source along with the object code. 172 | 173 | 4. You may not copy, modify, sublicense, or distribute the Program 174 | except as expressly provided under this License. Any attempt 175 | otherwise to copy, modify, sublicense or distribute the Program is 176 | void, and will automatically terminate your rights under this License. 177 | However, parties who have received copies, or rights, from you under 178 | this License will not have their licenses terminated so long as such 179 | parties remain in full compliance. 180 | 181 | 5. You are not required to accept this License, since you have not 182 | signed it. However, nothing else grants you permission to modify or 183 | distribute the Program or its derivative works. These actions are 184 | prohibited by law if you do not accept this License. Therefore, by 185 | modifying or distributing the Program (or any work based on the 186 | Program), you indicate your acceptance of this License to do so, and 187 | all its terms and conditions for copying, distributing or modifying 188 | the Program or works based on it. 189 | 190 | 6. Each time you redistribute the Program (or any work based on the 191 | Program), the recipient automatically receives a license from the 192 | original licensor to copy, distribute or modify the Program subject to 193 | these terms and conditions. You may not impose any further 194 | restrictions on the recipients' exercise of the rights granted herein. 195 | You are not responsible for enforcing compliance by third parties to 196 | this License. 197 | 198 | 7. If, as a consequence of a court judgment or allegation of patent 199 | infringement or for any other reason (not limited to patent issues), 200 | conditions are imposed on you (whether by court order, agreement or 201 | otherwise) that contradict the conditions of this License, they do not 202 | excuse you from the conditions of this License. If you cannot 203 | distribute so as to satisfy simultaneously your obligations under this 204 | License and any other pertinent obligations, then as a consequence you 205 | may not distribute the Program at all. For example, if a patent 206 | license would not permit royalty-free redistribution of the Program by 207 | all those who receive copies directly or indirectly through you, then 208 | the only way you could satisfy both it and this License would be to 209 | refrain entirely from distribution of the Program. 210 | 211 | If any portion of this section is held invalid or unenforceable under 212 | any particular circumstance, the balance of the section is intended to 213 | apply and the section as a whole is intended to apply in other 214 | circumstances. 215 | 216 | It is not the purpose of this section to induce you to infringe any 217 | patents or other property right claims or to contest validity of any 218 | such claims; this section has the sole purpose of protecting the 219 | integrity of the free software distribution system, which is 220 | implemented by public license practices. Many people have made 221 | generous contributions to the wide range of software distributed 222 | through that system in reliance on consistent application of that 223 | system; it is up to the author/donor to decide if he or she is willing 224 | to distribute software through any other system and a licensee cannot 225 | impose that choice. 226 | 227 | This section is intended to make thoroughly clear what is believed to 228 | be a consequence of the rest of this License. 229 | 230 | 8. If the distribution and/or use of the Program is restricted in 231 | certain countries either by patents or by copyrighted interfaces, the 232 | original copyright holder who places the Program under this License 233 | may add an explicit geographical distribution limitation excluding 234 | those countries, so that distribution is permitted only in or among 235 | countries not thus excluded. In such case, this License incorporates 236 | the limitation as if written in the body of this License. 237 | 238 | 9. The Free Software Foundation may publish revised and/or new versions 239 | of the General Public License from time to time. Such new versions will 240 | be similar in spirit to the present version, but may differ in detail to 241 | address new problems or concerns. 242 | 243 | Each version is given a distinguishing version number. If the Program 244 | specifies a version number of this License which applies to it and "any 245 | later version", you have the option of following the terms and conditions 246 | either of that version or of any later version published by the Free 247 | Software Foundation. If the Program does not specify a version number of 248 | this License, you may choose any version ever published by the Free Software 249 | Foundation. 250 | 251 | 10. If you wish to incorporate parts of the Program into other free 252 | programs whose distribution conditions are different, write to the author 253 | to ask for permission. For software which is copyrighted by the Free 254 | Software Foundation, write to the Free Software Foundation; we sometimes 255 | make exceptions for this. Our decision will be guided by the two goals 256 | of preserving the free status of all derivatives of our free software and 257 | of promoting the sharing and reuse of software generally. 258 | 259 | NO WARRANTY 260 | 261 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 262 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 263 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 264 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 265 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 266 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 267 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 268 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 269 | REPAIR OR CORRECTION. 270 | 271 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 272 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 273 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 274 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 275 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 276 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 277 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 278 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 279 | POSSIBILITY OF SUCH DAMAGES. 280 | 281 | END OF TERMS AND CONDITIONS 282 | 283 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # SpiderWho 3 | 4 | ### A fast WHOIS crawler 5 | 6 | ## Usage 7 | ``` 8 | usage: SpiderWho.py [-h] [-n NUMPROXIES] [-o OUT] [-f] [-s] [-sn SKIPNUMBER] 9 | [-d] [-e] [-l] [-q] [-z] 10 | proxies domains 11 | 12 | positional arguments: 13 | proxies file containing a list of proxies and ports 14 | domains file containing a list of domains to use 15 | 16 | optional arguments: 17 | -h, --help show this help message and exit 18 | -n NUMPROXIES, --numProxies NUMPROXIES 19 | Maximum number of proxies to use. All=0 Default: 0 20 | -o OUT, --out OUT Output directory to store results. Default: out/ 21 | -f, --files Output to files instead of tgz. Default: False 22 | -s, --skip Skip domains that already have results. Only 23 | compatible with --files Default: False 24 | -sn SKIPNUMBER, --skipNumber SKIPNUMBER 25 | Skip n domains that already have results. Default: 0 26 | -d, --debug Enable debug printing 27 | -e, --emailVerify Enable Email validity check 28 | -l, --log Enable log saving 29 | -q, --quiet Disable status printing 30 | -z, --lazy Enable Lazy mode. Give up after a few ratelimits 31 | ``` 32 | 33 | ### Proxy Lists 34 | Proxy lists should have one proxy per line in the following format: 35 | `http://MyProxyHost:port`. 36 | Both http and socks proxies are supported. Comments are allowed and start with "#". 37 | 38 | ### Domain Lists 39 | Domain lists should have one domain per line. 40 | List may contain domain names or IP addresses. 41 | 42 | ### Lazy mode 43 | Lazy mode will increase your Lookups per Second (LPS) and overall speed at the cost of accuracy 44 | In lazy mode, if a WHOIS server has a rate limit much stronger than what we expect we will fail the domain after 3 attempts. 45 | In normal mode we will try until we get a result. 46 | 47 | ### Output 48 | ``` 49 | Prog All New Fail Completed Active/Proxies DPS Time 50 | 14% 16803445 2438762 9.7% 2192796 90 / 270 32.3 18:11:47 51 | ``` 52 | Prog: How much of the input domain list has been read and queued. 53 | All: How many domains have been scanned as input (also includes skipped domains with the -sn option) 54 | New: Number of domains set to be crawled. 55 | Fail: Percent of domains that have failed more than the max retries amount. 56 | Completed: Total number of domains with saved results. 57 | Active: The number of active threads actively performing a WHOIS query not waiting due to rate limiting or other blocking operations. 58 | Proxies: The total number of working and proxies. This number may change as proxies provided go up or down. 59 | DPS/LPS: Domains or Lookups per second. How many queries have been performed each second. 60 | Time: The total running time of the program. 61 | 62 | ### Output Data 63 | By default a .tgz file is created in the output directory with the results that is rotated and named 64 | with the timestamp of the first record it contains. 65 | This behavior an be disabled with the --files option to create a new file for every domain, however 66 | this will cause poor behavior on large scans due to the massive amount of files out into a single directory. 67 | 68 | ### Advanced Settings 69 | Advanced settings can be changed in config.py. 70 | config.py contains default values that can be overridden by command arguments. 71 | 72 | ## TODO 73 | 1. Adaptive query back-off 74 | 2. Support for whois servers that forward to http 75 | -------------------------------------------------------------------------------- /SpiderWho.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Main SpiderWho entrypoint 4 | See ./SpiderWho.py -h for how to use 5 | ''' 6 | import time 7 | from helperThreads import ManagerThread 8 | import datetime 9 | import argparse 10 | import config 11 | import sys 12 | import whoisThread 13 | 14 | last_lookups = 0 15 | 16 | def set_proc_name(newname): 17 | try: 18 | import setproctitle 19 | setproctitle.setproctitle(newname) 20 | except: 21 | pass 22 | 23 | 24 | def getTerminalSize(): 25 | """ 26 | stolen from http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python 27 | returns (width, height) 28 | """ 29 | import os 30 | env = os.environ 31 | def ioctl_GWINSZ(fd): 32 | try: 33 | import fcntl, termios, struct, os 34 | cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, 35 | '1234')) 36 | except: 37 | return 38 | return cr 39 | cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) 40 | if not cr: 41 | try: 42 | fd = os.open(os.ctermid(), os.O_RDONLY) 43 | cr = ioctl_GWINSZ(fd) 44 | os.close(fd) 45 | except: 46 | pass 47 | if not cr: 48 | cr = (env.get('LINES', 25), env.get('COLUMNS', 80)) 49 | 50 | ### Use get(key[, default]) instead of a try/catch 51 | #try: 52 | # cr = (env['LINES'], env['COLUMNS']) 53 | #except: 54 | # cr = (25, 80) 55 | return int(cr[1]), int(cr[0]) 56 | 57 | def print_status_line(): 58 | '''prints the statusline header''' 59 | rps = "LPS" 60 | if config.DPS: 61 | rps = "DPS" 62 | title = "\r%4s %9s %9s %6s %9s %7s/%-7s %6s %s" % ("Prog", "All", "New", "Fail", "Completed", "Active", "Proxies", rps, "Time") 63 | sys.stdout.write(title) 64 | sys.stdout.write("\n") 65 | sys.stdout.flush() 66 | 67 | 68 | 69 | def print_status_data(manager): 70 | '''updates the statusline data''' 71 | global last_lookups 72 | running_seconds = (time.time() - config.START_TIME) 73 | 74 | domains = manager.input_thread.getDomainCount() 75 | good_saved = manager.save_thread.getNumGood() 76 | fail_saved = manager.save_thread.getNumFails() 77 | total_saved = manager.save_thread.getNumSaved() 78 | skipped = manager.input_thread.getNumSkipped() 79 | active_threads = whoisThread.getActiveThreadCount() 80 | total_threads = whoisThread.getProxyThreadCount() 81 | running_time = str(datetime.timedelta(seconds=int(running_seconds))) 82 | q_size = manager.input_queue.qsize() 83 | progress = 100*manager.input_thread.getProgress() 84 | 85 | rlookups = good_saved 86 | if not config.DPS: 87 | rlookups = whoisThread.getLookupCount() 88 | 89 | last_lps = (rlookups-last_lookups)/config.STATUS_UPDATE_DELAY 90 | total_lps = rlookups/running_seconds 91 | lps = (last_lps * 0.8) + (total_lps * 0.2) 92 | last_lookups = rlookups 93 | 94 | allDomains = (domains + skipped) - q_size 95 | 96 | failp = 0.0 97 | if total_saved != 0: 98 | failp = 100.0 * ( float(fail_saved) / float(total_saved) ) 99 | 100 | # term info 101 | (width, height) = getTerminalSize() 102 | # clear screen 103 | sys.stdout.write('\r' + (' ' * width)) 104 | 105 | data = "\r%3.0f%% %9d %9d %5.1f%% %9d %6d / %-6d %6.1f %s" % (progress, allDomains, domains, failp, good_saved, active_threads, total_threads, lps, running_time) 106 | 107 | sys.stdout.write(data) 108 | 109 | if q_size < (config.MAX_QUEUE_SIZE/10): 110 | sys.stdout.write(" WARNING: input queue is %d " % q_size) 111 | 112 | sq_size = manager.save_queue.qsize() 113 | if sq_size > (config.MAX_QUEUE_SIZE/5): 114 | sys.stdout.write(" WARNING: save queue is %d " % sq_size) 115 | 116 | sys.stdout.flush() 117 | 118 | 119 | def run(): 120 | '''main entrypoint once config has been set by main''' 121 | manager = ManagerThread() 122 | manager.daemon = True #required for ctrl-c exit 123 | config.START_TIME = time.time() 124 | manager.start() 125 | 126 | if config.DEBUG: 127 | print "Waiting for threads to settle" 128 | while not manager.ready: 129 | time.sleep(0.2) 130 | 131 | if config.PRINT_STATUS: 132 | print_status_line() 133 | print_status_data(manager) 134 | 135 | time.sleep(5) 136 | 137 | try: 138 | while whoisThread.getProxyThreadCount() > 0 and manager.isAlive(): 139 | if config.PRINT_STATUS: 140 | print_status_data(manager) 141 | time.sleep(config.STATUS_UPDATE_DELAY) 142 | if (whoisThread.getProxyThreadCount() == 0): 143 | print "No valid Proxy threads running!!" 144 | except KeyboardInterrupt: 145 | q_size = manager.input_queue.qsize() 146 | if q_size <= (config.MAX_QUEUE_SIZE - 1): 147 | skipped = manager.input_thread.getNumSkipped() 148 | loaded = manager.input_thread.getDomainCount() 149 | total = skipped + loaded - config.MAX_QUEUE_SIZE 150 | print "\nExamined at least %d domains" % (total) 151 | config.PRINT_STATUS = False 152 | pass 153 | finally: 154 | # ensure the tar file is closed 155 | manager.save_thread.closeTar() 156 | if config.PRINT_STATUS: 157 | print_status_data(manager) 158 | sys.stdout.write("\n") 159 | if config.SAVE_LOGS: 160 | whoisThread.printExceptionCounts() 161 | 162 | 163 | if __name__ == '__main__': 164 | set_proc_name("SpiderWho") 165 | parser = argparse.ArgumentParser() 166 | parser.add_argument("proxies", help="file containing a list of proxies and ports") 167 | parser.add_argument("domains", help="file containing a list of domains to use") 168 | parser.add_argument("-n", "--numProxies", help="Maximum number of proxies to use. All=0 Default: "+str(config.NUM_PROXIES), type=int, default=config.NUM_PROXIES) 169 | parser.add_argument("-o", "--out", help="Output directory to store results. Default: "+config.OUTPUT_FOLDER, default=config.OUTPUT_FOLDER) 170 | parser.add_argument("-f", "--files", help="Output to files instead of tgz. Default: "+str(not config.SAVE_TAR), action="store_true", default=(not config.SAVE_TAR)) 171 | parser.add_argument("-s", "--skip", help="Skip domains that already have results. Only compatible with --files Default: "+str(config.SKIP_DONE), action='store_true', default=config.SKIP_DONE) 172 | parser.add_argument("-sn", "--skipNumber", help="Skip n domains that already have results. Default: 0", type=int, default=config.SKIP_DOMAINS) 173 | parser.add_argument("-sp", "--split", help="Split Thick and Thin whois results into different folders. Default: "+str(config.SPLIT_THICK), action='store_true', default=config.SPLIT_THICK) 174 | parser.add_argument("-d", "--debug", help="Enable debug printing", action='store_true', default=config.DEBUG) 175 | parser.add_argument("-e", "--emailVerify", help="Enable Email validity check", action='store_true', default=config.RESULT_VALIDCHECK) 176 | parser.add_argument("-l", "--log", help="Enable log saving", action='store_true', default=config.SAVE_LOGS) 177 | parser.add_argument("-q", "--quiet", help="Disable status printing", action='store_true', default=(not config.PRINT_STATUS)) 178 | parser.add_argument("-z", "--lazy", help="Enable Lazy mode. Give up after a few ratelimits", action='store_true', default=config.LAZY_MODE) 179 | args = parser.parse_args() 180 | 181 | config.PROXY_LIST = args.proxies 182 | config.DOMAIN_LIST = args.domains 183 | config.NUM_PROXIES = args.numProxies 184 | config.OUTPUT_FOLDER = args.out+"/" 185 | config.SKIP_DONE = args.skip 186 | config.DEBUG = args.debug 187 | config.RESULT_VALIDCHECK = args.emailVerify 188 | config.PRINT_STATUS = not args.quiet 189 | config.SAVE_LOGS = args.log 190 | config.SPLIT_THICK = args.split 191 | config.LAZY_MODE = args.lazy 192 | config.SKIP_DOMAINS = args.skipNumber 193 | config.SAVE_TAR = not args.files 194 | 195 | if config.SKIP_DONE and config.SAVE_TAR: 196 | print "--skip is only compatible with --files" 197 | else: 198 | run() 199 | 200 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | '''Python module to contain all configuration settings for SpiderWho''' 2 | 3 | '''Debug Mode''' 4 | DEBUG = False 5 | 6 | ''' When printing display DPS or LPS ''' 7 | DPS = True 8 | 9 | '''Enable checking result with EMAIL_REGEX''' 10 | RESULT_VALIDCHECK = False 11 | 12 | '''Regex used for whois validation when email check is enabled''' 13 | EMAIL_REGEX = r'[\w.-]+@[\w.-]+' 14 | 15 | '''Skip domains that already have a result saved''' 16 | SKIP_DONE = False 17 | 18 | '''Enable printing of status output''' 19 | PRINT_STATUS = True 20 | 21 | '''Enable logging''' 22 | SAVE_LOGS = False 23 | 24 | '''Enable lazy mode''' 25 | LAZY_MODE = False 26 | 27 | '''Folders to use for output''' 28 | OUTPUT_FOLDER = "out/" 29 | RESULTS_FOLDER = "results/" 30 | LOG_FOLDER = "logs/" 31 | 32 | '''Name of file to place domains that failed''' 33 | FAIL_FILENAME = "fail.txt" 34 | 35 | '''File extensions''' 36 | SAVE_EXT = "whois" 37 | LOG_EXT = "log" 38 | 39 | '''Maximum number of proxies/threads to use, 0=All''' 40 | NUM_PROXIES = 0 41 | 42 | '''Maximum size of queues, when the queues reach their max size they will block new items until items are removed''' 43 | MAX_QUEUE_SIZE = 10000 44 | 45 | '''Minimum number of lines of response required for it to be considered valid''' 46 | MIN_RESPONSE_LINES = 4 47 | 48 | '''When a result fails for any reason, retry it''' 49 | '''setting this value to less than 2 will greatly reduce the reliability of the program''' 50 | MAX_ATTEMPTS = 3 51 | 52 | '''Amount of seconds to wait when updating output (float)''' 53 | STATUS_UPDATE_DELAY = 1.0 54 | 55 | '''Amount of seconds to wait between using the same whois server per proxy''' 56 | WHOIS_SERVER_JUMP_DELAY = 10 57 | 58 | '''Minimum seconds to sleep when waiting for a JUMP_DELAY''' 59 | WHOIS_SERVER_SLEEP_DELAY = 5 60 | 61 | '''Amount of seconds to give each whois query before failing''' 62 | WHOIS_TIMEOUT_SECONDS = 10 63 | 64 | '''Amount of seconds to wait before trying to reconnect to a failed proxy''' 65 | PROXY_FAIL_RECONNECT_DELAY = 20 66 | 67 | ''' How many minutes to wait before trimming whois history ''' 68 | WHOIS_HISTORY_TRIM_MINUTES = 15 69 | 70 | ''' Saves the tar.gz output format ''' 71 | SAVE_TAR = True 72 | 73 | ''' Numer of results to put in tar file befor rotating ''' 74 | SAVE_TAR_SIZE = 500000 75 | 76 | 77 | ''' Split Thick and Thin results ''' 78 | SPLIT_THICK = False 79 | 80 | 81 | ''' 82 | list of servers who are known to be VERY strict with whois data 83 | TODO currently unused 84 | ''' 85 | STRICT_SERVERS = [ 86 | "org.whois-servers.net", 87 | "whois.godaddy.com" 88 | ] 89 | 90 | '''Placeholders, set at runtime''' 91 | DOMAIN_LIST = None 92 | PROXY_LIST = None 93 | START_TIME = 0 94 | SKIP_DOMAINS = 0 95 | -------------------------------------------------------------------------------- /helperThreads.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from Queue import Queue 3 | import time 4 | import whoisThread 5 | import os 6 | import os.path 7 | import urlparse 8 | import config 9 | import tarfile 10 | import StringIO 11 | 12 | #this thread is in charge of starting all the other 13 | #threads and keeping track of thir running status 14 | class ManagerThread(threading.Thread): 15 | '''main thread that is responsible for starting and keeping 16 | track of all other threads''' 17 | 18 | def __init__(self): 19 | threading.Thread.__init__(self) 20 | self.input_queue = Queue() #maxsize set inside EnqueueThread 21 | self.save_queue = Queue(maxsize=config.MAX_QUEUE_SIZE) 22 | self.input_thread = None 23 | self.save_thread = None 24 | self.ready = False 25 | self.threads = list() 26 | 27 | 28 | def run(self): 29 | #startSaveThread 30 | self.save_thread = SaveThread(self.save_queue) 31 | self.save_thread.start() 32 | 33 | #start whois threads 34 | try: 35 | for l in open(config.PROXY_LIST,'r'): 36 | if config.NUM_PROXIES == 0 or len(self.threads) < config.NUM_PROXIES: 37 | l = l.strip() 38 | if l[0] != '#': #if not a comment 39 | url = urlparse.urlparse(l) 40 | proxy_type = None 41 | if url.scheme == "http": 42 | proxy_type = whoisThread.socks.PROXY_TYPE_HTTP 43 | elif url.scheme == "socks": 44 | proxy_type = whoisThread.socks.PROXY_TYPE_SOCKS4 45 | else: 46 | print "Unknown Proxy Type" 47 | if proxy_type: 48 | proxy = whoisThread.Proxy(url.hostname, url.port, proxy_type) 49 | t = whoisThread.WhoisThread(proxy, self.input_queue, self.save_queue) 50 | t.start() 51 | self.threads.append(t) 52 | except IOError: 53 | print "Unable to open proxy file: " + config.PROXY_LIST 54 | return 55 | if config.DEBUG: 56 | print str(whoisThread.getProxyThreadCount()) + " threads started" 57 | 58 | #now start EnqueueThread 59 | self.input_thread = EnqueueThread(self.input_queue) 60 | self.input_thread.start() 61 | 62 | #wait for threads to settle 63 | time.sleep(0.2) 64 | 65 | self.ready = True 66 | 67 | #now wait for all the work to be done 68 | while self.input_thread.isAlive(): 69 | time.sleep(0.5) 70 | 71 | if config.DEBUG: 72 | print "Done loading domains to queue" 73 | 74 | while self.input_queue.qsize() > whoisThread.getProxyThreadCount(): 75 | time.sleep(config.WHOIS_SERVER_JUMP_DELAY) 76 | 77 | #when the reamining queries are all waiting for an open proxy, reduce the delay 78 | #TODO this does not always prevent getting stuck on the last few 79 | config.WHOIS_SERVER_JUMP_DELAY = config.WHOIS_SERVER_SLEEP_DELAY 80 | config.WHOIS_SERVER_SLEEP_DELAY = 1 81 | 82 | self.input_queue.join() 83 | 84 | if config.DEBUG: 85 | print "Saving results" 86 | self.save_queue.join() 87 | 88 | 89 | 90 | #this is a simple thread to read input lines from a 91 | #file and add them to the queue for prossessing 92 | class EnqueueThread(threading.Thread): 93 | def __init__(self,queue): 94 | threading.Thread.__init__(self) 95 | self._queue = queue 96 | self._domains = 0 97 | self.valid = False 98 | self.skipped = 0 99 | self._results_folder = config.OUTPUT_FOLDER+config.RESULTS_FOLDER 100 | self._inputSize = 0.1 # not 0 to prevent divide by 0 errors 101 | self._fh = None 102 | 103 | def getNumSkipped(self): 104 | return self.skipped 105 | 106 | def skipDomain(self, domain): 107 | path = self._results_folder+domain+"."+config.SAVE_EXT 108 | return os.path.isfile(path) 109 | 110 | def getDomainCount(self): 111 | return self._domains 112 | 113 | def getProgress(self): 114 | if self._fh == None: 115 | return 1.0 116 | return self._fh.tell() / self._inputSize 117 | 118 | def run(self): 119 | try: 120 | self._fh = open(config.DOMAIN_LIST, 'r') 121 | self.valid = True 122 | self._inputSize = float(os.fstat(self._fh.fileno()).st_size) 123 | except IOError: 124 | self.valid = False 125 | print "Unable to open file: "+ config.DOMAIN_LIST 126 | return 127 | for l in self._fh: 128 | if self.skipped < config.SKIP_DOMAINS: 129 | self.skipped +=1 130 | continue 131 | l = l.strip().lower() 132 | if len(l) > 3: 133 | if not (config.SKIP_DONE and self.skipDomain(l)): 134 | while self._queue.qsize() >= config.MAX_QUEUE_SIZE: 135 | time.sleep(0.1) 136 | self._queue.put(whoisThread.WhoisResult(l)) 137 | self._domains +=1 138 | else: 139 | self.skipped +=1 140 | self._fh.close() 141 | self._fh = None 142 | 143 | #runs in the background and saves data as we collect it 144 | class SaveThread(threading.Thread): 145 | def __init__(self, queue): 146 | threading.Thread.__init__(self) 147 | self._queue = queue 148 | self._num_saved = 0 149 | self._num_good = 0 150 | self._num_faild = 0 151 | self._num_tared = 0 152 | self._tar_file = None 153 | self._fail_filepath = self.getFailFileName() 154 | self._log_folder = config.OUTPUT_FOLDER + config.LOG_FOLDER 155 | self._results_folder = config.OUTPUT_FOLDER + config.RESULTS_FOLDER 156 | if not os.path.exists(config.OUTPUT_FOLDER): 157 | os.makedirs(config.OUTPUT_FOLDER) 158 | if not os.path.exists(self._log_folder) and config.SAVE_LOGS: 159 | os.makedirs(self._log_folder) 160 | if not os.path.exists(self._results_folder): 161 | os.makedirs(self._results_folder) 162 | 163 | def getFailFileName(self): 164 | fail_filepath = config.OUTPUT_FOLDER + config.FAIL_FILENAME 165 | if os.path.isfile(fail_filepath): 166 | fail_filepath += "." 167 | i = 1; 168 | while os.path.isfile(fail_filepath + str(i)): 169 | i += 1 170 | return fail_filepath + str(i) 171 | return fail_filepath 172 | 173 | def getNumFails(self): 174 | return self._num_faild 175 | 176 | def getNumSaved(self): 177 | return self._num_saved 178 | 179 | def getNumGood(self): 180 | return self._num_good 181 | 182 | def run(self): 183 | while True: 184 | r = self._queue.get() 185 | try: 186 | if config.SAVE_LOGS: 187 | self.saveLog(r) 188 | if r.current_attempt.success: 189 | self.saveData(r) 190 | else: 191 | self.saveFail(r) 192 | finally: 193 | self._num_saved += 1 194 | self._queue.task_done() 195 | 196 | 197 | def saveLog(self, record): 198 | try: 199 | f = open(self._log_folder+record.domain+"."+config.LOG_EXT,'w') 200 | f.write('\n'.join(record.getLogData()) + '\n') 201 | f.close() 202 | return True 203 | except IOError: 204 | print "Unabe to write "+record.domain+".log log to file" 205 | return False 206 | 207 | 208 | def saveFail(self, record): 209 | try: 210 | fail_file = open(self._fail_filepath, 'a+') 211 | fail_file.write(record.domain+'\n') 212 | fail_file.close() 213 | self._num_faild += 1 214 | return True 215 | except IOError: 216 | print "Unabe to write to fail file" 217 | return False 218 | 219 | def saveData(self, record): 220 | if config.SAVE_TAR: 221 | return self.saveDataTar(record) 222 | else: 223 | return self.saveDataFile(record) 224 | 225 | def startTar(self): 226 | self._num_tared = 0 227 | tarname = self.nextTarName() 228 | self._tar_file = tarfile.open(tarname, "w:gz") 229 | 230 | 231 | def nextTarName(self): 232 | tstamp = time.strftime("%Y.%m.%d-%H.%M") 233 | tar_filepath = self._results_folder+tstamp 234 | if os.path.isfile(tar_filepath+".tgz"): 235 | tar_filepath += "." 236 | i = 1; 237 | while os.path.isfile(tar_filepath+str(i)+".tgz"): 238 | i += 1 239 | return tar_filepath + str(i)+".tgz" 240 | return tar_filepath+".tgz" 241 | 242 | 243 | 244 | def closeTar(self): 245 | """ensure this runs on program exit""" 246 | if self._tar_file: 247 | self._tar_file.close() 248 | self._tar_file = None 249 | 250 | def saveDataTar(self, record): 251 | if not self._tar_file: 252 | self.startTar() 253 | 254 | # append record to tar 255 | if config.SPLIT_THICK: 256 | data = record.getThinData() 257 | if data: 258 | who_file = tarfile.TarInfo("thin/"+record.domain+"."+config.SAVE_EXT) 259 | who_file.size = len(data) 260 | who_file.mtime = time.time() 261 | self._tar_file.addfile(who_file, StringIO.StringIO(data)) 262 | else: 263 | # warning no thin data found 264 | if config.DEBUG: 265 | print "Warning: no thin data for "+record.domain 266 | 267 | data = record.getThickData() 268 | if data: 269 | who_file = tarfile.TarInfo("thick/"+record.domain+"."+config.SAVE_EXT) 270 | who_file.size = len(data) 271 | who_file.mtime = time.time() 272 | self._num_tared += 1 273 | self._tar_file.addfile(who_file, StringIO.StringIO(data)) 274 | self._num_good += 1 275 | else: 276 | # warning no thick data found 277 | if config.DEBUG: 278 | print "Warning: no thick data for "+record.domain 279 | else: 280 | who_file = tarfile.TarInfo(record.domain+"."+config.SAVE_EXT) 281 | data = record.getAllData() 282 | who_file.size = len(data) 283 | who_file.mtime = time.time() 284 | self._num_tared += 1 285 | self._tar_file.addfile(who_file, StringIO.StringIO(data)) 286 | self._num_good += 1 287 | 288 | if self._num_tared == config.SAVE_TAR_SIZE: 289 | self.closeTar() 290 | return True 291 | 292 | 293 | def saveDataFile(self, record): 294 | try: 295 | if config.SPLIT_THICK: 296 | f1 = open(self._results_folder+record.domain+".thick."+config.SAVE_EXT, 'w') 297 | f1.write(record.getThickData()) 298 | f1.close() 299 | f2 = open(self._results_folder+record.domain+".thin."+config.SAVE_EXT, 'w') 300 | thinData = record.getThinData() 301 | if thinData != None: 302 | f2.write(thinData) 303 | f2.close() 304 | else: 305 | f = open(self._results_folder+record.domain+"."+config.SAVE_EXT, 'w') 306 | f.write(record.getAllData()) 307 | f.close() 308 | self._num_good += 1 309 | return True 310 | except IOError: 311 | print "Unabe to write "+record.domain+" data to file" 312 | return False 313 | 314 | -------------------------------------------------------------------------------- /proxywhois.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Taken from pywhois project: https://code.google.com/p/pywhois/ 4 | with some modifications for proxy support 5 | """ 6 | 7 | import socks 8 | import config 9 | 10 | debug = False 11 | 12 | class WhoisNoServerException(Exception): 13 | def __init__(self, server): 14 | self.server = server 15 | def __str__(self): 16 | return "Invalid Server: "+repr(self.server) 17 | 18 | class ServerTroubleException(Exception): 19 | def __init__(self, server, error): 20 | self.server = server 21 | self.error = error 22 | def __str__(self): 23 | return repr(self.server) + repr(self.error) 24 | 25 | 26 | 27 | def enforce_ascii(a): 28 | if isinstance(a, str) or isinstance(a, unicode): 29 | # return a.encode('ascii', 'replace') 30 | r = "" 31 | for i in a: 32 | if ord(i) >= 128: 33 | r += "?" 34 | else: 35 | r += i 36 | return r 37 | else: 38 | return a 39 | 40 | 41 | class NICClient(object) : 42 | 43 | ABUSEHOST = "whois.abuse.net" 44 | NICHOST = "whois.crsnic.net" 45 | INICHOST = "whois.networksolutions.com" 46 | DNICHOST = "whois.nic.mil" 47 | GNICHOST = "whois.nic.gov" 48 | ANICHOST = "whois.arin.net" 49 | LNICHOST = "whois.lacnic.net" 50 | RNICHOST = "whois.ripe.net" 51 | PNICHOST = "whois.apnic.net" 52 | MNICHOST = "whois.ra.net" 53 | QNICHOST_TAIL = ".whois-servers.net" 54 | SNICHOST = "whois.6bone.net" 55 | BNICHOST = "whois.registro.br" 56 | NORIDHOST = "whois.norid.no" 57 | IANAHOST = "whois.iana.org" 58 | DENICHOST = "de.whois-servers.net" 59 | DEFAULT_PORT = "nicname" 60 | WHOIS_SERVER_ID = "Whois Server:" 61 | WHOIS_ORG_SERVER_ID = "Registrant Street1:Whois Server:" 62 | 63 | 64 | WHOIS_RECURSE = 0x01 65 | WHOIS_QUICK = 0x02 66 | 67 | ip_whois = [ LNICHOST, RNICHOST, PNICHOST, BNICHOST ] 68 | 69 | def __init__(self) : 70 | self.use_qnichost = False 71 | self.use_proxy = False 72 | self.proxy_type = None 73 | self.proxy_server = None 74 | self.proxy_port = None 75 | 76 | def set_proxy(self, proxy_type, server,port): 77 | """Enables the use of the specified proxy for lookups""" 78 | self.use_proxy = True 79 | self.proxy_type = proxy_type 80 | self.proxy_server = server 81 | self.proxy_port = port 82 | 83 | def findwhois_server(self, buf, hostname): 84 | """Search the initial TLD lookup results for the regional-specifc 85 | whois server for getting contact details. 86 | """ 87 | #print 'finding whois server' 88 | #print 'parameters:', buf, 'hostname', hostname 89 | nhost = None 90 | parts_index = 1 91 | start = buf.rfind(NICClient.WHOIS_SERVER_ID) 92 | #print 'start', start 93 | if (start == -1): 94 | start = buf.rfind(NICClient.WHOIS_ORG_SERVER_ID) 95 | parts_index = 2 96 | 97 | if (start > -1): 98 | end = buf[start:].find('\n') 99 | #print 'end:', end 100 | whois_line = buf[start:end+start] 101 | #print 'whois_line', whois_line 102 | nhost = whois_line.split(NICClient.WHOIS_SERVER_ID+' ').pop() 103 | nhost = nhost.split('http://').pop() 104 | #if the whois address is domain.tld/something then 105 | #s.connect((hostname, 43)) does not work 106 | if nhost.count('/') > 0: 107 | nhost = None 108 | #print 'nhost:',nhost 109 | elif (hostname == NICClient.ANICHOST): 110 | for nichost in NICClient.ip_whois: 111 | if (buf.find(nichost) != -1): 112 | nhost = nichost 113 | break 114 | return nhost 115 | 116 | def TLDSpecificQuery(self,tld,query,server): 117 | tld = tld.lower() 118 | if tld in ['com','net'] and server.endswith(NICClient.QNICHOST_TAIL): 119 | query = "="+query 120 | return query 121 | 122 | def whois(self, query, hostname, flags): 123 | """Perform initial lookup with TLD whois server 124 | then, if the quick flag is false, search that result 125 | for the region-specifc whois server and do a lookup 126 | there for contact details 127 | """ 128 | if debug: 129 | print 'parameters given:', query, hostname, flags 130 | #pdb.set_trace() 131 | s = socks.socksocket(socks.socket.AF_INET, socks.socket.SOCK_STREAM) 132 | s.settimeout(config.WHOIS_TIMEOUT_SECONDS) 133 | 134 | #added code for proxy 135 | if (self.use_proxy): 136 | s.setproxy(self.proxy_type,self.proxy_server,self.proxy_port) 137 | 138 | #convert hostname to ascii 139 | hostname = hostname.encode('ascii','ignore') 140 | 141 | if debug: 142 | print "==DEBUG: Attempting to connect to: "+hostname 143 | 144 | try: 145 | s.connect((hostname, 43)) 146 | """send takes bytes as an input 147 | """ 148 | queryBytes = None 149 | 150 | tld = self.getTLD(query) 151 | if tld: 152 | query = self.TLDSpecificQuery(tld,query,hostname) 153 | 154 | if (hostname == NICClient.DENICHOST): 155 | #print 'the domain is in NIC DENIC' 156 | queryBytes = ("-T dn,ace -C UTF-8 " + query + "\r\n").encode() 157 | #print 'queryBytes:', queryBytes 158 | else: 159 | queryBytes = (query + "\r\n").encode() 160 | s.send(queryBytes) 161 | """recv returns bytes 162 | """ 163 | #print s 164 | response = b'' 165 | while True: 166 | d = s.recv(4096) 167 | response += d 168 | if not d: 169 | break 170 | #pdb.set_trace() 171 | nhost = None 172 | if debug: 173 | print '===========response==============' 174 | print response 175 | print "=================================" 176 | response = enforce_ascii(response) 177 | if (flags & NICClient.WHOIS_RECURSE and nhost == None): 178 | nhost = self.findwhois_server(response.decode(), hostname) 179 | if (nhost != None): 180 | r = self.whois(query, nhost, 0) 181 | if not r: 182 | return 183 | response += r 184 | #print 'returning whois response' 185 | return response.decode() 186 | except socks.socket.gaierror as e: 187 | #bad hostname 188 | if debug: 189 | print "This TLD has no whois server." 190 | return 191 | else: 192 | raise WhoisNoServerException(hostname) 193 | except (socks.socket.error, socks.socket.timeout) as e: 194 | error = ServerTroubleException(hostname,repr(e)) 195 | if debug: 196 | print error 197 | return 198 | raise error 199 | 200 | finally: 201 | s.close() 202 | 203 | def getTLD(self, domain): 204 | if (domain.endswith("-NORID")): 205 | return NICClient.NORIDHOST 206 | pos = domain.rfind('.') 207 | if (pos == -1): 208 | return None 209 | tld = domain[pos+1:] 210 | if (tld[0].isdigit()): 211 | return None 212 | return tld 213 | 214 | def choose_server(self, domain): 215 | """Choose initial lookup NIC host""" 216 | tld = self.getTLD(domain) 217 | if tld: 218 | return tld + NICClient.QNICHOST_TAIL 219 | return NICClient.ANICHOST 220 | 221 | def whois_lookup(self, options, query_arg, flags): 222 | """Main entry point: Perform initial lookup on TLD whois server, 223 | or other server to get region-specific whois server, then if quick 224 | flag is false, perform a second lookup on the region-specific 225 | server for contact records""" 226 | #print 'whois_lookup' 227 | nichost = None 228 | #pdb.set_trace() 229 | # this would be the case when this function is called by other than main 230 | if (options == None): 231 | options = {} 232 | 233 | if ( (not 'whoishost' in options or options['whoishost'] == None) 234 | and (not 'country' in options or options['country'] == None)): 235 | self.use_qnichost = True 236 | options['whoishost'] = NICClient.NICHOST 237 | if ( not (flags & NICClient.WHOIS_QUICK)): 238 | flags |= NICClient.WHOIS_RECURSE 239 | 240 | if ('country' in options and options['country'] != None): 241 | result = self.whois(query_arg, options['country'] + NICClient.QNICHOST_TAIL, flags) 242 | elif (self.use_qnichost): 243 | nichost = self.choose_server(query_arg) 244 | if (nichost != None): 245 | result = self.whois(query_arg, nichost, flags) 246 | else: 247 | result = self.whois(query_arg, options['whoishost'], flags) 248 | #print 'whois_lookup finished' 249 | return result 250 | #---- END OF NICClient class def --------------------- 251 | 252 | if __name__ == "__main__": 253 | import sys #for args 254 | import traceback 255 | debug = True 256 | flags = 0 257 | nic_client = NICClient() 258 | #(options, args) = parse_command_line(sys.argv) 259 | #if (options.b_quicklookup is True): 260 | # flags = flags|NICClient.WHOIS_QUICK 261 | #print(nic_client.whois_lookup(options.__dict__, args[1], flags)) 262 | data = nic_client.whois_lookup(None, sys.argv[1], flags) 263 | if data: 264 | print data 265 | -------------------------------------------------------------------------------- /socks.py: -------------------------------------------------------------------------------- 1 | """SocksiPy - Python SOCKS module. 2 | Version 1.00 3 | 4 | Copyright 2006 Dan-Haim. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 3. Neither the name of Dan Haim nor the names of his contributors may be used 14 | to endorse or promote products derived from this software without specific 15 | prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED 18 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 20 | EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA 23 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 25 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMANGE. 26 | 27 | 28 | This module provides a standard socket-like interface for Python 29 | for tunneling connections through SOCKS proxies. 30 | 31 | """ 32 | 33 | import socket 34 | import struct 35 | 36 | PROXY_TYPE_SOCKS4 = 1 37 | PROXY_TYPE_SOCKS5 = 2 38 | PROXY_TYPE_HTTP = 3 39 | 40 | _defaultproxy = None 41 | _orgsocket = socket.socket 42 | 43 | class ProxyError(Exception): 44 | def __init__(self, value): 45 | self.value = value 46 | def __str__(self): 47 | return repr(self.value) 48 | 49 | class GeneralProxyError(ProxyError): 50 | def __init__(self, value): 51 | self.value = value 52 | def __str__(self): 53 | return repr(self.value) 54 | 55 | class Socks5AuthError(ProxyError): 56 | def __init__(self, value): 57 | self.value = value 58 | def __str__(self): 59 | return repr(self.value) 60 | 61 | class Socks5Error(ProxyError): 62 | def __init__(self, value): 63 | self.value = value 64 | def __str__(self): 65 | return repr(self.value) 66 | 67 | class Socks4Error(ProxyError): 68 | def __init__(self, value): 69 | self.value = value 70 | def __str__(self): 71 | return repr(self.value) 72 | 73 | class HTTPError(ProxyError): 74 | def __init__(self, value): 75 | self.value = value 76 | def __str__(self): 77 | return repr(self.value) 78 | 79 | _generalerrors = ("success", 80 | "invalid data", 81 | "not connected", 82 | "not available", 83 | "bad proxy type", 84 | "bad input", 85 | "unable to connect to proxy") 86 | 87 | _socks5errors = ("succeeded", 88 | "general SOCKS server failure", 89 | "connection not allowed by ruleset", 90 | "Network unreachable", 91 | "Host unreachable", 92 | "Connection refused", 93 | "TTL expired", 94 | "Command not supported", 95 | "Address type not supported", 96 | "Unknown error") 97 | 98 | _socks5autherrors = ("succeeded", 99 | "authentication is required", 100 | "all offered authentication methods were rejected", 101 | "unknown username or invalid password", 102 | "unknown error") 103 | 104 | _socks4errors = ("request granted", 105 | "request rejected or failed", 106 | "request rejected because SOCKS server cannot connect to identd on the client", 107 | "request rejected because the client program and identd report different user-ids", 108 | "unknown error") 109 | 110 | def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 111 | """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 112 | Sets a default proxy which all further socksocket objects will use, 113 | unless explicitly changed. 114 | """ 115 | global _defaultproxy 116 | _defaultproxy = (proxytype,addr,port,rdns,username,password) 117 | 118 | class socksocket(socket.socket): 119 | """socksocket([family[, type[, proto]]]) -> socket object 120 | 121 | Open a SOCKS enabled socket. The parameters are the same as 122 | those of the standard socket init. In order for SOCKS to work, 123 | you must specify family=AF_INET, type=SOCK_STREAM and proto=0. 124 | """ 125 | 126 | def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None): 127 | self.allow_direct = True 128 | _orgsocket.__init__(self,family,type,proto,_sock) 129 | if _defaultproxy != None: 130 | self.__proxy = _defaultproxy 131 | else: 132 | self.__proxy = (None, None, None, None, None, None) 133 | self.__proxysockname = None 134 | self.__proxypeername = None 135 | 136 | def __recvall(self, bytes): 137 | """__recvall(bytes) -> data 138 | Receive EXACTLY the number of bytes requested from the socket. 139 | Blocks until the required number of bytes have been received. 140 | """ 141 | data = "" 142 | while len(data) < bytes: 143 | data = data + self.recv(bytes-len(data)) 144 | return data 145 | 146 | def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 147 | """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 148 | Sets the proxy to be used. 149 | proxytype - The type of the proxy to be used. Three types 150 | are supported: PROXY_TYPE_SOCKS4 (including socks4a), 151 | PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP 152 | addr - The address of the server (IP or DNS). 153 | port - The port of the server. Defaults to 1080 for SOCKS 154 | servers and 8080 for HTTP proxy servers. 155 | rdns - Should DNS queries be preformed on the remote side 156 | (rather than the local side). The default is True. 157 | Note: This has no effect with SOCKS4 servers. 158 | username - Username to authenticate with to the server. 159 | The default is no authentication. 160 | password - Password to authenticate with to the server. 161 | Only relevant when username is also provided. 162 | """ 163 | self.__proxy = (proxytype,addr,port,rdns,username,password) 164 | self.allow_direct = False #disable local fallback 165 | 166 | def __negotiatesocks5(self,destaddr,destport): 167 | """__negotiatesocks5(self,destaddr,destport) 168 | Negotiates a connection through a SOCKS5 server. 169 | """ 170 | # First we'll send the authentication packages we support. 171 | if (self.__proxy[4]!=None) and (self.__proxy[5]!=None): 172 | # The username/password details were supplied to the 173 | # setproxy method so we support the USERNAME/PASSWORD 174 | # authentication (in addition to the standard none). 175 | self.sendall("\x05\x02\x00\x02") 176 | else: 177 | # No username/password were entered, therefore we 178 | # only support connections with no authentication. 179 | self.sendall("\x05\x01\x00") 180 | # We'll receive the server's response to determine which 181 | # method was selected 182 | chosenauth = self.__recvall(2) 183 | if chosenauth[0] != "\x05": 184 | self.close() 185 | raise GeneralProxyError((1,_generalerrors[1])) 186 | # Check the chosen authentication method 187 | if chosenauth[1] == "\x00": 188 | # No authentication is required 189 | pass 190 | elif chosenauth[1] == "\x02": 191 | # Okay, we need to perform a basic username/password 192 | # authentication. 193 | self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5]) 194 | authstat = self.__recvall(2) 195 | if authstat[0] != "\x01": 196 | # Bad response 197 | self.close() 198 | raise GeneralProxyError((1,_generalerrors[1])) 199 | if authstat[1] != "\x00": 200 | # Authentication failed 201 | self.close() 202 | raise Socks5AuthError,((3,_socks5autherrors[3])) 203 | # Authentication succeeded 204 | else: 205 | # Reaching here is always bad 206 | self.close() 207 | if chosenauth[1] == "\xFF": 208 | raise Socks5AuthError((2,_socks5autherrors[2])) 209 | else: 210 | raise GeneralProxyError((1,_generalerrors[1])) 211 | # Now we can request the actual connection 212 | req = "\x05\x01\x00" 213 | # If the given destination address is an IP address, we'll 214 | # use the IPv4 address request even if remote resolving was specified. 215 | try: 216 | ipaddr = socket.inet_aton(destaddr) 217 | req = req + "\x01" + ipaddr 218 | except socket.error: 219 | # Well it's not an IP number, so it's probably a DNS name. 220 | if self.__proxy[3]==True: 221 | # Resolve remotely 222 | ipaddr = None 223 | req = req + "\x03" + chr(len(destaddr)) + destaddr 224 | else: 225 | # Resolve locally 226 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 227 | req = req + "\x01" + ipaddr 228 | req = req + struct.pack(">H",destport) 229 | self.sendall(req) 230 | # Get the response 231 | resp = self.__recvall(4) 232 | if resp[0] != "\x05": 233 | self.close() 234 | raise GeneralProxyError((1,_generalerrors[1])) 235 | elif resp[1] != "\x00": 236 | # Connection failed 237 | self.close() 238 | if ord(resp[1])<=8: 239 | raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])]) 240 | else: 241 | raise Socks5Error(9,_generalerrors[9]) 242 | # Get the bound address/port 243 | elif resp[3] == "\x01": 244 | boundaddr = self.__recvall(4) 245 | elif resp[3] == "\x03": 246 | resp = resp + self.recv(1) 247 | boundaddr = self.__recvall(resp[4]) 248 | else: 249 | self.close() 250 | raise GeneralProxyError((1,_generalerrors[1])) 251 | boundport = struct.unpack(">H",self.__recvall(2))[0] 252 | self.__proxysockname = (boundaddr,boundport) 253 | if ipaddr != None: 254 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 255 | else: 256 | self.__proxypeername = (destaddr,destport) 257 | 258 | def getproxysockname(self): 259 | """getsockname() -> address info 260 | Returns the bound IP address and port number at the proxy. 261 | """ 262 | return self.__proxysockname 263 | 264 | def getproxypeername(self): 265 | """getproxypeername() -> address info 266 | Returns the IP and port number of the proxy. 267 | """ 268 | return _orgsocket.getpeername(self) 269 | 270 | def getpeername(self): 271 | """getpeername() -> address info 272 | Returns the IP address and port number of the destination 273 | machine (note: getproxypeername returns the proxy) 274 | """ 275 | return self.__proxypeername 276 | 277 | def __negotiatesocks4(self,destaddr,destport): 278 | """__negotiatesocks4(self,destaddr,destport) 279 | Negotiates a connection through a SOCKS4 server. 280 | """ 281 | # Check if the destination address provided is an IP address 282 | rmtrslv = False 283 | try: 284 | ipaddr = socket.inet_aton(destaddr) 285 | except socket.error: 286 | # It's a DNS name. Check where it should be resolved. 287 | if self.__proxy[3]==True: 288 | ipaddr = "\x00\x00\x00\x01" 289 | rmtrslv = True 290 | else: 291 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 292 | # Construct the request packet 293 | req = "\x04\x01" + struct.pack(">H",destport) + ipaddr 294 | # The username parameter is considered userid for SOCKS4 295 | if self.__proxy[4] != None: 296 | req = req + self.__proxy[4] 297 | req = req + "\x00" 298 | # DNS name if remote resolving is required 299 | # NOTE: This is actually an extension to the SOCKS4 protocol 300 | # called SOCKS4A and may not be supported in all cases. 301 | if rmtrslv==True: 302 | req = req + destaddr + "\x00" 303 | self.sendall(req) 304 | # Get the response from the server 305 | resp = self.__recvall(8) 306 | if resp[0] != "\x00": 307 | # Bad data 308 | self.close() 309 | raise GeneralProxyError((1,_generalerrors[1])) 310 | if resp[1] != "\x5A": 311 | # Server returned an error 312 | self.close() 313 | if ord(resp[1]) in (91,92,93): 314 | self.close() 315 | raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90])) 316 | else: 317 | raise Socks4Error((94,_socks4errors[4])) 318 | # Get the bound address/port 319 | self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0]) 320 | if rmtrslv != None: 321 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 322 | else: 323 | self.__proxypeername = (destaddr,destport) 324 | 325 | def __negotiatehttp(self,destaddr,destport): 326 | """__negotiatehttp(self,destaddr,destport) 327 | Negotiates a connection through an HTTP server. 328 | """ 329 | # If we need to resolve locally, we do this now 330 | if self.__proxy[3] == False: 331 | addr = socket.gethostbyname(destaddr) 332 | else: 333 | addr = destaddr 334 | self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n") 335 | # We read the response until we get the string "\r\n\r\n" 336 | resp = self.recv(1) 337 | while resp.find("\r\n\r\n")==-1: 338 | resp = resp + self.recv(1) 339 | # We just need the first line to check if the connection 340 | # was successful 341 | statusline = resp.splitlines()[0].split(" ",2) 342 | if statusline[0] not in ("HTTP/1.0","HTTP/1.1"): 343 | self.close() 344 | raise GeneralProxyError((1,_generalerrors[1])) 345 | try: 346 | statuscode = int(statusline[1]) 347 | except ValueError: 348 | self.close() 349 | raise GeneralProxyError((1,_generalerrors[1])) 350 | if statuscode != 200: 351 | self.close() 352 | raise HTTPError((statuscode,statusline[2])) 353 | self.__proxysockname = ("0.0.0.0",0) 354 | self.__proxypeername = (addr,destport) 355 | 356 | def connect(self,destpair): 357 | """connect(self,despair) 358 | Connects to the specified destination through a proxy. 359 | destpar - A tuple of the IP/DNS address and the port number. 360 | (identical to socket's connect). 361 | To select the proxy server use setproxy(). 362 | """ 363 | 364 | # Do a minimal input check first 365 | if (type(destpair) in (list,tuple)==False) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int): 366 | raise GeneralProxyError((5,_generalerrors[5])) 367 | 368 | if self.__proxy[0] == PROXY_TYPE_SOCKS5: 369 | if self.__proxy[2] != None: 370 | portnum = self.__proxy[2] 371 | else: 372 | portnum = 1080 373 | try: 374 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 375 | except socket.error: 376 | raise GeneralProxyError((6,_generalerrors[6])) 377 | self.__negotiatesocks5(destpair[0],destpair[1]) 378 | elif self.__proxy[0] == PROXY_TYPE_SOCKS4: 379 | if self.__proxy[2] != None: 380 | portnum = self.__proxy[2] 381 | else: 382 | portnum = 1080 383 | try: 384 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 385 | except socket.error: 386 | raise GeneralProxyError((6,_generalerrors[6])) 387 | self.__negotiatesocks4(destpair[0],destpair[1]) 388 | elif self.__proxy[0] == PROXY_TYPE_HTTP: 389 | if self.__proxy[2] != None: 390 | portnum = self.__proxy[2] 391 | else: 392 | portnum = 8080 393 | try: 394 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 395 | except socket.error: 396 | raise GeneralProxyError((6,_generalerrors[6])) 397 | self.__negotiatehttp(destpair[0],destpair[1]) 398 | elif self.__proxy[0] == None and self.allow_direct: 399 | #WARNING: NO PROXY IN USE! 400 | print "WARNING: NOT USING PROXY" 401 | _orgsocket.connect(self,(destpair[0],destpair[1])) 402 | #elif not self.allow_direct: 403 | # raise GeneralProxyError((6,_generalerrors[6])) 404 | else: 405 | #print "p struct: " +str(self.__proxy) 406 | raise GeneralProxyError((4,_generalerrors[4])) 407 | -------------------------------------------------------------------------------- /whoisThread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import threading 3 | import proxywhois 4 | import socks 5 | import sys 6 | import time 7 | import traceback 8 | import re 9 | import urlparse 10 | import config 11 | import string 12 | import random 13 | import datetime 14 | 15 | #NULL whois result Exception 16 | class NullWhoisException(Exception): 17 | count = 0 18 | def __init__(self, value, whitespace=False): 19 | NullWhoisException.count += 1 20 | self.value = value 21 | self.whitespace = whitespace 22 | def __str__(self): 23 | return "Null Whois: "+repr(self.value) 24 | 25 | # this has been deprecated in favor of the exceptions inside proxywhois 26 | class WhoisTimeoutException(Exception): 27 | count = 0 28 | def __init__(self, value): 29 | WhoisTimeoutException.count += 1 30 | self.value = value 31 | def __str__(self): 32 | return "Whois Timeout on: "+repr(self.value) 33 | 34 | class WhoisLinesException(Exception): 35 | count = 0 36 | def __init__(self, value,data): 37 | WhoisLinesException.count += 1 38 | self.value = value 39 | self.data = data 40 | def __str__(self): 41 | return "Response Too Small: "+repr(self.value)+"\n"+repr(self.data) 42 | 43 | class WhoisRatelimitException(Exception): 44 | count = 0 45 | def __init__(self, server, hard_limit=True, forceInc=False): 46 | if hard_limit: 47 | WhoisRatelimitException.count += 1 48 | self.server = server 49 | self.hard = hard_limit 50 | self.forceInc = forceInc 51 | def strict(self): 52 | if self.server in config.STRICT_SERVERS: 53 | return True 54 | return False 55 | def __str__(self): 56 | return "Whois Ratelimit Reached on: "+repr(self.server)+" Hard Limit: "+str(self.hard) 57 | 58 | class WhoisBadDomainException(Exception): 59 | count = 0 60 | def __init__(self, domain): 61 | WhoisBadDomainException.count += 1 62 | self.domain = domain 63 | def __str__(self): 64 | return "Invalid Domain: "+repr(self.domain) 65 | 66 | #TODO currently unused 67 | class WhoisHTTPReferralException(Exception): 68 | '''whois result refers us to a web address, possibly due to rate limiting''' 69 | count = 0 70 | def __init__(self, domain, server, url): 71 | WhoisHTTPReferralException.count += 1 72 | self.domain = domain 73 | self.server = server 74 | self.url = url 75 | def __str__(self): 76 | return "HTTP Refferal domain: "+repr(self.domain)+" server: "+repr(self.server)+" url: "+repr(self.url) 77 | 78 | 79 | def printExceptionCounts(): 80 | print "WhoisRatelimitException:\t" + str(WhoisRatelimitException.count) 81 | print "NullWhoisException:\t" + str(NullWhoisException.count) 82 | print "WhoisTimeoutException:\t" + str(WhoisTimeoutException.count) 83 | print "WhoisBadDomainException:\t" + str(WhoisBadDomainException.count) 84 | print "WhoisHTTPReferralException:\t" + str(WhoisHTTPReferralException.count) 85 | print "WhoisLinesException:\t" + str(WhoisLinesException.count) 86 | 87 | 88 | #static vars 89 | numActiveThreads_lock = threading.Lock() 90 | numActiveThreads = 0 91 | numProxyThreads_lock = threading.Lock() 92 | numProxyThreads = 0 93 | proxy_ip_list_lock = threading.Lock() 94 | proxy_ip_list = list() 95 | numLookups_lock = threading.Lock() 96 | numLookups = 0 97 | 98 | 99 | def removeRemoteProxyIP(ip): 100 | global proxy_ip_list_lock 101 | global proxy_ip_list 102 | proxy_ip_list_lock.acquire() 103 | try: 104 | if ip in proxy_ip_list: 105 | proxy_ip_list.remove(ip) 106 | else: 107 | print "Cant remove IP from list it is not in "+ str(ip) 108 | finally: 109 | proxy_ip_list_lock.release() 110 | 111 | 112 | def addRemoteProxyIP(ip): 113 | global proxy_ip_list_lock 114 | global proxy_ip_list 115 | proxy_ip_list_lock.acquire() 116 | ret = None 117 | try: 118 | if not ip in proxy_ip_list: 119 | proxy_ip_list.append(ip) 120 | ret = True 121 | else: 122 | ret = False 123 | finally: 124 | proxy_ip_list_lock.release() 125 | return ret 126 | 127 | def incrementLookupCount(): 128 | global numLookups_lock 129 | global numLookups 130 | numLookups_lock.acquire() 131 | try: 132 | numLookups += 1 133 | finally: 134 | numLookups_lock.release() 135 | 136 | def getLookupCount(): 137 | global numActiveThreads_loc 138 | global numActiveThreads 139 | ret = -1 140 | numLookups_lock.acquire() 141 | try: 142 | ret = numLookups 143 | finally: 144 | numLookups_lock.release() 145 | return ret 146 | 147 | ''' 148 | Active threads are threaads that are not sleeing and activly querying a reccord 149 | ''' 150 | def incrementActiveThreadCount(): 151 | global numActiveThreads_lock 152 | global numActiveThreads 153 | numActiveThreads_lock.acquire() 154 | try: 155 | numActiveThreads += 1 156 | finally: 157 | numActiveThreads_lock.release() 158 | 159 | def decrementActiveThreadCount(): 160 | global numActiveThreads_lock 161 | global numActiveThreads 162 | numActiveThreads_lock.acquire() 163 | try: 164 | numActiveThreads -= 1 165 | finally: 166 | numActiveThreads_lock.release() 167 | 168 | def getActiveThreadCount(): 169 | global numActiveThreads_lock 170 | global numActiveThreads 171 | ret = -1 172 | numActiveThreads_lock.acquire() 173 | try: 174 | ret = numActiveThreads 175 | finally: 176 | numActiveThreads_lock.release() 177 | return ret 178 | 179 | ''' 180 | Proxy threads are threads with working proxies 181 | ''' 182 | def incrementProxyThreadCount(): 183 | global numProxyThreads_lock 184 | global numProxyThreads 185 | numProxyThreads_lock.acquire() 186 | try: 187 | numProxyThreads += 1 188 | finally: 189 | numProxyThreads_lock.release() 190 | 191 | def decrementProxyThreadCount(): 192 | global numProxyThreads_lock 193 | global numProxyThreads 194 | numProxyThreads_lock.acquire() 195 | try: 196 | numProxyThreads -= 1 197 | finally: 198 | numProxyThreads_lock.release() 199 | 200 | def getProxyThreadCount(): 201 | global numProxyThreads_lock 202 | global numProxyThreads 203 | ret = -1 204 | numProxyThreads_lock.acquire() 205 | try: 206 | ret = numProxyThreads 207 | finally: 208 | numProxyThreads_lock.release() 209 | return ret 210 | 211 | 212 | 213 | #this object is used to store the results of a whois result as it is passed around 214 | class WhoisResult: 215 | def __init__(self, domain): 216 | self.domain = domain 217 | self.attempts = list() 218 | self.current_attempt = None 219 | self.maxAttempts = False 220 | self.next_whois_server = None 221 | self.fails = 0 222 | 223 | def getNextServer(self): 224 | return self.next_whois_server 225 | 226 | def setNextServer(self,server): 227 | self.next_whois_server = server 228 | 229 | def valid(self): 230 | '''performs quick checking to verify that the data we got may contain some valid data''' 231 | #search for email 232 | match = re.search(config.EMAIL_REGEX, self.getThickData()) 233 | if match: 234 | return True 235 | return False 236 | 237 | def addAttempt(self, attempt): 238 | self.attempts.append(attempt) 239 | self.current_attempt = self.attempts[-1] 240 | return self.current_attempt 241 | 242 | def addError(self, error, fail=True): 243 | if fail: 244 | self.fails += 1 245 | if self.current_attempt: 246 | self.current_attempt.addError(error) 247 | else: 248 | print "ERROR: Adding error to result without attempt" 249 | 250 | def getLogData(self): 251 | log = list() 252 | log.append("DOMAIN: "+self.domain) 253 | log.append("Fails: "+str(self.fails)) 254 | log.append("Max Attempts: "+ str(self.maxAttempts)) 255 | log.append("Last Whois Server: "+ str(self.next_whois_server)) 256 | for (num, attempt) in enumerate(self.attempts): 257 | if not (attempt.success == False and len(attempt.responses) == 0 and len(attempt.errors) == 0): 258 | log.append("-----------Attempt:"+str(num)+"------------") 259 | #dont log when one proxy hands off to another without doing any work 260 | log += attempt.getLogData() 261 | return log 262 | 263 | def getAllData(self,all_data=True): 264 | """Returnes the string response of the last response on the last attempt""" 265 | """ there is a bug here when a failure ofccored on a thick server the thin data is not saved""" 266 | """ deprecating this function in favor of getthick and getthin """ 267 | if all_data: 268 | return self.attempts[-1].getResponse() 269 | else: 270 | return self.attempts[-1].getLastResponse() 271 | 272 | def getThickData(self): 273 | for attempt in self.attempts[::-1]: 274 | r = attempt.getThickResponse() 275 | if r: 276 | return r.getResponse() 277 | return None 278 | 279 | 280 | def getThinData(self): 281 | for attempt in self.attempts[::-1]: 282 | r = attempt.getThinResponse() 283 | if r: 284 | return r.getResponse() 285 | return None 286 | 287 | 288 | def numFails(self): 289 | return self.fails 290 | 291 | def getLastAttempt(self): 292 | if len(self.attempts) > 0: 293 | return self.attempts[-1] 294 | else: 295 | return None 296 | 297 | 298 | #class to hold details on an attempt to whois a particular domain 299 | class WhoisAttempt: 300 | def __init__(self, proxy): 301 | #timestamp (float) 302 | self.timestamp = time.time() 303 | self.success = False 304 | self.proxy = proxy 305 | self.errors = list() 306 | self.responses = list() #contains a list of WhoisResponse classes in the order they were queried 307 | 308 | def addError(self,error): 309 | self.errors.append(error) 310 | 311 | def getLogData(self): 312 | log = list() 313 | log.append("Timestamp: "+ str(self.timestamp)) 314 | log.append("Proxy: "+ self.proxy.getLog()) 315 | log.append("Success: "+ str(self.success)) 316 | log.append("Responses: "+str(len(self.responses))) 317 | for response in self.responses: 318 | log += response.getLogData() 319 | numErrors = len(self.errors) 320 | log.append("Errors: "+ str(numErrors)) 321 | for error in self.errors: 322 | log.append("--Error: "+str(error)) 323 | return log 324 | 325 | def getLastResponse(self): 326 | if len(self.responses) > 0: 327 | return self.responses[-1] 328 | else: 329 | return None 330 | 331 | def getResponse(self): 332 | if len(self.responses) < 1: 333 | return None 334 | else: 335 | ret = "" 336 | for response in self.responses: 337 | ret += response.getResponse() 338 | ret += "\n" 339 | return ret 340 | 341 | def getThickResponse(self): 342 | for r in self.responses[::-1]: 343 | if r.resultType == ResultType.Thick: 344 | return r 345 | return None 346 | 347 | 348 | def getThinResponse(self): 349 | for r in self.responses[::-1]: 350 | if r.resultType == ResultType.Thin: 351 | return r 352 | return None 353 | 354 | 355 | def addResponse(self,response): 356 | self.responses.append(response) 357 | 358 | 359 | """ Class to represent thick / thin enum types """ 360 | class ResultType(): 361 | Unknown = 0 362 | Thin = 1 363 | Thick = 2 364 | 365 | 366 | """Class used to store the response of an individual 367 | whois query, may be a thick or thin result""" 368 | class WhoisResponse: 369 | def __init__(self, server): 370 | self.server = server 371 | self.response = None 372 | self.resultType = ResultType.Unknown 373 | 374 | def setResponse(self,response): 375 | self.response = response 376 | 377 | def getResponse(self): 378 | return self.response 379 | 380 | def getServer(self): 381 | return self.server 382 | 383 | def getType(self): 384 | return self.resultType 385 | 386 | def setType(self, t): 387 | self.resultType = t 388 | 389 | def getLogData(self): 390 | log = list() 391 | log.append("WHOIS server: "+str(self.server)) 392 | log.append("======Response=====================") 393 | log.append(str(self.response)) 394 | log.append("===================================") 395 | return log 396 | 397 | 398 | #class to hold a proxy object 399 | class Proxy: 400 | def __init__(self,ip, port, proxy_type): 401 | self.server = ip 402 | self.port = port 403 | self.proxy_type = proxy_type 404 | self.external_ip = None 405 | self.ready = False 406 | self.errors = 0 407 | self.client = proxywhois.NICClient() 408 | self.history = dict() 409 | self.nextHistoryTrim = time.time() 410 | 411 | def connect(self): 412 | self.updateExternalIP() 413 | self.client.set_proxy(self.proxy_type, self.server, self.port) 414 | if not self.external_ip: 415 | return False 416 | self.ready = True 417 | return self.ready 418 | 419 | def getLog(self): 420 | return str(self) +" Errors: "+ str(self.errors) 421 | 422 | def __repr__(self): 423 | ret = "Server:"+self.server +":"+str(self.port) 424 | if self.external_ip: 425 | ret += " ExtIP:"+self.external_ip 426 | return ret 427 | 428 | def updateExternalIP(self): 429 | """this method uses the proxy socket to get the remote IP on that proxy""" 430 | host = "http://www.sysnet.ucsd.edu/cgi-bin/whoami.sh" 431 | url = urlparse.urlparse(host) 432 | for i in range(3): #try 3 times 433 | try: 434 | s = socks.socksocket(socks.socket.AF_INET, socks.socket.SOCK_STREAM) 435 | s.settimeout(config.WHOIS_TIMEOUT_SECONDS) 436 | s.setproxy(self.proxy_type,self.server, self.port) 437 | s.connect((url.hostname, 80)) 438 | s.send('GET '+url.path+' HTTP/1.0\r\nHost: '+url.hostname+'\r\n\r\n') 439 | r = s.recv(4096) 440 | except Exception as e: 441 | time.sleep(0.1) 442 | else: 443 | if len(r): 444 | self.external_ip = r.split()[-1] 445 | return self.external_ip 446 | time.sleep(0.1) 447 | return None 448 | 449 | def trimHistory(self, t): 450 | if t > self.nextHistoryTrim: 451 | self.nextHistoryTrim = t + datetime.timedelta(minutes=config.WHOIS_HISTORY_TRIM_MINUTES).total_seconds() 452 | 453 | trimAge = t - config.WHOIS_SERVER_JUMP_DELAY 454 | 455 | for server, lastSeen in self.history.items(): 456 | if lastSeen < trimAge: 457 | del self.history[server] 458 | 459 | def whois(self,record): 460 | """This fucnction is a replacment of whois_lookup 461 | from the proxywhois class""" 462 | if not self.ready: 463 | return False 464 | # this is the maximum amout of times we will recurse looking for 465 | # a thin whois server to reffer us 466 | recurse_level = 2 467 | whois_server = record.getNextServer() 468 | if whois_server == None: 469 | # find inital whois server 470 | whois_server = self.client.choose_server(record.domain) 471 | while (recurse_level > 0) and (whois_server != None): 472 | whois_server = whois_server.lower() 473 | record.setNextServer(whois_server) 474 | t = time.time() 475 | if whois_server in self.history: 476 | tdelta = t - self.history[whois_server] 477 | if tdelta < config.WHOIS_SERVER_JUMP_DELAY: #if the amount of time since the last query is less than the delay 478 | if (config.WHOIS_SERVER_JUMP_DELAY-tdelta) < config.WHOIS_SERVER_SLEEP_DELAY: #if the time left to wait is less then the sleep delay 479 | decrementActiveThreadCount() 480 | time.sleep(config.WHOIS_SERVER_JUMP_DELAY-tdelta) 481 | incrementActiveThreadCount() 482 | else: 483 | time.sleep(random.random()) #this protects us from busy waiting 484 | raise WhoisRatelimitException(whois_server, False) 485 | self.history[whois_server] = t 486 | #TODO have thread remove old entries from history every x runs (runs % x) 487 | # currently useing time 488 | self.trimHistory(t) 489 | response = WhoisResponse(whois_server) 490 | incrementLookupCount() 491 | data = None 492 | try: 493 | data = self.client.whois(record.domain, whois_server, 0) 494 | except proxywhois.ServerTroubleException as e: 495 | raise WhoisRatelimitException(whois_server, False, True) 496 | if data == None or len(data) < 1: 497 | error = "Error: Empty response recieved for domain: "+record.domain+" on server: "+whois_server+" Using proxy: "+self.server 498 | if config.DEBUG: 499 | print error 500 | #TODO this may often be a WhoisRatelimitException case 501 | raise NullWhoisException(error) 502 | 503 | response.setResponse(data) 504 | record.getLastAttempt().addResponse(response) 505 | 506 | nLines = data.count('\n') 507 | if nLines < config.MIN_RESPONSE_LINES: #if we got less than the minimul amount of lines to be considered a valid response 508 | data_lower = data.lower() 509 | 510 | if len(data_lower.strip()) == 0: 511 | raise NullWhoisException("whitespace response",True) 512 | 513 | #TODO move these checks into a response checking function 514 | 515 | ''' check for rate limits''' 516 | #TODO parse limit and add to exception 517 | if "limit exceeded" in data_lower: 518 | raise WhoisRatelimitException(whois_server) 519 | if "please note that the query limit is" in data_lower: 520 | raise WhoisRatelimitException(whois_server) 521 | if "quota exceeded" in data_lower: 522 | raise WhoisRatelimitException(whois_server) 523 | if "try again later" in data_lower: 524 | raise WhoisRatelimitException(whois_server) 525 | if "limit reached" in data_lower: 526 | raise WhoisRatelimitException(whois_server) 527 | if "IP addresses that may have failed" in data_lower: 528 | raise WhoisRatelimitException(whois_server) 529 | 530 | '''non-existant domain''' 531 | if "invalid domain name" in data_lower: 532 | raise WhoisBadDomainException(record.domain) 533 | if "no match" in data_lower: 534 | raise WhoisBadDomainException(record.domain) 535 | if " is not registered here." in data_lower: 536 | raise WhoisBadDomainException(record.domain) 537 | if "not found" in data_lower: 538 | raise WhoisBadDomainException(record.domain) 539 | if "can't get information on local domain" in data_lower: 540 | raise WhoisBadDomainException(record.domain) 541 | if "no information available" in data_lower: 542 | raise WhoisBadDomainException(record.domain) 543 | if "no matching record" in data_lower: 544 | raise WhoisBadDomainException(record.domain) 545 | if "invalid query" in data_lower: 546 | raise WhoisBadDomainException(record.domain) 547 | if "out of this registry" in data_lower: 548 | raise WhoisBadDomainException(record.domain) 549 | if "out of registry" in data_lower: 550 | raise WhoisBadDomainException(record.domain) 551 | if "domain name invalid format" in data_lower: 552 | raise WhoisBadDomainException(record.domain) 553 | if "no data found" in data_lower: 554 | raise WhoisBadDomainException(record.domain) 555 | if "incorrect domain name" in data_lower: 556 | raise WhoisBadDomainException(record.domain) 557 | if "no domain" in data_lower: 558 | raise WhoisBadDomainException(record.domain) 559 | if "no found" in data_lower: # yes, that is 2 spaces 560 | raise WhoisBadDomainException(record.domain) 561 | if "whois service not available for this domain" in data_lower: 562 | raise WhoisBadDomainException(record.domain) 563 | if "we do not have an entry in our database matching your query" in data_lower: 564 | raise WhoisBadDomainException(record.domain) 565 | if "syntax error in specified domain name" in data_lower: 566 | raise WhoisBadDomainException(record.domain) 567 | if "not exists" in data_lower: 568 | raise WhoisBadDomainException(record.domain) 569 | if "we're sorry, there has been a problem. technicians have been notified" in data_lower: 570 | raise WhoisBadDomainException(record.domain) 571 | if u'網域名稱不合規定' in data_lower: 572 | raise WhoisBadDomainException(record.domain) 573 | 574 | ''' http whois errors''' 575 | #TODO WGET http url 576 | #TODO some of these http errors may never acually be seen due to the linux whois client being hardcoded 577 | if "this tld has no whois server, but you can access the whois database at" in data_lower: 578 | #url = data.splitlines()[-1]) 579 | #raise WhoisHTTPReferralException(record.domain, whois_server, url) 580 | return response 581 | if 'registered\nnot the default registrar' in data_lower: 582 | #url = data.splitlines()[-1]) 583 | #raise WhoisHTTPReferralException(record.domain, whois_server, url) 584 | return response 585 | if 'this tld has no whois server, but you can access the whois database at' in data_lower: 586 | #url = data.splitlines()[-1]) 587 | #raise WhoisHTTPReferralException(record.domain, whois_server, url) 588 | return response 589 | 590 | #corner case 591 | if whois_server == "to.whois-servers.net" and "tonic whoisd" in data_lower: 592 | pass 593 | elif whois_server == "it.whois-servers.net" and "unassignable" in data_lower: 594 | pass 595 | else: 596 | error = "Error: recieved small "+str(nLines)+" response for domain: "+record.domain+" on server: "+whois_server+" Using proxy: "+self.server 597 | raise WhoisLinesException(error,data) 598 | 599 | recurse_level -= 1 600 | if recurse_level == 0: 601 | response.setType(ResultType.Thick) 602 | else: 603 | whois_server = self.client.findwhois_server(response.getResponse(),whois_server) # get next whois server if exists 604 | if whois_server == None: 605 | # mark response as thick 606 | response.setType(ResultType.Thick) 607 | else: 608 | # mark response as thin 609 | response.setType(ResultType.Thin) 610 | 611 | return response #returns the last response used 612 | 613 | 614 | #main thread which handles all whois lookups, one per proxy 615 | class WhoisThread(threading.Thread): 616 | def __init__(self, proxy, queue, save): 617 | threading.Thread.__init__(self) 618 | self.daemon = True 619 | self.queue = queue 620 | self.proxy = proxy 621 | self.save_queue = save 622 | self.running = False 623 | 624 | def fail(self, record, error, requeue=True, failIncrement=True): 625 | if failIncrement or config.DEBUG: 626 | self.proxy.errors += 1 627 | record.addError(error, failIncrement) 628 | if config.DEBUG: 629 | print "["+ str(self.proxy) +"] "+ str(error) 630 | if requeue and record.numFails() < config.MAX_ATTEMPTS: 631 | self.queue.put(record) 632 | else: 633 | record.maxAttempts = True 634 | decrementActiveThreadCount() 635 | self.save_queue.put(record) 636 | incrementActiveThreadCount() 637 | 638 | def run(self): 639 | # distribute proxys starting up 640 | time.sleep(random.randrange(0, 5)) 641 | 642 | while True: 643 | #get and print my remote IP, also tests the proxy for usability 644 | 645 | #wait untill proxy is active if down 646 | while not self.proxy.connect(): 647 | if config.DEBUG: 648 | print "WARNING: Failed to connect to proxy: " + str(self.proxy) 649 | time.sleep(config.PROXY_FAIL_RECONNECT_DELAY) 650 | 651 | 652 | if not addRemoteProxyIP(self.proxy.external_ip): 653 | if config.DEBUG: 654 | print "WARNING: Proxy is already being used ["+self.proxy.server+"] on port: "+str(self.proxy.port)+" with remote IP: "+self.proxy.external_ip 655 | #return 656 | # dont return, insteak keep waiting just in case 657 | time.sleep(config.PROXY_FAIL_RECONNECT_DELAY*3) 658 | continue 659 | 660 | 661 | self.running = True 662 | incrementProxyThreadCount() 663 | 664 | while self.running: 665 | #get next host 666 | record = self.queue.get() 667 | incrementActiveThreadCount() 668 | record.addAttempt(WhoisAttempt(self.proxy)) 669 | try: 670 | if config.DEBUG: 671 | print str(self.proxy) +" trying to whois: "+record.domain 672 | self.proxy.whois(record) 673 | if config.DEBUG: 674 | print str(self.proxy) +" whois return on: "+record.domain 675 | except proxywhois.WhoisNoServerException as e: 676 | #the domain does not have a valid known whois server, may be an http server 677 | #nothing we can do, skip domain 678 | self.fail(record, str(e), False) 679 | except WhoisRatelimitException as e: 680 | #we reached a server who's wait is more than the allowed sleeping time 681 | #give the request to another server 682 | if e.hard: 683 | #TODO dynamically change whois server allowed rate 684 | self.fail(record, str(e), True, (config.LAZY_MODE or e.forceInc)) 685 | else: 686 | self.queue.put(record) 687 | except proxywhois.socks.GeneralProxyError as e: 688 | if e.value[0] == 6: #is there a proxy error? 689 | error = "Unable to connect to once valid proxy" 690 | if config.DEBUG: 691 | print error 692 | record.addError(error) 693 | self.queue.put(record) 694 | self.running = False 695 | # make sure to remove us from the active IP list 696 | removeRemoteProxyIP(self.proxy.external_ip) 697 | else: 698 | error = "Error Running whois on domain:["+record.domain+"] " + str(e) 699 | self.fail(record,error) 700 | except (proxywhois.socks.HTTPError, proxywhois.socks.Socks4Error, proxywhois.socks.Socks5Error) as e: 701 | #bad domain name 702 | error = "Invalid domain: " + record.domain 703 | self.fail(record,error) 704 | except (NullWhoisException, WhoisTimeoutException, WhoisLinesException) as e: 705 | self.fail(record, str(e)) 706 | except WhoisBadDomainException as e: 707 | self.fail(record, str(e), False) 708 | except WhoisBadDomainException as e: 709 | error = "FAILED: [" + record.domain + "] error: " + str(sys.exc_info()[0]) 710 | self.fail(record,error) 711 | else: 712 | if (not config.RESULT_VALIDCHECK) or record.valid(): 713 | record.current_attempt.success = True 714 | decrementActiveThreadCount() 715 | self.save_queue.put(record) 716 | incrementActiveThreadCount() 717 | else: 718 | error = "INVALID RESULT: [" + record.domain + "] Failed validity check" 719 | self.fail(record,error) 720 | finally: 721 | #inform the queue we are done 722 | self.queue.task_done() 723 | decrementActiveThreadCount() 724 | 725 | 726 | decrementProxyThreadCount() 727 | 728 | --------------------------------------------------------------------------------