├── .gitignore ├── LICENSE ├── README.md ├── doc └── manual.md ├── lib ├── .DS_Store ├── __init__.py ├── common │ ├── __init__.py │ ├── common.py │ ├── focus.py │ ├── initializtion.py │ └── logs.py ├── core │ ├── __init__.py │ ├── console.py │ ├── crawl.py │ ├── fetch.py │ ├── rules.py │ ├── scheduling.py │ └── spider.py ├── data │ ├── .DS_Store │ └── allurl.txt ├── server │ ├── __init__.py │ ├── scheduling.py │ └── server.py └── structure │ ├── GlobalData.py │ ├── HtmlData.py │ ├── UrlData.py │ └── __init__.py ├── mspider.py ├── plugins ├── __init__.py └── phantomjs │ ├── .gitignore │ └── .travis.yml └── tools ├── test_crawler.py └── test_fetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | 11 | *.egg-info/ 12 | .installed.cfg 13 | *.egg 14 | 15 | # PyInstaller 16 | # Usually these files are written by a python script from a template 17 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 18 | *.manifest 19 | *.spec 20 | 21 | # Installer logs 22 | pip-log.txt 23 | pip-delete-this-directory.txt 24 | 25 | # Unit test / coverage reports 26 | htmlcov/ 27 | .tox/ 28 | .coverage 29 | .cache 30 | nosetests.xml 31 | coverage.xml 32 | 33 | # Translations 34 | *.mo 35 | *.pot 36 | 37 | # Django stuff: 38 | *.log 39 | 40 | # Sphinx documentation 41 | docs/_build/ 42 | 43 | # PyBuilder 44 | target/ 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 
32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. 
You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. 
If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 
311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MSpider 2 | 3 | ## Talk 4 | 5 | The information security department of 360 is hiring on an ongoing basis; if you are interested, contact zhangxin1[at]360.cn. 6 | 7 | ## Installation 8 | 9 | On Ubuntu, you need to install a few dependencies first. 10 | 11 | You can install them with pip, easy_install, or apt-get. 12 | 13 | - lxml 14 | - chardet 15 | - splinter 16 | - gevent 17 | - phantomjs (a standalone headless browser rather than a Python package) 18 | 19 | ## Example 20 | 21 | 1. Use MSpider to collect vulnerability information from wooyun.org. 22 | ``` 23 | python mspider.py -u "http://www.wooyun.org/bugs/" --focus-domain "wooyun.org" --filter-keyword "xxx" --focus-keyword "bugs" -t 15 --random-agent true 24 | ``` 25 | 26 | 27 | 2. Use MSpider to collect news articles from news.sina.com.cn. 28 | ``` 29 | python mspider.py -u "http://news.sina.com.cn/c/2015-12-20/doc-ifxmszek7395594.shtml" --focus-domain "news.sina.com.cn" -t 15 --random-agent true 30 | ``` 31 | 32 | ## ToDo 33 | 34 | 1. Crawling and storage of information. 35 | 2. Distributed crawling. 36 | 37 | ## MSpider's help 38 | 39 | ``` 40 | Usage: 41 | __ __ _____ _ _ 42 | | \/ |/ ____| (_) | | 43 | | \ / | (___ _ __ _ __| | ___ _ __ 44 | | |\/| |\___ \| '_ \| |/ _` |/ _ \ '__| 45 | | | | |____) | |_) | | (_| | __/ | 46 | |_| |_|_____/| .__/|_|\__,_|\___|_| 47 | | | 48 | |_| 49 | Author: Manning23 50 | 51 | 52 | Options: 53 | -h, --help show this help message and exit 54 | -u MSPIDER_URL, --url=MSPIDER_URL 55 | Target URL (e.g.
"http://www.site.com/") 56 | -t MSPIDER_THREADS_NUM, --threads=MSPIDER_THREADS_NUM 57 | Max number of concurrent HTTP(s) requests (default 10) 58 | --depth=MSPIDER_DEPTH 59 | Crawling depth 60 | --count=MSPIDER_COUNT 61 | Crawling number 62 | --time=MSPIDER_TIME Crawl time 63 | --referer=MSPIDER_REFERER 64 | HTTP Referer header value 65 | --cookies=MSPIDER_COOKIES 66 | HTTP Cookie header value 67 | --spider-model=MSPIDER_MODEL 68 | Crawling mode: Static_Spider: 0 Dynamic_Spider: 1 69 | Mixed_Spider: 2 70 | --spider-policy=MSPIDER_POLICY 71 | Crawling strategy: Breadth-first 0 Depth-first 1 72 | Random-first 2 73 | --focus-keyword=MSPIDER_FOCUS_KEYWORD 74 | Focus keyword in URL 75 | --filter-keyword=MSPIDER_FILTER_KEYWORD 76 | Filter keyword in URL 77 | --filter-domain=MSPIDER_FILTER_DOMAIN 78 | Filter domain 79 | --focus-domain=MSPIDER_FOCUS_DOMAIN 80 | Focus domain 81 | --random-agent=MSPIDER_AGENT 82 | Use randomly selected HTTP User-Agent header value 83 | --print-all=MSPIDER_PRINT_ALL 84 | Will show more information 85 | ``` 86 | -------------------------------------------------------------------------------- /doc/manual.md: -------------------------------------------------------------------------------- 1 | # MSpider 2 | 3 | MSpider is a pure web crawler, you can use it to collect all kinds of information. 4 | 5 | 6 | ## Installation 7 | 8 | In Ubuntu, you need to install some libraries. 9 | 10 | You can use pip or easy_install or apt-get to do this. 11 | 12 | - lxml 13 | - chardet 14 | - splinter 15 | - gevent 16 | - phantomjs 17 | 18 | ## Example 19 | 20 | 1. Use MSpider collect the vulnerability information on the wooyun.org. 21 | ``` 22 | python mspider.py -u "http://www.wooyun.org/bugs/" --focus-domain "wooyun.org" --filter-keyword "xxx" --focus-keyword "bugs" -t 15 --random-agent true 23 | ``` 24 | 25 | 26 | 2. Use MSpider collect the news information on the news.sina.com.cn. 27 | ``` 28 | python mspider.py -u "http://news.sina.com.cn/c/2015-12-20/doc-ifxmszek7395594.shtml" --focus-domain "news.sina.com.cn" -t 15 --random-agent true 29 | ``` 30 | ## ToDo 31 | 32 | 1. Crawl and storage of information. 33 | 2. Distributed crawling. 
-------------------------------------------------------------------------------- /lib/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/.DS_Store -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/__init__.py -------------------------------------------------------------------------------- /lib/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/common/__init__.py -------------------------------------------------------------------------------- /lib/common/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | ''' 6 | Common utility functions for MSpider 7 | ''' 8 | 9 | import sys 10 | import time 11 | import urlparse 12 | 13 | def get_absolute_path(): 14 | ''' 15 | Get the absolute path of the MSpider installation 16 | ''' 17 | path = sys.path[0] 18 | path = path.split('MSpider')[0] + "MSpider/" 19 | return path 20 | 21 | #sys.path[0].split('MSpider')[0] + "MSpider/" 22 | 23 | def is_netloc(url): 24 | ''' 25 | Check whether the URL is a bare domain (no meaningful path or query), e.g. 26 | urlparse('http://www.cwi.nl:80/%7Eguido/Python.html') 27 | ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',params='', query='', fragment='') 28 | ''' 29 | parse_result = urlparse.urlparse(url) 30 | if len(parse_result[1]) > 0 and len(parse_result[2]) <= 1 and len(parse_result[4]) == 0: 31 | return True 32 | else: 33 | return False 34 | 35 | def get_netloc(url): 36 | ''' 37 | Get the netloc (host) part of the URL 38 | ''' 39 | return urlparse.urlparse(url)[1] 40 | 41 | def is_ipv4_address(ip_str): 42 | ''' 43 | Check whether the string is a valid IPv4 address (private 192.* and 10.* addresses are rejected) 44 | ''' 45 | if len(ip_str.split('.')) != 4: 46 | return False 47 | 48 | for i in ip_str.split('.'): 49 | try: 50 | int(i) 51 | if int(i) > 255: 52 | return False 53 | except Exception as e: 54 | return False 55 | if ip_str.startswith('192.'): 56 | return False 57 | if ip_str.startswith('10.'): 58 | return False 59 | return True 60 | 61 | def timestamp(): 62 | return str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 63 | -------------------------------------------------------------------------------- /lib/common/focus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider task initialization 7 | """ 8 | import urlparse 9 | 10 | def get_focus_info(url): 11 | if url.startswith('http'): 12 | netloc = urlparse.urlparse(url)[1] 13 | info = '.'.join(netloc.split('.')[1:]) 14 | return info 15 | else: 16 | return url 17 | 18 | 19 | def focus_domain(spider_global_variable): 20 | if len(spider_global_variable.focus_domain) == 0 and len(spider_global_variable.start_url) > 0: 21 | for i in spider_global_variable.start_url: 22 | spider_global_variable.focus_domain.append(get_focus_info(i)) 23 | -------------------------------------------------------------------------------- /lib/common/initializtion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author :
Manning 4 | #Date : 2015-10-18 5 | import urlparse 6 | from common import get_absolute_path 7 | 8 | def check_word_has_not_meaning(word): 9 | ''' 10 | Return True if the word looks like a meaningless token: longer than 3 characters and mixing digits with non-digit characters 11 | ''' 12 | has_number = False 13 | has_letter = False 14 | 15 | for i in xrange(10): 16 | if str(i) in word: 17 | has_number = True 18 | break 19 | try: 20 | int(word) 21 | except Exception as e: 22 | has_letter = True 23 | 24 | if len(word) > 3 and has_letter and has_number: 25 | return True 26 | else: 27 | return False 28 | 29 | 30 | def set_domain(strs): 31 | ''' 32 | Formats that can be handled: 33 | 1, http://abc.baidu.com/asdas 34 | 2, abc.baidu.com 35 | 3, 1.1.1.1 return '' 36 | ''' 37 | host = '' 38 | domain = '' 39 | if 'http://' in strs: 40 | host = urlparse.urlparse(strs)[1].split(':')[0] 41 | else: 42 | host = strs 43 | keyword_list = host.split('.') 44 | if len(keyword_list) == 2: 45 | domain = host 46 | 47 | elif len(keyword_list) == 3: 48 | if 'com.cn' in host: 49 | domain = host 50 | elif 'net.cn' in host: 51 | domain = host 52 | else: 53 | domain = '.'.join(host.split('.')[1:]) 54 | 55 | elif len(keyword_list) > 3: 56 | count = 0 57 | for i in keyword_list: 58 | try: 59 | int(i) 60 | count += 1 61 | except Exception, e: 62 | break 63 | if count == 4: 64 | domain = '' 65 | else: 66 | if keyword_list[-1] == 'cn' and keyword_list[-2] in ['com', 'edu', 'gov', 'org', 'net']: 67 | domain = '.'.join(keyword_list[-3:]) 68 | elif keyword_list[-1] in ['com', 'net', 'org','cc','me']: 69 | domain = '.'.join(keyword_list[-2:]) 70 | elif keyword_list[-1] == 'cn': 71 | domain = '.'.join(keyword_list[-2:]) 72 | else: 73 | domain = host 74 | return domain 75 | 76 | 77 | def deal_url(start_urls): 78 | temp_url_list = start_urls.split(',') 79 | total_url_list = [] 80 | url_list = [] 81 | addr = get_absolute_path() + 'lib/data/allurl.txt' 82 | for i in open(addr).readlines(): 83 | while True: 84 | if i[-1] in ['\r','\n']: 85 | i = i[:-1] 86 | else: 87 | break 88 | url = i 89 | if not url.startswith('http://'): 90 | url = 'http://' + url 91 | 92 | if url.endswith('/'): 93 | url = url[:-1] 94 | total_url_list.append(url) 95 | 96 | for i in temp_url_list: 97 | if i.startswith('http://'): 98 | url_list.append(i) 99 | else: 100 | if i.endswith('/'): 101 | url = i[:-1] 102 | else: 103 | url = i 104 | for j in total_url_list: 105 | keyword_j = set_domain(j) 106 | if url in keyword_j: 107 | url_list.append(j) 108 | url_list = sorted(list(set(url_list))) 109 | new_list = [] 110 | for i in url_list: 111 | netloc = urlparse.urlparse(i)[1] 112 | netloc_list = netloc.split('.') 113 | if len(netloc_list) == 3: 114 | if len(netloc_list[0]) > 10: 115 | continue 116 | else: 117 | new_list.append(i) 118 | elif len(netloc_list) == 4: 119 | if check_word_has_not_meaning(netloc_list[0]): 120 | continue 121 | else: 122 | new_list.append(i) 123 | elif len(netloc_list) == 5: 124 | if check_word_has_not_meaning(netloc_list[0]): 125 | continue 126 | elif check_word_has_not_meaning(netloc_list[1]): 127 | continue 128 | else: 129 | new_list.append(i) 130 | 131 | return new_list 132 | 133 | 134 | def deal_common_strs(words): 135 | if len(words) == 0: 136 | return [] 137 | else: 138 | return words.split(',') 139 | 140 | def deal_strs(words): 141 | if len(words) == 0: 142 | return '' 143 | else: 144 | return words 145 | 146 | def deal_common_int(num): 147 | num = str(num).split('.')[0] 148 | try: 149 | int(num) 150 | except Exception, e: 151 | raise e 152 | return int(num) 153 | 154 | 155 | def deal_common_boolean(boolean): 156 | boolean = str(boolean).lower() 157 | if boolean ==
'true': 158 | return True 159 | elif boolean == '1': 160 | return True 161 | elif boolean == '0': 162 | return False 163 | else: 164 | return False 165 | 166 | 167 | 168 | def init_dict(options): 169 | variable_dict = { 170 | "start_url": deal_url(options.mspider_url), 171 | 172 | "threads": deal_common_int(options.mspider_threads_num), 173 | "depth": deal_common_int(options.mspider_depth), 174 | "count": deal_common_int(options.mspider_count), 175 | "time": deal_common_int(options.mspider_time), 176 | 'referer': options.mspider_referer, 177 | 'cookies': options.mspider_cookies, 178 | 179 | "spider_model": deal_common_int(options.mspider_model), 180 | "spider_policy": deal_common_int(options.mspider_policy), 181 | 182 | "focus_keyword": deal_common_strs(options.mspider_focus_keyword), 183 | "filter_keyword": deal_common_strs(options.mspider_filter_keyword), 184 | "focus_domain": deal_common_strs(options.mspider_focus_domain), 185 | "filter_domain": deal_common_strs(options.mspider_filter_domain), 186 | 187 | "random_agent": deal_common_boolean(options.mspider_agent), 188 | 'print_all': deal_common_boolean(options.mspider_print_all), 189 | 190 | } 191 | return variable_dict 192 | -------------------------------------------------------------------------------- /lib/common/logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider logging 7 | """ 8 | import logging 9 | import sys 10 | 11 | def init_spider_log(spider_global_variable): 12 | 13 | ''' 14 | Log messages use the following format: 15 | Function: init_spider_log, Info: xxx 16 | ''' 17 | 18 | spider_logger = logging.getLogger('MSpiderLogs') 19 | spider_logger.setLevel(logging.DEBUG) 20 | 21 | console_handler = logging.StreamHandler() 22 | console_handler.setLevel(logging.DEBUG) 23 | 24 | formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") 25 | 26 | console_handler.setFormatter(formatter) 27 | 28 | spider_logger.addHandler(console_handler) 29 | 30 | spider_global_variable.spider_logger = spider_logger 31 | spider_logger.info("Welcome to MSpider !!!") 32 | spider_logger.info("---------------------------") 33 | -------------------------------------------------------------------------------- /lib/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/core/__init__.py -------------------------------------------------------------------------------- /lib/core/console.py: -------------------------------------------------------------------------------- 1 | """ getTerminalSize() 2 | - get width and height of console 3 | - works on linux,os x,windows,cygwin(windows) 4 | """ 5 | import os 6 | __all__=['getTerminalSize'] 7 | 8 | 9 | def getTerminalSize(): 10 | import platform 11 | current_os = platform.system() 12 | tuple_xy=None 13 | if current_os == 'Windows': 14 | tuple_xy = _getTerminalSize_windows() 15 | if tuple_xy is None: 16 | tuple_xy = _getTerminalSize_tput() 17 | # needed for window's python in cygwin's xterm!
18 | if current_os == 'Linux' or current_os == 'Darwin' or current_os.startswith('CYGWIN'): 19 | tuple_xy = _getTerminalSize_linux() 20 | if tuple_xy is None: 21 | print "default" 22 | tuple_xy = (80, 25) # default value 23 | return tuple_xy 24 | 25 | def _getTerminalSize_windows(): 26 | res=None 27 | try: 28 | from ctypes import windll, create_string_buffer 29 | 30 | # stdin handle is -10 31 | # stdout handle is -11 32 | # stderr handle is -12 33 | 34 | h = windll.kernel32.GetStdHandle(-12) 35 | csbi = create_string_buffer(22) 36 | res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi) 37 | except: 38 | return None 39 | if res: 40 | import struct 41 | (bufx, bufy, curx, cury, wattr, 42 | left, top, right, bottom, maxx, maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw) 43 | sizex = right - left + 1 44 | sizey = bottom - top + 1 45 | return sizex, sizey 46 | else: 47 | return None 48 | 49 | def _getTerminalSize_tput(): 50 | # get terminal width 51 | # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window 52 | try: 53 | import subprocess 54 | proc=subprocess.Popen(["tput", "cols"],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 55 | output=proc.communicate(input=None) 56 | cols=int(output[0]) 57 | proc=subprocess.Popen(["tput", "lines"],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 58 | output=proc.communicate(input=None) 59 | rows=int(output[0]) 60 | return (cols,rows) 61 | except: 62 | return None 63 | 64 | 65 | def _getTerminalSize_linux(): 66 | def ioctl_GWINSZ(fd): 67 | try: 68 | import fcntl, termios, struct 69 | cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ,'1234')) 70 | except: 71 | return None 72 | return cr 73 | cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) 74 | if not cr: 75 | try: 76 | fd = os.open(os.ctermid(), os.O_RDONLY) 77 | cr = ioctl_GWINSZ(fd) 78 | os.close(fd) 79 | except: 80 | pass 81 | if not cr: 82 | try: 83 | cr = (os.environ['LINES'], os.environ['COLUMNS']) 84 | except: 85 | return None 86 | return int(cr[1]), int(cr[0]) 87 | 88 | if __name__ == "__main__": 89 | sizex,sizey=getTerminalSize() 90 | print 'width =',sizex,'height =',sizey 91 | -------------------------------------------------------------------------------- /lib/core/crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | ''' 6 | How links are extracted from fetched HTML 7 | ''' 8 | import lxml.html 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding("utf-8") 12 | 13 | import urlparse 14 | import chardet 15 | import urllib2 16 | try: 17 | import re2 as re 18 | except ImportError: 19 | import re 20 | 21 | import random 22 | import time 23 | from fetch import fetch 24 | 25 | import logging 26 | spider_logger = logging.getLogger('MSpiderLogs') 27 | 28 | 29 | def get_url_by_lxml(url,html): 30 | try: 31 | if '.js' in urlparse.urlparse(url)[2]: 32 | return [] 33 | tmp = lxml.html.document_fromstring(urllib2.unquote(html)) 34 | tmp.make_links_absolute(url) 35 | links = tmp.iterlinks() 36 | 37 | links = [i[2] for i in links] 38 | return links 39 | except Exception as e: 40 | msg = 'Function: get_url_by_lxml, Info: ' + str(e) 41 | spider_logger.error(msg) 42 | return [] 43 | 44 | def check_suffix(url): 45 | ignore_ext = ['wma', 'png', 'jpeg', 'jpg'] 46 | suffix = urlparse.urlparse(url)[2].split('.')[-1].lower() 47 | if suffix in ignore_ext: 48 | return False 49 | else: 50 | return True 51 | 52 | def check_keyword(url): 53 | i = url 54 | if i.startswith('javascript:'): 55 | return False 56 | if i.startswith('about:'): 57 | return False 58 | return True 59 | 60 | def modify_url(url): 61 | i = url 62 | if '/' not in i and '?' not in i: 63 | i = i + '/' 64 | i = 'http://' + i 65 | return i 66 | 67 | 68 | def crawl(url,html): 69 | if len(html) < 10: 70 | return [] 71 | link_set = set() 72 | _ = [link_set.add(i) for i in get_url_by_lxml(url,html) if check_keyword(i)] 73 | get_link_list = [i for i in list(link_set) if check_suffix(i)] 74 | 75 | links = [] 76 | 77 | for i in get_link_list: 78 | data = modify_url_to_structure(i) 79 | links.append(data) 80 | 81 | return links 82 | 83 | 84 | def modify_url_to_structure(url): 85 | method = 'get' 86 | return (method,url,'') 87 | -------------------------------------------------------------------------------- /lib/core/fetch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | ''' 6 | How HTML pages are fetched 7 | ''' 8 | 9 | import requests 10 | import urlparse 11 | import time 12 | import random 13 | import urllib2 14 | from splinter import Browser 15 | 16 | import sys 17 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib") 18 | 19 | import logging 20 | spider_logger = logging.getLogger('MSpiderLogs') 21 | 22 | 23 | def html_pretreatment(html): 24 | html = html.lower() 25 | html = urllib2.unquote(html) 26 | return html 27 | 28 | 29 | def fetch(url, spider_model=0, fetch_time_interval=1, set_random_agent=True, set_referer=False, set_cookies=False): 30 | try: 31 | spider_model = int(spider_model) 32 | fetch_time_interval = int(fetch_time_interval) 33 | random_agent = bool(set_random_agent) 34 | except Exception, e: 35 | spider_model = 0 36 | fetch_time_interval = 1 37 | random_agent = False 38 | 39 | myheaders = dict() 40 | if random_agent: 41 | myheaders['User-Agent'] = random_http_header() 42 | else: 43 | myheaders['User-Agent'] = 'MSpider' 44 | 45 | if set_referer: 46 | myheaders['Referer'] = set_referer 47 | 48 | if set_cookies: 49 | myheaders['Cookie'] = set_cookies 50 | 51 | returnhtml = '' 52 | 53 | if spider_model == 0: 54 | # Static Model 55 | try: 56 | response = requests.get(url, timeout=15, headers=myheaders, allow_redirects=False) 57 | if response.status_code == 200: 58 | returnhtml = response.content 59 | else: 60 | return "" 61 | except Exception, e: 62 | msg = 'Function: fetch_0, Info: ' + str(e) 63 | spider_logger.error(msg) 64 | return "" 65 | elif spider_model == 1: 66 | # Dynamic Model 67 | try: 68 | browser = Browser(driver_name='phantomjs', user_agent=myheaders['User-Agent'], load_images=False) 69 | browser.visit(url) 70 | html = browser.html 71 | browser.quit() 72 | returnhtml = html 73 | except Exception, e: 74 | msg = 'Function: fetch_1, Info: ' + str(e) 75 | spider_logger.error(msg) 76 | return "" 77 | else: 78 | return "" 79 | 80 | if len(returnhtml) < 10: 81 | return '' 82 | 83 | html = html_pretreatment(returnhtml).decode('gb2312','ignore') # pages are assumed gb2312-encoded; undecodable bytes are dropped 84 | time.sleep(fetch_time_interval) # pause between requests 85 | 86 | return html 87 | 88 | 89 | def random_http_header(): 90 | user_agents = [ 91 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0", 92 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)", 93 | ] 94 | return random.choice(user_agents) 95 | -------------------------------------------------------------------------------- /lib/core/rules.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | # manning 2015-1-27 4 | import time 5 | import re 6 | import urlparse 7 | import sys 8 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib") 9 | 10 | import logging 11 | spider_logger = logging.getLogger('MSpiderLogs') 12 | 13 | class UrlRuleClass(object): 14 | 15 | """URL de-duplication plus focus/filter rules""" 16 | 17 | def __init__(self, SpiderGlobalVariable): 18 | super(UrlRuleClass, self).__init__() 19 | self.url_repeat_set = set() 20 | self.url = '' 21 | self.spiderglobal = SpiderGlobalVariable 22 | 23 | def check_repeat(self,url): 24 | if url not in self.url_repeat_set: 25 | self.url_repeat_set.add(url) 26 | return True 27 | return False 28 | 29 | 30 | def focus_domain(self,url): 31 | if len(self.spiderglobal.focus_domain) == 0: 32 | return True 33 | t = urlparse.urlparse(url)[1] 34 | for i in self.spiderglobal.focus_domain: 35 | if i in t: 36 | return True 37 | return False 38 | 39 | def filter_domain(self,url): 40 | t = urlparse.urlparse(url)[1] 41 | for i in self.spiderglobal.filter_domain: 42 | if i in t: 43 | return False 44 | return True 45 | 46 | def focus_keyword(self,url): 47 | if len(self.spiderglobal.focus_keyword) == 0: 48 | return True 49 | for i in self.spiderglobal.focus_keyword: 50 | if i in url: 51 | return True 52 | return False 53 | 54 | def filter_keyword(self,url): 55 | if len(self.spiderglobal.filter_keyword) == 0: 56 | return True 57 | for i in self.spiderglobal.filter_keyword: 58 | if i in url: 59 | return False 60 | return True 61 | 62 | def check_filter_and_focus(self,url): 63 | if self.focus_domain(url) and self.filter_domain(url) and self.focus_keyword(url) and self.filter_keyword(url): 64 | return True 65 | return False 66 | 67 | def check_url(self, url): 68 | if self.check_repeat(url) and self.check_filter_and_focus(url): 69 | return True 70 | else: 71 | return False 72 | -------------------------------------------------------------------------------- /lib/core/scheduling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider global scheduling 7 | """ 8 | 9 | import random 10 | import time 11 | import sys 12 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib") 13 | 14 | from crawl import crawl 15 | from structure.UrlData import UrlNode 16 | from common.common import is_netloc 17 | 18 | def exit_condition(SpiderGlobalVariable): 19 | # exit conditions for the scheduling loop 20 | if time.time() - SpiderGlobalVariable.start_time < SpiderGlobalVariable.time: 21 | if SpiderGlobalVariable.exit_flag_count < SpiderGlobalVariable.threads: 22 | if SpiderGlobalVariable.total_count < SpiderGlobalVariable.count: 23 | return True 24 | return False 25 | 26 | 27 | def init_urlnode(start_urls_list,UrlRule): 28 | nodelist = [] 29 | for i in start_urls_list: 30 | if UrlRule.check_url(i): 31 | tmpnode = UrlNode(i, '', -1) 32 | nodelist.append(tmpnode) 33 | return nodelist 34 | 35 | 36 | def spider_scheduling(SpiderGlobalVariable,UrlRule): 37 | ''' 38 | Main scheduling loop: pull fetched pages off the html queue, extract links, and push accepted UrlNodes onto the global URL queue 39 | ''' 40 | for i in init_urlnode(SpiderGlobalVariable.start_url,UrlRule): 41 | SpiderGlobalVariable.global_urlnode_queue.put((0,i)) 42 | 43 | while exit_condition(SpiderGlobalVariable): 44 | if SpiderGlobalVariable.htmlnode_queue.qsize() > 0: 45 | html_node = SpiderGlobalVariable.htmlnode_queue.get() 46 | linklist = crawl(html_node.url, html_node.html) 47 | for i in linklist: 48 | url = i[1] 49 | method = i[0] 50 | data = i[2] 51 | depth =
html_node.depth 52 | referer = html_node.url 53 | i = UrlNode(url, referer, depth, method, data) 54 | 55 | if i.depth <= SpiderGlobalVariable.depth and UrlRule.check_url(i.check_url): 56 | if is_netloc(i.url): 57 | SpiderGlobalVariable.global_urlnode_queue.put((0,i)) 58 | else: 59 | SpiderGlobalVariable.global_urlnode_queue.put((random.randint(1,5),i)) 60 | 61 | else: 62 | SpiderGlobalVariable.refuse_count += 1 63 | -------------------------------------------------------------------------------- /lib/core/spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | ''' 6 | MSpider crawler worker thread 7 | 2015.5.20 8 | Adjusted the output style, borrowing some code from @lijiejie 9 | 10 | 2015.3.28 11 | Crawling models 12 | 0, breadth-first (default) 13 | 1, depth-first 14 | 2, random-first 15 | 16 | 2015.3.27 17 | Added a data queue; a dedicated thread writes to the database 18 | The database type is sqlite 19 | Support for mysql, sql server and so on is planned 20 | 21 | 2015.3.26 22 | Added depth control 23 | 24 | 2015.3.8 25 | Server exit conditions 26 | 1, crawl time exceeded 27 | 2, no crawler threads left (the crawl may have finished) 28 | 3, depth limit exceeded 29 | 4, page count limit exceeded 30 | 31 | Thread exit condition 32 | A thread exits if it has had no work for 5 minutes 33 | 34 | ''' 35 | import time 36 | import sys 37 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib") 38 | from console import getTerminalSize 39 | from fetch import fetch 40 | from common.common import timestamp 41 | from structure.HtmlData import HtmlNode 42 | 43 | import logging 44 | spider_logger = logging.getLogger('MSpiderLogs') 45 | 46 | def spider(SpiderGlobalVariable): 47 | if SpiderGlobalVariable.spider_use_gevent: 48 | import gevent 49 | while True: 50 | if SpiderGlobalVariable.spider_urlnode_queue.qsize() > 0: 51 | _,node = SpiderGlobalVariable.spider_urlnode_queue.get() 52 | html = fetch(node.url, SpiderGlobalVariable.spider_model, SpiderGlobalVariable.fetch_time_interval, SpiderGlobalVariable.random_agent) 53 | if len(html) < 10: 54 | pass # empty pages still flow through so the counters stay consistent 55 | html_node = HtmlNode(node.url, html, timestamp(), node.depth) 56 | SpiderGlobalVariable.htmlnode_queue.put(html_node) 57 | SpiderGlobalVariable.total_count += 1 58 | 59 | if SpiderGlobalVariable.print_all: 60 | msg = "[Url] %s Depth: %s Found: %s Remaining: %s Html: %s"% (node.url, str(node.depth), str(SpiderGlobalVariable.total_count), str(SpiderGlobalVariable.spider_urlnode_queue.qsize()), str(len(html))) 61 | spider_logger.info(msg) 62 | 63 | else: 64 | msg = "[Url] %s Depth: %s Found: %s Remaining: %s Html: %s" % (node.url, str(node.depth), str(SpiderGlobalVariable.total_count), str(SpiderGlobalVariable.spider_urlnode_queue.qsize()), str(len(html))) 65 | console_width = getTerminalSize()[0] - 0 66 | if len(msg) - console_width > 0: 67 | msg = msg[:console_width] 68 | sys.stdout.write('\r' + msg) 69 | sys.stdout.flush() 70 | else: 71 | sys.stdout.write('\r' + msg + ' ' * (console_width - len(msg))) 72 | sys.stdout.flush() 73 | if SpiderGlobalVariable.spider_use_gevent: 74 | gevent.sleep(0) 75 | else: 76 | if SpiderGlobalVariable.spider_use_gevent: 77 | gevent.sleep(0) 78 | else: 79 | time.sleep(5) 80 | SpiderGlobalVariable.exit_flag_count += 1 81 | -------------------------------------------------------------------------------- /lib/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/data/.DS_Store -------------------------------------------------------------------------------- /lib/server/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/server/__init__.py -------------------------------------------------------------------------------- /lib/server/scheduling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider global_scheduling 7 | """ 8 | import time 9 | import logging 10 | spider_logger = logging.getLogger('MSpiderLogs') 11 | 12 | def global_scheduling(spider_global_variable): 13 | while True: 14 | if spider_global_variable.global_urlnode_queue.qsize() > 0: 15 | node = spider_global_variable.global_urlnode_queue.get() 16 | spider_global_variable.spider_urlnode_queue.put(node) 17 | 18 | ''' 19 | You can put your own logic in this function: global_scheduling sees 20 | every url_node that flows through the crawler, and the url_node 21 | structure is defined in UrlData.py. 22 | ''' 23 | -------------------------------------------------------------------------------- /lib/server/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | 6 | import time 7 | import sys 8 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider") 9 | import threading 10 | import logging 11 | from lib.core.rules import UrlRuleClass 12 | from lib.core.scheduling import spider_scheduling 13 | from lib.core.spider import spider 14 | from lib.structure.GlobalData import MSpiderGlobalVariable 15 | from scheduling import global_scheduling 16 | 17 | spider_logger = logging.getLogger('MSpiderLogs') 18 | 19 | def global_server(spider_global_variable): 20 | # initialize global state 21 | url_rule = UrlRuleClass(spider_global_variable) 22 | 23 | threads_list = [] 24 | spider_threads = [] 25 | 26 | threads_list.append(threading.Thread(target=spider_scheduling, args=(spider_global_variable, url_rule,))) 27 | threads_list.append(threading.Thread(target=global_scheduling, args=(spider_global_variable,))) 28 | 29 | for t in threads_list: 30 | t.setDaemon(True) 31 | t.start() 32 | 33 | if spider_global_variable.spider_use_gevent: 34 | import gevent 35 | from gevent import monkey 36 | monkey.patch_all(thread=False) 37 | for i in xrange(spider_global_variable.threads): 38 | spider_threads.append(gevent.spawn(spider, spider_global_variable)) 39 | gevent.joinall(spider_threads) 40 | else: 41 | for i in xrange(spider_global_variable.threads): 42 | spider_threads.append(threading.Thread(target=spider, args=(spider_global_variable,))) 43 | for t in spider_threads: 44 | t.setDaemon(True) 45 | t.start() 46 | 47 | 48 | time.sleep(120) 49 | while True: 50 | if spider_global_variable.spider_urlnode_queue.qsize() == 0: 51 | spider_logger.critical('MSpider waiting to exit!!') 52 | time.sleep(120) 53 | if spider_global_variable.spider_urlnode_queue.qsize() == 0: 54 | pass 55 | else: 56 | continue 57 | spider_global_variable.end_ctime = time.ctime() 58 | time.sleep(120) 59 | spider_logger.critical('MSpider exit!!') 60 | sys.exit(0) 61 | else: 62 | time.sleep(10) 63 | -------------------------------------------------------------------------------- /lib/structure/GlobalData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider global variables 7 | """ 8 | 9 | import Queue 10 | import urlparse 11 | import time
12 | 13 | class MSpiderGlobalVariable(object): 14 | def __init__(self, variable_dict): 15 | self.variable_dict = variable_dict 16 | 17 | self.start_url = ["http://www.baidu.com"] 18 | self.focus_keyword = [] 19 | self.filter_keyword = [] 20 | self.focus_domain = [] 21 | self.filter_domain = [] 22 | 23 | self.threads = 10 24 | self.spider_use_gevent = False 25 | self.depth = 10 26 | self.count = 1000 27 | self.time = 24 * 3600 28 | self.referer = '' 29 | self.cookies = '' 30 | self.spider_model = 0 31 | self.spider_policy = 0 32 | 33 | self.random_agent = False 34 | self.print_all = True 35 | self.spider_use_gevent = False 36 | 37 | self.ignore_ext = [] 38 | self.spider_proxy = True 39 | self.spider_proxy_ip_pool = [] 40 | self.download_rate = 50 41 | self.fetch_time_interval = 5 42 | 43 | ''' 44 | Global control state 45 | ''' 46 | 47 | self.exit_flag_count = 0 48 | self.global_urlnode_queue = Queue.Queue() 49 | self.global_unfocus_urlnode_queue = Queue.Queue() 50 | self.spider_urlnode_queue = None 51 | self.htmlnode_queue = Queue.Queue() 52 | self.store_queue = Queue.Queue() 53 | self.parse_variable_dict() 54 | self.set_urlnode_queue() 55 | 56 | self.spider_logger = None 57 | 58 | ''' 59 | Crawl task statistics 60 | ''' 61 | self.total_count = 0 62 | self.refuse_count = 0 63 | 64 | self.start_time = time.time() 65 | self.end_time = None 66 | self.start_ctime = time.ctime() 67 | self.end_ctime = None 68 | self.maintain_time = None 69 | 70 | self.task_name = None 71 | 72 | 73 | 74 | def set_urlnode_queue(self): 75 | if self.spider_policy == 1: 76 | self.spider_urlnode_queue = Queue.LifoQueue() # depth-first 77 | elif self.spider_policy == 2: 78 | self.spider_urlnode_queue = Queue.PriorityQueue() # random-first (the scheduler assigns random priorities) 79 | else: 80 | self.spider_urlnode_queue = Queue.Queue() # breadth-first (default) 81 | 82 | def parse_variable_dict(self): 83 | self.start_url = self.variable_dict['start_url'] 84 | self.focus_keyword = self.variable_dict['focus_keyword'] 85 | self.filter_keyword = self.variable_dict['filter_keyword'] 86 | self.focus_domain = self.variable_dict['focus_domain'] 87 | self.filter_domain = self.variable_dict['filter_domain'] 88 | self.threads = self.variable_dict['threads'] 89 | self.depth = self.variable_dict['depth'] 90 | self.count = self.variable_dict['count'] 91 | self.time = self.variable_dict['time'] 92 | self.referer = self.variable_dict['referer'] 93 | self.cookies = self.variable_dict['cookies'] 94 | self.spider_model = self.variable_dict['spider_model'] 95 | self.spider_policy = self.variable_dict['spider_policy'] 96 | self.random_agent = self.variable_dict['random_agent'] 97 | self.print_all = self.variable_dict['print_all'] 98 | -------------------------------------------------------------------------------- /lib/structure/HtmlData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider HtmlNode class: a fetched page plus its metadata 7 | """ 8 | class HtmlNode(object): 9 | def __init__(self, url, html, time, depth): 10 | self.url = url 11 | self.html = html 12 | self.time = time 13 | self.depth = depth 14 | -------------------------------------------------------------------------------- /lib/structure/UrlData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | 6 | class UrlNode(object): 7 | def __init__(self, url, referer, depth, method = 'get', data = ''): 8 | self.url = url 9 | self.referer = referer 10 | self.method = method
11 | self.depth = int(depth) + 1 12 | self.data = data 13 | self.check_url = None 14 | self.init_check_url() 15 | 16 | def show(self): 17 | print self.method 18 | print self.url 19 | print self.data 20 | print '--------------------' 21 | 22 | def init_check_url(self): 23 | self.check_url = self.url 24 | 25 | -------------------------------------------------------------------------------- /lib/structure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/structure/__init__.py -------------------------------------------------------------------------------- /mspider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | #Author : Manning 4 | #Date : 2015-10-17 5 | """ 6 | MSpider entry point 7 | """ 8 | import optparse 9 | import sys 10 | 11 | from lib.structure.GlobalData import MSpiderGlobalVariable 12 | from lib.common.initializtion import init_dict 13 | from lib.common.logs import init_spider_log 14 | from lib.common.focus import focus_domain 15 | from lib.server.server import global_server 16 | 17 | import logging 18 | spider_logger = logging.getLogger('MSpiderLogs') 19 | 20 | def main(): 21 | usage = ''' 22 | __ __ _____ _ _ 23 | | \/ |/ ____| (_) | | 24 | | \ / | (___ _ __ _ __| | ___ _ __ 25 | | |\/| |\___ \| '_ \| |/ _` |/ _ \ '__| 26 | | | | |____) | |_) | | (_| | __/ | 27 | |_| |_|_____/| .__/|_|\__,_|\___|_| 28 | | | 29 | |_| 30 | Author: Manning23 31 | ''' 32 | parser = optparse.OptionParser(usage=usage) 33 | parser.add_option("-u", "--url", 34 | dest="mspider_url", 35 | default='http://www.baidu.com', 36 | help='''Target URL (e.g.
"http://www.site.com/")''') 37 | 38 | parser.add_option("-t", "--threads", 39 | dest="mspider_threads_num", 40 | default=10, 41 | help="Max number of concurrent HTTP(s) requests (default 10)") 42 | 43 | parser.add_option("--depth", 44 | dest="mspider_depth", 45 | default=1000, 46 | help="Crawling depth") 47 | 48 | parser.add_option("--count", 49 | dest="mspider_count", 50 | default=1000 * 1000, 51 | help="Crawling number") 52 | 53 | parser.add_option("--time", 54 | dest="mspider_time", 55 | default=3600 * 24 * 7, 56 | help="Crawl time") 57 | 58 | parser.add_option("--referer", 59 | dest="mspider_referer", 60 | default='', 61 | help="HTTP Referer header value") 62 | 63 | parser.add_option("--cookies", 64 | dest="mspider_cookies", 65 | default='', 66 | help="HTTP Cookie header value") 67 | 68 | parser.add_option("--spider-model", 69 | dest="mspider_model", 70 | default=0, 71 | help='''Crawling mode: Static_Spider: 0 Dynamic_Spider: 1 Mixed_Spider: 2''') 72 | 73 | parser.add_option("--spider-policy", 74 | dest="mspider_policy", 75 | default=2, 76 | help="Crawling strategy: Breadth-first 0 Depth-first 1 Random-first 2") 77 | 78 | parser.add_option("--focus-keyword", 79 | dest="mspider_focus_keyword", 80 | default='', 81 | help="Focus keyword in URL") 82 | 83 | parser.add_option("--filter-keyword", 84 | dest="mspider_filter_keyword", 85 | default='', 86 | help="Filter keyword in URL") 87 | 88 | parser.add_option("--filter-domain", 89 | dest="mspider_filter_domain", 90 | default='', 91 | help="Filter domain") 92 | 93 | parser.add_option("--focus-domain", 94 | dest="mspider_focus_domain", 95 | default='', 96 | help="Focus domain") 97 | 98 | parser.add_option("--random-agent", 99 | dest="mspider_agent", 100 | default=False, 101 | help="Use randomly selected HTTP User-Agent header value") 102 | 103 | parser.add_option("--print-all", 104 | dest="mspider_print_all", 105 | default=True, 106 | help="Will show more information") 107 | 108 | 109 | 110 | 111 | (options, args) = parser.parse_args() 112 | print usage 113 | variable_dict = init_dict(options) 114 | 115 | spider_global_variable = MSpiderGlobalVariable(variable_dict) 116 | 117 | focus_domain(spider_global_variable) 118 | 119 | init_spider_log(spider_global_variable) 120 | 121 | global_server(spider_global_variable) 122 | 123 | 124 | 125 | 126 | if __name__ == "__main__": 127 | try: 128 | main() 129 | except KeyboardInterrupt, e: 130 | print '\nBreak out.' 
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/plugins/__init__.py
--------------------------------------------------------------------------------
/plugins/phantomjs/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
*.pro.user*
*.xcodeproj
Makefile*
*~
*.moc
moc_*
qrc_*
.qmake.stash
*.o
*.swp
*.pyc
*.a
/debian/*.debhelper
/debian/files
/debian/*.log
/debian/*.substvars
/debian/*/
/deploy/qt-*.tar.gz
/deploy/Qt-*
/symbols
/src/qt/qtc-debugging-helper
/src/phantomjs_plugin_import.cpp

# ignore ctags
/tags
/tools/dump_syms.app/

# Ignore Visual Studio temporary files, build results, etc.
*.suo
*.user
*.sln.docstates
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.log
*.sdf
*.vcxproj
*.vcxproj.filters
*.lib
*.prl
*.intermediate.manifest

# Build results
[Dd]ebug*/
[Rr]elease/
bin/
*.class
build/
.gradle/
--------------------------------------------------------------------------------
/plugins/phantomjs/.travis.yml:
--------------------------------------------------------------------------------
language: cpp
sudo: false
compiler:
  - gcc
cache: apt

addons:
  apt:
    packages:
      - gperf
      - libicu-dev
      - libssl-dev

before_script:
  - chmod +x ./build.sh
  - chmod +x ./test/run-tests.sh
  - chmod +x ./test/run-tests-ghostdriver.sh

script:
  - ./build.sh --qtdeps=bundled --confirm --silent #< Build
  - ./test/run-tests.sh #< Test (PhantomJS)
  - ./test/run-tests-ghostdriver.sh #< Test (GhostDriver / PhantomJSDriver)

notifications:
  irc:
    channels:
      - "irc.freenode.org#phantomjs"
    on_success: always
    on_failure: always
    use_notice: true
--------------------------------------------------------------------------------
/tools/test_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
#-*-coding:utf-8-*-
#Author : Manning
#Date : 2015-10-17
import sys
# Make the repository root importable no matter where the script is started,
# provided the checkout path contains 'MSpider'.
sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
from lib.core.crawl import crawl
from lib.core.fetch import fetch


if __name__ == '__main__':
    url = 'http://www.wooyun.org/bugs/'
    html = fetch(url)
    for i in crawl(url, html):
        print i
--------------------------------------------------------------------------------
/tools/test_fetcher.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
#-*-coding:utf-8-*-
#Author : Manning
#Date : 2015-10-17
import sys
# Same path trick as test_crawler.py: put the repository root on sys.path.
sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
from lib.core.fetch import fetch


if __name__ == '__main__':
    url = 'http://www.baidu.com'
    print fetch(url)
--------------------------------------------------------------------------------
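The two tools above compose naturally. Below is a small illustrative variant (not part of the repository) that deduplicates the crawler's output before printing; it makes the same sys.path assumption as the tools, uses a placeholder URL, and assumes crawl() yields printable URL entries, as test_crawler.py suggests:

#!/usr/bin/python
#-*-coding:utf-8-*-
import sys
sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
from lib.core.crawl import crawl
from lib.core.fetch import fetch


if __name__ == '__main__':
    url = 'http://www.example.com/'   # placeholder target
    html = fetch(url)
    links = set(crawl(url, html))     # drop duplicate entries
    print 'found %d unique entries' % len(links)
    for link in links:
        print link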