├── .gitignore
├── LICENSE
├── README.md
├── doc
│   └── manual.md
├── lib
│   ├── .DS_Store
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── focus.py
│   │   ├── initializtion.py
│   │   └── logs.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── console.py
│   │   ├── crawl.py
│   │   ├── fetch.py
│   │   ├── rules.py
│   │   ├── scheduling.py
│   │   └── spider.py
│   ├── data
│   │   ├── .DS_Store
│   │   └── allurl.txt
│   ├── server
│   │   ├── __init__.py
│   │   ├── scheduling.py
│   │   └── server.py
│   └── structure
│       ├── GlobalData.py
│       ├── HtmlData.py
│       ├── UrlData.py
│       └── __init__.py
├── mspider.py
├── plugins
│   ├── __init__.py
│   └── phantomjs
│       ├── .gitignore
│       └── .travis.yml
└── tools
    ├── test_crawler.py
    └── test_fetcher.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 |
11 | *.egg-info/
12 | .installed.cfg
13 | *.egg
14 |
15 | # PyInstaller
16 | # Usually these files are written by a python script from a template
17 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
18 | *.manifest
19 | *.spec
20 |
21 | # Installer logs
22 | pip-log.txt
23 | pip-delete-this-directory.txt
24 |
25 | # Unit test / coverage reports
26 | htmlcov/
27 | .tox/
28 | .coverage
29 | .cache
30 | nosetests.xml
31 | coverage.xml
32 |
33 | # Translations
34 | *.mo
35 | *.pot
36 |
37 | # Django stuff:
38 | *.log
39 |
40 | # Sphinx documentation
41 | docs/_build/
42 |
43 | # PyBuilder
44 | target/
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | {description}
294 | Copyright (C) {year} {fullname}
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | {signature of Ty Coon}, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
341 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MSpider
2 |
3 | ## Talk
4 |
5 | The information security department of 360 is recruiting on an ongoing basis; if you are interested, contact zhangxin1[at]360.cn.
6 |
7 | ## Installation
8 |
9 | On Ubuntu, you need to install a few dependencies first.
10 |
11 | You can use pip, easy_install, or apt-get to install them, as sketched below.
12 |
13 | - lxml
14 | - chardet
15 | - splinter
16 | - gevent
17 | - phantomjs
18 |
19 | ## Example
20 |
21 | 1. Use MSpider to collect vulnerability information from wooyun.org.
22 | ```
23 | python mspider.py -u "http://www.wooyun.org/bugs/" --focus-domain "wooyun.org" --filter-keyword "xxx" --focus-keyword "bugs" -t 15 --random-agent true
24 | ```
25 |
26 |
27 | 2. Use MSpider to collect news from news.sina.com.cn.
28 | ```
29 | python mspider.py -u "http://news.sina.com.cn/c/2015-12-20/doc-ifxmszek7395594.shtml" --focus-domain "news.sina.com.cn" -t 15 --random-agent true
30 | ```
31 |
32 | ## ToDo
33 |
34 | 1. Crawling and storage of information.
35 | 2. Distributed crawling.
36 |
37 | ## MSpider's help
38 |
39 | ```
40 | Usage:
41 | __ __ _____ _ _
42 | | \/ |/ ____| (_) | |
43 | | \ / | (___ _ __ _ __| | ___ _ __
44 | | |\/| |\___ \| '_ \| |/ _` |/ _ \ '__|
45 | | | | |____) | |_) | | (_| | __/ |
46 | |_| |_|_____/| .__/|_|\__,_|\___|_|
47 | | |
48 | |_|
49 | Author: Manning23
50 |
51 |
52 | Options:
53 | -h, --help show this help message and exit
54 | -u MSPIDER_URL, --url=MSPIDER_URL
55 | Target URL (e.g. "http://www.site.com/")
56 | -t MSPIDER_THREADS_NUM, --threads=MSPIDER_THREADS_NUM
57 | Max number of concurrent HTTP(s) requests (default 10)
58 | --depth=MSPIDER_DEPTH
59 | Crawling depth
60 | --count=MSPIDER_COUNT
61 | Crawling number
62 | --time=MSPIDER_TIME Crawl time
63 | --referer=MSPIDER_REFERER
64 | HTTP Referer header value
65 | --cookies=MSPIDER_COOKIES
66 | HTTP Cookie header value
67 | --spider-model=MSPIDER_MODEL
68 | Crawling mode: Static_Spider: 0 Dynamic_Spider: 1
69 | Mixed_Spider: 2
70 | --spider-policy=MSPIDER_POLICY
71 | Crawling strategy: Breadth-first 0 Depth-first 1
72 | Random-first 2
73 | --focus-keyword=MSPIDER_FOCUS_KEYWORD
74 | Focus keyword in URL
75 | --filter-keyword=MSPIDER_FILTER_KEYWORD
76 | Filter keyword in URL
77 | --filter-domain=MSPIDER_FILTER_DOMAIN
78 | Filter domain
79 | --focus-domain=MSPIDER_FOCUS_DOMAIN
80 | Focus domain
81 | --random-agent=MSPIDER_AGENT
82 | Use randomly selected HTTP User-Agent header value
83 | --print-all=MSPIDER_PRINT_ALL
84 | Will show more information
85 | ```
86 |
--------------------------------------------------------------------------------
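A minimal install sketch for the dependency list in the README above, assuming Ubuntu with pip available; the exact package names are assumptions and may vary by system (phantomjs is a browser binary, not a pip package):

```
sudo apt-get install phantomjs        # or fetch the binary from phantomjs.org
pip install lxml chardet splinter gevent
```
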
/doc/manual.md:
--------------------------------------------------------------------------------
1 | # MSpider
2 |
3 | MSpider is a pure web crawler; you can use it to collect all kinds of information.
4 |
5 |
6 | ## Installation
7 |
8 | On Ubuntu, you need to install a few dependencies first.
9 |
10 | You can use pip, easy_install, or apt-get to install them.
11 |
12 | - lxml
13 | - chardet
14 | - splinter
15 | - gevent
16 | - phantomjs
17 |
18 | ## Example
19 |
20 | 1. Use MSpider to collect vulnerability information from wooyun.org.
21 | ```
22 | python mspider.py -u "http://www.wooyun.org/bugs/" --focus-domain "wooyun.org" --filter-keyword "xxx" --focus-keyword "bugs" -t 15 --random-agent true
23 | ```
24 |
25 |
26 | 2. Use MSpider to collect news from news.sina.com.cn.
27 | ```
28 | python mspider.py -u "http://news.sina.com.cn/c/2015-12-20/doc-ifxmszek7395594.shtml" --focus-domain "news.sina.com.cn" -t 15 --random-agent true
29 | ```
30 | ## ToDo
31 |
32 | 1. Crawling and storage of information.
33 | 2. Distributed crawling.
--------------------------------------------------------------------------------
/lib/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/.DS_Store
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/__init__.py
--------------------------------------------------------------------------------
/lib/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/common/__init__.py
--------------------------------------------------------------------------------
/lib/common/common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | '''
6 | Common helper functions for MSpider.
7 | '''
8 |
9 | import sys
10 | import time
11 | import urlparse
12 |
13 | def get_absolute_path():
14 |     '''
15 |     Return the absolute path of the MSpider root directory.
16 |     '''
17 | path = sys.path[0]
18 | path = path.split('MSpider')[0] + "MSpider/"
19 | return path
20 |
21 |
22 |
23 | def is_netloc(url):
24 |     '''
25 |     Return True if the url is a bare domain (netloc only, with no real path or query), e.g.
26 |     urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
27 |     ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',params='', query='', fragment='')
28 |     '''
29 | parse_result = urlparse.urlparse(url)
30 | if len(parse_result[1]) > 0 and len(parse_result[2]) <= 1 and len(parse_result[4]) == 0:
31 | return True
32 | else:
33 | return False
34 |
35 | def get_netloc(url):
36 |     '''
37 |     Return the netloc (host) portion of the url.
38 |     '''
39 | return urlparse.urlparse(url)[1]
40 |
41 | def is_ipv4_address(ip_str):
42 |     '''
43 |     Check whether ip_str is a usable IPv4 address; private 10.* and 192.* addresses are rejected.
44 |     '''
45 | if len(ip_str.split('.')) != 4:
46 | return False
47 |
48 | for i in ip_str.split('.'):
49 | try:
50 | int(i)
51 |             if int(i) > 255 or int(i) < 0:
52 | return False
53 | except Exception as e:
54 | return False
55 | if ip_str.startswith('192.'):
56 | return False
57 | if ip_str.startswith('10.'):
58 | return False
59 | return True
60 |
61 | def timestamp():
62 | return str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
63 |
--------------------------------------------------------------------------------
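A quick behavioral sketch of the helpers in lib/common/common.py above; the sample URLs are illustrative only, and the snippet assumes Python 2 run from the MSpider repository root:

```
from lib.common.common import is_netloc, get_netloc, is_ipv4_address

print is_netloc('http://www.cwi.nl:80')                 # True: bare netloc, no path or query
print is_netloc('http://www.cwi.nl/%7Eguido/a.html')    # False: carries a path
print get_netloc('http://news.sina.com.cn/c/x.shtml')   # 'news.sina.com.cn'
print is_ipv4_address('8.8.8.8')                        # True
print is_ipv4_address('10.0.0.1')                       # False: 10.* and 192.* are rejected
```
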
/lib/common/focus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider 任务初始化
7 | """
8 | import urlparse
9 |
10 | def get_focus_info(url):
11 | if url.startswith('http'):
12 | netloc = urlparse.urlparse(url)[1]
13 | info = '.'.join(netloc.split('.')[1:])
14 | return info
15 | else:
16 | return url
17 |
18 |
19 | def focus_domain(spider_global_variable):
20 | if len(spider_global_variable.focus_domain) == 0 and len(spider_global_variable.start_url) > 0:
21 | for i in spider_global_variable.start_url:
22 | spider_global_variable.focus_domain.append(get_focus_info(i))
23 |
--------------------------------------------------------------------------------
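When no --focus-domain is supplied, focus.py derives one per start URL by dropping the first label of the netloc. A minimal sketch:

```
from lib.common.focus import get_focus_info

print get_focus_info('http://www.wooyun.org/bugs/')  # 'wooyun.org'
print get_focus_info('wooyun.org')                   # returned unchanged: no http scheme
```
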
/lib/common/initializtion.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-18
5 | import urlparse
6 | from common import get_absolute_path
7 |
8 | def check_word_has_not_meaning(word):
9 |     '''
10 |     Return True for tokens that look machine-generated: longer than 3 characters and mixing digits with non-digit characters.
11 |     '''
12 | has_number = False
13 | has_letter = False
14 |
15 | for i in xrange(10):
16 | if str(i) in word:
17 | has_number = True
18 | break
19 | try:
20 | int(word)
21 | except Exception as e:
22 | has_letter = True
23 |
24 | if len(word) > 3 and has_letter and has_number :
25 | return True
26 | else:
27 | return False
28 |
29 |
30 | def set_domain(strs):
31 |     '''
32 |     Formats this function can handle:
33 |     1. http://abc.baidu.com/asdas
34 |     2. abc.baidu.com
35 |     3. 1.1.1.1 (returns '')
36 |     '''
37 | host = ''
38 | domain = ''
39 | if 'http://' in strs:
40 | host = urlparse.urlparse(strs)[1].split(':')[0]
41 | else:
42 | host = strs
43 | keyword_list = host.split('.')
44 | if len(keyword_list) == 2:
45 | domain = host
46 |
47 | elif len(keyword_list) == 3:
48 | if 'com.cn' in host:
49 | domain = host
50 | elif 'net.cn' in host:
51 | domain = host
52 | else:
53 | domain = '.'.join(host.split('.')[1:])
54 |
55 | elif len(keyword_list) > 3:
56 | count = 0
57 | for i in keyword_list:
58 | try:
59 | int(i)
60 | count += 1
61 | except Exception, e:
62 | break
63 | if count == 4:
64 | domain = ''
65 | else:
66 | if keyword_list[-1] == 'cn' and keyword_list[-2] in ['com', 'edu', 'gov', 'org', 'net']:
67 | domain = '.'.join(keyword_list[-3:])
68 | elif keyword_list[-1] in ['com', 'net', 'org','cc','me']:
69 | domain = '.'.join(keyword_list[-2:])
70 | elif keyword_list[-1] == 'cn':
71 | domain = '.'.join(keyword_list[-2:])
72 | else:
73 | domain = host
74 | return domain
75 |
76 |
77 | def deal_url(start_urls):
78 | temp_url_list = start_urls.split(',')
79 | total_url_list = []
80 | url_list = []
81 | addr = get_absolute_path() + 'lib/data/allurl.txt'
82 | for i in open(addr).readlines():
83 |         # trim trailing CR/LF; skip blank lines (indexing i[-1] on '' would raise IndexError)
84 |         while len(i) > 0 and i[-1] in ['\r', '\n']:
85 |             i = i[:-1]
86 |         if len(i) == 0:
87 |             continue
88 |         url = i
89 | if not url.startswith('http://'):
90 | url = 'http://' + url
91 |
92 | if url.endswith('/'):
93 | url = url[:-1]
94 | total_url_list.append(url)
95 |
96 | for i in temp_url_list:
97 | if i.startswith('http://'):
98 | url_list.append(i)
99 | else:
100 | if i.endswith('/'):
101 | url = i[:-1]
102 | else:
103 | url = i
104 | for j in total_url_list:
105 | keyword_j = set_domain(j)
106 | if url in keyword_j:
107 | url_list.append(j)
108 | url_list = sorted(list(set(url_list)))
109 | new_list = []
110 | for i in url_list:
111 | netloc = urlparse.urlparse(i)[1]
112 | netloc_list = netloc.split('.')
113 | if len(netloc_list) == 3:
114 | if len(netloc_list[0]) > 10:
115 | continue
116 | else:
117 | new_list.append(i)
118 | elif len(netloc_list) == 4:
119 | if check_word_has_not_meaning(netloc_list[0]):
120 | continue
121 | else:
122 | new_list.append(i)
123 | elif len(netloc_list) == 5:
124 | if check_word_has_not_meaning(netloc_list[0]):
125 | continue
126 | elif check_word_has_not_meaning(netloc_list[1]):
127 | continue
128 | else:
129 | new_list.append(i)
130 |
131 | return new_list
132 |
133 |
134 | def deal_common_strs(words):
135 | if len(words) == 0:
136 | return []
137 | else:
138 | return words.split(',')
139 |
140 | def deal_strs(words):
141 | if len(words) == 0:
142 | return ''
143 | else:
144 | return words
145 |
146 | def deal_common_int(num):
147 | num = str(num).split('.')[0]
148 | try:
149 | int(num)
150 | except Exception, e:
151 | raise e
152 | return int(num)
153 |
154 |
155 | def deal_common_boolean(boolean):
156 | boolean = str(boolean).lower()
157 | if boolean == 'true':
158 | return True
159 | elif boolean == '1':
160 | return True
161 | elif boolean == '0':
162 | return False
163 | else:
164 | return False
165 |
166 |
167 |
168 | def init_dict(options):
169 | variable_dict = {
170 | "start_url": deal_url(options.mspider_url),
171 |
172 | "threads": deal_common_int(options.mspider_threads_num),
173 | "depth": deal_common_int(options.mspider_depth),
174 | "count": deal_common_int(options.mspider_count),
175 | "time": deal_common_int(options.mspider_time),
176 | 'referer': options.mspider_referer,
177 | 'cookies': options.mspider_cookies,
178 |
179 | "spider_model": deal_common_int(options.mspider_model),
180 | "spider_policy": deal_common_int(options.mspider_policy),
181 |
182 | "focus_keyword": deal_common_strs(options.mspider_focus_keyword),
183 | "filter_keyword": deal_common_strs(options.mspider_filter_keyword),
184 | "focus_domain": deal_common_strs(options.mspider_focus_domain),
185 | "filter_domain": deal_common_strs(options.mspider_filter_domain),
186 |
187 | "random_agent": deal_common_boolean(options.mspider_agent),
188 | 'print_all': deal_common_boolean(options.mspider_print_all),
189 |
190 | }
191 | return variable_dict
192 |
--------------------------------------------------------------------------------
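A sketch of set_domain's registered-domain heuristic with illustrative inputs (Python 2, run from the repository root):

```
from lib.common.initializtion import set_domain

print set_domain('http://abc.baidu.com/asdas')  # 'baidu.com'
print set_domain('www.example.com.cn')          # 'example.com.cn': com.cn keeps three labels
print set_domain('1.1.1.1')                     # '': bare IPv4 addresses yield no domain
```
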
/lib/common/logs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider 日志记录
7 | """
8 | import logging
9 | import sys
10 |
11 | def init_spider_log(spider_global_variable):
12 |
13 |     '''
14 |     Log messages follow this format:
15 |     Function: init_spider_log, Info: xxx
16 |     '''
17 |
18 | spider_logger = logging.getLogger('MSpiderLogs')
19 | spider_logger.setLevel(logging.DEBUG)
20 |
21 | console_handler = logging.StreamHandler()
22 | console_handler.setLevel(logging.DEBUG)
23 |
24 | formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
25 |
26 | console_handler.setFormatter(formatter)
27 |
28 | spider_logger.addHandler(console_handler)
29 |
30 | spider_global_variable.spider_logger = spider_logger
31 |     spider_logger.info("Welcome to MSpider !!!")
32 | spider_logger.info("---------------------------")
33 |
--------------------------------------------------------------------------------
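init_spider_log only needs an object it can hang a spider_logger attribute on; a minimal sketch with a stand-in object:

```
from lib.common.logs import init_spider_log

class FakeGlobal(object):          # stand-in for MSpiderGlobalVariable
    spider_logger = None

g = FakeGlobal()
init_spider_log(g)
g.spider_logger.info('Function: demo, Info: logger wired up')
```
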
/lib/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/core/__init__.py
--------------------------------------------------------------------------------
/lib/core/console.py:
--------------------------------------------------------------------------------
1 | """ getTerminalSize()
2 | - get width and height of console
3 | - works on linux,os x,windows,cygwin(windows)
4 | """
5 |
6 | __all__=['getTerminalSize']
7 |
8 |
9 | def getTerminalSize():
10 | import platform
11 | current_os = platform.system()
12 | tuple_xy=None
13 | if current_os == 'Windows':
14 | tuple_xy = _getTerminalSize_windows()
15 | if tuple_xy is None:
16 | tuple_xy = _getTerminalSize_tput()
17 | # needed for window's python in cygwin's xterm!
18 | if current_os == 'Linux' or current_os == 'Darwin' or current_os.startswith('CYGWIN'):
19 | tuple_xy = _getTerminalSize_linux()
20 | if tuple_xy is None:
21 | print "default"
22 | tuple_xy = (80, 25) # default value
23 | return tuple_xy
24 |
25 | def _getTerminalSize_windows():
26 | res=None
27 | try:
28 | from ctypes import windll, create_string_buffer
29 |
30 | # stdin handle is -10
31 | # stdout handle is -11
32 | # stderr handle is -12
33 |
34 | h = windll.kernel32.GetStdHandle(-12)
35 | csbi = create_string_buffer(22)
36 | res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
37 | except:
38 | return None
39 | if res:
40 | import struct
41 | (bufx, bufy, curx, cury, wattr,
42 | left, top, right, bottom, maxx, maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
43 | sizex = right - left + 1
44 | sizey = bottom - top + 1
45 | return sizex, sizey
46 | else:
47 | return None
48 |
49 | def _getTerminalSize_tput():
50 | # get terminal width
51 | # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window
52 | try:
53 | import subprocess
54 | proc=subprocess.Popen(["tput", "cols"],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
55 | output=proc.communicate(input=None)
56 | cols=int(output[0])
57 | proc=subprocess.Popen(["tput", "lines"],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
58 | output=proc.communicate(input=None)
59 | rows=int(output[0])
60 | return (cols,rows)
61 | except:
62 | return None
63 |
64 |
65 | def _getTerminalSize_linux():
66 |     import os, fcntl, termios, struct  # needed in this scope, not only inside ioctl_GWINSZ
67 |     def ioctl_GWINSZ(fd):
68 |         try:
69 | cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ,'1234'))
70 | except:
71 | return None
72 | return cr
73 | cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
74 | if not cr:
75 | try:
76 | fd = os.open(os.ctermid(), os.O_RDONLY)
77 | cr = ioctl_GWINSZ(fd)
78 | os.close(fd)
79 | except:
80 | pass
81 | if not cr:
82 | try:
83 |             cr = (os.environ['LINES'], os.environ['COLUMNS'])
84 | except:
85 | return None
86 | return int(cr[1]), int(cr[0])
87 |
88 | if __name__ == "__main__":
89 | sizex,sizey=getTerminalSize()
90 | print 'width =',sizex,'height =',sizey
91 |
--------------------------------------------------------------------------------
/lib/core/crawl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | '''
6 | How links are extracted from fetched HTML.
7 | '''
8 | import lxml.html
9 | import sys
10 | reload(sys)
11 | sys.setdefaultencoding("utf-8")
12 |
13 | import urlparse
14 | import chardet
15 | import urllib2
16 | try:
17 | import re2 as re
18 | except ImportError:
19 | import re
20 |
21 | import random
22 | import time
23 | from fetch import fetch
24 |
25 | import logging
26 | spider_logger = logging.getLogger('MSpiderLogs')
27 |
28 |
29 | def get_url_by_lxml(url,html):
30 | try:
31 | if '.js' in urlparse.urlparse(url)[2]:
32 | return []
33 | tmp = lxml.html.document_fromstring(urllib2.unquote(html))
34 | tmp.make_links_absolute(url)
35 | links = tmp.iterlinks()
36 |
37 | links = [i[2] for i in links]
38 | return links
39 | except Exception as e:
40 | msg = 'Function: get_url_by_lxml, Info: ' + str(e)
41 | spider_logger.error(msg)
42 | return []
43 |
44 | def check_suffix(url):
45 | ignore_ext = ['wma', 'png', 'jpeg', 'jpg']
46 | suffix = urlparse.urlparse(url)[2].split('.')[-1].lower()
47 | if suffix in ignore_ext:
48 | return False
49 | else:
50 | return True
51 |
52 | def check_keyword(domain):
53 |     i = domain
54 | if i.startswith('javascript:'):
55 | return False
56 | if i.startswith('about:'):
57 | return False
58 | return True
59 |
60 | def modify_url(url):
61 | i = url
62 | if '/' not in i and '?' not in i:
63 | i = i + '/'
64 | i = 'http://' + i
65 | return i
66 |
67 |
68 | def crawl(url,html):
69 | if len(html) < 10:
70 | return []
71 | link_set = set()
72 | _ = [link_set.add(i) for i in get_url_by_lxml(url,html) if check_keyword(i)]
73 | get_link_list = [i for i in list(link_set) if check_suffix(i)]
74 |
75 | links = []
76 |
77 | for i in get_link_list:
78 | data = modify_url_to_structure(i)
79 | links.append(data)
80 |
81 | return links
82 |
83 |
84 | def modify_url_to_structure(url):
85 | method = 'get'
86 | return (method,url,'')
87 |
--------------------------------------------------------------------------------
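A sketch of crawl() on a tiny hand-written page (it assumes the README dependencies are installed, since crawl.py imports fetch.py): each extracted link comes back as a (method, url, data) tuple with the href made absolute against the page URL.

```
from lib.core.crawl import crawl

html = '<html><body><a href="/bugs/wooyun-2015-0100001">bug</a></body></html>'
for method, url, data in crawl('http://www.wooyun.org/', html):
    print method, url   # get http://www.wooyun.org/bugs/wooyun-2015-0100001
```
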
/lib/core/fetch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | '''
6 | About how to get html.
7 | '''
8 |
9 | import requests
10 | import urlparse
11 | import time
12 | import random
13 | import urllib2
14 | from splinter import Browser
15 |
16 | import sys
17 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib")
18 |
19 | import logging
20 | spider_logger = logging.getLogger('MSpiderLogs')
21 |
22 |
23 | def html_pretreatment(html):
24 | html = html.lower()
25 | html = urllib2.unquote(html)
26 | return html
27 |
28 |
29 | def fetch(url, spider_model=0, fetch_time_interval=1, set_random_agent=True, set_referer=False, set_cookies=False):
30 | try:
31 |         spider_model = int(spider_model)
32 |         fetch_time_interval = int(fetch_time_interval)
33 |         random_agent = set_random_agent
34 | except Exception, e:
35 | spider_model = 0
36 | fetch_time_interval = 1
37 | random_agent = False
38 |
39 | myheaders = dict()
40 |     if random_agent:
41 |         myheaders['User-Agent'] = random_http_header()
42 |     else:
43 |         myheaders['User-Agent'] = 'MSpider'
44 |
45 | if set_referer:
46 | myheaders['Referer'] = set_referer
47 |
48 | if set_cookies:
49 | myheaders['Cookie'] = set_cookies
50 |
51 | returnhtml = ''
52 |
53 | if spider_model == 0:
54 | # Static Model
55 | try:
56 | response = requests.get(url, timeout=15, headers=myheaders, allow_redirects=False)
57 | if response.status_code == 200:
58 | returnhtml = response.content
59 | else:
60 | return ""
61 | except Exception, e:
62 | msg = 'Function: fetch_0, Info: ' + str(e)
63 | spider_logger.error(msg)
64 | return ""
65 | elif spider_model == 1:
66 | # Dynamic Model
67 | try:
68 | browser = Browser(driver_name='phantomjs', user_agent=myheaders['User-Agent'], load_images=False)
69 | browser.visit(url)
70 | html = browser.html
71 | browser.quit()
72 | returnhtml = html
73 | except Exception, e:
74 | msg = 'Function: fetch_1, Info: ' + str(e)
75 | spider_logger.error(msg)
76 | return ""
77 | else:
78 | return ""
79 |
80 | if len(returnhtml) < 10:
81 | return ''
82 |
83 |     html = html_pretreatment(returnhtml).decode('gb2312', 'ignore')  # assumes gb2312-ish content; undecodable bytes are dropped
84 |     time.sleep(fetch_time_interval)  # pause between fetches
85 |
86 | return html
87 |
88 |
89 | def random_http_header():
90 | user_agents = [
91 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
92 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",
93 | ]
94 | return random.choice(user_agents)
95 |
--------------------------------------------------------------------------------
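A minimal fetch sketch (static model, network access required); an empty string signals any failure:

```
from lib.core.fetch import fetch

html = fetch('http://news.sina.com.cn/', spider_model=0,
             fetch_time_interval=1, set_random_agent=True)
print len(html)   # 0 on error, otherwise the length of the pre-processed page
```
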
/lib/core/rules.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf-8
3 | # manning 2015-1-27
4 | import time
5 | import re
6 | import urlparse
7 | import sys
8 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib")
9 |
10 | import logging
11 | spider_logger = logging.getLogger('MSpiderLogs')
12 |
13 | class UrlRuleClass(object):
14 |
15 |     """De-duplicates URLs and applies the focus/filter domain and keyword rules."""
16 |
17 | def __init__(self, SpiderGlobalVariable):
18 | super(UrlRuleClass, self).__init__()
19 | self.url_repeat_set = set()
20 | self.url = ''
21 | self.spiderglobal = SpiderGlobalVariable
22 |
23 | def check_repeat(self,url):
24 | if url not in self.url_repeat_set:
25 | self.url_repeat_set.add(url)
26 | return True
27 | return False
28 |
29 |
30 | def focus_domain(self,url):
31 | if len(self.spiderglobal.focus_domain) == 0:
32 | return True
33 | t = urlparse.urlparse(url)[1]
34 | for i in self.spiderglobal.focus_domain:
35 | if i in t:
36 | return True
37 | return False
38 |
39 | def filter_domain(self,url):
40 | t = urlparse.urlparse(url)[1]
41 | for i in self.spiderglobal.filter_domain:
42 | if i in t:
43 | return False
44 | return True
45 |
46 | def focus_keyword(self,url):
47 | if len(self.spiderglobal.focus_keyword) == 0:
48 | return True
49 | for i in self.spiderglobal.focus_keyword:
50 | if i in url:
51 | return True
52 | return False
53 |
54 | def filter_keyword(self,url):
55 | if len(self.spiderglobal.filter_keyword) == 0:
56 | return True
57 | for i in self.spiderglobal.filter_keyword:
58 | if i in url:
59 | return False
60 | return True
61 |
62 | def check_filter_and_focus(self,url):
63 | if self.focus_domain(url) and self.filter_domain(url) and self.focus_keyword(url) and self.filter_keyword(url):
64 | return True
65 | return False
66 |
67 | def check_url(self, url):
68 | if self.check_repeat(url) and self.check_filter_and_focus(url):
69 | return True
70 | else:
71 | return False
72 |
--------------------------------------------------------------------------------
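A sketch of the rule chain using a stand-in for the global-variable object; only the four focus/filter lists are consulted, and the values mirror the README's wooyun example:

```
from lib.core.rules import UrlRuleClass

class FakeGlobal(object):              # stand-in for MSpiderGlobalVariable
    focus_domain = ['wooyun.org']
    filter_domain = []
    focus_keyword = ['bugs']
    filter_keyword = ['xxx']

rule = UrlRuleClass(FakeGlobal())
print rule.check_url('http://www.wooyun.org/bugs/1')    # True
print rule.check_url('http://www.wooyun.org/bugs/1')    # False: already seen
print rule.check_url('http://www.wooyun.org/xxx/bugs')  # False: filter keyword hit
```
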
/lib/core/scheduling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider 全局调度
7 | """
8 |
9 | import random
10 | import time
11 | import sys
12 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib")
13 |
14 | from crawl import crawl
15 | from structure.UrlData import UrlNode
16 | from common.common import is_netloc
17 |
18 | def exit_condition(SpiderGlobalVariable):
19 |     # Scheduler exit test: stay alive while within the time budget, with workers still running, under the crawl count
20 |     if time.time() - SpiderGlobalVariable.start_time < SpiderGlobalVariable.time:
21 | if SpiderGlobalVariable.exit_flag_count < SpiderGlobalVariable.threads:
22 | if SpiderGlobalVariable.total_count < SpiderGlobalVariable.count:
23 | return True
24 | return False
25 |
26 |
27 | def init_urlnode(start_urls_list,UrlRule):
28 | nodelist = []
29 | for i in start_urls_list:
30 | if UrlRule.check_url(i):
31 | tmpnode = UrlNode(i, '', -1)
32 | nodelist.append(tmpnode)
33 | return nodelist
34 |
35 |
36 | def spider_scheduling(SpiderGlobalVariable,UrlRule):
37 |     '''
38 |     Seed the global queue with the start URLs, then feed links crawled out of fetched pages back in, subject to depth and URL rules.
39 |     '''
40 | for i in init_urlnode(SpiderGlobalVariable.start_url,UrlRule):
41 | SpiderGlobalVariable.global_urlnode_queue.put((0,i))
42 |
43 | while exit_condition(SpiderGlobalVariable):
44 | if SpiderGlobalVariable.htmlnode_queue.qsize() > 0:
45 | html_node = SpiderGlobalVariable.htmlnode_queue.get()
46 | linklist = crawl(html_node.url, html_node.html)
47 | for i in linklist:
48 | url = i[1]
49 | method = i[0]
50 | data = i[2]
51 | depth = html_node.depth
52 | referer = html_node.url
53 | i = UrlNode(url, referer, depth, method, data)
54 |
55 |             if i.depth <= SpiderGlobalVariable.depth and UrlRule.check_url(i.check_url):
56 |                 if is_netloc(i.url):
57 |                     SpiderGlobalVariable.global_urlnode_queue.put((0, i))  # bare domains get top priority
58 |                 else:
59 |                     SpiderGlobalVariable.global_urlnode_queue.put((random.randint(1, 5), i))  # random priority feeds the Random-first policy
60 |
61 | else:
62 | SpiderGlobalVariable.refuse_count += 1
63 |
--------------------------------------------------------------------------------
/lib/core/spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | '''
6 | MSpider crawler worker thread
7 |
8 | 2015-05-20
9 |     Adjusted the output style, borrowing some code from @lijiejie.
10 |
11 | 2015-03-28
12 |     Crawl policies:
13 |     0. breadth-first (default)
14 |     1. depth-first
15 |     2. random-first
16 |
17 | 2015-03-27
18 |     Added a data queue and a dedicated thread that writes to a database.
19 |     The database is sqlite; mysql, sql server and others are planned.
20 |
21 | 2015-03-26
22 |     Added depth control.
23 |
24 | 2015-03-08
25 |     Server exit conditions:
26 |     1. crawl time exceeded
27 |     2. no crawler threads left (the crawl may have finished)
28 |     3. depth limit exceeded
29 |     4. crawl count exceeded
30 |
31 |     Thread exit condition:
32 |     a thread exits if it has had no work for 5 minutes
33 |
34 | '''
35 | import time
36 | import sys
37 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider/lib")
38 | from console import getTerminalSize
39 | from fetch import fetch
40 | from common.common import timestamp
41 | from structure.HtmlData import HtmlNode
42 |
43 | import logging
44 | spider_logger = logging.getLogger('MSpiderLogs')
45 |
46 | def spider(SpiderGlobalVariable):
47 | if SpiderGlobalVariable.spider_use_gevent:
48 | import gevent
49 | while True:
50 | if SpiderGlobalVariable.spider_urlnode_queue.qsize() > 0:
51 | _,node = SpiderGlobalVariable.spider_urlnode_queue.get()
52 | html = fetch(node.url, SpiderGlobalVariable.spider_model, SpiderGlobalVariable.fetch_time_interval, SpiderGlobalVariable.random_agent)
53 |         if len(html) < 10:
54 |             continue  # fetch failed or page too small: do not queue it
55 | html_node = HtmlNode(node.url, html, timestamp(), node.depth)
56 | SpiderGlobalVariable.htmlnode_queue.put(html_node)
57 | SpiderGlobalVariable.total_count += 1
58 |
59 | if SpiderGlobalVariable.print_all:
60 | msg = "[Url] %s Depth: %s Found: %s Remaining: %s Html: %s"% (node.url, str(node.depth), str(SpiderGlobalVariable.total_count), str(SpiderGlobalVariable.spider_urlnode_queue.qsize()), str(len(html)))
61 | spider_logger.info(msg)
62 |
63 | else:
64 | msg = "[Url] %s Depth: %s Found: %s Remaining: %s Html: %s" % (node.url, str(node.depth), str(SpiderGlobalVariable.total_count), str(SpiderGlobalVariable.spider_urlnode_queue.qsize()), str(len(html)))
65 | console_width = getTerminalSize()[0] - 0
66 | if len(msg) - console_width > 0:
67 | msg = msg[:console_width]
68 | sys.stdout.write('\r' + msg)
69 | sys.stdout.flush()
70 | else:
71 | sys.stdout.write('\r' + msg + ' ' * (console_width - len(msg)))
72 | sys.stdout.flush()
73 | if SpiderGlobalVariable.spider_use_gevent:
74 | gevent.sleep(0)
75 | else:
76 | if SpiderGlobalVariable.spider_use_gevent:
77 | gevent.sleep(0)
78 | else:
79 | time.sleep(5)
80 | SpiderGlobalVariable.exit_flag_count += 1
81 |
--------------------------------------------------------------------------------
/lib/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/data/.DS_Store
--------------------------------------------------------------------------------
/lib/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/server/__init__.py
--------------------------------------------------------------------------------
/lib/server/scheduling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider global_scheduling
7 | """
8 | import time
9 | import logging
10 | spider_logger = logging.getLogger('MSpiderLogs')
11 |
12 | def global_scheduling(spider_global_variable):
13 | while True:
14 | if spider_global_variable.global_urlnode_queue.qsize() > 0:
15 | node = spider_global_variable.global_urlnode_queue.get()
16 | spider_global_variable.spider_urlnode_queue.put(node)
17 |
18 |         '''
19 |         You can add your own logic here: global_scheduling sees every
20 |         url_node that passes through, and the url_node structure is
21 |         defined in structure/UrlData.py. A sketch follows this file.
22 |         '''
23 |
--------------------------------------------------------------------------------
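One possible customization, as the docstring above suggests; this variant is an assumption, not shipped behavior, and drops URLs carrying a query string before they reach the worker queue:

```
import urlparse

def global_scheduling_no_queries(spider_global_variable):
    # Same loop shape as global_scheduling above, with one extra filter.
    while True:
        if spider_global_variable.global_urlnode_queue.qsize() > 0:
            priority, url_node = spider_global_variable.global_urlnode_queue.get()
            if urlparse.urlparse(url_node.url)[4]:
                continue   # skip URLs that carry a query string
            spider_global_variable.spider_urlnode_queue.put((priority, url_node))
```
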
/lib/server/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 |
6 | import time
7 | import sys
8 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
9 | import threading
10 | import logging
11 | from lib.core.rules import UrlRuleClass
12 | from lib.core.scheduling import spider_scheduling
13 | from lib.core.spider import spider
14 | from lib.structure.GlobalData import MSpiderGlobalVariable
15 | from scheduling import global_scheduling
16 |
17 | spider_logger = logging.getLogger('MSpiderLogs')
18 |
19 | def global_server(spider_global_variable):
20 |     # build the URL-filtering rules from the global configuration
21 | url_rule = UrlRuleClass(spider_global_variable)
22 |
23 | threads_list = []
24 | spider_threads = []
25 |
26 | threads_list.append(threading.Thread(target=spider_scheduling, args=(spider_global_variable, url_rule,)))
27 | threads_list.append(threading.Thread(target=global_scheduling, args=(spider_global_variable,)))
28 |
29 | for t in threads_list:
30 | t.setDaemon(True)
31 | t.start()
32 |
33 | if spider_global_variable.spider_use_gevent:
34 | import gevent
35 | from gevent import monkey
36 | monkey.patch_all(thread=False)
37 | for i in xrange(spider_global_variable.threads):
38 | spider_threads.append(gevent.spawn(spider, spider_global_variable))
39 | gevent.joinall(spider_threads)
40 | else:
41 | for i in xrange(spider_global_variable.threads):
42 | spider_threads.append(threading.Thread(target=spider, args=(spider_global_variable,)))
43 | for t in spider_threads:
44 | t.setDaemon(True)
45 | t.start()
46 |
47 |
48 |     time.sleep(120)  # grace period while the first fetches fill the queues
49 |     while True:
50 |         if spider_global_variable.spider_urlnode_queue.qsize() == 0:
51 |             spider_logger.critical('MSpider wait to exit!!')
52 |             time.sleep(120)
53 |             if spider_global_variable.spider_urlnode_queue.qsize() == 0:
54 |                 pass  # still empty after two more minutes: fall through and exit
55 |             else:
56 |                 continue  # new work arrived; keep serving
57 |             spider_global_variable.end_ctime = time.ctime()
58 |             time.sleep(120)
59 |             spider_logger.critical('MSpider exit!!')
60 |             sys.exit(0)
61 |         else:
62 |             time.sleep(10)
63 |
--------------------------------------------------------------------------------
/lib/structure/GlobalData.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider 全局变量
7 | """
8 |
9 | import Queue
10 | import urlparse
11 | import time
12 |
13 | class MSpiderGlobalVariable(object):
14 | def __init__(self, variable_dict):
15 | self.variable_dict = variable_dict
16 |
17 | self.start_url = ["http://www.baidu.com"]
18 | self.focus_keyword = []
19 | self.filter_keyword = []
20 | self.focus_domain = []
21 | self.filter_domain = []
22 |
23 | self.threads = 10
24 | self.spider_use_gevent = False
25 | self.depth = 10
26 | self.count = 1000
27 | self.time = 24 * 3600
28 | self.referer = ''
29 | self.cookies = ''
30 | self.spider_model = 0
31 | self.spider_policy = 0
32 |
33 | self.random_agent = False
34 | self.print_all = True
35 |
36 |
37 | self.ignore_ext = []
38 | self.spider_proxy = True
39 | self.spider_proxy_ip_pool = []
40 | self.download_rate = 50
41 | self.fetch_time_interval = 5
42 |
43 |         '''
44 |         Global control state
45 |         '''
46 |
47 | self.exit_flag_count = 0
48 | self.global_urlnode_queue = Queue.Queue()
49 | self.global_unfocus_urlnode_queue = Queue.Queue()
50 | self.spider_urlnode_queue = None
51 | self.htmlnode_queue = Queue.Queue()
52 | self.store_queue = Queue.Queue()
53 | self.parse_variable_dict()
54 | self.set_urlnode_queue()
55 |
56 | self.spider_logger = None
57 |
58 |         '''
59 |         Crawl task counters and timing
60 |         '''
61 | self.total_count = 0
62 | self.refuse_count = 0
63 |
64 | self.start_time = time.time()
65 | self.end_time = None
66 | self.start_ctime = time.ctime()
67 | self.end_ctime = None
68 | self.maintain_time = None
69 |
70 | self.task_name = None
71 |
72 |
73 |
74 | def set_urlnode_queue(self):
75 | if self.spider_policy == 1:
76 | self.spider_urlnode_queue = Queue.LifoQueue()
77 | elif self.spider_policy == 2:
78 | self.spider_urlnode_queue = Queue.PriorityQueue()
79 | else:
80 | self.spider_urlnode_queue = Queue.Queue()
81 |
82 | def parse_variable_dict(self):
83 | self.start_url = self.variable_dict['start_url']
84 | self.focus_keyword = self.variable_dict['focus_keyword']
85 | self.filter_keyword = self.variable_dict['filter_keyword']
86 | self.focus_domain = self.variable_dict['focus_domain']
87 | self.filter_domain = self.variable_dict['filter_domain']
88 | self.threads = self.variable_dict['threads']
89 | self.depth = self.variable_dict['depth']
90 | self.count = self.variable_dict['count']
91 | self.time = self.variable_dict['time']
92 | self.referer = self.variable_dict['referer']
93 | self.cookies = self.variable_dict['cookies']
94 | self.spider_model = self.variable_dict['spider_model']
95 | self.spider_policy = self.variable_dict['spider_policy']
96 | self.random_agent = self.variable_dict['random_agent']
97 | self.print_all = self.variable_dict['print_all']
98 |
--------------------------------------------------------------------------------
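set_urlnode_queue is what turns --spider-policy into crawl order: a FIFO queue gives breadth-first, a LIFO queue depth-first, and a priority queue (fed random priorities by core/scheduling.py) gives random-first. A minimal sketch of why the priority tuples work:

```
import Queue

q = Queue.PriorityQueue()            # the policy-2 queue type
q.put((5, 'random-priority node'))
q.put((0, 'bare-domain node'))
print q.get()[1]   # 'bare-domain node': lower priority numbers come out first
```
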
/lib/structure/HtmlData.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider HtmlNode结点类
7 | """
8 | class HtmlNode(object):
9 | def __init__(self, url, html, time, depth):
10 | self.url = url
11 | self.html = html
12 | self.time = time
13 | self.depth = depth
14 |
--------------------------------------------------------------------------------
/lib/structure/UrlData.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 |
6 | class UrlNode(object):
7 | def __init__(self, url, referer, depth, method = 'get', data = ''):
8 | self.url = url
9 | self.referer = referer
10 | self.method = method
11 |         self.depth = int(depth) + 1  # one level deeper than the referer node
12 | self.data = data
13 | self.check_url = None
14 | self.init_check_url()
15 |
16 | def show(self):
17 | print self.method
18 | print self.url
19 | print self.data
20 | print '--------------------'
21 |
22 | def init_check_url(self):
23 | self.check_url = self.url
24 |
25 |
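
A brief usage sketch (illustrative values). Note that the constructor stores int(depth) + 1: a UrlNode records the depth of the link target, one level deeper than the page on which the link was found.

    # Illustrative only -- a link found at depth 2 yields a node at depth 3.
    from lib.structure.UrlData import UrlNode

    node = UrlNode('http://www.example.com/a', 'http://www.example.com/', 2)
    print node.depth  # 3
    node.show()       # prints the method, URL and POST data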
--------------------------------------------------------------------------------
/lib/structure/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/lib/structure/__init__.py
--------------------------------------------------------------------------------
/mspider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | """
6 | MSpider 起始文件
7 | """
8 | import optparse
9 | import sys
10 |
11 | from lib.structure.GlobalData import MSpiderGlobalVariable
12 | from lib.common.initializtion import init_dict
13 | from lib.common.logs import init_spider_log
14 | from lib.common.focus import focus_domain
15 | from lib.server.server import global_server
16 |
17 | import logging
18 | spider_logger = logging.getLogger('MSpiderLogs')
19 |
20 | def main():
21 | usage = '''
22 | __ __ _____ _ _
23 | | \/ |/ ____| (_) | |
24 | | \ / | (___ _ __ _ __| | ___ _ __
25 | | |\/| |\___ \| '_ \| |/ _` |/ _ \ '__|
26 | | | | |____) | |_) | | (_| | __/ |
27 | |_| |_|_____/| .__/|_|\__,_|\___|_|
28 | | |
29 | |_|
30 | Author: Manning23
31 | '''
32 | parser = optparse.OptionParser(usage=usage)
33 | parser.add_option("-u", "--url",
34 | dest="mspider_url",
35 | default='http://www.baidu.com',
36 | help='''Target URL (e.g. "http://www.site.com/")''')
37 |
38 | parser.add_option("-t", "--threads",
39 | dest="mspider_threads_num",
40 | default=10,
41 | help="Max number of concurrent HTTP(s) requests (default 10)")
42 |
43 | parser.add_option("--depth",
44 | dest="mspider_depth",
45 | default=1000,
46 | help="Crawling depth")
47 |
48 | parser.add_option("--count",
49 | dest="mspider_count",
50 | default=1000 * 1000,
51 | help="Crawling number")
52 |
53 | parser.add_option("--time",
54 | dest="mspider_time",
55 | default=3600 * 24 * 7,
56 | help="Crawl time")
57 |
58 | parser.add_option("--referer",
59 | dest="mspider_referer",
60 | default='',
61 | help="HTTP Referer header value")
62 |
63 | parser.add_option("--cookies",
64 | dest="mspider_cookies",
65 | default='',
66 | help="HTTP Cookie header value")
67 |
68 | parser.add_option("--spider-model",
69 | dest="mspider_model",
70 | default=0,
71 | help='''Crawling mode: Static_Spider: 0, Dynamic_Spider: 1, Mixed_Spider: 2''')
72 |
73 | parser.add_option("--spider-policy",
74 | dest="mspider_policy",
75 | default=2,
76 | help="Crawling strategy: Breadth-first 0 Depth-first 1 Random-first 2")
77 |
78 | parser.add_option("--focus-keyword",
79 | dest="mspider_focus_keyword",
80 | default='',
81 | help="Focus keyword in URL")
82 |
83 | parser.add_option("--filter-keyword",
84 | dest="mspider_filter_keyword",
85 | default='',
86 | help="Filter keyword in URL")
87 |
88 | parser.add_option("--filter-domain",
89 | dest="mspider_filter_domain",
90 | default='',
91 | help="Filter domain")
92 |
93 | parser.add_option("--focus-domain",
94 | dest="mspider_focus_domain",
95 | default='',
96 | help="Focus domain")
97 |
98 | parser.add_option("--random-agent",
99 | dest="mspider_agent",
100 | default=False,
101 | help="Use randomly selected HTTP User-Agent header value")
102 |
103 | parser.add_option("--print-all",
104 | dest="mspider_print_all",
105 | default=True,
106 | help="Will show more information")
107 |
108 |
109 |
110 |
111 | (options, args) = parser.parse_args()
112 | print usage
113 | variable_dict = init_dict(options)
114 |
115 | spider_global_variable = MSpiderGlobalVariable(variable_dict)
116 |
117 | focus_domain(spider_global_variable)
118 |
119 | init_spider_log(spider_global_variable)
120 |
121 | global_server(spider_global_variable)
122 |
123 |
124 |
125 |
126 | if __name__ == "__main__":
127 | try:
128 | main()
129 | except KeyboardInterrupt:
130 | print '\nBreak out.'
131 | sys.exit()
132 |
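
An example invocation assembled from the options defined above (the target URL and option values are illustrative): crawl breadth-first with 20 threads to a maximum depth of 5.

    python mspider.py -u "http://www.example.com/" -t 20 --depth 5 --spider-policy 0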
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manning23/MSpider/6d8b4556a8e12e54491be6a840d325eab23c637e/plugins/__init__.py
--------------------------------------------------------------------------------
/plugins/phantomjs/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pro.user*
3 | *.xcodeproj
4 | Makefile*
5 | *~
6 | *.moc
7 | moc_*
8 | qrc_*
9 | .qmake.stash
10 | *.o
11 | *.swp
12 | *.pyc
13 | *.a
14 | /debian/*.debhelper
15 | /debian/files
16 | /debian/*.log
17 | /debian/*.substvars
18 | /debian/*/
19 | /deploy/qt-*.tar.gz
20 | /deploy/Qt-*
21 | /symbols
22 | /src/qt/qtc-debugging-helper
23 | /src/phantomjs_plugin_import.cpp
24 |
25 | # ignore ctags
26 | /tags
27 | /tools/dump_syms.app/
28 |
29 | # Ignore Visual Studio temporary files, build results, etc
30 | *.suo
31 | *.user
32 | *.sln.docstates
33 | *_i.c
34 | *_p.c
35 | *.ilk
36 | *.meta
37 | *.obj
38 | *.pch
39 | *.pdb
40 | *.pgc
41 | *.pgd
42 | *.rsp
43 | *.sbr
44 | *.tlb
45 | *.tli
46 | *.tlh
47 | *.tmp
48 | *.log
49 | *.sdf
50 | *.vcxproj
51 | *.vcxproj.filters
52 | *.lib
53 | *.prl
54 | *.intermediate.manifest
55 |
56 | # Build results
57 | [Dd]ebug*/
58 | [Rr]elease/
59 | bin/
60 | *.class
61 | build/
62 | .gradle/
63 |
--------------------------------------------------------------------------------
/plugins/phantomjs/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 | sudo: false
3 | compiler:
4 | - gcc
5 | cache: apt
6 |
7 | addons:
8 | apt:
9 | packages:
10 | - gperf
11 | - libicu-dev
12 | - libssl-dev
13 |
14 | before_script:
15 | - chmod +x ./build.sh
16 | - chmod +x ./test/run-tests.sh
17 | - chmod +x ./test/run-tests-ghostdriver.sh
18 |
19 | script:
20 | - ./build.sh --qtdeps=bundled --confirm --silent #< Build
21 | - ./test/run-tests.sh #< Test (PhantomJS)
22 | - ./test/run-tests-ghostdriver.sh #< Test (GhostDriver / PhantomJSDriver)
23 |
24 | notifications:
25 | irc:
26 | channels:
27 | - "irc.freenode.org#phantomjs"
28 | on_success: always
29 | on_failure: always
30 | use_notice: true
31 |
--------------------------------------------------------------------------------
/tools/test_crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | import sys
6 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
7 | from lib.core.crawl import crawl
8 | from lib.core.fetch import fetch
9 |
10 |
11 | if __name__ == '__main__':
12 | url = 'http://www.wooyun.org/bugs/'
13 | html = fetch(url)
14 | for i in crawl(url,html):
15 | print i
16 |
--------------------------------------------------------------------------------
/tools/test_fetcher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | #Author : Manning
4 | #Date : 2015-10-17
5 | import sys
6 | sys.path.append(sys.path[0].split('MSpider')[0] + "MSpider")
7 | from lib.core.fetch import fetch
8 |
9 |
10 | if __name__ == '__main__':
11 | url = 'http://www.baidu.com'
12 | print fetch(url)
13 |
--------------------------------------------------------------------------------