├── LICENSE ├── README.md ├── beike_find_house ├── .ipynb_checkpoints │ └── room_data_analyze-checkpoint.ipynb ├── __pycache__ │ ├── connect_redis.cpython-37.pyc │ └── proxy.cpython-37.pyc ├── after_deal_data.csv ├── beijing_fang111.csv ├── beike_find_house - 副本.xlsx ├── home_spider.py ├── photo │ ├── 01.png │ ├── 1.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ └── oo.png ├── render.html └── room_data_analyze.ipynb ├── car_home ├── auto_bmw │ ├── auto_bmw │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── bmw_spider.cpython-37.pyc │ │ │ └── bmw_spider.py │ │ └── test.py │ ├── scrapy.cfg │ └── start.py ├── auto_bmw_all │ ├── auto_bmw_all │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── auto_crawl.cpython-37.pyc │ │ │ └── auto_crawl.py │ ├── scrapy.cfg │ └── start.py ├── autohome │ ├── auto.json │ ├── autohome │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── autohome_crawl.cpython-37.pyc │ │ │ └── autohome_crawl.py │ └── scrapy.cfg └── test_spider │ ├── scrapy.cfg │ ├── start.py │ └── test_spider │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── demo.cpython-37.pyc │ └── demo.py ├── death_company ├── .ipynb_checkpoints │ ├── death_company_data_analyze-checkpoint.ipynb │ ├── lagou_spider-checkpoint.ipynb │ └── position_data_analyze-checkpoint.ipynb ├── apple.jpg ├── death_company.py ├── death_company_data_analyze.ipynb ├── death_company_info - 副本.xls ├── heart.jpg └── 可视化图表 │ ├── com_death_reason.jpg │ ├── com_financing.jpg │ ├── com_financing_pie.jpg │ ├── com_live_time.jpg │ ├── com_position.jpg │ ├── com_position_pie.jpg │ ├── com_style.jpg │ └── com_style_pie.jpg ├── jianshu ├── ghostdriver.log ├── jianshu │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── items.cpython-37.pyc │ │ ├── middlewares.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── js.cpython-37.pyc │ │ └── js.py ├── scrapy.cfg └── start.py ├── ppt_download_spider └── ppt_download_spider.py ├── proxy_design ├── __pycache__ │ └── connect_redis.cpython-37.pyc ├── connect_redis.py └── proxy.py ├── stock ├── A_stock_company.py └── stock_spider_new.py ├── vehicle_home ├── scrapy.cfg ├── vehicle.json └── 
vehicle_home │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── items.cpython-38.pyc │ ├── pipelines.cpython-38.pyc │ └── settings.cpython-38.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── test.cpython-38.pyc │ └── vehicle_style.cpython-38.pyc │ ├── test.py │ └── vehicle_style.py ├── weather_spider_analyze └── weather_spider.py └── zhihu └── hot.py /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. 
The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | Copyright (C)
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | .
668 | 
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | .
675 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Hands-on Python web crawler collection (with data analysis for some of the crawled datasets)
2 | 
3 | #### PPT templates
4 | - [Automatic PPT template downloader](https://github.com/ShanYonggang/spider_list/blob/master/ppt_download_spider/ppt_download_spider.py "Automatic PPT template downloader")
5 | 
6 | #### Zhihu
7 | - [Scraping Zhihu hot-list questions and answers](https://www.shanyonggang.cn/article_detail/65/ "Scraping Zhihu hot-list questions and answers")
8 | 
9 | #### Crawler proxy pool
10 | - [Building a personal crawler proxy pool](https://www.shanyonggang.cn/article_detail/66/ "Building a personal crawler proxy pool")
11 | 
12 | #### ITjuzi (IT桔子)
13 | - [Scraping the ITjuzi dead-company database](https://www.shanyonggang.cn/article_detail/67/ "Scraping the ITjuzi dead-company database")
14 | - [Using Python data analysis to look inside the failed new-economy companies on ITjuzi](https://www.shanyonggang.cn/article_detail/69/ "Using Python data analysis to look inside the failed new-economy companies on ITjuzi")
15 | 
16 | #### Beike (贝壳找房)
17 | - [Scraping Beijing second-hand housing listings from Beike with Python](https://www.shanyonggang.cn/article_detail/85/ "Scraping Beijing second-hand housing listings from Beike with Python")
18 | - [Analyzing and visualizing the Beijing second-hand housing data with Python](https://www.shanyonggang.cn/article_detail/86/ "Analyzing and visualizing the Beijing second-hand housing data with Python")
19 | 
20 | #### Autohome review (koubei) channel
21 | - [Crawling the Autohome review channel with Scrapy](https://zhuanlan.zhihu.com/p/268117716 "Crawling the Autohome review channel with Scrapy")
22 | 
23 | #### Weather
24 | - [Using Selenium to collect Beijing weather data for 2015-2019](https://github.com/ShanYonggang/spider_list/blob/master/weather_spider_analyze/weather_spider.py "Using Selenium to collect Beijing weather data for 2015-2019")
25 | 
26 | #### Scrapy crawler collection
27 | - [Crawling Autohome with Scrapy](https://github.com/ShanYonggang/spider_list/tree/master/car_home "Crawling Autohome with Scrapy")
28 | - [Crawling Jianshu with Scrapy](https://github.com/ShanYonggang/spider_list/tree/master/jianshu "Crawling Jianshu with Scrapy")
29 | 
30 | #### Stock market data
31 | - [Fetching basic information on A-share listed companies with Python](https://github.com/ShanYonggang/spider_list/blob/master/stock/A_stock_company.py "Fetching basic information on A-share listed companies with Python")
32 | - [Scraping main-force holdings and capital-flow data for A-share listed companies](https://github.com/ShanYonggang/spider_list/blob/master/stock/stock_spider_new.py "Scraping main-force holdings and capital-flow data for A-share listed companies")
33 | 
34 | 
35 | 
36 | 
37 | 
--------------------------------------------------------------------------------
/beike_find_house/__pycache__/connect_redis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/__pycache__/connect_redis.cpython-37.pyc -------------------------------------------------------------------------------- /beike_find_house/__pycache__/proxy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/__pycache__/proxy.cpython-37.pyc -------------------------------------------------------------------------------- /beike_find_house/beijing_fang111.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/beijing_fang111.csv -------------------------------------------------------------------------------- /beike_find_house/beike_find_house - 副本.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/beike_find_house - 副本.xlsx -------------------------------------------------------------------------------- /beike_find_house/home_spider.py: -------------------------------------------------------------------------------- 1 | # 目的,爬取贝壳找房数据 2 | 3 | # 每页 url = 'bj.ke.com/ershoufang/pg{}/'.format(i),然后获取房屋的详细url,对详细url进行尽可能多的数据采集 4 | 5 | # 爬虫中使用多线程、多进程 6 | 7 | import requests 8 | import time 9 | from multiprocessing import Pool 10 | from lxml import etree 11 | import pandas as pd 12 | import os 13 | import random 14 | 15 | # 获取房源的基本url 16 | # 参数page 17 | def get_home_url(page): 18 | url = 'http://bj.ke.com/ershoufang/pg{}/'.format(page) 19 | headers = { 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36', 21 | 'Cookie': 'lianjia_uuid=e6a91b7a-b6a4-40b5-88c6-ff67759cbc8a; crosSdkDT2019DeviceId=-51npj6--xbmlw5-f22i5qg8bh36ouv-yttqkmwdf; _ga=GA1.2.121082359.1579583230; ke_uuid=6de1afa21a5799c0874702af39248907; __xsptplus788=788.1.1579583230.1579583347.4%234%7C%7C%7C%7C%7C%23%23Q6jl-k46IlXjCORdTOp6O3JyzHokoUrb%23; select_city=110000; digv_extends=%7B%22utmTrackId%22%3A%2280418605%22%7D; lianjia_ssid=a4ab1bc0-cb04-492f-960c-342c66065da0; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1583897013,1583932737; User-Realip=111.196.247.121; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22%24device_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wybeijing%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1583933576; 
srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMjAxZjBjNWU1ZWE1ZGVmYjQxZDFlYTE4MGVkNWI1OGRjYzk5Mzc2MjEwNTcyMWI3ODhiNTQyNTExOGQ1NTVlZDNkMTY2MWE2YWI5YWRlMGY0NDA3NjkwNWEyMzRlNTdhZWExNDViNGFiNWVmMmMyZWJlZGY1ZjM2Y2M0NWIxMWZlMWFiOWI2MDJiMzFmOTJmYzgxNzNiZTIwMzE1ZGJjNTUyMWE2ZjcxYzZmMTFhOWIyOWU2NzJkZTkyZjc3ZDk1MzhiNjhhMTQyZDQ2YmEyNjJhYzJmNjdjNmFjM2I5YzU0MzdjMDkwYWUwMzZmZjVjYWZkZTY5YjllYzY0NzEwMWY2OTc1NmU1Y2ExNzNhOWRmZTdiNGY4M2E1Zjc2NDZmY2JkMGM2N2JiMjdmZTJjNjI2MzNkMjdlNDY4ODljZGRjMjc3MTQ0NDUxMDllZThlZDVmZmMwMjViNjc2ZjFlY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMDI2MDk0N1wifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nLzE5MTExMzE5NTEwMTAwMTcxNzU5Lmh0bWwiLCJvcyI6IndlYiIsInYiOiIwLjEifQ==' 22 | } 23 | text = requests.get(url,headers=headers).text 24 | html = etree.HTML(text) 25 | detail_url = html.xpath('//ul[@class="sellListContent"]//li[@class="clear"]/a/@href') 26 | return detail_url 27 | 28 | # 获取房源详细数据信息 29 | def get_home_detail_infos(detail_url): 30 | headers = { 31 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36', 32 | 'Cookie': 'lianjia_uuid=e6a91b7a-b6a4-40b5-88c6-ff67759cbc8a; crosSdkDT2019DeviceId=-51npj6--xbmlw5-f22i5qg8bh36ouv-yttqkmwdf; _ga=GA1.2.121082359.1579583230; ke_uuid=6de1afa21a5799c0874702af39248907; __xsptplus788=788.1.1579583230.1579583347.4%234%7C%7C%7C%7C%7C%23%23Q6jl-k46IlXjCORdTOp6O3JyzHokoUrb%23; select_city=110000; digv_extends=%7B%22utmTrackId%22%3A%2280418605%22%7D; lianjia_ssid=a4ab1bc0-cb04-492f-960c-342c66065da0; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1583897013,1583932737; User-Realip=111.196.247.121; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22%24device_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wybeijing%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1583933576; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMjAxZjBjNWU1ZWE1ZGVmYjQxZDFlYTE4MGVkNWI1OGRjYzk5Mzc2MjEwNTcyMWI3ODhiNTQyNTExOGQ1NTVlZDNkMTY2MWE2YWI5YWRlMGY0NDA3NjkwNWEyMzRlNTdhZWExNDViNGFiNWVmMmMyZWJlZGY1ZjM2Y2M0NWIxMWZlMWFiOWI2MDJiMzFmOTJmYzgxNzNiZTIwMzE1ZGJjNTUyMWE2ZjcxYzZmMTFhOWIyOWU2NzJkZTkyZjc3ZDk1MzhiNjhhMTQyZDQ2YmEyNjJhYzJmNjdjNmFjM2I5YzU0MzdjMDkwYWUwMzZmZjVjYWZkZTY5YjllYzY0NzEwMWY2OTc1NmU1Y2ExNzNhOWRmZTdiNGY4M2E1Zjc2NDZmY2JkMGM2N2JiMjdmZTJjNjI2MzNkMjdlNDY4ODljZGRjMjc3MTQ0NDUxMDllZThlZDVmZmMwMjViNjc2ZjFlY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMDI2MDk0N1wifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nLzE5MTExMzE5NTEwMTAwMTcxNzU5Lmh0bWwiLCJvcyI6IndlYiIsInYiOiIwLjEifQ==' 33 | } 34 | detail_text = requests.get(detail_url,headers=headers).text 35 | html = etree.HTML(detail_text) 36 | all_data = [] 37 | # 解析获取相关数据 38 | # 所在地址 39 | home_location = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="areaName"]/span[@class="info"]/a/text()') 40 | all_data.append(home_location) 41 | # 小区名称 42 | local_name = 
html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="communityName"]/a/text()')[0]
43 |     all_data.append(local_name)
44 |     # 总价格
45 |     total_price = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="price "]/span[@class="total"]/text()')[0]
46 |     all_data.append(total_price)
47 |     # 单价
48 |     unit_price = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="price "]//div[@class="unitPrice"]/span/text()')[0]
49 |     all_data.append(unit_price)
50 |     # 房屋基本信息
51 |     home_style = html.xpath('//div[@class="introContent"]//div[@class="base"]//div[@class="content"]/ul/li/text()')
52 |     all_data.append(home_style)
53 |     # 房屋交易属性信息
54 |     transaction_info = html.xpath('//div[@class="introContent"]//div[@class="transaction"]//div[@class="content"]/ul/li/text()')
55 |     all_data.append(transaction_info)
56 |     # 小区均价
57 |     xiaoqu_price = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info price_red"]/text()')[0].replace(' ','')
58 |     all_data.append(xiaoqu_price)
59 |     # 小区建造时间
60 |     xiaoqu_built_time = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[0].replace(' ','').replace('\n','')
61 |     all_data.append(xiaoqu_built_time)
62 |     # 小区建筑类型
63 |     xiaoqu_built_style = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[1].replace(' ','').replace('\n','')
64 |     all_data.append(xiaoqu_built_style)
65 |     # 小区楼层总数
66 |     xiaoqu_total_ceng = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[2].replace(' ','').replace('\n','')
67 |     all_data.append(xiaoqu_total_ceng)
68 |     return all_data
69 | 
70 | # 数据保存至csv文件里(使用pandas中的to_csv保存)
71 | def save_data(data):
72 |     data_frame = pd.DataFrame(data,columns=['小区位置','小区名称','房屋总价','房屋单价','房屋基本信息','房屋交易信息','小区均价','小区建造时间','小区房屋类型','小区层数'])
73 |     print(data_frame)
74 |     data_frame.to_csv('beijing_fang111.csv',header=False,index=False,mode='a',encoding='utf_8_sig')
75 | 
76 | def main(page):
77 |     print('开始爬取第{}页的数据!'.format(page))
78 |     # choice_time = random.choice(range(0,5))
79 |     # print(choice_time)
80 | 
81 |     urls = get_home_url(page)
82 |     for url in urls:
83 |         print('开始爬取详细网页为{}的房屋详细信息!'.format(url))
84 |         all_data = get_home_detail_infos(detail_url=url)
85 |         data = []
86 |         data.append(all_data)
87 |         save_data(data)
88 | 
89 | if __name__ == "__main__":
90 |     page = range(0,100)
91 |     print('爬虫开始')
92 |     pool = Pool(processes=4)
93 |     pool.map(main,page)
94 |     # proxies = proxy.get_proxy_random()
95 |     # pool.apply_async(main,args=(page,proxies,))
96 |     pool.close()
97 |     pool.join()
98 | 
99 | 
100 | 
--------------------------------------------------------------------------------
/beike_find_house/photo/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/01.png
--------------------------------------------------------------------------------
/beike_find_house/photo/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/1.png
--------------------------------------------------------------------------------
/beike_find_house/photo/11.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/11.png -------------------------------------------------------------------------------- /beike_find_house/photo/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/12.png -------------------------------------------------------------------------------- /beike_find_house/photo/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/13.png -------------------------------------------------------------------------------- /beike_find_house/photo/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/2.png -------------------------------------------------------------------------------- /beike_find_house/photo/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/3.png -------------------------------------------------------------------------------- /beike_find_house/photo/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/4.png -------------------------------------------------------------------------------- /beike_find_house/photo/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/5.png -------------------------------------------------------------------------------- /beike_find_house/photo/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/6.png -------------------------------------------------------------------------------- /beike_find_house/photo/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/7.png -------------------------------------------------------------------------------- /beike_find_house/photo/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/8.png -------------------------------------------------------------------------------- /beike_find_house/photo/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/9.png -------------------------------------------------------------------------------- /beike_find_house/photo/oo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/oo.png -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__init__.py -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutoBmwItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_file = scrapy.Field() 15 | image_urls = scrapy.Field() 16 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutoBmwSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutoBmwDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import urllib 9 | import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.http import Request 12 | 13 | class AutoBmwPipeline(object): 14 | 15 | def __init__(self): 16 | self.file_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'image') 17 | if not os.path.exists(self.file_path): 18 | os.mkdir(self.file_path) 19 | else: 20 | print('Path already exist......') 21 | 22 | def process_item(self, item, spider): 23 | image_file = item['image_file'] 24 | image_urls = item['image_urls'] 25 | image_path = os.path.join(self.file_path,image_file) 26 | if not os.path.exists(image_path): 27 | os.mkdir(image_path) 28 | for url in image_urls: 29 | image_name = url.split('__')[-1] 30 | urllib.request.urlretrieve(url,os.path.join(image_path,image_name)) 31 | return item 32 | 33 | class BmwImagesPipeline(ImagesPipeline): 34 | 35 | def get_media_requests(self, item, info): 36 | img_url_list = item['image_urls'] 37 | for img_url in img_url_list: 38 | yield Request(img_url,meta={'image_file':item['image_file']},) 39 | 40 | def file_path(self, request, response=None, info=None): 41 | image_file = request.meta['image_file'] 42 | name = request.url.split('_')[-1] 43 | return '%s/%s.png'%(image_file,name) 44 | 45 | def item_completed(self, results, item, info): 46 | print(results) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for auto_bmw project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'auto_bmw' 13 | 14 | SPIDER_MODULES = ['auto_bmw.spiders'] 15 | NEWSPIDER_MODULE = 'auto_bmw.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'auto_bmw (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'auto_bmw.middlewares.AutoBmwSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'auto_bmw.middlewares.AutoBmwDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | # 'auto_bmw.pipelines.AutoBmwPipeline': 300, 69 | 'scrapy.pipelines.images.ImagesPipeline': None, 70 | 'auto_bmw.pipelines.BmwImagesPipeline': 1, 71 | } 72 | IMAGES_STORE = 'ImgDownload' 73 | 74 | IMAGES_URLS_FIELD = 'image_urls' 75 | IMAGES_RESULT_FIELD = 'image_path' 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | 
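With the ITEM_PIPELINES and IMAGES_STORE = 'ImgDownload' settings above, every image yielded by bmw_spider is saved under ImgDownload/<栏目名>/<文件名>.png. A minimal standalone sketch of that naming rule, mirroring the file_path() override in pipelines.py (the URL and the category name below are invented examples, not values taken from the site):

IMAGES_STORE = 'ImgDownload'

def file_path(url, image_file):
    # same rule as BmwImagesPipeline.file_path(): keep the part after the last '_'
    # and group the file under the per-category folder taken from item['image_file']
    name = url.split('_')[-1]
    return '%s/%s.png' % (image_file, name)

url = 'https://car3.autoimg.cn/cardfs/product/autohomecar__abc123.jpg'  # invented example URL
print(IMAGES_STORE + '/' + file_path(url, '车身外观'))
# ImgDownload/车身外观/abc123.jpg.png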
-------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__pycache__/bmw_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/spiders/__pycache__/bmw_spider.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/bmw_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from auto_bmw.items import AutoBmwItem 4 | 5 | class BmwSpiderSpider(scrapy.Spider): 6 | name = 'bmw_spider' 7 | allowed_domains = ['car.autohome.com.cn'] 8 | start_urls = ['https://car.autohome.com.cn/pic/series/66.html'] 9 | 10 | def parse(self, response): 11 | uibox_urls = response.xpath('//div[@class="uibox"]')[1:] 12 | for uibox_url in uibox_urls: 13 | item = AutoBmwItem() 14 | item['image_file'] = uibox_url.xpath('./div[@class="uibox-title"]/a')[0].xpath('./text()').get() 15 | image_urls = uibox_url.xpath('./div')[-1].xpath('.//img/@src').getall() 16 | item['image_urls'] = list(map(lambda url: response.urljoin(url),image_urls)) 17 | yield item 18 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'image') 3 | if not os.path.exists(path): 4 | os.mkdir(path) 5 | else: 6 | print("Path already exist......") -------------------------------------------------------------------------------- /car_home/auto_bmw/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = auto_bmw.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = auto_bmw 12 | -------------------------------------------------------------------------------- /car_home/auto_bmw/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl bmw_spider'.split()) -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__init__.py -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutoBmwAllItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_file = scrapy.Field() 15 | image_urls = scrapy.Field() 16 | pass 17 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutoBmwAllSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutoBmwAllDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import urllib 9 | import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.http import Request 12 | 13 | class AutoBmwAllPipeline(object): 14 | def process_item(self, item, spider): 15 | return item 16 | 17 | class BmwImagesPipeline(ImagesPipeline): 18 | 19 | def get_media_requests(self, item, info): 20 | img_url_list = item['image_urls'] 21 | for img_url in img_url_list: 22 | yield Request(img_url,meta={'image_file':item['image_file']},) 23 | 24 | def file_path(self, request, response=None, info=None): 25 | image_file = request.meta['image_file'] 26 | name = request.url.split('_')[-1] 27 | return '%s/%s.png'%(image_file,name) 28 | 29 | def item_completed(self, results, item, info): 30 | print(results) 31 | return item 32 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for auto_bmw_all project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'auto_bmw_all' 13 | 14 | SPIDER_MODULES = ['auto_bmw_all.spiders'] 15 | NEWSPIDER_MODULE = 'auto_bmw_all.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'auto_bmw_all (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'auto_bmw_all.middlewares.AutoBmwAllSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'auto_bmw_all.middlewares.AutoBmwAllDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'scrapy.pipelines.images.ImagesPipeline': None, 69 | 'auto_bmw_all.pipelines.BmwImagesPipeline': 1, 70 | } 71 | IMAGES_STORE = 'ImgDownload' 72 | 73 | IMAGES_URLS_FIELD = 'image_urls' 74 | IMAGES_RESULT_FIELD = 'image_path' 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 
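item_completed() in BmwImagesPipeline only prints its results argument. For reference, that list has roughly the following shape (standard Scrapy ImagesPipeline behaviour; the concrete values here are invented):

# Each element is a (success, info) tuple; successful downloads carry the path
# produced by the file_path() override, failures arrive as (False, <twisted Failure>).
results = [
    (True, {'url': 'https://car3.autoimg.cn/cardfs/product/autohomecar__abc123.jpg',  # invented
            'path': '外观/abc123.jpg.png',
            'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}),
]
image_paths = [info['path'] for ok, info in results if ok]
print(image_paths)  # ['外观/abc123.jpg.png']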
-------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/auto_crawl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/auto_crawl.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/auto_crawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from auto_bmw_all.items import AutoBmwAllItem 6 | 7 | class AutoCrawlSpider(CrawlSpider): 8 | name = 'auto_crawl' 9 | allowed_domains = ['car.autohome.com.cn'] 10 | start_urls = ['https://car.autohome.com.cn/pic/series/66.html'] 11 | 12 | rules = ( 13 | Rule(LinkExtractor(allow=r'.+/pic/series/66-.+'), callback='parse_item',follow=True), 14 | ) 15 | 16 | src="//car3.autoimg.cn/cardfs/product/g1/M07/63/01/240x180_0_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg" 17 | src="//car3.autoimg.cn/cardfs/product/g1/M07/63/01/ 800x0_1_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg" 18 | 19 | def parse_item(self, response): 20 | uibox = response.xpath('//div[@class="uibox"]') 21 | item = AutoBmwAllItem() 22 | item['image_file'] = uibox.xpath('./div[@class="uibox-title"]/text()').get() 23 | image_url = uibox.xpath('./div[@class="uibox-con carpic-list03 border-b-solid"]/ul/li//img/@src').getall() 24 | item['image_urls'] = list(map(lambda url: response.urljoin(url.replace('240x180_0','800x0_1')),image_url)) 25 | yield item 26 | 27 | 28 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = auto_bmw_all.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = auto_bmw_all 12 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl auto_crawl'.split()) -------------------------------------------------------------------------------- 
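The two src strings kept inside auto_crawl.py above document the pattern the spider relies on: the listing page embeds 240x180 thumbnails, while the full-size picture sits at the same path with an 800x0 size segment. parse_item() therefore rewrites each URL before handing it to the images pipeline; a quick standalone check of that rewrite, using the sample value noted in the spider:

# Equivalent to response.urljoin(url.replace('240x180_0', '800x0_1')) for a
# scheme-relative src on an https page.
thumb = "//car3.autoimg.cn/cardfs/product/g1/M07/63/01/240x180_0_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg"
full = "https:" + thumb.replace('240x180_0', '800x0_1')
print(full)
# https://car3.autoimg.cn/cardfs/product/g1/M07/63/01/800x0_1_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg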
/car_home/autohome/autohome/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__init__.py -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutohomeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | model_name = scrapy.Field() 16 | level = scrapy.Field() 17 | auto_body = scrapy.Field() 18 | price = scrapy.Field() 19 | engine = scrapy.Field() 20 | transmission = scrapy.Field() 21 | auto_img = scrapy.Field() 22 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutohomeSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutohomeDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | import pymysql 10 | 11 | class AutohomePipeline(object): 12 | 13 | def __init__(self): 14 | self.fp = open('auto.json','a+',encoding='utf-8') 15 | 16 | def process_item(self, item, spider): 17 | file = json.dumps(dict(item),ensure_ascii=False) 18 | self.fp.write(file) 19 | print(self.fp.read()) 20 | return item 21 | 22 | def close_spider(self,spider): 23 | self.fp.close() 24 | print('Spider Stop...') 25 | 26 | class MongoPipeline(object): 27 | 28 | def __init__(self,mongo_uri,mongo_db): 29 | self.mongo_uri = mongo_uri 30 | self.mongo_db = mongo_db 31 | 32 | @classmethod 33 | def from_crawler(cls, crawler): 34 | # This method is used by Scrapy to create your spiders. 35 | return cls( 36 | mongo_uri = crawler.settings.get('MONGO_URI'), 37 | mongo_db = crawler.settings.get('MONGO_DB'), 38 | ) 39 | 40 | def open_spider(self,spider): 41 | self.client = pymongo.MongoClient(self.mongo_uri) 42 | self.db = self.client[self.mongo_db] 43 | print('Spider Start run...') 44 | 45 | def process_item(self, item, spider): 46 | name = item.__class__.__name__ 47 | self.db[name].insert(dict(item)) 48 | return item 49 | 50 | def close_spider(self,spider): 51 | self.client.close() 52 | print('Spider Stop...') 53 | 54 | class MysqlPipeline(object): 55 | 56 | def __init__(self,host,port,user,password,db): 57 | self.host = host 58 | self.user = user 59 | self.password =password 60 | self.port = port 61 | self.db = db 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | # This method is used by Scrapy to create your spiders. 
66 | return cls( 67 | host = crawler.settings.get('HOST'), 68 | port = crawler.settings.get('PORT'), 69 | user = crawler.settings.get('USER'), 70 | password = crawler.settings.get('PASSWORD'), 71 | db = crawler.settings.get('DB'), 72 | ) 73 | 74 | def open_spider(self,spider): 75 | self.db = pymysql.connect(host=self.host,user=self.user,port=self.port,password=self.password,db=self.db,charset='utf8') 76 | self.cursor = self.db.cursor() 77 | 78 | def process_item(self,item,spider): 79 | sql = 'insert into auto_data (model_name,level,auto_body,price,engine,transmission,auto_img) VALUES(%s,%s,%s,%s,%s,%s,%s)' 80 | self.cursor.execute(sql,(item["model_name"],item["level"],item["price"],item["auto_body"],item["engine"],item["transmission"],item['auto_img'])) 81 | self.db.commit() 82 | return item 83 | 84 | def close_spider(self,spider): 85 | self.cursor.close() 86 | self.db.close() -------------------------------------------------------------------------------- /car_home/autohome/autohome/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for autohome project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'autohome' 13 | 14 | SPIDER_MODULES = ['autohome.spiders'] 15 | NEWSPIDER_MODULE = 'autohome.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'autohome (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | FEED_EXPORT_ENCODING = 'gb18030' 25 | 26 | MONGO_URI = 'localhost' 27 | MONGO_DB = 'autohome' 28 | 29 | HOST = 'localhost' 30 | USER = 'root' 31 | PASSWORD = 'pass4321' 32 | PORT = 3306 33 | DB = 'auto_home' 34 | 35 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 36 | #CONCURRENT_REQUESTS = 32 37 | 38 | # Configure a delay for requests for the same website (default: 0) 39 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 40 | # See also autothrottle settings and docs 41 | DOWNLOAD_DELAY = 1 42 | # The download delay setting will honor only one of: 43 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 44 | #CONCURRENT_REQUESTS_PER_IP = 16 45 | 46 | # Disable cookies (enabled by default) 47 | #COOKIES_ENABLED = False 48 | 49 | # Disable Telnet Console (enabled by default) 50 | #TELNETCONSOLE_ENABLED = False 51 | 52 | # Override the default request headers: 53 | # DEFAULT_REQUEST_HEADERS = { 54 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 55 | # 'Accept-Language': 'en', 56 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' 57 | # } 58 | 59 | # Enable or disable spider middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 61 | #SPIDER_MIDDLEWARES = { 62 | # 'autohome.middlewares.AutohomeSpiderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable downloader middlewares 66 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 67 | #DOWNLOADER_MIDDLEWARES = { 68 | # 'autohome.middlewares.AutohomeDownloaderMiddleware': 543, 69 | #} 
70 | 71 | # Enable or disable extensions 72 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 73 | #EXTENSIONS = { 74 | # 'scrapy.extensions.telnet.TelnetConsole': None, 75 | #} 76 | 77 | # Configure item pipelines 78 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 79 | ITEM_PIPELINES = { 80 | 'autohome.pipelines.AutohomePipeline': 300, 81 | 'autohome.pipelines.MongoPipeline': 100, 82 | 'autohome.pipelines.MysqlPipeline': 200, 83 | } 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__pycache__/autohome_crawl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/spiders/__pycache__/autohome_crawl.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/autohome_crawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from autohome.items import AutohomeItem 6 | 7 | class AutohomeCrawlSpider(CrawlSpider): 8 | name = 'autohome_crawl' 9 | allowed_domains = ['car.autohome.com.cn'] 10 | start_urls = ['https://car.autohome.com.cn/price'] 11 | rules = ( 12 | Rule(LinkExtractor(allow=r'.+price/list-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-\d+\.html'), callback='parse_item', follow=True), 13 | ) 14 | 15 | def parse_item(self, response): 16 | item = AutohomeItem() 17 | print('_'*90) 18 | print(response.url+'\n') 19 | print('_'*90) 20 | auto_list = response.xpath('//div[@class="list-cont-bg"]') 21 | print('_'*90) 22 | for auto in auto_list: 23 | item["model_name"] = auto.xpath('.//a[@class="font-bold"]/text()').get() 24 | item["level"] = auto.xpath('.//span[@class="info-gray"]/text()').get() 25 | item["price"] = auto.xpath('.//span[@class="lever-price red"]/span/text()').get() 26 | item["auto_body"] = auto.xpath('.//ul[@class="lever-ul"]/li')[1].xpath('.//a/text()').get() 27 | engine = auto.xpath('.//ul[@class="lever-ul"]/li')[2].xpath('.//a/text()').getall() 28 | item["engine"] = ','.join(engine).strip(',') 29 | item["transmission"] = auto.xpath('.//ul[@class="lever-ul"]/li')[3].xpath('.//a/text()').get() 30 | item['auto_img'] = auto.xpath('.//div[@class="list-cont-img"]/a/img/@src').get() 31 | yield item 32 | 33 | -------------------------------------------------------------------------------- /car_home/autohome/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = autohome.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = autohome 12 | -------------------------------------------------------------------------------- /car_home/test_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = test_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = test_spider 12 | 
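MysqlPipeline in car_home/autohome/autohome/pipelines.py inserts into an auto_data table, but the repository does not ship its schema. A minimal, assumed one-off helper that creates the table is sketched below; the connection values mirror autohome/settings.py, the column names follow the pipeline's INSERT statement, and the VARCHAR types and lengths are guesses:

import pymysql

# Assumed helper, not part of the repository: create the auto_data table used by MysqlPipeline.
db = pymysql.connect(host='localhost', user='root', password='pass4321',
                     port=3306, db='auto_home', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS auto_data (
        `model_name`   VARCHAR(100),
        `level`        VARCHAR(50),
        `auto_body`    VARCHAR(50),
        `price`        VARCHAR(50),
        `engine`       VARCHAR(255),
        `transmission` VARCHAR(100),
        `auto_img`     VARCHAR(255)
    ) DEFAULT CHARSET = utf8
""")
db.commit()
db.close()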
-------------------------------------------------------------------------------- /car_home/test_spider/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl demo'.split()) -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__init__.py -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TestSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TestSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TestSpiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for test_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'test_spider' 13 | 14 | SPIDER_MODULES = ['test_spider.spiders'] 15 | NEWSPIDER_MODULE = 'test_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'test_spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'test_spider.middlewares.TestSpiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'test_spider.middlewares.TestSpiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'test_spider.pipelines.TestSpiderPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__pycache__/demo.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/spiders/__pycache__/demo.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class DemoSpider(scrapy.Spider): 6 | name = 'demo' 7 | allowed_domains = ['httpbin.org/'] 8 | start_urls = ['http://httpbin.org/'] 9 | 10 | def parse(self, response): 11 | print(response['headers']) 12 | -------------------------------------------------------------------------------- /death_company/.ipynb_checkpoints/lagou_spider-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from selenium import webdriver\n", 10 | "from selenium.webdriver.support.wait import WebDriverWait\n", 11 | "from selenium.webdriver.common.by import By\n", 12 | "from selenium.webdriver.common.keys import Keys\n", 13 | "from selenium.webdriver.support import expected_conditions as EC\n", 14 | "import time\n", 15 | "import pymysql\n", 16 | "import pandas as pd\n", 17 | "\n", 18 | "\n", 19 | "broswer = webdriver.Chrome()\n", 20 | "def save_data(data):\n", 21 | " db = pymysql.connect(host='localhost',user='root',password='pass4321',port=3306,db='position_data')\n", 22 | " cursor = db.cursor()\n", 23 | " tables = 'position_table'\n", 24 | " keys = ','.join(data.keys())\n", 25 | " values = ','.join(['%s']*len(data))\n", 26 | " sql = 'INSERT INTO {table}({keys}) VALUES({values})'.format(table=tables,keys=keys,values=values)\n", 27 | " try:\n", 28 | " if cursor.execute(sql,tuple(data.values())):\n", 29 | " print('save success!')\n", 30 | " db.commit()\n", 31 | " except:\n", 32 | " print('save failed!')\n", 33 | " db.rollback()\n", 34 | " db.close()\n", 35 | "try:\n", 36 | " broswer.get('https://www.lagou.com/')\n", 37 | " choose_city = broswer.find_elements_by_class_name('tab')[1]\n", 38 | " print(choose_city.text)\n", 39 | " choose_city.click()\n", 40 | " input_zhiwei = input('请输入需要查询的岗位:')\n", 41 | " input_name = broswer.find_element_by_id('search_input')\n", 42 | " input_name.send_keys(input_zhiwei)\n", 43 | " wait = WebDriverWait(broswer,1)\n", 44 | " click_but = broswer.find_element_by_id('search_button')\n", 45 | " click_but.click()\n", 46 | " 
wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME,'con_list_item')))\n", 47 | " time.sleep(1)\n", 48 | " # 获取需要爬取岗位的总页数\n", 49 | " total_page = broswer.find_elements_by_class_name('pager_not_current')[-1].text\n", 50 | " print(total_page)\n", 51 | " page = 1\n", 52 | " position_data = pd.DataFrame(columns=['position_name','position_location','position_time','position_experience',\n", 53 | " 'position_company','position_infos','position_tags','position_introdce'])\n", 54 | " while page<= int(total_page):\n", 55 | " infos = broswer.find_elements_by_class_name('con_list_item')\n", 56 | " print('开始爬取第{}页的岗位信息数据'.format(page))\n", 57 | " page = page + 1\n", 58 | " for info in infos:\n", 59 | " # 岗位名称\n", 60 | " position_name = info.find_element_by_class_name('position_link').find_element_by_tag_name('h3').text\n", 61 | " # 岗位地点\n", 62 | " position_location = info.find_element_by_class_name('position_link').find_element_by_tag_name('em').text\n", 63 | " # 岗位发布时间\n", 64 | " position_time = info.find_element_by_class_name('format-time').text\n", 65 | " # 岗位工资、工作经验及学历\n", 66 | " position_experience = info.find_element_by_class_name('li_b_l').text\n", 67 | " # 招聘公司\n", 68 | " position_company = info.find_element_by_class_name('company_name').find_element_by_tag_name('a').text\n", 69 | " # 公司基本情况\n", 70 | " position_infos = info.find_element_by_class_name('industry').text\n", 71 | " # 岗位标签\n", 72 | " position_tags = info.find_element_by_class_name('list_item_bot').find_element_by_class_name('li_b_l').text\n", 73 | " # 公司基本介绍\n", 74 | " position_introdce = info.find_element_by_class_name('list_item_bot').find_element_by_class_name('li_b_r').text\n", 75 | "\n", 76 | " # po_data = {'position_name':position_name,'position_location':position_location,'position_time':position_time,'position_experience':position_experience,\n", 77 | " # 'position_company':position_company,'position_infos':position_infos,'position_tags':position_tags,'position_introdce':position_introdce}\n", 78 | " \n", 79 | " # save_data(po_data)\n", 80 | "\n", 81 | " position_data=position_data.append({'position_name':position_name,'position_location':position_location,'position_time':position_time,'position_experience':position_experience,'position_company':position_company,'position_infos':position_infos,'position_tags':position_tags,'position_introdce':position_introdce},ignore_index=True)\n", 82 | " print(position_name,position_location,position_time,position_experience,position_company,position_infos,position_tags,position_introdce)\n", 83 | " position_data.to_csv('position_infos.csv',mode='a',encoding='utf-8-sig')\n", 84 | " print('\\n')\n", 85 | " next_page = broswer.find_element_by_class_name('pager_next')\n", 86 | " next_page.click()\n", 87 | " time.sleep(1)\n", 88 | " position_data.to_csv('position_data.csv',mode='a',encoding='utf-8-sig')\n", 89 | " broswer.quit()\n", 90 | "except:\n", 91 | " print('error')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 116 | "\n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
Unnamed: 0position_nameposition_locationposition_timeposition_experienceposition_companyposition_infosposition_tagsposition_introdce
00Python开发工程师(技术平台)望京1天前发布20k-40k 经验3-5年 / 本科一起教育科技移动互联网,教育 / D轮及以上 / 2000人以上教育 运维“企业福利 平台 发展空间 团队氛围”
11python开发朝阳区1天前发布6k-10k 经验不限 / 本科一网互通广告营销,移动互联网 / 未融资 / 15-50人大数据 企业服务 后端 Python 爬虫 数据挖掘“海外社交媒体大数据团队”
22python开发工程师望京1天前发布30k-50k 经验3-5年 / 本科橙睿科技移动互联网,社交 / 不需要融资 / 50-150人电商“福利好、发展前景”
33高级python开发工程师中关村10:15发布20k-25k 经验5-10年 / 本科达观数据人工智能 / B轮 / 150-500人企业服务 大数据 后端 docker Python C++“AI准独角兽 技术氛围浓 优秀伙伴”
44python开发工程师中关村2天前发布20k-30k 经验3-5年 / 本科TRON软件开发,区块链 / 不需要融资 / 150-500人后端 Python“区块链技术,技术氛围,扁平化”
\n", 194 | "
" 195 | ], 196 | "text/plain": [ 197 | " Unnamed: 0 position_name position_location position_time \\\n", 198 | "0 0 Python开发工程师(技术平台) 望京 1天前发布 \n", 199 | "1 1 python开发 朝阳区 1天前发布 \n", 200 | "2 2 python开发工程师 望京 1天前发布 \n", 201 | "3 3 高级python开发工程师 中关村 10:15发布 \n", 202 | "4 4 python开发工程师 中关村 2天前发布 \n", 203 | "\n", 204 | " position_experience position_company position_infos \\\n", 205 | "0 20k-40k 经验3-5年 / 本科 一起教育科技 移动互联网,教育 / D轮及以上 / 2000人以上 \n", 206 | "1 6k-10k 经验不限 / 本科 一网互通 广告营销,移动互联网 / 未融资 / 15-50人 \n", 207 | "2 30k-50k 经验3-5年 / 本科 橙睿科技 移动互联网,社交 / 不需要融资 / 50-150人 \n", 208 | "3 20k-25k 经验5-10年 / 本科 达观数据 人工智能 / B轮 / 150-500人 \n", 209 | "4 20k-30k 经验3-5年 / 本科 TRON 软件开发,区块链 / 不需要融资 / 150-500人 \n", 210 | "\n", 211 | " position_tags position_introdce \n", 212 | "0 教育 运维 “企业福利 平台 发展空间 团队氛围” \n", 213 | "1 大数据 企业服务 后端 Python 爬虫 数据挖掘 “海外社交媒体大数据团队” \n", 214 | "2 电商 “福利好、发展前景” \n", 215 | "3 企业服务 大数据 后端 docker Python C++ “AI准独角兽 技术氛围浓 优秀伙伴” \n", 216 | "4 后端 Python “区块链技术,技术氛围,扁平化” " 217 | ] 218 | }, 219 | "execution_count": 5, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "position_data = pd.read_csv('position_data.csv')\n", 226 | "position_data.head()" 227 | ] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.7.0" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /death_company/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/apple.jpg -------------------------------------------------------------------------------- /death_company/death_company.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from xici import proxy 3 | import pymysql 4 | import multiprocessing 5 | import time 6 | import random 7 | import requests 8 | 9 | def get_data(json_url,proxies): 10 | user_agent_list = [ 11 | # Opera 12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", 13 | "Opera/8.0 (Windows NT 5.1; U; en)", 14 | "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50", 15 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50", 16 | # Firefox 17 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 18 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 19 | # Safari 20 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 21 | # chrome 22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", 23 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 24 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16", 25 | # 360 26 | 
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", 28 | # 淘宝浏览器 29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 30 | # 猎豹浏览器 31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 32 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 33 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 34 | # QQ浏览器 35 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 36 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 37 | # sogou浏览器 38 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", 39 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)", 40 | # maxthon浏览器 41 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", 42 | # UC浏览器 43 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36", 44 | ] 45 | UserAgent = random.choice(user_agent_list) 46 | # print(UserAgent) 47 | headers = { 48 | 'User-Agent': UserAgent, 49 | 'cookie': '_ga=GA1.2.1552320068.1576549906; _gid=GA1.2.2023492117.1576549906; gr_user_id=3d348e20-9820-49f1-afcb-8c55d72ad2d2; MEIQIA_TRACK_ID=1V5jsLMCkebhIrU4wdaptzAcxNh; MEIQIA_VISIT_ID=1V5jsNCG3mS1OZ381JMTk7aWSy4; Hm_lvt_1c587ad486cdb6b962e94fc2002edf89=1576549906,1576553882,1576568223,1576574298; _gat_gtag_UA_59006131_1=1; Hm_lpvt_1c587ad486cdb6b962e94fc2002edf89=1576574523' 50 | } 51 | try: 52 | json_data = requests.get(json_url,headers=headers,proxies=proxies) 53 | if json_data.status_code is 200: 54 | deatail_infos = json_data.json()['data']['info'] 55 | for data in deatail_infos: 56 | # 公司名称 57 | com_name = data['com_name'] 58 | # 公司简介 59 | com_description = data['com_des'] 60 | # 公司创建时间 61 | com_born = data['born'] 62 | # 公司关闭时间 63 | com_change_close_date = data['com_change_close_date'] 64 | # 公司类型 65 | com_style = data['cat_name'] + data['se_cat_name'] 66 | # 公司地址 67 | com_position = data['com_prov'] 68 | # 公司创始人 69 | com_team = data['com_team'] 70 | com_people = '' 71 | for name in com_team: 72 | com_people += name['name'] + '/' + name['per_des'] + ';' 73 | # 公司标签 74 | com_tag = data['com_tag'] 75 | com_tags = '' 76 | for tag in com_tag: 77 | com_tags += tag['tag_name'] + '/' 78 | # 投资轮次 79 | com_fund_status_name = data['com_fund_status_name'] 80 | # 投资公司 81 | com_invst = data['com_invst'] 82 | com_invsts = '' 83 | for com in com_invst: 84 | com_invsts += com['invst_name'] + '/' 85 | # 公司死亡原因 86 | closure_type = data['closure_type'] 87 | death_reason = '' 88 | for da in closure_type: 89 | death_reason += da['name'] + '/' 90 | # 公司存活时间 91 | live_time = data['live_time'] 92 | # 公司资金情况 93 | total_money = data['total_money'] 94 | # 公司类型 95 | cat_name = data['cat_name'] 96 | db = 
pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 97 | cursor = db.cursor() 98 | sql = 'INSERT INTO juzi_death_company_all_info(com_name,com_description,com_born,com_change_close_date,com_style,com_position,com_people,com_tags,com_fund_status_name,com_invsts,death_reason,live_time,total_money,cat_name) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 99 | try: 100 | cursor.execute(sql,(com_name,com_description,com_born,com_change_close_date,com_style,com_position,com_people,com_tags,com_fund_status_name,com_invsts,death_reason,live_time,total_money,cat_name)) 101 | db.commit() 102 | print('{}数据写入成功!!!'.format(com_name)) 103 | except: 104 | print('数据写入失败!') 105 | db.rollback() 106 | else: 107 | print('{}访问不了了!!!'.format(json_url)) 108 | except: 109 | print("{}数据爬取失败".format(json_url)) 110 | 111 | if __name__ == "__main__": 112 | start = time.time() 113 | print('当前环境CPU核数是:{}核'.format(multiprocessing.cpu_count())) 114 | json_urls = [ 'https://www.itjuzi.com/api/closure?com_prov=&sort=&page={}&keyword=&cat_id='.format(i) for i in range(1,629)] 115 | p = multiprocessing.Pool(4) 116 | i = 1 117 | proxies = proxy.get_proxy_random() 118 | for json_url in json_urls: 119 | if (i%35 == 0): 120 | proxy.get_proxy_random() 121 | p.apply_async(get_data,args=(json_url,proxies,)) 122 | i += 1 123 | p.close() 124 | p.join() 125 | print('总耗时:%.5f秒'% float(time.time()-start)) -------------------------------------------------------------------------------- /death_company/death_company_info - 副本.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/death_company_info - 副本.xls -------------------------------------------------------------------------------- /death_company/heart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/heart.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_death_reason.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_death_reason.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_financing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_financing.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_financing_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_financing_pie.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_live_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_live_time.jpg -------------------------------------------------------------------------------- 
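The save_data() cell in the lagou notebook and the insert block in death_company.py above both build a parameterized INSERT from a Python dict. The same pattern as a small self-contained helper, a sketch only: the host, credentials (use your own password), database name spider_data and the example table name are taken from, or assumed to match, the scripts above.

import pymysql

def insert_row(table, row):
    """Insert one dict as a row; column names come from the dict keys."""
    # Connection details mirror the scripts above (local MySQL, db 'spider_data').
    db = pymysql.connect(host='localhost', user='root', password='your_password',
                         port=3306, db='spider_data', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            keys = ', '.join(row.keys())
            placeholders = ', '.join(['%s'] * len(row))
            sql = 'INSERT INTO {} ({}) VALUES ({})'.format(table, keys, placeholders)
            # Passing the values separately lets pymysql handle the escaping.
            cursor.execute(sql, tuple(row.values()))
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()

# e.g. insert_row('juzi_death_company_all_info', {'com_name': '...', 'live_time': '...'})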
/death_company/可视化图表/com_position.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_position.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_position_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_position_pie.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_style.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_style.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_style_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_style_pie.jpg -------------------------------------------------------------------------------- /jianshu/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2019-10-16T06:55:41.289Z] GhostDriver - Main - running on port 50831 2 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.customHeaders: - {} 4 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2019-10-16T06:55:41.398Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: f7b316e0-efe1-11e9-8204-558610a45b39 6 | [ERROR - 2019-10-16T06:55:53.212Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 7 | 8 | phantomjs://platform/console++.js:263 in error 9 | [ERROR - 2019-10-16T06:55:53.212Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 10 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 11 | 12 | phantomjs://platform/console++.js:263 in error 13 | [ERROR - 2019-10-16T06:55:55.207Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 14 | 15 | 
phantomjs://platform/console++.js:263 in error 16 | [ERROR - 2019-10-16T06:55:55.207Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 17 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 18 | 19 | phantomjs://platform/console++.js:263 in error 20 | [ERROR - 2019-10-16T06:55:57.042Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 21 | 22 | phantomjs://platform/console++.js:263 in error 23 | [ERROR - 2019-10-16T06:55:57.042Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 24 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 25 | 26 | phantomjs://platform/console++.js:263 in error 27 | [ERROR - 2019-10-16T06:55:59.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 28 | 29 | phantomjs://platform/console++.js:263 in error 30 | [ERROR - 2019-10-16T06:55:59.032Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 31 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 32 | 33 | phantomjs://platform/console++.js:263 in error 34 | [ERROR - 2019-10-16T06:56:01.124Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 35 | 36 | phantomjs://platform/console++.js:263 in error 37 | [ERROR - 2019-10-16T06:56:01.124Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 38 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 39 | 40 | phantomjs://platform/console++.js:263 in error 41 | [ERROR - 2019-10-16T06:56:03.133Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 42 | 43 | phantomjs://platform/console++.js:263 in error 44 | [ERROR - 2019-10-16T06:56:03.134Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 45 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 46 | 47 | phantomjs://platform/console++.js:263 in error 48 | [ERROR - 2019-10-16T06:56:04.906Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 49 | 50 | phantomjs://platform/console++.js:263 in error 51 | [ERROR - 2019-10-16T06:56:04.906Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 52 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 53 | 54 | phantomjs://platform/console++.js:263 in error 55 | [ERROR - 2019-10-16T06:56:06.585Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 56 | 57 | phantomjs://platform/console++.js:263 in error 58 | [ERROR - 2019-10-16T06:56:06.585Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 59 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 60 | 61 | 
phantomjs://platform/console++.js:263 in error 62 | [ERROR - 2019-10-16T06:56:09.043Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 63 | 64 | phantomjs://platform/console++.js:263 in error 65 | [ERROR - 2019-10-16T06:56:09.043Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 66 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 67 | 68 | phantomjs://platform/console++.js:263 in error 69 | [ERROR - 2019-10-16T06:56:12.430Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 70 | 71 | phantomjs://platform/console++.js:263 in error 72 | [ERROR - 2019-10-16T06:56:12.430Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 73 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 74 | 75 | phantomjs://platform/console++.js:263 in error 76 | [ERROR - 2019-10-16T06:56:14.119Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 77 | 78 | phantomjs://platform/console++.js:263 in error 79 | [ERROR - 2019-10-16T06:56:14.119Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 80 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 81 | 82 | phantomjs://platform/console++.js:263 in error 83 | [ERROR - 2019-10-16T06:56:15.780Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 84 | 85 | phantomjs://platform/console++.js:263 in error 86 | [ERROR - 2019-10-16T06:56:15.780Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 87 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 88 | 89 | phantomjs://platform/console++.js:263 in error 90 | [ERROR - 2019-10-16T06:56:17.465Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 91 | 92 | phantomjs://platform/console++.js:263 in error 93 | [ERROR - 2019-10-16T06:56:17.465Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 94 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 95 | 96 | phantomjs://platform/console++.js:263 in error 97 | [ERROR - 2019-10-16T06:56:19.819Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 98 | 99 | phantomjs://platform/console++.js:263 in error 100 | [ERROR - 2019-10-16T06:56:19.820Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 101 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 102 | 103 | phantomjs://platform/console++.js:263 in error 104 | [ERROR - 2019-10-16T06:56:22.468Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 105 | 106 | phantomjs://platform/console++.js:263 in error 107 
| [ERROR - 2019-10-16T06:56:22.468Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 108 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 109 | 110 | phantomjs://platform/console++.js:263 in error 111 | [ERROR - 2019-10-16T06:56:24.191Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 112 | 113 | phantomjs://platform/console++.js:263 in error 114 | [ERROR - 2019-10-16T06:56:24.191Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 115 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 116 | 117 | phantomjs://platform/console++.js:263 in error 118 | [ERROR - 2019-10-16T06:56:26.351Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 119 | 120 | phantomjs://platform/console++.js:263 in error 121 | [ERROR - 2019-10-16T06:56:26.352Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 122 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 123 | 124 | phantomjs://platform/console++.js:263 in error 125 | [ERROR - 2019-10-16T06:56:28.419Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 126 | 127 | phantomjs://platform/console++.js:263 in error 128 | [ERROR - 2019-10-16T06:56:28.419Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 129 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 130 | 131 | phantomjs://platform/console++.js:263 in error 132 | [ERROR - 2019-10-16T06:56:30.058Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 133 | 134 | phantomjs://platform/console++.js:263 in error 135 | [ERROR - 2019-10-16T06:56:30.058Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 136 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 137 | 138 | phantomjs://platform/console++.js:263 in error 139 | [ERROR - 2019-10-16T06:56:32.359Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 140 | 141 | phantomjs://platform/console++.js:263 in error 142 | [ERROR - 2019-10-16T06:56:32.359Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 143 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 144 | 145 | phantomjs://platform/console++.js:263 in error 146 | [ERROR - 2019-10-16T06:56:34.092Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 147 | 148 | phantomjs://platform/console++.js:263 in error 149 | [ERROR - 2019-10-16T06:56:34.092Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 150 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 151 | 152 | 
phantomjs://platform/console++.js:263 in error 153 | [ERROR - 2019-10-16T06:56:36.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 154 | 155 | phantomjs://platform/console++.js:263 in error 156 | [ERROR - 2019-10-16T06:56:36.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 157 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 158 | 159 | phantomjs://platform/console++.js:263 in error 160 | [ERROR - 2019-10-16T06:56:37.748Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 161 | 162 | phantomjs://platform/console++.js:263 in error 163 | [ERROR - 2019-10-16T06:56:37.748Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 164 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 165 | 166 | phantomjs://platform/console++.js:263 in error 167 | [ERROR - 2019-10-16T06:56:39.790Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 168 | 169 | phantomjs://platform/console++.js:263 in error 170 | [ERROR - 2019-10-16T06:56:39.790Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 171 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 172 | 173 | phantomjs://platform/console++.js:263 in error 174 | [ERROR - 2019-10-16T06:56:41.954Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 175 | 176 | phantomjs://platform/console++.js:263 in error 177 | [ERROR - 2019-10-16T06:56:41.954Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 178 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 179 | 180 | phantomjs://platform/console++.js:263 in error 181 | [ERROR - 2019-10-16T06:56:44.333Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 182 | 183 | phantomjs://platform/console++.js:263 in error 184 | [ERROR - 2019-10-16T06:56:44.333Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 185 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 186 | 187 | phantomjs://platform/console++.js:263 in error 188 | [ERROR - 2019-10-16T06:56:46.202Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 189 | 190 | phantomjs://platform/console++.js:263 in error 191 | [ERROR - 2019-10-16T06:56:46.202Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 192 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 193 | 194 | phantomjs://platform/console++.js:263 in error 195 | [ERROR - 2019-10-16T06:56:47.870Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 196 | 197 | 
phantomjs://platform/console++.js:263 in error 198 | [ERROR - 2019-10-16T06:56:47.870Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 199 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 200 | 201 | phantomjs://platform/console++.js:263 in error 202 | [ERROR - 2019-10-16T06:56:49.511Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 203 | 204 | phantomjs://platform/console++.js:263 in error 205 | [ERROR - 2019-10-16T06:56:49.511Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 206 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 207 | 208 | phantomjs://platform/console++.js:263 in error 209 | [ERROR - 2019-10-16T06:56:51.597Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 210 | 211 | phantomjs://platform/console++.js:263 in error 212 | [ERROR - 2019-10-16T06:56:51.597Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 213 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 214 | 215 | phantomjs://platform/console++.js:263 in error 216 | [ERROR - 2019-10-16T06:56:53.240Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 217 | 218 | phantomjs://platform/console++.js:263 in error 219 | [ERROR - 2019-10-16T06:56:53.240Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 220 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 221 | 222 | phantomjs://platform/console++.js:263 in error 223 | [ERROR - 2019-10-16T06:56:55.317Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 224 | 225 | phantomjs://platform/console++.js:263 in error 226 | [ERROR - 2019-10-16T06:56:55.317Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 227 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 228 | 229 | phantomjs://platform/console++.js:263 in error 230 | [ERROR - 2019-10-16T06:56:57.056Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 231 | 232 | phantomjs://platform/console++.js:263 in error 233 | [ERROR - 2019-10-16T06:56:57.056Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 234 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 235 | 236 | phantomjs://platform/console++.js:263 in error 237 | [ERROR - 2019-10-16T06:56:58.943Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 238 | 239 | phantomjs://platform/console++.js:263 in error 240 | [ERROR - 2019-10-16T06:56:58.944Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 241 | (anonymous function) 
(https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 242 | 243 | phantomjs://platform/console++.js:263 in error 244 | [ERROR - 2019-10-16T06:57:00.550Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 245 | 246 | phantomjs://platform/console++.js:263 in error 247 | [ERROR - 2019-10-16T06:57:00.550Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 248 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 249 | 250 | phantomjs://platform/console++.js:263 in error 251 | [ERROR - 2019-10-16T06:57:02.730Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 252 | 253 | phantomjs://platform/console++.js:263 in error 254 | [ERROR - 2019-10-16T06:57:02.731Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 255 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 256 | 257 | phantomjs://platform/console++.js:263 in error 258 | [ERROR - 2019-10-16T06:57:04.680Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 259 | 260 | phantomjs://platform/console++.js:263 in error 261 | [ERROR - 2019-10-16T06:57:04.680Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 262 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 263 | 264 | phantomjs://platform/console++.js:263 in error 265 | -------------------------------------------------------------------------------- /jianshu/jianshu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__init__.py -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JianshuItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | author = scrapy.Field() 16 | time = scrapy.Field() 17 | word_num = scrapy.Field() 18 | read_num = scrapy.Field() 19 | content = scrapy.Field() 20 | comment_num = scrapy.Field() 21 | support_num = scrapy.Field() 22 | process_url = scrapy.Field() 23 | article_id = scrapy.Field() 24 | origin_url = scrapy.Field() 25 | tags = scrapy.Field() 26 | pass 27 | -------------------------------------------------------------------------------- /jianshu/jianshu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from selenium import webdriver 10 | import time 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.support import expected_conditions as EC 14 | from scrapy.http.response.html import HtmlResponse 15 | 16 | class SeleniumDownloadMiddleware(object): 17 | 18 | def __init__(self): 19 | self.borswer = webdriver.Chrome(r"E:/0_软件/3_爬虫相关软件/chromedriver_win32/chromedriver.exe") 20 | self.wait = WebDriverWait(self.borswer,10) 21 | 22 | def process_request(self, request, spider): 23 | self.borswer.get(request.url) 24 | print('我正在使用Selenium加载网页!!!') 25 | time.sleep(1) 26 | try: 27 | while True: 28 | showmore = self.borswer.browser.find_element_by_class_name('H7E3vT') 29 | showmore.click() 30 | time.sleep(0.3) 31 | if not showmore: 32 | break 33 | except: 34 | pass 35 | source = self.borswer.page_source 36 | response = HtmlResponse(url=self.borswer.current_url, request=request, body=source, encoding='utf-8') 37 | return response 38 | 39 | 40 | class JianshuSpiderMiddleware(object): 41 | # Not all methods need to be defined. If a method is not defined, 42 | # scrapy acts as if the spider middleware does not modify the 43 | # passed objects. 44 | 45 | @classmethod 46 | def from_crawler(cls, crawler): 47 | # This method is used by Scrapy to create your spiders. 48 | s = cls() 49 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 50 | return s 51 | 52 | def process_spider_input(self, response, spider): 53 | # Called for each response that goes through the spider 54 | # middleware and into the spider. 55 | 56 | # Should return None or raise an exception. 
57 | return None 58 | 59 | def process_spider_output(self, response, result, spider): 60 | # Called with the results returned from the Spider, after 61 | # it has processed the response. 62 | 63 | # Must return an iterable of Request, dict or Item objects. 64 | for i in result: 65 | yield i 66 | 67 | def process_spider_exception(self, response, exception, spider): 68 | # Called when a spider or process_spider_input() method 69 | # (from other spider middleware) raises an exception. 70 | 71 | # Should return either None or an iterable of Response, dict 72 | # or Item objects. 73 | pass 74 | 75 | def process_start_requests(self, start_requests, spider): 76 | # Called with the start requests of the spider, and works 77 | # similarly to the process_spider_output() method, except 78 | # that it doesn’t have a response associated. 79 | 80 | # Must return only requests (not items). 81 | for r in start_requests: 82 | yield r 83 | 84 | def spider_opened(self, spider): 85 | spider.logger.info('Spider opened: %s' % spider.name) 86 | 87 | 88 | class JianshuDownloaderMiddleware(object): 89 | # Not all methods need to be defined. If a method is not defined, 90 | # scrapy acts as if the downloader middleware does not modify the 91 | # passed objects. 92 | 93 | @classmethod 94 | def from_crawler(cls, crawler): 95 | # This method is used by Scrapy to create your spiders. 96 | s = cls() 97 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 98 | return s 99 | 100 | def process_request(self, request, spider): 101 | # Called for each request that goes through the downloader 102 | # middleware. 103 | 104 | # Must either: 105 | # - return None: continue processing this request 106 | # - or return a Response object 107 | # - or return a Request object 108 | # - or raise IgnoreRequest: process_exception() methods of 109 | # installed downloader middleware will be called 110 | return None 111 | 112 | def process_response(self, request, response, spider): 113 | # Called with the response returned from the downloader. 114 | 115 | # Must either; 116 | # - return a Response object 117 | # - return a Request object 118 | # - or raise IgnoreRequest 119 | return response 120 | 121 | def process_exception(self, request, exception, spider): 122 | # Called when a download handler or a process_request() 123 | # (from other downloader middleware) raises an exception. 
124 | 125 | # Must either: 126 | # - return None: continue processing this exception 127 | # - return a Response object: stops process_exception() chain 128 | # - return a Request object: stops process_exception() chain 129 | pass 130 | 131 | def spider_opened(self, spider): 132 | spider.logger.info('Spider opened: %s' % spider.name) -------------------------------------------------------------------------------- /jianshu/jianshu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | class JianshuPipeline(object): 10 | def __init__(self): 11 | params = { 12 | 'host':'localhost', 13 | 'user':'root', 14 | 'password':'pass4321', 15 | 'port':3306, 16 | 'db':'jianshu', 17 | 'charset':'utf8' 18 | } 19 | self.db = pymysql.connect(**params) 20 | self.cursor = self.db.cursor() 21 | self._sql = None 22 | 23 | @property 24 | def sql(self): 25 | if not self._sql: 26 | self._sql = '''INSERT INTO article (title,author,pub_time,word_num,read_num,content,comment_num,support_num,process_url,article_id,origin_url,tags)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' 27 | return self._sql 28 | return self._sql 29 | 30 | def process_item(self, item, spider): 31 | self.cursor.execute(self.sql,(item['title'],item['author'],item['time'],item['word_num'],item['read_num'],item['content'],item['comment_num'],item['support_num'],item['process_url'],item['article_id'],item['origin_url'],item['tags'])) 32 | self.db.commit() 33 | return item 34 | -------------------------------------------------------------------------------- /jianshu/jianshu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jianshu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jianshu' 13 | 14 | SPIDER_MODULES = ['jianshu.spiders'] 15 | NEWSPIDER_MODULE = 'jianshu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jianshu (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'jianshu.middlewares.JianshuSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543, 58 | 'jianshu.middlewares.SeleniumDownloadMiddleware': 543, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'jianshu.pipelines.JianshuPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
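settings.py above enables jianshu.middlewares.SeleniumDownloadMiddleware, which drives Chrome and hands Scrapy an HtmlResponse instead of letting the downloader fetch the page. For reference, a minimal sketch of the same idea (assuming chromedriver is available on PATH) that also quits the browser when the spider closes, something the version in middlewares.py never does:

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # Close the browser when the spider finishes.
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def process_request(self, request, spider):
        # Render the page in a real browser and return the HTML to Scrapy.
        self.driver.get(request.url)
        return HtmlResponse(url=self.driver.current_url, request=request,
                            body=self.driver.page_source, encoding='utf-8')

    def spider_closed(self, spider):
        self.driver.quit()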
-------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__pycache__/js.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/spiders/__pycache__/js.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/js.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from jianshu.items import JianshuItem 6 | 7 | class JsSpider(CrawlSpider): 8 | name = 'js' 9 | allowed_domains = ["jianshu.com"] 10 | start_urls = ["https://www.jianshu.com/"] 11 | rules = ( 12 | Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True), 13 | ) 14 | 15 | def parse_detail(self, response): 16 | title = response.xpath('//h1[@class="_1RuRku"]/text()').get() 17 | author = response.xpath('//span[@class="_22gUMi"]/text()').get() 18 | time = response.xpath('//div[@class="s-dsoj"]/time/text()').get() 19 | word_num = response.xpath('//div[@class="s-dsoj"]/span/text()').getall()[0].split(' ')[-1] 20 | read_num = response.xpath('//div[@class="s-dsoj"]/span/text()').getall()[1].split(' ')[-1] 21 | content = response.xpath('//article[@class="_2rhmJa"]').get() 22 | comment_num = response.xpath('//div[@class="-pXE92"]//span/text()').getall()[1] 23 | support_num = response.xpath('//div[@class="-pXE92"]//span/text()').getall()[-1] 24 | process_url = response.url.split('?')[0] 25 | article_id = process_url.split('/')[-1] 26 | tags = response.xpath('//div[@class="_2Nttfz"]/a//span/text()').getall() 27 | tags = ','.join(tags) 28 | origin_url = response.url 29 | item = JianshuItem(title=title,author=author,time=time,word_num=word_num,read_num=read_num, 30 | content=content,comment_num=comment_num,support_num=support_num,process_url=process_url,article_id=article_id, 31 | origin_url=origin_url,tags=tags) 32 | return item 33 | -------------------------------------------------------------------------------- /jianshu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jianshu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jianshu 12 | -------------------------------------------------------------------------------- /jianshu/start.py: 
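The Rule in js.py above selects article pages with the pattern r'.*/p/[0-9a-z]{12}.*'. A quick stand-alone sanity check of that pattern; the URLs below are made-up examples, not data from the crawl:

import re

# re.search mirrors LinkExtractor's allow= behaviour closely enough here,
# since the pattern already starts and ends with '.*'.
PATTERN = re.compile(r'.*/p/[0-9a-z]{12}.*')

examples = [
    'https://www.jianshu.com/p/0123456789ab',        # article page: matches
    'https://www.jianshu.com/u/abcdef123456',        # user page, no /p/ segment: no match
    'https://www.jianshu.com/p/0123456789ab?utm=1',  # trailing query string still matches
]
for url in examples:
    print(url, bool(PATTERN.search(url)))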
-------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | execute('scrapy crawl js'.split(' ')) -------------------------------------------------------------------------------- /ppt_download_spider/ppt_download_spider.py: -------------------------------------------------------------------------------- 1 | from urllib import request,error 2 | from lxml import etree 3 | import os 4 | import time 5 | 6 | class Get_PPT(): 7 | 8 | def __init__(self): 9 | 10 | self.base_url = 'http://www.1ppt.com' 11 | self.page_url = 'ppt_dabian_{}.html'.format(1) 12 | self.header = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' 14 | } 15 | 16 | # 获取进入ppt详细介绍页面的url 17 | def get_urls(self,url): 18 | req = request.Request(url, headers=self.header) 19 | response = request.urlopen(req) 20 | text = response.read().decode('gb2312') 21 | html = etree.HTML(text) 22 | # 获取进入ppt详细介绍页面的url 23 | detail_url = html.xpath("//ul[@class='tplist']//h2/a/@href") 24 | ppt_urls = [] 25 | for url in detail_url: 26 | ppt_urls.append(self.base_url+url) 27 | return ppt_urls 28 | # 获取详细页面中PPT下载的链接 29 | def get_down_ppt_url(self,url,file_name): 30 | ppt_urls = self.get_urls(url) 31 | for url in ppt_urls: 32 | print("下载链接为{}的ppt模板".format(url)) 33 | req = request.Request(url, headers=self.header) 34 | response = request.urlopen(req) 35 | text = response.read().decode('gb2312') 36 | html = etree.HTML(text) 37 | name = html.xpath("//div[@class='ppt_info clearfix']/h1/text()")[0] 38 | down_url = html.xpath("//ul[@class='downurllist']//a/@href")[0] 39 | try: 40 | time.sleep(1) 41 | spon = request.urlopen(down_url) 42 | filename = file_name + '/' + '{}.zip'.format(name) 43 | with open(filename,"wb") as code: 44 | code.write(spon.read()) 45 | except error.HTTPError: 46 | print("403 Forbidden!!!!") 47 | if __name__ == "__main__": 48 | ppt = Get_PPT() 49 | # 获取论文答辩模块的PPT模板(可根据自己需求更改、此时页数也需要根据此模块的情况进行修改) 50 | for i in range(1,9): 51 | print("下载第{}页的PPT".format(i)) 52 | lunwen_url = '/xiazai/dabian/ppt_dabian_{}.html'.format(i) 53 | file_name = "第{}页PPT总和".format(i) 54 | if not os.path.exists(file_name): 55 | os.mkdir(file_name) 56 | url = ppt.base_url + lunwen_url 57 | ppt.get_down_ppt_url(url,file_name) -------------------------------------------------------------------------------- /proxy_design/__pycache__/connect_redis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/proxy_design/__pycache__/connect_redis.cpython-37.pyc -------------------------------------------------------------------------------- /proxy_design/connect_redis.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | class RedisClient(object): 4 | def __init__(self): 5 | self.key = 'proxy' 6 | if not hasattr(self, 'pool'): 7 | self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 8 | self.getConnection() 9 | 10 | 11 | def getConnection(self): 12 | self._conn = redis.StrictRedis(connection_pool=self.pool) 13 | 14 | 15 | def add(self, value): 16 | return self._conn.sadd(self.key, value) 17 | 18 | 19 | def random(self): 20 | return self._conn.srandmember(self.key) 21 | 22 | 23 | def delete(self, value): 24 | return self._conn.srem(self.key, value) 25 | 26 | 27 | r = RedisClient() 
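connect_redis.py wraps a single Redis set named 'proxy' and exports a ready-made client instance r, which proxy.py below imports directly. A quick usage example, assuming a local Redis server on 127.0.0.1:6379 as configured above (srandmember returns bytes, which is why callers decode with utf-8):

from connect_redis import r

r.add('1.2.3.4:8080')      # SADD proxy "1.2.3.4:8080" -> 1 if the member was newly added
print(r.random())          # e.g. b'1.2.3.4:8080', or None when the set is empty
r.delete('1.2.3.4:8080')   # SREM proxy "1.2.3.4:8080" -> 1 if the member was removed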
-------------------------------------------------------------------------------- /proxy_design/proxy.py: --------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from connect_redis import r
4 | import multiprocessing
5 | 
6 | class Proxy(object):
7 |     def __init__(self):
8 |         self.key = 'proxy'
9 |         self.headers = {
10 |             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36'
11 |         }
12 |         self.proxy_list = []
13 |         self.url = ['http://www.66ip.cn/{}.html'.format(i) for i in range(1, 21)]
14 | 
15 |     def getContent(self):
16 |         '''
17 |         获取网站源代码
18 |         :return:
19 |         '''
20 |         # url = 'http://www.66ip.cn/1.html'
21 |         # 由于一页仅有5个代理IP,因此默认获取20页
22 |         for url in self.url:
23 |             data = requests.get(url, headers=self.headers)
24 |             content = data.text.encode('ISO-8859-1').decode(requests.utils.get_encodings_from_content(data.text)[0])
25 |             self.parse(content)
26 | 
27 |     def parse(self, content):
28 |         '''
29 |         解析网站源代码
30 |         :return:
31 |         '''
32 |         html = etree.HTML(content)
33 |         result = html.xpath("//div[@align='center']/table/tr")[1:]
34 |         for row in result:
35 |             proxy_ip = row.xpath("./td/text()")[0]
36 |             proxy_port = row.xpath("./td/text()")[1]
37 |             proxy_address = row.xpath("./td/text()")[2]
38 |             proxy_style = row.xpath("./td/text()")[3]
39 |             proxy_check_time = row.xpath("./td/text()")[4]
40 |             # 将获取的代理存放至列表中
41 |             self.proxy_list.append(proxy_ip+'+'+proxy_port+'+'+proxy_address+'+'+proxy_style+'+'+proxy_check_time)
42 |             proxy_value = proxy_ip+":"+proxy_port
43 |             print(proxy_value)
44 |             # 将代理添加至redis数据库中
45 |             r.add(proxy_value)
46 | 
47 |     def get_proxy_random(self):
48 |         '''
49 |         随机获取代理地址
50 |         :return:
51 |         '''
52 |         # 通过其是否可以访问百度验证其有效性
53 |         url = 'https://www.baidu.com'
54 |         value = r.random()
55 |         print(value)
56 |         if value is None:
57 |             self.getContent(); value = r.random()  # 代理池为空时先抓取代理,再重新随机取一个
58 |         proxies = {"http": "http://" + value.decode("utf-8")}
59 |         print(proxies)
60 |         try:
61 |             data = requests.get(url=url, headers=self.headers, proxies=proxies, timeout=5)
62 |             if data.status_code != 200:
63 |                 print('代理无效,进行删除')
64 |                 r.delete(value)
65 |                 return self.get_proxy_random()
66 |             else:
67 |                 print('可以访问百度网页!有效代理')
68 |                 return 'http://' + value.decode("utf-8")
69 |         except Exception:
70 |             print('代理无效,进行删除')
71 |             r.delete(value)
72 |             return self.get_proxy_random()
73 | 
74 | proxy = Proxy()
75 | # ip = proxy.get_proxy_random()
76 | # print(ip)
77 | 
78 | 
-------------------------------------------------------------------------------- /stock/A_stock_company.py: --------------------------------------------------------------------------------
1 | import requests
2 | import pymysql
3 | from lxml import etree
4 | import time
5 | import random
6 | 
7 | 
8 | def crawl_stock_company(url, connect):
9 |     headers = {
10 |         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \
11 |             AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \
12 |             Safari/537.36"
13 |     }
14 |     data = requests.get(url, headers=headers).text
15 |     html = etree.HTML(data)
16 |     tr_data = html.xpath("//table[@id='myTable04']//tbody/tr")
17 |     for tr in tr_data:
18 |         stock_id = tr.xpath(".//td//text()")[1]
19 |         stock_name = tr.xpath(".//td//text()")[2]
20 |         company_name = tr.xpath(".//td//text()")[3]
21 |         company_province = tr.xpath(".//td//text()")[4]
22 |         company_loc = tr.xpath(".//td//text()")[5]
23 |         company_num = tr.xpath(".//td//text()")[8]
24 |         company_create_date = tr.xpath(".//td//text()")[9]
25 |         company_trade = tr.xpath(".//td//text()")[-3]
26 |         company_business = 
tr.xpath(".//td//text()")[-1] 27 | value = (stock_id, stock_name, company_name, company_province, 28 | company_loc, company_num, company_create_date, 29 | company_trade, company_business) 30 | sql = "insert into All_Stock_Name(stock_id, stock_name, \ 31 | company_name, company_province, company_loc, company_num, \ 32 | company_create_date,company_trade, company_business) values \ 33 | (%s, %s, %s, %s, %s, %s, %s, %s, %s)" 34 | cursor = connect.cursor() 35 | cursor.execute(sql, value) 36 | connect.commit() 37 | print(value) 38 | 39 | 40 | if __name__ == "__main__": 41 | connect = pymysql.connect(host='xxxxx', user='root', password= 42 | 'xxxxxx', port=3306, db='stock') 43 | file = open("new_{}.text".format(time.time()), "w") 44 | for i in range(1, 207): 45 | try: 46 | url = "https://s.askci.com/stock/a/0-0?reportTime=2020-03-31&pageNum={}#\ 47 | QueryCondition".format(i) 48 | crawl_stock_company(url, connect) 49 | time.sleep(random.randint(1, 2)) 50 | except(Exception): 51 | print("股票抓取失败!!!") 52 | file.write("股票{}数据未抓取成功!".format(i)+"\n") 53 | -------------------------------------------------------------------------------- /stock/stock_spider_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | """ 3 | * @Author: Jack Shan 4 | * @Date: 2020-11-17 14:20:17 5 | * @Last Modified by: Jack Shan 6 | * @Last Modified time: 2020-11-17 14:20:17 7 | """ 8 | # 相关库的导入 9 | import requests 10 | import pymysql 11 | from lxml import etree 12 | import pandas as pd 13 | import threading 14 | import time 15 | import datetime 16 | import random 17 | from queue import Queue 18 | import logging 19 | import os 20 | 21 | 22 | # 添加日志相关内容 23 | # 创建一个logger 24 | logger = logging.getLogger() 25 | logger.setLevel(level=logging.INFO) 26 | # 创建handler,用于写入日志文件 27 | file = time.strftime("%Y%m%d%H%M", time.localtime(time.time())) 28 | log_path = os.path.dirname(os.getcwd()) + '/stock/Logs/' 29 | log_name = log_path + file + '.log' 30 | logfile = log_name 31 | fh = logging.FileHandler(logfile, mode="w", encoding="UTF-8") 32 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 33 | ch = logging.StreamHandler() 34 | ch.setLevel(logging.WARNING) # 输出到console的log等级的开关 35 | # 第三步,定义handler的输出格式 36 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] -\ 37 | %(levelname)s: %(message)s") 38 | fh.setFormatter(formatter) 39 | ch.setFormatter(formatter) 40 | # 第四步,将logger添加到handler里面 41 | logger.addHandler(fh) 42 | logger.addHandler(ch) 43 | # 定义日志 44 | 45 | 46 | class CrwalStockName(threading.Thread): 47 | 48 | headers = { 49 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \ 50 | AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \ 51 | Safari/537.36" 52 | } 53 | 54 | def __init__(self, page_queue, stock_name, *args, **kwargs): 55 | super(CrwalStockName, self).__init__(*args, **kwargs) 56 | self.page_queue = page_queue 57 | self.stock_name = stock_name 58 | 59 | def run(self): 60 | while True: 61 | if self.page_queue.empty(): 62 | break 63 | url = self.page_queue.get() 64 | if "funds" not in url: 65 | self.parse_page(url) 66 | else: 67 | self.parse_page_detail(url) 68 | 69 | def parse_page(self, url): 70 | try: 71 | # logger.info('{}网页解析完成'.format(url)) 72 | response = requests.get(url=url, headers=self.headers) 73 | html = etree.HTML(response.text) 74 | text1 = "".join(html.xpath("//div[@id='history_funds_analysis_free']\ 75 | //p//text()")) 76 | text2 = html.xpath("//p[@class='zjlxlstj_txt mb14']//\ 77 | 
text()")[0].strip() 78 | text = text1 + "&" + text2 79 | stock_id = str(url).split("/")[-2] 80 | date = datetime.datetime.now() 81 | today = "{}/{}/{}".format(date.year, date.month, date.day) 82 | self.stock_name.put((text, stock_id, today)) 83 | time.sleep(random.randint(1, 2)) 84 | except(Exception): 85 | logger.info('{}网页解析失败'.format(url)) 86 | self.page_queue.put(url) 87 | 88 | def parse_page_detail(self, url): 89 | try: 90 | # logger.info('{}网页解析完成'.format(url)) 91 | response = requests.get(url=url, headers=self.headers) 92 | html = etree.HTML(response.text) 93 | data = html.xpath("//div[@id='history_table_free']//tr")[2:] 94 | stock_id = str(url).split("/")[-3] 95 | for i in data: 96 | date = tuple(i.xpath(".//td//text()")) 97 | value = (stock_id, date[0], date[1], date[2], date[3], date[4], 98 | date[5], date[6], date[7], date[8], date[9], date[10]) 99 | self.stock_name.put(value) 100 | time.sleep(random.randint(1, 2)) 101 | except(Exception): 102 | logger.info('{}网页解析失败'.format(url)) 103 | self.page_queue.put(url) 104 | 105 | 106 | class StockNameConsumer(threading.Thread): 107 | 108 | headers = { 109 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \ 110 | AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \ 111 | Safari/537.36" 112 | } 113 | 114 | sql_recode = [] 115 | 116 | def __init__(self, page_queue, stock_name, connect, *args, **kwargs): 117 | super(StockNameConsumer, self).__init__(*args, **kwargs) 118 | self.page_queue = page_queue 119 | self.stock_name = stock_name 120 | self.connect = connect 121 | 122 | def run(self): 123 | while True: 124 | if self.stock_name.empty(): 125 | if self.page_queue.empty(): 126 | return 127 | data = self.stock_name.get() 128 | print(data) 129 | if len(data) == 3: 130 | self.save_data(data) 131 | else: 132 | self.save_data_detail(data) 133 | 134 | def save_data(self, data): 135 | try: 136 | print(data[0], data[1]) 137 | self.connect.ping(reconnect=True) 138 | # sql = "update stock_name_new_copy2 set text = '{}' where id = {}" 139 | # .format(data[0], data[1]) 140 | sql = "insert into stock_name_new_copy3(text, id, date) values (%s, \ 141 | %s, %s)" 142 | cursor = self.connect.cursor() 143 | cursor.execute(sql, tuple(data)) 144 | self.connect.commit() 145 | except(Exception): 146 | logger.error('{}数据保存数据库失败'.format(data)) 147 | 148 | def save_data_detail(self, data): 149 | try: 150 | # logger.info('{}数据保存完成'.format(data)) 151 | self.connect.ping(reconnect=True) 152 | sql = "insert into stock_price_new_copy3(id, tr_time, end_price, \ 153 | up_down, money_in, d5_in_big, b_money, b_part, m_money, m_part,\ 154 | l_money, l_part) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, \ 155 | %s, %s)" 156 | cursor = self.connect.cursor() 157 | cursor.execute(sql, data) 158 | self.connect.commit() 159 | except(Exception): 160 | logger.error('{}数据保存数据库失败'.format(data)) 161 | 162 | 163 | def main(): 164 | connect = pymysql.connect(host='xxxxx', user='root', password='\ 165 | xxxxx', port=3306, db='stock') 166 | page_queue = Queue() 167 | stock_name = Queue() 168 | stock_id = pd.read_sql(sql='select distinct stock_id from All_Stock_Name', 169 | con=connect) 170 | stock_list = list(stock_id["stock_id"]) 171 | for i in stock_list: 172 | text_url = "http://stockpage.10jqka.com.cn/"+i+"/" 173 | detail_url = "http://stockpage.10jqka.com.cn/"+i+"/funds/" 174 | page_queue.put(text_url) 175 | page_queue.put(detail_url) 176 | 177 | for i in range(2): 178 | t = CrwalStockName(page_queue=page_queue, stock_name=stock_name) 179 | t.start() 180 | 
181 | for i in range(3): 182 | t = StockNameConsumer(page_queue=page_queue, stock_name=stock_name, 183 | connect=connect) 184 | t.start() 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /vehicle_home/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = vehicle_home.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = vehicle_home 12 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__init__.py -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/items.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/items.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/pipelines.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/pipelines.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class VehicleHomeItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | # info_length = scrapy.Field() 13 | usercont = scrapy.Field() 14 | vehicle_style = scrapy.Field() 15 | vehicle_by_location = scrapy.Field() 16 | vehicle_seller = scrapy.Field() 17 | vehcle_seller_date = scrapy.Field() 18 | vehicle_seller_money = scrapy.Field() 19 | vehicle_status = scrapy.Field() 20 | vehicle_evaluate = scrapy.Field() 21 | vehicle_bu_aim = scrapy.Field() 22 | evalute_date = scrapy.Field() 23 | evalute_title = scrapy.Field() 24 | evalute_infos 
= scrapy.Field() 25 | visit_count = scrapy.Field() 26 | helpful_count = scrapy.Field() 27 | comment_count = scrapy.Field() 28 | vehicle_brand = scrapy.Field() 29 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class VehicleHomeSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class VehicleHomeDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | # from itemadapter import ItemAdapter 9 | import pymysql 10 | 11 | 12 | class VehicleHomePipeline: 13 | def __init__(self): 14 | self.connect = pymysql.connect( 15 | host='xxxx', user='root', 16 | password='xxxxx', db='Spider', 17 | port=3306) 18 | self.cursor = self.connect.cursor() 19 | 20 | def process_item(self, item, spider): 21 | print("----------开启数据库存储模式----------------") 22 | self.connect.ping(reconnect=True) 23 | sql = 'insert into vehicle_home_new(usercont, vehicle_style, vehicle_by_location, vehicle_seller, vehicle_brand, vehcle_seller_date, vehicle_seller_money, vehicle_status, vehicle_evaluate,vehicle_bu_aim,evalute_date,evalute_title,visit_count,helpful_count,comment_count,evalute_infos) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 24 | self.cursor.execute(sql, (item["usercont"], ";".join(item["vehicle_style"]), item['vehicle_by_location'], item['vehicle_seller'], item['vehicle_brand'].split("-")[0], item['vehcle_seller_date'], "".join(item['vehicle_seller_money']), " ".join(item['vehicle_status']), " ".join(item['vehicle_evaluate']), " ".join(item['vehicle_bu_aim']), item['evalute_date'], item['evalute_title'], item['visit_count'], item['helpful_count'], item['comment_count'], item["evalute_infos"])) 25 | self.connect.commit() 26 | return item 27 | 28 | def close_spider(self, spider): 29 | print('----------关闭数据库资源-----------') 30 | # 关闭游标 31 | self.cursor.close() 32 | # 关闭连接 33 | self.connect.close() 34 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for vehicle_home project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'vehicle_home' 11 | 12 | SPIDER_MODULES = ['vehicle_home.spiders'] 13 | NEWSPIDER_MODULE = 'vehicle_home.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = 'vehicle_home (+http://www.yourdomain.com)' 18 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 100 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 0.01 30 | # The download delay setting will honor only one of: 31 | CONCURRENT_REQUESTS_PER_DOMAIN = 100 32 | CONCURRENT_REQUESTS_PER_IP = 100 33 | 34 | # Disable cookies (enabled by default) 35 | COOKIES_ENABLED = False 36 | 37 | LOG_FILE = 'log.txt' 38 | LOG_LEVEL = "INFO" 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'vehicle_home.middlewares.VehicleHomeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'vehicle_home.middlewares.VehicleHomeDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'vehicle_home.pipelines.VehicleHomePipeline': 300, 71 | } 72 | 73 | FEED_EXPORT_ENCODING = "gb18030" 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/test.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/vehicle_style.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/vehicle_style.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import json 4 | 5 | 6 | data = requests.get("https://k.autohome.com.cn/detail/view_01enhzwr4z6csk0c9j70r00000.html#pvareaid=2112108").text 7 | html = etree.HTML(data).xpath("//div[@class='choose-con']//dl") 8 | seller_id = html[2].xpath(".//a/@data-val")[0] 9 | data_evalid = html[2].xpath(".//a/@data-evalid")[0] 10 | url_api = "https://k.autohome.com.cn/frontapi/GetDealerInfor?dearerandspecIdlist=" + seller_id + "," + data_evalid +"|" 11 | data = requests.get(url_api).text 12 | 13 | seller_name = json.loads(data)["result"]["List"][0]["CompanySimple"] 14 | print(seller_id, data_evalid) 15 | 16 | print(seller_name) 17 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/vehicle_style.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from vehicle_home.items import VehicleHomeItem 3 | import re 4 | import json 5 | 6 | 7 | class VehicleStyleSpider(scrapy.Spider): 8 | name = 'vehicle_style' 9 | allowed_domains = ['k.autohome.com.cn'] 10 | start_urls = ['https://k.autohome.com.cn/suva01/'] 11 | 12 | def parse(self, response): 13 | # 获取车型级别链接 14 | vehicle_style_url = response.xpath("//div[@class='findcont-choose']/a/\ 15 | @href").getall() 16 | # 获取车型级别下属所有车型的链接 17 | vehicle_style_one_url = response.xpath("//ul[@class='list-cont']/li/div\ 18 | [@class='cont-pic']/a/@href").getall() 19 | # yield { 20 | # "vehicle_style_one_url": vehicle_style_one_url, 21 | # # "vehicle_style_url": vehicle_style_url 22 | # } 23 | if vehicle_style_url is not None: 24 | for url in vehicle_style_url: 25 | yield 
response.follow(url, self.parse) 26 | if vehicle_style_one_url is not None: 27 | for url in vehicle_style_one_url: 28 | yield response.follow(url, self.parse_vehicle_detail) 29 | 30 | def parse_vehicle_detail(self, response): 31 | item = VehicleHomeItem() 32 | # 每个用户评价的口碑详细信息 33 | kou_bei_detail_url = response.xpath("//div[@class='allcont border-b-solid']\ 34 | //a/@href").getall() 35 | # 获取品牌及其名称 36 | vehicle_brand = response.xpath("//div[@class='subnav']//div[@class='subnav-title-name']/a/text()").get() 37 | item["vehicle_brand"] = vehicle_brand 38 | # yield items 39 | # 下一页链接 40 | kou_bei_next_url = response.xpath("//div[@class='page']//a[@class=\ 41 | 'page-item-next']/@href").get() 42 | if kou_bei_detail_url is not None: 43 | for url in kou_bei_detail_url: 44 | # print(url) 45 | url = "https:" + url 46 | yield scrapy.Request(url=url, callback=self.parse_vehicle_detail_infos, meta={'item': item}) 47 | # yield response.follow(url, self.parse_vehicle_detail_infos) 48 | if kou_bei_next_url is not None: 49 | yield response.follow(kou_bei_next_url, self.parse_vehicle_detail) 50 | 51 | def parse_vehicle_detail_infos(self, response): 52 | item = response.meta['item'] 53 | # 获取用户昵称 54 | usercont = response.xpath("//div[@class='mouth']//dl[@class='user-cont']\ 55 | //div[@class='user-name']//a/text()").get() 56 | # 车型 57 | vehicle_style = response.xpath("//div[@class='choose-con']//dl//dd//a/\ 58 | text()").getall() 59 | # 购车情况 60 | choose_dl = response.xpath("//div[@class='choose-con']//dl") 61 | # info_length = len(choose_dl) 62 | # 购车地点 63 | vehicle_by_location = choose_dl[1].xpath("./dd//text()").get() 64 | # 购车所属品牌 65 | vehicle_seller = choose_dl[2].xpath("./dd/a/text()").get() 66 | # 购车时间\价格 67 | if vehicle_seller is not None: 68 | vehcle_seller_date = choose_dl[3].xpath("./dd/text()").get() 69 | vehicle_seller_money = choose_dl[4].xpath("./dd//text()").getall() 70 | else: 71 | vehcle_seller_date = choose_dl[2].xpath("./dd/text()").get() 72 | vehicle_seller_money = choose_dl[3].xpath("./dd//text()").getall() 73 | # 油耗或着电耗,目前行驶里程 74 | vehicle_status = choose_dl[-10].xpath("./dd/p//text()").getall() 75 | # 购车多个方面的评价 76 | vehicle_evaluate = response.xpath("//span[@class='testfont']/\ 77 | text()").getall() 78 | # 购车目的 79 | vehicle_bu_aim = choose_dl[-1].xpath(".//dd/p/text()").getall() 80 | # 发布口碑的时间 81 | evalute_date = response.xpath("//div[@class='mouth-item koubei-final']//\ 82 | div[@class='title-name name-width-01']/b/text()").get() 83 | # 口碑题目 84 | evalute_title = response.xpath("//div[@class='mouth-item koubei-final']//\ 85 | div[@class='kou-tit']/h3/text()").get() 86 | # 发布的内容 87 | evalute_infos = response.xpath("//div[@class='mouth-item koubei-final']//\ 88 | div[@class='text-con']//text()").getall() 89 | # 浏览量 90 | visit_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 91 | 'help']//span[@class='orange']/text()").get() 92 | # 口碑支持数 93 | helpful_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 94 | 'help']//label[@class='supportNumber']/text()").get() 95 | # 评论数 96 | comment_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 97 | 'help']/a//span/text()").get() 98 | # item["info_length"] = info_length 99 | item["usercont"] = usercont 100 | item["vehicle_style"] = vehicle_style 101 | item["vehicle_by_location"] = re.sub("[A-Za-z0-9\!\%\[\]\,\。\(\)\}\{\_\=\;&''+\<\>//$.::\"-#:\- \r\n]", "", "".join(vehicle_by_location)) 102 | item["vehcle_seller_date"] = vehcle_seller_date 103 | item["vehicle_seller_money"] = vehicle_seller_money 104 | 
item["vehicle_status"] = vehicle_status 105 | item["vehicle_evaluate"] = vehicle_evaluate 106 | item["vehicle_bu_aim"] = vehicle_bu_aim 107 | item["evalute_date"] = evalute_date 108 | item["evalute_title"] = evalute_title 109 | item["evalute_infos"] = re.sub("[A-Za-z0-9\!\%\[\]\,\。\(\)\}\{\_\=\;&''+\<\>//$.::\"-#:\- \r\n]", "", "".join(evalute_infos)) 110 | item["visit_count"] = visit_count 111 | item["helpful_count"] = helpful_count 112 | item["comment_count"] = comment_count 113 | # 获取经销商的信息(新增的代码) 114 | if vehicle_seller is not None: 115 | print(response.url) 116 | seller_id = choose_dl[2].xpath(".//a/@data-val").get() 117 | data_evalid = choose_dl[2].xpath(".//a/@data-evalid").get() 118 | seller_api_url = "https://k.autohome.com.cn/frontapi/GetDealerInfor?dearerandspecIdlist=" + seller_id + "," + data_evalid +"|" 119 | print(seller_id, data_evalid) 120 | print(seller_api_url) 121 | print("="*100) 122 | yield scrapy.Request(url=seller_api_url, callback=self.parse_vehicle_seller, meta={'item': item}) 123 | else: 124 | item["vehicle_seller"] = vehicle_seller 125 | yield item 126 | 127 | # 获取经销商信息 128 | def parse_vehicle_seller(self, response): 129 | item = response.meta['item'] 130 | seller_name = json.loads(response.text)["result"]["List"][0]["CompanySimple"] 131 | item["vehicle_seller"] = seller_name 132 | yield item 133 | -------------------------------------------------------------------------------- /weather_spider_analyze/weather_spider.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import time 6 | import datetime 7 | import pymysql 8 | 9 | url = 'http://tianqi.2345.com/wea_history/54511.htm' 10 | driver=webdriver.Chrome("E:/0_软件/chromedriver.exe") 11 | try: 12 | driver.get(url) 13 | time.sleep(2) 14 | print(driver.current_url) 15 | # 暂时获取60页(5年的信息) 16 | for i in range(60): 17 | weathter_lists = driver.find_elements_by_xpath('//table/tbody/tr') 18 | for weather in weathter_lists: 19 | infos = weather.find_elements_by_tag_name('td') 20 | weather_time = infos[0].text[:-3] # 时间 21 | # 将字符串时间转换成Datetime 22 | weather_time = datetime.datetime.strptime(weather_time,'%Y-%m-%d').date() 23 | high_tm = infos[1].text # 最高温 24 | low_tm = infos[2].text # 最低温 25 | weath = infos[3].text # 天气 26 | wind_style = infos[4].text # 风向、风力 27 | air = infos[5].text # 空气质量指数 28 | db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 29 | cursor = db.cursor() 30 | sql = 'INSERT INTO weather_beijing(weather_time,high_tem,low_tem,weather,wind_direction,air) VALUES(%s,%s,%s,%s,%s,%s)' 31 | try: 32 | cursor.execute(sql,(weather_time,high_tm,low_tm,weath,wind_style,air)) 33 | db.commit() 34 | print('数据保存成功!') 35 | except: 36 | print('数据保存失败!') 37 | print(weather_time,high_tm,low_tm,weath,wind_style,air) 38 | # 点击上一页 39 | pre_page_button = driver.find_element_by_xpath('//div[@id="prevNextBtn"]/a[@class="prev"]') 40 | pre_page_button.click() 41 | time.sleep(3) 42 | except: 43 | print('fail!') 44 | driver.close() 45 | -------------------------------------------------------------------------------- /zhihu/hot.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import time 4 | import multiprocessing 5 | import pymysql 6 | 7 | headers = { 8 | 
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36', 9 | 'cookie': '_zap=47e250c3-7a07-41d6-88a7-52b3cb282921; d_c0="ALCvK5ioexCPTmjktJFsrBEH1LQX-TTUjkM=|1575956493"; capsion_ticket="2|1:0|10:1576116250|14:capsion_ticket|44:ZTEwNjMxZmQ0OTA4NDU5MGI1MWNiODgxYjg4MTRmMWE=|ac6c9175199323ab564e53969f64440ccc244feacfdf40e5bce5a8084d82a806"; z_c0="2|1:0|10:1576116298|4:z_c0|92:Mi4xRi1ac0JnQUFBQUFBc0s4cm1LaDdFQ2NBQUFDRUFsVk5TaTBaWGdBX2puWUIxcmhYa3hoR1hsWkRwN2FKOENGNDN3|7f3436a96ccb3cdf3d549b20ed658d14ab8c9b1f75937684fd3ee524fb061e93"; q_c1=f254b825456c428fb665bc0ba903aca4|1576116327000|1576116327000; __utma=51854390.997463445.1576214466.1576214466.1576214466.1; __utmz=51854390.1576214466.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20171105=1^3=entry_date=20171105=1; _xsrf=f03ebad1-fe74-45e4-a8e1-33c0c0219444; tshl=; tst=h; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1576218531,1576221535,1576472582,1576482250; tgw_l7_route=64ba0a179156dda09fec37a3b2d556ed; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1576483496' 10 | } 11 | url = 'https://www.zhihu.com/hot' 12 | def get_question_num(url,headers): 13 | response = requests.get(url,headers=headers) 14 | text = response.text 15 | html = etree.HTML(text) 16 | reslut = html.xpath("//section[@class='HotItem']") 17 | # 获取问题的ID 18 | question_list = [] 19 | for question in reslut: 20 | number = question.xpath(".//div[@class='HotItem-index']//text()")[0].strip() 21 | title = question.xpath(".//h2[@class='HotItem-title']/text()")[0].strip() 22 | href = question.xpath(".//div[@class='HotItem-content']/a/@href")[0].strip() 23 | question_num = href.split('/')[-1] 24 | question_list.append([question_num,title]) 25 | # print(number,'\n',title,'\n',href) 26 | return question_list 27 | # 数据json请求(问题均通过ajax请求) 28 | # 分析链接格式,如下: 29 | # https://www.zhihu.com/api/v4/questions/359056618/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=5&platform=desktop&sort_by=default 30 | # 变化量如:question_id , offset=5,10,15...... 
31 | def data_json_request(question_id,question_title,headers): 32 | num = 0 33 | i = 1 34 | while True: 35 | json_url = 'https://www.zhihu.com/api/v4/questions/' + question_id + '/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={}&platform=desktop&sort_by=default'.format(num) 36 | data_json = requests.get(json_url,headers=headers) 37 | all_detail_data = data_json.json()['data'] 38 | length_detail_data = len(all_detail_data) 39 | for one_detail_data in all_detail_data: 40 | question_title = question_title 41 | answer_author = one_detail_data['author']['name'] 42 | author_introduce = one_detail_data['author']['headline'] 43 | author_followers = one_detail_data['author']['follower_count'] 44 | answer_vote_num = one_detail_data['voteup_count'] 45 | answer_comment_num = one_detail_data['comment_count'] 46 | updated_time = one_detail_data['updated_time'] 47 | content = one_detail_data['content'] 48 | # 保存数据至数据库 49 | db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 50 | cursor = db.cursor() 51 | sql = 'INSERT INTO zhihu_hot_question(question_title,author_name,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' 52 | try: 53 | if int(answer_vote_num) >= 90: 54 | cursor.execute(sql,(question_title,answer_author,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content)) 55 | db.commit() 56 | print('数据写入成功!!!') 57 | else: 58 | print('点赞数太少,不保存至数据库!!!') 59 | except: 60 | print('数据写入失败!') 61 | db.rollback() 62 | # print(question_title,'\n',answer_author,'\n',author_introduce,'\n',author_followers,'\n',answer_vote_num,'\n',answer_comment_num 63 | # ,'\n',updated_time,'\n',content) 64 | num = i*5 65 | i = i+1 66 | if length_detail_data == 0: 67 | print('answaer_stop!!!!!') 68 | break 69 | 70 | # def save_to_mysql(): 71 | # db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 72 | # cursor = db.cursor() 73 | # sql = 'INSERT INTO zhihu_hot_question(question_title,author_name,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' 74 | 75 | 76 | def main(): 77 | question_id = get_question_num(url,headers) 78 | print(question_id) 79 | print('当前环境CPU核数是:{}核'.format(multiprocessing.cpu_count())) 80 | p = multiprocessing.Pool(4) 81 | for q_id in question_id: 82 | p.apply_async(data_json_request,args=(q_id[0],q_id[1],headers)) 83 | p.close() 84 | p.join() 85 | 86 | if __name__ == "__main__": 87 | start = time.time() 88 | main() 89 | print('总耗时:%.5f秒'% float(time.time()-start)) --------------------------------------------------------------------------------
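hot.py, like the other MySQL-backed scripts in this repository, assumes its target table already exists; the DDL itself is not checked in. A table compatible with the INSERT statement above could be created along the following lines. Only the column names and the connection parameters (localhost, root, 123456, database spider_data) come from hot.py; the column types and lengths are assumptions.

import pymysql

# Assumed schema: column names mirror hot.py's INSERT; types/lengths are guesses
# that merely need to accommodate the scraped values (content can be long HTML).
DDL = """
CREATE TABLE IF NOT EXISTS zhihu_hot_question (
    id INT AUTO_INCREMENT PRIMARY KEY,
    question_title VARCHAR(255),
    author_name VARCHAR(128),
    author_introduce VARCHAR(255),
    author_followers INT,
    answer_vote_num INT,
    answer_comment_num INT,
    updated_time INT,
    content LONGTEXT
) DEFAULT CHARSET=utf8mb4
"""

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, db='spider_data')
with db.cursor() as cursor:
    cursor.execute(DDL)
db.commit()
db.close()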