├── LICENSE ├── README.md ├── beike_find_house ├── .ipynb_checkpoints │ └── room_data_analyze-checkpoint.ipynb ├── __pycache__ │ ├── connect_redis.cpython-37.pyc │ └── proxy.cpython-37.pyc ├── after_deal_data.csv ├── beijing_fang111.csv ├── beike_find_house - 副本.xlsx ├── home_spider.py ├── photo │ ├── 01.png │ ├── 1.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ └── oo.png ├── render.html └── room_data_analyze.ipynb ├── car_home ├── auto_bmw │ ├── auto_bmw │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── bmw_spider.cpython-37.pyc │ │ │ └── bmw_spider.py │ │ └── test.py │ ├── scrapy.cfg │ └── start.py ├── auto_bmw_all │ ├── auto_bmw_all │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── auto_crawl.cpython-37.pyc │ │ │ └── auto_crawl.py │ ├── scrapy.cfg │ └── start.py ├── autohome │ ├── auto.json │ ├── autohome │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── autohome_crawl.cpython-37.pyc │ │ │ └── autohome_crawl.py │ └── scrapy.cfg └── test_spider │ ├── scrapy.cfg │ ├── start.py │ └── test_spider │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── demo.cpython-37.pyc │ └── demo.py ├── death_company ├── .ipynb_checkpoints │ ├── death_company_data_analyze-checkpoint.ipynb │ ├── lagou_spider-checkpoint.ipynb │ └── position_data_analyze-checkpoint.ipynb ├── apple.jpg ├── death_company.py ├── death_company_data_analyze.ipynb ├── death_company_info - 副本.xls ├── heart.jpg └── 可视化图表 │ ├── com_death_reason.jpg │ ├── com_financing.jpg │ ├── com_financing_pie.jpg │ ├── com_live_time.jpg │ ├── com_position.jpg │ ├── com_position_pie.jpg │ ├── com_style.jpg │ └── com_style_pie.jpg ├── jianshu ├── ghostdriver.log ├── jianshu │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── items.cpython-37.pyc │ │ ├── middlewares.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── js.cpython-37.pyc │ │ └── js.py ├── scrapy.cfg └── start.py ├── ppt_download_spider └── ppt_download_spider.py ├── proxy_design ├── __pycache__ │ └── connect_redis.cpython-37.pyc ├── connect_redis.py └── proxy.py ├── stock ├── A_stock_company.py └── stock_spider_new.py ├── vehicle_home ├── scrapy.cfg ├── vehicle.json └── 
vehicle_home │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── items.cpython-38.pyc │ ├── pipelines.cpython-38.pyc │ └── settings.cpython-38.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── test.cpython-38.pyc │ └── vehicle_style.cpython-38.pyc │ ├── test.py │ └── vehicle_style.py ├── weather_spider_analyze └── weather_spider.py └── zhihu └── hot.py /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. 
The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | Copyright (C)
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | .
668 | 
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | .
675 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Hands-on Python web crawler collection (with data analysis for some of the crawled datasets)
2 | 
3 | #### PPT templates
4 | - [Automatic PPT template downloader](https://github.com/ShanYonggang/spider_list/blob/master/ppt_download_spider/ppt_download_spider.py "Automatic PPT template downloader")
5 | 
6 | #### Zhihu
7 | - [Scraping Zhihu hot-list questions and answers](https://www.shanyonggang.cn/article_detail/65/ "Scraping Zhihu hot-list questions and answers")
8 | 
9 | #### Crawler proxy pool
10 | - [Building a personal crawler proxy pool](https://www.shanyonggang.cn/article_detail/66/ "Building a personal crawler proxy pool")
11 | 
12 | #### ITjuzi (IT桔子)
13 | - [Scraping the ITjuzi dead-company database](https://www.shanyonggang.cn/article_detail/67/ "Scraping the ITjuzi dead-company database")
14 | - [Using Python data analysis to look inside the failed new-economy companies on ITjuzi](https://www.shanyonggang.cn/article_detail/69/ "Using Python data analysis to look inside the failed new-economy companies on ITjuzi")
15 | 
16 | #### Beike (贝壳找房)
17 | - [Scraping Beijing second-hand housing listings from Beike with Python](https://www.shanyonggang.cn/article_detail/85/ "Scraping Beijing second-hand housing listings from Beike with Python")
18 | - [Analyzing and visualizing the Beijing second-hand housing data with Python](https://www.shanyonggang.cn/article_detail/86/ "Analyzing and visualizing the Beijing second-hand housing data with Python")
19 | 
20 | #### Autohome review (koubei) channel
21 | - [Crawling the Autohome review channel with Scrapy](https://zhuanlan.zhihu.com/p/268117716 "Crawling the Autohome review channel with Scrapy")
22 | 
23 | #### Weather
24 | - [Using Selenium to collect Beijing weather data for 2015-2019](https://github.com/ShanYonggang/spider_list/blob/master/weather_spider_analyze/weather_spider.py "Using Selenium to collect Beijing weather data for 2015-2019")
25 | 
26 | #### Scrapy crawler collection
27 | - [Crawling Autohome with Scrapy](https://github.com/ShanYonggang/spider_list/tree/master/car_home "Crawling Autohome with Scrapy")
28 | - [Crawling Jianshu with Scrapy](https://github.com/ShanYonggang/spider_list/tree/master/jianshu "Crawling Jianshu with Scrapy")
29 | 
30 | #### Stock market data
31 | - [Fetching basic information on A-share listed companies with Python](https://github.com/ShanYonggang/spider_list/blob/master/stock/A_stock_company.py "Fetching basic information on A-share listed companies with Python")
32 | - [Scraping main-force holdings and capital-flow data for A-share listed companies](https://github.com/ShanYonggang/spider_list/blob/master/stock/stock_spider_new.py "Scraping main-force holdings and capital-flow data for A-share listed companies")
33 | 
34 | 
35 | 
36 | 
37 | 
--------------------------------------------------------------------------------
/beike_find_house/__pycache__/connect_redis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/__pycache__/connect_redis.cpython-37.pyc -------------------------------------------------------------------------------- /beike_find_house/__pycache__/proxy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/__pycache__/proxy.cpython-37.pyc -------------------------------------------------------------------------------- /beike_find_house/beijing_fang111.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/beijing_fang111.csv -------------------------------------------------------------------------------- /beike_find_house/beike_find_house - 副本.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/beike_find_house - 副本.xlsx -------------------------------------------------------------------------------- /beike_find_house/home_spider.py: -------------------------------------------------------------------------------- 1 | # 目的,爬取贝壳找房数据 2 | 3 | # 每页 url = 'bj.ke.com/ershoufang/pg{}/'.format(i),然后获取房屋的详细url,对详细url进行尽可能多的数据采集 4 | 5 | # 爬虫中使用多线程、多进程 6 | 7 | import requests 8 | import time 9 | from multiprocessing import Pool 10 | from lxml import etree 11 | import pandas as pd 12 | import os 13 | import random 14 | 15 | # 获取房源的基本url 16 | # 参数page 17 | def get_home_url(page): 18 | url = 'http://bj.ke.com/ershoufang/pg{}/'.format(page) 19 | headers = { 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36', 21 | 'Cookie': 'lianjia_uuid=e6a91b7a-b6a4-40b5-88c6-ff67759cbc8a; crosSdkDT2019DeviceId=-51npj6--xbmlw5-f22i5qg8bh36ouv-yttqkmwdf; _ga=GA1.2.121082359.1579583230; ke_uuid=6de1afa21a5799c0874702af39248907; __xsptplus788=788.1.1579583230.1579583347.4%234%7C%7C%7C%7C%7C%23%23Q6jl-k46IlXjCORdTOp6O3JyzHokoUrb%23; select_city=110000; digv_extends=%7B%22utmTrackId%22%3A%2280418605%22%7D; lianjia_ssid=a4ab1bc0-cb04-492f-960c-342c66065da0; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1583897013,1583932737; User-Realip=111.196.247.121; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22%24device_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wybeijing%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1583933576; 
srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMjAxZjBjNWU1ZWE1ZGVmYjQxZDFlYTE4MGVkNWI1OGRjYzk5Mzc2MjEwNTcyMWI3ODhiNTQyNTExOGQ1NTVlZDNkMTY2MWE2YWI5YWRlMGY0NDA3NjkwNWEyMzRlNTdhZWExNDViNGFiNWVmMmMyZWJlZGY1ZjM2Y2M0NWIxMWZlMWFiOWI2MDJiMzFmOTJmYzgxNzNiZTIwMzE1ZGJjNTUyMWE2ZjcxYzZmMTFhOWIyOWU2NzJkZTkyZjc3ZDk1MzhiNjhhMTQyZDQ2YmEyNjJhYzJmNjdjNmFjM2I5YzU0MzdjMDkwYWUwMzZmZjVjYWZkZTY5YjllYzY0NzEwMWY2OTc1NmU1Y2ExNzNhOWRmZTdiNGY4M2E1Zjc2NDZmY2JkMGM2N2JiMjdmZTJjNjI2MzNkMjdlNDY4ODljZGRjMjc3MTQ0NDUxMDllZThlZDVmZmMwMjViNjc2ZjFlY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMDI2MDk0N1wifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nLzE5MTExMzE5NTEwMTAwMTcxNzU5Lmh0bWwiLCJvcyI6IndlYiIsInYiOiIwLjEifQ==' 22 | } 23 | text = requests.get(url,headers=headers).text 24 | html = etree.HTML(text) 25 | detail_url = html.xpath('//ul[@class="sellListContent"]//li[@class="clear"]/a/@href') 26 | return detail_url 27 | 28 | # 获取房源详细数据信息 29 | def get_home_detail_infos(detail_url): 30 | headers = { 31 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36', 32 | 'Cookie': 'lianjia_uuid=e6a91b7a-b6a4-40b5-88c6-ff67759cbc8a; crosSdkDT2019DeviceId=-51npj6--xbmlw5-f22i5qg8bh36ouv-yttqkmwdf; _ga=GA1.2.121082359.1579583230; ke_uuid=6de1afa21a5799c0874702af39248907; __xsptplus788=788.1.1579583230.1579583347.4%234%7C%7C%7C%7C%7C%23%23Q6jl-k46IlXjCORdTOp6O3JyzHokoUrb%23; select_city=110000; digv_extends=%7B%22utmTrackId%22%3A%2280418605%22%7D; lianjia_ssid=a4ab1bc0-cb04-492f-960c-342c66065da0; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1583897013,1583932737; User-Realip=111.196.247.121; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22%24device_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wybeijing%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1583933576; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMjAxZjBjNWU1ZWE1ZGVmYjQxZDFlYTE4MGVkNWI1OGRjYzk5Mzc2MjEwNTcyMWI3ODhiNTQyNTExOGQ1NTVlZDNkMTY2MWE2YWI5YWRlMGY0NDA3NjkwNWEyMzRlNTdhZWExNDViNGFiNWVmMmMyZWJlZGY1ZjM2Y2M0NWIxMWZlMWFiOWI2MDJiMzFmOTJmYzgxNzNiZTIwMzE1ZGJjNTUyMWE2ZjcxYzZmMTFhOWIyOWU2NzJkZTkyZjc3ZDk1MzhiNjhhMTQyZDQ2YmEyNjJhYzJmNjdjNmFjM2I5YzU0MzdjMDkwYWUwMzZmZjVjYWZkZTY5YjllYzY0NzEwMWY2OTc1NmU1Y2ExNzNhOWRmZTdiNGY4M2E1Zjc2NDZmY2JkMGM2N2JiMjdmZTJjNjI2MzNkMjdlNDY4ODljZGRjMjc3MTQ0NDUxMDllZThlZDVmZmMwMjViNjc2ZjFlY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMDI2MDk0N1wifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nLzE5MTExMzE5NTEwMTAwMTcxNzU5Lmh0bWwiLCJvcyI6IndlYiIsInYiOiIwLjEifQ==' 33 | } 34 | detail_text = requests.get(detail_url,headers=headers).text 35 | html = etree.HTML(detail_text) 36 | all_data = [] 37 | # 解析获取相关数据 38 | # 所在地址 39 | home_location = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="areaName"]/span[@class="info"]/a/text()') 40 | all_data.append(home_location) 41 | # 小区名称 42 | local_name = 
html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="communityName"]/a/text()')[0]
43 |     all_data.append(local_name)
44 |     # 总价格
45 |     total_price = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="price "]/span[@class="total"]/text()')[0]
46 |     all_data.append(total_price)
47 |     # 单价
48 |     unit_price = html.xpath('//div[@data-component="overviewIntro"]//div[@class="content"]//div[@class="price "]//div[@class="unitPrice"]/span/text()')[0]
49 |     all_data.append(unit_price)
50 |     # 房屋基本信息
51 |     home_style = html.xpath('//div[@class="introContent"]//div[@class="base"]//div[@class="content"]/ul/li/text()')
52 |     all_data.append(home_style)
53 |     # 房屋交易属性信息
54 |     transaction_info = html.xpath('//div[@class="introContent"]//div[@class="transaction"]//div[@class="content"]/ul/li/text()')
55 |     all_data.append(transaction_info)
56 |     # 小区均价
57 |     xiaoqu_price = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info price_red"]/text()')[0].replace(' ','')
58 |     all_data.append(xiaoqu_price)
59 |     # 小区建造时间
60 |     xiaoqu_built_time = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[0].replace(' ','').replace('\n','')
61 |     all_data.append(xiaoqu_built_time)
62 |     # 小区建筑类型
63 |     xiaoqu_built_style = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[1].replace(' ','').replace('\n','')
64 |     all_data.append(xiaoqu_built_style)
65 |     # 小区楼层总数
66 |     xiaoqu_total_ceng = html.xpath('//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]//span[@class="xiaoqu_main_info"]/text()')[2].replace(' ','').replace('\n','')
67 |     all_data.append(xiaoqu_total_ceng)
68 |     return all_data
69 | 
70 | # 数据保存至csv文件里(使用pandas中的to_csv保存)
71 | def save_data(data):
72 |     data_frame = pd.DataFrame(data,columns=['小区位置','小区名称','房屋总价','房屋单价','房屋基本信息','房屋交易信息','小区均价','小区建造时间','小区房屋类型','小区层数'])
73 |     print(data_frame)
74 |     data_frame.to_csv('beijing_fang111.csv',header=False,index=False,mode='a',encoding='utf_8_sig')
75 | 
76 | def main(page):
77 |     print('开始爬取第{}页的数据!'.format(page))
78 |     # choice_time = random.choice(range(0,5))
79 |     # print(choice_time)
80 | 
81 |     urls = get_home_url(page)
82 |     for url in urls:
83 |         print('开始爬取详细网页为{}的房屋详细信息!'.format(url))
84 |         all_data = get_home_detail_infos(detail_url=url)
85 |         data = []
86 |         data.append(all_data)
87 |         save_data(data)
88 | 
89 | if __name__ == "__main__":
90 |     page = range(0,100)
91 |     print('爬虫开始')
92 |     pool = Pool(processes=4)
93 |     pool.map(main,page)
94 |     # proxies = proxy.get_proxy_random()
95 |     # pool.apply_async(main,args=(page,proxies,))
96 |     pool.close()
97 |     pool.join()
98 | 
99 | 
100 | 
--------------------------------------------------------------------------------
/beike_find_house/photo/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/01.png
--------------------------------------------------------------------------------
/beike_find_house/photo/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/1.png
--------------------------------------------------------------------------------
/beike_find_house/photo/11.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/11.png -------------------------------------------------------------------------------- /beike_find_house/photo/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/12.png -------------------------------------------------------------------------------- /beike_find_house/photo/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/13.png -------------------------------------------------------------------------------- /beike_find_house/photo/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/2.png -------------------------------------------------------------------------------- /beike_find_house/photo/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/3.png -------------------------------------------------------------------------------- /beike_find_house/photo/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/4.png -------------------------------------------------------------------------------- /beike_find_house/photo/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/5.png -------------------------------------------------------------------------------- /beike_find_house/photo/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/6.png -------------------------------------------------------------------------------- /beike_find_house/photo/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/7.png -------------------------------------------------------------------------------- /beike_find_house/photo/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/8.png -------------------------------------------------------------------------------- /beike_find_house/photo/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/9.png -------------------------------------------------------------------------------- /beike_find_house/photo/oo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/beike_find_house/photo/oo.png -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__init__.py -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutoBmwItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_file = scrapy.Field() 15 | image_urls = scrapy.Field() 16 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutoBmwSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutoBmwDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import urllib 9 | import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.http import Request 12 | 13 | class AutoBmwPipeline(object): 14 | 15 | def __init__(self): 16 | self.file_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'image') 17 | if not os.path.exists(self.file_path): 18 | os.mkdir(self.file_path) 19 | else: 20 | print('Path already exist......') 21 | 22 | def process_item(self, item, spider): 23 | image_file = item['image_file'] 24 | image_urls = item['image_urls'] 25 | image_path = os.path.join(self.file_path,image_file) 26 | if not os.path.exists(image_path): 27 | os.mkdir(image_path) 28 | for url in image_urls: 29 | image_name = url.split('__')[-1] 30 | urllib.request.urlretrieve(url,os.path.join(image_path,image_name)) 31 | return item 32 | 33 | class BmwImagesPipeline(ImagesPipeline): 34 | 35 | def get_media_requests(self, item, info): 36 | img_url_list = item['image_urls'] 37 | for img_url in img_url_list: 38 | yield Request(img_url,meta={'image_file':item['image_file']},) 39 | 40 | def file_path(self, request, response=None, info=None): 41 | image_file = request.meta['image_file'] 42 | name = request.url.split('_')[-1] 43 | return '%s/%s.png'%(image_file,name) 44 | 45 | def item_completed(self, results, item, info): 46 | print(results) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for auto_bmw project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'auto_bmw' 13 | 14 | SPIDER_MODULES = ['auto_bmw.spiders'] 15 | NEWSPIDER_MODULE = 'auto_bmw.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'auto_bmw (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'auto_bmw.middlewares.AutoBmwSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'auto_bmw.middlewares.AutoBmwDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | # 'auto_bmw.pipelines.AutoBmwPipeline': 300, 69 | 'scrapy.pipelines.images.ImagesPipeline': None, 70 | 'auto_bmw.pipelines.BmwImagesPipeline': 1, 71 | } 72 | IMAGES_STORE = 'ImgDownload' 73 | 74 | IMAGES_URLS_FIELD = 'image_urls' 75 | IMAGES_RESULT_FIELD = 'image_path' 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | 
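With the ITEM_PIPELINES and IMAGES_STORE = 'ImgDownload' settings above, every image yielded by bmw_spider is saved under ImgDownload/<栏目名>/<文件名>.png. A minimal standalone sketch of that naming rule, mirroring the file_path() override in pipelines.py (the URL and the category name below are invented examples, not values taken from the site):

IMAGES_STORE = 'ImgDownload'

def file_path(url, image_file):
    # same rule as BmwImagesPipeline.file_path(): keep the part after the last '_'
    # and group the file under the per-category folder taken from item['image_file']
    name = url.split('_')[-1]
    return '%s/%s.png' % (image_file, name)

url = 'https://car3.autoimg.cn/cardfs/product/autohomecar__abc123.jpg'  # invented example URL
print(IMAGES_STORE + '/' + file_path(url, '车身外观'))
# ImgDownload/车身外观/abc123.jpg.png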
-------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/__pycache__/bmw_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw/auto_bmw/spiders/__pycache__/bmw_spider.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/spiders/bmw_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from auto_bmw.items import AutoBmwItem 4 | 5 | class BmwSpiderSpider(scrapy.Spider): 6 | name = 'bmw_spider' 7 | allowed_domains = ['car.autohome.com.cn'] 8 | start_urls = ['https://car.autohome.com.cn/pic/series/66.html'] 9 | 10 | def parse(self, response): 11 | uibox_urls = response.xpath('//div[@class="uibox"]')[1:] 12 | for uibox_url in uibox_urls: 13 | item = AutoBmwItem() 14 | item['image_file'] = uibox_url.xpath('./div[@class="uibox-title"]/a')[0].xpath('./text()').get() 15 | image_urls = uibox_url.xpath('./div')[-1].xpath('.//img/@src').getall() 16 | item['image_urls'] = list(map(lambda url: response.urljoin(url),image_urls)) 17 | yield item 18 | -------------------------------------------------------------------------------- /car_home/auto_bmw/auto_bmw/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'image') 3 | if not os.path.exists(path): 4 | os.mkdir(path) 5 | else: 6 | print("Path already exist......") -------------------------------------------------------------------------------- /car_home/auto_bmw/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = auto_bmw.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = auto_bmw 12 | -------------------------------------------------------------------------------- /car_home/auto_bmw/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl bmw_spider'.split()) -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__init__.py -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutoBmwAllItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_file = scrapy.Field() 15 | image_urls = scrapy.Field() 16 | pass 17 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutoBmwAllSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutoBmwAllDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import urllib 9 | import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.http import Request 12 | 13 | class AutoBmwAllPipeline(object): 14 | def process_item(self, item, spider): 15 | return item 16 | 17 | class BmwImagesPipeline(ImagesPipeline): 18 | 19 | def get_media_requests(self, item, info): 20 | img_url_list = item['image_urls'] 21 | for img_url in img_url_list: 22 | yield Request(img_url,meta={'image_file':item['image_file']},) 23 | 24 | def file_path(self, request, response=None, info=None): 25 | image_file = request.meta['image_file'] 26 | name = request.url.split('_')[-1] 27 | return '%s/%s.png'%(image_file,name) 28 | 29 | def item_completed(self, results, item, info): 30 | print(results) 31 | return item 32 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for auto_bmw_all project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'auto_bmw_all' 13 | 14 | SPIDER_MODULES = ['auto_bmw_all.spiders'] 15 | NEWSPIDER_MODULE = 'auto_bmw_all.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'auto_bmw_all (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'auto_bmw_all.middlewares.AutoBmwAllSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'auto_bmw_all.middlewares.AutoBmwAllDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'scrapy.pipelines.images.ImagesPipeline': None, 69 | 'auto_bmw_all.pipelines.BmwImagesPipeline': 1, 70 | } 71 | IMAGES_STORE = 'ImgDownload' 72 | 73 | IMAGES_URLS_FIELD = 'image_urls' 74 | IMAGES_RESULT_FIELD = 'image_path' 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 
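item_completed() in BmwImagesPipeline only prints its results argument. For reference, that list has roughly the following shape (standard Scrapy ImagesPipeline behaviour; the concrete values here are invented):

# Each element is a (success, info) tuple; successful downloads carry the path
# produced by the file_path() override, failures arrive as (False, <twisted Failure>).
results = [
    (True, {'url': 'https://car3.autoimg.cn/cardfs/product/autohomecar__abc123.jpg',  # invented
            'path': '外观/abc123.jpg.png',
            'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}),
]
image_paths = [info['path'] for ok, info in results if ok]
print(image_paths)  # ['外观/abc123.jpg.png']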
-------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/auto_crawl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/auto_bmw_all/auto_bmw_all/spiders/__pycache__/auto_crawl.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/auto_bmw_all/auto_bmw_all/spiders/auto_crawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from auto_bmw_all.items import AutoBmwAllItem 6 | 7 | class AutoCrawlSpider(CrawlSpider): 8 | name = 'auto_crawl' 9 | allowed_domains = ['car.autohome.com.cn'] 10 | start_urls = ['https://car.autohome.com.cn/pic/series/66.html'] 11 | 12 | rules = ( 13 | Rule(LinkExtractor(allow=r'.+/pic/series/66-.+'), callback='parse_item',follow=True), 14 | ) 15 | 16 | src="//car3.autoimg.cn/cardfs/product/g1/M07/63/01/240x180_0_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg" 17 | src="//car3.autoimg.cn/cardfs/product/g1/M07/63/01/ 800x0_1_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg" 18 | 19 | def parse_item(self, response): 20 | uibox = response.xpath('//div[@class="uibox"]') 21 | item = AutoBmwAllItem() 22 | item['image_file'] = uibox.xpath('./div[@class="uibox-title"]/text()').get() 23 | image_url = uibox.xpath('./div[@class="uibox-con carpic-list03 border-b-solid"]/ul/li//img/@src').getall() 24 | item['image_urls'] = list(map(lambda url: response.urljoin(url.replace('240x180_0','800x0_1')),image_url)) 25 | yield item 26 | 27 | 28 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = auto_bmw_all.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = auto_bmw_all 12 | -------------------------------------------------------------------------------- /car_home/auto_bmw_all/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl auto_crawl'.split()) -------------------------------------------------------------------------------- 
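The two src strings kept inside auto_crawl.py above document the pattern the spider relies on: the listing page embeds 240x180 thumbnails, while the full-size picture sits at the same path with an 800x0 size segment. parse_item() therefore rewrites each URL before handing it to the images pipeline; a quick standalone check of that rewrite, using the sample value noted in the spider:

# Equivalent to response.urljoin(url.replace('240x180_0', '800x0_1')) for a
# scheme-relative src on an https page.
thumb = "//car3.autoimg.cn/cardfs/product/g1/M07/63/01/240x180_0_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg"
full = "https:" + thumb.replace('240x180_0', '800x0_1')
print(full)
# https://car3.autoimg.cn/cardfs/product/g1/M07/63/01/800x0_1_q95_autohomecar__ChsEmVz37-OABcHCAAR_DO3soxI667.jpg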
/car_home/autohome/autohome/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__init__.py -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AutohomeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | model_name = scrapy.Field() 16 | level = scrapy.Field() 17 | auto_body = scrapy.Field() 18 | price = scrapy.Field() 19 | engine = scrapy.Field() 20 | transmission = scrapy.Field() 21 | auto_img = scrapy.Field() 22 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AutohomeSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class AutohomeDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | import pymysql 10 | 11 | class AutohomePipeline(object): 12 | 13 | def __init__(self): 14 | self.fp = open('auto.json','a+',encoding='utf-8') 15 | 16 | def process_item(self, item, spider): 17 | file = json.dumps(dict(item),ensure_ascii=False) 18 | self.fp.write(file) 19 | print(self.fp.read()) 20 | return item 21 | 22 | def close_spider(self,spider): 23 | self.fp.close() 24 | print('Spider Stop...') 25 | 26 | class MongoPipeline(object): 27 | 28 | def __init__(self,mongo_uri,mongo_db): 29 | self.mongo_uri = mongo_uri 30 | self.mongo_db = mongo_db 31 | 32 | @classmethod 33 | def from_crawler(cls, crawler): 34 | # This method is used by Scrapy to create your spiders. 35 | return cls( 36 | mongo_uri = crawler.settings.get('MONGO_URI'), 37 | mongo_db = crawler.settings.get('MONGO_DB'), 38 | ) 39 | 40 | def open_spider(self,spider): 41 | self.client = pymongo.MongoClient(self.mongo_uri) 42 | self.db = self.client[self.mongo_db] 43 | print('Spider Start run...') 44 | 45 | def process_item(self, item, spider): 46 | name = item.__class__.__name__ 47 | self.db[name].insert(dict(item)) 48 | return item 49 | 50 | def close_spider(self,spider): 51 | self.client.close() 52 | print('Spider Stop...') 53 | 54 | class MysqlPipeline(object): 55 | 56 | def __init__(self,host,port,user,password,db): 57 | self.host = host 58 | self.user = user 59 | self.password =password 60 | self.port = port 61 | self.db = db 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | # This method is used by Scrapy to create your spiders. 
66 | return cls( 67 | host = crawler.settings.get('HOST'), 68 | port = crawler.settings.get('PORT'), 69 | user = crawler.settings.get('USER'), 70 | password = crawler.settings.get('PASSWORD'), 71 | db = crawler.settings.get('DB'), 72 | ) 73 | 74 | def open_spider(self,spider): 75 | self.db = pymysql.connect(host=self.host,user=self.user,port=self.port,password=self.password,db=self.db,charset='utf8') 76 | self.cursor = self.db.cursor() 77 | 78 | def process_item(self,item,spider): 79 | sql = 'insert into auto_data (model_name,level,auto_body,price,engine,transmission,auto_img) VALUES(%s,%s,%s,%s,%s,%s,%s)' 80 | self.cursor.execute(sql,(item["model_name"],item["level"],item["price"],item["auto_body"],item["engine"],item["transmission"],item['auto_img'])) 81 | self.db.commit() 82 | return item 83 | 84 | def close_spider(self,spider): 85 | self.cursor.close() 86 | self.db.close() -------------------------------------------------------------------------------- /car_home/autohome/autohome/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for autohome project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'autohome' 13 | 14 | SPIDER_MODULES = ['autohome.spiders'] 15 | NEWSPIDER_MODULE = 'autohome.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'autohome (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | FEED_EXPORT_ENCODING = 'gb18030' 25 | 26 | MONGO_URI = 'localhost' 27 | MONGO_DB = 'autohome' 28 | 29 | HOST = 'localhost' 30 | USER = 'root' 31 | PASSWORD = 'pass4321' 32 | PORT = 3306 33 | DB = 'auto_home' 34 | 35 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 36 | #CONCURRENT_REQUESTS = 32 37 | 38 | # Configure a delay for requests for the same website (default: 0) 39 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 40 | # See also autothrottle settings and docs 41 | DOWNLOAD_DELAY = 1 42 | # The download delay setting will honor only one of: 43 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 44 | #CONCURRENT_REQUESTS_PER_IP = 16 45 | 46 | # Disable cookies (enabled by default) 47 | #COOKIES_ENABLED = False 48 | 49 | # Disable Telnet Console (enabled by default) 50 | #TELNETCONSOLE_ENABLED = False 51 | 52 | # Override the default request headers: 53 | # DEFAULT_REQUEST_HEADERS = { 54 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 55 | # 'Accept-Language': 'en', 56 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' 57 | # } 58 | 59 | # Enable or disable spider middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 61 | #SPIDER_MIDDLEWARES = { 62 | # 'autohome.middlewares.AutohomeSpiderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable downloader middlewares 66 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 67 | #DOWNLOADER_MIDDLEWARES = { 68 | # 'autohome.middlewares.AutohomeDownloaderMiddleware': 543, 69 | #} 
70 | 71 | # Enable or disable extensions 72 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 73 | #EXTENSIONS = { 74 | # 'scrapy.extensions.telnet.TelnetConsole': None, 75 | #} 76 | 77 | # Configure item pipelines 78 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 79 | ITEM_PIPELINES = { 80 | 'autohome.pipelines.AutohomePipeline': 300, 81 | 'autohome.pipelines.MongoPipeline': 100, 82 | 'autohome.pipelines.MysqlPipeline': 200, 83 | } 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/__pycache__/autohome_crawl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/autohome/autohome/spiders/__pycache__/autohome_crawl.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/autohome/autohome/spiders/autohome_crawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from autohome.items import AutohomeItem 6 | 7 | class AutohomeCrawlSpider(CrawlSpider): 8 | name = 'autohome_crawl' 9 | allowed_domains = ['car.autohome.com.cn'] 10 | start_urls = ['https://car.autohome.com.cn/price'] 11 | rules = ( 12 | Rule(LinkExtractor(allow=r'.+price/list-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-\d+\.html'), callback='parse_item', follow=True), 13 | ) 14 | 15 | def parse_item(self, response): 16 | item = AutohomeItem() 17 | print('_'*90) 18 | print(response.url+'\n') 19 | print('_'*90) 20 | auto_list = response.xpath('//div[@class="list-cont-bg"]') 21 | print('_'*90) 22 | for auto in auto_list: 23 | item["model_name"] = auto.xpath('.//a[@class="font-bold"]/text()').get() 24 | item["level"] = auto.xpath('.//span[@class="info-gray"]/text()').get() 25 | item["price"] = auto.xpath('.//span[@class="lever-price red"]/span/text()').get() 26 | item["auto_body"] = auto.xpath('.//ul[@class="lever-ul"]/li')[1].xpath('.//a/text()').get() 27 | engine = auto.xpath('.//ul[@class="lever-ul"]/li')[2].xpath('.//a/text()').getall() 28 | item["engine"] = ','.join(engine).strip(',') 29 | item["transmission"] = auto.xpath('.//ul[@class="lever-ul"]/li')[3].xpath('.//a/text()').get() 30 | item['auto_img'] = auto.xpath('.//div[@class="list-cont-img"]/a/img/@src').get() 31 | yield item 32 | 33 | -------------------------------------------------------------------------------- /car_home/autohome/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = autohome.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = autohome 12 | -------------------------------------------------------------------------------- /car_home/test_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = test_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = test_spider 12 | 
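MysqlPipeline in car_home/autohome/autohome/pipelines.py inserts into an auto_data table, but the repository does not ship its schema. A minimal, assumed one-off helper that creates the table is sketched below; the connection values mirror autohome/settings.py, the column names follow the pipeline's INSERT statement, and the VARCHAR types and lengths are guesses:

import pymysql

# Assumed helper, not part of the repository: create the auto_data table used by MysqlPipeline.
db = pymysql.connect(host='localhost', user='root', password='pass4321',
                     port=3306, db='auto_home', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS auto_data (
        `model_name`   VARCHAR(100),
        `level`        VARCHAR(50),
        `auto_body`    VARCHAR(50),
        `price`        VARCHAR(50),
        `engine`       VARCHAR(255),
        `transmission` VARCHAR(100),
        `auto_img`     VARCHAR(255)
    ) DEFAULT CHARSET = utf8
""")
db.commit()
db.close()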
-------------------------------------------------------------------------------- /car_home/test_spider/start.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl demo'.split()) -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__init__.py -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TestSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TestSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TestSpiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for test_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'test_spider' 13 | 14 | SPIDER_MODULES = ['test_spider.spiders'] 15 | NEWSPIDER_MODULE = 'test_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'test_spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'test_spider.middlewares.TestSpiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'test_spider.middlewares.TestSpiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'test_spider.pipelines.TestSpiderPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/__pycache__/demo.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/car_home/test_spider/test_spider/spiders/__pycache__/demo.cpython-37.pyc -------------------------------------------------------------------------------- /car_home/test_spider/test_spider/spiders/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class DemoSpider(scrapy.Spider): 6 | name = 'demo' 7 | allowed_domains = ['httpbin.org/'] 8 | start_urls = ['http://httpbin.org/'] 9 | 10 | def parse(self, response): 11 | print(response['headers']) 12 | -------------------------------------------------------------------------------- /death_company/.ipynb_checkpoints/lagou_spider-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from selenium import webdriver\n", 10 | "from selenium.webdriver.support.wait import WebDriverWait\n", 11 | "from selenium.webdriver.common.by import By\n", 12 | "from selenium.webdriver.common.keys import Keys\n", 13 | "from selenium.webdriver.support import expected_conditions as EC\n", 14 | "import time\n", 15 | "import pymysql\n", 16 | "import pandas as pd\n", 17 | "\n", 18 | "\n", 19 | "broswer = webdriver.Chrome()\n", 20 | "def save_data(data):\n", 21 | " db = pymysql.connect(host='localhost',user='root',password='pass4321',port=3306,db='position_data')\n", 22 | " cursor = db.cursor()\n", 23 | " tables = 'position_table'\n", 24 | " keys = ','.join(data.keys())\n", 25 | " values = ','.join(['%s']*len(data))\n", 26 | " sql = 'INSERT INTO {table}({keys}) VALUES({values})'.format(table=tables,keys=keys,values=values)\n", 27 | " try:\n", 28 | " if cursor.execute(sql,tuple(data.values())):\n", 29 | " print('save success!')\n", 30 | " db.commit()\n", 31 | " except:\n", 32 | " print('save failed!')\n", 33 | " db.rollback()\n", 34 | " db.close()\n", 35 | "try:\n", 36 | " broswer.get('https://www.lagou.com/')\n", 37 | " choose_city = broswer.find_elements_by_class_name('tab')[1]\n", 38 | " print(choose_city.text)\n", 39 | " choose_city.click()\n", 40 | " input_zhiwei = input('请输入需要查询的岗位:')\n", 41 | " input_name = broswer.find_element_by_id('search_input')\n", 42 | " input_name.send_keys(input_zhiwei)\n", 43 | " wait = WebDriverWait(broswer,1)\n", 44 | " click_but = broswer.find_element_by_id('search_button')\n", 45 | " click_but.click()\n", 46 | " 
wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME,'con_list_item')))\n", 47 | " time.sleep(1)\n", 48 | " # 获取需要爬取岗位的总页数\n", 49 | " total_page = broswer.find_elements_by_class_name('pager_not_current')[-1].text\n", 50 | " print(total_page)\n", 51 | " page = 1\n", 52 | " position_data = pd.DataFrame(columns=['position_name','position_location','position_time','position_experience',\n", 53 | " 'position_company','position_infos','position_tags','position_introdce'])\n", 54 | " while page<= int(total_page):\n", 55 | " infos = broswer.find_elements_by_class_name('con_list_item')\n", 56 | " print('开始爬取第{}页的岗位信息数据'.format(page))\n", 57 | " page = page + 1\n", 58 | " for info in infos:\n", 59 | " # 岗位名称\n", 60 | " position_name = info.find_element_by_class_name('position_link').find_element_by_tag_name('h3').text\n", 61 | " # 岗位地点\n", 62 | " position_location = info.find_element_by_class_name('position_link').find_element_by_tag_name('em').text\n", 63 | " # 岗位发布时间\n", 64 | " position_time = info.find_element_by_class_name('format-time').text\n", 65 | " # 岗位工资、工作经验及学历\n", 66 | " position_experience = info.find_element_by_class_name('li_b_l').text\n", 67 | " # 招聘公司\n", 68 | " position_company = info.find_element_by_class_name('company_name').find_element_by_tag_name('a').text\n", 69 | " # 公司基本情况\n", 70 | " position_infos = info.find_element_by_class_name('industry').text\n", 71 | " # 岗位标签\n", 72 | " position_tags = info.find_element_by_class_name('list_item_bot').find_element_by_class_name('li_b_l').text\n", 73 | " # 公司基本介绍\n", 74 | " position_introdce = info.find_element_by_class_name('list_item_bot').find_element_by_class_name('li_b_r').text\n", 75 | "\n", 76 | " # po_data = {'position_name':position_name,'position_location':position_location,'position_time':position_time,'position_experience':position_experience,\n", 77 | " # 'position_company':position_company,'position_infos':position_infos,'position_tags':position_tags,'position_introdce':position_introdce}\n", 78 | " \n", 79 | " # save_data(po_data)\n", 80 | "\n", 81 | " position_data=position_data.append({'position_name':position_name,'position_location':position_location,'position_time':position_time,'position_experience':position_experience,'position_company':position_company,'position_infos':position_infos,'position_tags':position_tags,'position_introdce':position_introdce},ignore_index=True)\n", 82 | " print(position_name,position_location,position_time,position_experience,position_company,position_infos,position_tags,position_introdce)\n", 83 | " position_data.to_csv('position_infos.csv',mode='a',encoding='utf-8-sig')\n", 84 | " print('\\n')\n", 85 | " next_page = broswer.find_element_by_class_name('pager_next')\n", 86 | " next_page.click()\n", 87 | " time.sleep(1)\n", 88 | " position_data.to_csv('position_data.csv',mode='a',encoding='utf-8-sig')\n", 89 | " broswer.quit()\n", 90 | "except:\n", 91 | " print('error')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 116 | "\n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
Unnamed: 0position_nameposition_locationposition_timeposition_experienceposition_companyposition_infosposition_tagsposition_introdce
00Python开发工程师(技术平台)望京1天前发布20k-40k 经验3-5年 / 本科一起教育科技移动互联网,教育 / D轮及以上 / 2000人以上教育 运维“企业福利 平台 发展空间 团队氛围”
11python开发朝阳区1天前发布6k-10k 经验不限 / 本科一网互通广告营销,移动互联网 / 未融资 / 15-50人大数据 企业服务 后端 Python 爬虫 数据挖掘“海外社交媒体大数据团队”
22python开发工程师望京1天前发布30k-50k 经验3-5年 / 本科橙睿科技移动互联网,社交 / 不需要融资 / 50-150人电商“福利好、发展前景”
33高级python开发工程师中关村10:15发布20k-25k 经验5-10年 / 本科达观数据人工智能 / B轮 / 150-500人企业服务 大数据 后端 docker Python C++“AI准独角兽 技术氛围浓 优秀伙伴”
44python开发工程师中关村2天前发布20k-30k 经验3-5年 / 本科TRON软件开发,区块链 / 不需要融资 / 150-500人后端 Python“区块链技术,技术氛围,扁平化”
\n", 194 | "
" 195 | ], 196 | "text/plain": [ 197 | " Unnamed: 0 position_name position_location position_time \\\n", 198 | "0 0 Python开发工程师(技术平台) 望京 1天前发布 \n", 199 | "1 1 python开发 朝阳区 1天前发布 \n", 200 | "2 2 python开发工程师 望京 1天前发布 \n", 201 | "3 3 高级python开发工程师 中关村 10:15发布 \n", 202 | "4 4 python开发工程师 中关村 2天前发布 \n", 203 | "\n", 204 | " position_experience position_company position_infos \\\n", 205 | "0 20k-40k 经验3-5年 / 本科 一起教育科技 移动互联网,教育 / D轮及以上 / 2000人以上 \n", 206 | "1 6k-10k 经验不限 / 本科 一网互通 广告营销,移动互联网 / 未融资 / 15-50人 \n", 207 | "2 30k-50k 经验3-5年 / 本科 橙睿科技 移动互联网,社交 / 不需要融资 / 50-150人 \n", 208 | "3 20k-25k 经验5-10年 / 本科 达观数据 人工智能 / B轮 / 150-500人 \n", 209 | "4 20k-30k 经验3-5年 / 本科 TRON 软件开发,区块链 / 不需要融资 / 150-500人 \n", 210 | "\n", 211 | " position_tags position_introdce \n", 212 | "0 教育 运维 “企业福利 平台 发展空间 团队氛围” \n", 213 | "1 大数据 企业服务 后端 Python 爬虫 数据挖掘 “海外社交媒体大数据团队” \n", 214 | "2 电商 “福利好、发展前景” \n", 215 | "3 企业服务 大数据 后端 docker Python C++ “AI准独角兽 技术氛围浓 优秀伙伴” \n", 216 | "4 后端 Python “区块链技术,技术氛围,扁平化” " 217 | ] 218 | }, 219 | "execution_count": 5, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "position_data = pd.read_csv('position_data.csv')\n", 226 | "position_data.head()" 227 | ] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.7.0" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /death_company/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/apple.jpg -------------------------------------------------------------------------------- /death_company/death_company.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from xici import proxy 3 | import pymysql 4 | import multiprocessing 5 | import time 6 | import random 7 | import requests 8 | 9 | def get_data(json_url,proxies): 10 | user_agent_list = [ 11 | # Opera 12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", 13 | "Opera/8.0 (Windows NT 5.1; U; en)", 14 | "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50", 15 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50", 16 | # Firefox 17 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 18 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 19 | # Safari 20 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 21 | # chrome 22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", 23 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 24 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16", 25 | # 360 26 | 
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", 28 | # 淘宝浏览器 29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 30 | # 猎豹浏览器 31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 32 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 33 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 34 | # QQ浏览器 35 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 36 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 37 | # sogou浏览器 38 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", 39 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)", 40 | # maxthon浏览器 41 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", 42 | # UC浏览器 43 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36", 44 | ] 45 | UserAgent = random.choice(user_agent_list) 46 | # print(UserAgent) 47 | headers = { 48 | 'User-Agent': UserAgent, 49 | 'cookie': '_ga=GA1.2.1552320068.1576549906; _gid=GA1.2.2023492117.1576549906; gr_user_id=3d348e20-9820-49f1-afcb-8c55d72ad2d2; MEIQIA_TRACK_ID=1V5jsLMCkebhIrU4wdaptzAcxNh; MEIQIA_VISIT_ID=1V5jsNCG3mS1OZ381JMTk7aWSy4; Hm_lvt_1c587ad486cdb6b962e94fc2002edf89=1576549906,1576553882,1576568223,1576574298; _gat_gtag_UA_59006131_1=1; Hm_lpvt_1c587ad486cdb6b962e94fc2002edf89=1576574523' 50 | } 51 | try: 52 | json_data = requests.get(json_url,headers=headers,proxies=proxies) 53 | if json_data.status_code is 200: 54 | deatail_infos = json_data.json()['data']['info'] 55 | for data in deatail_infos: 56 | # 公司名称 57 | com_name = data['com_name'] 58 | # 公司简介 59 | com_description = data['com_des'] 60 | # 公司创建时间 61 | com_born = data['born'] 62 | # 公司关闭时间 63 | com_change_close_date = data['com_change_close_date'] 64 | # 公司类型 65 | com_style = data['cat_name'] + data['se_cat_name'] 66 | # 公司地址 67 | com_position = data['com_prov'] 68 | # 公司创始人 69 | com_team = data['com_team'] 70 | com_people = '' 71 | for name in com_team: 72 | com_people += name['name'] + '/' + name['per_des'] + ';' 73 | # 公司标签 74 | com_tag = data['com_tag'] 75 | com_tags = '' 76 | for tag in com_tag: 77 | com_tags += tag['tag_name'] + '/' 78 | # 投资轮次 79 | com_fund_status_name = data['com_fund_status_name'] 80 | # 投资公司 81 | com_invst = data['com_invst'] 82 | com_invsts = '' 83 | for com in com_invst: 84 | com_invsts += com['invst_name'] + '/' 85 | # 公司死亡原因 86 | closure_type = data['closure_type'] 87 | death_reason = '' 88 | for da in closure_type: 89 | death_reason += da['name'] + '/' 90 | # 公司存活时间 91 | live_time = data['live_time'] 92 | # 公司资金情况 93 | total_money = data['total_money'] 94 | # 公司类型 95 | cat_name = data['cat_name'] 96 | db = 
pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 97 | cursor = db.cursor() 98 | sql = 'INSERT INTO juzi_death_company_all_info(com_name,com_description,com_born,com_change_close_date,com_style,com_position,com_people,com_tags,com_fund_status_name,com_invsts,death_reason,live_time,total_money,cat_name) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 99 | try: 100 | cursor.execute(sql,(com_name,com_description,com_born,com_change_close_date,com_style,com_position,com_people,com_tags,com_fund_status_name,com_invsts,death_reason,live_time,total_money,cat_name)) 101 | db.commit() 102 | print('{}数据写入成功!!!'.format(com_name)) 103 | except: 104 | print('数据写入失败!') 105 | db.rollback() 106 | else: 107 | print('{}访问不了了!!!'.format(json_url)) 108 | except: 109 | print("{}数据爬取失败".format(json_url)) 110 | 111 | if __name__ == "__main__": 112 | start = time.time() 113 | print('当前环境CPU核数是:{}核'.format(multiprocessing.cpu_count())) 114 | json_urls = [ 'https://www.itjuzi.com/api/closure?com_prov=&sort=&page={}&keyword=&cat_id='.format(i) for i in range(1,629)] 115 | p = multiprocessing.Pool(4) 116 | i = 1 117 | proxies = proxy.get_proxy_random() 118 | for json_url in json_urls: 119 | if (i%35 == 0): 120 | proxy.get_proxy_random() 121 | p.apply_async(get_data,args=(json_url,proxies,)) 122 | i += 1 123 | p.close() 124 | p.join() 125 | print('总耗时:%.5f秒'% float(time.time()-start)) -------------------------------------------------------------------------------- /death_company/death_company_info - 副本.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/death_company_info - 副本.xls -------------------------------------------------------------------------------- /death_company/heart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/heart.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_death_reason.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_death_reason.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_financing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_financing.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_financing_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_financing_pie.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_live_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_live_time.jpg -------------------------------------------------------------------------------- 
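The save_data() cell in the lagou notebook and the insert block in death_company.py above both build a parameterized INSERT from a Python dict. The same pattern as a small self-contained helper, a sketch only: the host, credentials (use your own password), database name spider_data and the example table name are taken from, or assumed to match, the scripts above.

import pymysql

def insert_row(table, row):
    """Insert one dict as a row; column names come from the dict keys."""
    # Connection details mirror the scripts above (local MySQL, db 'spider_data').
    db = pymysql.connect(host='localhost', user='root', password='your_password',
                         port=3306, db='spider_data', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            keys = ', '.join(row.keys())
            placeholders = ', '.join(['%s'] * len(row))
            sql = 'INSERT INTO {} ({}) VALUES ({})'.format(table, keys, placeholders)
            # Passing the values separately lets pymysql handle the escaping.
            cursor.execute(sql, tuple(row.values()))
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()

# e.g. insert_row('juzi_death_company_all_info', {'com_name': '...', 'live_time': '...'})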
/death_company/可视化图表/com_position.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_position.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_position_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_position_pie.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_style.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_style.jpg -------------------------------------------------------------------------------- /death_company/可视化图表/com_style_pie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/death_company/可视化图表/com_style_pie.jpg -------------------------------------------------------------------------------- /jianshu/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2019-10-16T06:55:41.289Z] GhostDriver - Main - running on port 50831 2 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.customHeaders: - {} 4 | [INFO - 2019-10-16T06:55:41.398Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2019-10-16T06:55:41.398Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: f7b316e0-efe1-11e9-8204-558610a45b39 6 | [ERROR - 2019-10-16T06:55:53.212Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 7 | 8 | phantomjs://platform/console++.js:263 in error 9 | [ERROR - 2019-10-16T06:55:53.212Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 10 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 11 | 12 | phantomjs://platform/console++.js:263 in error 13 | [ERROR - 2019-10-16T06:55:55.207Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 14 | 15 | 
phantomjs://platform/console++.js:263 in error 16 | [ERROR - 2019-10-16T06:55:55.207Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 17 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 18 | 19 | phantomjs://platform/console++.js:263 in error 20 | [ERROR - 2019-10-16T06:55:57.042Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 21 | 22 | phantomjs://platform/console++.js:263 in error 23 | [ERROR - 2019-10-16T06:55:57.042Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 24 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 25 | 26 | phantomjs://platform/console++.js:263 in error 27 | [ERROR - 2019-10-16T06:55:59.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 28 | 29 | phantomjs://platform/console++.js:263 in error 30 | [ERROR - 2019-10-16T06:55:59.032Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 31 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 32 | 33 | phantomjs://platform/console++.js:263 in error 34 | [ERROR - 2019-10-16T06:56:01.124Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 35 | 36 | phantomjs://platform/console++.js:263 in error 37 | [ERROR - 2019-10-16T06:56:01.124Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 38 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 39 | 40 | phantomjs://platform/console++.js:263 in error 41 | [ERROR - 2019-10-16T06:56:03.133Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 42 | 43 | phantomjs://platform/console++.js:263 in error 44 | [ERROR - 2019-10-16T06:56:03.134Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 45 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 46 | 47 | phantomjs://platform/console++.js:263 in error 48 | [ERROR - 2019-10-16T06:56:04.906Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 49 | 50 | phantomjs://platform/console++.js:263 in error 51 | [ERROR - 2019-10-16T06:56:04.906Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 52 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 53 | 54 | phantomjs://platform/console++.js:263 in error 55 | [ERROR - 2019-10-16T06:56:06.585Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 56 | 57 | phantomjs://platform/console++.js:263 in error 58 | [ERROR - 2019-10-16T06:56:06.585Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 59 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 60 | 61 | 
phantomjs://platform/console++.js:263 in error 62 | [ERROR - 2019-10-16T06:56:09.043Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 63 | 64 | phantomjs://platform/console++.js:263 in error 65 | [ERROR - 2019-10-16T06:56:09.043Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 66 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 67 | 68 | phantomjs://platform/console++.js:263 in error 69 | [ERROR - 2019-10-16T06:56:12.430Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 70 | 71 | phantomjs://platform/console++.js:263 in error 72 | [ERROR - 2019-10-16T06:56:12.430Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 73 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 74 | 75 | phantomjs://platform/console++.js:263 in error 76 | [ERROR - 2019-10-16T06:56:14.119Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 77 | 78 | phantomjs://platform/console++.js:263 in error 79 | [ERROR - 2019-10-16T06:56:14.119Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 80 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 81 | 82 | phantomjs://platform/console++.js:263 in error 83 | [ERROR - 2019-10-16T06:56:15.780Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 84 | 85 | phantomjs://platform/console++.js:263 in error 86 | [ERROR - 2019-10-16T06:56:15.780Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 87 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 88 | 89 | phantomjs://platform/console++.js:263 in error 90 | [ERROR - 2019-10-16T06:56:17.465Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 91 | 92 | phantomjs://platform/console++.js:263 in error 93 | [ERROR - 2019-10-16T06:56:17.465Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 94 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 95 | 96 | phantomjs://platform/console++.js:263 in error 97 | [ERROR - 2019-10-16T06:56:19.819Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 98 | 99 | phantomjs://platform/console++.js:263 in error 100 | [ERROR - 2019-10-16T06:56:19.820Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 101 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 102 | 103 | phantomjs://platform/console++.js:263 in error 104 | [ERROR - 2019-10-16T06:56:22.468Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 105 | 106 | phantomjs://platform/console++.js:263 in error 107 
| [ERROR - 2019-10-16T06:56:22.468Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 108 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 109 | 110 | phantomjs://platform/console++.js:263 in error 111 | [ERROR - 2019-10-16T06:56:24.191Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 112 | 113 | phantomjs://platform/console++.js:263 in error 114 | [ERROR - 2019-10-16T06:56:24.191Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 115 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 116 | 117 | phantomjs://platform/console++.js:263 in error 118 | [ERROR - 2019-10-16T06:56:26.351Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 119 | 120 | phantomjs://platform/console++.js:263 in error 121 | [ERROR - 2019-10-16T06:56:26.352Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 122 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 123 | 124 | phantomjs://platform/console++.js:263 in error 125 | [ERROR - 2019-10-16T06:56:28.419Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 126 | 127 | phantomjs://platform/console++.js:263 in error 128 | [ERROR - 2019-10-16T06:56:28.419Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 129 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 130 | 131 | phantomjs://platform/console++.js:263 in error 132 | [ERROR - 2019-10-16T06:56:30.058Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 133 | 134 | phantomjs://platform/console++.js:263 in error 135 | [ERROR - 2019-10-16T06:56:30.058Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 136 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 137 | 138 | phantomjs://platform/console++.js:263 in error 139 | [ERROR - 2019-10-16T06:56:32.359Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 140 | 141 | phantomjs://platform/console++.js:263 in error 142 | [ERROR - 2019-10-16T06:56:32.359Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 143 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 144 | 145 | phantomjs://platform/console++.js:263 in error 146 | [ERROR - 2019-10-16T06:56:34.092Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 147 | 148 | phantomjs://platform/console++.js:263 in error 149 | [ERROR - 2019-10-16T06:56:34.092Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 150 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 151 | 152 | 
phantomjs://platform/console++.js:263 in error 153 | [ERROR - 2019-10-16T06:56:36.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 154 | 155 | phantomjs://platform/console++.js:263 in error 156 | [ERROR - 2019-10-16T06:56:36.031Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 157 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 158 | 159 | phantomjs://platform/console++.js:263 in error 160 | [ERROR - 2019-10-16T06:56:37.748Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 161 | 162 | phantomjs://platform/console++.js:263 in error 163 | [ERROR - 2019-10-16T06:56:37.748Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 164 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 165 | 166 | phantomjs://platform/console++.js:263 in error 167 | [ERROR - 2019-10-16T06:56:39.790Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 168 | 169 | phantomjs://platform/console++.js:263 in error 170 | [ERROR - 2019-10-16T06:56:39.790Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 171 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 172 | 173 | phantomjs://platform/console++.js:263 in error 174 | [ERROR - 2019-10-16T06:56:41.954Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 175 | 176 | phantomjs://platform/console++.js:263 in error 177 | [ERROR - 2019-10-16T06:56:41.954Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 178 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 179 | 180 | phantomjs://platform/console++.js:263 in error 181 | [ERROR - 2019-10-16T06:56:44.333Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 182 | 183 | phantomjs://platform/console++.js:263 in error 184 | [ERROR - 2019-10-16T06:56:44.333Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 185 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 186 | 187 | phantomjs://platform/console++.js:263 in error 188 | [ERROR - 2019-10-16T06:56:46.202Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 189 | 190 | phantomjs://platform/console++.js:263 in error 191 | [ERROR - 2019-10-16T06:56:46.202Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 192 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 193 | 194 | phantomjs://platform/console++.js:263 in error 195 | [ERROR - 2019-10-16T06:56:47.870Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 196 | 197 | 
phantomjs://platform/console++.js:263 in error 198 | [ERROR - 2019-10-16T06:56:47.870Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 199 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 200 | 201 | phantomjs://platform/console++.js:263 in error 202 | [ERROR - 2019-10-16T06:56:49.511Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 203 | 204 | phantomjs://platform/console++.js:263 in error 205 | [ERROR - 2019-10-16T06:56:49.511Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 206 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 207 | 208 | phantomjs://platform/console++.js:263 in error 209 | [ERROR - 2019-10-16T06:56:51.597Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 210 | 211 | phantomjs://platform/console++.js:263 in error 212 | [ERROR - 2019-10-16T06:56:51.597Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 213 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 214 | 215 | phantomjs://platform/console++.js:263 in error 216 | [ERROR - 2019-10-16T06:56:53.240Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 217 | 218 | phantomjs://platform/console++.js:263 in error 219 | [ERROR - 2019-10-16T06:56:53.240Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 220 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 221 | 222 | phantomjs://platform/console++.js:263 in error 223 | [ERROR - 2019-10-16T06:56:55.317Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 224 | 225 | phantomjs://platform/console++.js:263 in error 226 | [ERROR - 2019-10-16T06:56:55.317Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 227 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 228 | 229 | phantomjs://platform/console++.js:263 in error 230 | [ERROR - 2019-10-16T06:56:57.056Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 231 | 232 | phantomjs://platform/console++.js:263 in error 233 | [ERROR - 2019-10-16T06:56:57.056Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 234 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 235 | 236 | phantomjs://platform/console++.js:263 in error 237 | [ERROR - 2019-10-16T06:56:58.943Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 238 | 239 | phantomjs://platform/console++.js:263 in error 240 | [ERROR - 2019-10-16T06:56:58.944Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 241 | (anonymous function) 
(https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 242 | 243 | phantomjs://platform/console++.js:263 in error 244 | [ERROR - 2019-10-16T06:57:00.550Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 245 | 246 | phantomjs://platform/console++.js:263 in error 247 | [ERROR - 2019-10-16T06:57:00.550Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 248 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 249 | 250 | phantomjs://platform/console++.js:263 in error 251 | [ERROR - 2019-10-16T06:57:02.730Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 252 | 253 | phantomjs://platform/console++.js:263 in error 254 | [ERROR - 2019-10-16T06:57:02.731Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 255 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 256 | 257 | phantomjs://platform/console++.js:263 in error 258 | [ERROR - 2019-10-16T06:57:04.680Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - msg: TypeError: undefined is not a constructor (evaluating 't.getEntriesByType("navigation")') 259 | 260 | phantomjs://platform/console++.js:263 in error 261 | [ERROR - 2019-10-16T06:57:04.680Z] Session [f7b316e0-efe1-11e9-8204-558610a45b39] - page.onError - stack: 262 | (anonymous function) (https://cdn2.jianshu.io/shakespeare/_next/static/runtime/main-d69502b33991d2df6c95.js:1) 263 | 264 | phantomjs://platform/console++.js:263 in error 265 | -------------------------------------------------------------------------------- /jianshu/jianshu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__init__.py -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JianshuItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | author = scrapy.Field() 16 | time = scrapy.Field() 17 | word_num = scrapy.Field() 18 | read_num = scrapy.Field() 19 | content = scrapy.Field() 20 | comment_num = scrapy.Field() 21 | support_num = scrapy.Field() 22 | process_url = scrapy.Field() 23 | article_id = scrapy.Field() 24 | origin_url = scrapy.Field() 25 | tags = scrapy.Field() 26 | pass 27 | -------------------------------------------------------------------------------- /jianshu/jianshu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from selenium import webdriver 10 | import time 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.support import expected_conditions as EC 14 | from scrapy.http.response.html import HtmlResponse 15 | 16 | class SeleniumDownloadMiddleware(object): 17 | 18 | def __init__(self): 19 | self.borswer = webdriver.Chrome(r"E:/0_软件/3_爬虫相关软件/chromedriver_win32/chromedriver.exe") 20 | self.wait = WebDriverWait(self.borswer,10) 21 | 22 | def process_request(self, request, spider): 23 | self.borswer.get(request.url) 24 | print('我正在使用Selenium加载网页!!!') 25 | time.sleep(1) 26 | try: 27 | while True: 28 | showmore = self.borswer.browser.find_element_by_class_name('H7E3vT') 29 | showmore.click() 30 | time.sleep(0.3) 31 | if not showmore: 32 | break 33 | except: 34 | pass 35 | source = self.borswer.page_source 36 | response = HtmlResponse(url=self.borswer.current_url, request=request, body=source, encoding='utf-8') 37 | return response 38 | 39 | 40 | class JianshuSpiderMiddleware(object): 41 | # Not all methods need to be defined. If a method is not defined, 42 | # scrapy acts as if the spider middleware does not modify the 43 | # passed objects. 44 | 45 | @classmethod 46 | def from_crawler(cls, crawler): 47 | # This method is used by Scrapy to create your spiders. 48 | s = cls() 49 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 50 | return s 51 | 52 | def process_spider_input(self, response, spider): 53 | # Called for each response that goes through the spider 54 | # middleware and into the spider. 55 | 56 | # Should return None or raise an exception. 
57 | return None 58 | 59 | def process_spider_output(self, response, result, spider): 60 | # Called with the results returned from the Spider, after 61 | # it has processed the response. 62 | 63 | # Must return an iterable of Request, dict or Item objects. 64 | for i in result: 65 | yield i 66 | 67 | def process_spider_exception(self, response, exception, spider): 68 | # Called when a spider or process_spider_input() method 69 | # (from other spider middleware) raises an exception. 70 | 71 | # Should return either None or an iterable of Response, dict 72 | # or Item objects. 73 | pass 74 | 75 | def process_start_requests(self, start_requests, spider): 76 | # Called with the start requests of the spider, and works 77 | # similarly to the process_spider_output() method, except 78 | # that it doesn’t have a response associated. 79 | 80 | # Must return only requests (not items). 81 | for r in start_requests: 82 | yield r 83 | 84 | def spider_opened(self, spider): 85 | spider.logger.info('Spider opened: %s' % spider.name) 86 | 87 | 88 | class JianshuDownloaderMiddleware(object): 89 | # Not all methods need to be defined. If a method is not defined, 90 | # scrapy acts as if the downloader middleware does not modify the 91 | # passed objects. 92 | 93 | @classmethod 94 | def from_crawler(cls, crawler): 95 | # This method is used by Scrapy to create your spiders. 96 | s = cls() 97 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 98 | return s 99 | 100 | def process_request(self, request, spider): 101 | # Called for each request that goes through the downloader 102 | # middleware. 103 | 104 | # Must either: 105 | # - return None: continue processing this request 106 | # - or return a Response object 107 | # - or return a Request object 108 | # - or raise IgnoreRequest: process_exception() methods of 109 | # installed downloader middleware will be called 110 | return None 111 | 112 | def process_response(self, request, response, spider): 113 | # Called with the response returned from the downloader. 114 | 115 | # Must either; 116 | # - return a Response object 117 | # - return a Request object 118 | # - or raise IgnoreRequest 119 | return response 120 | 121 | def process_exception(self, request, exception, spider): 122 | # Called when a download handler or a process_request() 123 | # (from other downloader middleware) raises an exception. 
124 | 125 | # Must either: 126 | # - return None: continue processing this exception 127 | # - return a Response object: stops process_exception() chain 128 | # - return a Request object: stops process_exception() chain 129 | pass 130 | 131 | def spider_opened(self, spider): 132 | spider.logger.info('Spider opened: %s' % spider.name) -------------------------------------------------------------------------------- /jianshu/jianshu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | class JianshuPipeline(object): 10 | def __init__(self): 11 | params = { 12 | 'host':'localhost', 13 | 'user':'root', 14 | 'password':'pass4321', 15 | 'port':3306, 16 | 'db':'jianshu', 17 | 'charset':'utf8' 18 | } 19 | self.db = pymysql.connect(**params) 20 | self.cursor = self.db.cursor() 21 | self._sql = None 22 | 23 | @property 24 | def sql(self): 25 | if not self._sql: 26 | self._sql = '''INSERT INTO article (title,author,pub_time,word_num,read_num,content,comment_num,support_num,process_url,article_id,origin_url,tags)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' 27 | return self._sql 28 | return self._sql 29 | 30 | def process_item(self, item, spider): 31 | self.cursor.execute(self.sql,(item['title'],item['author'],item['time'],item['word_num'],item['read_num'],item['content'],item['comment_num'],item['support_num'],item['process_url'],item['article_id'],item['origin_url'],item['tags'])) 32 | self.db.commit() 33 | return item 34 | -------------------------------------------------------------------------------- /jianshu/jianshu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jianshu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jianshu' 13 | 14 | SPIDER_MODULES = ['jianshu.spiders'] 15 | NEWSPIDER_MODULE = 'jianshu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jianshu (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'jianshu.middlewares.JianshuSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543, 58 | 'jianshu.middlewares.SeleniumDownloadMiddleware': 543, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'jianshu.pipelines.JianshuPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
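settings.py above enables jianshu.middlewares.SeleniumDownloadMiddleware, which drives Chrome and hands Scrapy an HtmlResponse instead of letting the downloader fetch the page. For reference, a minimal sketch of the same idea (assuming chromedriver is available on PATH) that also quits the browser when the spider closes, something the version in middlewares.py never does:

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # Close the browser when the spider finishes.
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def process_request(self, request, spider):
        # Render the page in a real browser and return the HTML to Scrapy.
        self.driver.get(request.url)
        return HtmlResponse(url=self.driver.current_url, request=request,
                            body=self.driver.page_source, encoding='utf-8')

    def spider_closed(self, spider):
        self.driver.quit()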
-------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/__pycache__/js.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/jianshu/jianshu/spiders/__pycache__/js.cpython-37.pyc -------------------------------------------------------------------------------- /jianshu/jianshu/spiders/js.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from jianshu.items import JianshuItem 6 | 7 | class JsSpider(CrawlSpider): 8 | name = 'js' 9 | allowed_domains = ["jianshu.com"] 10 | start_urls = ["https://www.jianshu.com/"] 11 | rules = ( 12 | Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True), 13 | ) 14 | 15 | def parse_detail(self, response): 16 | title = response.xpath('//h1[@class="_1RuRku"]/text()').get() 17 | author = response.xpath('//span[@class="_22gUMi"]/text()').get() 18 | time = response.xpath('//div[@class="s-dsoj"]/time/text()').get() 19 | word_num = response.xpath('//div[@class="s-dsoj"]/span/text()').getall()[0].split(' ')[-1] 20 | read_num = response.xpath('//div[@class="s-dsoj"]/span/text()').getall()[1].split(' ')[-1] 21 | content = response.xpath('//article[@class="_2rhmJa"]').get() 22 | comment_num = response.xpath('//div[@class="-pXE92"]//span/text()').getall()[1] 23 | support_num = response.xpath('//div[@class="-pXE92"]//span/text()').getall()[-1] 24 | process_url = response.url.split('?')[0] 25 | article_id = process_url.split('/')[-1] 26 | tags = response.xpath('//div[@class="_2Nttfz"]/a//span/text()').getall() 27 | tags = ','.join(tags) 28 | origin_url = response.url 29 | item = JianshuItem(title=title,author=author,time=time,word_num=word_num,read_num=read_num, 30 | content=content,comment_num=comment_num,support_num=support_num,process_url=process_url,article_id=article_id, 31 | origin_url=origin_url,tags=tags) 32 | return item 33 | -------------------------------------------------------------------------------- /jianshu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jianshu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jianshu 12 | -------------------------------------------------------------------------------- /jianshu/start.py: 
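The Rule in js.py above selects article pages with the pattern r'.*/p/[0-9a-z]{12}.*'. A quick stand-alone sanity check of that pattern; the URLs below are made-up examples, not data from the crawl:

import re

# re.search mirrors LinkExtractor's allow= behaviour closely enough here,
# since the pattern already starts and ends with '.*'.
PATTERN = re.compile(r'.*/p/[0-9a-z]{12}.*')

examples = [
    'https://www.jianshu.com/p/0123456789ab',        # article page: matches
    'https://www.jianshu.com/u/abcdef123456',        # user page, no /p/ segment: no match
    'https://www.jianshu.com/p/0123456789ab?utm=1',  # trailing query string still matches
]
for url in examples:
    print(url, bool(PATTERN.search(url)))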
-------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | execute('scrapy crawl js'.split(' ')) -------------------------------------------------------------------------------- /ppt_download_spider/ppt_download_spider.py: -------------------------------------------------------------------------------- 1 | from urllib import request,error 2 | from lxml import etree 3 | import os 4 | import time 5 | 6 | class Get_PPT(): 7 | 8 | def __init__(self): 9 | 10 | self.base_url = 'http://www.1ppt.com' 11 | self.page_url = 'ppt_dabian_{}.html'.format(1) 12 | self.header = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' 14 | } 15 | 16 | # 获取进入ppt详细介绍页面的url 17 | def get_urls(self,url): 18 | req = request.Request(url, headers=self.header) 19 | response = request.urlopen(req) 20 | text = response.read().decode('gb2312') 21 | html = etree.HTML(text) 22 | # 获取进入ppt详细介绍页面的url 23 | detail_url = html.xpath("//ul[@class='tplist']//h2/a/@href") 24 | ppt_urls = [] 25 | for url in detail_url: 26 | ppt_urls.append(self.base_url+url) 27 | return ppt_urls 28 | # 获取详细页面中PPT下载的链接 29 | def get_down_ppt_url(self,url,file_name): 30 | ppt_urls = self.get_urls(url) 31 | for url in ppt_urls: 32 | print("下载链接为{}的ppt模板".format(url)) 33 | req = request.Request(url, headers=self.header) 34 | response = request.urlopen(req) 35 | text = response.read().decode('gb2312') 36 | html = etree.HTML(text) 37 | name = html.xpath("//div[@class='ppt_info clearfix']/h1/text()")[0] 38 | down_url = html.xpath("//ul[@class='downurllist']//a/@href")[0] 39 | try: 40 | time.sleep(1) 41 | spon = request.urlopen(down_url) 42 | filename = file_name + '/' + '{}.zip'.format(name) 43 | with open(filename,"wb") as code: 44 | code.write(spon.read()) 45 | except error.HTTPError: 46 | print("403 Forbidden!!!!") 47 | if __name__ == "__main__": 48 | ppt = Get_PPT() 49 | # 获取论文答辩模块的PPT模板(可根据自己需求更改、此时页数也需要根据此模块的情况进行修改) 50 | for i in range(1,9): 51 | print("下载第{}页的PPT".format(i)) 52 | lunwen_url = '/xiazai/dabian/ppt_dabian_{}.html'.format(i) 53 | file_name = "第{}页PPT总和".format(i) 54 | if not os.path.exists(file_name): 55 | os.mkdir(file_name) 56 | url = ppt.base_url + lunwen_url 57 | ppt.get_down_ppt_url(url,file_name) -------------------------------------------------------------------------------- /proxy_design/__pycache__/connect_redis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/proxy_design/__pycache__/connect_redis.cpython-37.pyc -------------------------------------------------------------------------------- /proxy_design/connect_redis.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | class RedisClient(object): 4 | def __init__(self): 5 | self.key = 'proxy' 6 | if not hasattr(self, 'pool'): 7 | self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 8 | self.getConnection() 9 | 10 | 11 | def getConnection(self): 12 | self._conn = redis.StrictRedis(connection_pool=self.pool) 13 | 14 | 15 | def add(self, value): 16 | return self._conn.sadd(self.key, value) 17 | 18 | 19 | def random(self): 20 | return self._conn.srandmember(self.key) 21 | 22 | 23 | def delete(self, value): 24 | return self._conn.srem(self.key, value) 25 | 26 | 27 | r = RedisClient() 
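connect_redis.py wraps a single Redis set named 'proxy' and exports a ready-made client instance r, which proxy.py below imports directly. A quick usage example, assuming a local Redis server on 127.0.0.1:6379 as configured above (srandmember returns bytes, which is why callers decode with utf-8):

from connect_redis import r

r.add('1.2.3.4:8080')      # SADD proxy "1.2.3.4:8080" -> 1 if the member was newly added
print(r.random())          # e.g. b'1.2.3.4:8080', or None when the set is empty
r.delete('1.2.3.4:8080')   # SREM proxy "1.2.3.4:8080" -> 1 if the member was removed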
-------------------------------------------------------------------------------- /proxy_design/proxy.py: --------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from connect_redis import r
4 | import multiprocessing
5 | 
6 | class Proxy(object):
7 |     def __init__(self):
8 |         self.key = 'proxy'
9 |         self.headers = {
10 |             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36'
11 |         }
12 |         self.proxy_list = []
13 |         self.url = ['http://www.66ip.cn/{}.html'.format(i) for i in range(1, 21)]
14 | 
15 |     def getContent(self):
16 |         '''
17 |         获取网站源代码
18 |         :return:
19 |         '''
20 |         # url = 'http://www.66ip.cn/1.html'
21 |         # 由于一页仅有5个代理IP,因此默认获取20页
22 |         for url in self.url:
23 |             data = requests.get(url, headers=self.headers)
24 |             content = data.text.encode('ISO-8859-1').decode(requests.utils.get_encodings_from_content(data.text)[0])
25 |             self.parse(content)
26 | 
27 |     def parse(self, content):
28 |         '''
29 |         解析网站源代码
30 |         :return:
31 |         '''
32 |         html = etree.HTML(content)
33 |         result = html.xpath("//div[@align='center']/table/tr")[1:]
34 |         for row in result:
35 |             proxy_ip = row.xpath("./td/text()")[0]
36 |             proxy_port = row.xpath("./td/text()")[1]
37 |             proxy_address = row.xpath("./td/text()")[2]
38 |             proxy_style = row.xpath("./td/text()")[3]
39 |             proxy_check_time = row.xpath("./td/text()")[4]
40 |             # 将获取的代理存放至列表中
41 |             self.proxy_list.append(proxy_ip+'+'+proxy_port+'+'+proxy_address+'+'+proxy_style+'+'+proxy_check_time)
42 |             proxy_value = proxy_ip+":"+proxy_port
43 |             print(proxy_value)
44 |             # 将代理添加至redis数据库中
45 |             r.add(proxy_value)
46 | 
47 |     def get_proxy_random(self):
48 |         '''
49 |         随机获取代理地址
50 |         :return:
51 |         '''
52 |         # 通过其是否可以访问百度验证其有效性
53 |         url = 'https://www.baidu.com'
54 |         value = r.random()
55 |         print(value)
56 |         if value is None:
57 |             self.getContent(); value = r.random()  # 代理池为空时先抓取代理,再重新随机取一个
58 |         proxies = {"http": "http://" + value.decode("utf-8")}
59 |         print(proxies)
60 |         try:
61 |             data = requests.get(url=url, headers=self.headers, proxies=proxies, timeout=5)
62 |             if data.status_code != 200:
63 |                 print('代理无效,进行删除')
64 |                 r.delete(value)
65 |                 return self.get_proxy_random()
66 |             else:
67 |                 print('可以访问百度网页!有效代理')
68 |                 return 'http://' + value.decode("utf-8")
69 |         except Exception:
70 |             print('代理无效,进行删除')
71 |             r.delete(value)
72 |             return self.get_proxy_random()
73 | 
74 | proxy = Proxy()
75 | # ip = proxy.get_proxy_random()
76 | # print(ip)
77 | 
78 | 
-------------------------------------------------------------------------------- /stock/A_stock_company.py: --------------------------------------------------------------------------------
1 | import requests
2 | import pymysql
3 | from lxml import etree
4 | import time
5 | import random
6 | 
7 | 
8 | def crawl_stock_company(url, connect):
9 |     headers = {
10 |         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \
11 |             AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \
12 |             Safari/537.36"
13 |     }
14 |     data = requests.get(url, headers=headers).text
15 |     html = etree.HTML(data)
16 |     tr_data = html.xpath("//table[@id='myTable04']//tbody/tr")
17 |     for tr in tr_data:
18 |         stock_id = tr.xpath(".//td//text()")[1]
19 |         stock_name = tr.xpath(".//td//text()")[2]
20 |         company_name = tr.xpath(".//td//text()")[3]
21 |         company_province = tr.xpath(".//td//text()")[4]
22 |         company_loc = tr.xpath(".//td//text()")[5]
23 |         company_num = tr.xpath(".//td//text()")[8]
24 |         company_create_date = tr.xpath(".//td//text()")[9]
25 |         company_trade = tr.xpath(".//td//text()")[-3]
26 |         company_business = 
tr.xpath(".//td//text()")[-1] 27 | value = (stock_id, stock_name, company_name, company_province, 28 | company_loc, company_num, company_create_date, 29 | company_trade, company_business) 30 | sql = "insert into All_Stock_Name(stock_id, stock_name, \ 31 | company_name, company_province, company_loc, company_num, \ 32 | company_create_date,company_trade, company_business) values \ 33 | (%s, %s, %s, %s, %s, %s, %s, %s, %s)" 34 | cursor = connect.cursor() 35 | cursor.execute(sql, value) 36 | connect.commit() 37 | print(value) 38 | 39 | 40 | if __name__ == "__main__": 41 | connect = pymysql.connect(host='xxxxx', user='root', password= 42 | 'xxxxxx', port=3306, db='stock') 43 | file = open("new_{}.text".format(time.time()), "w") 44 | for i in range(1, 207): 45 | try: 46 | url = "https://s.askci.com/stock/a/0-0?reportTime=2020-03-31&pageNum={}#\ 47 | QueryCondition".format(i) 48 | crawl_stock_company(url, connect) 49 | time.sleep(random.randint(1, 2)) 50 | except(Exception): 51 | print("股票抓取失败!!!") 52 | file.write("股票{}数据未抓取成功!".format(i)+"\n") 53 | -------------------------------------------------------------------------------- /stock/stock_spider_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | """ 3 | * @Author: Jack Shan 4 | * @Date: 2020-11-17 14:20:17 5 | * @Last Modified by: Jack Shan 6 | * @Last Modified time: 2020-11-17 14:20:17 7 | """ 8 | # 相关库的导入 9 | import requests 10 | import pymysql 11 | from lxml import etree 12 | import pandas as pd 13 | import threading 14 | import time 15 | import datetime 16 | import random 17 | from queue import Queue 18 | import logging 19 | import os 20 | 21 | 22 | # 添加日志相关内容 23 | # 创建一个logger 24 | logger = logging.getLogger() 25 | logger.setLevel(level=logging.INFO) 26 | # 创建handler,用于写入日志文件 27 | file = time.strftime("%Y%m%d%H%M", time.localtime(time.time())) 28 | log_path = os.path.dirname(os.getcwd()) + '/stock/Logs/' 29 | log_name = log_path + file + '.log' 30 | logfile = log_name 31 | fh = logging.FileHandler(logfile, mode="w", encoding="UTF-8") 32 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 33 | ch = logging.StreamHandler() 34 | ch.setLevel(logging.WARNING) # 输出到console的log等级的开关 35 | # 第三步,定义handler的输出格式 36 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] -\ 37 | %(levelname)s: %(message)s") 38 | fh.setFormatter(formatter) 39 | ch.setFormatter(formatter) 40 | # 第四步,将logger添加到handler里面 41 | logger.addHandler(fh) 42 | logger.addHandler(ch) 43 | # 定义日志 44 | 45 | 46 | class CrwalStockName(threading.Thread): 47 | 48 | headers = { 49 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \ 50 | AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \ 51 | Safari/537.36" 52 | } 53 | 54 | def __init__(self, page_queue, stock_name, *args, **kwargs): 55 | super(CrwalStockName, self).__init__(*args, **kwargs) 56 | self.page_queue = page_queue 57 | self.stock_name = stock_name 58 | 59 | def run(self): 60 | while True: 61 | if self.page_queue.empty(): 62 | break 63 | url = self.page_queue.get() 64 | if "funds" not in url: 65 | self.parse_page(url) 66 | else: 67 | self.parse_page_detail(url) 68 | 69 | def parse_page(self, url): 70 | try: 71 | # logger.info('{}网页解析完成'.format(url)) 72 | response = requests.get(url=url, headers=self.headers) 73 | html = etree.HTML(response.text) 74 | text1 = "".join(html.xpath("//div[@id='history_funds_analysis_free']\ 75 | //p//text()")) 76 | text2 = html.xpath("//p[@class='zjlxlstj_txt mb14']//\ 77 | 
text()")[0].strip() 78 | text = text1 + "&" + text2 79 | stock_id = str(url).split("/")[-2] 80 | date = datetime.datetime.now() 81 | today = "{}/{}/{}".format(date.year, date.month, date.day) 82 | self.stock_name.put((text, stock_id, today)) 83 | time.sleep(random.randint(1, 2)) 84 | except(Exception): 85 | logger.info('{}网页解析失败'.format(url)) 86 | self.page_queue.put(url) 87 | 88 | def parse_page_detail(self, url): 89 | try: 90 | # logger.info('{}网页解析完成'.format(url)) 91 | response = requests.get(url=url, headers=self.headers) 92 | html = etree.HTML(response.text) 93 | data = html.xpath("//div[@id='history_table_free']//tr")[2:] 94 | stock_id = str(url).split("/")[-3] 95 | for i in data: 96 | date = tuple(i.xpath(".//td//text()")) 97 | value = (stock_id, date[0], date[1], date[2], date[3], date[4], 98 | date[5], date[6], date[7], date[8], date[9], date[10]) 99 | self.stock_name.put(value) 100 | time.sleep(random.randint(1, 2)) 101 | except(Exception): 102 | logger.info('{}网页解析失败'.format(url)) 103 | self.page_queue.put(url) 104 | 105 | 106 | class StockNameConsumer(threading.Thread): 107 | 108 | headers = { 109 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) \ 110 | AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 \ 111 | Safari/537.36" 112 | } 113 | 114 | sql_recode = [] 115 | 116 | def __init__(self, page_queue, stock_name, connect, *args, **kwargs): 117 | super(StockNameConsumer, self).__init__(*args, **kwargs) 118 | self.page_queue = page_queue 119 | self.stock_name = stock_name 120 | self.connect = connect 121 | 122 | def run(self): 123 | while True: 124 | if self.stock_name.empty(): 125 | if self.page_queue.empty(): 126 | return 127 | data = self.stock_name.get() 128 | print(data) 129 | if len(data) == 3: 130 | self.save_data(data) 131 | else: 132 | self.save_data_detail(data) 133 | 134 | def save_data(self, data): 135 | try: 136 | print(data[0], data[1]) 137 | self.connect.ping(reconnect=True) 138 | # sql = "update stock_name_new_copy2 set text = '{}' where id = {}" 139 | # .format(data[0], data[1]) 140 | sql = "insert into stock_name_new_copy3(text, id, date) values (%s, \ 141 | %s, %s)" 142 | cursor = self.connect.cursor() 143 | cursor.execute(sql, tuple(data)) 144 | self.connect.commit() 145 | except(Exception): 146 | logger.error('{}数据保存数据库失败'.format(data)) 147 | 148 | def save_data_detail(self, data): 149 | try: 150 | # logger.info('{}数据保存完成'.format(data)) 151 | self.connect.ping(reconnect=True) 152 | sql = "insert into stock_price_new_copy3(id, tr_time, end_price, \ 153 | up_down, money_in, d5_in_big, b_money, b_part, m_money, m_part,\ 154 | l_money, l_part) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, \ 155 | %s, %s)" 156 | cursor = self.connect.cursor() 157 | cursor.execute(sql, data) 158 | self.connect.commit() 159 | except(Exception): 160 | logger.error('{}数据保存数据库失败'.format(data)) 161 | 162 | 163 | def main(): 164 | connect = pymysql.connect(host='xxxxx', user='root', password='\ 165 | xxxxx', port=3306, db='stock') 166 | page_queue = Queue() 167 | stock_name = Queue() 168 | stock_id = pd.read_sql(sql='select distinct stock_id from All_Stock_Name', 169 | con=connect) 170 | stock_list = list(stock_id["stock_id"]) 171 | for i in stock_list: 172 | text_url = "http://stockpage.10jqka.com.cn/"+i+"/" 173 | detail_url = "http://stockpage.10jqka.com.cn/"+i+"/funds/" 174 | page_queue.put(text_url) 175 | page_queue.put(detail_url) 176 | 177 | for i in range(2): 178 | t = CrwalStockName(page_queue=page_queue, stock_name=stock_name) 179 | t.start() 180 | 
181 | for i in range(3): 182 | t = StockNameConsumer(page_queue=page_queue, stock_name=stock_name, 183 | connect=connect) 184 | t.start() 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /vehicle_home/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = vehicle_home.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = vehicle_home 12 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__init__.py -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/items.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/items.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/pipelines.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/pipelines.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class VehicleHomeItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | # info_length = scrapy.Field() 13 | usercont = scrapy.Field() 14 | vehicle_style = scrapy.Field() 15 | vehicle_by_location = scrapy.Field() 16 | vehicle_seller = scrapy.Field() 17 | vehcle_seller_date = scrapy.Field() 18 | vehicle_seller_money = scrapy.Field() 19 | vehicle_status = scrapy.Field() 20 | vehicle_evaluate = scrapy.Field() 21 | vehicle_bu_aim = scrapy.Field() 22 | evalute_date = scrapy.Field() 23 | evalute_title = scrapy.Field() 24 | evalute_infos 
= scrapy.Field() 25 | visit_count = scrapy.Field() 26 | helpful_count = scrapy.Field() 27 | comment_count = scrapy.Field() 28 | vehicle_brand = scrapy.Field() 29 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class VehicleHomeSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class VehicleHomeDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | # from itemadapter import ItemAdapter 9 | import pymysql 10 | 11 | 12 | class VehicleHomePipeline: 13 | def __init__(self): 14 | self.connect = pymysql.connect( 15 | host='xxxx', user='root', 16 | password='xxxxx', db='Spider', 17 | port=3306) 18 | self.cursor = self.connect.cursor() 19 | 20 | def process_item(self, item, spider): 21 | print("----------开启数据库存储模式----------------") 22 | self.connect.ping(reconnect=True) 23 | sql = 'insert into vehicle_home_new(usercont, vehicle_style, vehicle_by_location, vehicle_seller, vehicle_brand, vehcle_seller_date, vehicle_seller_money, vehicle_status, vehicle_evaluate,vehicle_bu_aim,evalute_date,evalute_title,visit_count,helpful_count,comment_count,evalute_infos) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 24 | self.cursor.execute(sql, (item["usercont"], ";".join(item["vehicle_style"]), item['vehicle_by_location'], item['vehicle_seller'], item['vehicle_brand'].split("-")[0], item['vehcle_seller_date'], "".join(item['vehicle_seller_money']), " ".join(item['vehicle_status']), " ".join(item['vehicle_evaluate']), " ".join(item['vehicle_bu_aim']), item['evalute_date'], item['evalute_title'], item['visit_count'], item['helpful_count'], item['comment_count'], item["evalute_infos"])) 25 | self.connect.commit() 26 | return item 27 | 28 | def close_spider(self, spider): 29 | print('----------关闭数据库资源-----------') 30 | # 关闭游标 31 | self.cursor.close() 32 | # 关闭连接 33 | self.connect.close() 34 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for vehicle_home project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'vehicle_home' 11 | 12 | SPIDER_MODULES = ['vehicle_home.spiders'] 13 | NEWSPIDER_MODULE = 'vehicle_home.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = 'vehicle_home (+http://www.yourdomain.com)' 18 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 100 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 0.01 30 | # The download delay setting will honor only one of: 31 | CONCURRENT_REQUESTS_PER_DOMAIN = 100 32 | CONCURRENT_REQUESTS_PER_IP = 100 33 | 34 | # Disable cookies (enabled by default) 35 | COOKIES_ENABLED = False 36 | 37 | LOG_FILE = 'log.txt' 38 | LOG_LEVEL = "INFO" 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'vehicle_home.middlewares.VehicleHomeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'vehicle_home.middlewares.VehicleHomeDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'vehicle_home.pipelines.VehicleHomePipeline': 300, 71 | } 72 | 73 | FEED_EXPORT_ENCODING = "gb18030" 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/test.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/__pycache__/vehicle_style.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShanYonggang/spider_list/0bfbef449459c864e06f5e3d7ea4d8a441529caa/vehicle_home/vehicle_home/spiders/__pycache__/vehicle_style.cpython-38.pyc -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import json 4 | 5 | 6 | data = requests.get("https://k.autohome.com.cn/detail/view_01enhzwr4z6csk0c9j70r00000.html#pvareaid=2112108").text 7 | html = etree.HTML(data).xpath("//div[@class='choose-con']//dl") 8 | seller_id = html[2].xpath(".//a/@data-val")[0] 9 | data_evalid = html[2].xpath(".//a/@data-evalid")[0] 10 | url_api = "https://k.autohome.com.cn/frontapi/GetDealerInfor?dearerandspecIdlist=" + seller_id + "," + data_evalid +"|" 11 | data = requests.get(url_api).text 12 | 13 | seller_name = json.loads(data)["result"]["List"][0]["CompanySimple"] 14 | print(seller_id, data_evalid) 15 | 16 | print(seller_name) 17 | -------------------------------------------------------------------------------- /vehicle_home/vehicle_home/spiders/vehicle_style.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from vehicle_home.items import VehicleHomeItem 3 | import re 4 | import json 5 | 6 | 7 | class VehicleStyleSpider(scrapy.Spider): 8 | name = 'vehicle_style' 9 | allowed_domains = ['k.autohome.com.cn'] 10 | start_urls = ['https://k.autohome.com.cn/suva01/'] 11 | 12 | def parse(self, response): 13 | # 获取车型级别链接 14 | vehicle_style_url = response.xpath("//div[@class='findcont-choose']/a/\ 15 | @href").getall() 16 | # 获取车型级别下属所有车型的链接 17 | vehicle_style_one_url = response.xpath("//ul[@class='list-cont']/li/div\ 18 | [@class='cont-pic']/a/@href").getall() 19 | # yield { 20 | # "vehicle_style_one_url": vehicle_style_one_url, 21 | # # "vehicle_style_url": vehicle_style_url 22 | # } 23 | if vehicle_style_url is not None: 24 | for url in vehicle_style_url: 25 | yield 
response.follow(url, self.parse) 26 | if vehicle_style_one_url is not None: 27 | for url in vehicle_style_one_url: 28 | yield response.follow(url, self.parse_vehicle_detail) 29 | 30 | def parse_vehicle_detail(self, response): 31 | item = VehicleHomeItem() 32 | # 每个用户评价的口碑详细信息 33 | kou_bei_detail_url = response.xpath("//div[@class='allcont border-b-solid']\ 34 | //a/@href").getall() 35 | # 获取品牌及其名称 36 | vehicle_brand = response.xpath("//div[@class='subnav']//div[@class='subnav-title-name']/a/text()").get() 37 | item["vehicle_brand"] = vehicle_brand 38 | # yield items 39 | # 下一页链接 40 | kou_bei_next_url = response.xpath("//div[@class='page']//a[@class=\ 41 | 'page-item-next']/@href").get() 42 | if kou_bei_detail_url is not None: 43 | for url in kou_bei_detail_url: 44 | # print(url) 45 | url = "https:" + url 46 | yield scrapy.Request(url=url, callback=self.parse_vehicle_detail_infos, meta={'item': item}) 47 | # yield response.follow(url, self.parse_vehicle_detail_infos) 48 | if kou_bei_next_url is not None: 49 | yield response.follow(kou_bei_next_url, self.parse_vehicle_detail) 50 | 51 | def parse_vehicle_detail_infos(self, response): 52 | item = response.meta['item'] 53 | # 获取用户昵称 54 | usercont = response.xpath("//div[@class='mouth']//dl[@class='user-cont']\ 55 | //div[@class='user-name']//a/text()").get() 56 | # 车型 57 | vehicle_style = response.xpath("//div[@class='choose-con']//dl//dd//a/\ 58 | text()").getall() 59 | # 购车情况 60 | choose_dl = response.xpath("//div[@class='choose-con']//dl") 61 | # info_length = len(choose_dl) 62 | # 购车地点 63 | vehicle_by_location = choose_dl[1].xpath("./dd//text()").get() 64 | # 购车所属品牌 65 | vehicle_seller = choose_dl[2].xpath("./dd/a/text()").get() 66 | # 购车时间\价格 67 | if vehicle_seller is not None: 68 | vehcle_seller_date = choose_dl[3].xpath("./dd/text()").get() 69 | vehicle_seller_money = choose_dl[4].xpath("./dd//text()").getall() 70 | else: 71 | vehcle_seller_date = choose_dl[2].xpath("./dd/text()").get() 72 | vehicle_seller_money = choose_dl[3].xpath("./dd//text()").getall() 73 | # 油耗或着电耗,目前行驶里程 74 | vehicle_status = choose_dl[-10].xpath("./dd/p//text()").getall() 75 | # 购车多个方面的评价 76 | vehicle_evaluate = response.xpath("//span[@class='testfont']/\ 77 | text()").getall() 78 | # 购车目的 79 | vehicle_bu_aim = choose_dl[-1].xpath(".//dd/p/text()").getall() 80 | # 发布口碑的时间 81 | evalute_date = response.xpath("//div[@class='mouth-item koubei-final']//\ 82 | div[@class='title-name name-width-01']/b/text()").get() 83 | # 口碑题目 84 | evalute_title = response.xpath("//div[@class='mouth-item koubei-final']//\ 85 | div[@class='kou-tit']/h3/text()").get() 86 | # 发布的内容 87 | evalute_infos = response.xpath("//div[@class='mouth-item koubei-final']//\ 88 | div[@class='text-con']//text()").getall() 89 | # 浏览量 90 | visit_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 91 | 'help']//span[@class='orange']/text()").get() 92 | # 口碑支持数 93 | helpful_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 94 | 'help']//label[@class='supportNumber']/text()").get() 95 | # 评论数 96 | comment_count = response.xpath("//div[@class='mouth-remak']//div[@class=\ 97 | 'help']/a//span/text()").get() 98 | # item["info_length"] = info_length 99 | item["usercont"] = usercont 100 | item["vehicle_style"] = vehicle_style 101 | item["vehicle_by_location"] = re.sub("[A-Za-z0-9\!\%\[\]\,\。\(\)\}\{\_\=\;&''+\<\>//$.::\"-#:\- \r\n]", "", "".join(vehicle_by_location)) 102 | item["vehcle_seller_date"] = vehcle_seller_date 103 | item["vehicle_seller_money"] = vehicle_seller_money 104 | 
item["vehicle_status"] = vehicle_status 105 | item["vehicle_evaluate"] = vehicle_evaluate 106 | item["vehicle_bu_aim"] = vehicle_bu_aim 107 | item["evalute_date"] = evalute_date 108 | item["evalute_title"] = evalute_title 109 | item["evalute_infos"] = re.sub("[A-Za-z0-9\!\%\[\]\,\。\(\)\}\{\_\=\;&''+\<\>//$.::\"-#:\- \r\n]", "", "".join(evalute_infos)) 110 | item["visit_count"] = visit_count 111 | item["helpful_count"] = helpful_count 112 | item["comment_count"] = comment_count 113 | # 获取经销商的信息(新增的代码) 114 | if vehicle_seller is not None: 115 | print(response.url) 116 | seller_id = choose_dl[2].xpath(".//a/@data-val").get() 117 | data_evalid = choose_dl[2].xpath(".//a/@data-evalid").get() 118 | seller_api_url = "https://k.autohome.com.cn/frontapi/GetDealerInfor?dearerandspecIdlist=" + seller_id + "," + data_evalid +"|" 119 | print(seller_id, data_evalid) 120 | print(seller_api_url) 121 | print("="*100) 122 | yield scrapy.Request(url=seller_api_url, callback=self.parse_vehicle_seller, meta={'item': item}) 123 | else: 124 | item["vehicle_seller"] = vehicle_seller 125 | yield item 126 | 127 | # 获取经销商信息 128 | def parse_vehicle_seller(self, response): 129 | item = response.meta['item'] 130 | seller_name = json.loads(response.text)["result"]["List"][0]["CompanySimple"] 131 | item["vehicle_seller"] = seller_name 132 | yield item 133 | -------------------------------------------------------------------------------- /weather_spider_analyze/weather_spider.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import time 6 | import datetime 7 | import pymysql 8 | 9 | url = 'http://tianqi.2345.com/wea_history/54511.htm' 10 | driver=webdriver.Chrome("E:/0_软件/chromedriver.exe") 11 | try: 12 | driver.get(url) 13 | time.sleep(2) 14 | print(driver.current_url) 15 | # 暂时获取60页(5年的信息) 16 | for i in range(60): 17 | weathter_lists = driver.find_elements_by_xpath('//table/tbody/tr') 18 | for weather in weathter_lists: 19 | infos = weather.find_elements_by_tag_name('td') 20 | weather_time = infos[0].text[:-3] # 时间 21 | # 将字符串时间转换成Datetime 22 | weather_time = datetime.datetime.strptime(weather_time,'%Y-%m-%d').date() 23 | high_tm = infos[1].text # 最高温 24 | low_tm = infos[2].text # 最低温 25 | weath = infos[3].text # 天气 26 | wind_style = infos[4].text # 风向、风力 27 | air = infos[5].text # 空气质量指数 28 | db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 29 | cursor = db.cursor() 30 | sql = 'INSERT INTO weather_beijing(weather_time,high_tem,low_tem,weather,wind_direction,air) VALUES(%s,%s,%s,%s,%s,%s)' 31 | try: 32 | cursor.execute(sql,(weather_time,high_tm,low_tm,weath,wind_style,air)) 33 | db.commit() 34 | print('数据保存成功!') 35 | except: 36 | print('数据保存失败!') 37 | print(weather_time,high_tm,low_tm,weath,wind_style,air) 38 | # 点击上一页 39 | pre_page_button = driver.find_element_by_xpath('//div[@id="prevNextBtn"]/a[@class="prev"]') 40 | pre_page_button.click() 41 | time.sleep(3) 42 | except: 43 | print('fail!') 44 | driver.close() 45 | -------------------------------------------------------------------------------- /zhihu/hot.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import time 4 | import multiprocessing 5 | import pymysql 6 | 7 | headers = { 8 | 
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36', 9 | 'cookie': '_zap=47e250c3-7a07-41d6-88a7-52b3cb282921; d_c0="ALCvK5ioexCPTmjktJFsrBEH1LQX-TTUjkM=|1575956493"; capsion_ticket="2|1:0|10:1576116250|14:capsion_ticket|44:ZTEwNjMxZmQ0OTA4NDU5MGI1MWNiODgxYjg4MTRmMWE=|ac6c9175199323ab564e53969f64440ccc244feacfdf40e5bce5a8084d82a806"; z_c0="2|1:0|10:1576116298|4:z_c0|92:Mi4xRi1ac0JnQUFBQUFBc0s4cm1LaDdFQ2NBQUFDRUFsVk5TaTBaWGdBX2puWUIxcmhYa3hoR1hsWkRwN2FKOENGNDN3|7f3436a96ccb3cdf3d549b20ed658d14ab8c9b1f75937684fd3ee524fb061e93"; q_c1=f254b825456c428fb665bc0ba903aca4|1576116327000|1576116327000; __utma=51854390.997463445.1576214466.1576214466.1576214466.1; __utmz=51854390.1576214466.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20171105=1^3=entry_date=20171105=1; _xsrf=f03ebad1-fe74-45e4-a8e1-33c0c0219444; tshl=; tst=h; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1576218531,1576221535,1576472582,1576482250; tgw_l7_route=64ba0a179156dda09fec37a3b2d556ed; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1576483496' 10 | } 11 | url = 'https://www.zhihu.com/hot' 12 | def get_question_num(url,headers): 13 | response = requests.get(url,headers=headers) 14 | text = response.text 15 | html = etree.HTML(text) 16 | reslut = html.xpath("//section[@class='HotItem']") 17 | # 获取问题的ID 18 | question_list = [] 19 | for question in reslut: 20 | number = question.xpath(".//div[@class='HotItem-index']//text()")[0].strip() 21 | title = question.xpath(".//h2[@class='HotItem-title']/text()")[0].strip() 22 | href = question.xpath(".//div[@class='HotItem-content']/a/@href")[0].strip() 23 | question_num = href.split('/')[-1] 24 | question_list.append([question_num,title]) 25 | # print(number,'\n',title,'\n',href) 26 | return question_list 27 | # 数据json请求(问题均通过ajax请求) 28 | # 分析链接格式,如下: 29 | # https://www.zhihu.com/api/v4/questions/359056618/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=5&platform=desktop&sort_by=default 30 | # 变化量如:question_id , offset=5,10,15...... 
31 | def data_json_request(question_id,question_title,headers): 32 | num = 0 33 | i = 1 34 | while True: 35 | json_url = 'https://www.zhihu.com/api/v4/questions/' + question_id + '/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={}&platform=desktop&sort_by=default'.format(num) 36 | data_json = requests.get(json_url,headers=headers) 37 | all_detail_data = data_json.json()['data'] 38 | length_detail_data = len(all_detail_data) 39 | for one_detail_data in all_detail_data: 40 | question_title = question_title 41 | answer_author = one_detail_data['author']['name'] 42 | author_introduce = one_detail_data['author']['headline'] 43 | author_followers = one_detail_data['author']['follower_count'] 44 | answer_vote_num = one_detail_data['voteup_count'] 45 | answer_comment_num = one_detail_data['comment_count'] 46 | updated_time = one_detail_data['updated_time'] 47 | content = one_detail_data['content'] 48 | # 保存数据至数据库 49 | db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 50 | cursor = db.cursor() 51 | sql = 'INSERT INTO zhihu_hot_question(question_title,author_name,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' 52 | try: 53 | if int(answer_vote_num) >= 90: 54 | cursor.execute(sql,(question_title,answer_author,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content)) 55 | db.commit() 56 | print('数据写入成功!!!') 57 | else: 58 | print('点赞数太少,不保存至数据库!!!') 59 | except: 60 | print('数据写入失败!') 61 | db.rollback() 62 | # print(question_title,'\n',answer_author,'\n',author_introduce,'\n',author_followers,'\n',answer_vote_num,'\n',answer_comment_num 63 | # ,'\n',updated_time,'\n',content) 64 | num = i*5 65 | i = i+1 66 | if length_detail_data == 0: 67 | print('answaer_stop!!!!!') 68 | break 69 | 70 | # def save_to_mysql(): 71 | # db = pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider_data') 72 | # cursor = db.cursor() 73 | # sql = 'INSERT INTO zhihu_hot_question(question_title,author_name,author_introduce,author_followers,answer_vote_num,answer_comment_num,updated_time,content) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' 74 | 75 | 76 | def main(): 77 | question_id = get_question_num(url,headers) 78 | print(question_id) 79 | print('当前环境CPU核数是:{}核'.format(multiprocessing.cpu_count())) 80 | p = multiprocessing.Pool(4) 81 | for q_id in question_id: 82 | p.apply_async(data_json_request,args=(q_id[0],q_id[1],headers)) 83 | p.close() 84 | p.join() 85 | 86 | if __name__ == "__main__": 87 | start = time.time() 88 | main() 89 | print('总耗时:%.5f秒'% float(time.time()-start)) --------------------------------------------------------------------------------
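hot.py, like the other MySQL-backed scripts in this repository, assumes its target table already exists; the DDL itself is not checked in. A table compatible with the INSERT statement above could be created along the following lines. Only the column names and the connection parameters (localhost, root, 123456, database spider_data) come from hot.py; the column types and lengths are assumptions.

import pymysql

# Assumed schema: column names mirror hot.py's INSERT; types/lengths are guesses
# that merely need to accommodate the scraped values (content can be long HTML).
DDL = """
CREATE TABLE IF NOT EXISTS zhihu_hot_question (
    id INT AUTO_INCREMENT PRIMARY KEY,
    question_title VARCHAR(255),
    author_name VARCHAR(128),
    author_introduce VARCHAR(255),
    author_followers INT,
    answer_vote_num INT,
    answer_comment_num INT,
    updated_time INT,
    content LONGTEXT
) DEFAULT CHARSET=utf8mb4
"""

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, db='spider_data')
with db.cursor() as cursor:
    cursor.execute(DDL)
db.commit()
db.close()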