├── .gitattributes ├── .idea ├── misc.xml ├── modules.xml ├── scrapy_autohome.iml └── workspace.xml ├── README.md ├── main.py ├── scrapy.cfg └── scrapy_autohome ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc ├── all_car_id.cpython-36.pyc ├── items.cpython-36.pyc └── settings.cpython-36.pyc ├── all_car_id.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── autohome_spider.cpython-36.pyc └── autohome_spider.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/scrapy_autohome.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 13 | 14 | 16 | 17 | 18 | 1495199607865 19 | 23 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [toc] 2 | # AutoHomeSpider_Scrapy 3 | 4 | ## 1需求分析 5 | 6 | 因项目需求,要爬取汽车之家的口碑数据进行下一步分析。 7 | 8 | 但是普通的爬虫软件(如八爪鱼、火车头、神箭手)无法爬取评论(该公司采取了反爬虫措施)。 9 | 10 | 
经分析,发现该公司的反爬虫措施主要是用前端js把显示的字体替换为一些标签,并且封住了鼠标右键,导致不好观察源代码。 11 | 12 | 本文以解决各个问题为顺序。 13 | 14 | ## 2 前端js反爬虫措施分析 15 | 16 | 17 | ## 3 爬虫框架scrapy 18 | 19 | ### 3.1 获取所有车型的id 20 | ### 3.2 本爬虫采用scrapy框架分析所需要的评论信息为 21 | 22 | 23 | ## 4 运行方式:下载后,用pycharm运行main.py文件即可 24 | 25 | ## 5 完整项目描述博客:http://blog.csdn.net/u012052268/article/details/72810037 26 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl autohome_spider -o cars.csv".split()) -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_autohome.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_autohome 12 | -------------------------------------------------------------------------------- /scrapy_autohome/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__init__.py -------------------------------------------------------------------------------- /scrapy_autohome/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_autohome/__pycache__/all_car_id.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/all_car_id.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_autohome/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_autohome/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_autohome/all_car_id.py: -------------------------------------------------------------------------------- 1 | # 这个类用于存储 爬取的汽车之家的所有汽车的id,然后传递给spider 2 | class All_Car_Id(): 3 | car_id_string = '''3170 4 | 692 5 | 18 6 | 2951 7 | 812 8 | 19 9 | 509 10 | 650 11 | 370 12 | 2730 13 | 471 14 | 538 15 | 2734 16 | 472 17 | 2736 18 | 740 19 | 2738 20 | 146 21 | 2739 22 | 2264 23 | 593 24 | 2841 25 | 412 26 | 148 27 | 2740 28 | 511 29 | 2415 30 | 3276 31 | 926 32 | 2732 33 | 3669 34 | 3350 35 | 2908 36 | 3287 37 | 3479 38 | 3894 39 | 4003 40 | 4288 41 | 3822 42 | 3210 43 | 2218 44 | 2832 45 | 2735 46 | 2737 47 | 2994 48 | 2731 49 | 2733 50 | 2760 51 | 2741 52 | 923 53 | 884 54 | 385 55 | 822 56 | 266 57 | 386 58 | 3891 59 | 2075 60 | 2275 61 | 3730 62 | 3678 63 | 3004 64 | 3742 65 | 2846 66 | 582 67 | 2745 68 | 2097 69 | 2148 70 | 2098 71 | 3825 72 | 715 73 | 1021 74 | 4196 75 | 2288 76 | 2715 77 | 3030 78 | 179 79 | 401 80 | 4212 
81 | 4034 82 | 4109 83 | 4106 84 | 588 85 | 197 86 | 3248 87 | 3862 88 | 2562 89 | 3823 90 | 2084 91 | 2564 92 | 2034 93 | 52 94 | 398 95 | 2966 96 | 56 97 | 450 98 | 365 99 | 59 100 | 4344 101 | 3339 102 | 3683 103 | 60 104 | 3688 105 | 469 106 | 3278 107 | 237 108 | 4411 109 | 3918 110 | 2065 111 | 3700 112 | 4213 113 | 3079 114 | 2950 115 | 3441 116 | 2005 117 | 3364 118 | 192 119 | 2762 120 | 2424 121 | 4195 122 | 4231 123 | 595 124 | 57 125 | 467 126 | 300 127 | 235 128 | 683 129 | 267 130 | 2842 131 | 2967 132 | 2717 133 | 2719 134 | 2197 135 | 3264 136 | 3704 137 | 2723 138 | 3901 139 | 3451 140 | 2718 141 | 4364 142 | 4074 143 | 2721 144 | 2720 145 | 3246 146 | 3985 147 | 2722 148 | 2833 149 | 914 150 | 3665 151 | 4300 152 | 3104 153 | 135 154 | 3859 155 | 4179 156 | 859 157 | 3582 158 | 314 159 | 4304 160 | 2565 161 | 81 162 | 3876 163 | 3085 164 | 2168 165 | 78 166 | 3460 167 | 4102 168 | 880 169 | 2786 170 | 694 171 | 449 172 | 723 173 | 897 174 | 4360 175 | 900 176 | 3275 177 | 3266 178 | 3478 179 | 3705 180 | 896 181 | 2948 182 | 559 183 | 3257 184 | 3605 185 | 2995 186 | 3268 187 | 918 188 | 810 189 | 733 190 | 2587 191 | 2700 192 | 3232 193 | 2551 194 | 233 195 | 4239 196 | 3751 197 | 982 198 | 164 199 | 834 200 | 2896 201 | 3554 202 | 166 203 | 875 204 | 525 205 | 344 206 | 592 207 | 2511 208 | 985 209 | 719 210 | 3708 211 | 2317 212 | 4000 213 | 3078 214 | 3707 215 | 4171 216 | 3941 217 | 66 218 | 4350 219 | 2561 220 | 4356 221 | 65 222 | 2388 223 | 373 224 | 317 225 | 2963 226 | 2968 227 | 202 228 | 2847 229 | 270 230 | 153 231 | 4345 232 | 271 233 | 3053 234 | 159 235 | 4348 236 | 587 237 | 3230 238 | 2387 239 | 161 240 | 3965 241 | 4067 242 | 675 243 | 3386 244 | 4136 245 | 3843 246 | 3146 247 | 2831 248 | 3464 249 | 2196 250 | 3189 251 | 2726 252 | 2727 253 | 2728 254 | 2729 255 | 3357 256 | 2725 257 | 579 258 | 4073 259 | 407 260 | 3283 261 | 2761 262 | 2806 263 | 3781 264 | 3780 265 | 4394 266 | 2088 267 | 3059 268 | 3430 269 | 831 270 | 798 
271 | 4393 272 | 3474 273 | 4279 274 | 417 275 | 940 276 | 4333 277 | 927 278 | 997 279 | 2085 280 | 540 281 | 2091 282 | 2944 283 | 489 284 | 4271 285 | 3068 286 | 877 287 | 3632 288 | 987 289 | 2299 290 | 3234 291 | 2619 292 | 4167 293 | 437 294 | 688 295 | 99 296 | 2472 297 | 2047 298 | 726 299 | 3381 300 | 3903 301 | 2544 302 | 3434 303 | 4331 304 | 2829 305 | 689 306 | 2191 307 | 2404 308 | 2801 309 | 2891 310 | 3614 311 | 724 312 | 4193 313 | 2315 314 | 861 315 | 2892 316 | 3974 317 | 184 318 | 640 319 | 277 320 | 185 321 | 186 322 | 155 323 | 4077 324 | 3774 325 | 3343 326 | 2236 327 | 4166 328 | 3677 329 | 3412 330 | 4392 331 | 703 332 | 4322 333 | 2838 334 | 172 335 | 4324 336 | 4175 337 | 162 338 | 2073 339 | 168 340 | 415 341 | 3917 342 | 3401 343 | 3695 344 | 632 345 | 466 346 | 2310 347 | 4069 348 | 3000 349 | 901 350 | 3014 351 | 2685 352 | 305 353 | 3739 354 | 4363 355 | 3668 356 | 306 357 | 3557 358 | 3794 359 | 2791 360 | 2787 361 | 3673 362 | 3839 363 | 3928 364 | 3795 365 | 3417 366 | 3427 367 | 3800 368 | 623 369 | 2852 370 | 2252 371 | 3284 372 | 3361 373 | 4276 374 | 3426 375 | 3874 376 | 3661 377 | 4137 378 | 3714 379 | 4046 380 | 3913 381 | 4047 382 | 4403 383 | 965 384 | 3035 385 | 2126 386 | 622 387 | 2960 388 | 852 389 | 4009 390 | 4154 391 | 2482 392 | 3231 393 | 3191 394 | 3916 395 | 4061 396 | 3428 397 | 2943 398 | 3712 399 | 2446 400 | 2447 401 | 2575 402 | 2461 403 | 2444 404 | 2809 405 | 2947 406 | 2448 407 | 2460 408 | 390 409 | 3889 410 | 3919 411 | 4264 412 | 3533 413 | 3884 414 | 4104 415 | 4015 416 | 3630 417 | 3537 418 | 4298 419 | 4164 420 | 4173 421 | 4269 422 | 3217 423 | 4380 424 | 2119 425 | 2788 426 | 2567 427 | 2429 428 | 4343 429 | 3422 430 | 2785 431 | 3924 432 | 2778 433 | 3204 434 | 3121 435 | 4242 436 | 3120 437 | 3811 438 | 2316 439 | 484 440 | 2046 441 | 2045 442 | 705 443 | 1008 444 | 590 445 | 520 446 | 2090 447 | 2462 448 | 3101 449 | 625 450 | 2121 451 | 2459 452 | 491 453 | 2122 454 | 2001 455 | 493 456 | 
624 457 | 492 458 | 2120 459 | 535 460 | 2304 461 | 6 462 | 552 463 | 2200 464 | 536 465 | 2653 466 | 4181 467 | 3311 468 | 4319 469 | 4182 470 | 4376 471 | 2954 472 | 3893 473 | 3783 474 | 2600 475 | 2605 476 | 3526 477 | 3514 478 | 3685 479 | 2566 480 | 3155 481 | 2505 482 | 2604 483 | 4372 484 | 4132 485 | 2923 486 | 3227 487 | 4011 488 | 3091 489 | 4012 490 | 4013 491 | 2606 492 | 3922 493 | 3980 494 | 4208 495 | 2478 496 | 3640 497 | 4142 498 | 76 499 | 3504 500 | 3505 501 | 3508 502 | 4008 503 | 4093 504 | 4450 505 | 145 506 | 2922 507 | 3103 508 | 614 509 | 3197 510 | 3457 511 | 528 512 | 4045 513 | 874 514 | 4274 515 | 4232 516 | 333 517 | 144 518 | 826 519 | 149 520 | 207 521 | 16 522 | 633 523 | 871 524 | 442 525 | 3964 526 | 4204 527 | 496 528 | 905 529 | 15 530 | 360 531 | 210 532 | 372 533 | 3999 534 | 557 535 | 82 536 | 86 537 | 631 538 | 3416 539 | 669 540 | 224 541 | 422 542 | 780 543 | 2435 544 | 782 545 | 925 546 | 680 547 | 906 548 | 4211 549 | 4268 550 | 368 551 | 3732 552 | 700 553 | 3801 554 | 2427 555 | 2917 556 | 4048 557 | 4412 558 | 3398 559 | 2993 560 | 4275 561 | 2584 562 | 4113 563 | 4186 564 | 3998 565 | 2279 566 | 3027 567 | 4358 568 | 4287 569 | 2185 570 | 512 571 | 430 572 | 3622 573 | 2380 574 | 3616 575 | 4330 576 | 539 577 | 3301 578 | 3970 579 | 3414 580 | 3128 581 | 3785 582 | 2990 583 | 3792 584 | 3789 585 | 2540 586 | 4423 587 | 554 588 | 2742 589 | 3119 590 | 2556 591 | 3493 592 | 3461 593 | 3463 594 | 3786 595 | 4097 596 | 3341 597 | 4383 598 | 1006 599 | 790 600 | 2839 601 | 4278 602 | 2803 603 | 560 604 | 2901 605 | 951 606 | 3279 607 | 2512 608 | 2510 609 | 561 610 | 562 611 | 3963 612 | 3309 613 | 2952 614 | 3324 615 | 898 616 | 2078 617 | 3594 618 | 2314 619 | 4347 620 | 3063 621 | 4041 622 | 606 623 | 2769 624 | 2776 625 | 4086 626 | 3634 627 | 3088 628 | 126 629 | 2477 630 | 2530 631 | 4023 632 | 3828 633 | 3925 634 | 3829 635 | 3086 636 | 3984 637 | 3637 638 | 4306 639 | 3036 640 | 3502 641 | 2865 642 | 4461 643 | 
4299 644 | 2500 645 | 2501 646 | 3697 647 | 2499 648 | 2490 649 | 2744 650 | 3698 651 | 3699 652 | 2452 653 | 2497 654 | 2491 655 | 2494 656 | 2495 657 | 574 658 | 602 659 | 2628 660 | 576 661 | 2089 662 | 2201 663 | 2226 664 | 738 665 | 2198 666 | 575 667 | 3049 668 | 4303 669 | 573 670 | 3050 671 | 2627 672 | 2626 673 | 2625 674 | 2703 675 | 3354 676 | 545 677 | 4259 678 | 3126 679 | 3462 680 | 110 681 | 771 682 | 2237 683 | 505 684 | 111 685 | 4260 686 | 526 687 | 371 688 | 375 689 | 882 690 | 770 691 | 46 692 | 2527 693 | 109 694 | 45 695 | 170 696 | 513 697 | 3851 698 | 964 699 | 2107 700 | 107 701 | 2574 702 | 2607 703 | 762 704 | 2244 705 | 748 706 | 3019 707 | 3948 708 | 938 709 | 671 710 | 711 711 | 3041 712 | 3118 713 | 917 714 | 893 715 | 2646 716 | 774 717 | 721 718 | 2055 719 | 763 720 | 3682 721 | 3239 722 | 963 723 | 945 724 | 2614 725 | 3611 726 | 206 727 | 3273 728 | 549 729 | 929 730 | 550 731 | 334 732 | 983 733 | 3837 734 | 3718 735 | 786 736 | 3322 737 | 3352 738 | 934 739 | 3931 740 | 3975 741 | 4281 742 | 2714 743 | 2411 744 | 2354 745 | 364 746 | 3347 747 | 3693 748 | 117 749 | 2871 750 | 2863 751 | 3615 752 | 659 753 | 577 754 | 3175 755 | 498 756 | 3518 757 | 3814 758 | 4192 759 | 2524 760 | 2523 761 | 912 762 | 378 763 | 704 764 | 2024 765 | 2302 766 | 102 767 | 2353 768 | 713 769 | 3373 770 | 665 771 | 3113 772 | 2023 773 | 744 774 | 759 775 | 986 776 | 2622 777 | 97 778 | 3183 779 | 684 780 | 972 781 | 2025 782 | 2281 783 | 2006 784 | 921 785 | 2041 786 | 3028 787 | 2677 788 | 2946 789 | 2406 790 | 2707 791 | 2997 792 | 2820 793 | 2389 794 | 2390 795 | 2884 796 | 4314 797 | 676 798 | 2682 799 | 4027 800 | 3016 801 | 3720 802 | 3026 803 | 889 804 | 459 805 | 2261 806 | 361 807 | 308 808 | 367 809 | 359 810 | 3633 811 | 2455 812 | 2475 813 | 2895 814 | 3754 815 | 2767 816 | 3267 817 | 465 818 | 89 819 | 90 820 | 91 821 | 601 822 | 2262 823 | 765 824 | 3709 825 | 543 826 | 3841 827 | 542 828 | 544 829 | 3624 830 | 391 831 | 3866 832 | 4055 
833 | 3992 834 | 3877 835 | 3346 836 | 3676 837 | 3821 838 | 3440 839 | 4035 840 | 2535 841 | 2542 842 | 3089 843 | 3735 844 | 2579 845 | 4165 846 | 661 847 | 3307 848 | 4227 849 | 4228 850 | 3991 851 | 3990 852 | 2577 853 | 4285 854 | 3995 855 | 4230 856 | 4255 857 | 3069 858 | 3524 859 | 4340 860 | 3349 861 | 3782 862 | 3691 863 | 2560 864 | 3574 865 | 4094 866 | 3952 867 | 3498 868 | 603 869 | 4291 870 | 4290 871 | 3951 872 | 4250 873 | 2141 874 | 3272 875 | 2488 876 | 2885 877 | 2599 878 | 2571 879 | 2568 880 | 3480 881 | 2774 882 | 3477 883 | 3134 884 | 2114 885 | 1015 886 | 2306 887 | 864 888 | 2974 889 | 3662 890 | 2580 891 | 3791 892 | 4382 893 | 947 894 | 980 895 | 2355 896 | 955 897 | 952 898 | 3345 899 | 4214 900 | 2093 901 | 2095 902 | 2094 903 | 3454 904 | 2615 905 | 4168 906 | 2027 907 | 2123 908 | 3481 909 | 3074 910 | 2124 911 | 3298 912 | 3808 913 | 3449 914 | 4100 915 | 3807 916 | 2326 917 | 395 918 | 470 919 | 2766 920 | 3075 921 | 4205 922 | 4339 923 | 824 924 | 823 925 | 3994 926 | 47 927 | 527 928 | 696 929 | 844 930 | 4156 931 | 2941 932 | 3006 933 | 4169 934 | 3214 935 | 855 936 | 2318 937 | 3160 938 | 2481 939 | 2771 940 | 3108 941 | 4122 942 | 4410 943 | 2789 944 | 4424 945 | 3812 946 | 556 947 | 428 948 | 2487 949 | 2485 950 | 2486 951 | 3038 952 | 392 953 | 67 954 | 68 955 | 481 956 | 2536 957 | 2754 958 | 2755 959 | 2973 960 | 2133 961 | 3803 962 | 2144 963 | 3896 964 | 477 965 | 3483 966 | 290 967 | 2108 968 | 2160 969 | 2517 970 | 2515 971 | 3455 972 | 3846 973 | 4436 974 | 2212 975 | 673 976 | 2211 977 | 4131 978 | 2572 979 | 3243 980 | 3636 981 | 3235 982 | 3149 983 | 2673 984 | 4065 985 | 4066 986 | 3607 987 | 3911 988 | 4130 989 | 4226 990 | 570 991 | 461 992 | 194 993 | 460 994 | 862 995 | 3148 996 | 2873 997 | 379 998 | 38 999 | 3844 1000 | 3125 1001 | 3915 1002 | 3122 1003 | 447 1004 | 3556 1005 | 4342 1006 | 4139 1007 | 474 1008 | 3589 1009 | 4293 1010 | 3788 1011 | 3465 1012 | 4133 1013 | 821 1014 | 2338 1015 | 2322 1016 | 
2192 1017 | 608 1018 | 133 1019 | 2840 1020 | 2111 1021 | 2166 1022 | 2156 1023 | 609 1024 | 132 1025 | 421 1026 | 409 1027 | 2051 1028 | 2155 1029 | 799 1030 | 841 1031 | 801 1032 | 2112 1033 | 2158 1034 | 989 1035 | 2964 1036 | 3467 1037 | 2157 1038 | 138 1039 | 611 1040 | 291 1041 | 23 1042 | 4072 1043 | 3845 1044 | 3872 1045 | 121 1046 | 3062 1047 | 521 1048 | 504 1049 | 777 1050 | 3321 1051 | 263 1052 | 2813 1053 | 503 1054 | 3048 1055 | 4234 1056 | 4088 1057 | 828 1058 | 2956 1059 | 3395 1060 | 616 1061 | 3363 1062 | 3545 1063 | 4087 1064 | 3080 1065 | 2752 1066 | 3351 1067 | 4090 1068 | 2543 1069 | 3490 1070 | 2541 1071 | 2569 1072 | 2581 1073 | 3546 1074 | 3407 1075 | 3408 1076 | 3961 1077 | 2130 1078 | 3084 1079 | 4089 1080 | 4085 1081 | 3450 1082 | 3061 1083 | 2763 1084 | 660 1085 | 617 1086 | 567 1087 | 572 1088 | 816 1089 | 3443 1090 | 3444 1091 | 3456 1092 | 4083 1093 | 3312 1094 | 178 1095 | 3209 1096 | 2903 1097 | 589 1098 | 4251 1099 | 2414 1100 | 2217 1101 | 2369 1102 | 456 1103 | 328 1104 | 258 1105 | 4209 1106 | 2514 1107 | 2860 1108 | 4294 1109 | 2537 1110 | 2545 1111 | 2810 1112 | 4237 1113 | 2402 1114 | 3763 1115 | 3955 1116 | 2601 1117 | 2711 1118 | 3136 1119 | 2837 1120 | 3721 1121 | 3722 1122 | 4312 1123 | 2325 1124 | 3898 1125 | 3515 1126 | 3853 1127 | 2659 1128 | 4076 1129 | 2660 1130 | 2665 1131 | 3981 1132 | 3466 1133 | 3639 1134 | 2986 1135 | 2985 1136 | 3549 1137 | 3681 1138 | 4388 1139 | 3320 1140 | 3628 1141 | 2573 1142 | 3885 1143 | 2976 1144 | 3207 1145 | 3802 1146 | 2949 1147 | 3989 1148 | 488 1149 | 970 1150 | 311 1151 | 462 1152 | 2629 1153 | 3359 1154 | 2087 1155 | 2401 1156 | 4185 1157 | 3647 1158 | 3190 1159 | 752 1160 | 49 1161 | 426 1162 | 3511 1163 | 3360 1164 | 4172 1165 | 566 1166 | 487 1167 | 227 1168 | 2207 1169 | 2224 1170 | 4002 1171 | 4284 1172 | 2231 1173 | 380 1174 | 3051 1175 | 4039 1176 | 3384 1177 | 2493 1178 | 3156 1179 | 2476 1180 | 3017 1181 | 2496 1182 | 3854 1183 | 2484 1184 | 517 1185 | 2489 1186 | 2492 
1187 | 911 1188 | 3586 1189 | 3581 1190 | 3983 1191 | 3971 1192 | 3580 1193 | 4095 1194 | 2068 1195 | 3377 1196 | 3741 1197 | 732 1198 | 2612 1199 | 2611 1200 | 2610 1201 | 2419 1202 | 2063 1203 | 201 1204 | 403 1205 | 261 1206 | 341 1207 | 3442 1208 | 112 1209 | 3934 1210 | 352 1211 | 3758 1212 | 3737 1213 | 3944 1214 | 4197 1215 | 2623 1216 | 3658 1217 | 351 1218 | 332 1219 | 3238 1220 | 3756 1221 | 3755 1222 | 697 1223 | 3871 1224 | 3521 1225 | 4316 1226 | 754 1227 | 802 1228 | 850 1229 | 69 1230 | 2410 1231 | 3565 1232 | 256 1233 | 3435 1234 | 77 1235 | 3177 1236 | 75 1237 | 3216 1238 | 3112 1239 | 2534 1240 | 2049 1241 | 432 1242 | 4435 1243 | 872 1244 | 362 1245 | 3528 1246 | 529 1247 | 3185 1248 | 3858 1249 | 94 1250 | 2242 1251 | 2176 1252 | 892 1253 | 508 1254 | 500 1255 | 3358 1256 | 674 1257 | 2708 1258 | 707 1259 | 3744 1260 | 3476 1261 | 3047 1262 | 3888 1263 | 3926 1264 | 3584 1265 | 4036 1266 | 3745 1267 | 3212 1268 | 2905 1269 | 793 1270 | 3731 1271 | 2991 1272 | 758 1273 | 95 1274 | 869 1275 | 794 1276 | 103 1277 | 4111 1278 | 3472 1279 | 961 1280 | 3150 1281 | 4126 1282 | 706 1283 | 815 1284 | 3124 1285 | 962 1286 | 569 1287 | 568 1288 | 2520 1289 | 2521 1290 | 3034 1291 | 4098 1292 | 3870 1293 | 908 1294 | 2184 1295 | 2765 1296 | 2268 1297 | 196 1298 | 686 1299 | 887 1300 | 3077 1301 | 2706 1302 | 784 1303 | 3612 1304 | 3368 1305 | 3717 1306 | 2029 1307 | 3211 1308 | 199 1309 | 2830 1310 | 4032 1311 | 4210 1312 | 3905 1313 | 188 1314 | 257 1315 | 265 1316 | 836 1317 | 3015 1318 | 3838 1319 | 3277 1320 | 2277 1321 | 2775 1322 | 3623 1323 | 4033 1324 | 3760 1325 | 3137 1326 | 3020 1327 | 354 1328 | 174 1329 | 727 1330 | 4329 1331 | 3326 1332 | 3220 1333 | 2134 1334 | 3987 1335 | 3759 1336 | 4194 1337 | 2503 1338 | 2800 1339 | 3631 1340 | 2502 1341 | 3228 1342 | 2957 1343 | 596 1344 | 2312 1345 | 3797 1346 | 597 1347 | 3082 1348 | 443 1349 | 3083 1350 | 272 1351 | 681 1352 | 891 1353 | 2248 1354 | 4302 1355 | 2883 1356 | 3413 1357 | 833 1358 | 4233 
1359 | 571 1360 | 501 1361 | 635 1362 | 468 1363 | 2782 1364 | 2802 1365 | 583 1366 | 691 1367 | 928 1368 | 2125 1369 | 4121 1370 | 2271 1371 | 3105 1372 | 4402 1373 | 4225 1374 | 4224 1375 | 4221 1376 | 3666 1377 | 2418 1378 | 2987 1379 | 433 1380 | 641 1381 | 363 1382 | 3154 1383 | 3968 1384 | 22 1385 | 655 1386 | 3066 1387 | 2118 1388 | 672 1389 | 1005 1390 | 3394 1391 | 946 1392 | 584 1393 | 3096 1394 | 2518 1395 | 728 1396 | 3538 1397 | 3887 1398 | 2284 1399 | 2391 1400 | 3942 1401 | 658 1402 | 578 1403 | 304 1404 | 295 1405 | 3060 1406 | 289 1407 | 2428 1408 | 551 1409 | 903 1410 | 3402 1411 | 322 1412 | 191 1413 | 555 1414 | 2147 1415 | 835 1416 | 3547 1417 | 4247 1418 | 3065 1419 | 532 1420 | 2758 1421 | 3815 1422 | 4395 1423 | 531 1424 | 533 1425 | 209 1426 | 749 1427 | 750 1428 | 930 1429 | 2241 1430 | 904 1431 | 2276 1432 | 2676 1433 | 3178 1434 | 3179 1435 | 3182 1436 | 3180 1437 | 4049 1438 | 3491 1439 | 2835 1440 | 3736 1441 | 2836 1442 | 2902 1443 | 3809 1444 | 4040 1445 | 3767 1446 | 3620 1447 | 3382 1448 | 3733 1449 | 4366 1450 | 2293 1451 | 389 1452 | 2988 1453 | 3939 1454 | 3293 1455 | 2295 1456 | 3040 1457 | 2296 1458 | 2609 1459 | 3242 1460 | 3432 1461 | 4323 1462 | 3746 1463 | 2377 1464 | 4096 1465 | 4381 1466 | 2642 1467 | 3328 1468 | 2748 1469 | 888 1470 | 524 1471 | 806 1472 | 2641 1473 | 755 1474 | 3109 1475 | 479 1476 | 464 1477 | 2563 1478 | 2859 1479 | 3679 1480 | 753 1481 | 785 1482 | 348 1483 | 653 1484 | 4295 1485 | 2645 1486 | 3912 1487 | 2021 1488 | 381 1489 | 3193 1490 | 2916 1491 | 2092 1492 | 182 1493 | 670 1494 | 2263 1495 | 366 1496 | 3092 1497 | 3102 1498 | 2989 1499 | 837 1500 | 2953 1501 | 3157 1502 | 3405 1503 | 2980 1504 | 4223 1505 | 2324 1506 | 3397 1507 | 4099 1508 | 3195 1509 | 3766 1510 | 3300 1511 | 2772 1512 | 4262 1513 | 4084 1514 | 3446 1515 | 2759 1516 | 87 1517 | 478 1518 | 612 1519 | 996 1520 | 518 1521 | 85 1522 | 2178 1523 | 84 1524 | 530 1525 | 434 1526 | 2180 1527 | 83 1528 | 2331 1529 | 854 1530 | 396 
1531 | 451 1532 | 3648 1533 | 4218 1534 | 2319 1535 | 2886 1536 | 3286 1537 | 2246 1538 | 3664 1539 | 3954 1540 | 2137 1541 | 4200 1542 | 813 1543 | 3385 1544 | 4387 1545 | 142 1546 | 454 1547 | 876 1548 | 413 1549 | 298 1550 | 565 1551 | 284 1552 | 1010 1553 | 2681 1554 | 3198 1555 | 281 1556 | 591 1557 | 453 1558 | 3448 1559 | 1016 1560 | 2032 1561 | 3018 1562 | 666 1563 | 2274 1564 | 757 1565 | 919 1566 | 899 1567 | 2866 1568 | 890 1569 | 2420 1570 | 3265 1571 | 452 1572 | 3342 1573 | 275 1574 | 502 1575 | 3475 1576 | 2955 1577 | 2341 1578 | 2867 1579 | 3226 1580 | 3618 1581 | 3904 1582 | 4115 1583 | 4315 1584 | 3114 1585 | 3787 1586 | 4401 1587 | 3820 1588 | 2086 1589 | 522 1590 | 425 1591 | 448 1592 | 64 1593 | 3817 1594 | 634 1595 | 3957 1596 | 564 1597 | 656 1598 | 2381 1599 | 4305 1600 | 355 1601 | 63 1602 | 475 1603 | 4307 1604 | 53 1605 | 2113 1606 | 2466 1607 | 2853 1608 | 264 1609 | 438 1610 | 702 1611 | 436 1612 | 2578 1613 | 2656 1614 | 764 1615 | 992 1616 | 3930 1617 | 2070 1618 | 3509 1619 | 2890 1620 | 2186 1621 | 894 1622 | 3943 1623 | 932 1624 | 4289 1625 | 751 1626 | 2054 1627 | 2893 1628 | 3641 1629 | 2701 1630 | 2194 1631 | 208 1632 | 775 1633 | 958 1634 | 2061 1635 | 3274 1636 | 838 1637 | 3241 1638 | 2266 1639 | 3914 1640 | 3255 1641 | 2981 1642 | 2479 1643 | 3534 1644 | 205 1645 | 376 1646 | 204 1647 | 316 1648 | 2779 1649 | 2062 1650 | 3857 1651 | 537 1652 | 3978 1653 | 4263 1654 | 4246 1655 | 2743 1656 | 3977 1657 | 4080 1658 | 4240 1659 | 2297 1660 | 4399 1661 | 3956 1662 | 482 1663 | 2970 1664 | 804 1665 | 853 1666 | 2109 1667 | 797 1668 | 791 1669 | 3860 1670 | 4037 1671 | 3684 1672 | 3986 1673 | 382 1674 | 519 1675 | 3290 1676 | 2962 1677 | 3171 1678 | 3013 1679 | 4217 1680 | 772 1681 | 356 1682 | 3269 1683 | 2463 1684 | 795 1685 | 2421 1686 | 3369 1687 | 2286 1688 | 3270 1689 | 2843 1690 | 357 1691 | 858 1692 | 4451 1693 | 4021 1694 | 4389 1695 | 4129 1696 | 3452 1697 | 25 1698 | 24 1699 | 873 1700 | 483 1701 | 3131 1702 | 458 1703 | 
668 1704 | 128 1705 | 2768 1706 | 4147 1707 | 3008 1708 | 377 1709 | 580 1710 | 3219 1711 | 2588 1712 | 3021 1713 | 652 1714 | 369 1715 | 3348 1716 | 3070 1717 | 3932 1718 | 1018 1719 | 4308 1720 | 486 1721 | 4203 1722 | 3253 1723 | 3252 1724 | 3022 1725 | 3525 1726 | 325 1727 | 651 1728 | 3064 1729 | 2608 1730 | 4241 1731 | 4120 1732 | 3192 1733 | 287 1734 | 2417 1735 | 285 1736 | 286 1737 | 2557 1738 | 3251 1739 | 3263 1740 | 283 1741 | 3250 1742 | 3023 1743 | 3261 1744 | 4257 1745 | 948 1746 | 2283 1747 | 414 1748 | 2751 1749 | 620 1750 | 1004 1751 | 2958 1752 | 2898 1753 | 3205 1754 | 2638 1755 | 3486 1756 | 3468 1757 | 141 1758 | 3406 1759 | 2214 1760 | 455 1761 | 3372 1762 | 3132 1763 | 485 1764 | 516 1765 | 3908 1766 | 4030 1767 | 3907 1768 | 139 1769 | 4201 1770 | 4357 1771 | 4336 1772 | 4151 1773 | 343 1774 | 211 1775 | 506 1776 | 13 1777 | 345 1778 | 3769 1779 | 2382 1780 | 3660 1781 | 2590 1782 | 2383 1783 | 2398 1784 | 599 1785 | 4078 1786 | 2357 1787 | 2664 1788 | 2356 1789 | 2805 1790 | 4058 1791 | 4006 1792 | 463 1793 | 693 1794 | 3158 1795 | 4335 1796 | 4206 1797 | 3411 1798 | 3420 1799 | 2678 1800 | 404 1801 | 2190 1802 | 3861 1803 | 4029 1804 | 177 1805 | 3396 1806 | 747 1807 | 175 1808 | 743 1809 | 2426 1810 | 585 1811 | 405 1812 | 3314 1813 | 3196 1814 | 494 1815 | 981 1816 | 406 1817 | 2139 1818 | 2451 1819 | 3657 1820 | 2456 1821 | 4270 1822 | 2506 1823 | 2855 1824 | 3946 1825 | 3570 1826 | 4408 1827 | 4252 1828 | 4253 1829 | 3306 1830 | 3882 1831 | 3969 1832 | 3653 1833 | 3576 1834 | 3562 1835 | 3935 1836 | 2834 1837 | 3571 1838 | 3564 1839 | 725 1840 | 959 1841 | 613 1842 | 909 1843 | 2538 1844 | 2539 1845 | 4265 1846 | 4373 1847 | 4427 1848 | 2858 1849 | 2670 1850 | 2672 1851 | 2115 1852 | 3415 1853 | 4107 1854 | 4222 1855 | 586 1856 | 2764 1857 | 3959 1858 | 51 1859 | 3073 1860 | 3672 1861 | 3292 1862 | 358 1863 | 1007 1864 | 2927 1865 | 431 1866 | 814 1867 | 690 1868 | 866 1869 | 2649 1870 | 50 1871 | 429 1872 | 3613 1873 | 2256 1874 | 
446 1875 | 3453 1876 | 255 1877 | 3164 1878 | 2117 1879 | 756 1880 | 995 1881 | 3366 1882 | 2899 1883 | 805 1884 | 2206 1885 | 936 1886 | 2174 1887 | 2698 1888 | 710 1889 | 712 1890 | 1017 1891 | 2272 1892 | 4176 1893 | 3982 1894 | 3310 1895 | 975 1896 | 971 1897 | 2984 1898 | 127 1899 | 3045 1900 | 3711 1901 | 131 1902 | 594 1903 | 510 1904 | 457 1905 | 252 1906 | 163 1907 | 3923 1908 | 657 1909 | 4105 1910 | 2313 1911 | 4031 1912 | 3335 1913 | 2583 1914 | 4235 1915 | 2348 1916 | 420 1917 | 439 1918 | 397 1919 | 678 1920 | 2362 1921 | 2405 1922 | 808 1923 | 3710 1924 | 2919 1925 | 2255 1926 | 2037 1927 | 1014 1928 | 682 1929 | 2747 1930 | 722 1931 | 3011 1932 | 3583 1933 | 2869 1934 | 546 1935 | 868 1936 | 2553 1937 | 766 1938 | 3213 1939 | 2854 1940 | 4384 1941 | 2639 1942 | 387 1943 | 2640 1944 | 2945 1945 | 639 1946 | 98 1947 | 3873 1948 | 792 1949 | 4043 1950 | 3429 1951 | 4370 1952 | 476 1953 | 293 1954 | 388 1955 | 230 1956 | 232 1957 | 2473 1958 | 473 1959 | 2208 1960 | 2053 1961 | 664 1962 | 4042 1963 | 329 1964 | 2844 1965 | 480 1966 | 2909 1967 | 212 1968 | 4189 1969 | 3646 1970 | 3206 1971 | 4301 1972 | 809 1973 | 3010 1974 | 2413 1975 | 2753 1976 | 3976 1977 | 440 1978 | 2474 1979 | 714 1980 | 718 1981 | 2684 1982 | 2422 1983 | 4028 1984 | 2280 1985 | 3740 1986 | 2105 1987 | 4202 1988 | 3553 1989 | 3591 1990 | 2992 1991 | 3097 1992 | 3535 1993 | 3652 1994 | 3043 1995 | 3099 1996 | 3100 1997 | 3201 1998 | 2270 1999 | 3176 2000 | 2749 2001 | 3617 2002 | 2679 2003 | 383 2004 | 581 2005 | 605 2006 | 122 2007 | 2400 2008 | 416 2009 | 3202 2010 | 2716 2011 | 878 2012 | 2781 2013 | 3979 2014 | 3081 2015 | 101 2016 | 4404 2017 | 4405 2018 | 106 2019 | 444 2020 | 879 2021 | 104 2022 | 2159 2023 | 3824 2024 | 2131 2025 | 2469 2026 | 3775 2027 | 3776 2028 | 3052 2029 | 2526 2030 | 2525 2031 | 2465 2032 | 3459 2033 | 913 2034 | 2464 2035 | 2870 2036 | 2603 2037 | 2888 2038 | 3906 2039 | 3765 2040 | 2889 2041 | 3003 2042 | 2887 2043 | 2378 2044 | 2445 2045 | 2440 
class ScrapyAutohomeItem(scrapy.Item):
    """Fields collected for one scraped Autohome owner-review record.

    Each attribute below declares one ``scrapy.Field``; the class contains
    no logic of its own.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Car ID
    CAR_ID = scrapy.Field()
    # Car name
    CAR_NAME = scrapy.Field()

    # User ID
    USER_ID = scrapy.Field()
    # User name
    USER_NAME = scrapy.Field()

    # Place of purchase
    PURCHASE_PLACE = scrapy.Field()
    # Time of purchase
    PURCHASE_TIME = scrapy.Field()
    # Purchase price of the bare car
    CAR_PRICE = scrapy.Field()
    # Purpose of the purchase
    PURCHASE_PURPOSE = scrapy.Field()

    # Rating - space
    SCORE_SPACE = scrapy.Field()
    # Rating - power
    SCORE_POWER = scrapy.Field()
    # Rating - handling
    SCORE_CONTROL = scrapy.Field()
    # Rating - fuel consumption
    SCORE_FUEL_CONSUMPTION = scrapy.Field()
    # Rating - comfort
    SCORE_COMFORT = scrapy.Field()
    # Rating - exterior
    SCORE_EXTERIOR = scrapy.Field()
    # Rating - interior
    SCORE_INTERIOR = scrapy.Field()
    # Rating - value for money
    SCORE_COST_EFFECTIVE = scrapy.Field()

    # URL of the review
    COMMENT_URL = scrapy.Field()
    # Text content of the review
    COMMENT_CONTENT = scrapy.Field()

    # How many people support this review
    COMMENT_SUPPORT_QUANTITY = scrapy.Field()
    # How many people have viewed this review
    COMMENT_SEEN_QUANTITY = scrapy.Field()
class ScrapyAutohomeSpiderMiddleware(object):
    """Pass-through spider middleware for the scrapy_autohome project.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.

    Every hook is an instance method: ``from_crawler`` returns an instance,
    so the hooks are invoked as bound methods and must accept ``self``.
    The original declarations omitted ``self``, which makes any bound call
    fail with ``TypeError`` (one positional argument too many).
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create the middleware. It relies on
        the module-level ``signals`` import (``from scrapy import signals``).

        Returns:
            The newly created middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider.

        Returns:
            Always ``None`` (the hook should return None or raise).
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the spider, after it has
        processed the response. Must return an iterable of Request, dict
        or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """React to an exception raised by a spider or by process_spider_input().

        Should return either None or an iterable of Response, dict or Item
        objects.

        Returns:
            Always ``None``; the exception is not handled here.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Called with the start requests of the spider; works similarly to
        process_spider_output(), except that it has no response associated.
        Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
class ScrapyAutohomePipeline:
    """Item pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item
# You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_autohome'

SPIDER_MODULES = ['scrapy_autohome.spiders']
NEWSPIDER_MODULE = 'scrapy_autohome.spiders'
# Do not honour robots.txt rules.
ROBOTSTXT_OBEY = False

# Write the crawl log to this file.
LOG_FILE = "scrapy_autohome_log.log"

# Encoding used for exported feed files.
# NOTE(review): text containing characters that GBK cannot represent would
# presumably fail to export -- confirm against the exporter's error handling.
FEED_EXPORT_ENCODING = 'GBK'

# Present a desktop Chrome user agent instead of the Scrapy default.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

# Disabled proxy middleware configuration.
# NOTE(review): the ``scrapy.contrib`` import path looks like a legacy
# location -- verify it against the installed Scrapy version before enabling.
# #DOWNLOADER_MIDDLEWARES
# DOWNLOADER_MIDDLEWARES = {
#    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware':301,
# }



# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_autohome (+http://www.yourdomain.com)'



# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_autohome.middlewares.ScrapyAutohomeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_autohome.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapy_autohome.pipelines.ScrapyAutohomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
def start_requests(self):
    """Build the initial requests: review-list pages 1..100 of every car model.

    The car ids come from ``All_Car_Id().car_id_list``. Requests are ordered
    by car id first and page number second, and use the default callback.

    Returns:
        list of ``scrapy.Request``, one per (car id, page) pair.
    """
    car_ids = All_Car_Id().car_id_list
    # Pages are numbered from 1; range(1, 101) covers pages 1 through 100.
    return [
        scrapy.Request(
            "http://k.autohome.com.cn/" + str(car_id)
            + "/index_" + str(page) + ".html#dataList"
        )
        for car_id in car_ids
        for page in range(1, 101)
    ]
CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0','') 71 | # 购车目的 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[7]/dd 72 | PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[7]/dd')[0] 73 | item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n','').replace(' ',';') 74 | #不正常的有6个,分为两种情况:缺经销商和缺油耗。 75 | elif dls.__len__() == 6: 76 | p = div.xpath('div/div[1]/div[2]/dl[5]/dt/p') 77 | # 如果有p标签 ,说明有油耗,没有经销商 78 | if p: 79 | # 购买时间 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[4]/dd 80 | item['PURCHASE_TIME'] = div.xpath('div/div[1]/div[2]/dl[3]/dd/text()')[0].extract().strip() 81 | # 裸车购买价 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[5]/dd 82 | CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[4]/dd')[0] 83 | item['CAR_PRICE'] = CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0', '') 84 | # 购车目的 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[7]/dd 85 | PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[6]/dd')[0] 86 | item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n','').replace(' ', ';') 87 | # 如果没有p说明 没有油耗,有经销商 88 | else: 89 | # 购买时间 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[4]/dd 90 | item['PURCHASE_TIME'] = div.xpath('div/div[1]/div[2]/dl[4]/dd/text()')[0].extract().strip() 91 | # 裸车购买价 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[5]/dd 92 | CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[5]/dd')[0] 93 | item['CAR_PRICE'] = CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0', '') 94 | # 购车目的 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/dl[7]/dd 95 | PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[6]/dd')[0] 96 | item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n','').replace(' ', ';') 97 | 98 | 99 | 100 | # 评分- 空间 //*[@id="maodian"]/div/div/div[2]/div[4]/ 
div/div[1]/div[2]/div[1]/dl/dd/span[2] 101 | item['SCORE_SPACE'] = div.xpath('div/div[1]/div[2]/div[1]/dl/dd/span[2]/text()')[0].extract() 102 | # 评分- 动力 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/div[2]/dl/dd/span[2] 103 | item['SCORE_POWER'] = div.xpath('div/div[1]/div[2]/div[2]/dl/dd/span[2]/text()')[0].extract() 104 | # 评分- 操控 105 | item['SCORE_CONTROL'] = div.xpath('div/div[1]/div[2]/div[3]/dl/dd/span[2]/text()')[0].extract() 106 | # 评分- 油耗 107 | item['SCORE_FUEL_CONSUMPTION'] = div.xpath('div/div[1]/div[2]/div[4]/dl/dd/span[2]/text()')[0].extract() 108 | # 评分- 舒适性 109 | item['SCORE_COMFORT'] = div.xpath('div/div[1]/div[2]/div[5]/dl/dd/span[2]/text()')[0].extract() 110 | # 评分- 外观 111 | item['SCORE_EXTERIOR'] = div.xpath('div/div[1]/div[2]/div[6]/dl/dd/span[2]/text()')[0].extract() 112 | # 评分- 内饰 113 | item['SCORE_INTERIOR'] = div.xpath('div/div[1]/div[2]/div[7]/dl/dd/span[2]/text()')[0].extract() 114 | # 评分- 性价比 //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[2]/div[8]/dl/dd/span[2] 115 | item['SCORE_COST_EFFECTIVE'] = div.xpath('div/div[1]/div[2]/div[8]/dl/dd/span[2]/text()')[0].extract() 116 | 117 | 118 | 119 | # 有多少人支持这条口碑 #//*[@id="maodian"]/div/div/div[2]/div[6]/ div/div[2]/div[1]/div[3]/div[2]/span[3]/label 120 | item['COMMENT_SUPPORT_QUANTITY'] = div.xpath('div/div[2]/div[1]/div[3]/div[2]/span[3]/label/text()')[0].extract() 121 | # 有多少人看过这条口碑 #//*[@id="maodian"]/div/div/div[2]/div[6]/ div/div[2]/div[1]/div[3]/div[2]/span[4]/a 122 | item['COMMENT_SEEN_QUANTITY'] = div.xpath('div/div[2]/div[1]/div[3]/div[2]/span[4]/a/text()')[0].extract() 123 | 124 | 125 | # 评论的url //*[@id="maodian"]/div/div/div[2]/div[4]/ div/div[1]/div[1]/div/div[2]/div[2] 126 | url_id_pre = div.xpath('div/div[1]/div[1]/div/div[2]/div[2]/@id')[0].extract()# 结果为 DivRelatedTopics_1565672 127 | # 截取id 128 | url_id = re.findall('\d{1,20}', url_id_pre)[0] 129 | # 存入评论url 130 | item['COMMENT_URL'] ="http://k.autohome.com.cn/FrontAPI/GetFeelingByEvalId?evalId=" + 
def get_char(js):
    """Recover the replacement-character table from the obfuscated script.

    The script hides a percent-encoded character string, plus ``;``-separated
    groups of ``,``-separated indexes into it, behind layers of trivial
    helper functions. No JavaScript is executed: the helpers are folded away
    with regular expressions, the remaining ``var`` assignments are
    substituted textually, and each index group is turned into one word.

    The function is defined without ``self`` and is invoked through the class
    (``AutohomeSpider.get_char(...)``), so it behaves like a static helper.

    Args:
        js: source text of the obfuscating script.

    Returns:
        list of str, one word per index group, in the order the groups appear
        in the script. The caller indexes this list with the number captured
        from each placeholder.

    Raises:
        ValueError: if the script contains no percent-encoded string or no
            index list, or if an index token is not a number. The caller
            catches ``Exception``, so this replaces the former opaque
            ``AttributeError`` without changing its handling.
    """
    import logging  # local import: the module's import block is left untouched

    log = logging.getLogger(__name__)
    # Maps a textual fragment ("name()" call or variable name) to the text it
    # stands for. Insertion order matters: fragments are substituted in it.
    constants = {}

    def absorb(source, pattern, resolve):
        """Remove each helper matching *pattern* and record its constant."""
        remaining = source
        # Matches are taken from the untouched *source* snapshot.
        for match in re.finditer(pattern, source, re.X):
            remaining = remaining.replace(match.group(0), "")
            name, value = resolve(*match.groups())
            constants["%s()" % name.strip()] = value
        return remaining

    def inline(source, pattern):
        """Replace each self-invoking wrapper matching *pattern* by its literal."""
        remaining = source
        for match in re.finditer(pattern, source, re.X):
            remaining = remaining.replace(match.group(0), match.group(1))
        return remaining

    # function zX_() { function _z() { return '09'; };
    #   if (_z() == '09,') { return 'zX_'; } else { return _z(); } }
    js = absorb(
        js,
        r"""
        function\s+(\w+)\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+['"]([^'"]+)['"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*['"]([^'"]+)['"]\)\s*\{\s*
                return\s*['"]([^'"]+)['"];\s*
            \}\s*else\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*
        \}
        """,
        lambda name, inner, probe, literal: (
            name, literal if inner == probe else inner),
    )

    # function wu_() { function _w() { return 'wu_'; };
    #   if (_w() == 'wu__') { return _w(); } else { return '5%'; } }
    js = absorb(
        js,
        r"""
        function\s+(\w+)\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+['"]([^'"]+)['"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*['"]([^'"]+)['"]\)\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*else\s*\{\s*
                return\s*['"]([^'"]+)['"];\s*
            \}\s*
        \}
        """,
        lambda name, inner, probe, literal: (
            name, inner if inner == probe else literal),
    )

    # var ZA_ = function(ZA__) { 'return ZA_'; return ZA__; };
    # An identity function: drop its definition and unwrap every call to it.
    identity_var = r"""
        var\s+([^=]+)=\s*function\(\w+\)\{\s*
            ['"]return\s*\w+\s*['"];\s*
            return\s+\w+;\s*
        \};
        """
    # finditer scans the string bound to ``js`` at this point; rebinding
    # ``js`` inside the loop does not disturb the scan.
    for match in re.finditer(identity_var, js, re.X):
        js = js.replace(match.group(0), "")
        name = match.group(1).strip()
        if name:
            # The name is escaped so it is matched literally.
            js = re.sub(r"%s\(([^\)]+)\)" % re.escape(name), r"\1", js)

    # var Qh_ = function() { 'return Qh_'; return ';'; };
    js = absorb(
        js,
        r"""
        var\s+([^=]+)=\s*function\(\)\{\s*
            ['"]return\s*\w+\s*['"];\s*
            return\s+['"]([^'"]+)['"];\s*
        \};
        """,
        lambda name, literal: (name, literal),
    )

    # function ZP_() { 'return ZP_'; return 'E'; }
    js = absorb(
        js,
        r"""
        function\s*(\w+)\(\)\s*\{\s*
            ['"]return\s*[^'"]+['"];\s*
            return\s*['"]([^'"]+)['"];\s*
        \}\s*
        """,
        lambda name, literal: (name, literal),
    )

    # function do_() { return ''; }      (no decoy statement, may be empty)
    js = absorb(
        js,
        r"""
        function\s*(\w+)\(\)\s*\{\s*
            return\s*['"]([^'"]*)['"];\s*
        \}\s*
        """,
        lambda name, literal: (name, literal),
    )

    # (function() { 'return sZ_'; return '1' })()      ->  '1'
    js = inline(
        js,
        r"""
        \(function\(\)\s*\{\s*
            ['"]return[^'"]+['"];\s*
            return\s*(['"][^'"]*['"]);?
        \}\)\(\)
        """,
    )

    # (function(iU__) { 'return iU_'; return iU__; })('9F')      ->  '9F'
    js = inline(
        js,
        r"""
        \(function\(\w+\)\s*\{\s*
            ['"]return[^'"]+['"];\s*
            return\s*\w+;
        \}\)\((['"][^'"]*['"])\)
        """,
    )

    # Plain variable assignments. A value that still contains a call could
    # not be folded, so it is neutralised to ";".
    for var_name, var_value in re.findall(r"var\s+(\w+)=(.*?);\s", js):
        var_value = var_value.strip("'\"").strip()
        constants[var_name] = ";" if "(" in var_value else var_value

    # Purely textual substitution of every known fragment, then removal of
    # whitespace, '+' and single quotes so concatenated pieces join up.
    for fragment, replacement in constants.items():
        js = js.replace(fragment, replacement)
    js = re.sub(r"[\s+']", "", js)

    encoded = re.search(r"(%\w\w(?:%\w\w)+)", js)
    if encoded is None:
        raise ValueError("no percent-encoded character string found in script")
    characters = urllib.parse.unquote(encoded.group(1))

    indexes = re.search(r"([\d,]+(;[\d,]+)+)", js[encoded.end():])
    if indexes is None:
        raise ValueError("no index list found after the character string")

    index_groups = [
        [int(token) for token in group.split(",")]
        for group in indexes.group(1).split(";")
    ]

    # The largest index is taken to address the last character, so both
    # 0-based and 1-based index lists decode to the same words.
    max_index = max(index for group in index_groups for index in group)
    shift = len(characters) - 1 - max_index
    log.debug("max index %d, %d characters, shift %d",
              max_index, len(characters), shift)

    words = []
    for group in index_groups:
        positions = [index + shift for index in group]
        # A negative position would silently wrap around to the end of the
        # string; skip it instead of emitting an unrelated character.
        out_of_range = [position for position in positions if position < 0]
        if out_of_range:
            log.warning("skipping out-of-range character positions %s",
                        out_of_range)
        words.append(
            "".join(characters[position]
                    for position in positions if position >= 0)
        )
    return words