├── .gitattributes
├── .idea
├── misc.xml
├── modules.xml
├── scrapy_autohome.iml
└── workspace.xml
├── README.md
├── main.py
├── scrapy.cfg
└── scrapy_autohome
├── __init__.py
├── __pycache__
├── __init__.cpython-36.pyc
├── all_car_id.cpython-36.pyc
├── items.cpython-36.pyc
└── settings.cpython-36.pyc
├── all_car_id.py
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
├── __pycache__
├── __init__.cpython-36.pyc
└── autohome_spider.cpython-36.pyc
└── autohome_spider.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/scrapy_autohome.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | 1495199607865
19 |
20 |
21 | 1495199607865
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [toc]
2 | # AutoHomeSpider_Scrapy
3 |
4 | ## 1 需求分析
5 |
6 | 因项目需求,要爬取汽车之家的口碑数据进行下一步分析。
7 |
8 | 但是普通的爬虫软件(如八爪鱼、火车头、神箭手)无法爬取评论(该公司采取了反爬虫措施)。
9 |
10 | 经分析,发现该公司的反爬虫措施主要是用前端js把显示的文字替换为一些标签,并且禁用鼠标右键,导致不便查看源代码。
11 |
12 | 本文以解决各个问题为顺序。
13 |
14 | ## 2 前端js反爬虫措施分析
15 |
16 |
17 | ## 3 爬虫框架scrapy
18 |
19 | ### 3.1 获取所有车型的id
20 | ### 3.2 本爬虫采用scrapy框架分析所需要的评论信息为
21 |
22 |
23 | ## 4 运行方式:下载后,用pycharm运行main.py文件即可
24 |
25 | ## 5 完整项目描述博客:http://blog.csdn.net/u012052268/article/details/72810037
26 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
# Run the "autohome_spider" crawl through Scrapy's command-line entry point,
# passing "-o cars.csv" exactly as the original command string did.
cmdline.execute(["scrapy", "crawl", "autohome_spider", "-o", "cars.csv"])
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapy_autohome.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapy_autohome
12 |
--------------------------------------------------------------------------------
/scrapy_autohome/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__init__.py
--------------------------------------------------------------------------------
/scrapy_autohome/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/__pycache__/all_car_id.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/all_car_id.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/all_car_id.py:
--------------------------------------------------------------------------------
1 | # This class stores the IDs of all cars crawled from Autohome (汽车之家); the IDs are then passed to the spider.
2 | class All_Car_Id():
3 | car_id_string = '''3170
4 | 692
5 | 18
6 | 2951
7 | 812
8 | 19
9 | 509
10 | 650
11 | 370
12 | 2730
13 | 471
14 | 538
15 | 2734
16 | 472
17 | 2736
18 | 740
19 | 2738
20 | 146
21 | 2739
22 | 2264
23 | 593
24 | 2841
25 | 412
26 | 148
27 | 2740
28 | 511
29 | 2415
30 | 3276
31 | 926
32 | 2732
33 | 3669
34 | 3350
35 | 2908
36 | 3287
37 | 3479
38 | 3894
39 | 4003
40 | 4288
41 | 3822
42 | 3210
43 | 2218
44 | 2832
45 | 2735
46 | 2737
47 | 2994
48 | 2731
49 | 2733
50 | 2760
51 | 2741
52 | 923
53 | 884
54 | 385
55 | 822
56 | 266
57 | 386
58 | 3891
59 | 2075
60 | 2275
61 | 3730
62 | 3678
63 | 3004
64 | 3742
65 | 2846
66 | 582
67 | 2745
68 | 2097
69 | 2148
70 | 2098
71 | 3825
72 | 715
73 | 1021
74 | 4196
75 | 2288
76 | 2715
77 | 3030
78 | 179
79 | 401
80 | 4212
81 | 4034
82 | 4109
83 | 4106
84 | 588
85 | 197
86 | 3248
87 | 3862
88 | 2562
89 | 3823
90 | 2084
91 | 2564
92 | 2034
93 | 52
94 | 398
95 | 2966
96 | 56
97 | 450
98 | 365
99 | 59
100 | 4344
101 | 3339
102 | 3683
103 | 60
104 | 3688
105 | 469
106 | 3278
107 | 237
108 | 4411
109 | 3918
110 | 2065
111 | 3700
112 | 4213
113 | 3079
114 | 2950
115 | 3441
116 | 2005
117 | 3364
118 | 192
119 | 2762
120 | 2424
121 | 4195
122 | 4231
123 | 595
124 | 57
125 | 467
126 | 300
127 | 235
128 | 683
129 | 267
130 | 2842
131 | 2967
132 | 2717
133 | 2719
134 | 2197
135 | 3264
136 | 3704
137 | 2723
138 | 3901
139 | 3451
140 | 2718
141 | 4364
142 | 4074
143 | 2721
144 | 2720
145 | 3246
146 | 3985
147 | 2722
148 | 2833
149 | 914
150 | 3665
151 | 4300
152 | 3104
153 | 135
154 | 3859
155 | 4179
156 | 859
157 | 3582
158 | 314
159 | 4304
160 | 2565
161 | 81
162 | 3876
163 | 3085
164 | 2168
165 | 78
166 | 3460
167 | 4102
168 | 880
169 | 2786
170 | 694
171 | 449
172 | 723
173 | 897
174 | 4360
175 | 900
176 | 3275
177 | 3266
178 | 3478
179 | 3705
180 | 896
181 | 2948
182 | 559
183 | 3257
184 | 3605
185 | 2995
186 | 3268
187 | 918
188 | 810
189 | 733
190 | 2587
191 | 2700
192 | 3232
193 | 2551
194 | 233
195 | 4239
196 | 3751
197 | 982
198 | 164
199 | 834
200 | 2896
201 | 3554
202 | 166
203 | 875
204 | 525
205 | 344
206 | 592
207 | 2511
208 | 985
209 | 719
210 | 3708
211 | 2317
212 | 4000
213 | 3078
214 | 3707
215 | 4171
216 | 3941
217 | 66
218 | 4350
219 | 2561
220 | 4356
221 | 65
222 | 2388
223 | 373
224 | 317
225 | 2963
226 | 2968
227 | 202
228 | 2847
229 | 270
230 | 153
231 | 4345
232 | 271
233 | 3053
234 | 159
235 | 4348
236 | 587
237 | 3230
238 | 2387
239 | 161
240 | 3965
241 | 4067
242 | 675
243 | 3386
244 | 4136
245 | 3843
246 | 3146
247 | 2831
248 | 3464
249 | 2196
250 | 3189
251 | 2726
252 | 2727
253 | 2728
254 | 2729
255 | 3357
256 | 2725
257 | 579
258 | 4073
259 | 407
260 | 3283
261 | 2761
262 | 2806
263 | 3781
264 | 3780
265 | 4394
266 | 2088
267 | 3059
268 | 3430
269 | 831
270 | 798
271 | 4393
272 | 3474
273 | 4279
274 | 417
275 | 940
276 | 4333
277 | 927
278 | 997
279 | 2085
280 | 540
281 | 2091
282 | 2944
283 | 489
284 | 4271
285 | 3068
286 | 877
287 | 3632
288 | 987
289 | 2299
290 | 3234
291 | 2619
292 | 4167
293 | 437
294 | 688
295 | 99
296 | 2472
297 | 2047
298 | 726
299 | 3381
300 | 3903
301 | 2544
302 | 3434
303 | 4331
304 | 2829
305 | 689
306 | 2191
307 | 2404
308 | 2801
309 | 2891
310 | 3614
311 | 724
312 | 4193
313 | 2315
314 | 861
315 | 2892
316 | 3974
317 | 184
318 | 640
319 | 277
320 | 185
321 | 186
322 | 155
323 | 4077
324 | 3774
325 | 3343
326 | 2236
327 | 4166
328 | 3677
329 | 3412
330 | 4392
331 | 703
332 | 4322
333 | 2838
334 | 172
335 | 4324
336 | 4175
337 | 162
338 | 2073
339 | 168
340 | 415
341 | 3917
342 | 3401
343 | 3695
344 | 632
345 | 466
346 | 2310
347 | 4069
348 | 3000
349 | 901
350 | 3014
351 | 2685
352 | 305
353 | 3739
354 | 4363
355 | 3668
356 | 306
357 | 3557
358 | 3794
359 | 2791
360 | 2787
361 | 3673
362 | 3839
363 | 3928
364 | 3795
365 | 3417
366 | 3427
367 | 3800
368 | 623
369 | 2852
370 | 2252
371 | 3284
372 | 3361
373 | 4276
374 | 3426
375 | 3874
376 | 3661
377 | 4137
378 | 3714
379 | 4046
380 | 3913
381 | 4047
382 | 4403
383 | 965
384 | 3035
385 | 2126
386 | 622
387 | 2960
388 | 852
389 | 4009
390 | 4154
391 | 2482
392 | 3231
393 | 3191
394 | 3916
395 | 4061
396 | 3428
397 | 2943
398 | 3712
399 | 2446
400 | 2447
401 | 2575
402 | 2461
403 | 2444
404 | 2809
405 | 2947
406 | 2448
407 | 2460
408 | 390
409 | 3889
410 | 3919
411 | 4264
412 | 3533
413 | 3884
414 | 4104
415 | 4015
416 | 3630
417 | 3537
418 | 4298
419 | 4164
420 | 4173
421 | 4269
422 | 3217
423 | 4380
424 | 2119
425 | 2788
426 | 2567
427 | 2429
428 | 4343
429 | 3422
430 | 2785
431 | 3924
432 | 2778
433 | 3204
434 | 3121
435 | 4242
436 | 3120
437 | 3811
438 | 2316
439 | 484
440 | 2046
441 | 2045
442 | 705
443 | 1008
444 | 590
445 | 520
446 | 2090
447 | 2462
448 | 3101
449 | 625
450 | 2121
451 | 2459
452 | 491
453 | 2122
454 | 2001
455 | 493
456 | 624
457 | 492
458 | 2120
459 | 535
460 | 2304
461 | 6
462 | 552
463 | 2200
464 | 536
465 | 2653
466 | 4181
467 | 3311
468 | 4319
469 | 4182
470 | 4376
471 | 2954
472 | 3893
473 | 3783
474 | 2600
475 | 2605
476 | 3526
477 | 3514
478 | 3685
479 | 2566
480 | 3155
481 | 2505
482 | 2604
483 | 4372
484 | 4132
485 | 2923
486 | 3227
487 | 4011
488 | 3091
489 | 4012
490 | 4013
491 | 2606
492 | 3922
493 | 3980
494 | 4208
495 | 2478
496 | 3640
497 | 4142
498 | 76
499 | 3504
500 | 3505
501 | 3508
502 | 4008
503 | 4093
504 | 4450
505 | 145
506 | 2922
507 | 3103
508 | 614
509 | 3197
510 | 3457
511 | 528
512 | 4045
513 | 874
514 | 4274
515 | 4232
516 | 333
517 | 144
518 | 826
519 | 149
520 | 207
521 | 16
522 | 633
523 | 871
524 | 442
525 | 3964
526 | 4204
527 | 496
528 | 905
529 | 15
530 | 360
531 | 210
532 | 372
533 | 3999
534 | 557
535 | 82
536 | 86
537 | 631
538 | 3416
539 | 669
540 | 224
541 | 422
542 | 780
543 | 2435
544 | 782
545 | 925
546 | 680
547 | 906
548 | 4211
549 | 4268
550 | 368
551 | 3732
552 | 700
553 | 3801
554 | 2427
555 | 2917
556 | 4048
557 | 4412
558 | 3398
559 | 2993
560 | 4275
561 | 2584
562 | 4113
563 | 4186
564 | 3998
565 | 2279
566 | 3027
567 | 4358
568 | 4287
569 | 2185
570 | 512
571 | 430
572 | 3622
573 | 2380
574 | 3616
575 | 4330
576 | 539
577 | 3301
578 | 3970
579 | 3414
580 | 3128
581 | 3785
582 | 2990
583 | 3792
584 | 3789
585 | 2540
586 | 4423
587 | 554
588 | 2742
589 | 3119
590 | 2556
591 | 3493
592 | 3461
593 | 3463
594 | 3786
595 | 4097
596 | 3341
597 | 4383
598 | 1006
599 | 790
600 | 2839
601 | 4278
602 | 2803
603 | 560
604 | 2901
605 | 951
606 | 3279
607 | 2512
608 | 2510
609 | 561
610 | 562
611 | 3963
612 | 3309
613 | 2952
614 | 3324
615 | 898
616 | 2078
617 | 3594
618 | 2314
619 | 4347
620 | 3063
621 | 4041
622 | 606
623 | 2769
624 | 2776
625 | 4086
626 | 3634
627 | 3088
628 | 126
629 | 2477
630 | 2530
631 | 4023
632 | 3828
633 | 3925
634 | 3829
635 | 3086
636 | 3984
637 | 3637
638 | 4306
639 | 3036
640 | 3502
641 | 2865
642 | 4461
643 | 4299
644 | 2500
645 | 2501
646 | 3697
647 | 2499
648 | 2490
649 | 2744
650 | 3698
651 | 3699
652 | 2452
653 | 2497
654 | 2491
655 | 2494
656 | 2495
657 | 574
658 | 602
659 | 2628
660 | 576
661 | 2089
662 | 2201
663 | 2226
664 | 738
665 | 2198
666 | 575
667 | 3049
668 | 4303
669 | 573
670 | 3050
671 | 2627
672 | 2626
673 | 2625
674 | 2703
675 | 3354
676 | 545
677 | 4259
678 | 3126
679 | 3462
680 | 110
681 | 771
682 | 2237
683 | 505
684 | 111
685 | 4260
686 | 526
687 | 371
688 | 375
689 | 882
690 | 770
691 | 46
692 | 2527
693 | 109
694 | 45
695 | 170
696 | 513
697 | 3851
698 | 964
699 | 2107
700 | 107
701 | 2574
702 | 2607
703 | 762
704 | 2244
705 | 748
706 | 3019
707 | 3948
708 | 938
709 | 671
710 | 711
711 | 3041
712 | 3118
713 | 917
714 | 893
715 | 2646
716 | 774
717 | 721
718 | 2055
719 | 763
720 | 3682
721 | 3239
722 | 963
723 | 945
724 | 2614
725 | 3611
726 | 206
727 | 3273
728 | 549
729 | 929
730 | 550
731 | 334
732 | 983
733 | 3837
734 | 3718
735 | 786
736 | 3322
737 | 3352
738 | 934
739 | 3931
740 | 3975
741 | 4281
742 | 2714
743 | 2411
744 | 2354
745 | 364
746 | 3347
747 | 3693
748 | 117
749 | 2871
750 | 2863
751 | 3615
752 | 659
753 | 577
754 | 3175
755 | 498
756 | 3518
757 | 3814
758 | 4192
759 | 2524
760 | 2523
761 | 912
762 | 378
763 | 704
764 | 2024
765 | 2302
766 | 102
767 | 2353
768 | 713
769 | 3373
770 | 665
771 | 3113
772 | 2023
773 | 744
774 | 759
775 | 986
776 | 2622
777 | 97
778 | 3183
779 | 684
780 | 972
781 | 2025
782 | 2281
783 | 2006
784 | 921
785 | 2041
786 | 3028
787 | 2677
788 | 2946
789 | 2406
790 | 2707
791 | 2997
792 | 2820
793 | 2389
794 | 2390
795 | 2884
796 | 4314
797 | 676
798 | 2682
799 | 4027
800 | 3016
801 | 3720
802 | 3026
803 | 889
804 | 459
805 | 2261
806 | 361
807 | 308
808 | 367
809 | 359
810 | 3633
811 | 2455
812 | 2475
813 | 2895
814 | 3754
815 | 2767
816 | 3267
817 | 465
818 | 89
819 | 90
820 | 91
821 | 601
822 | 2262
823 | 765
824 | 3709
825 | 543
826 | 3841
827 | 542
828 | 544
829 | 3624
830 | 391
831 | 3866
832 | 4055
833 | 3992
834 | 3877
835 | 3346
836 | 3676
837 | 3821
838 | 3440
839 | 4035
840 | 2535
841 | 2542
842 | 3089
843 | 3735
844 | 2579
845 | 4165
846 | 661
847 | 3307
848 | 4227
849 | 4228
850 | 3991
851 | 3990
852 | 2577
853 | 4285
854 | 3995
855 | 4230
856 | 4255
857 | 3069
858 | 3524
859 | 4340
860 | 3349
861 | 3782
862 | 3691
863 | 2560
864 | 3574
865 | 4094
866 | 3952
867 | 3498
868 | 603
869 | 4291
870 | 4290
871 | 3951
872 | 4250
873 | 2141
874 | 3272
875 | 2488
876 | 2885
877 | 2599
878 | 2571
879 | 2568
880 | 3480
881 | 2774
882 | 3477
883 | 3134
884 | 2114
885 | 1015
886 | 2306
887 | 864
888 | 2974
889 | 3662
890 | 2580
891 | 3791
892 | 4382
893 | 947
894 | 980
895 | 2355
896 | 955
897 | 952
898 | 3345
899 | 4214
900 | 2093
901 | 2095
902 | 2094
903 | 3454
904 | 2615
905 | 4168
906 | 2027
907 | 2123
908 | 3481
909 | 3074
910 | 2124
911 | 3298
912 | 3808
913 | 3449
914 | 4100
915 | 3807
916 | 2326
917 | 395
918 | 470
919 | 2766
920 | 3075
921 | 4205
922 | 4339
923 | 824
924 | 823
925 | 3994
926 | 47
927 | 527
928 | 696
929 | 844
930 | 4156
931 | 2941
932 | 3006
933 | 4169
934 | 3214
935 | 855
936 | 2318
937 | 3160
938 | 2481
939 | 2771
940 | 3108
941 | 4122
942 | 4410
943 | 2789
944 | 4424
945 | 3812
946 | 556
947 | 428
948 | 2487
949 | 2485
950 | 2486
951 | 3038
952 | 392
953 | 67
954 | 68
955 | 481
956 | 2536
957 | 2754
958 | 2755
959 | 2973
960 | 2133
961 | 3803
962 | 2144
963 | 3896
964 | 477
965 | 3483
966 | 290
967 | 2108
968 | 2160
969 | 2517
970 | 2515
971 | 3455
972 | 3846
973 | 4436
974 | 2212
975 | 673
976 | 2211
977 | 4131
978 | 2572
979 | 3243
980 | 3636
981 | 3235
982 | 3149
983 | 2673
984 | 4065
985 | 4066
986 | 3607
987 | 3911
988 | 4130
989 | 4226
990 | 570
991 | 461
992 | 194
993 | 460
994 | 862
995 | 3148
996 | 2873
997 | 379
998 | 38
999 | 3844
1000 | 3125
1001 | 3915
1002 | 3122
1003 | 447
1004 | 3556
1005 | 4342
1006 | 4139
1007 | 474
1008 | 3589
1009 | 4293
1010 | 3788
1011 | 3465
1012 | 4133
1013 | 821
1014 | 2338
1015 | 2322
1016 | 2192
1017 | 608
1018 | 133
1019 | 2840
1020 | 2111
1021 | 2166
1022 | 2156
1023 | 609
1024 | 132
1025 | 421
1026 | 409
1027 | 2051
1028 | 2155
1029 | 799
1030 | 841
1031 | 801
1032 | 2112
1033 | 2158
1034 | 989
1035 | 2964
1036 | 3467
1037 | 2157
1038 | 138
1039 | 611
1040 | 291
1041 | 23
1042 | 4072
1043 | 3845
1044 | 3872
1045 | 121
1046 | 3062
1047 | 521
1048 | 504
1049 | 777
1050 | 3321
1051 | 263
1052 | 2813
1053 | 503
1054 | 3048
1055 | 4234
1056 | 4088
1057 | 828
1058 | 2956
1059 | 3395
1060 | 616
1061 | 3363
1062 | 3545
1063 | 4087
1064 | 3080
1065 | 2752
1066 | 3351
1067 | 4090
1068 | 2543
1069 | 3490
1070 | 2541
1071 | 2569
1072 | 2581
1073 | 3546
1074 | 3407
1075 | 3408
1076 | 3961
1077 | 2130
1078 | 3084
1079 | 4089
1080 | 4085
1081 | 3450
1082 | 3061
1083 | 2763
1084 | 660
1085 | 617
1086 | 567
1087 | 572
1088 | 816
1089 | 3443
1090 | 3444
1091 | 3456
1092 | 4083
1093 | 3312
1094 | 178
1095 | 3209
1096 | 2903
1097 | 589
1098 | 4251
1099 | 2414
1100 | 2217
1101 | 2369
1102 | 456
1103 | 328
1104 | 258
1105 | 4209
1106 | 2514
1107 | 2860
1108 | 4294
1109 | 2537
1110 | 2545
1111 | 2810
1112 | 4237
1113 | 2402
1114 | 3763
1115 | 3955
1116 | 2601
1117 | 2711
1118 | 3136
1119 | 2837
1120 | 3721
1121 | 3722
1122 | 4312
1123 | 2325
1124 | 3898
1125 | 3515
1126 | 3853
1127 | 2659
1128 | 4076
1129 | 2660
1130 | 2665
1131 | 3981
1132 | 3466
1133 | 3639
1134 | 2986
1135 | 2985
1136 | 3549
1137 | 3681
1138 | 4388
1139 | 3320
1140 | 3628
1141 | 2573
1142 | 3885
1143 | 2976
1144 | 3207
1145 | 3802
1146 | 2949
1147 | 3989
1148 | 488
1149 | 970
1150 | 311
1151 | 462
1152 | 2629
1153 | 3359
1154 | 2087
1155 | 2401
1156 | 4185
1157 | 3647
1158 | 3190
1159 | 752
1160 | 49
1161 | 426
1162 | 3511
1163 | 3360
1164 | 4172
1165 | 566
1166 | 487
1167 | 227
1168 | 2207
1169 | 2224
1170 | 4002
1171 | 4284
1172 | 2231
1173 | 380
1174 | 3051
1175 | 4039
1176 | 3384
1177 | 2493
1178 | 3156
1179 | 2476
1180 | 3017
1181 | 2496
1182 | 3854
1183 | 2484
1184 | 517
1185 | 2489
1186 | 2492
1187 | 911
1188 | 3586
1189 | 3581
1190 | 3983
1191 | 3971
1192 | 3580
1193 | 4095
1194 | 2068
1195 | 3377
1196 | 3741
1197 | 732
1198 | 2612
1199 | 2611
1200 | 2610
1201 | 2419
1202 | 2063
1203 | 201
1204 | 403
1205 | 261
1206 | 341
1207 | 3442
1208 | 112
1209 | 3934
1210 | 352
1211 | 3758
1212 | 3737
1213 | 3944
1214 | 4197
1215 | 2623
1216 | 3658
1217 | 351
1218 | 332
1219 | 3238
1220 | 3756
1221 | 3755
1222 | 697
1223 | 3871
1224 | 3521
1225 | 4316
1226 | 754
1227 | 802
1228 | 850
1229 | 69
1230 | 2410
1231 | 3565
1232 | 256
1233 | 3435
1234 | 77
1235 | 3177
1236 | 75
1237 | 3216
1238 | 3112
1239 | 2534
1240 | 2049
1241 | 432
1242 | 4435
1243 | 872
1244 | 362
1245 | 3528
1246 | 529
1247 | 3185
1248 | 3858
1249 | 94
1250 | 2242
1251 | 2176
1252 | 892
1253 | 508
1254 | 500
1255 | 3358
1256 | 674
1257 | 2708
1258 | 707
1259 | 3744
1260 | 3476
1261 | 3047
1262 | 3888
1263 | 3926
1264 | 3584
1265 | 4036
1266 | 3745
1267 | 3212
1268 | 2905
1269 | 793
1270 | 3731
1271 | 2991
1272 | 758
1273 | 95
1274 | 869
1275 | 794
1276 | 103
1277 | 4111
1278 | 3472
1279 | 961
1280 | 3150
1281 | 4126
1282 | 706
1283 | 815
1284 | 3124
1285 | 962
1286 | 569
1287 | 568
1288 | 2520
1289 | 2521
1290 | 3034
1291 | 4098
1292 | 3870
1293 | 908
1294 | 2184
1295 | 2765
1296 | 2268
1297 | 196
1298 | 686
1299 | 887
1300 | 3077
1301 | 2706
1302 | 784
1303 | 3612
1304 | 3368
1305 | 3717
1306 | 2029
1307 | 3211
1308 | 199
1309 | 2830
1310 | 4032
1311 | 4210
1312 | 3905
1313 | 188
1314 | 257
1315 | 265
1316 | 836
1317 | 3015
1318 | 3838
1319 | 3277
1320 | 2277
1321 | 2775
1322 | 3623
1323 | 4033
1324 | 3760
1325 | 3137
1326 | 3020
1327 | 354
1328 | 174
1329 | 727
1330 | 4329
1331 | 3326
1332 | 3220
1333 | 2134
1334 | 3987
1335 | 3759
1336 | 4194
1337 | 2503
1338 | 2800
1339 | 3631
1340 | 2502
1341 | 3228
1342 | 2957
1343 | 596
1344 | 2312
1345 | 3797
1346 | 597
1347 | 3082
1348 | 443
1349 | 3083
1350 | 272
1351 | 681
1352 | 891
1353 | 2248
1354 | 4302
1355 | 2883
1356 | 3413
1357 | 833
1358 | 4233
1359 | 571
1360 | 501
1361 | 635
1362 | 468
1363 | 2782
1364 | 2802
1365 | 583
1366 | 691
1367 | 928
1368 | 2125
1369 | 4121
1370 | 2271
1371 | 3105
1372 | 4402
1373 | 4225
1374 | 4224
1375 | 4221
1376 | 3666
1377 | 2418
1378 | 2987
1379 | 433
1380 | 641
1381 | 363
1382 | 3154
1383 | 3968
1384 | 22
1385 | 655
1386 | 3066
1387 | 2118
1388 | 672
1389 | 1005
1390 | 3394
1391 | 946
1392 | 584
1393 | 3096
1394 | 2518
1395 | 728
1396 | 3538
1397 | 3887
1398 | 2284
1399 | 2391
1400 | 3942
1401 | 658
1402 | 578
1403 | 304
1404 | 295
1405 | 3060
1406 | 289
1407 | 2428
1408 | 551
1409 | 903
1410 | 3402
1411 | 322
1412 | 191
1413 | 555
1414 | 2147
1415 | 835
1416 | 3547
1417 | 4247
1418 | 3065
1419 | 532
1420 | 2758
1421 | 3815
1422 | 4395
1423 | 531
1424 | 533
1425 | 209
1426 | 749
1427 | 750
1428 | 930
1429 | 2241
1430 | 904
1431 | 2276
1432 | 2676
1433 | 3178
1434 | 3179
1435 | 3182
1436 | 3180
1437 | 4049
1438 | 3491
1439 | 2835
1440 | 3736
1441 | 2836
1442 | 2902
1443 | 3809
1444 | 4040
1445 | 3767
1446 | 3620
1447 | 3382
1448 | 3733
1449 | 4366
1450 | 2293
1451 | 389
1452 | 2988
1453 | 3939
1454 | 3293
1455 | 2295
1456 | 3040
1457 | 2296
1458 | 2609
1459 | 3242
1460 | 3432
1461 | 4323
1462 | 3746
1463 | 2377
1464 | 4096
1465 | 4381
1466 | 2642
1467 | 3328
1468 | 2748
1469 | 888
1470 | 524
1471 | 806
1472 | 2641
1473 | 755
1474 | 3109
1475 | 479
1476 | 464
1477 | 2563
1478 | 2859
1479 | 3679
1480 | 753
1481 | 785
1482 | 348
1483 | 653
1484 | 4295
1485 | 2645
1486 | 3912
1487 | 2021
1488 | 381
1489 | 3193
1490 | 2916
1491 | 2092
1492 | 182
1493 | 670
1494 | 2263
1495 | 366
1496 | 3092
1497 | 3102
1498 | 2989
1499 | 837
1500 | 2953
1501 | 3157
1502 | 3405
1503 | 2980
1504 | 4223
1505 | 2324
1506 | 3397
1507 | 4099
1508 | 3195
1509 | 3766
1510 | 3300
1511 | 2772
1512 | 4262
1513 | 4084
1514 | 3446
1515 | 2759
1516 | 87
1517 | 478
1518 | 612
1519 | 996
1520 | 518
1521 | 85
1522 | 2178
1523 | 84
1524 | 530
1525 | 434
1526 | 2180
1527 | 83
1528 | 2331
1529 | 854
1530 | 396
1531 | 451
1532 | 3648
1533 | 4218
1534 | 2319
1535 | 2886
1536 | 3286
1537 | 2246
1538 | 3664
1539 | 3954
1540 | 2137
1541 | 4200
1542 | 813
1543 | 3385
1544 | 4387
1545 | 142
1546 | 454
1547 | 876
1548 | 413
1549 | 298
1550 | 565
1551 | 284
1552 | 1010
1553 | 2681
1554 | 3198
1555 | 281
1556 | 591
1557 | 453
1558 | 3448
1559 | 1016
1560 | 2032
1561 | 3018
1562 | 666
1563 | 2274
1564 | 757
1565 | 919
1566 | 899
1567 | 2866
1568 | 890
1569 | 2420
1570 | 3265
1571 | 452
1572 | 3342
1573 | 275
1574 | 502
1575 | 3475
1576 | 2955
1577 | 2341
1578 | 2867
1579 | 3226
1580 | 3618
1581 | 3904
1582 | 4115
1583 | 4315
1584 | 3114
1585 | 3787
1586 | 4401
1587 | 3820
1588 | 2086
1589 | 522
1590 | 425
1591 | 448
1592 | 64
1593 | 3817
1594 | 634
1595 | 3957
1596 | 564
1597 | 656
1598 | 2381
1599 | 4305
1600 | 355
1601 | 63
1602 | 475
1603 | 4307
1604 | 53
1605 | 2113
1606 | 2466
1607 | 2853
1608 | 264
1609 | 438
1610 | 702
1611 | 436
1612 | 2578
1613 | 2656
1614 | 764
1615 | 992
1616 | 3930
1617 | 2070
1618 | 3509
1619 | 2890
1620 | 2186
1621 | 894
1622 | 3943
1623 | 932
1624 | 4289
1625 | 751
1626 | 2054
1627 | 2893
1628 | 3641
1629 | 2701
1630 | 2194
1631 | 208
1632 | 775
1633 | 958
1634 | 2061
1635 | 3274
1636 | 838
1637 | 3241
1638 | 2266
1639 | 3914
1640 | 3255
1641 | 2981
1642 | 2479
1643 | 3534
1644 | 205
1645 | 376
1646 | 204
1647 | 316
1648 | 2779
1649 | 2062
1650 | 3857
1651 | 537
1652 | 3978
1653 | 4263
1654 | 4246
1655 | 2743
1656 | 3977
1657 | 4080
1658 | 4240
1659 | 2297
1660 | 4399
1661 | 3956
1662 | 482
1663 | 2970
1664 | 804
1665 | 853
1666 | 2109
1667 | 797
1668 | 791
1669 | 3860
1670 | 4037
1671 | 3684
1672 | 3986
1673 | 382
1674 | 519
1675 | 3290
1676 | 2962
1677 | 3171
1678 | 3013
1679 | 4217
1680 | 772
1681 | 356
1682 | 3269
1683 | 2463
1684 | 795
1685 | 2421
1686 | 3369
1687 | 2286
1688 | 3270
1689 | 2843
1690 | 357
1691 | 858
1692 | 4451
1693 | 4021
1694 | 4389
1695 | 4129
1696 | 3452
1697 | 25
1698 | 24
1699 | 873
1700 | 483
1701 | 3131
1702 | 458
1703 | 668
1704 | 128
1705 | 2768
1706 | 4147
1707 | 3008
1708 | 377
1709 | 580
1710 | 3219
1711 | 2588
1712 | 3021
1713 | 652
1714 | 369
1715 | 3348
1716 | 3070
1717 | 3932
1718 | 1018
1719 | 4308
1720 | 486
1721 | 4203
1722 | 3253
1723 | 3252
1724 | 3022
1725 | 3525
1726 | 325
1727 | 651
1728 | 3064
1729 | 2608
1730 | 4241
1731 | 4120
1732 | 3192
1733 | 287
1734 | 2417
1735 | 285
1736 | 286
1737 | 2557
1738 | 3251
1739 | 3263
1740 | 283
1741 | 3250
1742 | 3023
1743 | 3261
1744 | 4257
1745 | 948
1746 | 2283
1747 | 414
1748 | 2751
1749 | 620
1750 | 1004
1751 | 2958
1752 | 2898
1753 | 3205
1754 | 2638
1755 | 3486
1756 | 3468
1757 | 141
1758 | 3406
1759 | 2214
1760 | 455
1761 | 3372
1762 | 3132
1763 | 485
1764 | 516
1765 | 3908
1766 | 4030
1767 | 3907
1768 | 139
1769 | 4201
1770 | 4357
1771 | 4336
1772 | 4151
1773 | 343
1774 | 211
1775 | 506
1776 | 13
1777 | 345
1778 | 3769
1779 | 2382
1780 | 3660
1781 | 2590
1782 | 2383
1783 | 2398
1784 | 599
1785 | 4078
1786 | 2357
1787 | 2664
1788 | 2356
1789 | 2805
1790 | 4058
1791 | 4006
1792 | 463
1793 | 693
1794 | 3158
1795 | 4335
1796 | 4206
1797 | 3411
1798 | 3420
1799 | 2678
1800 | 404
1801 | 2190
1802 | 3861
1803 | 4029
1804 | 177
1805 | 3396
1806 | 747
1807 | 175
1808 | 743
1809 | 2426
1810 | 585
1811 | 405
1812 | 3314
1813 | 3196
1814 | 494
1815 | 981
1816 | 406
1817 | 2139
1818 | 2451
1819 | 3657
1820 | 2456
1821 | 4270
1822 | 2506
1823 | 2855
1824 | 3946
1825 | 3570
1826 | 4408
1827 | 4252
1828 | 4253
1829 | 3306
1830 | 3882
1831 | 3969
1832 | 3653
1833 | 3576
1834 | 3562
1835 | 3935
1836 | 2834
1837 | 3571
1838 | 3564
1839 | 725
1840 | 959
1841 | 613
1842 | 909
1843 | 2538
1844 | 2539
1845 | 4265
1846 | 4373
1847 | 4427
1848 | 2858
1849 | 2670
1850 | 2672
1851 | 2115
1852 | 3415
1853 | 4107
1854 | 4222
1855 | 586
1856 | 2764
1857 | 3959
1858 | 51
1859 | 3073
1860 | 3672
1861 | 3292
1862 | 358
1863 | 1007
1864 | 2927
1865 | 431
1866 | 814
1867 | 690
1868 | 866
1869 | 2649
1870 | 50
1871 | 429
1872 | 3613
1873 | 2256
1874 | 446
1875 | 3453
1876 | 255
1877 | 3164
1878 | 2117
1879 | 756
1880 | 995
1881 | 3366
1882 | 2899
1883 | 805
1884 | 2206
1885 | 936
1886 | 2174
1887 | 2698
1888 | 710
1889 | 712
1890 | 1017
1891 | 2272
1892 | 4176
1893 | 3982
1894 | 3310
1895 | 975
1896 | 971
1897 | 2984
1898 | 127
1899 | 3045
1900 | 3711
1901 | 131
1902 | 594
1903 | 510
1904 | 457
1905 | 252
1906 | 163
1907 | 3923
1908 | 657
1909 | 4105
1910 | 2313
1911 | 4031
1912 | 3335
1913 | 2583
1914 | 4235
1915 | 2348
1916 | 420
1917 | 439
1918 | 397
1919 | 678
1920 | 2362
1921 | 2405
1922 | 808
1923 | 3710
1924 | 2919
1925 | 2255
1926 | 2037
1927 | 1014
1928 | 682
1929 | 2747
1930 | 722
1931 | 3011
1932 | 3583
1933 | 2869
1934 | 546
1935 | 868
1936 | 2553
1937 | 766
1938 | 3213
1939 | 2854
1940 | 4384
1941 | 2639
1942 | 387
1943 | 2640
1944 | 2945
1945 | 639
1946 | 98
1947 | 3873
1948 | 792
1949 | 4043
1950 | 3429
1951 | 4370
1952 | 476
1953 | 293
1954 | 388
1955 | 230
1956 | 232
1957 | 2473
1958 | 473
1959 | 2208
1960 | 2053
1961 | 664
1962 | 4042
1963 | 329
1964 | 2844
1965 | 480
1966 | 2909
1967 | 212
1968 | 4189
1969 | 3646
1970 | 3206
1971 | 4301
1972 | 809
1973 | 3010
1974 | 2413
1975 | 2753
1976 | 3976
1977 | 440
1978 | 2474
1979 | 714
1980 | 718
1981 | 2684
1982 | 2422
1983 | 4028
1984 | 2280
1985 | 3740
1986 | 2105
1987 | 4202
1988 | 3553
1989 | 3591
1990 | 2992
1991 | 3097
1992 | 3535
1993 | 3652
1994 | 3043
1995 | 3099
1996 | 3100
1997 | 3201
1998 | 2270
1999 | 3176
2000 | 2749
2001 | 3617
2002 | 2679
2003 | 383
2004 | 581
2005 | 605
2006 | 122
2007 | 2400
2008 | 416
2009 | 3202
2010 | 2716
2011 | 878
2012 | 2781
2013 | 3979
2014 | 3081
2015 | 101
2016 | 4404
2017 | 4405
2018 | 106
2019 | 444
2020 | 879
2021 | 104
2022 | 2159
2023 | 3824
2024 | 2131
2025 | 2469
2026 | 3775
2027 | 3776
2028 | 3052
2029 | 2526
2030 | 2525
2031 | 2465
2032 | 3459
2033 | 913
2034 | 2464
2035 | 2870
2036 | 2603
2037 | 2888
2038 | 3906
2039 | 3765
2040 | 2889
2041 | 3003
2042 | 2887
2043 | 2378
2044 | 2445
2045 | 2440
2046 | 939
2047 | 2533
2048 | 2532
2049 | 2531
2050 | 3619
2051 | 3777
2052 | 2228
2053 | 2828
2054 | 424
2055 | 3087
2056 | 3880
2057 | 4320
2058 | 4321
2059 | 4385
2060 | 3575
2061 | 3779
2062 | 3529
2063 | 2337
2064 | 4371
2065 | 4317
2066 | 3627
2067 | 3793
2068 | 3899
2069 | 4092
2070 | 4070
2071 | 2334
2072 | 2480
2073 | 3425
2074 | 3447
2075 | 2333
2076 | 4417
2077 | 3852
2078 | 708
2079 | 2230
2080 | 3002
2081 | 2161
2082 | 2171
2083 | 558
2084 | 663
2085 | 2336
2086 | 3289
2087 | 2999
2088 | 2770
2089 | 3878
2090 | 2998
2091 | 2323
2092 | 3530
2093 | 2294
2094 | 2857
2095 | 411
2096 | 130
2097 | 3095
2098 | 860
2099 | 523
2100 | 825
2101 | 490
2102 | 2658
2103 | 3644
2104 | 4178
2105 | 3215
2106 | 2519
2107 | 3881
2108 | 627
2109 | 2081
2110 | 2143
2111 | 2522
2112 | 2613
2113 | 3237
2114 | 4146
2115 | 3706
2116 | 3827
2117 | 3497
2118 | 2602
2119 | 4440
2120 | 4325
2121 | 4159
2122 | 4183
2123 | 3726
2124 | 3302
2125 | 3471
2126 | 3996
2127 | 2683
2128 | 4355
2129 | 342
2130 | 3294
2131 | 4152
2132 | 3516
2133 | 3181
2134 | 3531
2135 | 2332
2136 | 4326
2137 | 4091
2138 | 3098
2139 | 3690'''
# Break the newline-delimited ID blob into a list with one ID string per entry.
car_id_list = car_id_string.split('\n')
--------------------------------------------------------------------------------
/scrapy_autohome/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
class ScrapyAutohomeItem(scrapy.Item):
    """Fields of a single Autohome car review record.

    Each attribute below is declared as a plain ``scrapy.Field()`` with no
    extra metadata; the comments describe what each field is meant to hold.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Car ID
    CAR_ID = scrapy.Field()
    # Car name
    CAR_NAME = scrapy.Field()

    # User ID
    USER_ID = scrapy.Field()
    # User name
    USER_NAME = scrapy.Field()

    # Place of purchase
    PURCHASE_PLACE = scrapy.Field()
    # Time of purchase
    PURCHASE_TIME = scrapy.Field()
    # Purchase price of the bare car (vehicle only)
    CAR_PRICE = scrapy.Field()
    # Purpose of the purchase
    PURCHASE_PURPOSE = scrapy.Field()

    # Score - space
    SCORE_SPACE = scrapy.Field()
    # Score - power
    SCORE_POWER = scrapy.Field()
    # Score - handling
    SCORE_CONTROL = scrapy.Field()
    # Score - fuel consumption
    SCORE_FUEL_CONSUMPTION = scrapy.Field()
    # Score - comfort
    SCORE_COMFORT = scrapy.Field()
    # Score - exterior
    SCORE_EXTERIOR = scrapy.Field()
    # Score - interior
    SCORE_INTERIOR = scrapy.Field()
    # Score - value for money
    SCORE_COST_EFFECTIVE = scrapy.Field()

    # URL of the review
    COMMENT_URL = scrapy.Field()
    # Content of the review
    COMMENT_CONTENT = scrapy.Field()

    # Number of people who support this review
    COMMENT_SUPPORT_QUANTITY = scrapy.Field()
    # Number of people who have viewed this review
    COMMENT_SEEN_QUANTITY = scrapy.Field()
--------------------------------------------------------------------------------
/scrapy_autohome/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
class ScrapyAutohomeSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards what it receives without modification; the only
    side effect is an INFO log line emitted when a spider is opened.

    The hook methods take ``self`` as their first parameter. Without it,
    calling them on an instance binds the instance to ``response`` /
    ``start_requests`` and a keyword-style call fails with ``TypeError``.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: object exposing ``signals.connect``.

        Returns:
            The newly created middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Returns:
            Always ``None`` (the response is left untouched).
        """
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Args:
            response: the response the spider processed (unused here).
            result: iterable produced by the spider for that response.
            spider: the running spider (unused here).
        """
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised while processing a response.

        Returns:
            ``None`` implicitly; the exception is not handled here.
        """
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield every element of ``start_requests`` unchanged."""
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        # Lazy %-style arguments: the message is formatted by the logger.
        spider.logger.info('Spider opened: %s', spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy_autohome/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
class ScrapyAutohomePipeline(object):
    """Item pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        unchanged = item
        return unchanged
12 |
--------------------------------------------------------------------------------
/scrapy_autohome/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for scrapy_autohome project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
BOT_NAME = 'scrapy_autohome'

SPIDER_MODULES = ['scrapy_autohome.spiders']
NEWSPIDER_MODULE = 'scrapy_autohome.spiders'
# Do not obey robots.txt
ROBOTSTXT_OBEY = False

# Write the log to this file
LOG_FILE = "scrapy_autohome_log.log"

# Encoding used for the exported feed file
FEED_EXPORT_ENCODING = 'GBK'

# Present the crawler as a Chrome browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
27 |
28 | # #DOWNLOADER_MIDDLEWARES
29 | # DOWNLOADER_MIDDLEWARES = {
30 | # 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware':301,
31 | # }
32 |
33 |
34 |
35 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
36 | #USER_AGENT = 'scrapy_autohome (+http://www.yourdomain.com)'
37 |
38 |
39 |
40 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
41 | #CONCURRENT_REQUESTS = 32
42 |
43 | # Configure a delay for requests for the same website (default: 0)
44 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
45 | # See also autothrottle settings and docs
46 | #DOWNLOAD_DELAY = 3
47 | # The download delay setting will honor only one of:
48 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
49 | #CONCURRENT_REQUESTS_PER_IP = 16
50 |
51 | # Disable cookies (enabled by default)
52 | #COOKIES_ENABLED = False
53 |
54 | # Disable Telnet Console (enabled by default)
55 | #TELNETCONSOLE_ENABLED = False
56 |
57 | # Override the default request headers:
58 | #DEFAULT_REQUEST_HEADERS = {
59 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 | # 'Accept-Language': 'en',
61 | #}
62 |
63 | # Enable or disable spider middlewares
64 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
65 | #SPIDER_MIDDLEWARES = {
66 | # 'scrapy_autohome.middlewares.ScrapyAutohomeSpiderMiddleware': 543,
67 | #}
68 |
69 | # Enable or disable downloader middlewares
70 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
71 | #DOWNLOADER_MIDDLEWARES = {
72 | # 'scrapy_autohome.middlewares.MyCustomDownloaderMiddleware': 543,
73 | #}
74 |
75 | # Enable or disable extensions
76 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
77 | #EXTENSIONS = {
78 | # 'scrapy.extensions.telnet.TelnetConsole': None,
79 | #}
80 |
81 | # Configure item pipelines
82 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
83 | #ITEM_PIPELINES = {
84 | # 'scrapy_autohome.pipelines.ScrapyAutohomePipeline': 300,
85 | #}
86 |
87 | # Enable and configure the AutoThrottle extension (disabled by default)
88 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
89 | #AUTOTHROTTLE_ENABLED = True
90 | # The initial download delay
91 | #AUTOTHROTTLE_START_DELAY = 5
92 | # The maximum download delay to be set in case of high latencies
93 | #AUTOTHROTTLE_MAX_DELAY = 60
94 | # The average number of requests Scrapy should be sending in parallel to
95 | # each remote server
96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
97 | # Enable showing throttling stats for every response received:
98 | #AUTOTHROTTLE_DEBUG = False
99 |
100 | # Enable and configure HTTP caching (disabled by default)
101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
102 | #HTTPCACHE_ENABLED = True
103 | #HTTPCACHE_EXPIRATION_SECS = 0
104 | #HTTPCACHE_DIR = 'httpcache'
105 | #HTTPCACHE_IGNORE_HTTP_CODES = []
106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
107 |
--------------------------------------------------------------------------------
/scrapy_autohome/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy_autohome/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/spiders/__pycache__/autohome_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xqtbox/AutoHomeSpider_Scrapy/6f3b475b3705bbdf5f3d89fd2e2759c48ec4dcf3/scrapy_autohome/spiders/__pycache__/autohome_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy_autohome/spiders/autohome_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import scrapy
4 | import urllib.parse
5 | from scrapy_autohome.all_car_id import All_Car_Id
6 | from scrapy_autohome.items import ScrapyAutohomeItem
7 |
8 |
9 | class AutohomeSpider(scrapy.Spider):
10 | name = "autohome_spider"
11 | allowed_domains = ["autohome.com"]
12 | start_urls = ['http://autohome.com/']
13 | # 评论的个数
14 | count = 0
15 |
16 | # 循环页码,就在这个函数中实现。
def start_requests(self):
    """Build the initial requests: list pages 1 to 100 for every car id.

    The car ids come from ``All_Car_Id().car_id_list``; one request is
    created per (car id, page number) pair, car ids in the outer loop.

    Returns:
        list: the ``scrapy.Request`` objects, in generation order.
    """
    car_ids = All_Car_Id().car_id_list
    # Outer loop: car model; inner loop: review page number (1..100).
    return [
        scrapy.Request(
            "http://k.autohome.com.cn/" + str(car_id) + "/index_" + str(page) + ".html#dataList"
        )
        for car_id in car_ids
        for page in range(1, 101)
    ]
29 |
def parse(self, response):
    """Parse one review-list page and request the body of every review.

    For each ``div.mouthcon`` review container the car, author, purchase
    details, the eight scores and the support/seen counters are copied
    into a ``ScrapyAutohomeItem``. A request for the review-text endpoint
    is then yielded with the item in ``meta['item']`` and
    ``parse_recommand`` as callback.

    Changes compared with the previous revision:
      * the review counter is bumped once per review only (it used to be
        bumped once per response as well, inflating the printed number);
      * a review whose expected nodes are missing is logged and skipped
        instead of raising ``IndexError`` and losing the rest of the page.

    Args:
        response: the downloaded review-list page.

    Yields:
        scrapy.Request: one per review that could be parsed.
    """
    panel = 'div/div[1]/div[2]'                       # car / purchase / score column
    author = 'div/div[1]/div[1]/div/div[1]/div[2]/p/a'
    stats = 'div/div[2]/div[1]/div[3]/div[2]'
    score_fields = (
        'SCORE_SPACE', 'SCORE_POWER', 'SCORE_CONTROL', 'SCORE_FUEL_CONSUMPTION',
        'SCORE_COMFORT', 'SCORE_EXTERIOR', 'SCORE_INTERIOR', 'SCORE_COST_EFFECTIVE',
    )

    def first(node, path):
        """Return the first extracted match of ``path`` (IndexError if none)."""
        return node.xpath(path)[0].extract()

    def flat_text(node, path):
        """Return the stripped string value of the first node matching ``path``."""
        return node.xpath(path)[0].xpath('string(.)').extract()[0].strip()

    def build_item(div):
        """Fill a ScrapyAutohomeItem from one review container."""
        item = ScrapyAutohomeItem()
        item['CAR_ID'] = first(div, panel + '/dl[1]/dd/a[1]/@href').replace('/', '')
        item['CAR_NAME'] = first(div, panel + '/dl[1]/dd/a[1]/text()')

        item['USER_ID'] = re.findall(r'\d{1,15}', first(div, author + '/@href'))[0]
        item['USER_NAME'] = first(div, author + '/text()').strip()

        item['PURCHASE_PLACE'] = flat_text(div, panel + '/dl[2]/dd')

        # The <dl> rows share the same markup and their number varies, so the
        # positions of (purchase time, price, purpose) depend on the layout.
        row_count = len(div.xpath(panel + '/dl'))
        if row_count == 7:
            rows = (4, 5, 7)
        elif row_count == 6:
            # Per the original author's note: a <p> in the 5th row's <dt>
            # means the fuel-consumption row is present and the dealer row
            # is missing; otherwise it is the other way round.
            rows = (3, 4, 6) if div.xpath(panel + '/dl[5]/dt/p') else (4, 5, 6)
        else:
            rows = None  # unknown layout: these three fields stay unset

        if rows is not None:
            time_row, price_row, purpose_row = rows
            item['PURCHASE_TIME'] = first(div, '%s/dl[%d]/dd/text()' % (panel, time_row)).strip()
            item['CAR_PRICE'] = flat_text(div, '%s/dl[%d]/dd' % (panel, price_row)).replace('\xa0', '')
            # NOTE(review): the ' ' separator is reproduced as it appears in
            # this revision; verify its width against the page markup.
            item['PURCHASE_PURPOSE'] = (
                flat_text(div, '%s/dl[%d]/dd' % (panel, purpose_row))
                .replace('\r\n', '')
                .replace(' ', ';')
            )

        # The eight scores sit in consecutive <div> children of the panel.
        for position, field in enumerate(score_fields, 1):
            item[field] = first(div, '%s/div[%d]/dl/dd/span[2]/text()' % (panel, position))

        item['COMMENT_SUPPORT_QUANTITY'] = first(div, stats + '/span[3]/label/text()')
        item['COMMENT_SEEN_QUANTITY'] = first(div, stats + '/span[4]/a/text()')

        # The element id looks like "DivRelatedTopics_1565672"; keep the digits.
        related_id = first(div, 'div/div[1]/div[1]/div/div[2]/div[2]/@id')
        eval_id = re.findall(r'\d{1,20}', related_id)[0]
        item['COMMENT_URL'] = "http://k.autohome.com.cn/FrontAPI/GetFeelingByEvalId?evalId=" + eval_id
        return item

    # All review containers on the page.
    divs = response.xpath('//*[@id="maodian"]/div/div/div[2]/div[@class="mouthcon"]')

    for div in divs:
        AutohomeSpider.count += 1
        print("----------------------------------")
        print("第:", AutohomeSpider.count, "个评论。")

        try:
            item = build_item(div)
        except IndexError:
            # An expected node (or the digits inside an id/href) is missing.
            self.logger.warning("Skipping review with unexpected layout on %s", response.url)
            continue

        # The review text is fetched separately; the callback completes the item.
        yield scrapy.Request(
            url=item['COMMENT_URL'],
            meta={'item': item},
            callback=self.parse_recommand,
            dont_filter=True,
        )
135 |
136 |
137 | def parse_recommand(self,response):
138 | # 此函数用于解析评论json
139 |
140 | # 获取该页面的代码
141 | text = response.body
142 | # 解码为gb312(通过response.headers知道)
143 | text1 = str(text, encoding="gb2312",errors='ignore').replace("\\u0027", "'").replace("\\u003e", ">").replace("\\u003c", "<")
144 | # 调用函数 替换
145 | text2 = AutohomeSpider.get_complete_text_autohome(text1)
146 | # 获取中文评论
147 | text3 = re.findall(r'@HS_BASE64@.*@HS_ZY@',text2)[0].replace('@HS_BASE64@-->',"").replace("", text)
160 | #print("find : %s" % js.group())
161 | if not js:
162 | print(" if not js:")
163 | return text
164 | try:
165 | char_list = AutohomeSpider.get_char(js.group(1))
166 | print("try111")
167 |
168 | except Exception as e:
169 | print(e)
170 | print("except222")
171 | return text
172 |
173 | def char_replace(m):
174 | index = int(m.group(1))
175 | char = char_list[index]
176 | return char
177 |
178 | text = re.sub("", char_replace, text)
179 | # print(text)
180 | return text
181 |
182 |
183 | # 这个函数用于 获取js中的变换规则
def get_char(js):
    """Recover the list of hidden strings encoded by an obfuscated script.

    The script is simplified in a fixed order:

    1. Known families of decoy helper functions are removed from ``js`` and
       the constant each one returns is remembered under ``"name()"``.
    2. Immediately-invoked helpers are replaced by the string literal they
       evaluate to.
    3. Every ``var name=value;`` assignment is recorded, then all recorded
       names are textually replaced by their values.
    4. Whitespace, ``+`` and ``'`` are dropped. The first run of two or
       more percent-encoded bytes is URL-decoded into the character table,
       and the first ``;``-separated list of comma-separated indices after
       it selects the characters of each output string.

    The indices may be 0- or 1-based: they are shifted so that the largest
    index addresses the last character of the table.

    Args:
        js (str): JavaScript source of the obfuscation script.

    Returns:
        list[str]: one string per index group, in order.

    Raises:
        ValueError: if no percent-encoded table or no index list is found,
            or if an index group contains an empty entry.
    """
    import logging

    all_var = {}

    def harvest(pattern, pick):
        """Drop every match of ``pattern`` from js; record pick(*groups) as name -> value."""
        nonlocal js
        # finditer walks the text as it was on entry; rebinding js below
        # does not disturb the iteration.
        for found in pattern.finditer(js):
            js = js.replace(found.group(0), "")
            name, value = pick(*found.groups())
            all_var["%s()" % name] = value

    def inline(pattern):
        """Replace every match of ``pattern`` in js by its first captured group."""
        nonlocal js
        for found in pattern.finditer(js):
            js = js.replace(found.group(0), found.group(1))

    # function zX_() { function _z() { return '09'; };
    #                  if (_z() == '09,') { return 'zX_'; } else { return _z(); } }
    harvest(
        re.compile(r"""
            function\s+(\w+)\(\)\s*\{\s*
                function\s+\w+\(\)\s*\{\s*
                    return\s+[\'\"]([^\'\"]+)[\'\"];\s*
                \};\s*
                if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s*
                    return\s*[\'\"]([^\'\"]+)[\'\"];\s*
                \}\s*else\s*\{\s*
                    return\s*\w+\(\);\s*
                \}\s*
            \}
            """, re.X),
        lambda name, inner, probe, taken: (name, taken if inner == probe else inner),
    )

    # function wu_() { function _w() { return 'wu_'; };
    #                  if (_w() == 'wu__') { return _w(); } else { return '5%'; } }
    harvest(
        re.compile(r"""
            function\s+(\w+)\(\)\s*\{\s*
                function\s+\w+\(\)\s*\{\s*
                    return\s+[\'\"]([^\'\"]+)[\'\"];\s*
                \};\s*
                if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s*
                    return\s*\w+\(\);\s*
                \}\s*else\s*\{\s*
                    return\s*[\'\"]([^\'\"]+)[\'\"];\s*
                \}\s*
            \}
            """, re.X),
        lambda name, inner, probe, fallback: (name, inner if inner == probe else fallback),
    )

    # var ZA_ = function(ZA__) { 'return ZA_'; return ZA__; };
    # Identity helper: drop the definition, then unwrap every call NAME(arg) -> arg.
    identity_var = re.compile(
        r"var\s+[^=]+=\s*function\(\w+\)\{\s*[\'\"]return\s*\w+\s*[\'\"];\s*return\s+\w+;\s*\};")
    for chunk in identity_var.findall(js):
        name = re.search(r"var\s+([^=]+)", chunk).group(1)
        js = js.replace(chunk, "")
        js = re.sub(r"%s\(([^\)]+)\)" % name, r"\1", js)

    # var Qh_ = function() { 'return Qh_'; return ';'; };
    harvest(
        re.compile(r"""
            var\s+([^=]+)=\s*function\(\)\{\s*
                [\'\"]return\s*\w+\s*[\'\"];\s*
                return\s+[\'\"]([^\'\"]+)[\'\"];\s*
            \};
            """, re.X),
        lambda name, value: (name, value),
    )

    # function ZP_() { 'return ZP_'; return 'E'; }
    harvest(
        re.compile(r"""
            function\s*(\w+)\(\)\s*\{\s*
                [\'\"]return\s*[^\'\"]+[\'\"];\s*
                return\s*[\'\"]([^\'\"]+)[\'\"];\s*
            \}\s*
            """, re.X),
        lambda name, value: (name, value),
    )

    # function do_() { return ''; }   (no decoy statement, value may be empty)
    harvest(
        re.compile(r"""
            function\s*(\w+)\(\)\s*\{\s*
                return\s*[\'\"]([^\'\"]*)[\'\"];\s*
            \}\s*
            """, re.X),
        lambda name, value: (name, value),
    )

    # (function() { 'return sZ_'; return '1' })()   ->   '1'
    inline(re.compile(r"""
        \(function\(\)\s*\{\s*
            [\'\"]return[^\'\"]+[\'\"];\s*
            return\s*([\'\"][^\'\"]*[\'\"]);?
        \}\)\(\)
        """, re.X))

    # (function(iU__) { 'return iU_'; return iU__; })('9F')   ->   '9F'
    inline(re.compile(r"""
        \(function\(\w+\)\s*\{\s*
            [\'\"]return[^\'\"]+[\'\"];\s*
            return\s*\w+;
        \}\)\(([\'\"][^\'\"]*[\'\"])\)
        """, re.X))

    # Record plain variable assignments. The assignments themselves are left
    # in js on purpose: removing them could delete statements that are needed.
    for var_name, var_value in re.findall(r"var\s+(\w+)=(.*?);\s", js):
        var_value = var_value.strip("\'\"").strip()
        if "(" in var_value:
            # A value that still contains a call is replaced by ";".
            var_value = ";"
        all_var[var_name] = var_value

    for var_name, var_value in all_var.items():
        js = js.replace(var_name, var_value)
    js = re.sub(r"[\s+']", "", js)

    table_match = re.search(r"(%\w\w(?:%\w\w)+)", js)
    if table_match is None:
        raise ValueError("no percent-encoded character table found in script")
    chars = urllib.parse.unquote(table_match.group(1))

    index_match = re.search(r"([\d,]+(;[\d,]+)+)", js[table_match.end():])
    if index_match is None:
        raise ValueError("no index list found after the character table")

    # "1,2;3" -> [[1, 2], [3]]; int('') raises ValueError for entries like "1,,2".
    groups = [
        [int(number) for number in part.split(",")]
        for part in index_match.group(1).split(";")
    ]

    # Shift so that the largest index lands on the last character: this makes
    # both 0-based (max == len - 1) and 1-based (max == len) lists work.
    max_index = max((number for group in groups for number in group), default=0)
    less = max_index - len(chars)

    words = []
    for group in groups:
        word = ""
        for number in group:
            try:
                word += chars[number - 1 - less]
            except IndexError as exc:
                # Best effort: an unusable index contributes no character.
                logging.getLogger(__name__).debug("index %d skipped: %s", number, exc)
        words.append(word)
    return words
--------------------------------------------------------------------------------