├── .gitignore
├── .idea
│   ├── .name
│   ├── encodings.xml
│   ├── itjuzi_dis.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── Dockerfile
├── README.md
├── docker-compose.yml
├── itjuzi_dis
│   ├── .gitignore
│   ├── .idea
│   │   ├── .name
│   │   ├── encodings.xml
│   │   ├── itjuzi_dis.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── db_util.cpython-35.pyc
│   │   ├── items.cpython-35.pyc
│   │   ├── middlewares.cpython-35.pyc
│   │   ├── pipelines.cpython-35.pyc
│   │   └── settings.cpython-35.pyc
│   ├── db_util.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-35.pyc
│       │   └── juzi_spider.cpython-35.pyc
│       └── juzi_spider.py
├── requirements.txt
├── scrapy.cfg
└── spiders.py
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
itjuzi_dis
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/itjuzi_dis.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.5
ENV PATH /usr/local/bin:$PATH
ADD . /code
WORKDIR /code
RUN pip install -r requirements.txt
COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis
CMD /usr/local/bin/scrapy crawl itjuzi_dis
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Introduction

I have been using scrapy to crawl [IT Juzi][1] company profiles for analysis, to understand everything about IT startups. I previously wrote a single scrapy instance with the default 10 threads and throttled its download speed to avoid an IP ban; crawling the 30,000+ companies took more than a day. So now I want to use a distributed crawler to improve efficiency.

***[Source code on GitHub][2]***

#### Tech stack: `Python3.5` `scrapy` `scrapy_redis` `redis` `docker1.12` `docker-compose` `Kitematic` `mysql` `SQLAlchemy`

## Preparation

1. Install `Docker`, see [here][3] to learn about it and install it;
2. `pip install scrapy scrapy_redis`;

## Writing the code

1. Analyze the pages:
   what I need are each company's detail-page link and the pagination links;
2. Store all collected links in one place and hand them out to multiple `spider`s;
3. Have the `spider`s share the links held in a single `redis` `list`;

### Directory structure
![directory structure][5]
### juzi_spider.py
```python
# coding:utf-8

from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from itjuzi_dis.items import CompanyItem


class ITjuziSpider(RedisCrawlSpider):
    name = 'itjuzi_dis'
    allowed_domains = ['itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/company/157']
    redis_key = 'itjuziCrawler:start_urls'
    rules = [
        # follow the pagination links
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        # follow each company's detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        # ...
        # (some processing code omitted)
        # ...
        return item
```
**Notes:**
1. The `class` inherits from `RedisCrawlSpider` instead of `CrawlSpider`.
2. `start_urls` is replaced by a custom redis key, `itjuziCrawler:start_urls`; this key is where all the links are stored in `redis`, and `scrapy_redis` pops each link off (removing it from the list) with redis's `lpop`, as the sketch below shows;

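The queue mechanics are easy to try outside scrapy with the `redis-py` client. A minimal sketch, not part of the project, assuming a redis server on `localhost:6379`:

```python
import redis

r = redis.StrictRedis(host='localhost', port=6379)

# seed the shared queue, the same thing we do later with redis-cli
r.lpush('itjuziCrawler:start_urls', 'http://www.itjuzi.com/company')

# what scrapy_redis does when a spider needs start requests: pop one URL
url = r.lpop('itjuziCrawler:start_urls')
print(url)  # b'http://www.itjuzi.com/company' (bytes under Python 3)
```
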
### db_util.py
Uses `SQLAlchemy` as the `ORM` tool; when the table schema does not exist, it is created automatically.

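The auto-creation boils down to a few lines, excerpted from the full `db_util.py` listed further down:

```python
Base = declarative_base()                 # every table class inherits from Base
engine = create_engine(url, echo=False)
Base.metadata.create_all(engine)          # creates missing tables; no-op otherwise
```
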
### middlewares.py
Adds a large pool of `User-Agent` strings and picks one at random for each request, to keep the site from blocking the crawler by `User-Agent`.

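The heart of it is a single method, excerpted from the full `middlewares.py` below:

```python
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)  # pick a user-agent at random
    if ua:
        request.headers.setdefault('User-Agent', ua)
```
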
### settings.py
Wires up `middlewares.py`, `scrapy_redis`, and the `redis` connection details.

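The distributed pieces amount to only a few settings, excerpted from the full `settings.py` below:

```python
# schedule requests through redis and share one dupefilter across all spiders
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 'redis' is the service name defined in docker-compose.yml
REDIS_PARAMS = {'host': 'redis', 'decode_responses': False}
```
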
## Deployment
The "directory structure" above already contains the `Dockerfile` and `docker-compose.yml`.
### Dockerfile

```dockerfile
FROM python:3.5
ENV PATH /usr/local/bin:$PATH
ADD . /code
WORKDIR /code
RUN pip install -r requirements.txt
COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis
CMD /usr/local/bin/scrapy crawl itjuzi_dis
```
**Notes:**

- Uses `python3.5` as the base image
- Prepends `/usr/local/bin` to the `PATH` environment variable
- Maps the `host` directory into the `container`
- Installs the packages in `requirements.txt`
- `COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis` deserves special mention: it copies `spiders.py` from the `host` over the one in the `container`'s `scrapy_redis` install directory, because the value `lpop` reads from `redis` is a `str` under `python2` but `bytes` under `python3`; this needs a fix in `scrapy_redis`, on line 84 of `spiders.py` (see the snippet after this list);
- Runs the crawl command `scrapy crawl itjuzi_dis` as soon as the container starts

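For reference, line 84 of the bundled `spiders.py` carries the one-line fix; the commented line sketches (as an assumption about the upstream source) what the unpatched call looked like:

```python
# unpatched scrapy_redis passed the raw value (a str under python2):
# req = self.make_request_from_data(data)
# under python3 lpop returns bytes, so decode first:
req = self.make_request_from_data(data.decode('utf-8'))
```
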
### docker-compose.yml
```yaml
version: '2'
services:
  spider:
    build: .
    volumes:
      - .:/code
    links:
      - redis
    depends_on:
      - redis
  redis:
    image: redis
    ports:
      - "6379:6379"
```
**Notes:**

- Uses version 2 of the `compose` file format
- Defines two `service`s, `spider` and `redis`
- `spider` is built from the `Dockerfile` in the current directory by default; `redis` is created from the `redis:latest` image and publishes port 6379

### Deploying

**Start the containers**

    docker-compose up               # create the containers defined in docker-compose.yml
    docker-compose scale spider=4   # scale the spider service to 4 instances, all sharing one redis

You can watch them being created and running in the `Kitematic` GUI;

![screenshot][6]

With no `start_urls` set yet, the spiders in all 4 `container`s sit idle, starved for links

![screenshot][7]

Now push the `start_urls` into `redis`:

    lpush itjuziCrawler:start_urls http://www.itjuzi.com/company

All 4 spiders spring into action and keep crawling until `start_urls` runs dry
![screenshot][8]

That's it!


[1]: http://www.itjuzi.com/company
[2]: https://github.com/caoxiaozh/itjuzi_dis
[3]: https://www.docker.com/products/overview
[4]: https://github.com/caoxiaozh/itjuzi_dis
[5]: https://segmentfault.com/img/bVAlLY
[6]: https://segmentfault.com/img/bVAlSh
[7]: https://segmentfault.com/img/bVAlS7
[8]: https://segmentfault.com/img/bVAlUh
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'
services:
  spider:
    build: .
    volumes:
      - .:/code
    links:
      - redis
    depends_on:
      - redis
  redis:
    image: redis
    ports:
      - "6379:6379"
15 |
--------------------------------------------------------------------------------
/itjuzi_dis/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/.name:
--------------------------------------------------------------------------------
itjuzi_dis
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/itjuzi_dis.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/itjuzi_dis/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/itjuzi_dis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__init__.py
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/db_util.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/db_util.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/items.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/items.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/middlewares.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/middlewares.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/pipelines.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/pipelines.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/__pycache__/settings.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/__pycache__/settings.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/db_util.py:
--------------------------------------------------------------------------------
# coding:utf-8

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
url = 'mysql+pymysql://root:root@192.168.0.7/spider_tools?charset=utf8'
engine = create_engine(url, echo=False)


class DB_Util(object):
    @staticmethod
    def get_session():
        Session = sessionmaker(bind=engine)
        session = Session()
        return session

    @staticmethod
    def init_db():
        # create any mapped table below that does not exist yet
        Base.metadata.create_all(engine)


class JuziCompany(Base):
    __tablename__ = 't_juzi_company'
    id = Column(Integer, primary_key=True)
    company_name = Column(String(100), nullable=True)
    slogan = Column(String(100), nullable=True)
    scope = Column(String(30), nullable=True)
    sub_scope = Column(String(30), nullable=True)
    city = Column(String(30), nullable=True)
    area = Column(String(30), nullable=True)
    home_page = Column(String(100), nullable=True)
    tags = Column(String(200))
    company_intro = Column(String(500), nullable=True)
    company_full_name = Column(String(100), nullable=True)
    found_time = Column(String(10), nullable=True)
    company_size = Column(String(20), nullable=True)
    company_status = Column(String(20), nullable=True)
    info_id = Column(String(20), nullable=False)


class JuziTeam(Base):
    __tablename__ = 't_juzi_team'
    id = Column(Integer, primary_key=True)
    company_id = Column(String(20), nullable=False)
    tm_m_name = Column(String(100), nullable=True)
    tm_m_title = Column(String(100), nullable=True)
    tm_m_intro = Column(String(500), nullable=True)


class JuziTz(Base):
    __tablename__ = 't_juzi_tz'
    company_id = Column(String(20), nullable=False)
    id = Column(Integer, primary_key=True)
    tz_time = Column(String(100), nullable=True)
    tz_round = Column(String(20), nullable=True)
    tz_finades = Column(String(100), nullable=True)
    tz_capital = Column(String(500), nullable=True)


class JuziProduct(Base):
    __tablename__ = 't_juzi_product'
    company_id = Column(String(20), nullable=False)
    id = Column(Integer, primary_key=True)
    pdt_name = Column(String(100), nullable=True)
    pdt_type = Column(String(100), nullable=True)
    pdt_intro = Column(String(500), nullable=True)

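# Hypothetical smoke test, not part of the original project: verify the
# connection URL above by creating the tables and inserting a throwaway row.
if __name__ == '__main__':
    DB_Util.init_db()
    session = DB_Util.get_session()
    session.add(JuziCompany(company_name='demo', info_id='0'))
    session.commit()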
--------------------------------------------------------------------------------
/itjuzi_dis/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CompanyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info_id = scrapy.Field()
    company_name = scrapy.Field()
    slogan = scrapy.Field()
    scope = scrapy.Field()
    sub_scope = scrapy.Field()
    city = scrapy.Field()
    area = scrapy.Field()
    home_page = scrapy.Field()
    tags = scrapy.Field()
    company_intro = scrapy.Field()
    company_full_name = scrapy.Field()
    found_time = scrapy.Field()
    company_size = scrapy.Field()
    company_status = scrapy.Field()
    tz_info = scrapy.Field()
    tm_info = scrapy.Field()
    pdt_info = scrapy.Field()
--------------------------------------------------------------------------------
/itjuzi_dis/middlewares.py:
--------------------------------------------------------------------------------
# coding:utf8

import logging
import random

# Start your middleware class

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a user-agent at random for this request
        ua = random.choice(self.user_agent_list)
        if ua:
            logging.info(ua)
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape;
    # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]


# class ProxyMiddleware(object):
#     # overwrite process_request
#     def process_request(self, request, spider):
#         # Set the location of the proxy
#         sql = 'select ip,port from t_proxy_ip t where t.is_valid =1'
#         result = SqlUtil.query_all(sql)
#         ip_port = random.choice(result)
#         logging.info(ip_port)
#         request.meta['proxy'] = "http://{0}:{1}".format(ip_port['ip'], ip_port['port'])
#         # # Use the following lines if your proxy requires authentication
#         # proxy_user_pass = "USERNAME:PASSWORD"
#         # # setup basic authentication for the proxy
#         # encoded_user_pass = base64.encodestring(proxy_user_pass)
#         # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
--------------------------------------------------------------------------------
/itjuzi_dis/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem

from itjuzi_dis.db_util import JuziCompany, DB_Util, JuziTeam, JuziTz, JuziProduct


# drop duplicate companies
class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['info_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['info_id'])
            return item


class ItjuziSpiderPipeline(object):
    def open_spider(self, spider):
        DB_Util.init_db()  # create the table schema if it does not exist yet

    def process_item(self, item, spider):
        if not item['info_id']:
            raise DropItem('item info_id is null.{0}'.format(item))
        else:
            session = DB_Util.get_session()
            company = JuziCompany()
            company.company_name = item['company_name']
            company.slogan = item['slogan']
            company.scope = item['scope']
            company.sub_scope = item['sub_scope']
            company.city = item['city']
            company.area = item['area']
            company.home_page = item['home_page']
            company.tags = item['tags']
            company.company_intro = item['company_intro']
            company.company_full_name = item['company_full_name']
            company.found_time = item['found_time']
            company.company_size = item['company_size']
            company.company_status = item['company_status']
            company.info_id = item['info_id']
            session.add(company)
            if item['tz_info']:
                for touzi in item['tz_info']:
                    tz = JuziTz()
                    tz.company_id = company.info_id
                    tz.tz_time = touzi['tz_time']
                    tz.tz_finades = touzi['tz_finades']
                    tz.tz_capital = touzi['tz_capital']
                    tz.tz_round = touzi['tz_round']
                    session.add(tz)
            if item['tm_info']:
                for team in item['tm_info']:
                    tm = JuziTeam()
                    tm.company_id = company.info_id
                    tm.tm_m_name = team['tm_m_name']
                    tm.tm_m_title = team['tm_m_title']
                    tm.tm_m_intro = team['tm_m_intro']
                    session.add(tm)
            if item['pdt_info']:
                for product in item['pdt_info']:
                    pdt = JuziProduct()
                    pdt.company_id = company.info_id
                    pdt.pdt_name = product['pdt_name']
                    pdt.pdt_type = product['pdt_type']
                    pdt.pdt_intro = product['pdt_intro']
                    session.add(pdt)
            session.commit()
            return item
--------------------------------------------------------------------------------
/itjuzi_dis/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for itjuzi_dis project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'itjuzi_dis'

SPIDER_MODULES = ['itjuzi_dis.spiders']
NEWSPIDER_MODULE = 'itjuzi_dis.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'itjuzi_dis (+http://www.yourdomain.com)'

# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2810.2 Safari/537.36'


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'itjuzi_dis.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80,
    # 'itjuzi_dis.middlewares.ProxyMiddleware': 90,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'itjuzi_dis.middlewares.RotateUserAgentMiddleware': 200,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'itjuzi_dis.pipelines.DuplicatesPipeline': 200,
    'itjuzi_dis.pipelines.ItjuziSpiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# REDIRECT_ENABLED = False
RANDOMIZE_DOWNLOAD_DELAY = True
# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# REDIS_START_URLS_AS_SET = True

REDIS_PARAMS = {'host': 'redis', 'decode_responses': False}
--------------------------------------------------------------------------------
/itjuzi_dis/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/itjuzi_dis/spiders/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/spiders/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/spiders/__pycache__/juzi_spider.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hardy4yooz/itjuzi_dis/551d43573b1dd93fdb05900953cd0b79044c8114/itjuzi_dis/spiders/__pycache__/juzi_spider.cpython-35.pyc
--------------------------------------------------------------------------------
/itjuzi_dis/spiders/juzi_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8

from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from itjuzi_dis.items import CompanyItem


class ITjuziSpider(RedisCrawlSpider):
    name = 'itjuzi_dis'
    allowed_domains = ['itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/company/157']
    redis_key = 'itjuziCrawler:start_urls'
    rules = [
        # follow the pagination links
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        # follow each company's detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        # defaults, so the item assignments below never hit an unbound name
        # when one of the page blocks is missing
        company_name = slogan = scope = sub_scope = city = area = home_page = tags = ''
        company_intro = company_full_name = found_time = company_size = company_status = ''

        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')
            slogan = cpy1.find(class_='info-line').p.get_text()
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else ''
            sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else ''
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            city = city_a[0].get_text().strip() if len(city_a) > 0 else ''
            area = city_a[1].get_text().strip() if len(city_a) > 1 else ''

            home_page = cpy1.find(class_='weblink marl10')['href']
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().replace('\n', ',')

        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:
            company_intro = cpy2.find(class_='des').get_text().strip()
            cpy2_content = cpy2.find(class_='des-more').contents
            company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else ''
            found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[3] else ''
            company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[3] else ''
            company_status = cpy2_content[5].get_text().strip() if cpy2_content[5] else ''

        main = soup.find('div', class_='main')

        # funding rounds
        tz = main.find('table', 'list-round-v2')
        tz_list = []
        if tz:
            all_tr = tz.find_all('tr')
            for tr in all_tr:
                tz_dict = {}
                all_td = tr.find_all('td')
                tz_dict['tz_time'] = all_td[0].span.get_text().strip()
                tz_dict['tz_round'] = all_td[1].get_text().strip()
                tz_dict['tz_finades'] = all_td[2].get_text().strip()
                tz_dict['tz_capital'] = all_td[3].get_text().strip().replace('\n', ',')
                tz_list.append(tz_dict)

        # team members
        tm = main.find('ul', class_='list-prodcase limited-itemnum')
        tm_list = []
        if tm:
            for li in tm.find_all('li'):
                tm_dict = {}
                tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip()
                tm_dict['tm_m_title'] = li.find('span', class_='c-gray').get_text().strip()
                tm_dict['tm_m_intro'] = li.find('p', class_='mart10 person-des').get_text().strip()
                tm_list.append(tm_dict)

        # products
        pdt = main.find('ul', class_='list-prod limited-itemnum')
        pdt_list = []
        if pdt:
            for li in pdt.find_all('li'):
                pdt_dict = {}
                pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip()
                pdt_dict['pdt_type'] = li.find('span', class_='tag yellow').get_text().strip()
                pdt_dict['pdt_intro'] = li.find(class_='on-edit-hide').p.get_text().strip()
                pdt_list.append(pdt_dict)

        item = CompanyItem()
        item['info_id'] = response.url.split('/')[-1]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list
        return item
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
BeautifulSoup4
scrapy
scrapy_redis
redis
sqlalchemy
pymysql
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = itjuzi_dis.settings

[deploy]
#url = http://localhost:6800/
project = itjuzi_dis
--------------------------------------------------------------------------------
/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy import signals
2 | from scrapy.exceptions import DontCloseSpider
3 | from scrapy.spiders import Spider, CrawlSpider
4 |
5 | from . import connection
6 |
7 |
8 | # Default batch size matches default concurrent requests setting.
9 | DEFAULT_START_URLS_BATCH_SIZE = 16
10 | DEFAULT_START_URLS_KEY = '%(name)s:start_urls'
11 |
12 |
13 | class RedisMixin(object):
14 |     """Mixin class to implement reading urls from a redis queue."""
15 |     # Per spider redis key, default to DEFAULT_KEY.
16 |     redis_key = None
17 |     # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE.
18 |     redis_batch_size = None
19 |     # Redis client instance.
20 |     server = None
21 |
22 |     def start_requests(self):
23 |         """Returns a batch of start requests from redis."""
24 |         return self.next_requests()
25 |
26 |     def setup_redis(self, crawler=None):
27 |         """Setup redis connection and idle signal.
28 |
29 |         This should be called after the spider has set its crawler object.
30 |         """
31 |         if self.server is not None:
32 |             return
33 |
34 |         if crawler is None:
35 |             # We allow optional crawler argument to keep backwards
36 |             # compatibility.
37 |             # XXX: Raise a deprecation warning.
38 |             crawler = getattr(self, 'crawler', None)
39 |
40 |         if crawler is None:
41 |             raise ValueError("crawler is required")
42 |
43 |         settings = crawler.settings
44 |
45 |         if self.redis_key is None:
46 |             self.redis_key = settings.get(
47 |                 'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
48 |             )
49 |
50 |         self.redis_key = self.redis_key % {'name': self.name}
51 |
52 |         if not self.redis_key.strip():
53 |             raise ValueError("redis_key must not be empty")
54 |
55 |         if self.redis_batch_size is None:
56 |             self.redis_batch_size = settings.getint(
57 |                 'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
58 |             )
59 |
60 |         try:
61 |             self.redis_batch_size = int(self.redis_batch_size)
62 |         except (TypeError, ValueError):
63 |             raise ValueError("redis_batch_size must be an integer")
64 |
65 |         self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
66 |                          "(batch size: %(redis_batch_size)s)", self.__dict__)
67 |
68 |         self.server = connection.from_settings(crawler.settings)
69 |         # The idle signal is called when the spider has no requests left,
70 |         # that's when we will schedule new requests from redis queue
71 |         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
72 |
73 |     def next_requests(self):
74 |         """Returns a request to be scheduled or none."""
75 |         use_set = self.settings.getbool('REDIS_START_URLS_AS_SET')
76 |         fetch_one = self.server.spop if use_set else self.server.lpop
77 |         # XXX: Do we need to use a timeout here?
78 |         found = 0
79 |         while found < self.redis_batch_size:
80 |             data = fetch_one(self.redis_key)
81 |             if not data:
82 |                 # Queue empty.
83 |                 break
84 |             req = self.make_request_from_data(data.decode('utf-8'))
85 |             if req:
86 |                 yield req
87 |                 found += 1
88 |             else:
89 |                 self.logger.debug("Request not made from data: %r", data)
90 |
91 |         if found:
92 |             self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
93 |
94 |     def make_request_from_data(self, data):
95 |         # By default, data is an URL.
96 |         if '://' in data:
97 |             return self.make_requests_from_url(data)
98 |         else:
99 |             self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data)
100 |
101 |     def schedule_next_requests(self):
102 |         """Schedules a request if available"""
103 |         for req in self.next_requests():
104 |             self.crawler.engine.crawl(req, spider=self)
105 |
106 |     def spider_idle(self):
107 |         """Schedules a request if available, otherwise waits."""
108 |         # XXX: Handle a sentinel to close the spider.
109 |         self.schedule_next_requests()
110 |         raise DontCloseSpider
111 |
112 |
113 | class RedisSpider(RedisMixin, Spider):
114 |     """Spider that reads urls from redis queue when idle."""
115 |
116 |     @classmethod
117 |     def from_crawler(self, crawler, *args, **kwargs):
118 |         obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
119 |         obj.setup_redis(crawler)
120 |         return obj
121 |
122 |
123 | class RedisCrawlSpider(RedisMixin, CrawlSpider):
124 |     """Spider that reads urls from redis queue when idle."""
125 |
126 |     @classmethod
127 |     def from_crawler(self, crawler, *args, **kwargs):
128 |         obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
129 |         obj.setup_redis(crawler)
130 |         return obj
131 |
--------------------------------------------------------------------------------