├── Technical_Artical_Spider
│   ├── __init__.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── common.cpython-36.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── elaticsearch_type_4hou.cpython-36.pyc
│   │   │   └── elaticsearch_type_anquanke.cpython-36.pyc
│   │   ├── common.py
│   │   ├── elaticsearch_type_anquanke.py
│   │   ├── elaticsearch_type_freebuf.py
│   │   └── elaticsearch_type_4hou.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── common.cpython-35.pyc
│   │   │   ├── common.cpython-36.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   └── __init__.cpython-36.pyc
│   │   └── common.py
│   ├── __init__.pyc
│   ├── settings.pyc
│   ├── spiders
│   │   ├── a4hou.pyc
│   │   ├── __init__.pyc
│   │   ├── __pycache__
│   │   │   ├── a4hou.cpython-35.pyc
│   │   │   ├── a4hou.cpython-36.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── anquanke360.cpython-36.pyc
│   │   ├── __init__.py
│   │   ├── freebuf.py
│   │   ├── a4hou.py
│   │   └── anquanke360.py
│   ├── __pycache__
│   │   ├── items.cpython-35.pyc
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-35.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-35.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   ├── settings.cpython-35.pyc
│   │   ├── settings.cpython-36.pyc
│   │   └── middlewares.cpython-36.pyc
│   ├── images
│   │   └── Cover_images_4hou
│   │       └── 1f208f97337379677734.jpg
│   ├── main.py
│   ├── middlewares.py
│   ├── settings.py
│   ├── pipelines.py
│   └── items.py
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── sqldialects.xml
│   ├── modules.xml
│   ├── Technical_Artical_Spider.iml
│   └── workspace.xml
├── scrapy.cfg
├── anquanke_article-struct.sql
├── 4hou_Article-struct.sql
└── README.md
/Technical_Artical_Spider/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__init__.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/settings.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/a4hou.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/a4hou.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/items.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/sqldialects.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import hashlib
4 |
5 | def get_md5(url):
6 | if isinstance(url, str):
7 | url = url.encode("utf-8")
8 | m = hashlib.md5()
9 | m.update(url)
10 | return m.hexdigest()
--------------------------------------------------------------------------------
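The `get_md5` helper above is what the spiders use to turn an article URL into a stable 32-character hex digest, which then serves as the `url_id` primary key in both the SQL schemas and the elasticsearch documents. A minimal usage sketch (the URL is taken from the commented-out example in a4hou.py and is only illustrative):

```python
from Technical_Artical_Spider.utils.common import get_md5

# str input is encoded to utf-8 first, so str and bytes produce the same digest
url_id = get_md5("http://www.4hou.com/vulnerable/8663.html")
print(url_id, len(url_id))  # a 32-character hex string, length 32
```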
/Technical_Artical_Spider/main.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 |
3 | import sys
4 | import os
5 |
6 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
7 | #execute(["scrapy","crawl","4hou"])
8 | #execute(["scrapy","crawl","anquanke360"])
9 | execute(["scrapy","crawl","freebuf"])
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/common.py:
--------------------------------------------------------------------------------
1 |
2 | # Reportedly a workaround for an elasticsearch-dsl quirk; returning an empty analysis definition keeps the plugin-provided IK analyzer from being redefined when the mapping is created
3 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
4 |
5 | class CustomAnalyzer(_CustomAnalyzer):
6 | def get_analysis_definition(self):
7 | return {}
8 |
9 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Technical_Artical_Spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Technical_Artical_Spider
12 |
--------------------------------------------------------------------------------
/.idea/Technical_Artical_Spider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_anquanke.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 | class Article_anquankeType(DocType):
7 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
8 | id = Integer()
9 | url = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | create_time = Date()
12 | cover_local = Keyword()
13 | watch_num = Integer()
14 | comment_num = Integer()
15 | tags = Text(analyzer="ik_max_word")
16 | author = Keyword()
17 | content = Text(analyzer="ik_max_word")
18 |
19 | class Meta:
20 | index = "article_anquanke"
21 | doc_type = "anquanke"
22 |
23 |
24 | if __name__ == "__main__":
25 | Article_anquankeType.init()
26 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_freebuf.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 |
7 | class Article_freebuf(DocType):
8 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
9 | image_local = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | url_id = Keyword()
12 | create_time = Date()
13 | url = Keyword()
14 | author = Keyword()
15 | tags = Text(analyzer="ik_max_word")
16 | watch_nums = Integer()
17 | comment_nums = Integer()
18 | content = Text(analyzer="ik_max_word")
19 |
20 | class Meta:
21 | index = "teachnical_freebuf"
22 | doc_type = "freebuf"
23 |
24 |
25 | if __name__ == "__main__":
26 | Article_freebuf.init()
27 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_4hou.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 |
7 | class Article_4houType(DocType):
8 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
9 | image_local = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | url_id = Keyword()
12 | create_time = Date()
13 | url = Keyword()
14 | author = Keyword()
15 | tags = Text(analyzer="ik_max_word")
16 | watch_nums = Integer()
17 | comment_nums = Integer()
18 | praise_nums = Integer()
19 | content = Text(analyzer="ik_max_word")
20 |
21 | class Meta:
22 | index = "teachnical_4hou"
23 | doc_type = "A_4hou"
24 |
25 |
26 | if __name__ == "__main__":
27 | Article_4houType.init()
28 |
--------------------------------------------------------------------------------
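Each model module above guards an `init()` call under `__main__`; the index mappings need to exist before the pipelines start writing documents. A minimal sketch (assuming a local elasticsearch with the IK analysis plugin, as the README's elasticsearch-rtf setup provides) that creates all three mappings in one pass:

```python
# Create the elasticsearch indices and mappings once, before the first crawl.
# Equivalent to running each model module directly, since each one calls
# <DocType>.init() under __main__.
from Technical_Artical_Spider.models.elaticsearch_type_4hou import Article_4houType
from Technical_Artical_Spider.models.elaticsearch_type_anquanke import Article_anquankeType
from Technical_Artical_Spider.models.elaticsearch_type_freebuf import Article_freebuf

for doc_type in (Article_4houType, Article_anquankeType, Article_freebuf):
    doc_type.init()  # creates the index and puts the mapping if it does not exist yet
```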
/anquanke_article-struct.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : TT_ubuntu16.04
5 | Source Server Version : 50720
6 | Source Host : 192.168.123.66:3306
7 | Source Database : ArticleSpider
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50720
11 | File Encoding : 65001
12 |
13 | Date: 2017-12-15 20:56:57
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for anquanke_article
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `anquanke_article`;
22 | CREATE TABLE `anquanke_article` (
23 | `id` int(32) NOT NULL,
24 | `url` varchar(255) COLLATE utf8_bin NOT NULL,
25 | `title` varchar(50) COLLATE utf8_bin NOT NULL,
26 | `create_time` date NOT NULL,
27 | `cover_local` varchar(255) COLLATE utf8_bin NOT NULL,
28 | `watch_num` int(32) DEFAULT '0',
29 | `tags` varchar(255) COLLATE utf8_bin NOT NULL,
30 | `author` varchar(255) COLLATE utf8_bin NOT NULL,
31 | `comment_num` int(32) DEFAULT '0',
32 | `content` longtext COLLATE utf8_bin NOT NULL,
33 | PRIMARY KEY (`id`)
34 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
35 |
--------------------------------------------------------------------------------
/4hou_Article-struct.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : TT_ubuntu16.04
5 | Source Server Version : 50720
6 | Source Host : 192.168.250.66:3306
7 | Source Database : ArticleSpider
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50720
11 | File Encoding : 65001
12 |
13 | Date: 2017-12-05 15:03:35
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for 4hou_Article
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `4hou_Article`;
22 | CREATE TABLE `4hou_Article` (
23 | `image_local` varchar(255) COLLATE utf8_bin NOT NULL,
24 | `image_url` varchar(255) COLLATE utf8_bin NOT NULL,
25 | `title` varchar(200) COLLATE utf8_bin NOT NULL,
26 | `url_id` varchar(32) COLLATE utf8_bin NOT NULL,
27 | `create_date` date DEFAULT NULL,
28 | `url` varchar(100) COLLATE utf8_bin NOT NULL,
29 | `author` varchar(200) COLLATE utf8_bin NOT NULL,
30 | `tags` varchar(50) COLLATE utf8_bin NOT NULL,
31 | `watch_num` int(10) DEFAULT '0' COMMENT '0',
32 | `comment_num` int(10) DEFAULT '0',
33 | `praise_nums` int(10) DEFAULT '0',
34 | `content` longtext COLLATE utf8_bin NOT NULL,
35 | PRIMARY KEY (`url_id`)
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC;
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Technical_Article_Spider
2 |
3 | #### A spider that crawls technical articles from Chinese security sites
4 |
5 | To simplify building a search engine later on, the project now stores its data in `elasticsearch`
6 |
7 | Development environment:
8 |
9 | - python3
10 |
11 |
12 | - Scrapy ==1.4.0
13 | - elasticsearch-rtf
14 | - docker
15 |
16 | #### Installing the latest version
17 |
18 | - Linux
19 |
20 | > sudo apt-get install python3-pip git xvfb
21 | >
22 | > sudo pip3 install scrapy mysqlclient scrapy-splash fake-useragent
23 | >
24 | > git clone https://github.com/medcl/elasticsearch-rtf.git
25 | >
26 | > git clone https://github.com/smile0304/Technical_Article_Spider.git
27 |
28 | - Windows
29 |
30 | >pip install scrapy pillow mysqlclient scrapy-splash pypiwin32 fake-useragent
31 | >
32 | >git clone https://github.com/smile0304/Technical_Article_Spider.git
33 | >
34 | >git clone https://github.com/medcl/elasticsearch-rtf.git
35 |
36 | The remaining steps are the same on `windows` and `linux`
37 |
38 | Download and install [`docker`](https://www.docker.com/community-edition)
39 |
40 | - Configure a docker registry mirror (for users in mainland China)
41 |
42 | > On Linux:
43 | >
44 | > curl -sSL https://get.daocloud.io/daotools/set_mirror.sh | sh -s http://7db66207.m.daocloud.io
45 | >
46 | > On Windows, right-click the docker tray icon, open Settings -> Daemon -> Registry mirrors and add
47 | >
48 | > http://7db66207.m.daocloud.io
49 |
50 | - Pull the image
51 |
52 | > docker pull scrapinghub/splash
53 |
54 | - Run the `scrapinghub/splash` service with docker
55 |
56 | > docker run -p 8050:8050 scrapinghub/splash
57 |
58 | #### Settings you may want to adjust
59 |
60 | ```python
61 | AUTOTHROTTLE_ENABLED #whether to throttle requests
62 |
63 | AUTOTHROTTLE_START_DELAY = 2 #initial request delay (requires AUTOTHROTTLE_ENABLED=True)
64 |
65 | AUTOTHROTTLE_MAX_DELAY = 60 #maximum wait on a slow network (requires AUTOTHROTTLE_ENABLED=True)
66 |
67 | IMAGES_STORE = os.path.join(project_dir, 'images') #'images' is the default directory for downloaded images
68 |
69 | ```
70 |
71 | #### PS:
72 |
73 | The anti-crawling mechanism of 安全客 (anquanke) has been bypassed. For the search engine built on top of this data, see [Article_Search](https://github.com/smile0304/Article_Search)
74 |
75 | #### Changelog
76 |
77 | - 2017-12-25
78 |   - Bypassed anquanke's anti-crawling mechanism
79 |   - Dropped `selenium`
80 |   - Added crawling of `freebuf`
81 |
82 |
83 | - 2017-12-23
84 |   - Added random `User-Agent` headers
85 |
86 |
87 | - 2017-12-18
88 |   - Split the data into separate databases
89 |   - Switched the browser to headless mode
90 |
91 |
92 | - 2017-12-15
93 |   - Stopped saving data to `Mysql`
94 |   - Switched to `elasticsearch` for storage
95 |
96 |
97 | - 2017-12-08
98 |   - Added crawling of 安全客 (anquanke)
99 |
100 |   - Finished classifying downloaded images
101 |
102 |   - Optimized code performance and reduced redundancy
103 |
104 | - 2017-12-05: initial commit
105 |   - This version only crawls articles from 嘶吼 (4hou)
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/freebuf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from urllib import parse
4 | from Technical_Artical_Spider.items import ArticleItemLoader,ArticleSpiderfreebuf
5 | from Technical_Artical_Spider.utils.common import get_md5
6 | class FreebufSpider(scrapy.Spider):
7 | name = 'freebuf'
8 | allowed_domains = ['www.freebuf.com']
9 | start_urls = ['http://www.freebuf.com/vuls',
10 | 'http://www.freebuf.com/sectool',
11 | 'http://www.freebuf.com/articles/web',
12 | 'http://www.freebuf.com/articles/system',
13 | 'http://www.freebuf.com/articles/network',
14 | 'http://www.freebuf.com/articles/wireless',
15 | 'http://www.freebuf.com/articles/terminal',
16 | 'http://www.freebuf.com/articles/database',
17 | 'http://www.freebuf.com/articles/security-management',
18 | 'http://www.freebuf.com/articles/es',
19 | 'http://www.freebuf.com/ics-articles'
20 | ]
21 |
22 | def start_requests(self):
23 | for url in self.start_urls:
24 | yield scrapy.Request(url,callback=self.parse)
25 |
26 | def parse(self, response):
27 | next_url = response.css(".news-more a::attr(href)").extract()[0]
28 | if next_url:
29 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),callback=self.parse)
30 |
31 | Article_Boxs = response.css(".news-detial .news_inner")
32 | for article in Article_Boxs:
33 | Image_url = article.css(".news-img img::attr(src)").extract()[0].split('!')[0]
34 | Article_url = article.css(".news-info a::attr(href)").extract()[0]
35 | yield scrapy.Request(url=parse.urljoin(response.url,Article_url),
36 | meta={"image_url": parse.urljoin(response.url,Image_url)},
37 | callback=self.parse_detail
38 | )
39 |
40 | def parse_detail(self,response):
41 | image_url = response.meta.get("image_url", "") # article cover image
42 | item_loader = ArticleItemLoader(item=ArticleSpiderfreebuf(), response=response)
43 | item_loader.add_css("title",".articlecontent .title h2::text")
44 | item_loader.add_css("author",".property .name a::text")
45 | item_loader.add_css("create_date",".property .time::text")
46 | item_loader.add_value("url",response.url)
47 | item_loader.add_value("url_id",get_md5(response.url))
48 | item_loader.add_css("tags",".property .tags a::text")
49 | item_loader.add_value("image_url",[image_url])
50 | item_loader.add_css("watch_num",".property .look strong::text")
51 | if len(response.css(".main-tit02 h3 span::text").extract()) != 0:
52 | item_loader.add_css("comment_num",".main-tit02 h3 span::text")
53 | else:
54 | item_loader.add_value("comment_num","0")
55 | item_loader.add_css("content","#contenttxt")
56 | article_item = item_loader.load_item()
57 | yield article_item
58 |
59 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from fake_useragent import UserAgent
10 |
11 | class TechnicalArticalSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | from scrapy.http import HtmlResponse
60 |
61 | class ChromMiddleware(object):
62 | """
63 | Fetch the request with a Chrome browser (via selenium).
64 | """
65 | def process_request(self,request,spider):
66 | if request.url.startswith("https://www.anquanke.com/post/id/") and spider.name == "anquanke360":
67 | spider.browser.get(request.url)
68 | import time
69 | time.sleep(3)
70 | return HtmlResponse(url=spider.browser.current_url,body=spider.browser.page_source,encoding="utf-8", request=request)
71 |
72 | class RandomUserAgentMiddleware(object):
73 | """
74 | Attach a random User-Agent header to every request.
75 | """
76 | def __init__(self,crawl):
77 | super(RandomUserAgentMiddleware, self).__init__()
78 | self.ua = UserAgent()
79 | self.ua_type = crawl.settings.get("RANDOM_UA_TYPE","random")
80 |
81 | @classmethod
82 | def from_crawler(cls,crawler):
83 | return cls(crawler)
84 |
85 | def process_request(self,request,spider):
86 | def get_ua():
87 | return getattr(self.ua,self.ua_type)
88 |
89 | request.headers.setdefault('User-Agent',get_ua())
90 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/a4hou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | from urllib import parse
5 | from Technical_Artical_Spider.items import ArticleSpider4hou,ArticleItemLoader
6 | from Technical_Artical_Spider.utils.common import get_md5
7 | class A4houSpider(scrapy.Spider):
8 | name = '4hou'
9 | allowed_domains = ['www.4hou.com']
10 | start_urls = ['http://www.4hou.com/page/1']
11 | #start_urls = ['http://www.4hou.com/vulnerable/8663.html']
12 | headers = {
13 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
14 | }
15 | urls = {}
16 |
17 | def parse(self, response):
18 | # extract the url of the next page
19 | next_url = response.css(".post-read-more-new a::attr(href)").extract()[0]
20 | if next_url:
21 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),headers=self.headers,callback=self.parse)
22 |
23 | # extract all article URLs on the page
24 | Article_Boxs = response.css(".main-box .ehover1")
25 | for Article_box in Article_Boxs:
26 | Article_url = Article_box.css(".new_img_title::attr(href)").extract_first("")
27 | # keep only technical articles, skip news
28 | match_obj = re.match("(.*4hou.com/(technology|reverse|penetration|web|vulnerable)/(\d+)\.html$)", Article_url)
29 | if match_obj:
30 | Image_url = Article_box.css(".new_img .wp-post-image::attr(data-original)").extract_first("")
31 | yield scrapy.Request(url = parse.urljoin(response.url,Article_url),
32 | headers=self.headers
33 | ,meta={"image_url":parse.urljoin(response.url,Image_url)}
34 | ,callback=self.parse_detail)
35 |
36 | def parse_detail(self,response):
37 | image_url = response.meta.get("image_url","") # article cover image
38 | item_loader =ArticleItemLoader(item=ArticleSpider4hou(),response=response)
39 | item_loader.add_css("title",".art_title::text")
40 | item_loader.add_css("create_date",".art_time::text")
41 | item_loader.add_value("url",response.url)
42 | item_loader.add_value("url_id",get_md5(response.url))
43 | item_loader.add_css("author",".article_author_name .upload-img::text")
44 | item_loader.add_xpath('tags',"//*[@class='art_nav']/a[2]/text()")
45 | item_loader.add_value('image_url',[image_url])
46 | item_loader.add_css("watch_num",".newtype .read span::text")
47 | item_loader.add_css("comment_num",".newtype .comment span::text")
48 | item_loader.add_css("praise_nums",".newtype .Praise span::text")
49 | item_loader.add_css("content",".article_cen")
50 | # images referenced in the article body
51 | if response.css(".article_cen img::attr(data-original)").extract():
52 | item_loader.add_css("ArticlecontentImage",".article_cen img::attr(data-original)")
53 | else:
54 | imgs = re.findall('<img([\s\S].*?)>', response.body.decode('utf-8')) # fall back to pulling <img ...> tags out of the raw HTML
55 | #TODO imgs = ['...']
56 | imgs = [re.search('src="(/.*/.*/.*?)"',i).group(1) for i in imgs if re.search('src="(.*?)"', i)]
57 | item_loader.add_value("ArticlecontentImage",imgs)
58 | article_item = item_loader.load_item()
59 | yield article_item
60 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/anquanke360.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from Technical_Artical_Spider.items import ArticleSpideranquanke
5 | from urllib import parse
6 | from selenium import webdriver
7 | from scrapy.xlib.pydispatch import dispatcher
8 | from scrapy import signals
9 | from Technical_Artical_Spider.settings import EXECUTABLE_PATH
10 | #from pyvirtualdisplay import Display
11 | from scrapy_splash import SplashRequest
12 |
13 | from scrapy_splash import SplashMiddleware
14 | import re
15 | class Anquanke360Spider(scrapy.Spider):
16 | name = 'anquanke360'
17 | allowed_domains = ['anquanke.com']
18 | start_urls = ['https://api.anquanke.com/data/v1/posts?page=2&size=10&category=knowledge/']
19 | headers_api = {
20 | "HOST": "api.anquanke.com",
21 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
22 | }
23 | headers_article = {
24 | "HOST": "www.anquanke.com",
25 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
26 | }
27 | """
28 | def __init__(self):
29 | # disable image loading in Chrome
30 | chrome_opt = webdriver.ChromeOptions()
31 | prefs = {"profile.managed_default_content_settings.images":2}
32 | chrome_opt.add_experimental_option("prefs",prefs)
33 | # run without a visible display
34 | display = Display(visible=0,size=(800,600))
35 | display.start()
36 | self.browser = webdriver.Chrome(executable_path=EXECUTABLE_PATH,chrome_options=chrome_opt)
37 | super(Anquanke360Spider,self).__init__()
38 | dispatcher.connect(self.spider_close,signals.spider_closed)
39 |
40 | def spider_close(self,spider):
41 | self.browser.quit()
42 | """
43 |
44 | def parse(self, response):
45 | article_json = json.loads(response.text)
46 | next_url = article_json["next"]
47 |
48 | for data in article_json["data"]:
49 | url = "https://www.anquanke.com/post/id/"+str(data["id"])
50 | title = data["title"]
51 | title_start = re.search("(^\d{1,2}月\d{1,2}日)",title)
52 | if not title_start:
53 | cover_image = data["cover"]
54 | item = ArticleSpideranquanke()
55 | item["id"] = data["id"]
56 | item["url"] = url
57 | item["title"] = title
58 | item["create_time"] = data["date"].split(" ")[0]
59 | item["image_url"] = [cover_image]
60 | item["watch_num"] = data["pv"]
61 | tags_list = data["tags"]
62 | item["tags"] = ",".join(tags_list)
63 | item["author"] = data["author"]["nickname"]
64 | """
65 | yield scrapy.Request(url,
66 | headers=self.headers_article,
67 | meta={"image_url": parse.urljoin(response.url, cover_image)},
68 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2))
69 | """
70 | yield SplashRequest(url,
71 | meta={"image_url": parse.urljoin(response.url, cover_image)},
72 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2))
73 | if next_url:
74 | yield scrapy.Request(next_url,headers=self.headers_api,callback=self.parse)
75 |
76 |
77 | def parse_detail(self,response,item):
78 | item["content"] = response.xpath("//div[@class='article-content']").extract()[0]
79 | item["comment_num"] = int(response.css(".comment-list-area h1 span::text").extract()[0])
80 | item['ArticlecontentImage'] = response.css(".aligncenter::attr(data-original)").extract()
81 | yield item
82 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | # Scrapy settings for Technical_Artical_Spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Technical_Artical_Spider'
13 |
14 | SPIDER_MODULES = ['Technical_Artical_Spider.spiders']
15 | NEWSPIDER_MODULE = 'Technical_Artical_Spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'Technical_Artical_Spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 1
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | #'Technical_Artical_Spider.middlewares.ChromMiddleware': 1,
57 | 'Technical_Artical_Spider.middlewares.RandomUserAgentMiddleware': 1,
58 | 'scrapy_splash.SplashCookiesMiddleware': 723,
59 | 'scrapy_splash.SplashMiddleware': 725,
60 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
61 | }
62 | SPLASH_URL = 'http://localhost:8050'
63 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
64 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
65 | # Enable or disable extensions
66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
67 | #EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | #}
70 |
71 | # Configure item pipelines
72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
73 |
74 | ITEM_PIPELINES = {
75 | #'Technical_Artical_Spider.pipelines.MysqlTwistedPipline': 30,
76 | 'Technical_Artical_Spider.pipelines.ElasticsearchPipline': 30,
77 | 'Technical_Artical_Spider.pipelines.ArticleImagePipeline': 1,
78 | #'Technical_Artical_Spider.pipelines.ArticlecontentImagePipline': 10,
79 | #'Technical_Artical_Spider.pipelines.ArticleHTMLreplacePipline': 20,
80 | }
81 |
82 | IMAGES_URLS_FIELD = "image_url"
83 | project_dir = os.path.abspath(os.path.dirname(__file__))
84 | IMAGES_STORE = os.path.join(project_dir, 'images')
85 | # Enable and configure the AutoThrottle extension (disabled by default)
86 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
87 | AUTOTHROTTLE_ENABLED = True
88 | # The initial download delay
89 | AUTOTHROTTLE_START_DELAY = 2
90 | # The maximum download delay to be set in case of high latencies
91 | AUTOTHROTTLE_MAX_DELAY = 60
92 | # The average number of requests Scrapy should be sending in parallel to
93 | # each remote server
94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
95 | # Enable showing throttling stats for every response received:
96 | #AUTOTHROTTLE_DEBUG = False
97 |
98 | # Enable and configure HTTP caching (disabled by default)
99 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
100 | #HTTPCACHE_ENABLED = True
101 | #HTTPCACHE_EXPIRATION_SECS = 0
102 | #HTTPCACHE_DIR = 'httpcache'
103 | #HTTPCACHE_IGNORE_HTTP_CODES = []
104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
105 |
106 | MYSQL_HOST = "127.0.0.1"
107 | MYSQL_DBNAME = "dbname"
108 | MYSQL_USER = "root"
109 | MYSQL_PASSWORD = "password"
110 |
111 | EXECUTABLE_PATH = "D:\\chromedriver" # path to chromedriver (used by ChromMiddleware)
112 | RANDOM_UA_TYPE = "random" # fake_useragent type read by RandomUserAgentMiddleware
113 |
114 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | #import MySQLdb
8 | #import MySQLdb.cursors
9 | import scrapy
10 | import re
11 | from twisted.enterprise import adbapi
12 | from scrapy.pipelines.images import ImagesPipeline
13 | from Technical_Artical_Spider.items import ArticleSpideranquanke,ArticleSpider4hou,ArticleSpiderfreebuf
14 |
15 | class TechnicalArticalSpiderPipeline(object):
16 | def process_item(self, item, spider):
17 | return item
18 |
19 | # insert into mysql asynchronously using twisted
20 | """
21 | class MysqlTwistedPipline(object):
22 | def __init__(self, dbpool):
23 | self.dbpool = dbpool
24 |
25 | @classmethod
26 | def from_settings(cls, settings):
27 | dbparms = dict(
28 | host = settings["MYSQL_HOST"],
29 | db = settings["MYSQL_DBNAME"],
30 | user = settings["MYSQL_USER"],
31 | passwd = settings["MYSQL_PASSWORD"],
32 | charset='utf8',
33 | cursorclass=MySQLdb.cursors.DictCursor,
34 | use_unicode=True,
35 | )
36 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
37 |
38 | return cls(dbpool)
39 |
40 | def process_item(self, item, spider):
41 | # run the mysql insert asynchronously via twisted
42 | query = self.dbpool.runInteraction(self.do_insert, item)
43 | query.addErrback(self.handle_error, item, spider) # handle insert errors
44 |
45 | def handle_error(self, failure, item, spider):
46 | # handle exceptions raised by the async insert
47 | print(failure)
48 | def do_insert(self, cursor, item):
49 | # perform the actual insert
50 | # build the item-specific sql statement and insert it into mysql
51 | insert_sql, params = item.get_insert_sql()
52 | cursor.execute(insert_sql, params)
53 | """
54 |
55 | # write data to elasticsearch
56 | class ElasticsearchPipline(object):
57 | # save the item into es
58 | def process_item(self,item,spider):
59 | # improve code performance
60 | item.save_to_es()
61 | return item
62 |
63 | class ImagesavepathPipline(ImagesPipeline):
64 | path = "image"
65 |
66 | def file_path(self, request, response=None, info=None):
67 | image = request.url.split('/')[-1]
68 | path = self.path
69 | return '%s/%s' % (path,image)
70 |
71 | # handle article cover images
72 | class ArticleImagePipeline(ImagesavepathPipline):
73 | Cover_image = "image_url"
74 |
75 | def get_media_requests(self, item, info):
76 | if isinstance(item,ArticleSpideranquanke):
77 | self.path = "Cover_images_anquanke"
78 | elif isinstance(item,ArticleSpider4hou):
79 | self.path = "Cover_images_4hou"
80 | elif isinstance(item,ArticleSpiderfreebuf):
81 | self.path = "Cover_images_freebuf"
82 | if len(item[self.Cover_image]):
83 | if isinstance(item,ArticleSpider4hou):
84 | for image_content_url in item[self.Cover_image]:
85 | yield scrapy.Request(image_content_url.split("?")[0])
86 | else:
87 | for image_content_url in item[self.Cover_image]:
88 | yield scrapy.Request(image_content_url)
89 |
90 | def item_completed(self, results, item, info):
91 | if self.Cover_image in item:
92 | for ok, value in results:
93 | image_file_path = value["path"]
94 | item[self.Cover_image] = image_file_path
95 | return item
96 |
97 | # download images referenced in articles
98 | class ArticlecontentImagePipline(ImagesavepathPipline):
99 | contentImage = "ArticlecontentImage"
100 | def get_media_requests(self, item, info):
101 | if isinstance(item,ArticleSpideranquanke):
102 | self.path = "Content_images_anquanke"
103 | elif isinstance(item,ArticleSpider4hou):
104 | self.path = "Content_images_4hou"
105 | if len(item[self.contentImage]):
106 | for image_content_url in item[self.contentImage]:
107 | yield scrapy.Request(image_content_url)
108 |
109 | def item_completed(self, results, item, info):
110 | return_list = []
111 | if self.contentImage in item:
112 | for ok,value in results:
113 | image_content_path = value["path"]
114 | return_list.append(image_content_path)
115 | item[self.contentImage] = return_list
116 | return item
117 | # replace image references inside the article content
118 | class ArticleHTMLreplacePipline(object):
119 | # exchange html <img> tags for the locally downloaded image paths
120 | def process_item(self,item,spider):
121 | if spider.name == "4hou":
122 | itemcontentname = "content"
123 | re_findall = '<img.*?data-original=".*?".*?>' # <img> tags in the 4hou article body
124 | re_sub = '<img.*?data-original=".*?".*?>'
125 | re_replace = '<img src="{}">' # the local image path is formatted in below
126 | contentImage = "ArticlecontentImage"
127 | elif spider.name=="anquanke360":
128 | itemcontentname = "content"
129 | re_findall = '<img.*?data-original=".*?".*?>' # <img> tags in the anquanke article body
130 | re_sub = '<img.*?data-original=".*?".*?>'
131 | re_replace = '<img src="{}">'
132 | contentImage = "ArticlecontentImage"
133 | if itemcontentname not in item:
134 | return item
135 | content = item[itemcontentname]
136 | sum = len(re.findall(re_findall,content))
137 | if sum != len(item[contentImage]):
138 | return item
139 | if item[contentImage]:
140 | for exf in range(sum):
141 | html = item[contentImage][exf]
142 | html = re_replace.format(html)
143 | content = re.sub(re_sub,html,content,1)
144 |
145 | item["content"] = content
146 |
147 | return item
--------------------------------------------------------------------------------
/Technical_Artical_Spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | from scrapy.loader import ItemLoader
8 | from scrapy.loader.processors import TakeFirst,MapCompose,Join,Identity
9 | from Technical_Artical_Spider.models.elaticsearch_type_4hou import Article_4houType
10 | from Technical_Artical_Spider.models.elaticsearch_type_anquanke import Article_anquankeType
11 | from Technical_Artical_Spider.models.elaticsearch_type_freebuf import Article_freebuf
12 | from elasticsearch_dsl.connections import connections
13 | es_4hou = connections.create_connection(Article_4houType._doc_type.using)
14 | es_anquanke = connections.create_connection(Article_anquankeType._doc_type.using)
15 | es_freebuf = connections.create_connection(Article_freebuf._doc_type.using)
16 | import scrapy
17 |
18 | def gen_suggests(es,index,info_tuple):
19 | # generate search-suggestion data from the given (text, weight) pairs
20 | used_words = set() # for de-duplication
21 | suggests = []
22 | for text,weight in info_tuple:
23 | if text:
24 | # call the es analyze API to tokenize the string
25 | words = es.indices.analyze(index=index,analyzer="ik_max_word",params={'filter':["lowercase"]},body=text)
26 | anylyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"])>1])
27 | new_words = anylyzed_words - used_words
28 | else:
29 | new_words = set()
30 |
31 | if new_words:
32 | suggests.append({"input":list(new_words),"weight":weight})
33 |
34 | return suggests
35 |
36 |
37 | class TechnicalArticalSpiderItem(scrapy.Item):
38 | # define the fields for your item here like:
39 | # name = scrapy.Field()
40 | pass
41 |
42 | class ArticleItemLoader(ItemLoader):
43 | # custom item loader
44 | default_output_processor = TakeFirst()
45 |
46 |
47 | def splitspace(value):
48 | value = value.strip()
49 | value = value.replace('\n','')
50 | value = value.replace('\r','')
51 | return value
52 |
53 | def remove_comma(value):
54 | if "," in value:
55 | return value.replace(",","")
56 | else:
57 | return value
58 |
59 | def remove_Keywords(value):
60 | if "发布" in value:
61 | value = value.replace("发布", "")
62 | if "前" in value:
63 | #now_time = time.strftime("%Y-%m-%d")
64 | import time
65 | now_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))
66 | return now_time
67 | else:
68 | time = value.replace("年","-").replace("月","-").replace("日","")
69 | return time
70 |
71 | def return_value(value):
72 | return value
73 |
74 | def return_intvalue(value):
75 | value = int(value)
76 | return value
77 |
78 | def seturl(value):
79 | if value == None:
80 | return value
81 | elif value.startswith("http://") or value.startswith("https://"):
82 | return value
83 | else:
84 | return "http://www.4hou.com"+value
85 | def listtransstr(value):
86 | return "".join(value)
87 |
88 | # 4hou (嘶吼) article item
89 | class ArticleSpider4hou(scrapy.Item):
90 | image_local = scrapy.Field() # local path of the cover image
91 | image_url =scrapy.Field(
92 | output_processor=MapCompose(return_value)
93 | ) # cover image url
94 | title = scrapy.Field() # article title
95 | create_date = scrapy.Field(
96 | input_processor=MapCompose(remove_Keywords),
97 | ) # publish date
98 | url = scrapy.Field() # original article url
99 | url_id = scrapy.Field() # md5 of the url, used as the primary key
100 | author = scrapy.Field(
101 | input_processor =MapCompose(splitspace),
102 | ) # author
103 | tags = scrapy.Field() # tags
104 | watch_num = scrapy.Field(
105 | input_processor=MapCompose(remove_comma,return_intvalue),
106 | ) # view count
107 | comment_num = scrapy.Field(
108 | input_processor=MapCompose(remove_comma,return_intvalue),
109 | ) # comment count
110 | praise_nums =scrapy.Field(
111 | input_processor=MapCompose(remove_comma,return_intvalue),
112 | ) # praise (like) count
113 | content = scrapy.Field() # article body
114 | # images embedded in the article body
115 | ArticlecontentImage = scrapy.Field(
116 | input_processor = MapCompose(seturl),
117 | output_processor = Identity(),
118 |
119 | )
120 |
121 | # TODO: display comment information
122 | def get_insert_sql(self):
123 | insert_sql = """
124 | insert into 4hou_Article(
125 | image_local,
126 | title,
127 | url_id,
128 | create_date,
129 | url,
130 | author,
131 | tags,
132 | watch_num,
133 | comment_num,
134 | praise_nums,
135 | content
136 | )
137 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num),
138 | comment_num=VALUES(comment_num),praise_nums=VALUES(praise_nums)
139 | """
140 | params= (
141 | self["image_url"],
142 | self["title"],
143 | self["url_id"],
144 | self["create_date"],
145 | self["url"],
146 | self["author"],
147 | self["tags"],
148 | self["watch_num"],
149 | self["comment_num"],
150 | self["praise_nums"],
151 | self["content"]
152 | )
153 | return insert_sql,params
154 | # write the item into es
155 | def save_to_es(self):
156 | article = Article_4houType()
157 | article.image_local = self["image_url"]
158 | article.title = self["title"]
159 | article.url_id = self["url_id"]
160 | article.create_time = self["create_date"]
161 | article.url = self["url"]
162 | article.author = self["author"]
163 | article.tags = self["tags"]
164 | article.watch_nums = self["watch_num"]
165 | article.comment_nums = self["comment_num"]
166 | article.praise_nums = self["praise_nums"]
167 | article.content = self["content"]
168 | article.suggest = gen_suggests(es_4hou,Article_4houType._doc_type.index,((article.title,10),(article.tags,7)))
169 | article.save()
170 |
171 | return
172 |
173 | # Freebuf article item
174 | class ArticleSpiderfreebuf(scrapy.Item):
175 | image_local = scrapy.Field() # local path of the cover image
176 | image_url = scrapy.Field(
177 | output_processor=MapCompose(return_value)
178 | ) # cover image url
179 | title = scrapy.Field() # article title
180 | create_date = scrapy.Field() # publish date
181 | url = scrapy.Field() # original article url
182 | url_id = scrapy.Field() # md5 of the url, used as the primary key
183 | author = scrapy.Field() # author
184 | tags = scrapy.Field(
185 | output_processor=MapCompose(listtransstr)
186 | ) # tags
187 | watch_num = scrapy.Field(
188 | input_processor=MapCompose(return_intvalue)
189 | ) # view count
190 | comment_num = scrapy.Field(
191 | input_processor=MapCompose(return_intvalue)
192 | ) # comment count
193 | content = scrapy.Field() # article body
194 |
195 | def save_to_es(self):
196 | article = Article_freebuf()
197 | article.image_local = self["image_url"]
198 | article.title = self["title"]
199 | article.url_id = self["url_id"]
200 | article.create_time = self["create_date"]
201 | article.url = self["url"]
202 | article.author = self["author"]
203 | article.tags = self["tags"]
204 | article.watch_nums = self["watch_num"]
205 | if self["comment_num"]:
206 | article.comment_nums = self["comment_num"]
207 | else:
208 | article.comment_nums = 0
209 | article.content = self["content"]
210 | article.suggest = gen_suggests(es_freebuf,Article_freebuf._doc_type.index,((article.title,10),(article.tags,7)))
211 | article.save()
212 |
213 | return
214 |
215 | # anquanke (安全客) article item
216 | class ArticleSpideranquanke(scrapy.Item):
217 | id = scrapy.Field()
218 | url = scrapy.Field()
219 | title = scrapy.Field()
220 | create_time= scrapy.Field()
221 | image_url = scrapy.Field()
222 | image_local = scrapy.Field()
223 | watch_num = scrapy.Field()
224 | tags = scrapy.Field()
225 | author = scrapy.Field()
226 | comment_num = scrapy.Field()
227 | content = scrapy.Field()
228 | ArticlecontentImage = scrapy.Field()
229 |
230 | def get_insert_sql(self):
231 | insert_sql = """
232 | insert into anquanke_article(
233 | id,
234 | url,
235 | title,
236 | create_time,
237 | cover_local,
238 | watch_num,
239 | tags,
240 | author,
241 | comment_num,
242 | content
243 | )
244 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num),
245 | comment_num=VALUES(comment_num)
246 | """
247 | params = (
248 | self["id"],
249 | self["url"],
250 | self["title"],
251 | self["create_time"],
252 | self["image_url"],
253 | self["watch_num"],
254 | self["tags"],
255 | self["author"],
256 | self["comment_num"],
257 | self["content"]
258 | )
259 | return insert_sql, params
260 | # write the item into es
261 | def save_to_es(self):
262 | article = Article_anquankeType()
263 | article.id = self["id"]
264 | article.url = self["url"]
265 | article.title = self["title"]
266 | article.create_time = self["create_time"]
267 | article.cover_local = self["image_url"]
268 | article.watch_num = self["watch_num"]
269 | article.tags = self["tags"]
270 | article.author = self["author"]
271 | article.comment_num = self["comment_num"]
272 | article.content = self["content"]
273 | article.suggest = gen_suggests(es_anquanke,Article_anquankeType._doc_type.index,((article.title,10),(article.tags,7)))
274 | article.save()
275 |
276 | return
277 |
278 |
--------------------------------------------------------------------------------
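The `suggest` field populated by `gen_suggests` above is a Completion field, so it can be queried with an elasticsearch completion suggester. A minimal sketch of such a lookup against the 4hou index using the low-level client (the prefix value is illustrative; the actual search frontend lives in the Article_Search project referenced in the README):

```python
from elasticsearch import Elasticsearch

client = Elasticsearch(["localhost"])
body = {
    "suggest": {
        "article_suggest": {
            "prefix": "xss",  # illustrative user input to auto-complete
            "completion": {"field": "suggest", "size": 10},
        }
    }
}
resp = client.search(index="teachnical_4hou", body=body)
for option in resp["suggest"]["article_suggest"][0]["options"]:
    print(option["text"])
```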
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------