├── Technical_Artical_Spider ├── __init__.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── common.cpython-36.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── elaticsearch_type_4hou.cpython-36.pyc │ │ └── elaticsearch_type_anquanke.cpython-36.pyc │ ├── common.py │ ├── elaticsearch_type_anquanke.py │ ├── elaticsearch_type_freebuf.py │ └── elaticsearch_type_4hou.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── common.cpython-35.pyc │ │ ├── common.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ └── __init__.cpython-36.pyc │ └── common.py ├── __init__.pyc ├── settings.pyc ├── spiders │ ├── a4hou.pyc │ ├── __init__.pyc │ ├── __pycache__ │ │ ├── a4hou.cpython-35.pyc │ │ ├── a4hou.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── anquanke360.cpython-36.pyc │ ├── __init__.py │ ├── freebuf.py │ ├── a4hou.py │ └── anquanke360.py ├── __pycache__ │ ├── items.cpython-35.pyc │ ├── items.cpython-36.pyc │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── pipelines.cpython-35.pyc │ ├── pipelines.cpython-36.pyc │ ├── settings.cpython-35.pyc │ ├── settings.cpython-36.pyc │ └── middlewares.cpython-36.pyc ├── images │ └── Cover_images_4hou │ │ └── 1f208f97337379677734.jpg ├── main.py ├── middlewares.py ├── settings.py ├── pipelines.py └── items.py ├── .idea ├── vcs.xml ├── misc.xml ├── sqldialects.xml ├── modules.xml ├── Technical_Artical_Spider.iml └── workspace.xml ├── scrapy.cfg ├── anquanke_article-struct.sql ├── 4hou_Article-struct.sql └── README.md /Technical_Artical_Spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__init__.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/settings.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/a4hou.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/a4hou.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__init__.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/items.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc -------------------------------------------------------------------------------- 
/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- 
/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc -------------------------------------------------------------------------------- /Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc -------------------------------------------------------------------------------- /.idea/sqldialects.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import hashlib 4 | 5 | def get_md5(url): 6 | if isinstance(url, str): 7 | url = url.encode("utf-8") 8 | m = hashlib.md5() 9 | m.update(url) 10 | return m.hexdigest() -------------------------------------------------------------------------------- /Technical_Artical_Spider/main.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | import sys 4 | import os 5 | 6 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 7 | #execute(["scrapy","crawl","4hou"]) 8 | #execute(["scrapy","crawl","anquanke360"]) 9 | execute(["scrapy","crawl","freebuf"]) -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/common.py: -------------------------------------------------------------------------------- 1 | 2 | #据说是个bug,反正就是要这样写 3 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 4 | 5 | class CustomAnalyzer(_CustomAnalyzer): 6 | def 
get_analysis_definition(self): 7 | return {} 8 | 9 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Technical_Artical_Spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Technical_Artical_Spider 12 | -------------------------------------------------------------------------------- /.idea/Technical_Artical_Spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/elaticsearch_type_anquanke.py: -------------------------------------------------------------------------------- 1 | from Technical_Artical_Spider.models.common import ik_analyzer 2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 3 | from elasticsearch_dsl.connections import connections 4 | connections.create_connection(hosts=["localhost"]) 5 | 6 | class Article_anquankeType(DocType): 7 | suggest = Completion(analyzer=ik_analyzer) #搜索建议 8 | id = Integer() 9 | url = Keyword() 10 | title = Text(analyzer="ik_max_word") 11 | create_time = Date() 12 | cover_local = Keyword() 13 | watch_num = Integer() 14 | comment_num = Integer() 15 | tags = Text(analyzer="ik_max_word") 16 | author = Keyword() 17 | content = Text(analyzer="ik_max_word") 18 | 19 | class Meta: 20 | index = "article_anquanke" 21 | doc_type = "anquanke" 22 | 23 | 24 | if __name__ == "__main__": 25 | Article_anquankeType.init() 26 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/elaticsearch_type_freebuf.py: -------------------------------------------------------------------------------- 1 | from Technical_Artical_Spider.models.common import ik_analyzer 2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 3 | from elasticsearch_dsl.connections import connections 4 | connections.create_connection(hosts=["localhost"]) 5 | 6 | 7 | class Article_freebuf(DocType): 8 | suggest = Completion(analyzer=ik_analyzer) #搜索建议 9 | image_local = Keyword() 10 | title = Text(analyzer="ik_max_word") 11 | url_id = Keyword() 12 | create_time = Date() 13 | url = Keyword() 14 | author = Keyword() 15 | tags = Text(analyzer="ik_max_word") 16 | watch_nums = Integer() 17 | comment_nums = Integer() 18 | content = Text(analyzer="ik_max_word") 19 | 20 | class Meta: 21 | index = "teachnical_freebuf" 22 | doc_type = "freebuf" 23 | 24 | 25 | if __name__ == "__main__": 26 | Article_freebuf.init() 27 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/models/elaticsearch_type_4hou.py: -------------------------------------------------------------------------------- 1 | from 
Technical_Artical_Spider.models.common import ik_analyzer 2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 3 | from elasticsearch_dsl.connections import connections 4 | connections.create_connection(hosts=["localhost"]) 5 | 6 | 7 | class Article_4houType(DocType): 8 | suggest = Completion(analyzer=ik_analyzer) #搜索建议 9 | image_local = Keyword() 10 | title = Text(analyzer="ik_max_word") 11 | url_id = Keyword() 12 | create_time = Date() 13 | url = Keyword() 14 | author = Keyword() 15 | tags = Text(analyzer="ik_max_word") 16 | watch_nums = Integer() 17 | comment_nums = Integer() 18 | praise_nums = Integer() 19 | content = Text(analyzer="ik_max_word") 20 | 21 | class Meta: 22 | index = "teachnical_4hou" 23 | doc_type = "A_4hou" 24 | 25 | 26 | if __name__ == "__main__": 27 | Article_4houType.init() 28 | -------------------------------------------------------------------------------- /anquanke_article-struct.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : TT_ubuntu16.04 5 | Source Server Version : 50720 6 | Source Host : 192.168.123.66:3306 7 | Source Database : ArticleSpider 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50720 11 | File Encoding : 65001 12 | 13 | Date: 2017-12-15 20:56:57 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for anquanke_article 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `anquanke_article`; 22 | CREATE TABLE `anquanke_article` ( 23 | `id` int(32) NOT NULL, 24 | `url` varchar(255) COLLATE utf8_bin NOT NULL, 25 | `title` varchar(50) COLLATE utf8_bin NOT NULL, 26 | `create_time` date NOT NULL, 27 | `cover_local` varchar(255) COLLATE utf8_bin NOT NULL, 28 | `watch_num` int(32) DEFAULT '0', 29 | `tags` varchar(255) COLLATE utf8_bin NOT NULL, 30 | `author` varchar(255) COLLATE utf8_bin NOT NULL, 31 | `comment_num` int(32) DEFAULT '0', 32 | `content` longtext COLLATE utf8_bin NOT NULL, 33 | PRIMARY KEY (`id`) 34 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; 35 | -------------------------------------------------------------------------------- /4hou_Article-struct.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : TT_ubuntu16.04 5 | Source Server Version : 50720 6 | Source Host : 192.168.250.66:3306 7 | Source Database : ArticleSpider 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50720 11 | File Encoding : 65001 12 | 13 | Date: 2017-12-05 15:03:35 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for 4hou_Article 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `4hou_Article`; 22 | CREATE TABLE `4hou_Article` ( 23 | `image_local` varchar(255) COLLATE utf8_bin NOT NULL, 24 | `image_url` varchar(255) COLLATE utf8_bin NOT NULL, 25 | `title` varchar(200) COLLATE utf8_bin NOT NULL, 26 | `url_id` varchar(32) COLLATE utf8_bin NOT NULL, 27 | `create_date` date DEFAULT NULL, 28 | `url` varchar(100) COLLATE utf8_bin NOT NULL, 29 | `author` varchar(200) COLLATE utf8_bin NOT NULL, 30 | `tags` varchar(50) COLLATE utf8_bin NOT NULL, 31 | `watch_num` int(10) DEFAULT '0' COMMENT '0', 32 | `comment_num` int(10) DEFAULT '0', 33 | `praise_nums` int(10) DEFAULT '0', 34 | `content` longtext COLLATE utf8_bin NOT NULL, 35 | 
PRIMARY KEY (`url_id`) 36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC; 37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Technical_Article_Spider
2 |
3 | #### A crawler for technical articles from Chinese technical sites
4 |
5 | To make it easier to build a search engine later on, the project switched to `elasticsearch`
6 |
7 | Development environment:
8 |
9 | - python3
10 |
11 |
12 | - Scrapy ==1.4.0
13 | - elasticsearch-rtf
14 | - docker
15 |
16 | #### Installing the latest version
17 |
18 | - Linux install
19 |
20 | > sudo apt-get install python3-pip git xvfb
21 | >
22 | > sudo pip3 install scrapy mysqlclient scrapy-splash fake-useragent
23 | >
24 | > git clone https://github.com/medcl/elasticsearch-rtf.git
25 | >
26 | > git clone https://github.com/smile0304/Technical_Article_Spider.git
27 |
28 | - Windows install
29 |
30 | >pip install scrapy pillow mysqlclient scrapy-splash pypiwin32 fake-useragent
31 | >
32 | >git clone https://github.com/smile0304/Technical_Article_Spider.git
33 | >
34 | >git clone https://github.com/medcl/elasticsearch-rtf.git
35 |
36 | The remaining steps are the same on `windows` and `linux`
37 |
38 | You also need to download and install [`docker`](https://www.docker.com/community-edition)
39 |
40 | - Configure a docker registry mirror for mainland China
41 |
42 | > On Linux:
43 | >
44 | > curl -sSL https://get.daocloud.io/daotools/set_mirror.sh | sh -s http://7db66207.m.daocloud.io
45 | >
46 | > On Windows: right-click -> Settings -> Daemon -> Registry mirrors, and add
47 | >
48 | > http://7db66207.m.daocloud.io
49 |
50 | - Pull the image
51 |
52 | > docker pull scrapinghub/splash
53 |
54 | - Run the `scrapinghub/splash` service with docker
55 |
56 | > docker run -p 8050:8050 scrapinghub/splash
57 |
58 | #### Other settings you can tweak
59 |
60 | ```python
61 | AUTOTHROTTLE_ENABLED #whether to throttle requests
62 |
63 | AUTOTHROTTLE_START_DELAY = 2 #initial request delay (requires AUTOTHROTTLE_ENABLED=True)
64 |
65 | AUTOTHROTTLE_MAX_DELAY = 60 #maximum wait when the network is slow (requires AUTOTHROTTLE_ENABLED=True)
66 |
67 | IMAGES_STORE = os.path.join(project_dir, 'images') #'images' is the default directory for downloaded images
68 |
69 | ```
70 |
71 | #### PS:
72 |
73 | The Anquanke (安全客) anti-crawling mechanism has been bypassed; for the search-engine part, please see [Article_Search](https://github.com/smile0304/Article_Search)
74 |
75 | #### Changelog
76 |
77 | - 2017-12-25
78 |   - Bypassed Anquanke's anti-crawling mechanism
79 |   - Dropped `selenium`
80 |   - Added crawling of `freebuf` data
81 |
82 |
83 | - 2017-12-23
84 |   - Added random `User-Agent` headers
85 |
86 |
87 | - 2017-12-18
88 |   - Split the data into separate databases
89 |   - Made the browser run headless
90 |
91 |
92 | - 2017-12-15
93 |   - Dropped `Mysql` as the storage backend
94 |   - Store the data in `elasticsearch` instead
95 |
96 |
97 | - 2017-12-08
98 |   - Added crawling of Anquanke
99 |
100 |   - Finished categorizing the images
101 |
102 |   - Optimized code performance and reduced redundancy
103 |
104 | - 2017-12-05 Initial commit
105 |   - This version only crawls articles from 4hou (嘶吼)
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/freebuf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from urllib import parse
4 | from Technical_Artical_Spider.items import ArticleItemLoader,ArticleSpiderfreebuf
5 | from Technical_Artical_Spider.utils.common import get_md5
6 | class FreebufSpider(scrapy.Spider):
7 | name = 'freebuf'
8 | allowed_domains = ['www.freebuf.com']
9 | start_urls = ['http://www.freebuf.com/vuls',
10 | 'http://www.freebuf.com/sectool',
11 | 'http://www.freebuf.com/articles/web',
12 | 'http://www.freebuf.com/articles/system',
13 | 'http://www.freebuf.com/articles/network',
14 | 'http://www.freebuf.com/articles/wireless',
15 | 'http://www.freebuf.com/articles/terminal',
16 | 'http://www.freebuf.com/articles/database',
17 | 'http://www.freebuf.com/articles/security-management',
18 | 'http://www.freebuf.com/articles/es',
19 | 'http://www.freebuf.com/ics-articles'
20 | ]
21 |
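    # The start URLs above are FreeBuf category listing pages. start_requests() below simply
    # dispatches each of them to parse(), which follows the ".news-more" link for pagination
    # and sends every article found in ".news-detial .news_inner" on to parse_detail(),
    # passing the cover-image URL along in request.meta["image_url"].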
22 | def start_requests(self): 23 | for url in self.start_urls: 24 | yield scrapy.Request(url,callback=self.parse) 25 | 26 | def parse(self, response): 27 | next_url = response.css(".news-more a::attr(href)").extract()[0] 28 | if next_url: 29 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),callback=self.parse) 30 | 31 | Article_Boxs = response.css(".news-detial .news_inner") 32 | for article in Article_Boxs: 33 | Image_url = article.css(".news-img img::attr(src)").extract()[0].split('!')[0] 34 | Article_url = article.css(".news-info a::attr(href)").extract()[0] 35 | yield scrapy.Request(url=parse.urljoin(response.url,Article_url), 36 | meta={"image_url": parse.urljoin(response.url,Image_url)}, 37 | callback=self.parse_detail 38 | ) 39 | 40 | def parse_detail(self,response): 41 | image_url = response.meta.get("image_url", "") # 文章封面图 42 | item_loader = ArticleItemLoader(item=ArticleSpiderfreebuf(), response=response) 43 | item_loader.add_css("title",".articlecontent .title h2::text") 44 | item_loader.add_css("author",".property .name a::text") 45 | item_loader.add_css("create_date",".property .time::text") 46 | item_loader.add_value("url",response.url) 47 | item_loader.add_value("url_id",get_md5(response.url)) 48 | item_loader.add_css("tags",".property .tags a::text") 49 | item_loader.add_value("image_url",[image_url]) 50 | item_loader.add_css("watch_num",".property .look strong::text") 51 | if len(response.css(".main-tit02 h3 span::text").extract()) != 0: 52 | item_loader.add_css("comment_num",".main-tit02 h3 span::text") 53 | else: 54 | item_loader.add_value("comment_num","0") 55 | item_loader.add_css("content","#contenttxt") 56 | article_item = item_loader.load_item() 57 | yield article_item 58 | 59 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from fake_useragent import UserAgent 10 | 11 | class TechnicalArticalSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | from scrapy.http import HtmlResponse 60 | 61 | class ChromMiddleware(object): 62 | """ 63 | 使用Chrome浏览器进行访问 64 | """ 65 | def process_request(self,request,spider): 66 | if request.url.startswith("https://www.anquanke.com/post/id/") and spider.name == "anquanke360": 67 | spider.browser.get(request.url) 68 | import time 69 | time.sleep(3) 70 | return HtmlResponse(url=spider.browser.current_url,body=spider.browser.page_source,encoding="utf-8", request=request) 71 | 72 | class RandomUserAgentMiddleware(object): 73 | """ 74 | 使用任意的User-Agent头 75 | """ 76 | def __init__(self,crawl): 77 | super(RandomUserAgentMiddleware, self).__init__() 78 | self.ua = UserAgent() 79 | self.ua_type = crawl.settings.get("RANDOM_UA_TYPE","random") 80 | 81 | @classmethod 82 | def from_crawler(cls,crawler): 83 | return cls(crawler) 84 | 85 | def process_request(self,request,spider): 86 | def get_ua(): 87 | return getattr(self.ua,self.ua_type) 88 | 89 | request.headers.setdefault('User-Agent',get_ua()) 90 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/a4hou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | from urllib import parse 5 | from Technical_Artical_Spider.items import ArticleSpider4hou,ArticleItemLoader 6 | from Technical_Artical_Spider.utils.common import get_md5 7 | class A4houSpider(scrapy.Spider): 8 | name = '4hou' 9 | allowed_domains = ['www.4hou.com'] 10 | start_urls = ['http://www.4hou.com/page/1'] 11 | #start_urls = ['http://www.4hou.com/vulnerable/8663.html'] 12 | headers = { 13 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0" 14 | } 15 | urls = {} 16 | 17 | def parse(self, response): 18 | #提取出下一页的url 19 | next_url = response.css(".post-read-more-new a::attr(href)").extract()[0] 20 | if next_url: 21 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),headers=self.headers,callback=self.parse) 22 | 23 | #提取出页面中全部的URL 24 | Article_Boxs = response.css(".main-box .ehover1") 25 | for Article_box in Article_Boxs: 26 | Article_url = Article_box.css(".new_img_title::attr(href)").extract_first("") 27 | #过滤出技术文章,不要新闻 28 | match_obj = re.match("(.*4hou.com/(technology|reverse|penetration|web|vulnerable)/(\d+)\.html$)", Article_url) 29 | if match_obj: 30 | Image_url = Article_box.css(".new_img .wp-post-image::attr(data-original)").extract_first("") 31 | yield scrapy.Request(url = parse.urljoin(response.url,Article_url), 32 | headers=self.headers 33 | ,meta={"image_url":parse.urljoin(response.url,Image_url)} 34 | ,callback=self.parse_detail) 35 | 36 | def parse_detail(self,response): 37 | image_url = response.meta.get("image_url","") #文章封面图 38 | item_loader =ArticleItemLoader(item=ArticleSpider4hou(),response=response) 39 | item_loader.add_css("title",".art_title::text") 40 | item_loader.add_css("create_date",".art_time::text") 41 | item_loader.add_value("url",response.url) 42 | item_loader.add_value("url_id",get_md5(response.url)) 43 
| item_loader.add_css("author",".article_author_name .upload-img::text") 44 | item_loader.add_xpath('tags',"//*[@class='art_nav']/a[2]/text()") 45 | item_loader.add_value('image_url',[image_url]) 46 | item_loader.add_css("watch_num",".newtype .read span::text") 47 | item_loader.add_css("comment_num",".newtype .comment span::text") 48 | item_loader.add_css("praise_nums",".newtype .Praise span::text") 49 | item_loader.add_css("content",".article_cen") 50 | #文章中引用的图片 51 | if response.css(".article_cen img::attr(data-original)").extract(): 52 | item_loader.add_css("ArticlecontentImage",".article_cen img::attr(data-original)") 53 | else: 54 | imgs = re.findall('
([\s\S].*)
', response.body.decode('utf-8')) 55 | #TODO imgs = ['1.png'] 56 | imgs = [re.search('src="(/.*/.*/.*?)"',i).group(1) for i in imgs if re.search('src="(.*?)"', i)] 57 | item_loader.add_value("ArticlecontentImage",imgs) 58 | article_item = item_loader.load_item() 59 | yield article_item 60 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/spiders/anquanke360.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json 4 | from Technical_Artical_Spider.items import ArticleSpideranquanke 5 | from urllib import parse 6 | from selenium import webdriver 7 | from scrapy.xlib.pydispatch import dispatcher 8 | from scrapy import signals 9 | from Technical_Artical_Spider.settings import EXECUTABLE_PATH 10 | #from pyvirtualdisplay import Display 11 | from scrapy_splash import SplashRequest 12 | from scrapy_splash import SplashRequest 13 | from scrapy_splash import SplashMiddleware 14 | import re 15 | class Anquanke360Spider(scrapy.Spider): 16 | name = 'anquanke360' 17 | allowed_domains = ['anquanke.com'] 18 | start_urls = ['https://api.anquanke.com/data/v1/posts?page=2&size=10&category=knowledge/'] 19 | headers_api = { 20 | "HOST": "api.anquanke.com", 21 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0" 22 | } 23 | headers_article = { 24 | "HOST": "www.anquanke.com", 25 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0" 26 | } 27 | """ 28 | def __init__(self): 29 | #设置不加载图片 30 | chrome_opt = webdriver.ChromeOptions() 31 | prefs = {"profile.managed_default_content_settings.images":2} 32 | chrome_opt.add_experimental_option("prefs",prefs) 33 | #设置无界面 34 | display = Display(visible=0,size=(800,600)) 35 | display.start() 36 | self.browser = webdriver.Chrome(executable_path=EXECUTABLE_PATH,chrome_options=chrome_opt) 37 | super(Anquanke360Spider,self).__init__() 38 | dispatcher.connect(self.spider_close,signals.spider_closed) 39 | 40 | def spider_close(self,spider): 41 | self.browser.quit() 42 | """ 43 | 44 | def parse(self, response): 45 | article_json = json.loads(response.text) 46 | next_url = article_json["next"] 47 | 48 | for data in article_json["data"]: 49 | url = "https://www.anquanke.com/post/id/"+str(data["id"]) 50 | title = data["title"] 51 | title_start = re.search("(^\d{1,2}月\d{1,2}日)",title) 52 | if not title_start: 53 | cover_image = data["cover"] 54 | item = ArticleSpideranquanke() 55 | item["id"] = data["id"] 56 | item["url"] = url 57 | item["title"] = title 58 | item["create_time"] = data["date"].split(" ")[0] 59 | item["image_url"] = [cover_image] 60 | item["watch_num"] = data["pv"] 61 | tags_list = data["tags"] 62 | item["tags"] = ",".join(tags_list) 63 | item["author"] = data["author"]["nickname"] 64 | """ 65 | yield scrapy.Request(url, 66 | headers=self.headers_article, 67 | meta={"image_url": parse.urljoin(response.url, cover_image)}, 68 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2)) 69 | """ 70 | yield SplashRequest(url, 71 | meta={"image_url": parse.urljoin(response.url, cover_image)}, 72 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2)) 73 | if next_url: 74 | yield scrapy.Request(next_url,headers=self.headers_api,callback=self.parse) 75 | 76 | 77 | def parse_detail(self,response,item): 78 | item["content"] = response.xpath("//div[@class='article-content']").extract()[0] 79 | item["comment_num"] = 
int(response.css(".comment-list-area h1 span::text").extract()[0]) 80 | item['ArticlecontentImage'] = response.css(".aligncenter::attr(data-original)").extract() 81 | yield item 82 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | # Scrapy settings for Technical_Artical_Spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Technical_Artical_Spider' 13 | 14 | SPIDER_MODULES = ['Technical_Artical_Spider.spiders'] 15 | NEWSPIDER_MODULE = 'Technical_Artical_Spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Technical_Artical_Spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 1 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | #'Technical_Artical_Spider.middlewares.ChromMiddleware': 1, 57 | 'Technical_Artical_Spider.middlewares.RandomUserAgentMiddleware': 1, 58 | 'scrapy_splash.SplashCookiesMiddleware': 723, 59 | 'scrapy_splash.SplashMiddleware': 725, 60 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 61 | } 62 | SPLASH_URL = 'http://localhost:8050' 63 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 64 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | 74 | ITEM_PIPELINES = { 75 | #'Technical_Artical_Spider.pipelines.MysqlTwistedPipline': 30, 76 | 'Technical_Artical_Spider.pipelines.ElasticsearchPipline': 30, 77 | 
'Technical_Artical_Spider.pipelines.ArticleImagePipeline': 1, 78 | #'Technical_Artical_Spider.pipelines.ArticlecontentImagePipline': 10, 79 | #'Technical_Artical_Spider.pipelines.ArticleHTMLreplacePipline': 20, 80 | } 81 | 82 | IMAGES_URLS_FIELD = "image_url" 83 | project_dir = os.path.abspath(os.path.dirname(__file__)) 84 | IMAGES_STORE = os.path.join(project_dir, 'images') 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | AUTOTHROTTLE_START_DELAY = 2 90 | # The maximum download delay to be set in case of high latencies 91 | AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | 106 | MYSQL_HOST = "127.0.0.1" 107 | MYSQL_DBNAME = "dbname" 108 | MYSQL_USER = "root" 109 | MYSQL_PASSWORD = "password" 110 | 111 | EXECUTABLE_PATH = "D:\\chromedriver" #设置chrom路径 112 | RANDON_UA_TYPE = "random" #设置使用任意useragent头 113 | 114 | -------------------------------------------------------------------------------- /Technical_Artical_Spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | #import MySQLdb 8 | #import MySQLdb.cursors 9 | import scrapy 10 | import re 11 | from twisted.enterprise import adbapi 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from Technical_Artical_Spider.items import ArticleSpideranquanke,ArticleSpider4hou,ArticleSpiderfreebuf 14 | 15 | class TechnicalArticalSpiderPipeline(object): 16 | def process_item(self, item, spider): 17 | return item 18 | 19 | #使用twised异步机制插入数据库 20 | """ 21 | class MysqlTwistedPipline(object): 22 | def __init__(self, dbpool): 23 | self.dbpool = dbpool 24 | 25 | @classmethod 26 | def from_settings(cls, settings): 27 | dbparms = dict( 28 | host = settings["MYSQL_HOST"], 29 | db = settings["MYSQL_DBNAME"], 30 | user = settings["MYSQL_USER"], 31 | passwd = settings["MYSQL_PASSWORD"], 32 | charset='utf8', 33 | cursorclass=MySQLdb.cursors.DictCursor, 34 | use_unicode=True, 35 | ) 36 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) 37 | 38 | return cls(dbpool) 39 | 40 | def process_item(self, item, spider): 41 | #使用twisted将mysql插入变成异步执行 42 | query = self.dbpool.runInteraction(self.do_insert, item) 43 | query.addErrback(self.handle_error, item, spider) #处理异常 44 | 45 | def handle_error(self, failure, item, spider): 46 | #处理异步插入的异常 47 | print(failure) 48 | def do_insert(self, cursor, item): 49 | #执行具体的插入 50 | #根据不同的item 构建不同的sql语句并插入到mysql中 51 | insert_sql, params = item.get_insert_sql() 52 | cursor.execute(insert_sql, params) 53 | """ 54 | 55 | #将数据写入elsticsearch 56 | class 
ElasticsearchPipline(object): 57 | #将数据写入到es中, 58 | def process_item(self,item,spider): 59 | #提升代码性能 60 | item.save_to_es() 61 | return item 62 | 63 | class ImagesavepathPipline(ImagesPipeline): 64 | path = "image" 65 | 66 | def file_path(self, request, response=None, info=None): 67 | image = request.url.split('/')[-1] 68 | path = self.path 69 | return '%s/%s' % (path,image) 70 | 71 | #文章封面图处理 72 | class ArticleImagePipeline(ImagesavepathPipline): 73 | Cover_image = "image_url" 74 | 75 | def get_media_requests(self, item, info): 76 | if isinstance(item,ArticleSpideranquanke): 77 | self.path = "Cover_images_anquanke" 78 | elif isinstance(item,ArticleSpider4hou): 79 | self.path = "Cover_images_4hou" 80 | elif isinstance(item,ArticleSpiderfreebuf): 81 | self.path = "Cover_images_freebuf" 82 | if len(item[self.Cover_image]): 83 | if isinstance(item,ArticleSpider4hou): 84 | for image_content_url in item[self.Cover_image]: 85 | yield scrapy.Request(image_content_url.split("?")[0]) 86 | else: 87 | for image_content_url in item[self.Cover_image]: 88 | yield scrapy.Request(image_content_url) 89 | 90 | def item_completed(self, results, item, info): 91 | if self.Cover_image in item: 92 | for ok, value in results: 93 | image_file_path = value["path"] 94 | item[self.Cover_image] = image_file_path 95 | return item 96 | 97 | #下载文章图片 98 | class ArticlecontentImagePipline(ImagesavepathPipline): 99 | contentImage = "ArticlecontentImage" 100 | def get_media_requests(self, item, info): 101 | if isinstance(item,ArticleSpideranquanke): 102 | self.path = "Content_images_anquanke" 103 | elif isinstance(item,ArticleSpider4hou): 104 | self.path = "Content_images_4hou" 105 | if len(item[self.contentImage]): 106 | for image_content_url in item[self.contentImage]: 107 | yield scrapy.Request(image_content_url) 108 | 109 | def item_completed(self, results, item, info): 110 | return_list = [] 111 | if self.contentImage in item: 112 | for ok,value in results: 113 | image_content_path = value["path"] 114 | return_list.append(image_content_path) 115 | item[self.contentImage] = return_list 116 | return item 117 | #处理文章中图片的替换 118 | class ArticleHTMLreplacePipline(object): 119 | # exchange html 120 | def process_item(self,item,spider): 121 | if spider.name == "4hou": 122 | itemcontentname = "content" 123 | re_findall = '

$]' 125 | re_replace = '

' 126 | contentImage = "ArticlecontentImage" 127 | elif spider.name=="anquanke360": 128 | itemcontentname = "content" 129 | re_findall = '' 130 | re_sub = '' 131 | re_replace = '

' 132 | contentImage = "ArticlecontentImage" 133 | if itemcontentname not in item: 134 | return item 135 | content = item[itemcontentname] 136 | sum = len(re.findall(re_findall,content)) 137 | if sum != len(item[contentImage]): 138 | return item 139 | if item[contentImage]: 140 | for exf in range(sum): 141 | html = item[contentImage][exf] 142 | html = re_replace.format(html) 143 | content = re.sub(re_sub,html,content,1) 144 | 145 | item["content"] = content 146 | 147 | return item -------------------------------------------------------------------------------- /Technical_Artical_Spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | from scrapy.loader import ItemLoader 8 | from scrapy.loader.processors import TakeFirst,MapCompose,Join,Identity 9 | from Technical_Artical_Spider.models.elaticsearch_type_4hou import Article_4houType 10 | from Technical_Artical_Spider.models.elaticsearch_type_anquanke import Article_anquankeType 11 | from Technical_Artical_Spider.models.elaticsearch_type_freebuf import Article_freebuf 12 | from elasticsearch_dsl.connections import connections 13 | es_4hou = connections.create_connection(Article_4houType._doc_type.using) 14 | es_anquanke = connections.create_connection(Article_anquankeType._doc_type.using) 15 | es_anquanke = connections.create_connection(Article_freebuf._doc_type.using) 16 | import scrapy 17 | 18 | def gen_suggests(es,index,info_tuple): 19 | #根据字符串生生搜索建议数据 20 | used_words = set() #供去重使用 21 | suggests = [] 22 | for text,weight in info_tuple: 23 | if text: 24 | #调用es的analyze接口分析字符串 25 | words = es.indices.analyze(index=index,analyzer="ik_max_word",params={'filter':["lowercase"]},body=text) 26 | anylyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"])>1]) 27 | new_words = anylyzed_words - used_words 28 | else: 29 | new_words = set() 30 | 31 | if new_words: 32 | suggests.append({"input":list(new_words),"weight":weight}) 33 | 34 | return suggests 35 | 36 | 37 | class TechnicalArticalSpiderItem(scrapy.Item): 38 | # define the fields for your item here like: 39 | # name = scrapy.Field() 40 | pass 41 | 42 | class ArticleItemLoader(ItemLoader): 43 | # 自定义itemloader 44 | default_output_processor = TakeFirst() 45 | 46 | 47 | def splitspace(value): 48 | value = value.strip() 49 | value = value.replace('\n','') 50 | value = value.replace('\r','') 51 | return value 52 | 53 | def remove_comma(value): 54 | if "," in value: 55 | return value.replace(",","") 56 | else: 57 | return value 58 | 59 | def remove_Keywords(value): 60 | if "发布" in value: 61 | value = value.replace("发布", "") 62 | if "前" in value: 63 | #now_time = time.strftime("%Y-%m-%d") 64 | import time 65 | now_time = time.strftime('%Y-%m-%d',time.localtime(time.time())) 66 | return now_time 67 | else: 68 | time = value.replace("年","-").replace("月","-").replace("日","") 69 | return time 70 | 71 | def return_value(value): 72 | return value 73 | 74 | def return_intvalue(value): 75 | value = int(value) 76 | return value 77 | 78 | def seturl(value): 79 | if value == None: 80 | return value 81 | elif value.startswith("http://") or value.startswith("https://"): 82 | return value 83 | else: 84 | return "http://www.4hou.com"+value 85 | def listtransstr(value): 86 | return "".join(value) 87 | 88 | #嘶吼文章Item 89 | class ArticleSpider4hou(scrapy.Item): 90 | image_local = 
scrapy.Field() #图片本地地址 91 | image_url =scrapy.Field( 92 | output_processor=MapCompose(return_value) 93 | ) #图片地址 94 | title = scrapy.Field() #文章标题 95 | create_date = scrapy.Field( 96 | input_processor=MapCompose(remove_Keywords), 97 | ) #发布日期 98 | url = scrapy.Field() #原文地址 99 | url_id = scrapy.Field() #经过md5加密过后的url 作为主键 100 | author = scrapy.Field( 101 | input_processor =MapCompose(splitspace), 102 | ) #作者 103 | tags = scrapy.Field() #标签 104 | watch_num = scrapy.Field( 105 | input_processor=MapCompose(remove_comma,return_intvalue), 106 | ) #观看数量 107 | comment_num = scrapy.Field( 108 | input_processor=MapCompose(remove_comma,return_intvalue), 109 | ) #评论数量 110 | praise_nums =scrapy.Field( 111 | input_processor=MapCompose(remove_comma,return_intvalue), 112 | ) #点赞数量 113 | content = scrapy.Field() #文章正文 114 | #文章中的背景图处理 115 | ArticlecontentImage = scrapy.Field( 116 | input_processor = MapCompose(seturl), 117 | output_processor = Identity(), 118 | 119 | ) 120 | 121 | # TODO 评论信息的显示 122 | def get_insert_sql(self): 123 | insert_sql = """ 124 | insert into 4hou_Article( 125 | image_local, 126 | title, 127 | url_id, 128 | create_date, 129 | url, 130 | author, 131 | tags, 132 | watch_num, 133 | comment_num, 134 | praise_nums, 135 | content 136 | ) 137 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num), 138 | comment_num=VALUES(comment_num),praise_nums=VALUES(praise_nums) 139 | """ 140 | params= ( 141 | self["image_url"], 142 | self["title"], 143 | self["url_id"], 144 | self["create_date"], 145 | self["url"], 146 | self["author"], 147 | self["tags"], 148 | self["watch_num"], 149 | self["comment_num"], 150 | self["praise_nums"], 151 | self["content"] 152 | ) 153 | return insert_sql,params 154 | ##将数据写入es 155 | def save_to_es(self): 156 | article = Article_4houType() 157 | article.image_local = self["image_url"] 158 | article.title = self["title"] 159 | article.url_id = self["url_id"] 160 | article.create_time = self["create_date"] 161 | article.url = self["url"] 162 | article.author = self["author"] 163 | article.tags = self["tags"] 164 | article.watch_nums = self["watch_num"] 165 | article.comment_nums = self["comment_num"] 166 | article.praise_nums = self["praise_nums"] 167 | article.content = self["content"] 168 | article.suggest = gen_suggests(es_4hou,Article_4houType._doc_type.index,((article.title,10),(article.tags,7))) 169 | article.save() 170 | 171 | return 172 | 173 | #Freebuf文章Item 174 | class ArticleSpiderfreebuf(scrapy.Item): 175 | image_local = scrapy.Field() # 图片本地地址 176 | image_url = scrapy.Field( 177 | output_processor=MapCompose(return_value) 178 | ) # 图片地址 179 | title = scrapy.Field() # 文章标题 180 | create_date = scrapy.Field() # 发布日期 181 | url = scrapy.Field() # 原文地址 182 | url_id = scrapy.Field() # 经过md5加密过后的url 作为主键 183 | author = scrapy.Field() # 作者 184 | tags = scrapy.Field( 185 | output_processor=MapCompose(listtransstr) 186 | ) # 标签 187 | watch_num = scrapy.Field( 188 | input_processor=MapCompose(return_intvalue) 189 | ) # 观看数量 190 | comment_num = scrapy.Field( 191 | input_processor=MapCompose(return_intvalue) 192 | ) # 评论数量 193 | content = scrapy.Field() # 文章正文 194 | 195 | def save_to_es(self): 196 | article = Article_freebuf() 197 | article.image_local = self["image_url"] 198 | article.title = self["title"] 199 | article.url_id = self["url_id"] 200 | article.create_time = self["create_date"] 201 | article.url = self["url"] 202 | article.author = self["author"] 203 | article.tags = self["tags"] 204 | article.watch_nums = 
self["watch_num"]
205 | if self["comment_num"]:
206 | article.comment_nums = self["comment_num"]
207 | else:
208 | article.comment_nums = 0
209 | article.content = self["content"]
210 | article.suggest = gen_suggests(es_4hou,Article_4houType._doc_type.index,((article.title,10),(article.tags,7)))
211 | article.save()
212 |
213 | return
214 |
215 | # Anquanke article Item
216 | class ArticleSpideranquanke(scrapy.Item):
217 | id = scrapy.Field()
218 | url = scrapy.Field()
219 | title = scrapy.Field()
220 | create_time= scrapy.Field()
221 | image_url = scrapy.Field()
222 | image_local = scrapy.Field()
223 | watch_num = scrapy.Field()
224 | tags = scrapy.Field()
225 | author = scrapy.Field()
226 | comment_num = scrapy.Field()
227 | content = scrapy.Field()
228 | ArticlecontentImage = scrapy.Field()
229 |
230 | def get_insert_sql(self):
231 | insert_sql = """
232 | insert into anquanke_article(
233 | id,
234 | url,
235 | title,
236 | create_time,
237 | cover_local,
238 | watch_num,
239 | tags,
240 | author,
241 | comment_num,
242 | content
243 | )
244 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num),
245 | comment_num=VALUES(comment_num)
246 | """
247 | params = (
248 | self["id"],
249 | self["url"],
250 | self["title"],
251 | self["create_time"],
252 | self["image_url"],
253 | self["watch_num"],
254 | self["tags"],
255 | self["author"],
256 | self["comment_num"],
257 | self["content"]
258 | )
259 | return insert_sql, params
260 | # write the data to elasticsearch
261 | def save_to_es(self):
262 | article = Article_anquankeType()
263 | article.id = self["id"]
264 | article.url = self["url"]
265 | article.title = self["title"]
266 | article.create_time = self["create_time"]
267 | article.cover_local = self["image_url"]
268 | article.watch_num = self["watch_num"]
269 | article.tags = self["tags"]
270 | article.author = self["author"]
271 | article.comment_num = self["comment_num"]
272 | article.content = self["content"]
273 | article.suggest = gen_suggests(es_anquanke,Article_anquankeType._doc_type.index,((article.title,10),(article.tags,7)))
274 | article.save()
275 |
276 | return
277 |
278 |
--------------------------------------------------------------------------------
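
`items.py` above builds search-as-you-type data through `gen_suggests()`, which analyzes the title and tags with `ik_max_word` and stores the result in each document's `suggest` completion field. As a rough illustration of how that field can be consumed — a minimal sketch, not part of the repository, assuming an elasticsearch-py client from the same era as this project, an Elasticsearch instance on localhost, and the `teachnical_4hou` index defined in `models/elaticsearch_type_4hou.py` — a completion-suggest query might look like this:

```python
# Minimal sketch: query the completion suggester populated by save_to_es()/gen_suggests().
# "title_suggest" is just a request-level label chosen here; the field name "suggest" and
# the index name "teachnical_4hou" come from the project code above.
from elasticsearch import Elasticsearch

es = Elasticsearch(["localhost"])

response = es.search(
    index="teachnical_4hou",
    body={
        "suggest": {
            "title_suggest": {
                "text": "scrapy",                    # the prefix the user has typed so far
                "completion": {"field": "suggest"},  # the Completion field filled by gen_suggests()
            }
        }
    },
)

for option in response["suggest"]["title_suggest"][0]["options"]:
    print(option["text"], option["_source"].get("title"))
```

The `article_anquanke` and `teachnical_freebuf` indices carry the same `suggest` field, so the identical query shape should work for them as well.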