├── Technical_Artical_Spider
│   ├── __init__.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── common.cpython-36.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── elaticsearch_type_4hou.cpython-36.pyc
│   │   │   └── elaticsearch_type_anquanke.cpython-36.pyc
│   │   ├── common.py
│   │   ├── elaticsearch_type_anquanke.py
│   │   ├── elaticsearch_type_freebuf.py
│   │   └── elaticsearch_type_4hou.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── common.cpython-35.pyc
│   │   │   ├── common.cpython-36.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   └── __init__.cpython-36.pyc
│   │   └── common.py
│   ├── __init__.pyc
│   ├── settings.pyc
│   ├── spiders
│   │   ├── a4hou.pyc
│   │   ├── __init__.pyc
│   │   ├── __pycache__
│   │   │   ├── a4hou.cpython-35.pyc
│   │   │   ├── a4hou.cpython-36.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── anquanke360.cpython-36.pyc
│   │   ├── __init__.py
│   │   ├── freebuf.py
│   │   ├── a4hou.py
│   │   └── anquanke360.py
│   ├── __pycache__
│   │   ├── items.cpython-35.pyc
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-35.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-35.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   ├── settings.cpython-35.pyc
│   │   ├── settings.cpython-36.pyc
│   │   └── middlewares.cpython-36.pyc
│   ├── images
│   │   └── Cover_images_4hou
│   │       └── 1f208f97337379677734.jpg
│   ├── main.py
│   ├── middlewares.py
│   ├── settings.py
│   ├── pipelines.py
│   └── items.py
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── sqldialects.xml
│   ├── modules.xml
│   ├── Technical_Artical_Spider.iml
│   └── workspace.xml
├── scrapy.cfg
├── anquanke_article-struct.sql
├── 4hou_Article-struct.sql
└── README.md
/Technical_Artical_Spider/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__init__.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/settings.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/a4hou.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/a4hou.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/items.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/common.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/common.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/a4hou.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/spiders/__pycache__/anquanke360.cpython-36.pyc
--------------------------------------------------------------------------------
/Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/images/Cover_images_4hou/1f208f97337379677734.jpg
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_4hou.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smile0304/Technical_Article_Spider/HEAD/Technical_Artical_Spider/models/__pycache__/elaticsearch_type_anquanke.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/sqldialects.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/utils/common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import hashlib
4 |
5 | def get_md5(url):
6 | if isinstance(url, str):
7 | url = url.encode("utf-8")
8 | m = hashlib.md5()
9 | m.update(url)
10 | return m.hexdigest()
--------------------------------------------------------------------------------
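The `get_md5` helper above is what the spiders use to turn an article URL into a stable 32-character hex digest, which then serves as the `url_id` primary key in both the SQL schemas and the elasticsearch documents. A minimal usage sketch (the URL is taken from the commented-out example in a4hou.py and is only illustrative):

```python
from Technical_Artical_Spider.utils.common import get_md5

# str input is encoded to utf-8 first, so str and bytes produce the same digest
url_id = get_md5("http://www.4hou.com/vulnerable/8663.html")
print(url_id, len(url_id))  # a 32-character hex string, length 32
```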
/Technical_Artical_Spider/main.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 |
3 | import sys
4 | import os
5 |
6 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
7 | #execute(["scrapy","crawl","4hou"])
8 | #execute(["scrapy","crawl","anquanke360"])
9 | execute(["scrapy","crawl","freebuf"])
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/common.py:
--------------------------------------------------------------------------------
1 |
2 | # Reportedly a workaround for an elasticsearch-dsl quirk; returning an empty analysis definition keeps the plugin-provided IK analyzer from being redefined when the mapping is created
3 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
4 |
5 | class CustomAnalyzer(_CustomAnalyzer):
6 | def get_analysis_definition(self):
7 | return {}
8 |
9 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Technical_Artical_Spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Technical_Artical_Spider
12 |
--------------------------------------------------------------------------------
/.idea/Technical_Artical_Spider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_anquanke.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 | class Article_anquankeType(DocType):
7 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
8 | id = Integer()
9 | url = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | create_time = Date()
12 | cover_local = Keyword()
13 | watch_num = Integer()
14 | comment_num = Integer()
15 | tags = Text(analyzer="ik_max_word")
16 | author = Keyword()
17 | content = Text(analyzer="ik_max_word")
18 |
19 | class Meta:
20 | index = "article_anquanke"
21 | doc_type = "anquanke"
22 |
23 |
24 | if __name__ == "__main__":
25 | Article_anquankeType.init()
26 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_freebuf.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 |
7 | class Article_freebuf(DocType):
8 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
9 | image_local = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | url_id = Keyword()
12 | create_time = Date()
13 | url = Keyword()
14 | author = Keyword()
15 | tags = Text(analyzer="ik_max_word")
16 | watch_nums = Integer()
17 | comment_nums = Integer()
18 | content = Text(analyzer="ik_max_word")
19 |
20 | class Meta:
21 | index = "teachnical_freebuf"
22 | doc_type = "freebuf"
23 |
24 |
25 | if __name__ == "__main__":
26 | Article_freebuf.init()
27 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/models/elaticsearch_type_4hou.py:
--------------------------------------------------------------------------------
1 | from Technical_Artical_Spider.models.common import ik_analyzer
2 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
3 | from elasticsearch_dsl.connections import connections
4 | connections.create_connection(hosts=["localhost"])
5 |
6 |
7 | class Article_4houType(DocType):
8 |     suggest = Completion(analyzer=ik_analyzer) # search suggestions
9 | image_local = Keyword()
10 | title = Text(analyzer="ik_max_word")
11 | url_id = Keyword()
12 | create_time = Date()
13 | url = Keyword()
14 | author = Keyword()
15 | tags = Text(analyzer="ik_max_word")
16 | watch_nums = Integer()
17 | comment_nums = Integer()
18 | praise_nums = Integer()
19 | content = Text(analyzer="ik_max_word")
20 |
21 | class Meta:
22 | index = "teachnical_4hou"
23 | doc_type = "A_4hou"
24 |
25 |
26 | if __name__ == "__main__":
27 | Article_4houType.init()
28 |
--------------------------------------------------------------------------------
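Each model module above guards an `init()` call under `__main__`; the index mappings need to exist before the pipelines start writing documents. A minimal sketch (assuming a local elasticsearch with the IK analysis plugin, as the README's elasticsearch-rtf setup provides) that creates all three mappings in one pass:

```python
# Create the elasticsearch indices and mappings once, before the first crawl.
# Equivalent to running each model module directly, since each one calls
# <DocType>.init() under __main__.
from Technical_Artical_Spider.models.elaticsearch_type_4hou import Article_4houType
from Technical_Artical_Spider.models.elaticsearch_type_anquanke import Article_anquankeType
from Technical_Artical_Spider.models.elaticsearch_type_freebuf import Article_freebuf

for doc_type in (Article_4houType, Article_anquankeType, Article_freebuf):
    doc_type.init()  # creates the index and puts the mapping if it does not exist yet
```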
/anquanke_article-struct.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : TT_ubuntu16.04
5 | Source Server Version : 50720
6 | Source Host : 192.168.123.66:3306
7 | Source Database : ArticleSpider
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50720
11 | File Encoding : 65001
12 |
13 | Date: 2017-12-15 20:56:57
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for anquanke_article
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `anquanke_article`;
22 | CREATE TABLE `anquanke_article` (
23 | `id` int(32) NOT NULL,
24 | `url` varchar(255) COLLATE utf8_bin NOT NULL,
25 | `title` varchar(50) COLLATE utf8_bin NOT NULL,
26 | `create_time` date NOT NULL,
27 | `cover_local` varchar(255) COLLATE utf8_bin NOT NULL,
28 | `watch_num` int(32) DEFAULT '0',
29 | `tags` varchar(255) COLLATE utf8_bin NOT NULL,
30 | `author` varchar(255) COLLATE utf8_bin NOT NULL,
31 | `comment_num` int(32) DEFAULT '0',
32 | `content` longtext COLLATE utf8_bin NOT NULL,
33 | PRIMARY KEY (`id`)
34 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
35 |
--------------------------------------------------------------------------------
/4hou_Article-struct.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : TT_ubuntu16.04
5 | Source Server Version : 50720
6 | Source Host : 192.168.250.66:3306
7 | Source Database : ArticleSpider
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50720
11 | File Encoding : 65001
12 |
13 | Date: 2017-12-05 15:03:35
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for 4hou_Article
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `4hou_Article`;
22 | CREATE TABLE `4hou_Article` (
23 | `image_local` varchar(255) COLLATE utf8_bin NOT NULL,
24 | `image_url` varchar(255) COLLATE utf8_bin NOT NULL,
25 | `title` varchar(200) COLLATE utf8_bin NOT NULL,
26 | `url_id` varchar(32) COLLATE utf8_bin NOT NULL,
27 | `create_date` date DEFAULT NULL,
28 | `url` varchar(100) COLLATE utf8_bin NOT NULL,
29 | `author` varchar(200) COLLATE utf8_bin NOT NULL,
30 | `tags` varchar(50) COLLATE utf8_bin NOT NULL,
31 | `watch_num` int(10) DEFAULT '0' COMMENT '0',
32 | `comment_num` int(10) DEFAULT '0',
33 | `praise_nums` int(10) DEFAULT '0',
34 | `content` longtext COLLATE utf8_bin NOT NULL,
35 | PRIMARY KEY (`url_id`)
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC;
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Technical_Article_Spider
2 |
3 | #### A spider that crawls technical articles from Chinese security sites
4 |
5 | To simplify building a search engine later on, the project now stores its data in `elasticsearch`
6 |
7 | Development environment:
8 |
9 | - python3
10 |
11 |
12 | - Scrapy ==1.4.0
13 | - elasticsearch-rtf
14 | - docker
15 |
16 | #### Installing the latest version
17 |
18 | - Linux
19 |
20 | > sudo apt-get install python3-pip git xvfb
21 | >
22 | > sudo pip3 install scrapy mysqlclient scrapy-splash fake-useragent
23 | >
24 | > git clone https://github.com/medcl/elasticsearch-rtf.git
25 | >
26 | > git clone https://github.com/smile0304/Technical_Article_Spider.git
27 |
28 | - Windows
29 |
30 | >pip install scrapy pillow mysqlclient scrapy-splash pypiwin32 fake-useragent
31 | >
32 | >git clone https://github.com/smile0304/Technical_Article_Spider.git
33 | >
34 | >git clone https://github.com/medcl/elasticsearch-rtf.git
35 |
36 | The remaining steps are the same on `windows` and `linux`
37 |
38 | Download and install [`docker`](https://www.docker.com/community-edition)
39 |
40 | - Configure a docker registry mirror (for users in mainland China)
41 |
42 | > On Linux:
43 | >
44 | > curl -sSL https://get.daocloud.io/daotools/set_mirror.sh | sh -s http://7db66207.m.daocloud.io
45 | >
46 | > On Windows, right-click the docker tray icon, open Settings -> Daemon -> Registry mirrors and add
47 | >
48 | > http://7db66207.m.daocloud.io
49 |
50 | - Pull the image
51 |
52 | > docker pull scrapinghub/splash
53 |
54 | - Run the `scrapinghub/splash` service with docker
55 |
56 | > docker run -p 8050:8050 scrapinghub/splash
57 |
58 | #### Settings you may want to adjust
59 |
60 | ```python
61 | AUTOTHROTTLE_ENABLED #whether to throttle requests
62 |
63 | AUTOTHROTTLE_START_DELAY = 2 #initial request delay (requires AUTOTHROTTLE_ENABLED=True)
64 |
65 | AUTOTHROTTLE_MAX_DELAY = 60 #maximum wait on a slow network (requires AUTOTHROTTLE_ENABLED=True)
66 |
67 | IMAGES_STORE = os.path.join(project_dir, 'images') #'images' is the default directory for downloaded images
68 |
69 | ```
70 |
71 | #### PS:
72 |
73 | The anti-crawling mechanism of 安全客 (anquanke) has been bypassed. For the search engine built on top of this data, see [Article_Search](https://github.com/smile0304/Article_Search)
74 |
75 | #### Changelog
76 |
77 | - 2017-12-25
78 |   - Bypassed anquanke's anti-crawling mechanism
79 |   - Dropped `selenium`
80 |   - Added crawling of `freebuf`
81 |
82 |
83 | - 2017-12-23
84 |   - Added random `User-Agent` headers
85 |
86 |
87 | - 2017-12-18
88 |   - Split the data into separate databases
89 |   - Switched the browser to headless mode
90 |
91 |
92 | - 2017-12-15
93 |   - Stopped saving data to `Mysql`
94 |   - Switched to `elasticsearch` for storage
95 |
96 |
97 | - 2017-12-08
98 |   - Added crawling of 安全客 (anquanke)
99 |
100 |   - Finished classifying downloaded images
101 |
102 |   - Optimized code performance and reduced redundancy
103 |
104 | - 2017-12-05: initial commit
105 |   - This version only crawls articles from 嘶吼 (4hou)
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/freebuf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from urllib import parse
4 | from Technical_Artical_Spider.items import ArticleItemLoader,ArticleSpiderfreebuf
5 | from Technical_Artical_Spider.utils.common import get_md5
6 | class FreebufSpider(scrapy.Spider):
7 | name = 'freebuf'
8 | allowed_domains = ['www.freebuf.com']
9 | start_urls = ['http://www.freebuf.com/vuls',
10 | 'http://www.freebuf.com/sectool',
11 | 'http://www.freebuf.com/articles/web',
12 | 'http://www.freebuf.com/articles/system',
13 | 'http://www.freebuf.com/articles/network',
14 | 'http://www.freebuf.com/articles/wireless',
15 | 'http://www.freebuf.com/articles/terminal',
16 | 'http://www.freebuf.com/articles/database',
17 | 'http://www.freebuf.com/articles/security-management',
18 | 'http://www.freebuf.com/articles/es',
19 | 'http://www.freebuf.com/ics-articles'
20 | ]
21 |
22 | def start_requests(self):
23 | for url in self.start_urls:
24 | yield scrapy.Request(url,callback=self.parse)
25 |
26 | def parse(self, response):
27 | next_url = response.css(".news-more a::attr(href)").extract()[0]
28 | if next_url:
29 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),callback=self.parse)
30 |
31 | Article_Boxs = response.css(".news-detial .news_inner")
32 | for article in Article_Boxs:
33 | Image_url = article.css(".news-img img::attr(src)").extract()[0].split('!')[0]
34 | Article_url = article.css(".news-info a::attr(href)").extract()[0]
35 | yield scrapy.Request(url=parse.urljoin(response.url,Article_url),
36 | meta={"image_url": parse.urljoin(response.url,Image_url)},
37 | callback=self.parse_detail
38 | )
39 |
40 | def parse_detail(self,response):
41 | image_url = response.meta.get("image_url", "") # article cover image
42 | item_loader = ArticleItemLoader(item=ArticleSpiderfreebuf(), response=response)
43 | item_loader.add_css("title",".articlecontent .title h2::text")
44 | item_loader.add_css("author",".property .name a::text")
45 | item_loader.add_css("create_date",".property .time::text")
46 | item_loader.add_value("url",response.url)
47 | item_loader.add_value("url_id",get_md5(response.url))
48 | item_loader.add_css("tags",".property .tags a::text")
49 | item_loader.add_value("image_url",[image_url])
50 | item_loader.add_css("watch_num",".property .look strong::text")
51 | if len(response.css(".main-tit02 h3 span::text").extract()) != 0:
52 | item_loader.add_css("comment_num",".main-tit02 h3 span::text")
53 | else:
54 | item_loader.add_value("comment_num","0")
55 | item_loader.add_css("content","#contenttxt")
56 | article_item = item_loader.load_item()
57 | yield article_item
58 |
59 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from fake_useragent import UserAgent
10 |
11 | class TechnicalArticalSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | from scrapy.http import HtmlResponse
60 |
61 | class ChromMiddleware(object):
62 | """
63 | Fetch the request with a Chrome browser (via selenium).
64 | """
65 | def process_request(self,request,spider):
66 | if request.url.startswith("https://www.anquanke.com/post/id/") and spider.name == "anquanke360":
67 | spider.browser.get(request.url)
68 | import time
69 | time.sleep(3)
70 | return HtmlResponse(url=spider.browser.current_url,body=spider.browser.page_source,encoding="utf-8", request=request)
71 |
72 | class RandomUserAgentMiddleware(object):
73 | """
74 | Attach a random User-Agent header to every request.
75 | """
76 | def __init__(self,crawl):
77 | super(RandomUserAgentMiddleware, self).__init__()
78 | self.ua = UserAgent()
79 | self.ua_type = crawl.settings.get("RANDOM_UA_TYPE","random")
80 |
81 | @classmethod
82 | def from_crawler(cls,crawler):
83 | return cls(crawler)
84 |
85 | def process_request(self,request,spider):
86 | def get_ua():
87 | return getattr(self.ua,self.ua_type)
88 |
89 | request.headers.setdefault('User-Agent',get_ua())
90 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/a4hou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | from urllib import parse
5 | from Technical_Artical_Spider.items import ArticleSpider4hou,ArticleItemLoader
6 | from Technical_Artical_Spider.utils.common import get_md5
7 | class A4houSpider(scrapy.Spider):
8 | name = '4hou'
9 | allowed_domains = ['www.4hou.com']
10 | start_urls = ['http://www.4hou.com/page/1']
11 | #start_urls = ['http://www.4hou.com/vulnerable/8663.html']
12 | headers = {
13 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
14 | }
15 | urls = {}
16 |
17 | def parse(self, response):
18 | # extract the url of the next page
19 | next_url = response.css(".post-read-more-new a::attr(href)").extract()[0]
20 | if next_url:
21 | yield scrapy.Request(url=parse.urljoin(response.url,next_url),headers=self.headers,callback=self.parse)
22 |
23 | # extract all article URLs on the page
24 | Article_Boxs = response.css(".main-box .ehover1")
25 | for Article_box in Article_Boxs:
26 | Article_url = Article_box.css(".new_img_title::attr(href)").extract_first("")
27 | # keep only technical articles, skip news
28 | match_obj = re.match("(.*4hou.com/(technology|reverse|penetration|web|vulnerable)/(\d+)\.html$)", Article_url)
29 | if match_obj:
30 | Image_url = Article_box.css(".new_img .wp-post-image::attr(data-original)").extract_first("")
31 | yield scrapy.Request(url = parse.urljoin(response.url,Article_url),
32 | headers=self.headers
33 | ,meta={"image_url":parse.urljoin(response.url,Image_url)}
34 | ,callback=self.parse_detail)
35 |
36 | def parse_detail(self,response):
37 | image_url = response.meta.get("image_url","") # article cover image
38 | item_loader =ArticleItemLoader(item=ArticleSpider4hou(),response=response)
39 | item_loader.add_css("title",".art_title::text")
40 | item_loader.add_css("create_date",".art_time::text")
41 | item_loader.add_value("url",response.url)
42 | item_loader.add_value("url_id",get_md5(response.url))
43 | item_loader.add_css("author",".article_author_name .upload-img::text")
44 | item_loader.add_xpath('tags',"//*[@class='art_nav']/a[2]/text()")
45 | item_loader.add_value('image_url',[image_url])
46 | item_loader.add_css("watch_num",".newtype .read span::text")
47 | item_loader.add_css("comment_num",".newtype .comment span::text")
48 | item_loader.add_css("praise_nums",".newtype .Praise span::text")
49 | item_loader.add_css("content",".article_cen")
50 | # images referenced in the article body
51 | if response.css(".article_cen img::attr(data-original)").extract():
52 | item_loader.add_css("ArticlecontentImage",".article_cen img::attr(data-original)")
53 | else:
54 | imgs = re.findall('<img([\s\S].*?)>', response.body.decode('utf-8')) # fall back to pulling <img ...> tags out of the raw HTML
55 | #TODO imgs = ['...']
56 | imgs = [re.search('src="(/.*/.*/.*?)"',i).group(1) for i in imgs if re.search('src="(.*?)"', i)]
57 | item_loader.add_value("ArticlecontentImage",imgs)
58 | article_item = item_loader.load_item()
59 | yield article_item
60 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/spiders/anquanke360.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from Technical_Artical_Spider.items import ArticleSpideranquanke
5 | from urllib import parse
6 | from selenium import webdriver
7 | from scrapy.xlib.pydispatch import dispatcher
8 | from scrapy import signals
9 | from Technical_Artical_Spider.settings import EXECUTABLE_PATH
10 | #from pyvirtualdisplay import Display
11 | from scrapy_splash import SplashRequest
12 |
13 | from scrapy_splash import SplashMiddleware
14 | import re
15 | class Anquanke360Spider(scrapy.Spider):
16 | name = 'anquanke360'
17 | allowed_domains = ['anquanke.com']
18 | start_urls = ['https://api.anquanke.com/data/v1/posts?page=2&size=10&category=knowledge/']
19 | headers_api = {
20 | "HOST": "api.anquanke.com",
21 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
22 | }
23 | headers_article = {
24 | "HOST": "www.anquanke.com",
25 | 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
26 | }
27 | """
28 | def __init__(self):
29 | # disable image loading in Chrome
30 | chrome_opt = webdriver.ChromeOptions()
31 | prefs = {"profile.managed_default_content_settings.images":2}
32 | chrome_opt.add_experimental_option("prefs",prefs)
33 | # run without a visible display
34 | display = Display(visible=0,size=(800,600))
35 | display.start()
36 | self.browser = webdriver.Chrome(executable_path=EXECUTABLE_PATH,chrome_options=chrome_opt)
37 | super(Anquanke360Spider,self).__init__()
38 | dispatcher.connect(self.spider_close,signals.spider_closed)
39 |
40 | def spider_close(self,spider):
41 | self.browser.quit()
42 | """
43 |
44 | def parse(self, response):
45 | article_json = json.loads(response.text)
46 | next_url = article_json["next"]
47 |
48 | for data in article_json["data"]:
49 | url = "https://www.anquanke.com/post/id/"+str(data["id"])
50 | title = data["title"]
51 | title_start = re.search("(^\d{1,2}月\d{1,2}日)",title)
52 | if not title_start:
53 | cover_image = data["cover"]
54 | item = ArticleSpideranquanke()
55 | item["id"] = data["id"]
56 | item["url"] = url
57 | item["title"] = title
58 | item["create_time"] = data["date"].split(" ")[0]
59 | item["image_url"] = [cover_image]
60 | item["watch_num"] = data["pv"]
61 | tags_list = data["tags"]
62 | item["tags"] = ",".join(tags_list)
63 | item["author"] = data["author"]["nickname"]
64 | """
65 | yield scrapy.Request(url,
66 | headers=self.headers_article,
67 | meta={"image_url": parse.urljoin(response.url, cover_image)},
68 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2))
69 | """
70 | yield SplashRequest(url,
71 | meta={"image_url": parse.urljoin(response.url, cover_image)},
72 | callback=lambda arg1=response,arg2=item: self.parse_detail(arg1,arg2))
73 | if next_url:
74 | yield scrapy.Request(next_url,headers=self.headers_api,callback=self.parse)
75 |
76 |
77 | def parse_detail(self,response,item):
78 | item["content"] = response.xpath("//div[@class='article-content']").extract()[0]
79 | item["comment_num"] = int(response.css(".comment-list-area h1 span::text").extract()[0])
80 | item['ArticlecontentImage'] = response.css(".aligncenter::attr(data-original)").extract()
81 | yield item
82 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | # Scrapy settings for Technical_Artical_Spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Technical_Artical_Spider'
13 |
14 | SPIDER_MODULES = ['Technical_Artical_Spider.spiders']
15 | NEWSPIDER_MODULE = 'Technical_Artical_Spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'Technical_Artical_Spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 1
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | #'Technical_Artical_Spider.middlewares.ChromMiddleware': 1,
57 | 'Technical_Artical_Spider.middlewares.RandomUserAgentMiddleware': 1,
58 | 'scrapy_splash.SplashCookiesMiddleware': 723,
59 | 'scrapy_splash.SplashMiddleware': 725,
60 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
61 | }
62 | SPLASH_URL = 'http://localhost:8050'
63 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
64 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
65 | # Enable or disable extensions
66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
67 | #EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | #}
70 |
71 | # Configure item pipelines
72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
73 |
74 | ITEM_PIPELINES = {
75 | #'Technical_Artical_Spider.pipelines.MysqlTwistedPipline': 30,
76 | 'Technical_Artical_Spider.pipelines.ElasticsearchPipline': 30,
77 | 'Technical_Artical_Spider.pipelines.ArticleImagePipeline': 1,
78 | #'Technical_Artical_Spider.pipelines.ArticlecontentImagePipline': 10,
79 | #'Technical_Artical_Spider.pipelines.ArticleHTMLreplacePipline': 20,
80 | }
81 |
82 | IMAGES_URLS_FIELD = "image_url"
83 | project_dir = os.path.abspath(os.path.dirname(__file__))
84 | IMAGES_STORE = os.path.join(project_dir, 'images')
85 | # Enable and configure the AutoThrottle extension (disabled by default)
86 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
87 | AUTOTHROTTLE_ENABLED = True
88 | # The initial download delay
89 | AUTOTHROTTLE_START_DELAY = 2
90 | # The maximum download delay to be set in case of high latencies
91 | AUTOTHROTTLE_MAX_DELAY = 60
92 | # The average number of requests Scrapy should be sending in parallel to
93 | # each remote server
94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
95 | # Enable showing throttling stats for every response received:
96 | #AUTOTHROTTLE_DEBUG = False
97 |
98 | # Enable and configure HTTP caching (disabled by default)
99 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
100 | #HTTPCACHE_ENABLED = True
101 | #HTTPCACHE_EXPIRATION_SECS = 0
102 | #HTTPCACHE_DIR = 'httpcache'
103 | #HTTPCACHE_IGNORE_HTTP_CODES = []
104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
105 |
106 | MYSQL_HOST = "127.0.0.1"
107 | MYSQL_DBNAME = "dbname"
108 | MYSQL_USER = "root"
109 | MYSQL_PASSWORD = "password"
110 |
111 | EXECUTABLE_PATH = "D:\\chromedriver" # path to chromedriver (used by ChromMiddleware)
112 | RANDOM_UA_TYPE = "random" # fake_useragent type read by RandomUserAgentMiddleware
113 |
114 |
--------------------------------------------------------------------------------
/Technical_Artical_Spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | #import MySQLdb
8 | #import MySQLdb.cursors
9 | import scrapy
10 | import re
11 | from twisted.enterprise import adbapi
12 | from scrapy.pipelines.images import ImagesPipeline
13 | from Technical_Artical_Spider.items import ArticleSpideranquanke,ArticleSpider4hou,ArticleSpiderfreebuf
14 |
15 | class TechnicalArticalSpiderPipeline(object):
16 | def process_item(self, item, spider):
17 | return item
18 |
19 | # insert into mysql asynchronously using twisted
20 | """
21 | class MysqlTwistedPipline(object):
22 | def __init__(self, dbpool):
23 | self.dbpool = dbpool
24 |
25 | @classmethod
26 | def from_settings(cls, settings):
27 | dbparms = dict(
28 | host = settings["MYSQL_HOST"],
29 | db = settings["MYSQL_DBNAME"],
30 | user = settings["MYSQL_USER"],
31 | passwd = settings["MYSQL_PASSWORD"],
32 | charset='utf8',
33 | cursorclass=MySQLdb.cursors.DictCursor,
34 | use_unicode=True,
35 | )
36 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
37 |
38 | return cls(dbpool)
39 |
40 | def process_item(self, item, spider):
41 | # run the mysql insert asynchronously via twisted
42 | query = self.dbpool.runInteraction(self.do_insert, item)
43 | query.addErrback(self.handle_error, item, spider) # handle insert errors
44 |
45 | def handle_error(self, failure, item, spider):
46 | # handle exceptions raised by the async insert
47 | print(failure)
48 | def do_insert(self, cursor, item):
49 | # perform the actual insert
50 | # build the item-specific sql statement and insert it into mysql
51 | insert_sql, params = item.get_insert_sql()
52 | cursor.execute(insert_sql, params)
53 | """
54 |
55 | # write data to elasticsearch
56 | class ElasticsearchPipline(object):
57 | # save the item into es
58 | def process_item(self,item,spider):
59 | # improve code performance
60 | item.save_to_es()
61 | return item
62 |
63 | class ImagesavepathPipline(ImagesPipeline):
64 | path = "image"
65 |
66 | def file_path(self, request, response=None, info=None):
67 | image = request.url.split('/')[-1]
68 | path = self.path
69 | return '%s/%s' % (path,image)
70 |
71 | # handle article cover images
72 | class ArticleImagePipeline(ImagesavepathPipline):
73 | Cover_image = "image_url"
74 |
75 | def get_media_requests(self, item, info):
76 | if isinstance(item,ArticleSpideranquanke):
77 | self.path = "Cover_images_anquanke"
78 | elif isinstance(item,ArticleSpider4hou):
79 | self.path = "Cover_images_4hou"
80 | elif isinstance(item,ArticleSpiderfreebuf):
81 | self.path = "Cover_images_freebuf"
82 | if len(item[self.Cover_image]):
83 | if isinstance(item,ArticleSpider4hou):
84 | for image_content_url in item[self.Cover_image]:
85 | yield scrapy.Request(image_content_url.split("?")[0])
86 | else:
87 | for image_content_url in item[self.Cover_image]:
88 | yield scrapy.Request(image_content_url)
89 |
90 | def item_completed(self, results, item, info):
91 | if self.Cover_image in item:
92 | for ok, value in results:
93 | image_file_path = value["path"]
94 | item[self.Cover_image] = image_file_path
95 | return item
96 |
97 | # download images referenced in articles
98 | class ArticlecontentImagePipline(ImagesavepathPipline):
99 | contentImage = "ArticlecontentImage"
100 | def get_media_requests(self, item, info):
101 | if isinstance(item,ArticleSpideranquanke):
102 | self.path = "Content_images_anquanke"
103 | elif isinstance(item,ArticleSpider4hou):
104 | self.path = "Content_images_4hou"
105 | if len(item[self.contentImage]):
106 | for image_content_url in item[self.contentImage]:
107 | yield scrapy.Request(image_content_url)
108 |
109 | def item_completed(self, results, item, info):
110 | return_list = []
111 | if self.contentImage in item:
112 | for ok,value in results:
113 | image_content_path = value["path"]
114 | return_list.append(image_content_path)
115 | item[self.contentImage] = return_list
116 | return item
117 | # replace image references inside the article content
118 | class ArticleHTMLreplacePipline(object):
119 | # exchange html <img> tags for the locally downloaded image paths
120 | def process_item(self,item,spider):
121 | if spider.name == "4hou":
122 | itemcontentname = "content"
123 | re_findall = '<img.*?data-original=".*?".*?>' # <img> tags in the 4hou article body
124 | re_sub = '<img.*?data-original=".*?".*?>'
125 | re_replace = '<img src="{}">' # the local image path is formatted in below
126 | contentImage = "ArticlecontentImage"
127 | elif spider.name=="anquanke360":
128 | itemcontentname = "content"
129 | re_findall = '<img.*?data-original=".*?".*?>' # <img> tags in the anquanke article body
130 | re_sub = '<img.*?data-original=".*?".*?>'
131 | re_replace = '<img src="{}">'
132 | contentImage = "ArticlecontentImage"
133 | if itemcontentname not in item:
134 | return item
135 | content = item[itemcontentname]
136 | sum = len(re.findall(re_findall,content))
137 | if sum != len(item[contentImage]):
138 | return item
139 | if item[contentImage]:
140 | for exf in range(sum):
141 | html = item[contentImage][exf]
142 | html = re_replace.format(html)
143 | content = re.sub(re_sub,html,content,1)
144 |
145 | item["content"] = content
146 |
147 | return item
--------------------------------------------------------------------------------
/Technical_Artical_Spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | from scrapy.loader import ItemLoader
8 | from scrapy.loader.processors import TakeFirst,MapCompose,Join,Identity
9 | from Technical_Artical_Spider.models.elaticsearch_type_4hou import Article_4houType
10 | from Technical_Artical_Spider.models.elaticsearch_type_anquanke import Article_anquankeType
11 | from Technical_Artical_Spider.models.elaticsearch_type_freebuf import Article_freebuf
12 | from elasticsearch_dsl.connections import connections
13 | es_4hou = connections.create_connection(Article_4houType._doc_type.using)
14 | es_anquanke = connections.create_connection(Article_anquankeType._doc_type.using)
15 | es_freebuf = connections.create_connection(Article_freebuf._doc_type.using)
16 | import scrapy
17 |
18 | def gen_suggests(es,index,info_tuple):
19 | # generate search-suggestion data from the given (text, weight) pairs
20 | used_words = set() # for de-duplication
21 | suggests = []
22 | for text,weight in info_tuple:
23 | if text:
24 | # call the es analyze API to tokenize the string
25 | words = es.indices.analyze(index=index,analyzer="ik_max_word",params={'filter':["lowercase"]},body=text)
26 | anylyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"])>1])
27 | new_words = anylyzed_words - used_words
28 | else:
29 | new_words = set()
30 |
31 | if new_words:
32 | suggests.append({"input":list(new_words),"weight":weight})
33 |
34 | return suggests
35 |
36 |
37 | class TechnicalArticalSpiderItem(scrapy.Item):
38 | # define the fields for your item here like:
39 | # name = scrapy.Field()
40 | pass
41 |
42 | class ArticleItemLoader(ItemLoader):
43 | # custom item loader
44 | default_output_processor = TakeFirst()
45 |
46 |
47 | def splitspace(value):
48 | value = value.strip()
49 | value = value.replace('\n','')
50 | value = value.replace('\r','')
51 | return value
52 |
53 | def remove_comma(value):
54 | if "," in value:
55 | return value.replace(",","")
56 | else:
57 | return value
58 |
59 | def remove_Keywords(value):
60 | if "发布" in value:
61 | value = value.replace("发布", "")
62 | if "前" in value:
63 | #now_time = time.strftime("%Y-%m-%d")
64 | import time
65 | now_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))
66 | return now_time
67 | else:
68 | time = value.replace("年","-").replace("月","-").replace("日","")
69 | return time
70 |
71 | def return_value(value):
72 | return value
73 |
74 | def return_intvalue(value):
75 | value = int(value)
76 | return value
77 |
78 | def seturl(value):
79 | if value == None:
80 | return value
81 | elif value.startswith("http://") or value.startswith("https://"):
82 | return value
83 | else:
84 | return "http://www.4hou.com"+value
85 | def listtransstr(value):
86 | return "".join(value)
87 |
88 | # 4hou (嘶吼) article item
89 | class ArticleSpider4hou(scrapy.Item):
90 | image_local = scrapy.Field() # local path of the cover image
91 | image_url =scrapy.Field(
92 | output_processor=MapCompose(return_value)
93 | ) # cover image url
94 | title = scrapy.Field() # article title
95 | create_date = scrapy.Field(
96 | input_processor=MapCompose(remove_Keywords),
97 | ) # publish date
98 | url = scrapy.Field() # original article url
99 | url_id = scrapy.Field() # md5 of the url, used as the primary key
100 | author = scrapy.Field(
101 | input_processor =MapCompose(splitspace),
102 | ) # author
103 | tags = scrapy.Field() # tags
104 | watch_num = scrapy.Field(
105 | input_processor=MapCompose(remove_comma,return_intvalue),
106 | ) # view count
107 | comment_num = scrapy.Field(
108 | input_processor=MapCompose(remove_comma,return_intvalue),
109 | ) # comment count
110 | praise_nums =scrapy.Field(
111 | input_processor=MapCompose(remove_comma,return_intvalue),
112 | ) # praise (like) count
113 | content = scrapy.Field() # article body
114 | # images embedded in the article body
115 | ArticlecontentImage = scrapy.Field(
116 | input_processor = MapCompose(seturl),
117 | output_processor = Identity(),
118 |
119 | )
120 |
121 | # TODO: display comment information
122 | def get_insert_sql(self):
123 | insert_sql = """
124 | insert into 4hou_Article(
125 | image_local,
126 | title,
127 | url_id,
128 | create_date,
129 | url,
130 | author,
131 | tags,
132 | watch_num,
133 | comment_num,
134 | praise_nums,
135 | content
136 | )
137 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num),
138 | comment_num=VALUES(comment_num),praise_nums=VALUES(praise_nums)
139 | """
140 | params= (
141 | self["image_url"],
142 | self["title"],
143 | self["url_id"],
144 | self["create_date"],
145 | self["url"],
146 | self["author"],
147 | self["tags"],
148 | self["watch_num"],
149 | self["comment_num"],
150 | self["praise_nums"],
151 | self["content"]
152 | )
153 | return insert_sql,params
154 | # write the item into es
155 | def save_to_es(self):
156 | article = Article_4houType()
157 | article.image_local = self["image_url"]
158 | article.title = self["title"]
159 | article.url_id = self["url_id"]
160 | article.create_time = self["create_date"]
161 | article.url = self["url"]
162 | article.author = self["author"]
163 | article.tags = self["tags"]
164 | article.watch_nums = self["watch_num"]
165 | article.comment_nums = self["comment_num"]
166 | article.praise_nums = self["praise_nums"]
167 | article.content = self["content"]
168 | article.suggest = gen_suggests(es_4hou,Article_4houType._doc_type.index,((article.title,10),(article.tags,7)))
169 | article.save()
170 |
171 | return
172 |
173 | # Freebuf article item
174 | class ArticleSpiderfreebuf(scrapy.Item):
175 | image_local = scrapy.Field() # local path of the cover image
176 | image_url = scrapy.Field(
177 | output_processor=MapCompose(return_value)
178 | ) # cover image url
179 | title = scrapy.Field() # article title
180 | create_date = scrapy.Field() # publish date
181 | url = scrapy.Field() # original article url
182 | url_id = scrapy.Field() # md5 of the url, used as the primary key
183 | author = scrapy.Field() # author
184 | tags = scrapy.Field(
185 | output_processor=MapCompose(listtransstr)
186 | ) # tags
187 | watch_num = scrapy.Field(
188 | input_processor=MapCompose(return_intvalue)
189 | ) # view count
190 | comment_num = scrapy.Field(
191 | input_processor=MapCompose(return_intvalue)
192 | ) # comment count
193 | content = scrapy.Field() # article body
194 |
195 | def save_to_es(self):
196 | article = Article_freebuf()
197 | article.image_local = self["image_url"]
198 | article.title = self["title"]
199 | article.url_id = self["url_id"]
200 | article.create_time = self["create_date"]
201 | article.url = self["url"]
202 | article.author = self["author"]
203 | article.tags = self["tags"]
204 | article.watch_nums = self["watch_num"]
205 | if self["comment_num"]:
206 | article.comment_nums = self["comment_num"]
207 | else:
208 | article.comment_nums = 0
209 | article.content = self["content"]
210 | article.suggest = gen_suggests(es_freebuf,Article_freebuf._doc_type.index,((article.title,10),(article.tags,7)))
211 | article.save()
212 |
213 | return
214 |
215 | # anquanke (安全客) article item
216 | class ArticleSpideranquanke(scrapy.Item):
217 | id = scrapy.Field()
218 | url = scrapy.Field()
219 | title = scrapy.Field()
220 | create_time= scrapy.Field()
221 | image_url = scrapy.Field()
222 | image_local = scrapy.Field()
223 | watch_num = scrapy.Field()
224 | tags = scrapy.Field()
225 | author = scrapy.Field()
226 | comment_num = scrapy.Field()
227 | content = scrapy.Field()
228 | ArticlecontentImage = scrapy.Field()
229 |
230 | def get_insert_sql(self):
231 | insert_sql = """
232 | insert into anquanke_article(
233 | id,
234 | url,
235 | title,
236 | create_time,
237 | cover_local,
238 | watch_num,
239 | tags,
240 | author,
241 | comment_num,
242 | content
243 | )
244 | VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE watch_num=VALUES(watch_num),
245 | comment_num=VALUES(comment_num)
246 | """
247 | params = (
248 | self["id"],
249 | self["url"],
250 | self["title"],
251 | self["create_time"],
252 | self["image_url"],
253 | self["watch_num"],
254 | self["tags"],
255 | self["author"],
256 | self["comment_num"],
257 | self["content"]
258 | )
259 | return insert_sql, params
260 | # write the item into es
261 | def save_to_es(self):
262 | article = Article_anquankeType()
263 | article.id = self["id"]
264 | article.url = self["url"]
265 | article.title = self["title"]
266 | article.create_time = self["create_time"]
267 | article.cover_local = self["image_url"]
268 | article.watch_num = self["watch_num"]
269 | article.tags = self["tags"]
270 | article.author = self["author"]
271 | article.comment_num = self["comment_num"]
272 | article.content = self["content"]
273 | article.suggest = gen_suggests(es_anquanke,Article_anquankeType._doc_type.index,((article.title,10),(article.tags,7)))
274 | article.save()
275 |
276 | return
277 |
278 |
--------------------------------------------------------------------------------
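The `suggest` field populated by `gen_suggests` above is a Completion field, so it can be queried with an elasticsearch completion suggester. A minimal sketch of such a lookup against the 4hou index using the low-level client (the prefix value is illustrative; the actual search frontend lives in the Article_Search project referenced in the README):

```python
from elasticsearch import Elasticsearch

client = Elasticsearch(["localhost"])
body = {
    "suggest": {
        "article_suggest": {
            "prefix": "xss",  # illustrative user input to auto-complete
            "completion": {"field": "suggest", "size": 10},
        }
    }
}
resp = client.search(index="teachnical_4hou", body=body)
for option in resp["suggest"]["article_suggest"][0]["options"]:
    print(option["text"])
```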
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------