├── .gitignore ├── LICENSE ├── README.md ├── art ├── business.png ├── education.png ├── employment.png ├── follow.png ├── location.png ├── topic.png ├── user_info.png └── zhihu_tables.png ├── main.py ├── requirements.txt ├── scrapy.cfg ├── start.sh ├── zhihu.sql └── zhihu_spider ├── __init__.py ├── __init__.pyc ├── items.py ├── items.pyc ├── middlewares.py ├── misc ├── all_secret_set.py ├── db_tools.py ├── mysql_pool.py ├── tools.py └── ua_list.txt ├── pipelines.py ├── pipelines.pyc ├── settings.py ├── settings.pyc └── spiders ├── ZhihuSpider.py ├── ZhihuSpider.pyc ├── __init__.py └── __init__.pyc /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | .DS_Store 99 | .deploy*/ 100 | .idea/ 101 | re_utils.py 102 | image 103 | NumberThreads.py 104 | zhihu_spider/misc/cookie -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2012 Romain Lespinasse 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### 知乎爬虫 2 | #### 简介 3 | 1. 项目主要为爬取知乎用户以及知乎关注被关注的关系网,数据库使用MySQL,分为八个表,如下图 4 | 5 | ![表展示](art/zhihu_tables.png) 6 | 7 | 2. 表中相关字段可参照 zhihu_spider/items.py 中的字段进行一一比对,在此不在赘述。 8 | 3. 爬虫核心主要依靠知乎用户信息api和用户关注api进行字段拆解 9 | 4. 由于知乎反爬限制,需要自行配置代理,配置代理方式有很多种,不配置代理的话请加大延迟时间 10 | 5. 代码用于个人测试和学习,请遵守相关法律法规,请谨慎对待用户信息。 11 | 12 | 13 | #### 使用方式 14 | 1. 安装MySQL 教程不在此赘述,新建数据库并将 zhihu.sql建表文件导入数据库 并在 all_secret_set.py中配置好数据库相关项 15 | 2. 安装scrapy以及相关依赖 `pip install -r requirements.txt` 16 | 3. 运行main.py 17 | 4. start.sh 用于服务器部署,请自行修改适应自己运行环境 18 | 19 | #### 部分内容图片展示 20 | 21 | - 用户信息 22 | ![](art/user_info.png) 23 | 24 | - 公司信息 25 | ![](art/business.png) 26 | 27 | - 话题信息 28 | ![](art/topic.png) 29 | 30 | - 教育信息 31 | ![](art/education.png) 32 | 33 | - 工作信息 34 | ![](art/employment.png) 35 | 36 | - 关注信息 37 | ![](art/follow.png) 38 | 39 | - 地区信息 40 | ![](art/location.png) -------------------------------------------------------------------------------- /art/business.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/business.png -------------------------------------------------------------------------------- /art/education.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/education.png -------------------------------------------------------------------------------- /art/employment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/employment.png -------------------------------------------------------------------------------- /art/follow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/follow.png -------------------------------------------------------------------------------- /art/location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/location.png -------------------------------------------------------------------------------- /art/topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/topic.png -------------------------------------------------------------------------------- /art/user_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/user_info.png -------------------------------------------------------------------------------- /art/zhihu_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/art/zhihu_tables.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Created by yaochao on 2017/6/7 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | 7 | process = CrawlerProcess(get_project_settings()) 8 | 9 | # 执行指定的spider 10 | process.crawl('zhihu') 11 | 12 | process.start() 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asn1crypto==0.24.0 2 | attrs==18.2.0 3 | Automat==0.7.0 4 | cffi==1.11.5 5 | constantly==15.1.0 6 | cryptography==3.2 7 | cssselect==1.0.3 8 | hyperlink==18.0.0 9 | idna==2.7 10 | incremental==17.5.0 11 | lxml==4.6.2 12 | parsel==1.5.0 13 | Pillow==6.2.1 14 | pyasn1==0.4.4 15 | pyasn1-modules==0.2.2 16 | pycparser==2.19 17 | PyDispatcher==2.0.5 18 | PyHamcrest==1.9.0 19 | PyMySQL==0.9.2 20 | pyOpenSSL==18.0.0 21 | queuelib==1.5.0 22 | Scrapy==1.5.0 23 | service-identity==17.0.0 24 | six==1.11.0 25 | Twisted==19.10.0 26 | user-agent==0.1.9 27 | w3lib==1.19.0 28 | zope.interface==4.5.0 29 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihu_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = spiders 12 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd /opt/zhihu_spider 3 | source venv/bin/activate 4 | nohup scrapy crawl zhihu --set JOBDIR=crawls/project_saved > /var/log/zhihu_spider.log 2>&1 & 5 | deactivate -------------------------------------------------------------------------------- /zhihu.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Source Server Type : MySQL 3 | Source Schema : zhihu 4 | Target Server Type : MySQL 5 | Date: 17/11/2018 15:27:27 6 | */ 7 | 8 | SET NAMES utf8mb4; 9 | SET FOREIGN_KEY_CHECKS = 0; 10 | 11 | -- ---------------------------- 12 | -- Table structure for business 13 | -- ---------------------------- 14 | DROP TABLE IF EXISTS `business`; 15 | CREATE TABLE `business` ( 16 | `id` varchar(20) NOT NULL, 17 | `url` varchar(200) DEFAULT NULL, 18 | `avatar_url` varchar(200) NOT NULL DEFAULT '', 19 | `name` varchar(50) NOT NULL DEFAULT '', 20 | `introduction` mediumtext, 21 | `type` varchar(20) NOT NULL DEFAULT '', 22 | `excerpt` varchar(1000) NOT NULL DEFAULT '', 23 | `meta` json DEFAULT NULL, 24 | `experience` varchar(1000) NOT NULL DEFAULT '', 25 | PRIMARY KEY (`name`) USING BTREE 26 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 27 | 28 | -- ---------------------------- 29 | -- Table structure for education 30 | -- ---------------------------- 31 | DROP TABLE IF EXISTS `education`; 32 | CREATE TABLE `education` ( 33 | `id` varchar(20) NOT NULL, 34 | `url` varchar(200) DEFAULT NULL, 35 | `avatar_url` varchar(200) NOT NULL DEFAULT '', 36 | `name` varchar(50) NOT NULL DEFAULT '', 37 | `introduction` mediumtext, 38 | `type` varchar(20) NOT NULL DEFAULT '', 39 | `meta` json DEFAULT NULL, 40 | `excerpt` varchar(1000) NOT NULL DEFAULT '', 41 | `experience` varchar(1000) NOT NULL DEFAULT '', 42 | PRIMARY KEY (`name`) USING BTREE 43 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 44 | 45 | -- ---------------------------- 46 | -- Table structure for employment 47 | -- ---------------------------- 48 | DROP TABLE IF EXISTS `employment`; 49 | CREATE TABLE `employment` ( 50 | `id` varchar(20) NOT NULL, 51 | `url` varchar(200) DEFAULT NULL, 52 | `avatar_url` varchar(200) NOT NULL DEFAULT '', 53 | `name` varchar(50) NOT NULL DEFAULT '', 54 | `introduction` mediumtext, 55 | `type` varchar(20) NOT NULL DEFAULT '', 56 | `excerpt` varchar(1000) NOT NULL DEFAULT '', 57 | `experience` varchar(1000) NOT NULL DEFAULT '', 58 | `meta` json DEFAULT NULL, 59 | PRIMARY KEY (`name`) USING BTREE 60 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 61 | 62 | -- ---------------------------- 63 | -- Table structure for follower 64 | -- ---------------------------- 65 | DROP TABLE IF EXISTS `follower`; 66 | CREATE TABLE `follower` ( 67 | `id` int(11) NOT NULL AUTO_INCREMENT, 68 | `follower_token` varchar(200) NOT NULL DEFAULT '', 69 | `following_token` varchar(200) NOT NULL DEFAULT '', 70 | PRIMARY KEY (`id`) 71 | ) ENGINE=InnoDB AUTO_INCREMENT=560994 DEFAULT CHARSET=utf8mb4; 72 | 73 | -- ---------------------------- 74 | -- Table structure for following 75 | -- ---------------------------- 76 | DROP TABLE IF EXISTS `following`; 77 | CREATE TABLE `following` ( 78 | `id` int(11) NOT NULL AUTO_INCREMENT, 79 | `follower_token` varchar(200) NOT NULL DEFAULT '', 80 | `following_token` varchar(200) NOT NULL DEFAULT '', 81 | PRIMARY KEY (`id`) 82 | ) ENGINE=InnoDB AUTO_INCREMENT=688277 DEFAULT CHARSET=utf8mb4; 83 | 84 | -- ---------------------------- 85 | -- Table structure for location 86 | -- ---------------------------- 87 | DROP TABLE IF EXISTS `location`; 88 | CREATE TABLE `location` ( 89 | `id` varchar(20) NOT NULL, 90 | `url` varchar(200) DEFAULT NULL, 91 | `avatar_url` varchar(200) NOT NULL DEFAULT '', 92 | `name` varchar(50) NOT NULL DEFAULT '', 93 | `introduction` mediumtext, 94 | `meta` json DEFAULT NULL, 95 | `type` varchar(20) NOT NULL DEFAULT '', 96 | `excerpt` varchar(1000) NOT NULL DEFAULT '', 97 | PRIMARY KEY (`name`) USING BTREE 98 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 99 | 100 | -- ---------------------------- 101 | -- Table structure for topic 102 | -- ---------------------------- 103 | DROP TABLE IF EXISTS `topic`; 104 | CREATE TABLE `topic` ( 105 | `id` varchar(20) NOT NULL, 106 | `url` varchar(200) DEFAULT NULL, 107 | `avatar_url` varchar(200) NOT NULL DEFAULT '', 108 | `name` varchar(50) NOT NULL DEFAULT '', 109 | `meta` json DEFAULT NULL, 110 | `introduction` mediumtext, 111 | `type` varchar(20) NOT NULL DEFAULT '', 112 | `excerpt` varchar(1000) NOT NULL DEFAULT '', 113 | PRIMARY KEY (`name`) USING BTREE 114 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 115 | 116 | -- ---------------------------- 117 | -- Table structure for user_info 118 | -- ---------------------------- 119 | DROP TABLE IF EXISTS `user_info`; 120 | CREATE TABLE `user_info` ( 121 | `sort_order` int(11) NOT NULL AUTO_INCREMENT, 122 | `id` varchar(100) NOT NULL DEFAULT '', 123 | `name` varchar(100) NOT NULL DEFAULT '', 124 | `headline` varchar(500) NOT NULL DEFAULT '', 125 | `url_token` varchar(300) NOT NULL DEFAULT '', 126 | `user_type` varchar(100) NOT NULL DEFAULT '', 127 | `avatar_hue` varchar(500) NOT NULL DEFAULT '', 128 | `included_text` varchar(500) NOT NULL DEFAULT '', 129 | `description` varchar(1000) NOT NULL DEFAULT '', 130 | `type` varchar(50) NOT NULL DEFAULT '', 131 | `avatar_url` varchar(400) NOT NULL DEFAULT '', 132 | `cover_url` varchar(400) NOT NULL DEFAULT '', 133 | `url` varchar(400) NOT NULL DEFAULT '', 134 | `avatar_url_template` varchar(400) NOT NULL DEFAULT '', 135 | `allow_message` int(11) NOT NULL DEFAULT '-1', 136 | `is_privacy_protected` int(11) NOT NULL DEFAULT '-1', 137 | `is_blocking` int(11) NOT NULL DEFAULT '-1', 138 | `is_advertiser` int(11) NOT NULL DEFAULT '-1', 139 | `is_force_renamed` int(11) NOT NULL DEFAULT '-1', 140 | `is_active` int(11) NOT NULL DEFAULT '-1', 141 | `is_blocked` int(11) NOT NULL DEFAULT '-1', 142 | `following_topic_count` int(11) NOT NULL DEFAULT '-1', 143 | `columns_count` int(11) NOT NULL DEFAULT '-1', 144 | `hosted_live_count` int(11) NOT NULL DEFAULT '-1', 145 | `thank_to_count` int(11) NOT NULL DEFAULT '-1', 146 | `mutual_followees_count` int(11) NOT NULL DEFAULT '-1', 147 | `answer_count` int(11) NOT NULL DEFAULT '-1', 148 | `thank_from_count` int(11) NOT NULL DEFAULT '-1', 149 | `vote_to_count` int(11) NOT NULL DEFAULT '-1', 150 | `articles_count` int(11) NOT NULL DEFAULT '-1', 151 | `question_count` int(11) NOT NULL DEFAULT '-1', 152 | `included_answers_count` int(11) NOT NULL DEFAULT '-1', 153 | `gender` int(11) NOT NULL DEFAULT '-1', 154 | `logs_count` int(11) NOT NULL DEFAULT '-1', 155 | `following_question_count` int(11) NOT NULL DEFAULT '-1', 156 | `thanked_count` int(11) NOT NULL DEFAULT '-1', 157 | `following_count` int(11) NOT NULL DEFAULT '-1', 158 | `vote_from_count` int(11) NOT NULL DEFAULT '-1', 159 | `pins_count` int(11) NOT NULL DEFAULT '-1', 160 | `included_articles_count` int(11) NOT NULL DEFAULT '-1', 161 | `favorite_count` int(11) NOT NULL DEFAULT '-1', 162 | `voteup_count` int(11) NOT NULL DEFAULT '-1', 163 | `commercial_question_count` int(11) NOT NULL DEFAULT '-1', 164 | `participated_live_count` int(11) NOT NULL DEFAULT '-1', 165 | `following_favlists_count` int(11) NOT NULL DEFAULT '-1', 166 | `favorited_count` int(11) NOT NULL DEFAULT '-1', 167 | `is_org` int(11) NOT NULL DEFAULT '-1', 168 | `follower_count` int(11) NOT NULL DEFAULT '-1', 169 | `following_columns_count` int(11) NOT NULL DEFAULT '-1', 170 | `location` json DEFAULT NULL, 171 | `badge` json DEFAULT NULL, 172 | `business` varchar(20) NOT NULL DEFAULT '', 173 | `employment` json DEFAULT NULL, 174 | `education` json DEFAULT NULL, 175 | `shared_count` int(11) NOT NULL DEFAULT '-1', 176 | `lite_favorite_content_count` int(11) NOT NULL DEFAULT '-1', 177 | `independent_articles_count` int(11) NOT NULL DEFAULT '-1', 178 | `reactions_count` int(11) NOT NULL DEFAULT '-1', 179 | `is_activity_blocked` int(11) NOT NULL DEFAULT '-1', 180 | `is_bind_sina` int(11) NOT NULL DEFAULT '-1', 181 | `is_hanged` int(11) NOT NULL DEFAULT '-1', 182 | `is_unicom_free` int(11) NOT NULL DEFAULT '-1', 183 | `live_count` int(11) NOT NULL DEFAULT '-1', 184 | `is_baned` int(11) NOT NULL DEFAULT '-1', 185 | `is_enable_signalment` int(11) NOT NULL DEFAULT '-1', 186 | `is_enable_watermark` int(11) NOT NULL DEFAULT '-1', 187 | `sina_weibo_url` varchar(200) NOT NULL DEFAULT '', 188 | `sina_weibo_name` varchar(50) NOT NULL DEFAULT '', 189 | `marked_answers_text` varchar(1000) NOT NULL DEFAULT '', 190 | `infinity` json DEFAULT NULL, 191 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间戳', 192 | PRIMARY KEY (`sort_order`), 193 | UNIQUE KEY `main_index` (`id`) USING BTREE 194 | ) ENGINE=InnoDB AUTO_INCREMENT=111883 DEFAULT CHARSET=utf8mb4; 195 | 196 | SET FOREIGN_KEY_CHECKS = 1; 197 | -------------------------------------------------------------------------------- /zhihu_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/__init__.py -------------------------------------------------------------------------------- /zhihu_spider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/__init__.pyc -------------------------------------------------------------------------------- /zhihu_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | from scrapy.loader import ItemLoader 10 | from scrapy.loader.processors import TakeFirst 11 | 12 | 13 | class UserInfo(Item): 14 | education = Field() 15 | following_count = Field() 16 | vote_from_count = Field() 17 | user_type = Field() 18 | included_text = Field() 19 | pins_count = Field() 20 | is_privacy_protected = Field() 21 | included_articles_count = Field() 22 | is_force_renamed = Field() 23 | id = Field() 24 | favorite_count = Field() 25 | voteup_count = Field() 26 | commercial_question_count = Field() 27 | is_blocking = Field() 28 | following_columns_count = Field() 29 | headline = Field() 30 | url_token = Field() 31 | participated_live_count = Field() 32 | is_advertiser = Field() 33 | following_favlists_count = Field() 34 | favorited_count = Field() 35 | is_org = Field() 36 | follower_count = Field() 37 | employment = Field() 38 | type = Field() 39 | avatar_hue = Field() 40 | avatar_url_template = Field() 41 | following_topiceducation_count = Field() 42 | description = Field() 43 | business = Field() 44 | avatar_url = Field() 45 | columns_count = Field() 46 | hosted_live_count = Field() 47 | is_active = Field() 48 | thank_to_count = Field() 49 | mutual_followees_count = Field() 50 | cover_url = Field() 51 | thank_from_count = Field() 52 | vote_to_count = Field() 53 | is_blocked = Field() 54 | answer_count = Field() 55 | allow_message = Field() 56 | articles_count = Field() 57 | name = Field() 58 | question_count = Field() 59 | location = Field() 60 | badge = Field() 61 | included_answers_count = Field() 62 | url = Field() 63 | logs_count = Field() 64 | following_question_count = Field() 65 | thanked_count = Field() 66 | gender = Field() 67 | 68 | sina_weibo_url = Field() 69 | sina_weibo_name = Field() 70 | marked_answers_text = Field() 71 | 72 | shared_count = Field() 73 | lite_favorite_content_count = Field() 74 | independent_articles_count = Field() 75 | reactions_count = Field() 76 | is_activity_blocked = Field() 77 | is_bind_sina = Field() 78 | is_hanged = Field() 79 | is_unicom_free = Field() 80 | live_count = Field() 81 | is_baned = Field() 82 | is_enable_signalment = Field() 83 | is_enable_watermark = Field() 84 | infinity = Field() 85 | 86 | 87 | class Base(Item): 88 | url = Field() 89 | avatar_url = Field() 90 | name = Field() 91 | introduction = Field() 92 | type = Field() 93 | excerpt = Field() 94 | id = Field() 95 | meta = Field() 96 | 97 | 98 | class Business(Base): 99 | experience = Field() 100 | pass 101 | 102 | 103 | class Location(Base): 104 | pass 105 | 106 | 107 | class Topic(Base): 108 | pass 109 | 110 | 111 | class Education(Base): 112 | pass 113 | 114 | 115 | class Employment(Base): 116 | pass 117 | 118 | 119 | class Following(Item): 120 | follower_token = Field() 121 | following_token = Field() 122 | 123 | 124 | # is_vip 需要转换 125 | 126 | 127 | class Follower(Item): 128 | follower_token = Field() 129 | following_token = Field() 130 | 131 | 132 | class RawDataItem(Item): 133 | json_obj = Field() 134 | 135 | 136 | class TestLoader(ItemLoader): 137 | default_item_class = UserInfo 138 | default_input_processor = TakeFirst() 139 | -------------------------------------------------------------------------------- /zhihu_spider/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/items.pyc -------------------------------------------------------------------------------- /zhihu_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | import random 6 | from scrapy import Request 7 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 8 | from zhihu_spider.misc.tools import get_ua_list 9 | 10 | """ 11 | 遇到验证码检测重试 12 | """ 13 | 14 | 15 | class ZhihuRetryMiddleware(RetryMiddleware): 16 | 17 | def process_response(self, request, response, spider): 18 | if 'account/unhuman' in response.url: 19 | reason = 'zhihu need login %s' % request 20 | return self._retry(request, reason, spider) 21 | return response 22 | 23 | 24 | """ 25 | 此处需要自行设定代理 26 | """ 27 | 28 | 29 | class ZhihuDownloaderMiddleware(object): 30 | ''' 31 | 下载器中间件 32 | ''' 33 | 34 | def process_request(self, request: Request, spider): 35 | request.headers['User-Agent'] = random.choice(get_ua_list()) 36 | -------------------------------------------------------------------------------- /zhihu_spider/misc/all_secret_set.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding=utf8 -*- 3 | # Created by dengqiangxi at 2018/11/5 4 | import pymysql 5 | 6 | # 数据库相关配置 7 | mysql_config = { 8 | 'host': '127.0.0.1', 9 | 'port': 3306, 10 | 'user': 'root', 11 | 'password': 'toor', 12 | 'db': 'zhihu', 13 | 'charset': 'utf8', 14 | 'cursorclass': pymysql.cursors.DictCursor, 15 | 'autocommit': True 16 | } -------------------------------------------------------------------------------- /zhihu_spider/misc/db_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding=utf8 -*- 3 | # Created by dengqiangxi at 2018/11/7 4 | 5 | from zhihu_spider.misc.all_secret_set import mysql_config 6 | import pymysql 7 | 8 | __all__ = ['db_location_names', 'db_business_names', 'db_topic_names', 'db_user_ids', 'db_education_names', 9 | 'db_employ_names'] 10 | connect = pymysql.connect(**mysql_config) 11 | 12 | 13 | def get_data(table_name, distinct_key): 14 | with connect.cursor() as cursor: 15 | cursor.execute("select %s as `key` from %s" % (distinct_key, table_name)) 16 | return {x['key'] for x in cursor.fetchall()} 17 | 18 | 19 | db_location_names = get_data('location', 'name') 20 | db_business_names = get_data('business', 'name') 21 | db_topic_names = get_data('topic', 'name') 22 | db_employ_names = get_data('employment', 'name') 23 | db_user_ids = get_data('user_info', 'id') 24 | db_education_names = get_data('education', 'name') 25 | -------------------------------------------------------------------------------- /zhihu_spider/misc/mysql_pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding=utf8 -*- 3 | # Created by dengqiangxi at 2018/11/7 4 | 5 | import pymysql 6 | import warnings 7 | import queue 8 | import logging 9 | import threading 10 | from pymysql.connections import Connection as conn 11 | 12 | warnings.filterwarnings('error', category=pymysql.err.Warning) 13 | 14 | 15 | class Connection(pymysql.connections.Connection): 16 | """ 17 | Return a connection object with or without connection_pool feature. 18 | This is all the same with pymysql.connections.Connection instance except that with connection_pool feature: 19 | the __exit__() method additionally put the connection back to it's pool 20 | """ 21 | _pool = None 22 | _reusable_expection = (pymysql.err.ProgrammingError, pymysql.err.IntegrityError, pymysql.err.NotSupportedError) 23 | 24 | def __init__(self, *args, **kwargs): 25 | pymysql.connections.Connection.__init__(self, *args, **kwargs) 26 | self.args = args 27 | self.kwargs = kwargs 28 | 29 | def __exit__(self, exc, value, traceback): 30 | """ 31 | Overwrite the __exit__() method of pymysql.connections.Connection 32 | Base action: on successful exit, commit. On exception, rollback 33 | With pool additional action: put connection back to pool 34 | """ 35 | pymysql.connections.Connection.__exit__(self, exc, value, traceback) 36 | if self._pool: 37 | if not exc or exc in self._reusable_expection: 38 | '''reusable connection''' 39 | self._pool.put_connection(self) 40 | else: 41 | '''no reusable connection, close it and create a new one then put it to the pool''' 42 | self._pool.put_connection(self._recreate(*self.args, **self.kwargs)) 43 | self._pool = None 44 | try: 45 | self.close() 46 | logging.warning("Close not reusable connection from pool(%s) caused by %s", self._pool.name, value) 47 | except Exception: 48 | pass 49 | 50 | def _recreate(self, *args, **kwargs): 51 | conn = Connection(*args, **kwargs) 52 | logging.debug('Create new connection due to pool(%s) lacking', self._pool.name) 53 | return conn 54 | 55 | def close(self): 56 | """ 57 | Overwrite the close() method of pymysql.connections.Connection 58 | With pool, put connection back to pool; 59 | Without pool, send the quit message and close the socket 60 | """ 61 | if self._pool: 62 | self._pool.put_connection(self) 63 | else: 64 | pymysql.connections.Connection.close(self) 65 | 66 | def execute_query(self, query, args=(), dictcursor=False, return_one=False, exec_many=False): 67 | """ 68 | A wrapped method of pymysql's execute() or executemany(). 69 | dictcursor: whether want use the dict cursor(cursor's default type is tuple) 70 | return_one: whether want only one row of the result 71 | exec_many: whether use pymysql's executemany() method 72 | """ 73 | with self: 74 | cur = self.cursor() if not dictcursor else self.cursor(pymysql.cursors.DictCursor) 75 | try: 76 | if exec_many: 77 | cur.executemany(query, args) 78 | else: 79 | cur.execute(query, args) 80 | except Exception: 81 | raise 82 | # if no record match the query, return () if return_one==False, else return None 83 | return cur.fetchone() if return_one else cur.fetchall() 84 | 85 | 86 | class ConnectionPool: 87 | """ 88 | Return connection_pool object, which has method can get connection from a pool with timeout and retry feature; 89 | put a reusable connection back to the pool, etc; also we can create different instance of this class that represent 90 | different pool of different DB Server or different user 91 | """ 92 | _HARD_LIMIT = 100 93 | _THREAD_LOCAL = threading.local() 94 | _THREAD_LOCAL.retry_counter = 0 # a counter used for debug get_connection() method 95 | 96 | def __init__(self, size=5, name=None, *args, **kwargs): 97 | self._pool = queue.Queue(self._HARD_LIMIT) 98 | self.name = name if name else '-'.join([kwargs.get('host', 'localhost'), str(kwargs.get('port', 3306)), 99 | kwargs.get('user', ''), kwargs.get('database', '')]) 100 | for _ in range(size if size < self._HARD_LIMIT else self._HARD_LIMIT): 101 | conn = Connection(*args, **kwargs) 102 | conn._pool = self 103 | self._pool.put(conn) 104 | 105 | def get_connection(self, timeout=1, retry_num=1) -> conn: 106 | """ 107 | timeout: timeout of get a connection from pool, should be a int(0 means return or raise immediately) 108 | retry_num: how many times will retry to get a connection 109 | """ 110 | try: 111 | conn = self._pool.get(timeout=timeout) if timeout > 0 else self._pool.get_nowait() 112 | logging.debug('Get connection from pool(%s)', self.name) 113 | return conn 114 | except queue.Empty: 115 | if retry_num > 0: 116 | self._THREAD_LOCAL.retry_counter += 1 117 | logging.debug('Retry get connection from pool(%s), the %d times', self.name, 118 | self._THREAD_LOCAL.retry_counter) 119 | retry_num -= 1 120 | return self.get_connection(timeout, retry_num) 121 | else: 122 | total_times = self._THREAD_LOCAL.retry_counter + 1 123 | self._THREAD_LOCAL.retry_counter = 0 124 | raise GetConnectionFromPoolError("can't get connection from pool({}) within {}*{} second(s)".format( 125 | self.name, timeout, total_times)) 126 | 127 | def put_connection(self, conn): 128 | if not conn._pool: 129 | conn._pool = self 130 | conn.cursor().close() 131 | try: 132 | self._pool.put_nowait(conn) 133 | logging.debug("Put connection back to pool(%s)", self.name) 134 | except queue.Full: 135 | logging.warning("Put connection to pool(%s) error, pool is full, size:%d", self.name, self.size()) 136 | 137 | def size(self): 138 | return self._pool.qsize() 139 | 140 | 141 | class GetConnectionFromPoolError(Exception): 142 | """Exception related can't get connection from pool within timeout seconds.""" 143 | -------------------------------------------------------------------------------- /zhihu_spider/misc/tools.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import re 4 | import sys 5 | 6 | 7 | def config_logger(): 8 | logging.basicConfig(format='%(asctime)-15s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s', 9 | datefmt='%a %d %b %Y %H:%M:%S', filemode='w') 10 | 11 | 12 | def init_logger(logger_name: str, logger_path: str, debug=False): 13 | """ 14 | 初始化logger 15 | :param logger_name: logger_name 16 | :param logger_path: logger文件路径 17 | :param debug: 是否debug模式 18 | """ 19 | logger = logging.getLogger(logger_name) 20 | logger.setLevel(logging.DEBUG) 21 | fh = logging.FileHandler(logger_path) 22 | fh.setLevel(logging.DEBUG) 23 | sh = logging.StreamHandler(sys.stdout) if debug else logging.NullHandler() 24 | sh.setLevel(logging.NOTSET) 25 | fmt = "%(asctime)-15s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s" 26 | datefmt = "%a %d %b %Y %H:%M:%S" 27 | formatter = logging.Formatter(fmt, datefmt) 28 | fh.setFormatter(formatter) 29 | sh.setFormatter(formatter) 30 | logger.addHandler(fh) 31 | logger.addHandler(sh) 32 | return logger 33 | 34 | 35 | def format_avatar(avatar_url: str): 36 | if avatar_url is None: 37 | return '' 38 | return avatar_url.replace('_s.jpg', 'jpg').replace('_xl.jpg', '.jpg') 39 | 40 | 41 | def hump2underline(hunp_str): 42 | ''' 43 | 驼峰形式字符串转成下划线形式 44 | :param hunp_str: 驼峰形式字符串 45 | :return: 字母全小写的下划线形式字符串 46 | ''' 47 | # 匹配正则,匹配小写字母和大写字母的分界位置 48 | p = re.compile(r'([a-z]|\d)([A-Z])') 49 | # 这里第二个参数使用了正则分组的后向引用 50 | sub = re.sub(p, r'\1_\2', hunp_str).lower() 51 | return sub 52 | 53 | 54 | def spelling_insert_sql(dict_keys, table_name: str): 55 | """ 56 | 拼接插入sql 57 | """ 58 | return "insert ignore into %s (%s) values (%s)" % (table_name, 59 | ",".join(dict_keys), 60 | "%(" + ")s,%(".join(dict_keys) + ")s" 61 | ) 62 | 63 | 64 | def get_ua_list(): 65 | """ 66 | 获取ua列表 67 | """ 68 | with open('zhihu_spider/misc/ua_list.txt', 'r') as f: 69 | return [x.replace('\n', '') for x in f.readlines()] 70 | -------------------------------------------------------------------------------- /zhihu_spider/misc/ua_list.txt: -------------------------------------------------------------------------------- 1 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 2 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 3 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 4 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 5 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 6 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 7 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 8 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 9 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 10 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 11 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 12 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 13 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 14 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 15 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 16 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 17 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 18 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 19 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 20 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 21 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 22 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 23 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 24 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 25 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36 26 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 27 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 28 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 29 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36 30 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 31 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 32 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 33 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 34 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 35 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 36 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 37 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 38 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 39 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 40 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 41 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 42 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 43 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 44 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 45 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 46 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 47 | Mozilla/5.0 (Linux; Android 8.0.0; MI 6 Build/OPR1.170623.027; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 48 | Mozilla/5.0 (Linux; Android 6.0.1; LEX727 Build/WEXNAOP5801810261S) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 49 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 50 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 51 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 52 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 53 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 54 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36 55 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36 56 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 57 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 58 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 59 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36 60 | Mozilla/5.0 (Linux; Android 9; MI 8 Build/PKQ1.180729.001) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 YaBrowser/18.9.2.31.00 Mobile Safari/537.36 61 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 62 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36 63 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~mmx4xxnet2) 64 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~mmx4xxnet) 65 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: p~mmx4xxnet3) 66 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36 67 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 68 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 69 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 70 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 71 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 72 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 73 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 74 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 75 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 76 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 77 | Mozilla/5.0 (Linux; Android 5.1; CPH1605) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 78 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 79 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 80 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 81 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36 82 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 83 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 84 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3528.4 Safari/537.36 85 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 86 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 87 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36 88 | Mozilla/5.0 (Linux; Android 8.0.0; BLA-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 89 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36 90 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 91 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.88 92 | Mozilla/5.0 (Linux; Android 8.0.0; SM-G9600 Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 93 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 94 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 95 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 96 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 97 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36 98 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 99 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.23 Safari/537.36 100 | Mozilla/5.0 (Linux; Android 8.0.0; ONEPLUS A3010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 101 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.33 Safari/537.36 102 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36 103 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 104 | Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 105 | Mozilla/5.0 (Linux; Android 7.0; KNT-AL20 Build/HUAWEIKNT-AL20) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 106 | Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 107 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36 108 | Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 109 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 110 | Mozilla/5.0 (Linux; Android 8.1.0; ALP-AL00 Build/HUAWEIALP-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36 111 | Mozilla/5.0 (Linux; Android 8.0.0; MI 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 112 | Mozilla/5.0 (Linux; Android 8.1.0; MI 8 Build/OPM1.171019.026) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 113 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.40 114 | Mozilla/5.0 (Linux; Android 8.0.0; SM-N950F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 115 | Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4X Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 116 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/69.0.3452.0 Safari/537.36 117 | Mozilla/5.0 (Linux; Android 8.1.0; Moto G (5) Plus) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 118 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 119 | Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 MicroMessenger/5.0 JMEII/2.0 SIYOUMI/2.0 120 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 121 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36 122 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3464.0 Safari/537.36 123 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 124 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 125 | Mozilla/5.0 (Linux; Android 8.0.0; BKL-AL20 Build/HUAWEIBKL-AL20) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 OPR/48.0.2331.132643 126 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/68.0.3440.106 Chrome/68.0.3440.106 Safari/537.36 127 | Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 128 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 129 | Mozilla/5.0 (Linux; Android 9; ONEPLUS A6000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 130 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36 131 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 Vivaldi/2.1.1337.36 132 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 133 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36 134 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 135 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 YaBrowser/18.9.1.954 Yowser/2.5 Safari/537.36 136 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 137 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 138 | Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 139 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 140 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 141 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 142 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 143 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 144 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 145 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 146 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36 147 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 148 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 149 | Mozilla/5.0 (Linux; Android 8.1.0; EML-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 150 | Mozilla/5.0 (Linux; Android 8.1.0; Mi Note 3 Build/OPM1.171019.019; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/70.0.3538.80 Mobile Safari/537.36 151 | Mozilla/5.0 (Linux; Android 8.0.0; SM-C9000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 152 | Mozilla/5.0 (Linux; Android 8.0.0; HTC_U-3u) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 153 | Mozilla/5.0 (Linux; Android 8.0.0; EVA-AL10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 154 | Mozilla/5.0 (Linux; Android 8.1.0; ONEPLUS A5010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 155 | Mozilla/5.0 (Linux; Android 9; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 156 | Mozilla/5.0 (Linux; Android 9; MIX 2S) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 157 | Mozilla/5.0 (Linux; Android 8.1.0; EML-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 158 | Mozilla/5.0 (Linux; Android 8.1.0; MI MAX 2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 159 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: f~xxnet8602101) 160 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: d~xxnet8602123) 161 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: b~xxnet8602100) 162 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 163 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 Avast/69.1.852.101 164 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 Vivaldi/2.1.1337.36 165 | Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 MicroMessenger/5.0 JMEII/2.0 SIYOUMI/1.0 166 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 167 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36 168 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3486.0 Safari/537.36 169 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 170 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36 171 | Mozilla/5.0 (Linux; Android 8.1.0; MI 8 Build/OPM1.171019.026; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 172 | Mozilla/5.0 (Linux; Android 8.1.0; ONEPLUS A5000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 173 | Mozilla/5.0 (Linux; Android 5.1; HUAWEI TAG-L22) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 174 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 175 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 176 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 177 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 178 | Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 179 | Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 180 | Mozilla/5.0 (Linux; Android 7.0; LG-H870DS) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 181 | Mozilla/5.0 (Linux; Android 8.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Focus/7.0.12 Chrome/70.0.3538.80 Mobile Safari/537.36 182 | Mozilla/5.0 (Linux; Android 5.1; MX5 Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 183 | Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 184 | Mozilla/5.0 (Linux; Android 7.0; MI 5s Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 185 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 186 | Mozilla/5.0 (Linux; Android 5.1; A1601) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 187 | Mozilla/5.0 (Linux; Android 5.1.1; SM-G531H Build/LMY48B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 188 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36 189 | Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 190 | Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36 191 | Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36 192 | Mozilla/5.0 (Linux; Android 5.0; SM-N9007 Build/LRX21V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36 193 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 194 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.5 Safari/537.36 195 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.17 Safari/537.36 196 | Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 197 | Mozilla/5.0 (Linux; Android 8.0.0; MIX 2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 198 | Mozilla/5.0 (Linux; Android 8.0.0; ALP-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 199 | Mozilla/5.0 (Linux; Android 8.1.0; vivo NEX A Build/OPM1.171019.026) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36 200 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 201 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 202 | Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36 203 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 204 | Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36,gzip(gfe) 205 | Mozilla/5.0 (Linux; Android 8.1.0; G8441 Build/OPM7.181005.003; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/69.0.3497.109 Mobile Safari/537.36 206 | Mozilla/5.0 (Linux; Android 8.1.0; Nokia X5 Build/O11019) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.70 Mobile Safari/537.36 207 | Mozilla/5.0 (Linux; Android 5.1; HUAWEI TAG-AL00 Build/HUAWEITAG-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.70 Mobile Safari/537.36 208 | Mozilla/5.0 (Linux; Android 4.3; vivo Y18L Build/JLS36C) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.70 Mobile Safari/537.36 209 | Mozilla/5.0 (Linux; Android 8.0.0; G8342) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 210 | Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36 211 | Mozilla/5.0 (Linux; Android 8.1.0; BLN-AL40) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 212 | Mozilla/5.0 (Linux; Android 5.1; m2 note Build/LMY47D) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 213 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 214 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 215 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36 216 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 217 | Mozilla/5.0 (Linux; Android 8.0.0; ASUS_Z012DA Build/OPR1.170623.026) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 218 | Mozilla/5.0 (Linux; Android 6.0.1; Redmi 3S Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 219 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 220 | Mozilla/5.0 (Linux; Android 6.0.1; NX531J Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36 221 | Mozilla/5.0 (Linux; Android 8.0.0; MIX 2 Build/OPR1.170623.027) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36 222 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3458.0 Safari/537.36 223 | Mozilla/5.0 (Linux; Android 8.0.0; HWI-AL00 Build/HUAWEIHWI-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 224 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36 225 | Mozilla/5.0 (Linux; Android 9; BLA-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 226 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 227 | Mozilla/5.0 (Linux; Android 8.1.0; Redmi Note 5 Build/OPM1.171019.011) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36 228 | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.52 229 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3528.4 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~xxnet-skyhifi7) 230 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3528.4 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~xxnet-sdcff11) 231 | Mozilla/5.0 (Linux; Android 6.0.1; OPPO R9st) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Mobile Safari/537.36 232 | -------------------------------------------------------------------------------- /zhihu_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | import pymongo 6 | import pymysql 7 | 8 | from zhihu_spider.misc.all_secret_set import mysql_config 9 | import logging 10 | from zhihu_spider.misc.mysql_pool import ConnectionPool 11 | from zhihu_spider.items import * 12 | from scrapy.exceptions import DropItem 13 | from zhihu_spider.misc.tools import spelling_insert_sql, hump2underline 14 | 15 | item_class_list = [ 16 | UserInfo, 17 | Business, 18 | Location, 19 | Topic, 20 | Following, 21 | Follower, 22 | Employment, 23 | Education, 24 | ] 25 | 26 | 27 | class ZhihuSpiderPipeLine(object): 28 | 29 | def __init__(self): 30 | pool = ConnectionPool(size=20, name='pool', **mysql_config) 31 | self.connections = pool.get_connection() 32 | 33 | def process_item(self, item, spider): 34 | for item_class in item_class_list: 35 | if isinstance(item, item_class): 36 | self.save_item(item, hump2underline(item_class.__name__)) 37 | 38 | def save_item(self, item, table_name): 39 | sql = spelling_insert_sql(item.keys(), table_name) 40 | try: 41 | with self.connections.cursor() as cursor: 42 | cursor.execute(sql, dict(item)) 43 | except pymysql.err.MySQLError as e: 44 | logging.error(e) 45 | logging.warning("error item %s", item.__class__.__name__) 46 | self.connections.ping(reconnect=True) 47 | self.connections.rollback() 48 | except Exception as e: 49 | logging.error(e) 50 | raise DropItem('item exception', sql) 51 | 52 | def close_spider(self, spider): 53 | self.connections.close() 54 | -------------------------------------------------------------------------------- /zhihu_spider/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/pipelines.pyc -------------------------------------------------------------------------------- /zhihu_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for spiders project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihu_spider' 13 | 14 | SPIDER_MODULES = ['zhihu_spider.spiders'] 15 | NEWSPIDER_MODULE = 'zhihu_spider.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'weibo_spider (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 100 25 | # Increase Twisted IO thread pool maximum size 26 | # REACTOR_THREADPOOL_MAXSIZE = 40 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | # DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | # CONCURRENT_REQUESTS_PER_IP = 3 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | # TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | # DEFAULT_REQUEST_HEADERS = { 44 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 45 | # "Accept-Encoding": "gzip, identity", 46 | # "Cache-Control": "max-age=0", 47 | # "Referer": "https://www.zhihu.com/", 48 | # "Host": "www.zhihu.com", 49 | # "Upgrade-Insecure-Requests": "1", 50 | # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", 51 | # "Accept-Language": "zh-CN,zh;q=0.8", 52 | # } 53 | 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'weibo_spider.middlewares.MyCustomSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | DOWNLOADER_MIDDLEWARES = { 64 | 'zhihu_spider.middlewares.ZhihuRetryMiddleware': 200, 65 | 'zhihu_spider.middlewares.ZhihuDownloaderMiddleware': 543, 66 | } 67 | 68 | # IMAGES_STORE = 'image' 69 | 70 | # Enable or disable extensions 71 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 72 | # EXTENSIONS = { 73 | # 'scrapy.extensions.telnet.TelnetConsole': None, 74 | # } 75 | 76 | # Configure item pipelines 77 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 78 | ITEM_PIPELINES = { 79 | # 'spiders.pipelines.ZhihuImagePipeLine': 200, 80 | 'zhihu_spider.pipelines.ZhihuSpiderPipeLine': 800, 81 | } 82 | 83 | # Enable and configure the AutoThrottle extension (disabled by default) 84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 85 | AUTOTHROTTLE_ENABLED = True 86 | # The initial download delay 87 | AUTOTHROTTLE_START_DELAY = 5 88 | # The maximum download delay to be set in case of high latencies 89 | AUTOTHROTTLE_MAX_DELAY = 10 90 | # The average number of requests Scrapy should be sending in parallel to 91 | # each remote server 92 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 4 93 | # Enable showing throttling stats for every response received: 94 | # AUTOTHROTTLE_DEBUG = False 95 | 96 | # Enable and configure HTTP caching (disabled by default) 97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 98 | # HTTPCACHE_ENABLED = True 99 | # HTTPCACHE_EXPIRATION_SECS = 0 100 | # HTTPCACHE_DIR = 'httpcache' 101 | # HTTPCACHE_IGNORE_HTTP_CODES = [401] 102 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 103 | ZHIHU_HEADER = { 104 | "Accept-Encoding": "gzip, identity", 105 | "Referer": "https://www.zhihu.com/", 106 | "Upgrade-Insecure-Requests": "1", 107 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36", 108 | "Accept-Language": "zh-CN,zh;q=0.8", 109 | } 110 | 111 | RETRY_TIMES = 10 112 | DOWNLOAD_TIMEOUT = 10 113 | 114 | LOG_FILE = "spider.log" 115 | LOG_LEVEL = "WARNING" 116 | -------------------------------------------------------------------------------- /zhihu_spider/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/settings.pyc -------------------------------------------------------------------------------- /zhihu_spider/spiders/ZhihuSpider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Created by dengqiangxi on 16/9/12 4 | import logging 5 | 6 | import scrapy 7 | from scrapy.http import Request, Response 8 | from zhihu_spider.settings import * 9 | from zhihu_spider.items import * 10 | from zhihu_spider.misc.db_tools import * 11 | import json 12 | 13 | from zhihu_spider.misc.tools import config_logger 14 | 15 | config_logger() 16 | 17 | ignore_key_set = {'isFollowed', 'vipInfo', 'accountStatus', 'messageThreadToken', 'isFollowing', 'orgHomepage', 18 | 'industryCategory'} 19 | 20 | 21 | def parse_sub_item(sub_item_obj: dict, sub_item: Item): 22 | for sub_key, sub_value in sub_item_obj.items(): 23 | if sub_key in Employment.fields.keys(): 24 | if sub_key == 'meta': 25 | sub_item[sub_key] = json.dumps(sub_value, ensure_ascii=False) 26 | else: 27 | sub_item[sub_key] = sub_value 28 | 29 | 30 | class ZhihuSpider(scrapy.Spider): 31 | name = 'zhihu' 32 | type_format_str = '''https://www.zhihu.com/api/v4/members/{}/{}?limit=20&offset=100''' 33 | # type_format_str = '''https://www.zhihu.com/api/v4/members/{}/{}?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&offset=0''' 34 | url_user_info_api_format = 'https://api.zhihu.com/people/{}' 35 | start_urls = [ 36 | url_user_info_api_format.format('0970f947b898ecc0ec035f9126dd4e08'), 37 | url_user_info_api_format.format('80d73b7ec52adc8afd54894cead6063f'), 38 | url_user_info_api_format.format('8b68876001197b3b9cd605b20814616f'), 39 | ] 40 | 41 | def __init__(self, **kwargs): 42 | super(ZhihuSpider, self).__init__(**kwargs) 43 | self.base_url = 'https://www.zhihu.com' 44 | self.following_url = 'https://www.zhihu.com/people/renfish/following' 45 | 46 | def start_requests(self): 47 | for url in self.start_urls: 48 | yield self.make_requests_from_url(url) 49 | 50 | def make_requests_from_url(self, url): 51 | return Request(url, method='GET', headers=ZHIHU_HEADER) 52 | 53 | def parse(self, response: Response): 54 | """ 55 | 解析单个用户的详细信息 56 | :param response: 57 | """ 58 | text = response.text 59 | user_info = json.loads(text) 60 | raw_data_obj = RawDataItem() 61 | raw_data_obj['json_obj'] = user_info 62 | yield raw_data_obj 63 | item = UserInfo() 64 | for key, value in user_info.items(): 65 | if key in ignore_key_set: 66 | continue 67 | if key == 'vipInfo': 68 | item['isVip'] = value.get('isVip') 69 | elif key == 'education': 70 | edu_names = [] 71 | for e_item in value: 72 | if not e_item: 73 | continue 74 | education_item = Education() 75 | parse_sub_item(e_item, education_item) 76 | if education_item['name'] not in db_education_names: 77 | db_education_names.add(education_item['name']) 78 | yield education_item 79 | edu_names.append(e_item['name']) 80 | item[key] = json.dumps(edu_names, ensure_ascii=False) 81 | elif key == 'employment': 82 | all_employ_names = [] 83 | for e_item in value: 84 | employ_names = [] 85 | for e_sub_item in e_item: 86 | if not e_sub_item: 87 | continue 88 | employ_item = Employment() 89 | parse_sub_item(e_sub_item, employ_item) 90 | if employ_item['name'] not in db_employ_names: 91 | db_employ_names.add(employ_item['name']) 92 | yield employ_item 93 | employ_names.append(e_sub_item['name']) 94 | all_employ_names.append(employ_names) 95 | 96 | item[key] = json.dumps(all_employ_names, ensure_ascii=False) 97 | elif key == 'location': 98 | location_names = [] 99 | for l_item in value: 100 | if not l_item: 101 | continue 102 | location = Location() 103 | parse_sub_item(l_item, location) 104 | l_name = l_item.get('name') 105 | location_names.append(l_name) 106 | if l_name not in db_location_names: 107 | db_location_names.add(l_name) 108 | yield location 109 | item[key] = json.dumps(location_names, ensure_ascii=False) 110 | elif key == 'business': 111 | business_item = Business() 112 | if not value: 113 | continue 114 | parse_sub_item(value, business_item) 115 | b_name = value.get('name') 116 | item[key] = b_name 117 | if b_name not in db_business_names: 118 | db_business_names.add(b_name) 119 | yield business_item 120 | elif key == 'badge': 121 | badge_items = [] 122 | for badge_item in value: 123 | topics = badge_item.get('topics') 124 | if topics: 125 | topic_names = [] 126 | for topic_item in topics: 127 | if not topic_item: 128 | continue 129 | topic = Topic() 130 | parse_sub_item(topic_item, topic) 131 | topic_names.append(topic_item.get('name')) 132 | if topic['name'] not in db_topic_names: 133 | db_topic_names.add(topic['name']) 134 | yield topic 135 | del badge_item['topics'] 136 | badge_item['topic_names'] = topic_names 137 | badge_items.append(badge_item) 138 | item[key] = json.dumps(badge_items, ensure_ascii=False) 139 | elif key == 'infinity': 140 | item[key] = json.dumps(value, ensure_ascii=Field) 141 | else: 142 | if key in UserInfo.fields.keys(): 143 | item[key] = value 144 | db_user_ids.add(item['id']) 145 | yield item 146 | url_token = item['url_token'] 147 | api_followings_url = self.type_format_str.format(url_token, 'followees') 148 | api_followers_url = self.type_format_str.format(url_token, 'followers') 149 | 150 | yield scrapy.Request(url=api_followings_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER, 151 | meta={'url_token': url_token}) 152 | yield scrapy.Request(url=api_followers_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER, 153 | meta={'url_token': url_token}) 154 | 155 | def parser_follow_json(self, response): 156 | """ 157 | 从粉丝和关注者的接口中抽出用户的token 158 | """ 159 | url_token = response.meta['url_token'] 160 | json_text = response.text 161 | f_obj = json.loads(json_text) 162 | paging = f_obj['paging'] 163 | data = f_obj['data'] 164 | item = {} 165 | if 'followers' in response.url: 166 | item = Follower() 167 | elif 'followees' in response.url or 'following' in response.url: 168 | item = Following() 169 | if data: 170 | for userinfo in data: 171 | if isinstance(item, Following): 172 | item['follower_token'] = url_token 173 | item['following_token'] = userinfo['url_token'] 174 | else: 175 | item['follower_token'] = userinfo['url_token'] 176 | item['following_token'] = url_token 177 | yield item 178 | user_id = userinfo['url'].split('/')[-1] 179 | if user_id not in db_user_ids: 180 | logging.info("%s not in ids", user_id) 181 | db_user_ids.add(user_id) 182 | json_url = self.url_user_info_api_format.format(user_id) 183 | yield scrapy.Request(url=json_url, 184 | callback=self.parse, 185 | headers=ZHIHU_HEADER) 186 | 187 | if paging and not paging['is_end']: 188 | next_url = paging['next'].replace('https://www.zhihu.com', 'https://www.zhihu.com/api/v4') 189 | print('next_url', next_url) 190 | yield scrapy.Request(url=next_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER, 191 | dont_filter=True, meta={'url_token': url_token}) 192 | -------------------------------------------------------------------------------- /zhihu_spider/spiders/ZhihuSpider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/spiders/ZhihuSpider.pyc -------------------------------------------------------------------------------- /zhihu_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /zhihu_spider/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dengqiangxi/zhihu_spider/ac507f9b1db68a72d5774a8e42f49b980e19c078/zhihu_spider/spiders/__init__.pyc --------------------------------------------------------------------------------