├── .gitignore ├── ArticleSpider ├── __init__.py ├── images │ └── full │ │ ├── 35011d6168be00e949624c665041dc724e3ad786.jpg │ │ ├── 5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg │ │ ├── b906ccd28196f4dac2561cbd4120d442b442de87.jpg │ │ ├── d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg │ │ └── f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg ├── items.py ├── middlewares.py ├── models │ ├── __init__.py │ └── es_types.py ├── pipelines.py ├── settings.py ├── spiders │ ├── __init__.py │ ├── jobbole.py │ ├── lagou.py │ └── zhihu.py ├── tools │ ├── __init__.py │ ├── debug.log │ └── selenium_spider.py └── utils │ ├── __init__.py │ ├── captcha.gif │ ├── common.py │ ├── crawl_xici_ip.py │ └── zhihu_login_requests.py ├── README.md ├── articleexport.json ├── build └── lib │ └── ArticleSpider │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── models │ ├── __init__.py │ └── es_types.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── jobbole.py │ ├── lagou.py │ └── zhihu.py │ ├── tools │ ├── __init__.py │ └── selenium_spider.py │ └── utils │ ├── __init__.py │ ├── common.py │ ├── crawl_xici_ip.py │ └── zhihu_login_requests.py ├── dbs ├── ArticleSpider.db └── default.db ├── eggs └── ArticleSpider │ └── 1504775520.egg ├── main.py ├── project.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt └── top_level.txt ├── requestments.txt ├── scrapy.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | dep.sh 3 | config.yaml 4 | config.json 5 | env_*.py 6 | log/ 7 | tmp/ 8 | test.py 9 | .DS_Store 10 | apidoc/ 11 | .idea/ 12 | venv/ -------------------------------------------------------------------------------- /ArticleSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/__init__.py -------------------------------------------------------------------------------- /ArticleSpider/images/full/35011d6168be00e949624c665041dc724e3ad786.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/35011d6168be00e949624c665041dc724e3ad786.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/b906ccd28196f4dac2561cbd4120d442b442de87.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/b906ccd28196f4dac2561cbd4120d442b442de87.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg -------------------------------------------------------------------------------- /ArticleSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | import re 8 | import datetime 9 | import scrapy 10 | from scrapy.loader import ItemLoader 11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 12 | from ArticleSpider.utils.common import extract_num 13 | from ArticleSpider.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT 14 | from w3lib.html import remove_tags 15 | from ArticleSpider.models.es_types import ArticleType 16 | import redis 17 | 18 | 19 | from elasticsearch_dsl.connections import connections 20 | 21 | es = connections.create_connection(ArticleType._doc_type.using) 22 | 23 | redis_cli = redis.StrictRedis() 24 | 25 | 26 | class ArticlespiderItem(scrapy.Item): 27 | # define the fields for your item here like: 28 | # name = scrapy.Field() 29 | pass 30 | 31 | 32 | def date_convert(value): 33 | try: 34 | create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date() 35 | except Exception as e: 36 | create_date = datetime.datetime.now() 37 | return create_date 38 | 39 | 40 | def add_jobbole(value): 41 | return value+"-haifeng" 42 | 43 | 44 | def get_nums(value): 45 | # print(value) 46 | match_re = re.match(r'.*?(\d+).*', value) 47 | if match_re: 48 | nums = match_re.group(1) 49 | return nums 50 | else: 51 | return 0 52 | 53 | 54 | def remove_comment_tags(value): 55 | # 去掉tag中提取得评论 56 | if "评论" in value: 57 | return "" 58 | else: 59 | return value 60 | 61 | 62 | def return_value(value): 63 | return value 64 | 65 | 66 | def gen_suggests(index, info_tuple): 67 | # 根据字符串生成搜索建议数据 68 | # python工程师 title 10 69 | # python工程师 text 3 70 | # 不能覆盖,所以用set 71 | used_words = set() 72 | suggests = [] 73 | for text, weight in info_tuple: 74 | if text: 75 | # 调用es得analyze接口分析字符串 76 | words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter': ["lowercase"]}, body=text) 77 | analyzed_words = set(r["token"] for r in words["tokens"] if len(r["token"]) > 1) 78 | new_words = analyzed_words - used_words 79 | else: 80 | new_words = set() 81 | if new_words: 82 | suggests.append({"input": list(new_words), "weight": weight}) 83 | return suggests 84 | 85 | 86 | class ArticleItemLoader(ItemLoader): 87 | # 自定义ItemLoader 88 | default_output_processor = TakeFirst() 89 | 90 | 91 | class JobBoleArticleItem(scrapy.Item): 92 | # title = scrapy.Field( 93 | # input_processor=MapCompose(lambda x: x+"-jobbole", add_jobbole), 94 | # output_processor = TakeFirst() 95 | # ) 96 | title = scrapy.Field() 97 | 98 | create_date = scrapy.Field( 99 | input_processor=MapCompose(date_convert), 100 | # output_processor=TakeFirst() 101 | ) 102 | url = scrapy.Field() 103 | url_object_id = scrapy.Field( 104 | 
output_processor=MapCompose(return_value) 105 | ) 106 | front_image_url = scrapy.Field( 107 | output_processor=MapCompose(return_value) 108 | ) 109 | front_image_path = scrapy.Field() 110 | praise_nums = scrapy.Field( 111 | input_processor=MapCompose(get_nums) 112 | ) 113 | comment_nums = scrapy.Field( 114 | input_processor=MapCompose(get_nums) 115 | ) 116 | fav_nums = scrapy.Field( 117 | input_processor=MapCompose(get_nums) 118 | ) 119 | tags = scrapy.Field( 120 | input_processor=MapCompose(remove_comment_tags), 121 | output_processor=Join(","), 122 | ) 123 | content = scrapy.Field() 124 | 125 | def get_insert_sql(self): 126 | insert_sql = """ 127 | insert into jobbole(title, url, create_date, fav_nums) 128 | VALUES (%s, %s, %s, %s) 129 | ON DUPLICATE KEY 130 | UPDATE title=VALUES(title),url=VALUES(url), 131 | create_date=VALUES (create_date),fav_nums=VALUES (fav_nums) 132 | """ 133 | params = (self["title"], self["url"], self["create_date"], self["fav_nums"]) 134 | return insert_sql, params 135 | 136 | def save_to_es(self): 137 | article = ArticleType() 138 | article.title = self['title'] 139 | article.create_date = self["create_date"] 140 | article.content = remove_tags(self["content"]) 141 | article.front_image_url = self["front_image_url"] 142 | if "front_image_path" in self: 143 | article.front_image_path = self["front_image_path"] 144 | article.praise_nums = self["praise_nums"] 145 | article.fav_nums = self["fav_nums"] 146 | article.comment_nums = self["comment_nums"] 147 | article.url = self["url"] 148 | article.tags = self["tags"] 149 | article.meta.id = self["url_object_id"] 150 | 151 | # article.suggest = [{"input":[], "weight":2}] 152 | article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7))) 153 | 154 | article.save() 155 | 156 | redis_cli.incr("jobble_count") 157 | 158 | return 159 | 160 | 161 | class ZhihuQuestionItem(scrapy.Item): 162 | # 知乎的问题 Item 163 | zhihu_id = scrapy.Field() 164 | topics = scrapy.Field() 165 | url = scrapy.Field() 166 | title = scrapy.Field() 167 | content = scrapy.Field() 168 | answer_num = scrapy.Field() 169 | comments_num = scrapy.Field() 170 | watch_user_num = scrapy.Field() 171 | click_num = scrapy.Field() 172 | crawl_time = scrapy.Field() 173 | 174 | def get_insert_sql(self): 175 | insert_sql = """ 176 | insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 177 | watch_user_num, click_num, crawl_time) 178 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 179 | ON DUPLICATE KEY 180 | UPDATE content=VALUES(content),answer_num=VALUES(answer_num), 181 | comments_num=VALUES (comments_num),watch_user_num=VALUES (watch_user_num), 182 | click_num=VALUES (click_num) 183 | """ 184 | zhihu_id = self["zhihu_id"][0] 185 | topics = ",".join(self["topics"]) 186 | url = self["url"][0] 187 | title = "".join(self["title"]) 188 | content = "".join(self["content"]) 189 | answer_num = extract_num("".join(self["answer_num"])) 190 | comments_num = extract_num("".join(self["comments_num"])) 191 | watch_user_num = extract_num("".join(self["watch_user_num"])) 192 | click_num = extract_num("".join(self["click_num"])) 193 | crawl_time = datetime.datetime.now().strftime("") 194 | 195 | params = (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) 196 | 197 | return insert_sql, params 198 | 199 | 200 | class ZhihuAnswerItem(scrapy.Item): 201 | zhihu_id = scrapy.Field() 202 | url = scrapy.Field() 203 | question_id = scrapy.Field() 204 | 
author_id = scrapy.Field() 205 | content = scrapy.Field() 206 | praise_num = scrapy.Field() 207 | comments_num = scrapy.Field() 208 | create_time = scrapy.Field() 209 | update_time = scrapy.Field() 210 | crawl_time = scrapy.Field() 211 | 212 | def get_insert_sql(self): 213 | 214 | insert_sql = """ 215 | insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, praise_num, 216 | comments_num,create_time, update_time, crawl_time) 217 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 218 | ON DUPLICATE KEY 219 | UPDATE content=VALUES(content),comments_num=VALUES(comments_num), 220 | praise_num=VALUES (praise_num),update_time=VALUES (update_time) 221 | """ # ON DUPLICATE KEY UPDATE is MySQL-specific syntax 222 | create_time = datetime.datetime.fromtimestamp(self["create_time"]) 223 | update_time = datetime.datetime.fromtimestamp(self["update_time"]) 224 | 225 | params = ( 226 | self["zhihu_id"], self["url"], self["question_id"], 227 | self["author_id"], self["content"], self["praise_num"], 228 | self["comments_num"], create_time, update_time, self["crawl_time"], 229 | ) 230 | return insert_sql, params 231 | 232 | def remove_splash(value): 233 | # strip the slash from the job city value 234 | return value.replace("/", "") 235 | 236 | 237 | def handle_jobaddr(value): 238 | addr_list = value.split("\n") 239 | addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"] 240 | return "".join(addr_list) 241 | 242 | 243 | class LagouJobItemLoader(ItemLoader): 244 | # custom ItemLoader 245 | default_output_processor = TakeFirst() 246 | 247 | 248 | class LagouJobItem(scrapy.Item): 249 | # Lagou job posting item 250 | url = scrapy.Field() 251 | url_object_id = scrapy.Field() 252 | title = scrapy.Field() 253 | salary = scrapy.Field() 254 | job_city = scrapy.Field( 255 | input_processor=MapCompose(remove_splash), 256 | ) 257 | work_years = scrapy.Field( 258 | input_processor=MapCompose(remove_splash), 259 | ) 260 | degree_need = scrapy.Field( 261 | input_processor=MapCompose(remove_splash), 262 | ) 263 | job_type = scrapy.Field() 264 | publish_time = scrapy.Field() 265 | tags = scrapy.Field( 266 | output_processor=Join(",") 267 | ) 268 | job_advantage = scrapy.Field() 269 | job_desc = scrapy.Field() 270 | job_addr = scrapy.Field( 271 | input_processor=MapCompose(remove_tags), 272 | ) 273 | company_url = scrapy.Field() 274 | company_name = scrapy.Field() 275 | crawl_time = scrapy.Field() 276 | crawl_update_time = scrapy.Field() 277 | 278 | def get_insert_sql(self): 279 | insert_sql = """ 280 | insert into 281 | lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need, job_type,publish_time, 282 | tags, job_advantage, job_desc, job_addr, company_url, company_name, crawl_time, crawl_update_time) 283 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 284 | ON DUPLICATE KEY UPDATE 285 | salary=VALUES (salary), job_desc=VALUES (job_desc), crawl_update_time=VALUES (crawl_update_time) 286 | """ 287 | params = ( 288 | self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], 289 | self["degree_need"],self["job_type"], self["publish_time"], self["tags"], self["job_advantage"], 290 | self["job_desc"],self["job_addr"], self["company_url"], self["company_name"], 291 | self["crawl_time"].strftime(SQL_DATETIME_FORMAT), self["crawl_update_time"].strftime(SQL_DATETIME_FORMAT) 292 | ) 293 | 294 | return insert_sql, params 295 | 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /ArticleSpider/middlewares.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | from fake_useragent import UserAgent 11 | from ArticleSpider.utils.crawl_xici_ip import GetIP 12 | 13 | 14 | class ArticlespiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 
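# Note: this spider middleware is the stock Scrapy project template and is not enabled in SPIDER_MIDDLEWARES in settings.py; the project's custom logic lives in the downloader middlewares defined further down in this file (RandomUserAgentMiddleware, RandomProxyMiddleware, JSPageMiddleware).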
55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class RandomUserAgentMiddleware(object): 63 | # 随机更换User-Agent 64 | def __init__(self, crawler): 65 | super(RandomUserAgentMiddleware, self).__init__() 66 | self.user_agent_list = crawler.settings.get("user_agent_list", []) 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | # from ArticleSpider.settings import user_agent_list 76 | # import random 77 | # request.headers.setdefault("User-Agent", user_agent_list[random.randint(0, len(user_agent_list)-1)]) 78 | 79 | def get_ua(): 80 | return getattr(self.ua, self.ua_type) 81 | 82 | # random_agent = get_ua() # 调试的时候用 83 | 84 | request.headers.setdefault("User-Agent", get_ua()) 85 | # request.meta["proxy"] = "http://113.128.90.192:48888" 86 | 87 | 88 | class RandomProxyMiddleware(object): 89 | # 动态设置ip代理 90 | def process_request(self, request, spider): 91 | get_ip = GetIP() 92 | request.meta["proxy"] = get_ip.get_random_ip() 93 | 94 | from selenium import webdriver 95 | from scrapy.http import HtmlResponse 96 | 97 | 98 | class JSPageMiddleware(object): 99 | # def __init__(self): 100 | # self.browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 101 | # super(JSPageMiddleware, self).__init__() 102 | 103 | # 通过chrome请求动态网页 104 | def process_request(self, request, spider): 105 | if spider.name == "jobbole": 106 | # chrome_opt = webdriver.ChromeOptions() 107 | # prefs = {"profile.managed_default_content_settings.images": 2} 108 | # chrome_opt.add_experimental_option("prefs", prefs) 109 | # browser = webdriver.Chrome( executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 110 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 111 | spider.browser.get(request.url) 112 | import time 113 | time.sleep(3) 114 | print("访问:{0}".format(request.url)) 115 | 116 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from datetime import datetime 7 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 8 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 9 | 10 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalysis 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(hosts=["localhost"]) # connection可以连接多台服务器 15 | 16 | 17 | class CustomAnalyzer(_CustomAnalysis): 18 | def get_analysis_definition(self): 19 | return {} 20 | 21 | ik_analyser = CustomAnalyzer("ik_max_word", 
filter=["lowercase"]) 22 | 23 | 24 | class ArticleType(DocType): 25 | # 伯乐在线文章类型 26 | # suggest = Completion(analyzer="ik_max_word") # 不能直接使用这个,由于源码问题,必须使用CustomAnalyzer 27 | suggest = Completion(analyzer=ik_analyser) 28 | title = Text(analyzer="ik_max_word") 29 | create_date = Date() 30 | url = Keyword() 31 | url_object_id = Keyword() 32 | front_image_url = Keyword() 33 | front_image_path = Keyword() 34 | praise_nums = Integer() 35 | comment_nums = Integer() 36 | fav_nums = Integer() 37 | tags = Text(analyzer="ik_max_word") 38 | content = Text(analyzer="ik_max_word") 39 | 40 | class Meta: 41 | index = "jobbole" 42 | doc_type = "article" 43 | 44 | 45 | if __name__ == "__main__": 46 | ArticleType.init() # 根据类,直接生成mapping, 47 | -------------------------------------------------------------------------------- /ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | import MySQLdb 11 | import MySQLdb.cursors 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from scrapy.exporters import JsonItemExporter 14 | from twisted.enterprise import adbapi 15 | from ArticleSpider.models.es_types import ArticleType 16 | from w3lib.html import remove_tags 17 | 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | # 自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | 29 | def process_item(self, item, spider): 30 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 31 | self.file.write(lines) 32 | return item 33 | 34 | def spider_closed(self, spider): 35 | self.file.close() 36 | 37 | 38 | class MysqlPipeline(object): 39 | # 采用同步得机制写入mysql 40 | def __init__(self): 41 | host = "localhost" 42 | user = "root" 43 | password = "123456" 44 | dbname = "jobble_article" 45 | self.conn = MySQLdb.connect(host=host, user=user, passwd=password, db=dbname, charset='utf8', use_unicode=True) 46 | self.cursor = self.conn.cursor() 47 | 48 | def process_item(self, item, spider): 49 | insert_sql = """ 50 | insert into jobbole(title, url, create_date, fav_nums) 51 | VALUES (%s, %s, %s, %s) 52 | """ 53 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 54 | self.conn.commit() 55 | 56 | 57 | class MysqlTwistedPipeline(object): 58 | def __init__(self, dbpool): 59 | self.dbpool = dbpool 60 | 61 | @classmethod 62 | def from_settings(cls, settings): 63 | dbparams = dict( 64 | host=settings['MYSQL_HOST'], 65 | dbname=settings['MYSQL_DBNAME'], 66 | user=settings['MYSQL_USER'], 67 | passwd=settings['MYSQL_PASSWORD'], 68 | charset='utf8', 69 | cursorclass=MySQLdb.cursors.DictCursor, 70 | use_unicode=True, 71 | ) 72 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparams) 73 | 74 | return cls(dbpool) 75 | 76 | def process_item(self, item, spider): 77 | # 使用twisted将mysql插入变成异步执行 78 | query = self.dbpool.runInteraction(self.do_insert, item) 79 | query.addErrback(self.handle_error, item, spider) # 处理异常 (self.handle_error, item, spider) 80 | 81 | def handle_error(self, failure, item, spider): 82 | # 处理异步插入的异常 83 | print(failure) 84 | 85 | def do_insert(self, cursor, item): 86 | # 执行具体的插入 87 | # if 
item.__class__.__name__ == "JobBoleArticleItem": 88 | insert_sql, params = item.get_insert_sql() 89 | 90 | cursor.execute(insert_sql, params) 91 | 92 | 93 | class JsonExporterPipeline(object): 94 | # 调用scrapy提供的json export 导出json文件 95 | def __init__(self): 96 | self.file = open('articleexport.json', 'wb') 97 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 98 | self.exporter.start_exporting() 99 | 100 | def close_spider(self, spider): 101 | self.exporter.finish_exporting() 102 | self.file.close() 103 | 104 | def process_item(self, item, spider): 105 | self.exporter.export_item(item) 106 | return item 107 | 108 | 109 | class ArticleImagePipeline(ImagesPipeline): 110 | def item_completed(self, results, item, info): 111 | if "front_image_url" in item: 112 | for ok, value in results: 113 | image_file_path = value["path"] 114 | item["front_image_path"] = image_file_path 115 | return item 116 | 117 | 118 | class ElasticsearchPipeline(object): 119 | # 将数据写入打到es中 120 | def process_item(self, item, spider): 121 | # 将 item 转换 为es的数据 122 | item.save_to_es() 123 | 124 | return item 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | # Scrapy settings for ArticleSpider project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | 13 | BOT_NAME = 'ArticleSpider' 14 | 15 | SPIDER_MODULES = ['ArticleSpider.spiders'] 16 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 17 | 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | 57 | DOWNLOADER_MIDDLEWARES = { 58 | 'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543, 59 | # 
'ArticleSpider.middlewares.JSPageMiddleware': 1, 60 | # 'ArticleSpider.middlewares.RandomProxyMiddleware': 544, 61 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 62 | } 63 | 64 | 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'ArticleSpider.pipelines.ArticlespiderPipeline': 300, 76 | # 'scrapy.pipelines.images.ImagesPipeline': 1, 77 | 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 78 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 3, 79 | # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2, 80 | # 'ArticleSpider.pipelines.JsonExporterPipeline': 2, 81 | } 82 | IMAGES_URLS_FIELD = "front_image_url" 83 | project_dir = os.path.abspath(os.path.dirname(__file__)) 84 | IMAGES_STORE = os.path.join(project_dir, 'images') 85 | # print(IMAGES_STORE) 86 | 87 | import os 88 | import sys 89 | 90 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 91 | # print(os.path.join(BASE_DIR, 'ArticleSpider')) 92 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 93 | 94 | # sys.path.insert(0, r"G:\MyProgramFiles\Py3Code\ArticleSpider\ArticleSpider") 95 | 96 | user_agent_list = [ 97 | "", 98 | "", 99 | ] 100 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 101 | 102 | RANDOM_UA_TYPE = "random" 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 106 | #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | 125 | MYSQL_HOST = "127.0.0.1" 126 | MYSQL_DBNAME = "jobble_article" 127 | MYSQL_USER = "root" 128 | MYSQL_PASSWORD = "123456" 129 | 130 | # JOBDIR = "job_info/001" 131 | 132 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 133 | SQL_DATE_FORMAT = "%Y-%m-%d" -------------------------------------------------------------------------------- /ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
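# Spiders defined in this package: jobbole (伯乐在线 articles), lagou (拉勾网 job postings, CrawlSpider based) and zhihu (知乎 questions and answers); each can be run with scrapy crawl <name>, e.g. scrapy crawl jobbole.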
5 | -------------------------------------------------------------------------------- /ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import datetime 4 | import scrapy 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 8 | from ArticleSpider.utils.common import get_md5 9 | from scrapy.loader import ItemLoader 10 | from selenium import webdriver 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class JobboleSpider(scrapy.Spider): 16 | name = 'jobbole' 17 | allowed_domains = ['blog.jobbole.com'] 18 | start_urls = ['http://blog.jobbole.com/all-posts/'] 19 | 20 | def __init__(self): 21 | self.start_urls = ('http://blog.jobbole.com/all-posts/',) 22 | # # self.fail_urls = [] 23 | # self.browser = webdriver.Chrome( 24 | # executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 25 | # super(JobboleSpider, self).__init__() 26 | # dispatcher.connect(self.spider_close, signals.spider_closed) 27 | # 28 | # def spider_close(self, spider): 29 | # # 当爬虫退出的时候关闭chrome 30 | # print("spider closed") 31 | # self.browser.quit() 32 | # start_urls = ['http://blog.jobbole.com/112239/'] 33 | 34 | # 收集伯乐在线所有404的url以及404页面数 35 | # handle_httpstatus_list = [404] 36 | # 37 | # def __init__(self): 38 | # self.fail_urls = [] 39 | # dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 40 | # 41 | # def handle_spider_closed(self, spider, reason): 42 | # self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 43 | 44 | def parse(self, response): 45 | """ 46 | 1.获取文章列表中得文章url并交给scrapy下载后并进行解析 47 | 2.获取下一页得url 并交给scrapy进行下载, 下载完成后交给parse 48 | :param response: 49 | :return: 50 | """ 51 | # if response.status == 404: 52 | # self.fail_urls.append(response.url) 53 | # self.crawler.stats.inc_value("failed_url") 54 | 55 | # 解析列表中得所有文章url,然后下载 56 | post_nodes = response.css("#archive .floated-thumb .post-thumb a") 57 | for post_node in post_nodes: 58 | img_url = post_node.css("img::attr(src)").extract_first("") 59 | post_url = post_node.css("::attr(href)").extract_first("") 60 | pa_url = parse.urljoin(response.url, post_url) # 域名+url # response.url + post_url 61 | yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": img_url}, callback=self.parse_detail) 62 | 63 | # 提取下一页进行下载 64 | next_url = response.css(".next.page-numbers::attr(href)").extract_first("") 65 | if next_url: 66 | pa_url = parse.urljoin(response.url, next_url) 67 | yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse) 68 | 69 | def parse_detail(self, response): 70 | # article_item = JobBoleArticleItem() 71 | 72 | # # 提取文章详情页 73 | # # re_selector = response.xpath("/html/body/div[2]/div[3]/div[1]/div[1]/h1") # 最好不用这种 74 | # title = response.xpath('//*[@id="post-112239"]/div[1]/h1/text()').extract_first() 75 | # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·", 76 | # "").strip() 77 | # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0] 78 | # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0] 79 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 80 | # if match_re: 81 | # fav_nums = int(match_re.group(1)) 82 | # else: 83 | # fav_nums = 0 84 | # comment_nums = 
response.xpath("//a[@href='#article-comment']/span/text()").extract()[0] 85 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 86 | # if match_re: 87 | # comment_nums = int(match_re.group(1)) 88 | # else: 89 | # comment_nums = 0 90 | # 91 | # content = response.xpath("//div[@class='entry']").extract()[0] 92 | # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract() 93 | # tag_list = [element for element in tag_list if not element.strip().endswith("评论")] 94 | # tags = ",".join(tag_list) 95 | 96 | # 通过css选择器提取字段 97 | # front_image_url = response.meta.get("front_image_url", "") # 文章封面图 98 | # title = response.css(".entry-header h1::text").extract()[0] 99 | # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip() 100 | # praise_nums = response.css(".vote-post-up h10::text").extract()[0] 101 | # fav_nums = response.css(".bookmark-btn::text").extract()[0] 102 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 103 | # if match_re: 104 | # fav_nums = match_re.group(1) 105 | # else: 106 | # fav_nums = 0 107 | # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0] 108 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 109 | # if match_re: 110 | # comment_nums = match_re.group(1) 111 | # else: 112 | # comment_nums = 0 113 | # 114 | # content = response.css("div.entry").extract()[0] 115 | # tags = response.css("p.entry-meta-hide-on-mobile a::text").extract() 116 | # tag_list = [element for element in tags if not element.strip().endswith("评论")] 117 | # tags = ",".join(tag_list) 118 | # 119 | # article_item["title"] = title 120 | # article_item["url"] = response.url 121 | # article_item["url_object_id"] = get_md5(response.url) 122 | # try: 123 | # create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date() 124 | # except Exception as e: 125 | # create_date = datetime.datetime.now() 126 | # article_item["create_date"] = create_date 127 | # article_item["front_image_url"] = [front_image_url] 128 | # article_item["content"] = content 129 | # article_item["praise_nums"] = praise_nums 130 | # article_item["comment_nums"] = comment_nums 131 | # article_item["fav_nums"] = fav_nums 132 | # article_item["tags"] = tags 133 | 134 | # 通过Item Loader 加载item 135 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 136 | item_loader.add_css("title", ".entry-header h1::text") 137 | item_loader.add_value("url", response.url) 138 | item_loader.add_value("url_object_id", get_md5(response.url)) 139 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 140 | item_loader.add_value("front_image_url", [response.meta.get("front_image_url", "")]) 141 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 142 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 143 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 144 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 145 | item_loader.add_css("content", "div.entry") 146 | 147 | article_item = item_loader.load_item() 148 | # item_loader.add_xpath() 149 | 150 | yield article_item 151 | -------------------------------------------------------------------------------- /ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from 
ArticleSpider.items import LagouJobItem, LagouJobItemLoader 6 | from ArticleSpider.utils.common import get_md5 7 | import datetime 8 | 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com/'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | 21 | def parse_job(self, response): 22 | # 解析拉勾网的职位 23 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 24 | item_loader.add_css("title", "") 25 | item_loader.add_value("url", response.url) 26 | item_loader.add_value("url_object_id", get_md5(response.url)) 27 | item_loader.add_css("salary", ".job_request .salary::text") 28 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text") 29 | item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text") # 这里使用css ,是为了在学习时,熟悉css选择器用法 30 | item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") 31 | item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") 32 | 33 | item_loader.add_css("publish_time", ".publish_time::text") 34 | item_loader.add_css("tags", ".position-label.clearfix li::text") 35 | item_loader.add_css("job_advantage", ".job-advantage p::text") 36 | item_loader.add_css("job_desc", ".job_bt div") 37 | item_loader.add_css("job_addr", ".work_addr") 38 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 39 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 40 | item_loader.add_value("crawl_time", datetime.datetime.now()) 41 | # item_loader.add_css("crawl_update_time", datetime.datetime.now()) 42 | 43 | job_item = item_loader.load_item() # 这里先赋值给一个变量,是考虑到便于调试以及代码可读性,而不是为了代码简洁而直接return 44 | 45 | return job_item 46 | 47 | def parse_start_url(self, response): 48 | return [] 49 | 50 | def process_results(self, response, results): 51 | return results -------------------------------------------------------------------------------- /ArticleSpider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import json 5 | import datetime 6 | try: 7 | import urlparse as parse 8 | except: 9 | from urllib import parse 10 | 11 | from scrapy.loader import ItemLoader 12 | from ArticleSpider.items import ZhihuAnswerItem, ZhihuQuestionItem 13 | from ArticleSpider.settings import user_agent_list 14 | 15 | 16 | class ZhihuSpider(scrapy.Spider): 17 | name = 'zhihu' 18 | allowed_domains = ['www.zhihu.com'] 19 | start_urls = ['http://www.zhihu.com/'] 20 | 21 | # question的第一页answer得请求url 22 | start_answer_url = "https://www.zhihu.com/api/v4/questions/26234383/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}" 23 | 
agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 24 | 25 | 26 | headers = { 27 | "Host": "www.zhihu.com", 28 | "Referer": "https://www.zhihu.com/", 29 | "User-Agent": agent, 30 | 31 | } 32 | custom_settings = { 33 | "COOKIES_ENABLED": True 34 | } 35 | 36 | def parse(self, response): 37 | # 提取出html页面中的所有url 并跟踪url进行一些爬取 38 | # 如果提取得url中格式为 /question/xxx 就下载进行之后直接进入解析函数 39 | all_urls = response.css("a::attr(href)").extract() 40 | all_urls = [parse.urljoin(response.url, url) for url in all_urls] 41 | all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls) 42 | for url in all_urls: 43 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url) 44 | if match_obj: 45 | # 如果提取到得question相关页面则下载交由提取函数进行提取 46 | request_url = match_obj.group(1) 47 | # 简单的随机更换User-Agent 48 | # import random 49 | # random_index = random.randint(0, len(user_agent_list) - 1) 50 | # random_agent = user_agent_list[random_index] 51 | # self.headers["User-Agent"] = random_agent 52 | yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question) 53 | # break # debug时候用 54 | else: 55 | # 如果不是question 页面则直接进一步跟踪 56 | yield scrapy.Request(url, headers=self.headers, callback=self.parse) # debug 注释 57 | 58 | def parse_question(self, response): 59 | # 处理question页面, 从页面中提取question 具体item 60 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) 61 | if match_obj: 62 | question_id = int(match_obj.group(2)) 63 | 64 | if "QuestionHeader-title" in response.text: 65 | # 处理新版本 66 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 67 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 68 | item_loader.add_css("content", ".QuestionHeader-detail") 69 | item_loader.add_value("url", response.url) 70 | item_loader.add_value("zhihu_id", question_id) 71 | item_loader.add_css("answer_num", ".List-headerText span::text") 72 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 73 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 74 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 75 | 76 | else: 77 | # 处理知乎旧版本 78 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 79 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 80 | item_loader.add_css("content", ".QuestionHeader-detail") 81 | item_loader.add_value("url", response.url) 82 | item_loader.add_value("zhihu_id", question_id) 83 | item_loader.add_css("answer_num", ".List-headerText span::text") 84 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 85 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 86 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 87 | 88 | question_item = item_loader.load_item() 89 | yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) 90 | yield question_item # debug 注释 91 | 92 | def parse_answer(self, response): 93 | # 处理answer 94 | ans_json = json.loads(response.text) 95 | is_end = ans_json["paging"]["is_end"] 96 | # totals_answer = ans_json["paging"]["totals"] 97 | next_url = ans_json["paging"]["next"] 98 | 99 | # 提取answer的具体字段 100 | for answer in ans_json["data"]: 101 | answer_item = ZhihuAnswerItem() 102 | answer_item["zhihu_id"] = answer["id"] 103 | answer_item["url"] = answer["url"] 104 | answer_item["question_id"] = answer["question"]["id"] 105 | 
answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None 106 | answer_item["content"] = answer["content"] if "content" in answer else None 107 | answer_item["praise_num"] = answer["voteup_count"] 108 | answer_item["comments_num"] = answer["comment_count"] 109 | answer_item["create_time"] = answer["created_time"] 110 | answer_item["update_time"] = answer["updated_time"] 111 | answer_item["crawl_time"] = datetime.datetime.now() 112 | 113 | yield answer_item 114 | 115 | if not is_end: 116 | yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer) 117 | 118 | def start_requests(self): 119 | return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)] 120 | 121 | def login(self, response): 122 | 123 | response_text = response.text 124 | match_obj = re.match('.*name="_xsrf" value="(.*?)".*', response_text, re.DOTALL) 125 | if match_obj: 126 | print(match_obj.group(1)) 127 | _xsrf = match_obj.group(1) 128 | if _xsrf: 129 | 130 | post_data = { 131 | "_xsrf": _xsrf, 132 | "phone_num": '13342266862', 133 | "password": '553768563', 134 | "captcha": "", 135 | 136 | } 137 | import time 138 | t = str(int(time.time() * 1000)) 139 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t) 140 | yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha) # 重点 141 | 142 | def login_after_captcha(self, response): 143 | with open("captcha.gif", "wb") as f: 144 | f.write(response.body) 145 | f.close() 146 | 147 | from PIL import Image 148 | try: 149 | im = Image.open("captcha.gif") 150 | im.show() 151 | im.close() 152 | except: 153 | pass 154 | 155 | captcha = input("输入验证码\n>") 156 | 157 | post_data = response.meta.get("post_data", {}) 158 | post_url = "https://www.zhihu.com/login/phone_num" 159 | post_data["captcha"] = captcha 160 | return [scrapy.FormRequest( 161 | url=post_url, 162 | formdata=post_data, 163 | headers=self.headers, 164 | callback=self.check_login 165 | )] 166 | 167 | def check_login(self, response): 168 | # 验证服务器的返回数据是否成功 169 | text_json = json.loads(response.text) 170 | if "msg" in text_json and text_json["msg"] == "登录成功": 171 | for url in self.start_urls: 172 | yield scrapy.Request(url, dont_filter=True, headers=self.headers) 173 | 174 | -------------------------------------------------------------------------------- /ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/tools/debug.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/tools/debug.log -------------------------------------------------------------------------------- /ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from selenium import webdriver 7 | from scrapy.selector import Selector 8 | 9 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 10 | # 11 | # 
browser.get("https://item.taobao.com/item.htm?spm=2013.1.iteminfo.10.4b556901SPB44D&scm=1007.10010.52063.100200300000003&id=552169264763&pvid=19a525ca-6111-4648-98ab-0ff06f668623") 12 | # 13 | # print(browser.page_source) 14 | # 15 | # selector_ = Selector(text=browser.page_source) 16 | 17 | 18 | # browser.quit() 19 | 20 | # 设置chromedirver 21 | chrome_opt = webdriver.ChromeOptions() 22 | prefs = {"profile.managed_default_content_settings.images": 2} 23 | chrome_opt.add_experimental_option("prefs", prefs) 24 | browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 25 | browser.get("https://anta.tmall.com/") 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/utils/captcha.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/utils/captcha.gif -------------------------------------------------------------------------------- /ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | import hashlib 6 | import re 7 | import webbrowser 8 | from webbrowser import Chrome 9 | 10 | 11 | def get_md5(url): 12 | if isinstance(url, str): 13 | url = url.encode("utf-8") 14 | m = hashlib.md5() 15 | m.update(url) 16 | return m.hexdigest() 17 | 18 | 19 | def extract_num(text): 20 | # 字符串中提取数字 21 | match_re = re.match(r'.*?(\d+).*', text) 22 | if match_re: 23 | nums = match_re.group(1) 24 | return nums 25 | 26 | 27 | def webtest(): 28 | # webbrowser.open("http://jobbole.com", new=0, autoraise=1) 29 | # webbrowser.open_new("http://jobbole.com") 30 | # webbrowser.open_new_tab("http://jobbole.com") 31 | webbrowser.register(name="chrome", klass=Chrome) 32 | webbrowser.get('chrome').open("http://jobbole.com") 33 | # .open('www.baidu.com', new=1, autoraise=True) 34 | 35 | chromePath = r'你的浏览器目录' # 例如我的:C:\***\***\***\***\Google\Chrome\Application\chrome.exe 36 | webbrowser.register('chrome', None, webbrowser.BackgroundBrowser(chromePath)) # 这里的'chrome'可以用其它任意名字,如chrome111,这里将想打开的浏览器保存到'chrome' 37 | webbrowser.get('chrome').open('www.baidu.com', new=1, autoraise=True) 38 | 39 | 40 | 41 | # 42 | # def to_list(t): 43 | # return [i for i in t] 44 | 45 | if __name__ == '__main__': 46 | webtest() 47 | print(get_md5("http://jobbole.com")) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /ArticleSpider/utils/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | from scrapy.selector import Selector 8 | import MySQLdb 9 | 10 | conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", db="article_spider", charset="utf8") 11 | cursor = conn.cursor() 12 | 13 | 14 | def crawl_ips(): 15 | # 爬取西刺得免费ip代理 16 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/57.0.2987.133 Safari/537.36"} 17 | for i in range(2354): 18 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 19 | 20 | # print(re.text) 21 | selector = Selector(text=re.text) 22 | # all_trs = selector.css("#ip_list tr[class]:not([class='subtitle'])") 23 | all_trs = selector.css("#ip_list tr") 24 | 25 | ip_list = [] 26 | 27 | for tr in all_trs[1:]: 28 | speed_str = tr.css(".bar::attr(title)").extract()[0] 29 | if speed_str: 30 | speed = float(speed_str.split("秒")[0]) 31 | # ip = tr.css("td:nth-child[2]::text").extract()[0] # 报错 32 | all_text = tr.css("td::text").extract() 33 | ip = all_text[0] 34 | port = all_text[1] 35 | proxy_type = all_text[5] 36 | 37 | # lis = (ip, port, speed, proxy_type) 38 | # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type))) 39 | # print(':'.join(lis)) 40 | 41 | ip_list.append((ip, port, speed, proxy_type)) 42 | 43 | # print(all_trs) 44 | # for tr in all_trs: 45 | # # print(tr.extract()) 46 | # # ip = tr.xpath('/td[2]/text()').extract() 47 | # # port = tr.xpath('/td[3]/text()').extract() 48 | # # http_type = tr.xpath('/td[6]/text()').extract() 49 | # ip = tr.css('td:nth-child(2)::text').extract()[0] 50 | # port = tr.css('td:nth-child(3)::text').extract()[0] 51 | # speed = tr.css('td:nth-child(6)::text').extract()[0] 52 | # proxy_type = tr.css('td:nth-child(6)::text').extract()[0] 53 | # # print(ip, port) 54 | # # print(':'.join((str(ip), str(port), str(http_type)))) 55 | # print(':'.join((ip, port, speed, proxy_type))) 56 | # ip_list.append((ip, port, speed, proxy_type)) 57 | 58 | print(": ".join(ip_info)) 59 | 60 | for ip_info in ip_list: 61 | cursor.execute("insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')".format( 62 | ip_info[0], ip_info[1], ip_info[2], ip_info[3]) 63 | ) # 传递字符串一定要加单引号 64 | 65 | conn.commit() 66 | 67 | # for tr in all_trs[1:]: 68 | # # speed_str = tr.css(".bar::attr(title)").extract()[0] 69 | # # if speed_str: 70 | # # speed = float(speed_str.split("秒")[0]) 71 | # all_texts = tr.css("td::text").extract() 72 | # print(all_texts) 73 | 74 | # print(re.text) 75 | 76 | 77 | class GetIP(object): 78 | def delete_ip(self, ip): 79 | # 从数据库中删除无效的ip 80 | delete_sql = """ 81 | delete from proxy_ip where ip='{0}' 82 | """.format(ip) 83 | cursor.execute(delete_sql) 84 | conn.commit() 85 | return True 86 | 87 | def judge_ip(self, ip, port, proxy_type): 88 | # 判断IP 是否可用 89 | http_url = "proxy_type://www.baidu.com" 90 | proxy_url = "{3}://{0}:{1}".format(ip, port, proxy_type) 91 | response = None 92 | try: 93 | proxy_dict = { 94 | proxy_type: proxy_url 95 | } 96 | response = requests.get(http_url, proxies=proxy_dict) 97 | return True 98 | except Exception as e: 99 | print("invalid ip and port") 100 | self.delete_ip(ip) 101 | return False 102 | else: 103 | code = response.status_code 104 | if code >= 200 and code < 300: 105 | print("effective ip") 106 | return True 107 | else: 108 | print("invalid ip and port") 109 | self.delete_ip(ip) 110 | return False 111 | 112 | def get_random_ip(self): 113 | # 从数据库中随机获取一个可用的ip 114 | random_sql = """ 115 | SELECT ip,port FROM proxy_ip 116 | ORDER BY RAND() 117 | LIMIT 1 118 | """ 119 | cursor.execute(random_sql) 120 | for ip_info in cursor.fetchall(): 121 | ip = ip_info[0] 122 | port = ip_info[1] 123 | proxy_type = ip_info[3] if ip_info[3] and ip_info[3] != "" else 'http' 124 | 125 | judge_re = self.judge_ip(ip, port, proxy_type) 126 | if judge_re: 127 | return "{3}://{0}:{1}".format(ip, port, proxy_type) 
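# a working proxy string was returned above; when judge_ip() fails it has already deleted the dead row from proxy_ip, and the else branch below recurses to draw another random candidate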
128 | else: 129 | return self.get_random_ip() 130 | 131 | if __name__ == '__main__': 132 | # crawl_ips() 133 | get_ip = GetIP() 134 | print(get_ip.get_random_ip()) -------------------------------------------------------------------------------- /ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | try: 8 | import cookielib 9 | except: 10 | import http.cookiejar as cookielib 11 | 12 | import re 13 | 14 | 15 | 16 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 17 | header = { 18 | "Host": "www.zhihu.com", 19 | "Referer": "https://www.zhihu.com/", 20 | "User-Agent": agent, 21 | "Cookie":'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1' 22 | } 23 | 24 | 25 | session = requests.session() 26 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 27 | 28 | try: 29 | session.cookies.load(ignore_discard=True) 30 | print("cookies已被加载") 31 | except: 32 | print("cookies未能加载") 33 | 34 | 35 | def get_xsrf(): 36 | response = session.get("https://www.zhihu.com/", headers=header) 37 | # print(response.text) 38 | 39 | # text = '' 40 | # text = '' 41 | text = response.text 42 | print(text) 43 | match_obj = re.match(r'.*?required.*', text) 44 | match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text.strip()) 45 | match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text, re.DOTALL) 46 | match_obj = re.search('.*name="_xsrf" value="(.*?)".*', text) 47 | if match_obj: 48 | print(match_obj.group(1)) 49 | return match_obj.group(1) 50 | else: 51 | return "" 52 | 53 | 54 | def is_login(): 55 | inbox_url = "https://www.zhihu.com/inbox" 56 | response = session.get(inbox_url, headers=header, allow_redirects=False) 57 | if response.status_code != 200: 58 | return False 59 | else: 60 | return True 61 | 62 | 63 | def get_index(): 64 | response = session.get("https://www.zhihu.com/", headers=header) 65 | with open("index_page.html", "wb") as f: 66 | f.write(response.text.encode("utf-8")) 67 | print("ok") 68 | 69 | 70 | def get_captcha(): 71 | import time 72 | t = str(int(time.time()*1000)) 73 | captcha_url = "https://www.zhihu.com/captcha.gif?r=1504099197089&type=login&lang=cn" 74 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn" 75 | t = session.get(captcha_url, headers=header) 76 | 
with open("captcha.gif", "wb") as f: 77 | f.write(t.content) 78 | f.close() 79 | 80 | from PIL import Image 81 | try: 82 | im = Image.open("captcha.gif") 83 | im.show() 84 | im.close() 85 | except: 86 | pass 87 | 88 | captcha = input("输入验证码\n>") # python2 中是 raw_input 89 | return captcha 90 | 91 | 92 | def zhihu_login(account, password): 93 | # 知乎登陆 94 | if re.match("^1\d{10}", account): 95 | print("手机号码登陆") 96 | post_url = "https://www.zhihu.com/login/phone_num" 97 | post_data = { 98 | "_xsrf": get_xsrf(), 99 | "phone_num": account, 100 | "password": password, 101 | "captcha": get_captcha(), 102 | # captcha:{"img_size":[200,44],"input_points":[[21.375,28],[156.375,33]]}# 2017-08-30 103 | "captcha_type": 'cn' 104 | } 105 | else: 106 | if "@" in account: 107 | # 判断用户名是否为邮箱 108 | print("邮箱方式登陆") 109 | post_url = "https://www.zhihu.com/login/email" 110 | post_data = { 111 | "_xsrf": get_xsrf(), 112 | "email": account, 113 | "password": password, 114 | "captcha": get_captcha(), 115 | "captcha_type": 'cn' 116 | } 117 | 118 | response_text = session.post(post_url, data=post_data, headers=header) 119 | session.cookies.save() 120 | 121 | if __name__ == '__main__': 122 | zhihu_login("13342266862", "553768563") 123 | # print(get_xsrf()) 124 | get_index() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArticleSpider 2 | 通过scrapy,爬取知乎,伯乐在线,拉钩网 3 | 4 | **注:** 5 | 6 | > 这是一个进阶项目,需要有一定的爬虫知识,如果不是很懂基本的爬虫原理,请自行学习一下爬虫基础知识。 7 | 我有一个对应的仓库[MyPythonForSpider](http://git.oschina.net/hackfun/MyPythonForSpider "baidumusicspider"),是一个单线程爬取百度音乐数据的实例,比较适合刚入门的朋友。 8 | 9 | 10 | 11 | **这是一个基于web抓取框架[scrapy](https://baike.baidu.com/item/scrapy/7914913?fr=aladdin "scrapy"),实现的对于知乎,伯乐在线,拉勾网的爬取。** 12 | 13 | ### 涉及到的知识点 14 |
15 | |-- 基础 16 | | |-- 正则表达式 [jobbole.py](ArticleSpider/spiders/jobbole.py) 17 | | |-- xpath (ArticleSpider/spiders/jobbole.py) 18 | | |-- css选择器 (ArticleSpider/spiders/*.py) 19 | | `-- ItemLoader 20 | |-- 进阶 21 | | |-- 图片验证码的处理(ArticleSpider/spiders/lagou.login_after_captcha) 22 | | |-- ip访问频率限制(ArticleSpider.middlewares.RandomProxyMiddleware) 23 | | `-- user-agent随机切换(ArticleSpider.middlewares.RandomUserAgentMiddleware) 24 | |-- 高级 25 | | |-- scrapy的原理 26 | | `-- 基于scrapy的中间件开发 27 | | |-- 动态网站的抓取处理 28 | | |-- 将selenium集成到scrapy中 29 | | `-- scrapy log配置 30 | `-- |后续(在此项目中没有体现,后续我将上传此部分代码) 31 | |-- scrapy-redis 32 | |-- 分布式爬虫原理 33 | |-- 分析scrapy-redis源码 34 | `-- 集成bloomfilter到scrapy-redis中 35 | `-- Elasticsearch (ArticleSpider.pipelines.ElasticsearchPipeline;)(ArticleSpider.items.JobBoleArticleItem.save_to_es;) 36 | |-- 安装 elasticsearch-rtf 37 | |-- 学习使用 elasticsearch-head、kibana 38 | |-- 学习使用 elasticsearch的Python API: elasticsearch-dsl 39 | `-- 利用elasticsearch和爬取到的数据+django框架搭建搜索网站(此部分代码将在以后上传) 40 |41 | 42 | **PS:使用此代码前,需创建mysql数据库,详见ArticleSpider/settings.py** 43 | -------------------------------------------------------------------------------- /articleexport.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /build/lib/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/build/lib/ArticleSpider/__init__.py -------------------------------------------------------------------------------- /build/lib/ArticleSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | import re 8 | import datetime 9 | import scrapy 10 | from scrapy.loader import ItemLoader 11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 12 | from ArticleSpider.utils.common import extract_num 13 | from ArticleSpider.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT 14 | from w3lib.html import remove_tags 15 | from ArticleSpider.models.es_types import ArticleType 16 | import redis 17 | 18 | 19 | from elasticsearch_dsl.connections import connections 20 | 21 | es = connections.create_connection(ArticleType._doc_type.using) 22 | 23 | redis_cli = redis.StrictRedis() 24 | 25 | 26 | class ArticlespiderItem(scrapy.Item): 27 | # define the fields for your item here like: 28 | # name = scrapy.Field() 29 | pass 30 | 31 | 32 | def date_convert(value): 33 | try: 34 | create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date() 35 | except Exception as e: 36 | create_date = datetime.datetime.now() 37 | return create_date 38 | 39 | 40 | def add_jobbole(value): 41 | return value+"-haifeng" 42 | 43 | 44 | def get_nums(value): 45 | # print(value) 46 | match_re = re.match(r'.*?(\d+).*', value) 47 | if match_re: 48 | nums = match_re.group(1) 49 | return nums 50 | else: 51 | return 0 52 | 53 | 54 | def remove_comment_tags(value): 55 | # 去掉tag中提取得评论 56 | if "评论" in value: 57 | return "" 58 | else: 59 | return value 60 | 61 | 62 | def return_value(value): 63 | return value 64 | 65 | 66 | def gen_suggests(index, info_tuple): 67 | # 根据字符串生成搜索建议数据 68 | # python工程师 title 10 69 | # python工程师 text 3 70 | # 不能覆盖,所以用set 71 | used_words = 
set() 72 | suggests = [] 73 | for text, weight in info_tuple: 74 | if text: 75 | # 调用es得analyze接口分析字符串 76 | words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter': ["lowercase"]}, body=text) 77 | analyzed_words = set(r["token"] for r in words["tokens"] if len(r["token"]) > 1) 78 | new_words = analyzed_words - used_words 79 | else: 80 | new_words = set() 81 | if new_words: 82 | suggests.append({"input": list(new_words), "weight": weight}) 83 | return suggests 84 | 85 | 86 | class ArticleItemLoader(ItemLoader): 87 | # 自定义ItemLoader 88 | default_output_processor = TakeFirst() 89 | 90 | 91 | class JobBoleArticleItem(scrapy.Item): 92 | # title = scrapy.Field( 93 | # input_processor=MapCompose(lambda x: x+"-jobbole", add_jobbole), 94 | # output_processor = TakeFirst() 95 | # ) 96 | title = scrapy.Field() 97 | 98 | create_date = scrapy.Field( 99 | input_processor=MapCompose(date_convert), 100 | # output_processor=TakeFirst() 101 | ) 102 | url = scrapy.Field() 103 | url_object_id = scrapy.Field( 104 | output_processor=MapCompose(return_value) 105 | ) 106 | front_image_url = scrapy.Field( 107 | output_processor=MapCompose(return_value) 108 | ) 109 | front_image_path = scrapy.Field() 110 | praise_nums = scrapy.Field( 111 | input_processor=MapCompose(get_nums) 112 | ) 113 | comment_nums = scrapy.Field( 114 | input_processor=MapCompose(get_nums) 115 | ) 116 | fav_nums = scrapy.Field( 117 | input_processor=MapCompose(get_nums) 118 | ) 119 | tags = scrapy.Field( 120 | input_processor=MapCompose(remove_comment_tags), 121 | output_processor=Join(","), 122 | ) 123 | content = scrapy.Field() 124 | 125 | def get_insert_sql(self): 126 | insert_sql = """ 127 | insert into jobbole(title, url, create_date, fav_nums) 128 | VALUES (%s, %s, %s, %s) 129 | ON DUPLICATE KEY 130 | UPDATE title=VALUES(title),url=VALUES(url), 131 | create_date=VALUES (create_date),fav_nums=VALUES (fav_nums) 132 | """ 133 | params = (self["title"], self["url"], self["create_date"], self["fav_nums"]) 134 | return insert_sql, params 135 | 136 | def save_to_es(self): 137 | article = ArticleType() 138 | article.title = self['title'] 139 | article.create_date = self["create_date"] 140 | article.content = remove_tags(self["content"]) 141 | article.front_image_url = self["front_image_url"] 142 | if "front_image_path" in self: 143 | article.front_image_path = self["front_image_path"] 144 | article.praise_nums = self["praise_nums"] 145 | article.fav_nums = self["fav_nums"] 146 | article.comment_nums = self["comment_nums"] 147 | article.url = self["url"] 148 | article.tags = self["tags"] 149 | article.meta.id = self["url_object_id"] 150 | 151 | # article.suggest = [{"input":[], "weight":2}] 152 | article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7))) 153 | 154 | article.save() 155 | 156 | redis_cli.incr("jobble_count") 157 | 158 | return 159 | 160 | 161 | class ZhihuQuestionItem(scrapy.Item): 162 | # 知乎的问题 Item 163 | zhihu_id = scrapy.Field() 164 | topics = scrapy.Field() 165 | url = scrapy.Field() 166 | title = scrapy.Field() 167 | content = scrapy.Field() 168 | answer_num = scrapy.Field() 169 | comments_num = scrapy.Field() 170 | watch_user_num = scrapy.Field() 171 | click_num = scrapy.Field() 172 | crawl_time = scrapy.Field() 173 | 174 | def get_insert_sql(self): 175 | insert_sql = """ 176 | insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 177 | watch_user_num, click_num, crawl_time) 178 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, 
%s) 179 | ON DUPLICATE KEY 180 | UPDATE content=VALUES(content),answer_num=VALUES(answer_num), 181 | comments_num=VALUES (comments_num),watch_user_num=VALUES (watch_user_num), 182 | click_num=VALUES (click_num) 183 | """ 184 | zhihu_id = self["zhihu_id"][0] 185 | topics = ",".join(self["topics"]) 186 | url = self["url"][0] 187 | title = "".join(self["title"]) 188 | content = "".join(self["content"]) 189 | answer_num = extract_num("".join(self["answer_num"])) 190 | comments_num = extract_num("".join(self["comments_num"])) 191 | watch_user_num = extract_num("".join(self["watch_user_num"])) 192 | click_num = extract_num("".join(self["click_num"])) 193 | crawl_time = datetime.datetime.now().strftime("") 194 | 195 | params = (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) 196 | 197 | return insert_sql, params 198 | 199 | 200 | class ZhihuAnswerItem(scrapy.Item): 201 | zhihu_id = scrapy.Field() 202 | url = scrapy.Field() 203 | question_id = scrapy.Field() 204 | author_id = scrapy.Field() 205 | content = scrapy.Field() 206 | praise_num = scrapy.Field() 207 | comments_num = scrapy.Field() 208 | create_time = scrapy.Field() 209 | update_time = scrapy.Field() 210 | crawl_time = scrapy.Field() 211 | 212 | def get_insert_sql(self): 213 | 214 | insert_sql = """ 215 | insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, praise_num, 216 | comments_num,create_time, update_time, crawl_time) 217 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 218 | ON DUPLICATE KEY 219 | UPDATE content=VALUES(content),comments_num=VALUES(comments_num), 220 | praise_num=VALUES (praise_num),update_time=VALUES (update_time) 221 | """ # on duplicate是mysql特有的语法 222 | create_time = datetime.datetime.fromtimestamp(self["create_time"]) 223 | update_time = datetime.datetime.fromtimestamp(self["update_time"]) 224 | 225 | params = ( 226 | self["zhuhu_id"], self["url"], self["question_id"], 227 | self["author_id"], self["content"], self["praise_num"], 228 | self["comments_num"], create_time, update_time, self["crawl_time"], 229 | ) 230 | 231 | 232 | def remove_splash(value): 233 | # 去掉工作城市得斜线 234 | return value.replace("/", "") 235 | 236 | 237 | def handle_jobaddr(value): 238 | addr_list = value.split("\n") 239 | addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"] 240 | return "".join(addr_list) 241 | 242 | 243 | class LagouJobItemLoader(ItemLoader): 244 | # 自定义ItemLoader 245 | default_out_processor = TakeFirst() 246 | 247 | 248 | class LagouJobItem(scrapy.Item): 249 | # 拉钩网职位信息 250 | url = scrapy.Field() 251 | url_object_id = scrapy.Field() 252 | title = scrapy.Field() 253 | salary = scrapy.Field() 254 | job_city = scrapy.Field( 255 | input_processor=MapCompose(remove_splash), 256 | ) 257 | work_years = scrapy.Field( 258 | input_processor=MapCompose(remove_splash), 259 | ) 260 | degree_need = scrapy.Field( 261 | input_processor=MapCompose(remove_splash), 262 | ) 263 | job_type = scrapy.Field() 264 | publish_time = scrapy.Field() 265 | tags = scrapy.Field( 266 | input_processor=MapCompose(Join(",")) 267 | ) 268 | job_advantage = scrapy.Field() 269 | job_desc = scrapy.Field() 270 | job_addr = scrapy.Field( 271 | input_processor=MapCompose(remove_tags), 272 | ) 273 | company_url = scrapy.Field() 274 | company_name = scrapy.Field() 275 | crawl_time = scrapy.Field() 276 | crawl_update_time = scrapy.Field() 277 | 278 | def get_insert_sql(self): 279 | insert_sql = """ 280 | insert into 281 | lagou_job(title, url, url_object_id, salary, 
job_city, work_years, degree_need, job_type,publish_time, 282 | tags, job_advantage, job_desc, job_addr, company_url, company_name, crawl_time, crawl_update_time) 283 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 284 | ON DUPLICATE KEY UPDATE 285 | salary=VALUES (salary), job_desc=VALUES (job_desc), crawl_update_time=VALUES (crawl_update_time) 286 | """ 287 | params = ( 288 | self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], 289 | self["degree_need"],self["job_type"], self["publish_time"], self["tags"], self["job_advantage"], 290 | self["job_desc"],self["job_addr"], self["company_url"], self["company_name"], 291 | self["crawl_time"].strftime(SQL_DATETIME_FORMAT), self["crawl_update_time"].strftime(SQL_DATETIME_FORMAT) 292 | ) 293 | 294 | return insert_sql, params 295 | 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | from fake_useragent import UserAgent 11 | from ArticleSpider.utils.crawl_xici_ip import GetIP 12 | 13 | 14 | class ArticlespiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 
55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class RandomUserAgentMiddleware(object): 63 | # 随机更换User-Agent 64 | def __init__(self, crawler): 65 | super(RandomUserAgentMiddleware, self).__init__() 66 | self.user_agent_list = crawler.settings.get("user_agent_list", []) 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | # from ArticleSpider.settings import user_agent_list 76 | # import random 77 | # request.headers.setdefault("User-Agent", user_agent_list[random.randint(0, len(user_agent_list)-1)]) 78 | 79 | def get_ua(): 80 | return getattr(self.ua, self.ua_type) 81 | 82 | # random_agent = get_ua() # 调试的时候用 83 | 84 | request.headers.setdefault("User-Agent", get_ua()) 85 | # request.meta["proxy"] = "http://113.128.90.192:48888" 86 | 87 | 88 | class RandomProxyMiddleware(object): 89 | # 动态设置ip代理 90 | def process_request(self, request, spider): 91 | get_ip = GetIP() 92 | request.meta["proxy"] = get_ip.get_random_ip() 93 | 94 | from selenium import webdriver 95 | from scrapy.http import HtmlResponse 96 | 97 | 98 | class JSPageMiddleware(object): 99 | # def __init__(self): 100 | # self.browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 101 | # super(JSPageMiddleware, self).__init__() 102 | 103 | # 通过chrome请求动态网页 104 | def process_request(self, request, spider): 105 | if spider.name == "jobbole": 106 | # chrome_opt = webdriver.ChromeOptions() 107 | # prefs = {"profile.managed_default_content_settings.images": 2} 108 | # chrome_opt.add_experimental_option("prefs", prefs) 109 | # browser = webdriver.Chrome( executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 110 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 111 | spider.browser.get(request.url) 112 | import time 113 | time.sleep(3) 114 | print("访问:{0}".format(request.url)) 115 | 116 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from datetime import datetime 7 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 8 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 9 | 10 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalysis 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(hosts=["localhost"]) # connection可以连接多台服务器 15 | 16 | 17 | class CustomAnalyzer(_CustomAnalysis): 18 | def get_analysis_definition(self): 19 | return {} 20 | 21 | ik_analyser = 
CustomAnalyzer("ik_max_word", filter=["lowercase"]) 22 | 23 | 24 | class ArticleType(DocType): 25 | # 伯乐在线文章类型 26 | # suggest = Completion(analyzer="ik_max_word") # 不能直接使用这个,由于源码问题,必须使用CustomAnalyzer 27 | suggest = Completion(analyzer=ik_analyser) 28 | title = Text(analyzer="ik_max_word") 29 | create_date = Date() 30 | url = Keyword() 31 | url_object_id = Keyword() 32 | front_image_url = Keyword() 33 | front_image_path = Keyword() 34 | praise_nums = Integer() 35 | comment_nums = Integer() 36 | fav_nums = Integer() 37 | tags = Text(analyzer="ik_max_word") 38 | content = Text(analyzer="ik_max_word") 39 | 40 | class Meta: 41 | index = "jobbole" 42 | doc_type = "article" 43 | 44 | 45 | if __name__ == "__main__": 46 | ArticleType.init() # 根据类,直接生成mapping, 47 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | import MySQLdb 11 | import MySQLdb.cursors 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from scrapy.exporters import JsonItemExporter 14 | from twisted.enterprise import adbapi 15 | from ArticleSpider.models.es_types import ArticleType 16 | from w3lib.html import remove_tags 17 | 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | # 自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | 29 | def process_item(self, item, spider): 30 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 31 | self.file.write(lines) 32 | return item 33 | 34 | def spider_closed(self, spider): 35 | self.file.close() 36 | 37 | 38 | class MysqlPipeline(object): 39 | # 采用同步得机制写入mysql 40 | def __init__(self): 41 | host = "localhost" 42 | user = "root" 43 | password = "123456" 44 | dbname = "jobble_article" 45 | self.conn = MySQLdb.connect(host=host, user=user, passwd=password, db=dbname, charset='utf8', use_unicode=True) 46 | self.cursor = self.conn.cursor() 47 | 48 | def process_item(self, item, spider): 49 | insert_sql = """ 50 | insert into jobbole(title, url, create_date, fav_nums) 51 | VALUES (%s, %s, %s, %s) 52 | """ 53 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 54 | self.conn.commit() 55 | 56 | 57 | class MysqlTwistedPipeline(object): 58 | def __init__(self, dbpool): 59 | self.dbpool = dbpool 60 | 61 | @classmethod 62 | def from_settings(cls, settings): 63 | dbparams = dict( 64 | host=settings['MYSQL_HOST'], 65 | dbname=settings['MYSQL_DBNAME'], 66 | user=settings['MYSQL_USER'], 67 | passwd=settings['MYSQL_PASSWORD'], 68 | charset='utf8', 69 | cursorclass=MySQLdb.cursors.DictCursor, 70 | use_unicode=True, 71 | ) 72 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparams) 73 | 74 | return cls(dbpool) 75 | 76 | def process_item(self, item, spider): 77 | # 使用twisted将mysql插入变成异步执行 78 | query = self.dbpool.runInteraction(self.do_insert, item) 79 | query.addErrback(self.handle_error, item, spider) # 处理异常 (self.handle_error, item, spider) 80 | 81 | def handle_error(self, failure, item, spider): 82 | # 处理异步插入的异常 83 | print(failure) 84 | 85 | def do_insert(self, cursor, 
item): 86 | # 执行具体的插入 87 | # if item.__class__.__name__ == "JobBoleArticleItem": 88 | insert_sql, params = item.get_insert_sql() 89 | 90 | cursor.execute(insert_sql, params) 91 | 92 | 93 | class JsonExporterPipeline(object): 94 | # 调用scrapy提供的json export 导出json文件 95 | def __init__(self): 96 | self.file = open('articleexport.json', 'wb') 97 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 98 | self.exporter.start_exporting() 99 | 100 | def close_spider(self, spider): 101 | self.exporter.finish_exporting() 102 | self.file.close() 103 | 104 | def process_item(self, item, spider): 105 | self.exporter.export_item(item) 106 | return item 107 | 108 | 109 | class ArticleImagePipeline(ImagesPipeline): 110 | def item_completed(self, results, item, info): 111 | if "front_image_url" in item: 112 | for ok, value in results: 113 | image_file_path = value["path"] 114 | item["front_image_path"] = image_file_path 115 | return item 116 | 117 | 118 | class ElasticsearchPipeline(object): 119 | # 将数据写入打到es中 120 | def process_item(self, item, spider): 121 | # 将 item 转换 为es的数据 122 | item.save_to_es() 123 | 124 | return item 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | # Scrapy settings for ArticleSpider project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | 13 | BOT_NAME = 'ArticleSpider' 14 | 15 | SPIDER_MODULES = ['ArticleSpider.spiders'] 16 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 17 | 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | 57 | DOWNLOADER_MIDDLEWARES = { 58 | 
'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543, 59 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 60 | # 'ArticleSpider.middlewares.RandomProxyMiddleware': 544, 61 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 62 | } 63 | 64 | 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'ArticleSpider.pipelines.ArticlespiderPipeline': 300, 76 | # 'scrapy.pipelines.images.ImagesPipeline': 1, 77 | 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 78 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 3, 79 | # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2, 80 | # 'ArticleSpider.pipelines.JsonExporterPipeline': 2, 81 | } 82 | IMAGES_URLS_FIELD = "front_image_url" 83 | project_dir = os.path.abspath(os.path.dirname(__file__)) 84 | IMAGES_STORE = os.path.join(project_dir, 'images') 85 | # print(IMAGES_STORE) 86 | 87 | import os 88 | import sys 89 | 90 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 91 | # print(os.path.join(BASE_DIR, 'ArticleSpider')) 92 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 93 | 94 | # sys.path.insert(0, r"G:\MyProgramFiles\Py3Code\ArticleSpider\ArticleSpider") 95 | 96 | user_agent_list = [ 97 | "", 98 | "", 99 | ] 100 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 101 | 102 | RANDOM_UA_TYPE = "random" 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 106 | #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | 125 | MYSQL_HOST = "127.0.0.1" 126 | MYSQL_DBNAME = "jobble_article" 127 | MYSQL_USER = "root" 128 | MYSQL_PASSWORD = "123456" 129 | 130 | # JOBDIR = "job_info/001" 131 | 132 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 133 | SQL_DATE_FORMAT = "%Y-%m-%d" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
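A minimal, self-contained sketch of how the RANDOM_UA_TYPE setting shown in settings.py above is consumed by RandomUserAgentMiddleware (see middlewares.py): the value is simply looked up as an attribute of fake_useragent.UserAgent, so "random", "chrome", "firefox" and the other types supported by the fake-useragent package all work. The only assumption is that fake-useragent is installed, which middlewares.py already requires.

    from fake_useragent import UserAgent

    ua = UserAgent()
    ua_type = "random"            # mirrors RANDOM_UA_TYPE in settings.py
    print(getattr(ua, ua_type))   # prints a randomly picked User-Agent string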
5 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import datetime 4 | import scrapy 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 8 | from ArticleSpider.utils.common import get_md5 9 | from scrapy.loader import ItemLoader 10 | from selenium import webdriver 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class JobboleSpider(scrapy.Spider): 16 | name = 'jobbole' 17 | allowed_domains = ['blog.jobbole.com'] 18 | start_urls = ['http://blog.jobbole.com/all-posts/'] 19 | 20 | def __init__(self): 21 | self.start_urls = ('http://blog.jobbole.com/all-posts/',) 22 | # # self.fail_urls = [] 23 | # self.browser = webdriver.Chrome( 24 | # executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 25 | # super(JobboleSpider, self).__init__() 26 | # dispatcher.connect(self.spider_close, signals.spider_closed) 27 | # 28 | # def spider_close(self, spider): 29 | # # 当爬虫退出的时候关闭chrome 30 | # print("spider closed") 31 | # self.browser.quit() 32 | # start_urls = ['http://blog.jobbole.com/112239/'] 33 | 34 | # 收集伯乐在线所有404的url以及404页面数 35 | # handle_httpstatus_list = [404] 36 | # 37 | # def __init__(self): 38 | # self.fail_urls = [] 39 | # dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 40 | # 41 | # def handle_spider_closed(self, spider, reason): 42 | # self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 43 | 44 | def parse(self, response): 45 | """ 46 | 1.获取文章列表中得文章url并交给scrapy下载后并进行解析 47 | 2.获取下一页得url 并交给scrapy进行下载, 下载完成后交给parse 48 | :param response: 49 | :return: 50 | """ 51 | # if response.status == 404: 52 | # self.fail_urls.append(response.url) 53 | # self.crawler.stats.inc_value("failed_url") 54 | 55 | # 解析列表中得所有文章url,然后下载 56 | post_nodes = response.css("#archive .floated-thumb .post-thumb a") 57 | for post_node in post_nodes: 58 | img_url = post_node.css("img::attr(src)").extract_first("") 59 | post_url = post_node.css("::attr(href)").extract_first("") 60 | pa_url = parse.urljoin(response.url, post_url) # 域名+url # response.url + post_url 61 | yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": img_url}, callback=self.parse_detail) 62 | 63 | # 提取下一页进行下载 64 | next_url = response.css(".next.page-numbers::attr(href)").extract_first("") 65 | if next_url: 66 | pa_url = parse.urljoin(response.url, next_url) 67 | yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse) 68 | 69 | def parse_detail(self, response): 70 | # article_item = JobBoleArticleItem() 71 | 72 | # # 提取文章详情页 73 | # # re_selector = response.xpath("/html/body/div[2]/div[3]/div[1]/div[1]/h1") # 最好不用这种 74 | # title = response.xpath('//*[@id="post-112239"]/div[1]/h1/text()').extract_first() 75 | # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·", 76 | # "").strip() 77 | # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0] 78 | # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0] 79 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 80 | # if match_re: 81 | # fav_nums = int(match_re.group(1)) 82 | # else: 83 | # fav_nums = 0 84 | # comment_nums = 
response.xpath("//a[@href='#article-comment']/span/text()").extract()[0] 85 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 86 | # if match_re: 87 | # comment_nums = int(match_re.group(1)) 88 | # else: 89 | # comment_nums = 0 90 | # 91 | # content = response.xpath("//div[@class='entry']").extract()[0] 92 | # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract() 93 | # tag_list = [element for element in tag_list if not element.strip().endswith("评论")] 94 | # tags = ",".join(tag_list) 95 | 96 | # 通过css选择器提取字段 97 | # front_image_url = response.meta.get("front_image_url", "") # 文章封面图 98 | # title = response.css(".entry-header h1::text").extract()[0] 99 | # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip() 100 | # praise_nums = response.css(".vote-post-up h10::text").extract()[0] 101 | # fav_nums = response.css(".bookmark-btn::text").extract()[0] 102 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 103 | # if match_re: 104 | # fav_nums = match_re.group(1) 105 | # else: 106 | # fav_nums = 0 107 | # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0] 108 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 109 | # if match_re: 110 | # comment_nums = match_re.group(1) 111 | # else: 112 | # comment_nums = 0 113 | # 114 | # content = response.css("div.entry").extract()[0] 115 | # tags = response.css("p.entry-meta-hide-on-mobile a::text").extract() 116 | # tag_list = [element for element in tags if not element.strip().endswith("评论")] 117 | # tags = ",".join(tag_list) 118 | # 119 | # article_item["title"] = title 120 | # article_item["url"] = response.url 121 | # article_item["url_object_id"] = get_md5(response.url) 122 | # try: 123 | # create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date() 124 | # except Exception as e: 125 | # create_date = datetime.datetime.now() 126 | # article_item["create_date"] = create_date 127 | # article_item["front_image_url"] = [front_image_url] 128 | # article_item["content"] = content 129 | # article_item["praise_nums"] = praise_nums 130 | # article_item["comment_nums"] = comment_nums 131 | # article_item["fav_nums"] = fav_nums 132 | # article_item["tags"] = tags 133 | 134 | # 通过Item Loader 加载item 135 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 136 | item_loader.add_css("title", ".entry-header h1::text") 137 | item_loader.add_value("url", response.url) 138 | item_loader.add_value("url_object_id", get_md5(response.url)) 139 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 140 | item_loader.add_value("front_image_url", [response.meta.get("front_image_url", "")]) 141 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 142 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 143 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 144 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 145 | item_loader.add_css("content", "div.entry") 146 | 147 | article_item = item_loader.load_item() 148 | # item_loader.add_xpath() 149 | 150 | yield article_item 151 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from 
ArticleSpider.items import LagouJobItem, LagouJobItemLoader 6 | from ArticleSpider.utils.common import get_md5 7 | import datetime 8 | 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com/'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | 21 | def parse_job(self, response): 22 | # 解析拉勾网的职位 23 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 24 | item_loader.add_css("title", "") 25 | item_loader.add_value("url", response.url) 26 | item_loader.add_value("url_object_id", get_md5(response.url)) 27 | item_loader.add_css("salary", ".job_request .salary::text") 28 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text") 29 | item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text") # 这里使用css ,是为了在学习时,熟悉css选择器用法 30 | item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") 31 | item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") 32 | 33 | item_loader.add_css("publish_time", ".publish_time::text") 34 | item_loader.add_css("tags", ".position-label.clearfix li::text") 35 | item_loader.add_css("job_advantage", ".job-advantage p::text") 36 | item_loader.add_css("job_desc", ".job_bt div") 37 | item_loader.add_css("job_addr", ".work_addr") 38 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 39 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 40 | item_loader.add_value("crawl_time", datetime.datetime.now()) 41 | # item_loader.add_css("crawl_update_time", datetime.datetime.now()) 42 | 43 | job_item = item_loader.load_item() # 这里先赋值给一个变量,是考虑到便于调试以及代码可读性,而不是为了代码简洁而直接return 44 | 45 | return job_item 46 | 47 | def parse_start_url(self, response): 48 | return [] 49 | 50 | def process_results(self, response, results): 51 | return results -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import json 5 | import datetime 6 | try: 7 | import urlparse as parse 8 | except: 9 | from urllib import parse 10 | 11 | from scrapy.loader import ItemLoader 12 | from ArticleSpider.items import ZhihuAnswerItem, ZhihuQuestionItem 13 | from ArticleSpider.settings import user_agent_list 14 | 15 | 16 | class ZhihuSpider(scrapy.Spider): 17 | name = 'zhihu' 18 | allowed_domains = ['www.zhihu.com'] 19 | start_urls = ['http://www.zhihu.com/'] 20 | 21 | # question的第一页answer得请求url 22 | start_answer_url = 
"https://www.zhihu.com/api/v4/questions/26234383/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}" 23 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 24 | 25 | 26 | headers = { 27 | "Host": "www.zhihu.com", 28 | "Referer": "https://www.zhihu.com/", 29 | "User-Agent": agent, 30 | 31 | } 32 | custom_settings = { 33 | "COOKIES_ENABLED": True 34 | } 35 | 36 | def parse(self, response): 37 | # 提取出html页面中的所有url 并跟踪url进行一些爬取 38 | # 如果提取得url中格式为 /question/xxx 就下载进行之后直接进入解析函数 39 | all_urls = response.css("a::attr(href)").extract() 40 | all_urls = [parse.urljoin(response.url, url) for url in all_urls] 41 | all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls) 42 | for url in all_urls: 43 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url) 44 | if match_obj: 45 | # 如果提取到得question相关页面则下载交由提取函数进行提取 46 | request_url = match_obj.group(1) 47 | # 简单的随机更换User-Agent 48 | # import random 49 | # random_index = random.randint(0, len(user_agent_list) - 1) 50 | # random_agent = user_agent_list[random_index] 51 | # self.headers["User-Agent"] = random_agent 52 | yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question) 53 | # break # debug时候用 54 | else: 55 | # 如果不是question 页面则直接进一步跟踪 56 | yield scrapy.Request(url, headers=self.headers, callback=self.parse) # debug 注释 57 | 58 | def parse_question(self, response): 59 | # 处理question页面, 从页面中提取question 具体item 60 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) 61 | if match_obj: 62 | question_id = int(match_obj.group(2)) 63 | 64 | if "QuestionHeader-title" in response.text: 65 | # 处理新版本 66 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 67 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 68 | item_loader.add_css("content", ".QuestionHeader-detail") 69 | item_loader.add_value("url", response.url) 70 | item_loader.add_value("zhihu_id", question_id) 71 | item_loader.add_css("answer_num", ".List-headerText span::text") 72 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 73 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 74 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 75 | 76 | else: 77 | # 处理知乎旧版本 78 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 79 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 80 | item_loader.add_css("content", ".QuestionHeader-detail") 81 | item_loader.add_value("url", response.url) 82 | item_loader.add_value("zhihu_id", question_id) 83 | item_loader.add_css("answer_num", ".List-headerText span::text") 84 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 85 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 86 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 87 | 88 | question_item = 
item_loader.load_item() 89 | yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) 90 | yield question_item # debug 注释 91 | 92 | def parse_answer(self, response): 93 | # 处理answer 94 | ans_json = json.loads(response.text) 95 | is_end = ans_json["paging"]["is_end"] 96 | # totals_answer = ans_json["paging"]["totals"] 97 | next_url = ans_json["paging"]["next"] 98 | 99 | # 提取answer的具体字段 100 | for answer in ans_json["data"]: 101 | answer_item = ZhihuAnswerItem() 102 | answer_item["zhihu_id"] = answer["id"] 103 | answer_item["url"] = answer["url"] 104 | answer_item["question_id"] = answer["question"]["id"] 105 | answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None 106 | answer_item["content"] = answer["content"] if "content" in answer else None 107 | answer_item["praise_num"] = answer["voteup_count"] 108 | answer_item["comments_num"] = answer["comment_count"] 109 | answer_item["create_time"] = answer["created_time"] 110 | answer_item["update_time"] = answer["updated_time"] 111 | answer_item["crawl_time"] = datetime.datetime.now() 112 | 113 | yield answer_item 114 | 115 | if not is_end: 116 | yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer) 117 | 118 | def start_requests(self): 119 | return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)] 120 | 121 | def login(self, response): 122 | 123 | response_text = response.text 124 | match_obj = re.match('.*name="_xsrf" value="(.*?)".*', response_text, re.DOTALL) 125 | if match_obj: 126 | print(match_obj.group(1)) 127 | _xsrf = match_obj.group(1) 128 | if _xsrf: 129 | 130 | post_data = { 131 | "_xsrf": _xsrf, 132 | "phone_num": '13342266862', 133 | "password": '553768563', 134 | "captcha": "", 135 | 136 | } 137 | import time 138 | t = str(int(time.time() * 1000)) 139 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t) 140 | yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha) # 重点 141 | 142 | def login_after_captcha(self, response): 143 | with open("captcha.gif", "wb") as f: 144 | f.write(response.body) 145 | f.close() 146 | 147 | from PIL import Image 148 | try: 149 | im = Image.open("captcha.gif") 150 | im.show() 151 | im.close() 152 | except: 153 | pass 154 | 155 | captcha = input("输入验证码\n>") 156 | 157 | post_data = response.meta.get("post_data", {}) 158 | post_url = "https://www.zhihu.com/login/phone_num" 159 | post_data["captcha"] = captcha 160 | return [scrapy.FormRequest( 161 | url=post_url, 162 | formdata=post_data, 163 | headers=self.headers, 164 | callback=self.check_login 165 | )] 166 | 167 | def check_login(self, response): 168 | # 验证服务器的返回数据是否成功 169 | text_json = json.loads(response.text) 170 | if "msg" in text_json and text_json["msg"] == "登录成功": 171 | for url in self.start_urls: 172 | yield scrapy.Request(url, dont_filter=True, headers=self.headers) 173 | 174 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from selenium import webdriver 7 | from scrapy.selector import Selector 8 | 9 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 10 | # 11 | # browser.get("https://item.taobao.com/item.htm?spm=2013.1.iteminfo.10.4b556901SPB44D&scm=1007.10010.52063.100200300000003&id=552169264763&pvid=19a525ca-6111-4648-98ab-0ff06f668623") 12 | # 13 | # print(browser.page_source) 14 | # 15 | # selector_ = Selector(text=browser.page_source) 16 | 17 | 18 | # browser.quit() 19 | 20 | # 设置chromedirver 21 | chrome_opt = webdriver.ChromeOptions() 22 | prefs = {"profile.managed_default_content_settings.images": 2} 23 | chrome_opt.add_experimental_option("prefs", prefs) 24 | browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 25 | browser.get("https://anta.tmall.com/") 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | import hashlib 6 | import re 7 | import webbrowser 8 | from webbrowser import Chrome 9 | 10 | 11 | def get_md5(url): 12 | if isinstance(url, str): 13 | url = url.encode("utf-8") 14 | m = hashlib.md5() 15 | m.update(url) 16 | return m.hexdigest() 17 | 18 | 19 | def extract_num(text): 20 | # 字符串中提取数字 21 | match_re = re.match(r'.*?(\d+).*', text) 22 | if match_re: 23 | nums = match_re.group(1) 24 | return nums 25 | 26 | 27 | def webtest(): 28 | # webbrowser.open("http://jobbole.com", new=0, autoraise=1) 29 | # webbrowser.open_new("http://jobbole.com") 30 | # webbrowser.open_new_tab("http://jobbole.com") 31 | webbrowser.register(name="chrome", klass=Chrome) 32 | webbrowser.get('chrome').open("http://jobbole.com") 33 | # .open('www.baidu.com', new=1, autoraise=True) 34 | 35 | chromePath = r'你的浏览器目录' # 例如我的:C:\***\***\***\***\Google\Chrome\Application\chrome.exe 36 | webbrowser.register('chrome', None, webbrowser.BackgroundBrowser(chromePath)) # 这里的'chrome'可以用其它任意名字,如chrome111,这里将想打开的浏览器保存到'chrome' 37 | webbrowser.get('chrome').open('www.baidu.com', new=1, autoraise=True) 38 | 39 | 40 | def choose(bool, a, b): 41 | return (bool and a or [b])[0] 42 | 43 | 44 | def reversed(sequence): 45 | x = [] 46 | for i in range(len(sequence)-1, -1, -1): 47 | # print(i) 48 | x.append(sequence[i]) 49 | # x = sequence[i] 50 | # print(sequence[i]) 51 | return x 52 | 53 | 54 | def to_list(t): 55 | return [i if not isinstance(i, tuple) else to_list(i) for i in t] 56 | 57 | # 58 | # def to_list(t): 59 | # return [i for i in t] 60 | 61 | if __name__ == '__main__': 62 | # webtest() 63 | # print(get_md5("http://jobbole.com")) 64 | # print(1) 65 | 66 | # print(choose(True, 1, 2)) 67 | # print(forxinreversed([1, 2, 3, 3, 4, 5])) 68 | # sequence = [1, 2, 3, 4, 5] 69 | # sequence.reverse() 70 | # print(sequence) 71 | # x = [sequence[i] for i in range(len(sequence)-1, -1, -1)] 72 | # print(x) 73 | 74 | # print(int('1234')) 75 | # print(float(12)) 76 | 
# print(str(98)) 77 | # print(list('abcd')) 78 | # print(dict.fromkeys(['name', 'age'])) 79 | # print(tuple([1, 2, 3, 4])) 80 | 81 | # a_list = [1, 2, [1, 2, 3], 3, 4, 5] 82 | # print(tuple(a_list)) 83 | # t = tuple(a_list) 84 | # t = (1, 2, (1, 2, 3), 3, 4) 85 | # print(t) 86 | # print(to_list(t)) 87 | 88 | # 10 89 | # L1 = [4, 1, 3, 2, 3, 5, 1] 90 | # L2 = [] 91 | # [L2.append(i) for i in L1 if i not in L2] 92 | # print(L2) 93 | 94 | from copy import deepcopy 95 | L1 = [1, [1, 2, 3], 2, 3] 96 | print("before copy L1: ", L1) 97 | L2 = L1.copy() 98 | L2[1][2] = 1 99 | print("after copy L2: ", L2) 100 | print("after copy L1: ", L1) 101 | L1 = [3, [3, 4, 5], 4, 5] 102 | print("before deepcopy L1: ", L1) 103 | L2 = deepcopy(L1) 104 | L2[1][2] = 1 105 | print("after deepcopy L2: ", L2) 106 | print("after deepcopy L1: ", L1) 107 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | from scrapy.selector import Selector 8 | import MySQLdb 9 | 10 | conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", db="article_spider", charset="utf8") 11 | cursor = conn.cursor() 12 | 13 | 14 | def crawl_ips(): 15 | # 爬取西刺得免费ip代理 16 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"} 17 | for i in range(2354): 18 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 19 | 20 | # print(re.text) 21 | selector = Selector(text=re.text) 22 | # all_trs = selector.css("#ip_list tr[class]:not([class='subtitle'])") 23 | all_trs = selector.css("#ip_list tr") 24 | 25 | ip_list = [] 26 | 27 | for tr in all_trs[1:]: 28 | speed_str = tr.css(".bar::attr(title)").extract()[0] 29 | if speed_str: 30 | speed = float(speed_str.split("秒")[0]) 31 | # ip = tr.css("td:nth-child[2]::text").extract()[0] # 报错 32 | all_text = tr.css("td::text").extract() 33 | ip = all_text[0] 34 | port = all_text[1] 35 | proxy_type = all_text[5] 36 | 37 | # lis = (ip, port, speed, proxy_type) 38 | # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type))) 39 | # print(':'.join(lis)) 40 | 41 | ip_list.append((ip, port, speed, proxy_type)) 42 | 43 | # print(all_trs) 44 | # for tr in all_trs: 45 | # # print(tr.extract()) 46 | # # ip = tr.xpath('/td[2]/text()').extract() 47 | # # port = tr.xpath('/td[3]/text()').extract() 48 | # # http_type = tr.xpath('/td[6]/text()').extract() 49 | # ip = tr.css('td:nth-child(2)::text').extract()[0] 50 | # port = tr.css('td:nth-child(3)::text').extract()[0] 51 | # speed = tr.css('td:nth-child(6)::text').extract()[0] 52 | # proxy_type = tr.css('td:nth-child(6)::text').extract()[0] 53 | # # print(ip, port) 54 | # # print(':'.join((str(ip), str(port), str(http_type)))) 55 | # print(':'.join((ip, port, speed, proxy_type))) 56 | # ip_list.append((ip, port, speed, proxy_type)) 57 | 58 | print(": ".join(ip_info)) 59 | 60 | for ip_info in ip_list: 61 | cursor.execute("insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')".format( 62 | ip_info[0], ip_info[1], ip_info[2], ip_info[3]) 63 | ) # 传递字符串一定要加单引号 64 | 65 | conn.commit() 66 | 67 | # for tr in all_trs[1:]: 68 | # # speed_str = tr.css(".bar::attr(title)").extract()[0] 69 | # # if speed_str: 70 | # # speed = 
float(speed_str.split("秒")[0]) 71 | # all_texts = tr.css("td::text").extract() 72 | # print(all_texts) 73 | 74 | # print(re.text) 75 | 76 | 77 | class GetIP(object): 78 | def delete_ip(self, ip): 79 | # 从数据库中删除无效的ip 80 | delete_sql = """ 81 | delete from proxy_ip where ip='{0}' 82 | """.format(ip) 83 | cursor.execute(delete_sql) 84 | conn.commit() 85 | return True 86 | 87 | def judge_ip(self, ip, port, proxy_type): 88 | # 判断IP 是否可用 89 | http_url = "proxy_type://www.baidu.com" 90 | proxy_url = "{3}://{0}:{1}".format(ip, port, proxy_type) 91 | response = None 92 | try: 93 | proxy_dict = { 94 | proxy_type: proxy_url 95 | } 96 | response = requests.get(http_url, proxies=proxy_dict) 97 | return True 98 | except Exception as e: 99 | print("invalid ip and port") 100 | self.delete_ip(ip) 101 | return False 102 | else: 103 | code = response.status_code 104 | if code >= 200 and code < 300: 105 | print("effective ip") 106 | return True 107 | else: 108 | print("invalid ip and port") 109 | self.delete_ip(ip) 110 | return False 111 | 112 | def get_random_ip(self): 113 | # 从数据库中随机获取一个可用的ip 114 | random_sql = """ 115 | SELECT ip,port FROM proxy_ip 116 | ORDER BY RAND() 117 | LIMIT 1 118 | """ 119 | cursor.execute(random_sql) 120 | for ip_info in cursor.fetchall(): 121 | ip = ip_info[0] 122 | port = ip_info[1] 123 | proxy_type = ip_info[3] if ip_info[3] and ip_info[3] != "" else 'http' 124 | 125 | judge_re = self.judge_ip(ip, port, proxy_type) 126 | if judge_re: 127 | return "{3}://{0}:{1}".format(ip, port, proxy_type) 128 | else: 129 | return self.get_random_ip() 130 | 131 | if __name__ == '__main__': 132 | # crawl_ips() 133 | get_ip = GetIP() 134 | print(get_ip.get_random_ip()) -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | try: 8 | import cookielib 9 | except: 10 | import http.cookiejar as cookielib 11 | 12 | import re 13 | 14 | 15 | 16 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 17 | header = { 18 | "Host": "www.zhihu.com", 19 | "Referer": "https://www.zhihu.com/", 20 | "User-Agent": agent, 21 | "Cookie":'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1' 22 | } 23 | 24 | 25 | session = requests.session() 26 | 
--------------------------------------------------------------------------------
/build/lib/ArticleSpider/utils/zhihu_login_requests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__Author__ = "HackFun"

import requests
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib

import re


agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
header = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
    "Cookie": 'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1'
}


session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")

try:
    session.cookies.load(ignore_discard=True)
    print("cookies loaded")
except:
    print("cookies could not be loaded")


def get_xsrf():
    # Pull the _xsrf token out of the zhihu home page
    response = session.get("https://www.zhihu.com/", headers=header)
    text = response.text
    # print(text)
    # earlier attempts, superseded by the re.search below:
    # match_obj = re.match(r'.*?required.*', text)
    # match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text.strip())
    # match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text, re.DOTALL)
    match_obj = re.search('.*name="_xsrf" value="(.*?)".*', text)
    if match_obj:
        print(match_obj.group(1))
        return match_obj.group(1)
    else:
        return ""


def is_login():
    # The inbox page redirects to the login page when the session is not authenticated
    inbox_url = "https://www.zhihu.com/inbox"
    response = session.get(inbox_url, headers=header, allow_redirects=False)
    if response.status_code != 200:
        return False
    else:
        return True


def get_index():
    response = session.get("https://www.zhihu.com/", headers=header)
    with open("index_page.html", "wb") as f:
        f.write(response.text.encode("utf-8"))
    print("ok")


def get_captcha():
    import time
    t = str(int(time.time() * 1000))
    # captcha_url = "https://www.zhihu.com/captcha.gif?r=1504099197089&type=login&lang=cn"
    captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t)
    response = session.get(captcha_url, headers=header)
    with open("captcha.gif", "wb") as f:
        f.write(response.content)

    from PIL import Image
    try:
        im = Image.open("captcha.gif")
        im.show()
        im.close()
    except:
        pass

    captcha = input("please type in the captcha\n>")  # raw_input under Python 2
    return captcha


def zhihu_login(account, password):
    # Log in to zhihu
    if re.match(r"^1\d{10}", account):
        print("logging in with a phone number")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": get_xsrf(),
            "phone_num": account,
            "password": password,
            "captcha": get_captcha(),
            # captcha:{"img_size":[200,44],"input_points":[[21.375,28],[156.375,33]]}# 2017-08-30
            "captcha_type": 'cn'
        }
    elif "@" in account:
        # Treat the account as an email address
        print("logging in with an email address")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": get_xsrf(),
            "email": account,
            "password": password,
            "captcha": get_captcha(),
            "captcha_type": 'cn'
        }
    else:
        print("unsupported account format")
        return

    response_text = session.post(post_url, data=post_data, headers=header)
    session.cookies.save()


if __name__ == '__main__':
    zhihu_login("13342266862", "553768563")
    # print(get_xsrf())
    get_index()
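
zhihu_login() persists its session in cookies.txt through an LWPCookieJar, which is what allows a later run, or the project's Scrapy zhihu spider, to skip the password-and-captcha step while the saved cookies are still valid. The real spiders/zhihu.py is not part of this section, so the snippet below is only a hedged sketch of that reuse pattern; the spider name and URL are placeholders, not the project's code.

# Sketch only: hand the cookies saved by zhihu_login() to a Scrapy request.
# Assumes cookies.txt was written by session.cookies.save() above.
import scrapy
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib


class ZhihuCookieSketchSpider(scrapy.Spider):
    name = "zhihu_cookie_sketch"  # placeholder, not the project's zhihu spider

    def start_requests(self):
        jar = cookielib.LWPCookieJar(filename="cookies.txt")
        jar.load(ignore_discard=True)
        cookies = {cookie.name: cookie.value for cookie in jar}
        yield scrapy.Request("https://www.zhihu.com/", cookies=cookies, callback=self.parse)

    def parse(self, response):
        self.logger.info("fetched %s with the saved zhihu cookies", response.url)
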
--------------------------------------------------------------------------------
/dbs/ArticleSpider.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/dbs/ArticleSpider.db
--------------------------------------------------------------------------------
/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/dbs/default.db
--------------------------------------------------------------------------------
/eggs/ArticleSpider/1504775520.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/eggs/ArticleSpider/1504775520.egg
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__Author__ = "HackFun"

from scrapy.cmdline import execute
import sys
import os

# print(os.path.dirname(os.path.abspath(__file__)))
# G:\MyProgramFiles\Py3Code\ArticleSpider
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])  # execute("scrapy crawl jobbole".split())
# execute(["scrapy", "crawl", "zhihu"])  # execute("scrapy crawl jobbole".split())

# # test
# def a(max):
#     n, a, b = 0, 0, 1
#     while n < max:
#         yield b
#         # print b
#         a, b = b, a + b
#         n = n + 1
#
# def b(max):
#     while max > 0:
#         yield max
#         max = max - 1
#
#
# f = a(5)
# f = b(5)
#
# print(f.__next__())
# print(f.__next__())
# print(f.__next__())
# print(f.__next__())
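
main.py drives the crawl through scrapy.cmdline.execute, which behaves exactly like running scrapy crawl jobbole from the project root and is what makes IDE breakpoint debugging of the spider possible. An equivalent, slightly more explicit way to do the same thing is Scrapy's CrawlerProcess API; the snippet below is only an illustrative alternative, not code from this repository.

# Alternative to scrapy.cmdline.execute: run the spider in-process.
# Illustrative only; main.py above is what the project actually uses.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() finds ArticleSpider.settings via scrapy.cfg,
# so this must be run from the project root.
process = CrawlerProcess(get_project_settings())
process.crawl("jobbole")   # same spider name as in spiders/jobbole.py
process.start()            # blocks until the crawl finishes
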
--------------------------------------------------------------------------------
/project.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
Metadata-Version: 1.0
Name: project
Version: 1.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
--------------------------------------------------------------------------------
/project.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
setup.py
ArticleSpider/__init__.py
ArticleSpider/items.py
ArticleSpider/middlewares.py
ArticleSpider/pipelines.py
ArticleSpider/settings.py
ArticleSpider/models/__init__.py
ArticleSpider/models/es_types.py
ArticleSpider/spiders/__init__.py
ArticleSpider/spiders/jobbole.py
ArticleSpider/spiders/jobboleBackupto16.py
ArticleSpider/spiders/lagou.py
ArticleSpider/spiders/zhihu.py
ArticleSpider/tools/__init__.py
ArticleSpider/tools/selenium_spider.py
ArticleSpider/utils/__init__.py
ArticleSpider/utils/common.py
ArticleSpider/utils/crawl_xici_ip.py
ArticleSpider/utils/zhihu_login_requests.py
project.egg-info/PKG-INFO
project.egg-info/SOURCES.txt
project.egg-info/dependency_links.txt
project.egg-info/entry_points.txt
project.egg-info/top_level.txt
--------------------------------------------------------------------------------
/project.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/project.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
[scrapy]
settings = ArticleSpider.settings

--------------------------------------------------------------------------------
/project.egg-info/top_level.txt:
--------------------------------------------------------------------------------
ArticleSpider
--------------------------------------------------------------------------------
/requestments.txt:
--------------------------------------------------------------------------------
arrow==0.12.0
asn1crypto==0.23.0
attrs==17.3.0
Automat==0.6.0
backports.functools-lru-cache==1.2.1
certifi==2017.11.5
cffi==1.11.2
chardet==3.0.4
constantly==15.1.0
cryptography==2.1.4
cssselect==1.0.1
Django==2.0
elasticsearch==5.5.1
elasticsearch-dsl==5.3.0
hyperlink==17.3.1
idna==2.6
incremental==17.5.0
lxml==4.1.1
mysql-connector-python==8.0.5
mysqlclient==1.3.12
parsel==1.2.0
pyasn1==0.4.2
pyasn1-modules==0.2.1
pycparser==2.18
PyDispatcher==2.0.5
PyMySQL==0.7.11
pyOpenSSL==17.5.0
python-dateutil==2.6.1
pytz==2017.3
queuelib==1.4.2
redis==2.10.6
requests==2.18.4
Scrapy==1.4.0
selenium==3.8.0
service-identity==17.0.0
six==1.11.0
Twisted==17.9.0
urllib3==1.21.1
w3lib==1.18.0
zope.interface==4.4.3
pillow==5.0.0
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ArticleSpider.settings

[deploy:haifeng]
url = http://localhost:6800/
project = ArticleSpider
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Automatically created by: scrapyd-deploy

from setuptools import setup, find_packages

setup(
    name = 'project',
    version = '1.0',
    packages = find_packages(),
    entry_points = {'scrapy': ['settings = ArticleSpider.settings']},
)
--------------------------------------------------------------------------------
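
The [deploy:haifeng] target in scrapy.cfg, together with the scrapyd-generated setup.py, eggs/ and dbs/ directories, indicates the project is deployed to a local scrapyd instance (typically with scrapyd-client, e.g. scrapyd-deploy haifeng -p ArticleSpider). Once the egg is deployed, a crawl can be scheduled over scrapyd's HTTP API; the snippet below is a small illustrative sketch of that call, assuming scrapyd is listening on localhost:6800 as configured above.

# Illustrative only: schedule the jobbole spider on the scrapyd instance from scrapy.cfg.
import requests

resp = requests.post(
    "http://localhost:6800/schedule.json",  # url taken from the [deploy:haifeng] section
    data={"project": "ArticleSpider", "spider": "jobbole"},
)
print(resp.json())  # on success scrapyd replies with {"status": "ok", "jobid": "..."}
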