├── .gitignore ├── ArticleSpider ├── __init__.py ├── images │ └── full │ │ ├── 35011d6168be00e949624c665041dc724e3ad786.jpg │ │ ├── 5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg │ │ ├── b906ccd28196f4dac2561cbd4120d442b442de87.jpg │ │ ├── d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg │ │ └── f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg ├── items.py ├── middlewares.py ├── models │ ├── __init__.py │ └── es_types.py ├── pipelines.py ├── settings.py ├── spiders │ ├── __init__.py │ ├── jobbole.py │ ├── lagou.py │ └── zhihu.py ├── tools │ ├── __init__.py │ ├── debug.log │ └── selenium_spider.py └── utils │ ├── __init__.py │ ├── captcha.gif │ ├── common.py │ ├── crawl_xici_ip.py │ └── zhihu_login_requests.py ├── README.md ├── articleexport.json ├── build └── lib │ └── ArticleSpider │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── models │ ├── __init__.py │ └── es_types.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── jobbole.py │ ├── lagou.py │ └── zhihu.py │ ├── tools │ ├── __init__.py │ └── selenium_spider.py │ └── utils │ ├── __init__.py │ ├── common.py │ ├── crawl_xici_ip.py │ └── zhihu_login_requests.py ├── dbs ├── ArticleSpider.db └── default.db ├── eggs └── ArticleSpider │ └── 1504775520.egg ├── main.py ├── project.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt └── top_level.txt ├── requestments.txt ├── scrapy.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | dep.sh 3 | config.yaml 4 | config.json 5 | env_*.py 6 | log/ 7 | tmp/ 8 | test.py 9 | .DS_Store 10 | apidoc/ 11 | .idea/ 12 | venv/ -------------------------------------------------------------------------------- /ArticleSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/__init__.py -------------------------------------------------------------------------------- /ArticleSpider/images/full/35011d6168be00e949624c665041dc724e3ad786.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/35011d6168be00e949624c665041dc724e3ad786.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/5630c3d4f4f3f685aa06d550f00a4cdef5d9a492.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/b906ccd28196f4dac2561cbd4120d442b442de87.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/b906ccd28196f4dac2561cbd4120d442b442de87.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/d1b17b98748a74826464a08e6d30a4ee1b15b171.jpg -------------------------------------------------------------------------------- /ArticleSpider/images/full/f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/images/full/f5d4611e2094787b56111e18af07ad9bad0e04f6.jpg -------------------------------------------------------------------------------- /ArticleSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | import re 8 | import datetime 9 | import scrapy 10 | from scrapy.loader import ItemLoader 11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 12 | from ArticleSpider.utils.common import extract_num 13 | from ArticleSpider.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT 14 | from w3lib.html import remove_tags 15 | from ArticleSpider.models.es_types import ArticleType 16 | import redis 17 | 18 | 19 | from elasticsearch_dsl.connections import connections 20 | 21 | es = connections.create_connection(ArticleType._doc_type.using) 22 | 23 | redis_cli = redis.StrictRedis() 24 | 25 | 26 | class ArticlespiderItem(scrapy.Item): 27 | # define the fields for your item here like: 28 | # name = scrapy.Field() 29 | pass 30 | 31 | 32 | def date_convert(value): 33 | try: 34 | create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date() 35 | except Exception as e: 36 | create_date = datetime.datetime.now() 37 | return create_date 38 | 39 | 40 | def add_jobbole(value): 41 | return value+"-haifeng" 42 | 43 | 44 | def get_nums(value): 45 | # print(value) 46 | match_re = re.match(r'.*?(\d+).*', value) 47 | if match_re: 48 | nums = match_re.group(1) 49 | return nums 50 | else: 51 | return 0 52 | 53 | 54 | def remove_comment_tags(value): 55 | # 去掉tag中提取得评论 56 | if "评论" in value: 57 | return "" 58 | else: 59 | return value 60 | 61 | 62 | def return_value(value): 63 | return value 64 | 65 | 66 | def gen_suggests(index, info_tuple): 67 | # 根据字符串生成搜索建议数据 68 | # python工程师 title 10 69 | # python工程师 text 3 70 | # 不能覆盖,所以用set 71 | used_words = set() 72 | suggests = [] 73 | for text, weight in info_tuple: 74 | if text: 75 | # 调用es得analyze接口分析字符串 76 | words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter': ["lowercase"]}, body=text) 77 | analyzed_words = set(r["token"] for r in words["tokens"] if len(r["token"]) > 1) 78 | new_words = analyzed_words - used_words 79 | else: 80 | new_words = set() 81 | if new_words: 82 | suggests.append({"input": list(new_words), "weight": weight}) 83 | return suggests 84 | 85 | 86 | class ArticleItemLoader(ItemLoader): 87 | # 自定义ItemLoader 88 | default_output_processor = TakeFirst() 89 | 90 | 91 | class JobBoleArticleItem(scrapy.Item): 92 | # title = scrapy.Field( 93 | # input_processor=MapCompose(lambda x: x+"-jobbole", add_jobbole), 94 | # output_processor = TakeFirst() 95 | # ) 96 | title = scrapy.Field() 97 | 98 | create_date = scrapy.Field( 99 | input_processor=MapCompose(date_convert), 100 | # output_processor=TakeFirst() 101 | ) 102 | url = scrapy.Field() 103 | url_object_id = scrapy.Field( 104 | 
output_processor=MapCompose(return_value) 105 | ) 106 | front_image_url = scrapy.Field( 107 | output_processor=MapCompose(return_value) 108 | ) 109 | front_image_path = scrapy.Field() 110 | praise_nums = scrapy.Field( 111 | input_processor=MapCompose(get_nums) 112 | ) 113 | comment_nums = scrapy.Field( 114 | input_processor=MapCompose(get_nums) 115 | ) 116 | fav_nums = scrapy.Field( 117 | input_processor=MapCompose(get_nums) 118 | ) 119 | tags = scrapy.Field( 120 | input_processor=MapCompose(remove_comment_tags), 121 | output_processor=Join(","), 122 | ) 123 | content = scrapy.Field() 124 | 125 | def get_insert_sql(self): 126 | insert_sql = """ 127 | insert into jobbole(title, url, create_date, fav_nums) 128 | VALUES (%s, %s, %s, %s) 129 | ON DUPLICATE KEY 130 | UPDATE title=VALUES(title),url=VALUES(url), 131 | create_date=VALUES (create_date),fav_nums=VALUES (fav_nums) 132 | """ 133 | params = (self["title"], self["url"], self["create_date"], self["fav_nums"]) 134 | return insert_sql, params 135 | 136 | def save_to_es(self): 137 | article = ArticleType() 138 | article.title = self['title'] 139 | article.create_date = self["create_date"] 140 | article.content = remove_tags(self["content"]) 141 | article.front_image_url = self["front_image_url"] 142 | if "front_image_path" in self: 143 | article.front_image_path = self["front_image_path"] 144 | article.praise_nums = self["praise_nums"] 145 | article.fav_nums = self["fav_nums"] 146 | article.comment_nums = self["comment_nums"] 147 | article.url = self["url"] 148 | article.tags = self["tags"] 149 | article.meta.id = self["url_object_id"] 150 | 151 | # article.suggest = [{"input":[], "weight":2}] 152 | article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7))) 153 | 154 | article.save() 155 | 156 | redis_cli.incr("jobble_count") 157 | 158 | return 159 | 160 | 161 | class ZhihuQuestionItem(scrapy.Item): 162 | # 知乎的问题 Item 163 | zhihu_id = scrapy.Field() 164 | topics = scrapy.Field() 165 | url = scrapy.Field() 166 | title = scrapy.Field() 167 | content = scrapy.Field() 168 | answer_num = scrapy.Field() 169 | comments_num = scrapy.Field() 170 | watch_user_num = scrapy.Field() 171 | click_num = scrapy.Field() 172 | crawl_time = scrapy.Field() 173 | 174 | def get_insert_sql(self): 175 | insert_sql = """ 176 | insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 177 | watch_user_num, click_num, crawl_time) 178 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 179 | ON DUPLICATE KEY 180 | UPDATE content=VALUES(content),answer_num=VALUES(answer_num), 181 | comments_num=VALUES (comments_num),watch_user_num=VALUES (watch_user_num), 182 | click_num=VALUES (click_num) 183 | """ 184 | zhihu_id = self["zhihu_id"][0] 185 | topics = ",".join(self["topics"]) 186 | url = self["url"][0] 187 | title = "".join(self["title"]) 188 | content = "".join(self["content"]) 189 | answer_num = extract_num("".join(self["answer_num"])) 190 | comments_num = extract_num("".join(self["comments_num"])) 191 | watch_user_num = extract_num("".join(self["watch_user_num"])) 192 | click_num = extract_num("".join(self["click_num"])) 193 | crawl_time = datetime.datetime.now().strftime("") 194 | 195 | params = (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) 196 | 197 | return insert_sql, params 198 | 199 | 200 | class ZhihuAnswerItem(scrapy.Item): 201 | zhihu_id = scrapy.Field() 202 | url = scrapy.Field() 203 | question_id = scrapy.Field() 204 | 
author_id = scrapy.Field() 205 | content = scrapy.Field() 206 | praise_num = scrapy.Field() 207 | comments_num = scrapy.Field() 208 | create_time = scrapy.Field() 209 | update_time = scrapy.Field() 210 | crawl_time = scrapy.Field() 211 | 212 | def get_insert_sql(self): 213 | 214 | insert_sql = """ 215 | insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, praise_num, 216 | comments_num,create_time, update_time, crawl_time) 217 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 218 | ON DUPLICATE KEY 219 | UPDATE content=VALUES(content),comments_num=VALUES(comments_num), 220 | praise_num=VALUES (praise_num),update_time=VALUES (update_time) 221 | """ # ON DUPLICATE KEY UPDATE is MySQL-specific syntax 222 | create_time = datetime.datetime.fromtimestamp(self["create_time"]) 223 | update_time = datetime.datetime.fromtimestamp(self["update_time"]) 224 | 225 | params = ( 226 | self["zhihu_id"], self["url"], self["question_id"], 227 | self["author_id"], self["content"], self["praise_num"], 228 | self["comments_num"], create_time, update_time, self["crawl_time"], 229 | ) 230 | return insert_sql, params 231 | 232 | def remove_splash(value): 233 | # strip the slash from the job city value 234 | return value.replace("/", "") 235 | 236 | 237 | def handle_jobaddr(value): 238 | addr_list = value.split("\n") 239 | addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"] 240 | return "".join(addr_list) 241 | 242 | 243 | class LagouJobItemLoader(ItemLoader): 244 | # custom ItemLoader 245 | default_output_processor = TakeFirst() 246 | 247 | 248 | class LagouJobItem(scrapy.Item): 249 | # Lagou job posting item 250 | url = scrapy.Field() 251 | url_object_id = scrapy.Field() 252 | title = scrapy.Field() 253 | salary = scrapy.Field() 254 | job_city = scrapy.Field( 255 | input_processor=MapCompose(remove_splash), 256 | ) 257 | work_years = scrapy.Field( 258 | input_processor=MapCompose(remove_splash), 259 | ) 260 | degree_need = scrapy.Field( 261 | input_processor=MapCompose(remove_splash), 262 | ) 263 | job_type = scrapy.Field() 264 | publish_time = scrapy.Field() 265 | tags = scrapy.Field( 266 | output_processor=Join(",") 267 | ) 268 | job_advantage = scrapy.Field() 269 | job_desc = scrapy.Field() 270 | job_addr = scrapy.Field( 271 | input_processor=MapCompose(remove_tags), 272 | ) 273 | company_url = scrapy.Field() 274 | company_name = scrapy.Field() 275 | crawl_time = scrapy.Field() 276 | crawl_update_time = scrapy.Field() 277 | 278 | def get_insert_sql(self): 279 | insert_sql = """ 280 | insert into 281 | lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need, job_type,publish_time, 282 | tags, job_advantage, job_desc, job_addr, company_url, company_name, crawl_time, crawl_update_time) 283 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 284 | ON DUPLICATE KEY UPDATE 285 | salary=VALUES (salary), job_desc=VALUES (job_desc), crawl_update_time=VALUES (crawl_update_time) 286 | """ 287 | params = ( 288 | self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], 289 | self["degree_need"],self["job_type"], self["publish_time"], self["tags"], self["job_advantage"], 290 | self["job_desc"],self["job_addr"], self["company_url"], self["company_name"], 291 | self["crawl_time"].strftime(SQL_DATETIME_FORMAT), self["crawl_update_time"].strftime(SQL_DATETIME_FORMAT) 292 | ) 293 | 294 | return insert_sql, params 295 | 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /ArticleSpider/middlewares.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | from fake_useragent import UserAgent 11 | from ArticleSpider.utils.crawl_xici_ip import GetIP 12 | 13 | 14 | class ArticlespiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 
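# Note: this spider middleware is the stock Scrapy project template and is not enabled in SPIDER_MIDDLEWARES in settings.py; the project's custom logic lives in the downloader middlewares defined further down in this file (RandomUserAgentMiddleware, RandomProxyMiddleware, JSPageMiddleware).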
55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class RandomUserAgentMiddleware(object): 63 | # 随机更换User-Agent 64 | def __init__(self, crawler): 65 | super(RandomUserAgentMiddleware, self).__init__() 66 | self.user_agent_list = crawler.settings.get("user_agent_list", []) 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | # from ArticleSpider.settings import user_agent_list 76 | # import random 77 | # request.headers.setdefault("User-Agent", user_agent_list[random.randint(0, len(user_agent_list)-1)]) 78 | 79 | def get_ua(): 80 | return getattr(self.ua, self.ua_type) 81 | 82 | # random_agent = get_ua() # 调试的时候用 83 | 84 | request.headers.setdefault("User-Agent", get_ua()) 85 | # request.meta["proxy"] = "http://113.128.90.192:48888" 86 | 87 | 88 | class RandomProxyMiddleware(object): 89 | # 动态设置ip代理 90 | def process_request(self, request, spider): 91 | get_ip = GetIP() 92 | request.meta["proxy"] = get_ip.get_random_ip() 93 | 94 | from selenium import webdriver 95 | from scrapy.http import HtmlResponse 96 | 97 | 98 | class JSPageMiddleware(object): 99 | # def __init__(self): 100 | # self.browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 101 | # super(JSPageMiddleware, self).__init__() 102 | 103 | # 通过chrome请求动态网页 104 | def process_request(self, request, spider): 105 | if spider.name == "jobbole": 106 | # chrome_opt = webdriver.ChromeOptions() 107 | # prefs = {"profile.managed_default_content_settings.images": 2} 108 | # chrome_opt.add_experimental_option("prefs", prefs) 109 | # browser = webdriver.Chrome( executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 110 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 111 | spider.browser.get(request.url) 112 | import time 113 | time.sleep(3) 114 | print("访问:{0}".format(request.url)) 115 | 116 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from datetime import datetime 7 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 8 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 9 | 10 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalysis 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(hosts=["localhost"]) # connection可以连接多台服务器 15 | 16 | 17 | class CustomAnalyzer(_CustomAnalysis): 18 | def get_analysis_definition(self): 19 | return {} 20 | 21 | ik_analyser = CustomAnalyzer("ik_max_word", 
filter=["lowercase"]) 22 | 23 | 24 | class ArticleType(DocType): 25 | # 伯乐在线文章类型 26 | # suggest = Completion(analyzer="ik_max_word") # 不能直接使用这个,由于源码问题,必须使用CustomAnalyzer 27 | suggest = Completion(analyzer=ik_analyser) 28 | title = Text(analyzer="ik_max_word") 29 | create_date = Date() 30 | url = Keyword() 31 | url_object_id = Keyword() 32 | front_image_url = Keyword() 33 | front_image_path = Keyword() 34 | praise_nums = Integer() 35 | comment_nums = Integer() 36 | fav_nums = Integer() 37 | tags = Text(analyzer="ik_max_word") 38 | content = Text(analyzer="ik_max_word") 39 | 40 | class Meta: 41 | index = "jobbole" 42 | doc_type = "article" 43 | 44 | 45 | if __name__ == "__main__": 46 | ArticleType.init() # 根据类,直接生成mapping, 47 | -------------------------------------------------------------------------------- /ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | import MySQLdb 11 | import MySQLdb.cursors 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from scrapy.exporters import JsonItemExporter 14 | from twisted.enterprise import adbapi 15 | from ArticleSpider.models.es_types import ArticleType 16 | from w3lib.html import remove_tags 17 | 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | # 自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | 29 | def process_item(self, item, spider): 30 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 31 | self.file.write(lines) 32 | return item 33 | 34 | def spider_closed(self, spider): 35 | self.file.close() 36 | 37 | 38 | class MysqlPipeline(object): 39 | # 采用同步得机制写入mysql 40 | def __init__(self): 41 | host = "localhost" 42 | user = "root" 43 | password = "123456" 44 | dbname = "jobble_article" 45 | self.conn = MySQLdb.connect(host=host, user=user, passwd=password, db=dbname, charset='utf8', use_unicode=True) 46 | self.cursor = self.conn.cursor() 47 | 48 | def process_item(self, item, spider): 49 | insert_sql = """ 50 | insert into jobbole(title, url, create_date, fav_nums) 51 | VALUES (%s, %s, %s, %s) 52 | """ 53 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 54 | self.conn.commit() 55 | 56 | 57 | class MysqlTwistedPipeline(object): 58 | def __init__(self, dbpool): 59 | self.dbpool = dbpool 60 | 61 | @classmethod 62 | def from_settings(cls, settings): 63 | dbparams = dict( 64 | host=settings['MYSQL_HOST'], 65 | dbname=settings['MYSQL_DBNAME'], 66 | user=settings['MYSQL_USER'], 67 | passwd=settings['MYSQL_PASSWORD'], 68 | charset='utf8', 69 | cursorclass=MySQLdb.cursors.DictCursor, 70 | use_unicode=True, 71 | ) 72 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparams) 73 | 74 | return cls(dbpool) 75 | 76 | def process_item(self, item, spider): 77 | # 使用twisted将mysql插入变成异步执行 78 | query = self.dbpool.runInteraction(self.do_insert, item) 79 | query.addErrback(self.handle_error, item, spider) # 处理异常 (self.handle_error, item, spider) 80 | 81 | def handle_error(self, failure, item, spider): 82 | # 处理异步插入的异常 83 | print(failure) 84 | 85 | def do_insert(self, cursor, item): 86 | # 执行具体的插入 87 | # if 
item.__class__.__name__ == "JobBoleArticleItem": 88 | insert_sql, params = item.get_insert_sql() 89 | 90 | cursor.execute(insert_sql, params) 91 | 92 | 93 | class JsonExporterPipeline(object): 94 | # 调用scrapy提供的json export 导出json文件 95 | def __init__(self): 96 | self.file = open('articleexport.json', 'wb') 97 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 98 | self.exporter.start_exporting() 99 | 100 | def close_spider(self, spider): 101 | self.exporter.finish_exporting() 102 | self.file.close() 103 | 104 | def process_item(self, item, spider): 105 | self.exporter.export_item(item) 106 | return item 107 | 108 | 109 | class ArticleImagePipeline(ImagesPipeline): 110 | def item_completed(self, results, item, info): 111 | if "front_image_url" in item: 112 | for ok, value in results: 113 | image_file_path = value["path"] 114 | item["front_image_path"] = image_file_path 115 | return item 116 | 117 | 118 | class ElasticsearchPipeline(object): 119 | # 将数据写入打到es中 120 | def process_item(self, item, spider): 121 | # 将 item 转换 为es的数据 122 | item.save_to_es() 123 | 124 | return item 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | # Scrapy settings for ArticleSpider project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | 13 | BOT_NAME = 'ArticleSpider' 14 | 15 | SPIDER_MODULES = ['ArticleSpider.spiders'] 16 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 17 | 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | 57 | DOWNLOADER_MIDDLEWARES = { 58 | 'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543, 59 | # 
'ArticleSpider.middlewares.JSPageMiddleware': 1, 60 | # 'ArticleSpider.middlewares.RandomProxyMiddleware': 544, 61 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 62 | } 63 | 64 | 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'ArticleSpider.pipelines.ArticlespiderPipeline': 300, 76 | # 'scrapy.pipelines.images.ImagesPipeline': 1, 77 | 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 78 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 3, 79 | # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2, 80 | # 'ArticleSpider.pipelines.JsonExporterPipeline': 2, 81 | } 82 | IMAGES_URLS_FIELD = "front_image_url" 83 | project_dir = os.path.abspath(os.path.dirname(__file__)) 84 | IMAGES_STORE = os.path.join(project_dir, 'images') 85 | # print(IMAGES_STORE) 86 | 87 | import os 88 | import sys 89 | 90 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 91 | # print(os.path.join(BASE_DIR, 'ArticleSpider')) 92 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 93 | 94 | # sys.path.insert(0, r"G:\MyProgramFiles\Py3Code\ArticleSpider\ArticleSpider") 95 | 96 | user_agent_list = [ 97 | "", 98 | "", 99 | ] 100 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 101 | 102 | RANDOM_UA_TYPE = "random" 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 106 | #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | 125 | MYSQL_HOST = "127.0.0.1" 126 | MYSQL_DBNAME = "jobble_article" 127 | MYSQL_USER = "root" 128 | MYSQL_PASSWORD = "123456" 129 | 130 | # JOBDIR = "job_info/001" 131 | 132 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 133 | SQL_DATE_FORMAT = "%Y-%m-%d" -------------------------------------------------------------------------------- /ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
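# Spiders defined in this package: jobbole (伯乐在线 articles), lagou (拉勾网 job postings, CrawlSpider based) and zhihu (知乎 questions and answers); each can be run with scrapy crawl <name>, e.g. scrapy crawl jobbole.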
5 | -------------------------------------------------------------------------------- /ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import datetime 4 | import scrapy 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 8 | from ArticleSpider.utils.common import get_md5 9 | from scrapy.loader import ItemLoader 10 | from selenium import webdriver 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class JobboleSpider(scrapy.Spider): 16 | name = 'jobbole' 17 | allowed_domains = ['blog.jobbole.com'] 18 | start_urls = ['http://blog.jobbole.com/all-posts/'] 19 | 20 | def __init__(self): 21 | self.start_urls = ('http://blog.jobbole.com/all-posts/',) 22 | # # self.fail_urls = [] 23 | # self.browser = webdriver.Chrome( 24 | # executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 25 | # super(JobboleSpider, self).__init__() 26 | # dispatcher.connect(self.spider_close, signals.spider_closed) 27 | # 28 | # def spider_close(self, spider): 29 | # # 当爬虫退出的时候关闭chrome 30 | # print("spider closed") 31 | # self.browser.quit() 32 | # start_urls = ['http://blog.jobbole.com/112239/'] 33 | 34 | # 收集伯乐在线所有404的url以及404页面数 35 | # handle_httpstatus_list = [404] 36 | # 37 | # def __init__(self): 38 | # self.fail_urls = [] 39 | # dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 40 | # 41 | # def handle_spider_closed(self, spider, reason): 42 | # self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 43 | 44 | def parse(self, response): 45 | """ 46 | 1.获取文章列表中得文章url并交给scrapy下载后并进行解析 47 | 2.获取下一页得url 并交给scrapy进行下载, 下载完成后交给parse 48 | :param response: 49 | :return: 50 | """ 51 | # if response.status == 404: 52 | # self.fail_urls.append(response.url) 53 | # self.crawler.stats.inc_value("failed_url") 54 | 55 | # 解析列表中得所有文章url,然后下载 56 | post_nodes = response.css("#archive .floated-thumb .post-thumb a") 57 | for post_node in post_nodes: 58 | img_url = post_node.css("img::attr(src)").extract_first("") 59 | post_url = post_node.css("::attr(href)").extract_first("") 60 | pa_url = parse.urljoin(response.url, post_url) # 域名+url # response.url + post_url 61 | yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": img_url}, callback=self.parse_detail) 62 | 63 | # 提取下一页进行下载 64 | next_url = response.css(".next.page-numbers::attr(href)").extract_first("") 65 | if next_url: 66 | pa_url = parse.urljoin(response.url, next_url) 67 | yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse) 68 | 69 | def parse_detail(self, response): 70 | # article_item = JobBoleArticleItem() 71 | 72 | # # 提取文章详情页 73 | # # re_selector = response.xpath("/html/body/div[2]/div[3]/div[1]/div[1]/h1") # 最好不用这种 74 | # title = response.xpath('//*[@id="post-112239"]/div[1]/h1/text()').extract_first() 75 | # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·", 76 | # "").strip() 77 | # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0] 78 | # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0] 79 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 80 | # if match_re: 81 | # fav_nums = int(match_re.group(1)) 82 | # else: 83 | # fav_nums = 0 84 | # comment_nums = 
response.xpath("//a[@href='#article-comment']/span/text()").extract()[0] 85 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 86 | # if match_re: 87 | # comment_nums = int(match_re.group(1)) 88 | # else: 89 | # comment_nums = 0 90 | # 91 | # content = response.xpath("//div[@class='entry']").extract()[0] 92 | # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract() 93 | # tag_list = [element for element in tag_list if not element.strip().endswith("评论")] 94 | # tags = ",".join(tag_list) 95 | 96 | # 通过css选择器提取字段 97 | # front_image_url = response.meta.get("front_image_url", "") # 文章封面图 98 | # title = response.css(".entry-header h1::text").extract()[0] 99 | # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip() 100 | # praise_nums = response.css(".vote-post-up h10::text").extract()[0] 101 | # fav_nums = response.css(".bookmark-btn::text").extract()[0] 102 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 103 | # if match_re: 104 | # fav_nums = match_re.group(1) 105 | # else: 106 | # fav_nums = 0 107 | # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0] 108 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 109 | # if match_re: 110 | # comment_nums = match_re.group(1) 111 | # else: 112 | # comment_nums = 0 113 | # 114 | # content = response.css("div.entry").extract()[0] 115 | # tags = response.css("p.entry-meta-hide-on-mobile a::text").extract() 116 | # tag_list = [element for element in tags if not element.strip().endswith("评论")] 117 | # tags = ",".join(tag_list) 118 | # 119 | # article_item["title"] = title 120 | # article_item["url"] = response.url 121 | # article_item["url_object_id"] = get_md5(response.url) 122 | # try: 123 | # create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date() 124 | # except Exception as e: 125 | # create_date = datetime.datetime.now() 126 | # article_item["create_date"] = create_date 127 | # article_item["front_image_url"] = [front_image_url] 128 | # article_item["content"] = content 129 | # article_item["praise_nums"] = praise_nums 130 | # article_item["comment_nums"] = comment_nums 131 | # article_item["fav_nums"] = fav_nums 132 | # article_item["tags"] = tags 133 | 134 | # 通过Item Loader 加载item 135 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 136 | item_loader.add_css("title", ".entry-header h1::text") 137 | item_loader.add_value("url", response.url) 138 | item_loader.add_value("url_object_id", get_md5(response.url)) 139 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 140 | item_loader.add_value("front_image_url", [response.meta.get("front_image_url", "")]) 141 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 142 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 143 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 144 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 145 | item_loader.add_css("content", "div.entry") 146 | 147 | article_item = item_loader.load_item() 148 | # item_loader.add_xpath() 149 | 150 | yield article_item 151 | -------------------------------------------------------------------------------- /ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from 
ArticleSpider.items import LagouJobItem, LagouJobItemLoader 6 | from ArticleSpider.utils.common import get_md5 7 | import datetime 8 | 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com/'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | 21 | def parse_job(self, response): 22 | # 解析拉勾网的职位 23 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 24 | item_loader.add_css("title", "") 25 | item_loader.add_value("url", response.url) 26 | item_loader.add_value("url_object_id", get_md5(response.url)) 27 | item_loader.add_css("salary", ".job_request .salary::text") 28 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text") 29 | item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text") # 这里使用css ,是为了在学习时,熟悉css选择器用法 30 | item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") 31 | item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") 32 | 33 | item_loader.add_css("publish_time", ".publish_time::text") 34 | item_loader.add_css("tags", ".position-label.clearfix li::text") 35 | item_loader.add_css("job_advantage", ".job-advantage p::text") 36 | item_loader.add_css("job_desc", ".job_bt div") 37 | item_loader.add_css("job_addr", ".work_addr") 38 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 39 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 40 | item_loader.add_value("crawl_time", datetime.datetime.now()) 41 | # item_loader.add_css("crawl_update_time", datetime.datetime.now()) 42 | 43 | job_item = item_loader.load_item() # 这里先赋值给一个变量,是考虑到便于调试以及代码可读性,而不是为了代码简洁而直接return 44 | 45 | return job_item 46 | 47 | def parse_start_url(self, response): 48 | return [] 49 | 50 | def process_results(self, response, results): 51 | return results -------------------------------------------------------------------------------- /ArticleSpider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import json 5 | import datetime 6 | try: 7 | import urlparse as parse 8 | except: 9 | from urllib import parse 10 | 11 | from scrapy.loader import ItemLoader 12 | from ArticleSpider.items import ZhihuAnswerItem, ZhihuQuestionItem 13 | from ArticleSpider.settings import user_agent_list 14 | 15 | 16 | class ZhihuSpider(scrapy.Spider): 17 | name = 'zhihu' 18 | allowed_domains = ['www.zhihu.com'] 19 | start_urls = ['http://www.zhihu.com/'] 20 | 21 | # question的第一页answer得请求url 22 | start_answer_url = "https://www.zhihu.com/api/v4/questions/26234383/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}" 23 | 
agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 24 | 25 | 26 | headers = { 27 | "Host": "www.zhihu.com", 28 | "Referer": "https://www.zhihu.com/", 29 | "User-Agent": agent, 30 | 31 | } 32 | custom_settings = { 33 | "COOKIES_ENABLED": True 34 | } 35 | 36 | def parse(self, response): 37 | # 提取出html页面中的所有url 并跟踪url进行一些爬取 38 | # 如果提取得url中格式为 /question/xxx 就下载进行之后直接进入解析函数 39 | all_urls = response.css("a::attr(href)").extract() 40 | all_urls = [parse.urljoin(response.url, url) for url in all_urls] 41 | all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls) 42 | for url in all_urls: 43 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url) 44 | if match_obj: 45 | # 如果提取到得question相关页面则下载交由提取函数进行提取 46 | request_url = match_obj.group(1) 47 | # 简单的随机更换User-Agent 48 | # import random 49 | # random_index = random.randint(0, len(user_agent_list) - 1) 50 | # random_agent = user_agent_list[random_index] 51 | # self.headers["User-Agent"] = random_agent 52 | yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question) 53 | # break # debug时候用 54 | else: 55 | # 如果不是question 页面则直接进一步跟踪 56 | yield scrapy.Request(url, headers=self.headers, callback=self.parse) # debug 注释 57 | 58 | def parse_question(self, response): 59 | # 处理question页面, 从页面中提取question 具体item 60 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) 61 | if match_obj: 62 | question_id = int(match_obj.group(2)) 63 | 64 | if "QuestionHeader-title" in response.text: 65 | # 处理新版本 66 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 67 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 68 | item_loader.add_css("content", ".QuestionHeader-detail") 69 | item_loader.add_value("url", response.url) 70 | item_loader.add_value("zhihu_id", question_id) 71 | item_loader.add_css("answer_num", ".List-headerText span::text") 72 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 73 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 74 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 75 | 76 | else: 77 | # 处理知乎旧版本 78 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 79 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 80 | item_loader.add_css("content", ".QuestionHeader-detail") 81 | item_loader.add_value("url", response.url) 82 | item_loader.add_value("zhihu_id", question_id) 83 | item_loader.add_css("answer_num", ".List-headerText span::text") 84 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 85 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 86 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 87 | 88 | question_item = item_loader.load_item() 89 | yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) 90 | yield question_item # debug 注释 91 | 92 | def parse_answer(self, response): 93 | # 处理answer 94 | ans_json = json.loads(response.text) 95 | is_end = ans_json["paging"]["is_end"] 96 | # totals_answer = ans_json["paging"]["totals"] 97 | next_url = ans_json["paging"]["next"] 98 | 99 | # 提取answer的具体字段 100 | for answer in ans_json["data"]: 101 | answer_item = ZhihuAnswerItem() 102 | answer_item["zhihu_id"] = answer["id"] 103 | answer_item["url"] = answer["url"] 104 | answer_item["question_id"] = answer["question"]["id"] 105 | 
answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None 106 | answer_item["content"] = answer["content"] if "content" in answer else None 107 | answer_item["praise_num"] = answer["voteup_count"] 108 | answer_item["comments_num"] = answer["comment_count"] 109 | answer_item["create_time"] = answer["created_time"] 110 | answer_item["update_time"] = answer["updated_time"] 111 | answer_item["crawl_time"] = datetime.datetime.now() 112 | 113 | yield answer_item 114 | 115 | if not is_end: 116 | yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer) 117 | 118 | def start_requests(self): 119 | return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)] 120 | 121 | def login(self, response): 122 | 123 | response_text = response.text 124 | match_obj = re.match('.*name="_xsrf" value="(.*?)".*', response_text, re.DOTALL) 125 | if match_obj: 126 | print(match_obj.group(1)) 127 | _xsrf = match_obj.group(1) 128 | if _xsrf: 129 | 130 | post_data = { 131 | "_xsrf": _xsrf, 132 | "phone_num": '13342266862', 133 | "password": '553768563', 134 | "captcha": "", 135 | 136 | } 137 | import time 138 | t = str(int(time.time() * 1000)) 139 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t) 140 | yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha) # 重点 141 | 142 | def login_after_captcha(self, response): 143 | with open("captcha.gif", "wb") as f: 144 | f.write(response.body) 145 | f.close() 146 | 147 | from PIL import Image 148 | try: 149 | im = Image.open("captcha.gif") 150 | im.show() 151 | im.close() 152 | except: 153 | pass 154 | 155 | captcha = input("输入验证码\n>") 156 | 157 | post_data = response.meta.get("post_data", {}) 158 | post_url = "https://www.zhihu.com/login/phone_num" 159 | post_data["captcha"] = captcha 160 | return [scrapy.FormRequest( 161 | url=post_url, 162 | formdata=post_data, 163 | headers=self.headers, 164 | callback=self.check_login 165 | )] 166 | 167 | def check_login(self, response): 168 | # 验证服务器的返回数据是否成功 169 | text_json = json.loads(response.text) 170 | if "msg" in text_json and text_json["msg"] == "登录成功": 171 | for url in self.start_urls: 172 | yield scrapy.Request(url, dont_filter=True, headers=self.headers) 173 | 174 | -------------------------------------------------------------------------------- /ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/tools/debug.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/tools/debug.log -------------------------------------------------------------------------------- /ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from selenium import webdriver 7 | from scrapy.selector import Selector 8 | 9 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 10 | # 11 | # 
browser.get("https://item.taobao.com/item.htm?spm=2013.1.iteminfo.10.4b556901SPB44D&scm=1007.10010.52063.100200300000003&id=552169264763&pvid=19a525ca-6111-4648-98ab-0ff06f668623") 12 | # 13 | # print(browser.page_source) 14 | # 15 | # selector_ = Selector(text=browser.page_source) 16 | 17 | 18 | # browser.quit() 19 | 20 | # 设置chromedirver 21 | chrome_opt = webdriver.ChromeOptions() 22 | prefs = {"profile.managed_default_content_settings.images": 2} 23 | chrome_opt.add_experimental_option("prefs", prefs) 24 | browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 25 | browser.get("https://anta.tmall.com/") 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /ArticleSpider/utils/captcha.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/ArticleSpider/utils/captcha.gif -------------------------------------------------------------------------------- /ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | import hashlib 6 | import re 7 | import webbrowser 8 | from webbrowser import Chrome 9 | 10 | 11 | def get_md5(url): 12 | if isinstance(url, str): 13 | url = url.encode("utf-8") 14 | m = hashlib.md5() 15 | m.update(url) 16 | return m.hexdigest() 17 | 18 | 19 | def extract_num(text): 20 | # 字符串中提取数字 21 | match_re = re.match(r'.*?(\d+).*', text) 22 | if match_re: 23 | nums = match_re.group(1) 24 | return nums 25 | 26 | 27 | def webtest(): 28 | # webbrowser.open("http://jobbole.com", new=0, autoraise=1) 29 | # webbrowser.open_new("http://jobbole.com") 30 | # webbrowser.open_new_tab("http://jobbole.com") 31 | webbrowser.register(name="chrome", klass=Chrome) 32 | webbrowser.get('chrome').open("http://jobbole.com") 33 | # .open('www.baidu.com', new=1, autoraise=True) 34 | 35 | chromePath = r'你的浏览器目录' # 例如我的:C:\***\***\***\***\Google\Chrome\Application\chrome.exe 36 | webbrowser.register('chrome', None, webbrowser.BackgroundBrowser(chromePath)) # 这里的'chrome'可以用其它任意名字,如chrome111,这里将想打开的浏览器保存到'chrome' 37 | webbrowser.get('chrome').open('www.baidu.com', new=1, autoraise=True) 38 | 39 | 40 | 41 | # 42 | # def to_list(t): 43 | # return [i for i in t] 44 | 45 | if __name__ == '__main__': 46 | webtest() 47 | print(get_md5("http://jobbole.com")) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /ArticleSpider/utils/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | from scrapy.selector import Selector 8 | import MySQLdb 9 | 10 | conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", db="article_spider", charset="utf8") 11 | cursor = conn.cursor() 12 | 13 | 14 | def crawl_ips(): 15 | # 爬取西刺得免费ip代理 16 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/57.0.2987.133 Safari/537.36"} 17 | for i in range(2354): 18 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 19 | 20 | # print(re.text) 21 | selector = Selector(text=re.text) 22 | # all_trs = selector.css("#ip_list tr[class]:not([class='subtitle'])") 23 | all_trs = selector.css("#ip_list tr") 24 | 25 | ip_list = [] 26 | 27 | for tr in all_trs[1:]: 28 | speed_str = tr.css(".bar::attr(title)").extract()[0] 29 | if speed_str: 30 | speed = float(speed_str.split("秒")[0]) 31 | # ip = tr.css("td:nth-child[2]::text").extract()[0] # 报错 32 | all_text = tr.css("td::text").extract() 33 | ip = all_text[0] 34 | port = all_text[1] 35 | proxy_type = all_text[5] 36 | 37 | # lis = (ip, port, speed, proxy_type) 38 | # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type))) 39 | # print(':'.join(lis)) 40 | 41 | ip_list.append((ip, port, speed, proxy_type)) 42 | 43 | # print(all_trs) 44 | # for tr in all_trs: 45 | # # print(tr.extract()) 46 | # # ip = tr.xpath('/td[2]/text()').extract() 47 | # # port = tr.xpath('/td[3]/text()').extract() 48 | # # http_type = tr.xpath('/td[6]/text()').extract() 49 | # ip = tr.css('td:nth-child(2)::text').extract()[0] 50 | # port = tr.css('td:nth-child(3)::text').extract()[0] 51 | # speed = tr.css('td:nth-child(6)::text').extract()[0] 52 | # proxy_type = tr.css('td:nth-child(6)::text').extract()[0] 53 | # # print(ip, port) 54 | # # print(':'.join((str(ip), str(port), str(http_type)))) 55 | # print(':'.join((ip, port, speed, proxy_type))) 56 | # ip_list.append((ip, port, speed, proxy_type)) 57 | 58 | print(": ".join(ip_info)) 59 | 60 | for ip_info in ip_list: 61 | cursor.execute("insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')".format( 62 | ip_info[0], ip_info[1], ip_info[2], ip_info[3]) 63 | ) # 传递字符串一定要加单引号 64 | 65 | conn.commit() 66 | 67 | # for tr in all_trs[1:]: 68 | # # speed_str = tr.css(".bar::attr(title)").extract()[0] 69 | # # if speed_str: 70 | # # speed = float(speed_str.split("秒")[0]) 71 | # all_texts = tr.css("td::text").extract() 72 | # print(all_texts) 73 | 74 | # print(re.text) 75 | 76 | 77 | class GetIP(object): 78 | def delete_ip(self, ip): 79 | # 从数据库中删除无效的ip 80 | delete_sql = """ 81 | delete from proxy_ip where ip='{0}' 82 | """.format(ip) 83 | cursor.execute(delete_sql) 84 | conn.commit() 85 | return True 86 | 87 | def judge_ip(self, ip, port, proxy_type): 88 | # 判断IP 是否可用 89 | http_url = "proxy_type://www.baidu.com" 90 | proxy_url = "{3}://{0}:{1}".format(ip, port, proxy_type) 91 | response = None 92 | try: 93 | proxy_dict = { 94 | proxy_type: proxy_url 95 | } 96 | response = requests.get(http_url, proxies=proxy_dict) 97 | return True 98 | except Exception as e: 99 | print("invalid ip and port") 100 | self.delete_ip(ip) 101 | return False 102 | else: 103 | code = response.status_code 104 | if code >= 200 and code < 300: 105 | print("effective ip") 106 | return True 107 | else: 108 | print("invalid ip and port") 109 | self.delete_ip(ip) 110 | return False 111 | 112 | def get_random_ip(self): 113 | # 从数据库中随机获取一个可用的ip 114 | random_sql = """ 115 | SELECT ip,port FROM proxy_ip 116 | ORDER BY RAND() 117 | LIMIT 1 118 | """ 119 | cursor.execute(random_sql) 120 | for ip_info in cursor.fetchall(): 121 | ip = ip_info[0] 122 | port = ip_info[1] 123 | proxy_type = ip_info[3] if ip_info[3] and ip_info[3] != "" else 'http' 124 | 125 | judge_re = self.judge_ip(ip, port, proxy_type) 126 | if judge_re: 127 | return "{3}://{0}:{1}".format(ip, port, proxy_type) 
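# a working proxy string was returned above; when judge_ip() fails it has already deleted the dead row from proxy_ip, and the else branch below recurses to draw another random candidate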
128 | else: 129 | return self.get_random_ip() 130 | 131 | if __name__ == '__main__': 132 | # crawl_ips() 133 | get_ip = GetIP() 134 | print(get_ip.get_random_ip()) -------------------------------------------------------------------------------- /ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | try: 8 | import cookielib 9 | except: 10 | import http.cookiejar as cookielib 11 | 12 | import re 13 | 14 | 15 | 16 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 17 | header = { 18 | "Host": "www.zhihu.com", 19 | "Referer": "https://www.zhihu.com/", 20 | "User-Agent": agent, 21 | "Cookie":'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1' 22 | } 23 | 24 | 25 | session = requests.session() 26 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 27 | 28 | try: 29 | session.cookies.load(ignore_discard=True) 30 | print("cookies已被加载") 31 | except: 32 | print("cookies未能加载") 33 | 34 | 35 | def get_xsrf(): 36 | response = session.get("https://www.zhihu.com/", headers=header) 37 | # print(response.text) 38 | 39 | # text = '' 40 | # text = '' 41 | text = response.text 42 | print(text) 43 | match_obj = re.match(r'.*?required.*', text) 44 | match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text.strip()) 45 | match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text, re.DOTALL) 46 | match_obj = re.search('.*name="_xsrf" value="(.*?)".*', text) 47 | if match_obj: 48 | print(match_obj.group(1)) 49 | return match_obj.group(1) 50 | else: 51 | return "" 52 | 53 | 54 | def is_login(): 55 | inbox_url = "https://www.zhihu.com/inbox" 56 | response = session.get(inbox_url, headers=header, allow_redirects=False) 57 | if response.status_code != 200: 58 | return False 59 | else: 60 | return True 61 | 62 | 63 | def get_index(): 64 | response = session.get("https://www.zhihu.com/", headers=header) 65 | with open("index_page.html", "wb") as f: 66 | f.write(response.text.encode("utf-8")) 67 | print("ok") 68 | 69 | 70 | def get_captcha(): 71 | import time 72 | t = str(int(time.time()*1000)) 73 | captcha_url = "https://www.zhihu.com/captcha.gif?r=1504099197089&type=login&lang=cn" 74 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn" 75 | t = session.get(captcha_url, headers=header) 76 | 
with open("captcha.gif", "wb") as f: 77 | f.write(t.content) 78 | f.close() 79 | 80 | from PIL import Image 81 | try: 82 | im = Image.open("captcha.gif") 83 | im.show() 84 | im.close() 85 | except: 86 | pass 87 | 88 | captcha = input("输入验证码\n>") # python2 中是 raw_input 89 | return captcha 90 | 91 | 92 | def zhihu_login(account, password): 93 | # 知乎登陆 94 | if re.match("^1\d{10}", account): 95 | print("手机号码登陆") 96 | post_url = "https://www.zhihu.com/login/phone_num" 97 | post_data = { 98 | "_xsrf": get_xsrf(), 99 | "phone_num": account, 100 | "password": password, 101 | "captcha": get_captcha(), 102 | # captcha:{"img_size":[200,44],"input_points":[[21.375,28],[156.375,33]]}# 2017-08-30 103 | "captcha_type": 'cn' 104 | } 105 | else: 106 | if "@" in account: 107 | # 判断用户名是否为邮箱 108 | print("邮箱方式登陆") 109 | post_url = "https://www.zhihu.com/login/email" 110 | post_data = { 111 | "_xsrf": get_xsrf(), 112 | "email": account, 113 | "password": password, 114 | "captcha": get_captcha(), 115 | "captcha_type": 'cn' 116 | } 117 | 118 | response_text = session.post(post_url, data=post_data, headers=header) 119 | session.cookies.save() 120 | 121 | if __name__ == '__main__': 122 | zhihu_login("13342266862", "553768563") 123 | # print(get_xsrf()) 124 | get_index() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArticleSpider 2 | 通过scrapy,爬取知乎,伯乐在线,拉钩网 3 | 4 | **注:** 5 | 6 | > 这是一个进阶项目,需要有一定的爬虫知识,如果不是很懂基本的爬虫原理,请自行学习一下爬虫基础知识。 7 | 我有一个对应的仓库[MyPythonForSpider](http://git.oschina.net/hackfun/MyPythonForSpider "baidumusicspider"),是一个单线程爬取百度音乐数据的实例,比较适合刚入门的朋友。 8 | 9 | 10 | 11 | **这是一个基于web抓取框架[scrapy](https://baike.baidu.com/item/scrapy/7914913?fr=aladdin "scrapy"),实现的对于知乎,伯乐在线,拉勾网的爬取。** 12 | 13 | ### 涉及到的知识点 14 |
15 | |-- 基础 16 | | |-- 正则表达式 [jobbole.py](ArticleSpider/spiders/jobbole.py) 17 | | |-- xpath (ArticleSpider/spiders/jobbole.py) 18 | | |-- css选择器 (ArticleSpider/spiders/*.py) 19 | | `-- ItemLoader 20 | |-- 进阶 21 | | |-- 图片验证码的处理(ArticleSpider/spiders/lagou.login_after_captcha) 22 | | |-- ip访问频率限制(ArticleSpider.middlewares.RandomProxyMiddleware) 23 | | `-- user-agent随机切换(ArticleSpider.middlewares.RandomUserAgentMiddleware) 24 | |-- 高级 25 | | |-- scrapy的原理 26 | | `-- 基于scrapy的中间件开发 27 | | |-- 动态网站的抓取处理 28 | | |-- 将selenium集成到scrapy中 29 | | `-- scrapy log配置 30 | `-- |后续(在此项目中没有体现,后续我将上传此部分代码) 31 | |-- scrapy-redis 32 | |-- 分布式爬虫原理 33 | |-- 分析scrapy-redis源码 34 | `-- 集成bloomfilter到scrapy-redis中 35 | `-- Elasticsearch (ArticleSpider.pipelines.ElasticsearchPipeline;)(ArticleSpider.items.JobBoleArticleItem.save_to_es;) 36 | |-- 安装 elasticsearch-rtf 37 | |-- 学习使用 elasticsearch-head、kibana 38 | |-- 学习使用 elasticsearch的Python API: elasticsearch-dsl 39 | `-- 利用elasticsearch和爬取到的数据+django框架搭建搜索网站(此部分代码将在以后上传) 40 |41 | 42 | **PS:使用此代码前,需创建mysql数据库,详见ArticleSpider/settings.py** 43 | -------------------------------------------------------------------------------- /articleexport.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /build/lib/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/build/lib/ArticleSpider/__init__.py -------------------------------------------------------------------------------- /build/lib/ArticleSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | import re 8 | import datetime 9 | import scrapy 10 | from scrapy.loader import ItemLoader 11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 12 | from ArticleSpider.utils.common import extract_num 13 | from ArticleSpider.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT 14 | from w3lib.html import remove_tags 15 | from ArticleSpider.models.es_types import ArticleType 16 | import redis 17 | 18 | 19 | from elasticsearch_dsl.connections import connections 20 | 21 | es = connections.create_connection(ArticleType._doc_type.using) 22 | 23 | redis_cli = redis.StrictRedis() 24 | 25 | 26 | class ArticlespiderItem(scrapy.Item): 27 | # define the fields for your item here like: 28 | # name = scrapy.Field() 29 | pass 30 | 31 | 32 | def date_convert(value): 33 | try: 34 | create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date() 35 | except Exception as e: 36 | create_date = datetime.datetime.now() 37 | return create_date 38 | 39 | 40 | def add_jobbole(value): 41 | return value+"-haifeng" 42 | 43 | 44 | def get_nums(value): 45 | # print(value) 46 | match_re = re.match(r'.*?(\d+).*', value) 47 | if match_re: 48 | nums = match_re.group(1) 49 | return nums 50 | else: 51 | return 0 52 | 53 | 54 | def remove_comment_tags(value): 55 | # 去掉tag中提取得评论 56 | if "评论" in value: 57 | return "" 58 | else: 59 | return value 60 | 61 | 62 | def return_value(value): 63 | return value 64 | 65 | 66 | def gen_suggests(index, info_tuple): 67 | # 根据字符串生成搜索建议数据 68 | # python工程师 title 10 69 | # python工程师 text 3 70 | # 不能覆盖,所以用set 71 | used_words = 
set() 72 | suggests = [] 73 | for text, weight in info_tuple: 74 | if text: 75 | # 调用es得analyze接口分析字符串 76 | words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter': ["lowercase"]}, body=text) 77 | analyzed_words = set(r["token"] for r in words["tokens"] if len(r["token"]) > 1) 78 | new_words = analyzed_words - used_words 79 | else: 80 | new_words = set() 81 | if new_words: 82 | suggests.append({"input": list(new_words), "weight": weight}) 83 | return suggests 84 | 85 | 86 | class ArticleItemLoader(ItemLoader): 87 | # 自定义ItemLoader 88 | default_output_processor = TakeFirst() 89 | 90 | 91 | class JobBoleArticleItem(scrapy.Item): 92 | # title = scrapy.Field( 93 | # input_processor=MapCompose(lambda x: x+"-jobbole", add_jobbole), 94 | # output_processor = TakeFirst() 95 | # ) 96 | title = scrapy.Field() 97 | 98 | create_date = scrapy.Field( 99 | input_processor=MapCompose(date_convert), 100 | # output_processor=TakeFirst() 101 | ) 102 | url = scrapy.Field() 103 | url_object_id = scrapy.Field( 104 | output_processor=MapCompose(return_value) 105 | ) 106 | front_image_url = scrapy.Field( 107 | output_processor=MapCompose(return_value) 108 | ) 109 | front_image_path = scrapy.Field() 110 | praise_nums = scrapy.Field( 111 | input_processor=MapCompose(get_nums) 112 | ) 113 | comment_nums = scrapy.Field( 114 | input_processor=MapCompose(get_nums) 115 | ) 116 | fav_nums = scrapy.Field( 117 | input_processor=MapCompose(get_nums) 118 | ) 119 | tags = scrapy.Field( 120 | input_processor=MapCompose(remove_comment_tags), 121 | output_processor=Join(","), 122 | ) 123 | content = scrapy.Field() 124 | 125 | def get_insert_sql(self): 126 | insert_sql = """ 127 | insert into jobbole(title, url, create_date, fav_nums) 128 | VALUES (%s, %s, %s, %s) 129 | ON DUPLICATE KEY 130 | UPDATE title=VALUES(title),url=VALUES(url), 131 | create_date=VALUES (create_date),fav_nums=VALUES (fav_nums) 132 | """ 133 | params = (self["title"], self["url"], self["create_date"], self["fav_nums"]) 134 | return insert_sql, params 135 | 136 | def save_to_es(self): 137 | article = ArticleType() 138 | article.title = self['title'] 139 | article.create_date = self["create_date"] 140 | article.content = remove_tags(self["content"]) 141 | article.front_image_url = self["front_image_url"] 142 | if "front_image_path" in self: 143 | article.front_image_path = self["front_image_path"] 144 | article.praise_nums = self["praise_nums"] 145 | article.fav_nums = self["fav_nums"] 146 | article.comment_nums = self["comment_nums"] 147 | article.url = self["url"] 148 | article.tags = self["tags"] 149 | article.meta.id = self["url_object_id"] 150 | 151 | # article.suggest = [{"input":[], "weight":2}] 152 | article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7))) 153 | 154 | article.save() 155 | 156 | redis_cli.incr("jobble_count") 157 | 158 | return 159 | 160 | 161 | class ZhihuQuestionItem(scrapy.Item): 162 | # 知乎的问题 Item 163 | zhihu_id = scrapy.Field() 164 | topics = scrapy.Field() 165 | url = scrapy.Field() 166 | title = scrapy.Field() 167 | content = scrapy.Field() 168 | answer_num = scrapy.Field() 169 | comments_num = scrapy.Field() 170 | watch_user_num = scrapy.Field() 171 | click_num = scrapy.Field() 172 | crawl_time = scrapy.Field() 173 | 174 | def get_insert_sql(self): 175 | insert_sql = """ 176 | insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 177 | watch_user_num, click_num, crawl_time) 178 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, 
%s) 179 | ON DUPLICATE KEY 180 | UPDATE content=VALUES(content),answer_num=VALUES(answer_num), 181 | comments_num=VALUES (comments_num),watch_user_num=VALUES (watch_user_num), 182 | click_num=VALUES (click_num) 183 | """ 184 | zhihu_id = self["zhihu_id"][0] 185 | topics = ",".join(self["topics"]) 186 | url = self["url"][0] 187 | title = "".join(self["title"]) 188 | content = "".join(self["content"]) 189 | answer_num = extract_num("".join(self["answer_num"])) 190 | comments_num = extract_num("".join(self["comments_num"])) 191 | watch_user_num = extract_num("".join(self["watch_user_num"])) 192 | click_num = extract_num("".join(self["click_num"])) 193 | crawl_time = datetime.datetime.now().strftime("") 194 | 195 | params = (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) 196 | 197 | return insert_sql, params 198 | 199 | 200 | class ZhihuAnswerItem(scrapy.Item): 201 | zhihu_id = scrapy.Field() 202 | url = scrapy.Field() 203 | question_id = scrapy.Field() 204 | author_id = scrapy.Field() 205 | content = scrapy.Field() 206 | praise_num = scrapy.Field() 207 | comments_num = scrapy.Field() 208 | create_time = scrapy.Field() 209 | update_time = scrapy.Field() 210 | crawl_time = scrapy.Field() 211 | 212 | def get_insert_sql(self): 213 | 214 | insert_sql = """ 215 | insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, praise_num, 216 | comments_num,create_time, update_time, crawl_time) 217 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s) 218 | ON DUPLICATE KEY 219 | UPDATE content=VALUES(content),comments_num=VALUES(comments_num), 220 | praise_num=VALUES (praise_num),update_time=VALUES (update_time) 221 | """ # on duplicate是mysql特有的语法 222 | create_time = datetime.datetime.fromtimestamp(self["create_time"]) 223 | update_time = datetime.datetime.fromtimestamp(self["update_time"]) 224 | 225 | params = ( 226 | self["zhuhu_id"], self["url"], self["question_id"], 227 | self["author_id"], self["content"], self["praise_num"], 228 | self["comments_num"], create_time, update_time, self["crawl_time"], 229 | ) 230 | 231 | 232 | def remove_splash(value): 233 | # 去掉工作城市得斜线 234 | return value.replace("/", "") 235 | 236 | 237 | def handle_jobaddr(value): 238 | addr_list = value.split("\n") 239 | addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"] 240 | return "".join(addr_list) 241 | 242 | 243 | class LagouJobItemLoader(ItemLoader): 244 | # 自定义ItemLoader 245 | default_out_processor = TakeFirst() 246 | 247 | 248 | class LagouJobItem(scrapy.Item): 249 | # 拉钩网职位信息 250 | url = scrapy.Field() 251 | url_object_id = scrapy.Field() 252 | title = scrapy.Field() 253 | salary = scrapy.Field() 254 | job_city = scrapy.Field( 255 | input_processor=MapCompose(remove_splash), 256 | ) 257 | work_years = scrapy.Field( 258 | input_processor=MapCompose(remove_splash), 259 | ) 260 | degree_need = scrapy.Field( 261 | input_processor=MapCompose(remove_splash), 262 | ) 263 | job_type = scrapy.Field() 264 | publish_time = scrapy.Field() 265 | tags = scrapy.Field( 266 | input_processor=MapCompose(Join(",")) 267 | ) 268 | job_advantage = scrapy.Field() 269 | job_desc = scrapy.Field() 270 | job_addr = scrapy.Field( 271 | input_processor=MapCompose(remove_tags), 272 | ) 273 | company_url = scrapy.Field() 274 | company_name = scrapy.Field() 275 | crawl_time = scrapy.Field() 276 | crawl_update_time = scrapy.Field() 277 | 278 | def get_insert_sql(self): 279 | insert_sql = """ 280 | insert into 281 | lagou_job(title, url, url_object_id, salary, 
job_city, work_years, degree_need, job_type,publish_time, 282 | tags, job_advantage, job_desc, job_addr, company_url, company_name, crawl_time, crawl_update_time) 283 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 284 | ON DUPLICATE KEY UPDATE 285 | salary=VALUES (salary), job_desc=VALUES (job_desc), crawl_update_time=VALUES (crawl_update_time) 286 | """ 287 | params = ( 288 | self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], 289 | self["degree_need"],self["job_type"], self["publish_time"], self["tags"], self["job_advantage"], 290 | self["job_desc"],self["job_addr"], self["company_url"], self["company_name"], 291 | self["crawl_time"].strftime(SQL_DATETIME_FORMAT), self["crawl_update_time"].strftime(SQL_DATETIME_FORMAT) 292 | ) 293 | 294 | return insert_sql, params 295 | 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | from fake_useragent import UserAgent 11 | from ArticleSpider.utils.crawl_xici_ip import GetIP 12 | 13 | 14 | class ArticlespiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 
55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class RandomUserAgentMiddleware(object): 63 | # 随机更换User-Agent 64 | def __init__(self, crawler): 65 | super(RandomUserAgentMiddleware, self).__init__() 66 | self.user_agent_list = crawler.settings.get("user_agent_list", []) 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | # from ArticleSpider.settings import user_agent_list 76 | # import random 77 | # request.headers.setdefault("User-Agent", user_agent_list[random.randint(0, len(user_agent_list)-1)]) 78 | 79 | def get_ua(): 80 | return getattr(self.ua, self.ua_type) 81 | 82 | # random_agent = get_ua() # 调试的时候用 83 | 84 | request.headers.setdefault("User-Agent", get_ua()) 85 | # request.meta["proxy"] = "http://113.128.90.192:48888" 86 | 87 | 88 | class RandomProxyMiddleware(object): 89 | # 动态设置ip代理 90 | def process_request(self, request, spider): 91 | get_ip = GetIP() 92 | request.meta["proxy"] = get_ip.get_random_ip() 93 | 94 | from selenium import webdriver 95 | from scrapy.http import HtmlResponse 96 | 97 | 98 | class JSPageMiddleware(object): 99 | # def __init__(self): 100 | # self.browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 101 | # super(JSPageMiddleware, self).__init__() 102 | 103 | # 通过chrome请求动态网页 104 | def process_request(self, request, spider): 105 | if spider.name == "jobbole": 106 | # chrome_opt = webdriver.ChromeOptions() 107 | # prefs = {"profile.managed_default_content_settings.images": 2} 108 | # chrome_opt.add_experimental_option("prefs", prefs) 109 | # browser = webdriver.Chrome( executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 110 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 111 | spider.browser.get(request.url) 112 | import time 113 | time.sleep(3) 114 | print("访问:{0}".format(request.url)) 115 | 116 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from datetime import datetime 7 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 8 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 9 | 10 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalysis 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(hosts=["localhost"]) # connection可以连接多台服务器 15 | 16 | 17 | class CustomAnalyzer(_CustomAnalysis): 18 | def get_analysis_definition(self): 19 | return {} 20 | 21 | ik_analyser = 
CustomAnalyzer("ik_max_word", filter=["lowercase"]) 22 | 23 | 24 | class ArticleType(DocType): 25 | # 伯乐在线文章类型 26 | # suggest = Completion(analyzer="ik_max_word") # 不能直接使用这个,由于源码问题,必须使用CustomAnalyzer 27 | suggest = Completion(analyzer=ik_analyser) 28 | title = Text(analyzer="ik_max_word") 29 | create_date = Date() 30 | url = Keyword() 31 | url_object_id = Keyword() 32 | front_image_url = Keyword() 33 | front_image_path = Keyword() 34 | praise_nums = Integer() 35 | comment_nums = Integer() 36 | fav_nums = Integer() 37 | tags = Text(analyzer="ik_max_word") 38 | content = Text(analyzer="ik_max_word") 39 | 40 | class Meta: 41 | index = "jobbole" 42 | doc_type = "article" 43 | 44 | 45 | if __name__ == "__main__": 46 | ArticleType.init() # 根据类,直接生成mapping, 47 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | import MySQLdb 11 | import MySQLdb.cursors 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from scrapy.exporters import JsonItemExporter 14 | from twisted.enterprise import adbapi 15 | from ArticleSpider.models.es_types import ArticleType 16 | from w3lib.html import remove_tags 17 | 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | # 自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | 29 | def process_item(self, item, spider): 30 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 31 | self.file.write(lines) 32 | return item 33 | 34 | def spider_closed(self, spider): 35 | self.file.close() 36 | 37 | 38 | class MysqlPipeline(object): 39 | # 采用同步得机制写入mysql 40 | def __init__(self): 41 | host = "localhost" 42 | user = "root" 43 | password = "123456" 44 | dbname = "jobble_article" 45 | self.conn = MySQLdb.connect(host=host, user=user, passwd=password, db=dbname, charset='utf8', use_unicode=True) 46 | self.cursor = self.conn.cursor() 47 | 48 | def process_item(self, item, spider): 49 | insert_sql = """ 50 | insert into jobbole(title, url, create_date, fav_nums) 51 | VALUES (%s, %s, %s, %s) 52 | """ 53 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 54 | self.conn.commit() 55 | 56 | 57 | class MysqlTwistedPipeline(object): 58 | def __init__(self, dbpool): 59 | self.dbpool = dbpool 60 | 61 | @classmethod 62 | def from_settings(cls, settings): 63 | dbparams = dict( 64 | host=settings['MYSQL_HOST'], 65 | dbname=settings['MYSQL_DBNAME'], 66 | user=settings['MYSQL_USER'], 67 | passwd=settings['MYSQL_PASSWORD'], 68 | charset='utf8', 69 | cursorclass=MySQLdb.cursors.DictCursor, 70 | use_unicode=True, 71 | ) 72 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparams) 73 | 74 | return cls(dbpool) 75 | 76 | def process_item(self, item, spider): 77 | # 使用twisted将mysql插入变成异步执行 78 | query = self.dbpool.runInteraction(self.do_insert, item) 79 | query.addErrback(self.handle_error, item, spider) # 处理异常 (self.handle_error, item, spider) 80 | 81 | def handle_error(self, failure, item, spider): 82 | # 处理异步插入的异常 83 | print(failure) 84 | 85 | def do_insert(self, cursor, 
item): 86 | # 执行具体的插入 87 | # if item.__class__.__name__ == "JobBoleArticleItem": 88 | insert_sql, params = item.get_insert_sql() 89 | 90 | cursor.execute(insert_sql, params) 91 | 92 | 93 | class JsonExporterPipeline(object): 94 | # 调用scrapy提供的json export 导出json文件 95 | def __init__(self): 96 | self.file = open('articleexport.json', 'wb') 97 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 98 | self.exporter.start_exporting() 99 | 100 | def close_spider(self, spider): 101 | self.exporter.finish_exporting() 102 | self.file.close() 103 | 104 | def process_item(self, item, spider): 105 | self.exporter.export_item(item) 106 | return item 107 | 108 | 109 | class ArticleImagePipeline(ImagesPipeline): 110 | def item_completed(self, results, item, info): 111 | if "front_image_url" in item: 112 | for ok, value in results: 113 | image_file_path = value["path"] 114 | item["front_image_path"] = image_file_path 115 | return item 116 | 117 | 118 | class ElasticsearchPipeline(object): 119 | # 将数据写入打到es中 120 | def process_item(self, item, spider): 121 | # 将 item 转换 为es的数据 122 | item.save_to_es() 123 | 124 | return item 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | # Scrapy settings for ArticleSpider project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | 13 | BOT_NAME = 'ArticleSpider' 14 | 15 | SPIDER_MODULES = ['ArticleSpider.spiders'] 16 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 17 | 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | 57 | DOWNLOADER_MIDDLEWARES = { 58 | 
'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543, 59 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 60 | # 'ArticleSpider.middlewares.RandomProxyMiddleware': 544, 61 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 62 | } 63 | 64 | 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'ArticleSpider.pipelines.ArticlespiderPipeline': 300, 76 | # 'scrapy.pipelines.images.ImagesPipeline': 1, 77 | 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 78 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 3, 79 | # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2, 80 | # 'ArticleSpider.pipelines.JsonExporterPipeline': 2, 81 | } 82 | IMAGES_URLS_FIELD = "front_image_url" 83 | project_dir = os.path.abspath(os.path.dirname(__file__)) 84 | IMAGES_STORE = os.path.join(project_dir, 'images') 85 | # print(IMAGES_STORE) 86 | 87 | import os 88 | import sys 89 | 90 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 91 | # print(os.path.join(BASE_DIR, 'ArticleSpider')) 92 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 93 | 94 | # sys.path.insert(0, r"G:\MyProgramFiles\Py3Code\ArticleSpider\ArticleSpider") 95 | 96 | user_agent_list = [ 97 | "", 98 | "", 99 | ] 100 | USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 101 | 102 | RANDOM_UA_TYPE = "random" 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 106 | #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | 125 | MYSQL_HOST = "127.0.0.1" 126 | MYSQL_DBNAME = "jobble_article" 127 | MYSQL_USER = "root" 128 | MYSQL_PASSWORD = "123456" 129 | 130 | # JOBDIR = "job_info/001" 131 | 132 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 133 | SQL_DATE_FORMAT = "%Y-%m-%d" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
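A minimal, self-contained sketch of how the RANDOM_UA_TYPE setting shown in settings.py above is consumed by RandomUserAgentMiddleware (see middlewares.py): the value is simply looked up as an attribute of fake_useragent.UserAgent, so "random", "chrome", "firefox" and the other types supported by the fake-useragent package all work. The only assumption is that fake-useragent is installed, which middlewares.py already requires.

    from fake_useragent import UserAgent

    ua = UserAgent()
    ua_type = "random"            # mirrors RANDOM_UA_TYPE in settings.py
    print(getattr(ua, ua_type))   # prints a randomly picked User-Agent string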
5 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import datetime 4 | import scrapy 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 8 | from ArticleSpider.utils.common import get_md5 9 | from scrapy.loader import ItemLoader 10 | from selenium import webdriver 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class JobboleSpider(scrapy.Spider): 16 | name = 'jobbole' 17 | allowed_domains = ['blog.jobbole.com'] 18 | start_urls = ['http://blog.jobbole.com/all-posts/'] 19 | 20 | def __init__(self): 21 | self.start_urls = ('http://blog.jobbole.com/all-posts/',) 22 | # # self.fail_urls = [] 23 | # self.browser = webdriver.Chrome( 24 | # executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 25 | # super(JobboleSpider, self).__init__() 26 | # dispatcher.connect(self.spider_close, signals.spider_closed) 27 | # 28 | # def spider_close(self, spider): 29 | # # 当爬虫退出的时候关闭chrome 30 | # print("spider closed") 31 | # self.browser.quit() 32 | # start_urls = ['http://blog.jobbole.com/112239/'] 33 | 34 | # 收集伯乐在线所有404的url以及404页面数 35 | # handle_httpstatus_list = [404] 36 | # 37 | # def __init__(self): 38 | # self.fail_urls = [] 39 | # dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 40 | # 41 | # def handle_spider_closed(self, spider, reason): 42 | # self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 43 | 44 | def parse(self, response): 45 | """ 46 | 1.获取文章列表中得文章url并交给scrapy下载后并进行解析 47 | 2.获取下一页得url 并交给scrapy进行下载, 下载完成后交给parse 48 | :param response: 49 | :return: 50 | """ 51 | # if response.status == 404: 52 | # self.fail_urls.append(response.url) 53 | # self.crawler.stats.inc_value("failed_url") 54 | 55 | # 解析列表中得所有文章url,然后下载 56 | post_nodes = response.css("#archive .floated-thumb .post-thumb a") 57 | for post_node in post_nodes: 58 | img_url = post_node.css("img::attr(src)").extract_first("") 59 | post_url = post_node.css("::attr(href)").extract_first("") 60 | pa_url = parse.urljoin(response.url, post_url) # 域名+url # response.url + post_url 61 | yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": img_url}, callback=self.parse_detail) 62 | 63 | # 提取下一页进行下载 64 | next_url = response.css(".next.page-numbers::attr(href)").extract_first("") 65 | if next_url: 66 | pa_url = parse.urljoin(response.url, next_url) 67 | yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse) 68 | 69 | def parse_detail(self, response): 70 | # article_item = JobBoleArticleItem() 71 | 72 | # # 提取文章详情页 73 | # # re_selector = response.xpath("/html/body/div[2]/div[3]/div[1]/div[1]/h1") # 最好不用这种 74 | # title = response.xpath('//*[@id="post-112239"]/div[1]/h1/text()').extract_first() 75 | # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·", 76 | # "").strip() 77 | # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0] 78 | # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0] 79 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 80 | # if match_re: 81 | # fav_nums = int(match_re.group(1)) 82 | # else: 83 | # fav_nums = 0 84 | # comment_nums = 
response.xpath("//a[@href='#article-comment']/span/text()").extract()[0] 85 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 86 | # if match_re: 87 | # comment_nums = int(match_re.group(1)) 88 | # else: 89 | # comment_nums = 0 90 | # 91 | # content = response.xpath("//div[@class='entry']").extract()[0] 92 | # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract() 93 | # tag_list = [element for element in tag_list if not element.strip().endswith("评论")] 94 | # tags = ",".join(tag_list) 95 | 96 | # 通过css选择器提取字段 97 | # front_image_url = response.meta.get("front_image_url", "") # 文章封面图 98 | # title = response.css(".entry-header h1::text").extract()[0] 99 | # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip() 100 | # praise_nums = response.css(".vote-post-up h10::text").extract()[0] 101 | # fav_nums = response.css(".bookmark-btn::text").extract()[0] 102 | # match_re = re.match(r'.*?(\d+).*', fav_nums) 103 | # if match_re: 104 | # fav_nums = match_re.group(1) 105 | # else: 106 | # fav_nums = 0 107 | # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0] 108 | # match_re = re.match(r'.*?(\d+).*', comment_nums) 109 | # if match_re: 110 | # comment_nums = match_re.group(1) 111 | # else: 112 | # comment_nums = 0 113 | # 114 | # content = response.css("div.entry").extract()[0] 115 | # tags = response.css("p.entry-meta-hide-on-mobile a::text").extract() 116 | # tag_list = [element for element in tags if not element.strip().endswith("评论")] 117 | # tags = ",".join(tag_list) 118 | # 119 | # article_item["title"] = title 120 | # article_item["url"] = response.url 121 | # article_item["url_object_id"] = get_md5(response.url) 122 | # try: 123 | # create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date() 124 | # except Exception as e: 125 | # create_date = datetime.datetime.now() 126 | # article_item["create_date"] = create_date 127 | # article_item["front_image_url"] = [front_image_url] 128 | # article_item["content"] = content 129 | # article_item["praise_nums"] = praise_nums 130 | # article_item["comment_nums"] = comment_nums 131 | # article_item["fav_nums"] = fav_nums 132 | # article_item["tags"] = tags 133 | 134 | # 通过Item Loader 加载item 135 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 136 | item_loader.add_css("title", ".entry-header h1::text") 137 | item_loader.add_value("url", response.url) 138 | item_loader.add_value("url_object_id", get_md5(response.url)) 139 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 140 | item_loader.add_value("front_image_url", [response.meta.get("front_image_url", "")]) 141 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 142 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 143 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 144 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 145 | item_loader.add_css("content", "div.entry") 146 | 147 | article_item = item_loader.load_item() 148 | # item_loader.add_xpath() 149 | 150 | yield article_item 151 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from 
ArticleSpider.items import LagouJobItem, LagouJobItemLoader 6 | from ArticleSpider.utils.common import get_md5 7 | import datetime 8 | 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com/'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | 21 | def parse_job(self, response): 22 | # 解析拉勾网的职位 23 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 24 | item_loader.add_css("title", "") 25 | item_loader.add_value("url", response.url) 26 | item_loader.add_value("url_object_id", get_md5(response.url)) 27 | item_loader.add_css("salary", ".job_request .salary::text") 28 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text") 29 | item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text") # 这里使用css ,是为了在学习时,熟悉css选择器用法 30 | item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") 31 | item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") 32 | 33 | item_loader.add_css("publish_time", ".publish_time::text") 34 | item_loader.add_css("tags", ".position-label.clearfix li::text") 35 | item_loader.add_css("job_advantage", ".job-advantage p::text") 36 | item_loader.add_css("job_desc", ".job_bt div") 37 | item_loader.add_css("job_addr", ".work_addr") 38 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 39 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 40 | item_loader.add_value("crawl_time", datetime.datetime.now()) 41 | # item_loader.add_css("crawl_update_time", datetime.datetime.now()) 42 | 43 | job_item = item_loader.load_item() # 这里先赋值给一个变量,是考虑到便于调试以及代码可读性,而不是为了代码简洁而直接return 44 | 45 | return job_item 46 | 47 | def parse_start_url(self, response): 48 | return [] 49 | 50 | def process_results(self, response, results): 51 | return results -------------------------------------------------------------------------------- /build/lib/ArticleSpider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import json 5 | import datetime 6 | try: 7 | import urlparse as parse 8 | except: 9 | from urllib import parse 10 | 11 | from scrapy.loader import ItemLoader 12 | from ArticleSpider.items import ZhihuAnswerItem, ZhihuQuestionItem 13 | from ArticleSpider.settings import user_agent_list 14 | 15 | 16 | class ZhihuSpider(scrapy.Spider): 17 | name = 'zhihu' 18 | allowed_domains = ['www.zhihu.com'] 19 | start_urls = ['http://www.zhihu.com/'] 20 | 21 | # question的第一页answer得请求url 22 | start_answer_url = 
"https://www.zhihu.com/api/v4/questions/26234383/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}" 23 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 24 | 25 | 26 | headers = { 27 | "Host": "www.zhihu.com", 28 | "Referer": "https://www.zhihu.com/", 29 | "User-Agent": agent, 30 | 31 | } 32 | custom_settings = { 33 | "COOKIES_ENABLED": True 34 | } 35 | 36 | def parse(self, response): 37 | # 提取出html页面中的所有url 并跟踪url进行一些爬取 38 | # 如果提取得url中格式为 /question/xxx 就下载进行之后直接进入解析函数 39 | all_urls = response.css("a::attr(href)").extract() 40 | all_urls = [parse.urljoin(response.url, url) for url in all_urls] 41 | all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls) 42 | for url in all_urls: 43 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url) 44 | if match_obj: 45 | # 如果提取到得question相关页面则下载交由提取函数进行提取 46 | request_url = match_obj.group(1) 47 | # 简单的随机更换User-Agent 48 | # import random 49 | # random_index = random.randint(0, len(user_agent_list) - 1) 50 | # random_agent = user_agent_list[random_index] 51 | # self.headers["User-Agent"] = random_agent 52 | yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question) 53 | # break # debug时候用 54 | else: 55 | # 如果不是question 页面则直接进一步跟踪 56 | yield scrapy.Request(url, headers=self.headers, callback=self.parse) # debug 注释 57 | 58 | def parse_question(self, response): 59 | # 处理question页面, 从页面中提取question 具体item 60 | match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) 61 | if match_obj: 62 | question_id = int(match_obj.group(2)) 63 | 64 | if "QuestionHeader-title" in response.text: 65 | # 处理新版本 66 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 67 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 68 | item_loader.add_css("content", ".QuestionHeader-detail") 69 | item_loader.add_value("url", response.url) 70 | item_loader.add_value("zhihu_id", question_id) 71 | item_loader.add_css("answer_num", ".List-headerText span::text") 72 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 73 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 74 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 75 | 76 | else: 77 | # 处理知乎旧版本 78 | item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) 79 | item_loader.add_css("title", "h1.QuestionHeader-title::text") 80 | item_loader.add_css("content", ".QuestionHeader-detail") 81 | item_loader.add_value("url", response.url) 82 | item_loader.add_value("zhihu_id", question_id) 83 | item_loader.add_css("answer_num", ".List-headerText span::text") 84 | item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text") 85 | item_loader.add_css("watch_user_num", ".NumberBoard-value::text") 86 | item_loader.add_css("topics", ".QuestionHeader-topics.Popover::text") 87 | 88 | question_item = 
item_loader.load_item() 89 | yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) 90 | yield question_item # debug 注释 91 | 92 | def parse_answer(self, response): 93 | # 处理answer 94 | ans_json = json.loads(response.text) 95 | is_end = ans_json["paging"]["is_end"] 96 | # totals_answer = ans_json["paging"]["totals"] 97 | next_url = ans_json["paging"]["next"] 98 | 99 | # 提取answer的具体字段 100 | for answer in ans_json["data"]: 101 | answer_item = ZhihuAnswerItem() 102 | answer_item["zhihu_id"] = answer["id"] 103 | answer_item["url"] = answer["url"] 104 | answer_item["question_id"] = answer["question"]["id"] 105 | answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None 106 | answer_item["content"] = answer["content"] if "content" in answer else None 107 | answer_item["praise_num"] = answer["voteup_count"] 108 | answer_item["comments_num"] = answer["comment_count"] 109 | answer_item["create_time"] = answer["created_time"] 110 | answer_item["update_time"] = answer["updated_time"] 111 | answer_item["crawl_time"] = datetime.datetime.now() 112 | 113 | yield answer_item 114 | 115 | if not is_end: 116 | yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer) 117 | 118 | def start_requests(self): 119 | return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)] 120 | 121 | def login(self, response): 122 | 123 | response_text = response.text 124 | match_obj = re.match('.*name="_xsrf" value="(.*?)".*', response_text, re.DOTALL) 125 | if match_obj: 126 | print(match_obj.group(1)) 127 | _xsrf = match_obj.group(1) 128 | if _xsrf: 129 | 130 | post_data = { 131 | "_xsrf": _xsrf, 132 | "phone_num": '13342266862', 133 | "password": '553768563', 134 | "captcha": "", 135 | 136 | } 137 | import time 138 | t = str(int(time.time() * 1000)) 139 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t) 140 | yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha) # 重点 141 | 142 | def login_after_captcha(self, response): 143 | with open("captcha.gif", "wb") as f: 144 | f.write(response.body) 145 | f.close() 146 | 147 | from PIL import Image 148 | try: 149 | im = Image.open("captcha.gif") 150 | im.show() 151 | im.close() 152 | except: 153 | pass 154 | 155 | captcha = input("输入验证码\n>") 156 | 157 | post_data = response.meta.get("post_data", {}) 158 | post_url = "https://www.zhihu.com/login/phone_num" 159 | post_data["captcha"] = captcha 160 | return [scrapy.FormRequest( 161 | url=post_url, 162 | formdata=post_data, 163 | headers=self.headers, 164 | callback=self.check_login 165 | )] 166 | 167 | def check_login(self, response): 168 | # 验证服务器的返回数据是否成功 169 | text_json = json.loads(response.text) 170 | if "msg" in text_json and text_json["msg"] == "登录成功": 171 | for url in self.start_urls: 172 | yield scrapy.Request(url, dont_filter=True, headers=self.headers) 173 | 174 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | from selenium import webdriver 7 | from scrapy.selector import Selector 8 | 9 | # browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe") 10 | # 11 | # browser.get("https://item.taobao.com/item.htm?spm=2013.1.iteminfo.10.4b556901SPB44D&scm=1007.10010.52063.100200300000003&id=552169264763&pvid=19a525ca-6111-4648-98ab-0ff06f668623") 12 | # 13 | # print(browser.page_source) 14 | # 15 | # selector_ = Selector(text=browser.page_source) 16 | 17 | 18 | # browser.quit() 19 | 20 | # 设置chromedirver 21 | chrome_opt = webdriver.ChromeOptions() 22 | prefs = {"profile.managed_default_content_settings.images": 2} 23 | chrome_opt.add_experimental_option("prefs", prefs) 24 | browser = webdriver.Chrome(executable_path="G:/Document/PythonServerEnvironment/SelniumWebdriver/chromedriver.exe", chrome_options=chrome_opt) 25 | browser.get("https://anta.tmall.com/") 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | import hashlib 6 | import re 7 | import webbrowser 8 | from webbrowser import Chrome 9 | 10 | 11 | def get_md5(url): 12 | if isinstance(url, str): 13 | url = url.encode("utf-8") 14 | m = hashlib.md5() 15 | m.update(url) 16 | return m.hexdigest() 17 | 18 | 19 | def extract_num(text): 20 | # 字符串中提取数字 21 | match_re = re.match(r'.*?(\d+).*', text) 22 | if match_re: 23 | nums = match_re.group(1) 24 | return nums 25 | 26 | 27 | def webtest(): 28 | # webbrowser.open("http://jobbole.com", new=0, autoraise=1) 29 | # webbrowser.open_new("http://jobbole.com") 30 | # webbrowser.open_new_tab("http://jobbole.com") 31 | webbrowser.register(name="chrome", klass=Chrome) 32 | webbrowser.get('chrome').open("http://jobbole.com") 33 | # .open('www.baidu.com', new=1, autoraise=True) 34 | 35 | chromePath = r'你的浏览器目录' # 例如我的:C:\***\***\***\***\Google\Chrome\Application\chrome.exe 36 | webbrowser.register('chrome', None, webbrowser.BackgroundBrowser(chromePath)) # 这里的'chrome'可以用其它任意名字,如chrome111,这里将想打开的浏览器保存到'chrome' 37 | webbrowser.get('chrome').open('www.baidu.com', new=1, autoraise=True) 38 | 39 | 40 | def choose(bool, a, b): 41 | return (bool and a or [b])[0] 42 | 43 | 44 | def reversed(sequence): 45 | x = [] 46 | for i in range(len(sequence)-1, -1, -1): 47 | # print(i) 48 | x.append(sequence[i]) 49 | # x = sequence[i] 50 | # print(sequence[i]) 51 | return x 52 | 53 | 54 | def to_list(t): 55 | return [i if not isinstance(i, tuple) else to_list(i) for i in t] 56 | 57 | # 58 | # def to_list(t): 59 | # return [i for i in t] 60 | 61 | if __name__ == '__main__': 62 | # webtest() 63 | # print(get_md5("http://jobbole.com")) 64 | # print(1) 65 | 66 | # print(choose(True, 1, 2)) 67 | # print(forxinreversed([1, 2, 3, 3, 4, 5])) 68 | # sequence = [1, 2, 3, 4, 5] 69 | # sequence.reverse() 70 | # print(sequence) 71 | # x = [sequence[i] for i in range(len(sequence)-1, -1, -1)] 72 | # print(x) 73 | 74 | # print(int('1234')) 75 | # print(float(12)) 76 | 
# print(str(98)) 77 | # print(list('abcd')) 78 | # print(dict.fromkeys(['name', 'age'])) 79 | # print(tuple([1, 2, 3, 4])) 80 | 81 | # a_list = [1, 2, [1, 2, 3], 3, 4, 5] 82 | # print(tuple(a_list)) 83 | # t = tuple(a_list) 84 | # t = (1, 2, (1, 2, 3), 3, 4) 85 | # print(t) 86 | # print(to_list(t)) 87 | 88 | # 10 89 | # L1 = [4, 1, 3, 2, 3, 5, 1] 90 | # L2 = [] 91 | # [L2.append(i) for i in L1 if i not in L2] 92 | # print(L2) 93 | 94 | from copy import deepcopy 95 | L1 = [1, [1, 2, 3], 2, 3] 96 | print("before copy L1: ", L1) 97 | L2 = L1.copy() 98 | L2[1][2] = 1 99 | print("after copy L2: ", L2) 100 | print("after copy L1: ", L1) 101 | L1 = [3, [3, 4, 5], 4, 5] 102 | print("before deepcopy L1: ", L1) 103 | L2 = deepcopy(L1) 104 | L2[1][2] = 1 105 | print("after deepcopy L2: ", L2) 106 | print("after deepcopy L1: ", L1) 107 | -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | from scrapy.selector import Selector 8 | import MySQLdb 9 | 10 | conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", db="article_spider", charset="utf8") 11 | cursor = conn.cursor() 12 | 13 | 14 | def crawl_ips(): 15 | # 爬取西刺得免费ip代理 16 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"} 17 | for i in range(2354): 18 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 19 | 20 | # print(re.text) 21 | selector = Selector(text=re.text) 22 | # all_trs = selector.css("#ip_list tr[class]:not([class='subtitle'])") 23 | all_trs = selector.css("#ip_list tr") 24 | 25 | ip_list = [] 26 | 27 | for tr in all_trs[1:]: 28 | speed_str = tr.css(".bar::attr(title)").extract()[0] 29 | if speed_str: 30 | speed = float(speed_str.split("秒")[0]) 31 | # ip = tr.css("td:nth-child[2]::text").extract()[0] # 报错 32 | all_text = tr.css("td::text").extract() 33 | ip = all_text[0] 34 | port = all_text[1] 35 | proxy_type = all_text[5] 36 | 37 | # lis = (ip, port, speed, proxy_type) 38 | # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type))) 39 | # print(':'.join(lis)) 40 | 41 | ip_list.append((ip, port, speed, proxy_type)) 42 | 43 | # print(all_trs) 44 | # for tr in all_trs: 45 | # # print(tr.extract()) 46 | # # ip = tr.xpath('/td[2]/text()').extract() 47 | # # port = tr.xpath('/td[3]/text()').extract() 48 | # # http_type = tr.xpath('/td[6]/text()').extract() 49 | # ip = tr.css('td:nth-child(2)::text').extract()[0] 50 | # port = tr.css('td:nth-child(3)::text').extract()[0] 51 | # speed = tr.css('td:nth-child(6)::text').extract()[0] 52 | # proxy_type = tr.css('td:nth-child(6)::text').extract()[0] 53 | # # print(ip, port) 54 | # # print(':'.join((str(ip), str(port), str(http_type)))) 55 | # print(':'.join((ip, port, speed, proxy_type))) 56 | # ip_list.append((ip, port, speed, proxy_type)) 57 | 58 | print(": ".join(ip_info)) 59 | 60 | for ip_info in ip_list: 61 | cursor.execute("insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')".format( 62 | ip_info[0], ip_info[1], ip_info[2], ip_info[3]) 63 | ) # 传递字符串一定要加单引号 64 | 65 | conn.commit() 66 | 67 | # for tr in all_trs[1:]: 68 | # # speed_str = tr.css(".bar::attr(title)").extract()[0] 69 | # # if speed_str: 70 | # # speed = 
float(speed_str.split("秒")[0]) 71 | # all_texts = tr.css("td::text").extract() 72 | # print(all_texts) 73 | 74 | # print(re.text) 75 | 76 | 77 | class GetIP(object): 78 | def delete_ip(self, ip): 79 | # 从数据库中删除无效的ip 80 | delete_sql = """ 81 | delete from proxy_ip where ip='{0}' 82 | """.format(ip) 83 | cursor.execute(delete_sql) 84 | conn.commit() 85 | return True 86 | 87 | def judge_ip(self, ip, port, proxy_type): 88 | # 判断IP 是否可用 89 | http_url = "proxy_type://www.baidu.com" 90 | proxy_url = "{3}://{0}:{1}".format(ip, port, proxy_type) 91 | response = None 92 | try: 93 | proxy_dict = { 94 | proxy_type: proxy_url 95 | } 96 | response = requests.get(http_url, proxies=proxy_dict) 97 | return True 98 | except Exception as e: 99 | print("invalid ip and port") 100 | self.delete_ip(ip) 101 | return False 102 | else: 103 | code = response.status_code 104 | if code >= 200 and code < 300: 105 | print("effective ip") 106 | return True 107 | else: 108 | print("invalid ip and port") 109 | self.delete_ip(ip) 110 | return False 111 | 112 | def get_random_ip(self): 113 | # 从数据库中随机获取一个可用的ip 114 | random_sql = """ 115 | SELECT ip,port FROM proxy_ip 116 | ORDER BY RAND() 117 | LIMIT 1 118 | """ 119 | cursor.execute(random_sql) 120 | for ip_info in cursor.fetchall(): 121 | ip = ip_info[0] 122 | port = ip_info[1] 123 | proxy_type = ip_info[3] if ip_info[3] and ip_info[3] != "" else 'http' 124 | 125 | judge_re = self.judge_ip(ip, port, proxy_type) 126 | if judge_re: 127 | return "{3}://{0}:{1}".format(ip, port, proxy_type) 128 | else: 129 | return self.get_random_ip() 130 | 131 | if __name__ == '__main__': 132 | # crawl_ips() 133 | get_ip = GetIP() 134 | print(get_ip.get_random_ip()) -------------------------------------------------------------------------------- /build/lib/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | __Author__ = "HackFun" 5 | 6 | import requests 7 | try: 8 | import cookielib 9 | except: 10 | import http.cookiejar as cookielib 11 | 12 | import re 13 | 14 | 15 | 16 | agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 17 | header = { 18 | "Host": "www.zhihu.com", 19 | "Referer": "https://www.zhihu.com/", 20 | "User-Agent": agent, 21 | "Cookie":'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1' 22 | } 23 | 24 | 25 | session = requests.session() 26 | 
--------------------------------------------------------------------------------
/build/lib/ArticleSpider/utils/zhihu_login_requests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__Author__ = "HackFun"

import requests
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib

import re


agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
header = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
    "Cookie": 'q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; q_c1=447a188541144e3fb30424b694576ef2|1502619613000|1491825468000; _zap=6efcefae-72d9-4251-9d91-2f350d61f8ee; capsion_ticket="2|1:0|10:1503325910|14:capsion_ticket|44:MDM1NThhZGYwMTM1NDAyNzkzNTYzMDMwNjhlNDNkNjM=|05608b1721fc351684c420227a8cc8c6a3926cfaea2c64ec23c62a1fbcd3a48f"; aliyungf_tc=AQAAAO2IxyovewwAshrJtkWO76wHBbMh; d_c0="AECCvOx7SwyPTtI7hlhRAcElYn2NHqLNeYI=|1504004081"; _xsrf=1be2d9a7-746b-4245-bc8f-4b50692e0965; l_cap_id="NzVjMmQ2ZTFkODVjNGVlYzkzZGNjNDQ4OTgwNjA2MDI=|1504010920|556da1e4afe6174e99f237007f3b12c2dd7054a2"; r_cap_id="MTAxNjU2MzFjZDM5NGNmZDgyNTliODljZDc3Y2IyMmQ=|1504010920|7698fb675ed8a3d0aff05ca5fa4e92297889b4e2"; cap_id="MjYzZjRlYTllOTA0NDA4MWE5ZGRjOTRlNGNiZTk5Y2M=|1504010920|7a120663cef55b6d4c72932874f2ed61afd2d050"; __utma=51854390.504384623.1504004084.1504004084.1504010008.2; __utmb=51854390.0.10.1504010008; __utmc=51854390; __utmz=51854390.1504010008.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20170615=1^3=entry_date=20170410=1'
}


session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")

try:
    session.cookies.load(ignore_discard=True)
    print("cookies loaded")
except:
    print("cookies could not be loaded")


def get_xsrf():
    # Pull the _xsrf token out of the zhihu home page
    response = session.get("https://www.zhihu.com/", headers=header)
    text = response.text
    # print(text)
    # earlier attempts, superseded by the re.search below:
    # match_obj = re.match(r'.*?required.*', text)
    # match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text.strip())
    # match_obj = re.match(r'.*name="_xsrf" value="(.*?)".*', text, re.DOTALL)
    match_obj = re.search('.*name="_xsrf" value="(.*?)".*', text)
    if match_obj:
        print(match_obj.group(1))
        return match_obj.group(1)
    else:
        return ""


def is_login():
    # The inbox page redirects to the login page when the session is not authenticated
    inbox_url = "https://www.zhihu.com/inbox"
    response = session.get(inbox_url, headers=header, allow_redirects=False)
    if response.status_code != 200:
        return False
    else:
        return True


def get_index():
    response = session.get("https://www.zhihu.com/", headers=header)
    with open("index_page.html", "wb") as f:
        f.write(response.text.encode("utf-8"))
    print("ok")


def get_captcha():
    import time
    t = str(int(time.time() * 1000))
    # captcha_url = "https://www.zhihu.com/captcha.gif?r=1504099197089&type=login&lang=cn"
    captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(t)
    response = session.get(captcha_url, headers=header)
    with open("captcha.gif", "wb") as f:
        f.write(response.content)

    from PIL import Image
    try:
        im = Image.open("captcha.gif")
        im.show()
        im.close()
    except:
        pass

    captcha = input("please type in the captcha\n>")  # raw_input under Python 2
    return captcha


def zhihu_login(account, password):
    # Log in to zhihu
    if re.match(r"^1\d{10}", account):
        print("logging in with a phone number")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": get_xsrf(),
            "phone_num": account,
            "password": password,
            "captcha": get_captcha(),
            # captcha:{"img_size":[200,44],"input_points":[[21.375,28],[156.375,33]]}# 2017-08-30
            "captcha_type": 'cn'
        }
    elif "@" in account:
        # Treat the account as an email address
        print("logging in with an email address")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": get_xsrf(),
            "email": account,
            "password": password,
            "captcha": get_captcha(),
            "captcha_type": 'cn'
        }
    else:
        print("unsupported account format")
        return

    response_text = session.post(post_url, data=post_data, headers=header)
    session.cookies.save()


if __name__ == '__main__':
    zhihu_login("13342266862", "553768563")
    # print(get_xsrf())
    get_index()
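
zhihu_login() persists its session in cookies.txt through an LWPCookieJar, which is what allows a later run, or the project's Scrapy zhihu spider, to skip the password-and-captcha step while the saved cookies are still valid. The real spiders/zhihu.py is not part of this section, so the snippet below is only a hedged sketch of that reuse pattern; the spider name and URL are placeholders, not the project's code.

# Sketch only: hand the cookies saved by zhihu_login() to a Scrapy request.
# Assumes cookies.txt was written by session.cookies.save() above.
import scrapy
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib


class ZhihuCookieSketchSpider(scrapy.Spider):
    name = "zhihu_cookie_sketch"  # placeholder, not the project's zhihu spider

    def start_requests(self):
        jar = cookielib.LWPCookieJar(filename="cookies.txt")
        jar.load(ignore_discard=True)
        cookies = {cookie.name: cookie.value for cookie in jar}
        yield scrapy.Request("https://www.zhihu.com/", cookies=cookies, callback=self.parse)

    def parse(self, response):
        self.logger.info("fetched %s with the saved zhihu cookies", response.url)
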
--------------------------------------------------------------------------------
/dbs/ArticleSpider.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/dbs/ArticleSpider.db
--------------------------------------------------------------------------------
/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/dbs/default.db
--------------------------------------------------------------------------------
/eggs/ArticleSpider/1504775520.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hackfengJam/ArticleSpider/a332bd1e0db36d41b23f85cd850dfb20c75d00f0/eggs/ArticleSpider/1504775520.egg
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__Author__ = "HackFun"

from scrapy.cmdline import execute
import sys
import os

# print(os.path.dirname(os.path.abspath(__file__)))
# G:\MyProgramFiles\Py3Code\ArticleSpider
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])  # execute("scrapy crawl jobbole".split())
# execute(["scrapy", "crawl", "zhihu"])  # execute("scrapy crawl jobbole".split())

# # test
# def a(max):
#     n, a, b = 0, 0, 1
#     while n < max:
#         yield b
#         # print b
#         a, b = b, a + b
#         n = n + 1
#
# def b(max):
#     while max > 0:
#         yield max
#         max = max - 1
#
#
# f = a(5)
# f = b(5)
#
# print(f.__next__())
# print(f.__next__())
# print(f.__next__())
# print(f.__next__())
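
main.py drives the crawl through scrapy.cmdline.execute, which behaves exactly like running scrapy crawl jobbole from the project root and is what makes IDE breakpoint debugging of the spider possible. An equivalent, slightly more explicit way to do the same thing is Scrapy's CrawlerProcess API; the snippet below is only an illustrative alternative, not code from this repository.

# Alternative to scrapy.cmdline.execute: run the spider in-process.
# Illustrative only; main.py above is what the project actually uses.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() finds ArticleSpider.settings via scrapy.cfg,
# so this must be run from the project root.
process = CrawlerProcess(get_project_settings())
process.crawl("jobbole")   # same spider name as in spiders/jobbole.py
process.start()            # blocks until the crawl finishes
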
--------------------------------------------------------------------------------
/project.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
Metadata-Version: 1.0
Name: project
Version: 1.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
--------------------------------------------------------------------------------
/project.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
setup.py
ArticleSpider/__init__.py
ArticleSpider/items.py
ArticleSpider/middlewares.py
ArticleSpider/pipelines.py
ArticleSpider/settings.py
ArticleSpider/models/__init__.py
ArticleSpider/models/es_types.py
ArticleSpider/spiders/__init__.py
ArticleSpider/spiders/jobbole.py
ArticleSpider/spiders/jobboleBackupto16.py
ArticleSpider/spiders/lagou.py
ArticleSpider/spiders/zhihu.py
ArticleSpider/tools/__init__.py
ArticleSpider/tools/selenium_spider.py
ArticleSpider/utils/__init__.py
ArticleSpider/utils/common.py
ArticleSpider/utils/crawl_xici_ip.py
ArticleSpider/utils/zhihu_login_requests.py
project.egg-info/PKG-INFO
project.egg-info/SOURCES.txt
project.egg-info/dependency_links.txt
project.egg-info/entry_points.txt
project.egg-info/top_level.txt
--------------------------------------------------------------------------------
/project.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/project.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
[scrapy]
settings = ArticleSpider.settings

--------------------------------------------------------------------------------
/project.egg-info/top_level.txt:
--------------------------------------------------------------------------------
ArticleSpider
--------------------------------------------------------------------------------
/requestments.txt:
--------------------------------------------------------------------------------
arrow==0.12.0
asn1crypto==0.23.0
attrs==17.3.0
Automat==0.6.0
backports.functools-lru-cache==1.2.1
certifi==2017.11.5
cffi==1.11.2
chardet==3.0.4
constantly==15.1.0
cryptography==2.1.4
cssselect==1.0.1
Django==2.0
elasticsearch==5.5.1
elasticsearch-dsl==5.3.0
hyperlink==17.3.1
idna==2.6
incremental==17.5.0
lxml==4.1.1
mysql-connector-python==8.0.5
mysqlclient==1.3.12
parsel==1.2.0
pyasn1==0.4.2
pyasn1-modules==0.2.1
pycparser==2.18
PyDispatcher==2.0.5
PyMySQL==0.7.11
pyOpenSSL==17.5.0
python-dateutil==2.6.1
pytz==2017.3
queuelib==1.4.2
redis==2.10.6
requests==2.18.4
Scrapy==1.4.0
selenium==3.8.0
service-identity==17.0.0
six==1.11.0
Twisted==17.9.0
urllib3==1.21.1
w3lib==1.18.0
zope.interface==4.4.3
pillow==5.0.0
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ArticleSpider.settings

[deploy:haifeng]
url = http://localhost:6800/
project = ArticleSpider
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Automatically created by: scrapyd-deploy

from setuptools import setup, find_packages

setup(
    name = 'project',
    version = '1.0',
    packages = find_packages(),
    entry_points = {'scrapy': ['settings = ArticleSpider.settings']},
)
--------------------------------------------------------------------------------
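
The [deploy:haifeng] target in scrapy.cfg, together with the scrapyd-generated setup.py, eggs/ and dbs/ directories, indicates the project is deployed to a local scrapyd instance (typically with scrapyd-client, e.g. scrapyd-deploy haifeng -p ArticleSpider). Once the egg is deployed, a crawl can be scheduled over scrapyd's HTTP API; the snippet below is a small illustrative sketch of that call, assuming scrapyd is listening on localhost:6800 as configured above.

# Illustrative only: schedule the jobbole spider on the scrapyd instance from scrapy.cfg.
import requests

resp = requests.post(
    "http://localhost:6800/schedule.json",  # url taken from the [deploy:haifeng] section
    data={"project": "ArticleSpider", "spider": "jobbole"},
)
print(resp.json())  # on success scrapyd replies with {"status": "ok", "jobid": "..."}
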