├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── spiderbook.iml
├── 第3章
│   ├── 3-7.conf
│   ├── 3-6.py
│   ├── 3-2.py
│   ├── 3-4.py
│   ├── 3-1.code
│   ├── 3-5.code
│   └── 3-3.py
├── 第5章
│   ├── 5-6.code
│   ├── 5-1.html
│   ├── 5-7.code
│   ├── 5-5.code
│   ├── 5-8.code
│   ├── 5-9.code
│   ├── 5-4.code
│   ├── 5-2.code
│   └── 5-3.code
├── 第4章
│   ├── 4-2.command
│   ├── 4-8.code
│   ├── 4-3.command
│   ├── 4-4.command
│   ├── 4-12.py
│   ├── 4-6.code
│   ├── 4-11.py
│   ├── 4-10.py
│   ├── 4-1.py
│   ├── 4-14.py
│   ├── 4-7.code
│   ├── 4-13.py
│   ├── 4-9.command
│   └── 4-5.command
├── 第1章
│   ├── 1-3.py
│   ├── 1-7.py
│   ├── 1-8.py
│   ├── 1-4.py
│   ├── 1-2.py
│   ├── 1-1.py
│   ├── 1-6.html
│   ├── 1-5.py
│   └── 1-9.py
├── README.md
├── 第2章
│   ├── 2-6.py
│   ├── 2-4.py
│   ├── 2-3.py
│   ├── 2-1.py
│   ├── 2-2.py
│   └── 2-5.py
├── 第6章
│   ├── 6-3.code
│   ├── 6-4.code
│   ├── 6-2.code
│   ├── 6-11.code
│   ├── 6-1.code
│   ├── 6-7.code
│   ├── 6-10.code
│   ├── 6-5.code
│   ├── 6-6.code
│   ├── 6-9.code
│   ├── 6-8.code
│   ├── 6-13.code
│   └── 6-12.code
└── .gitignore

/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # Default ignored files
3 | /workspace.xml
--------------------------------------------------------------------------------
/第3章/3-7.conf:
--------------------------------------------------------------------------------
1 | # save 900 1
2 | # save 300 10
3 | # save 60 10000
4 | save ""
--------------------------------------------------------------------------------
/第5章/5-6.code:
--------------------------------------------------------------------------------
1 | def calc_standard_deviation(self):
2 |     score_list = [x['density'] for x in self.node_info.values()]
3 |     std = np.std(score_list, ddof=1)
4 |     return std
--------------------------------------------------------------------------------
/第4章/4-2.command:
--------------------------------------------------------------------------------
1 | # MySQL
2 | CREATE TABLE WaitCrawl
3 | (
4 |     id int NOT NULL,
5 |     name varchar(255) NOT NULL,
6 |     url varchar(255) NOT NULL,
7 |     UNIQUE (url)
8 | );
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 |
--------------------------------------------------------------------------------
/第1章/1-3.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 | def wait():
4 |     time.sleep(5)
5 | print("开始", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
6 | wait()
7 | print("结束", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 6 |
--------------------------------------------------------------------------------
/第4章/4-8.code:
--------------------------------------------------------------------------------
1 | def request_seen(self, request):
2 |     fp = self.request_fingerprint(request)
3 |     if fp in self.fingerprints:
4 |         return True
5 |     self.fingerprints.add(fp)
6 |     if self.file:
7 |         self.file.write(fp + os.linesep)
--------------------------------------------------------------------------------
/README.md:
-------------------------------------------------------------------------------- 1 | # spiderbook 2 | 3 | 《Python3 网络爬虫宝典》书籍配套代码 4 | 5 | ![封面](http://can.sfhfpc.com/uPic/WechatIMG146.jpeg) 6 | 7 | 8 | 章节中所用到的代码片段均在该项目中,例如 9 | 10 | 第 `5` 章中的 `代码片段 5-6` 存放路径为 `spiderbook/第5章/5-6.code` 11 | 12 | 其他代码片段依此类推 13 | -------------------------------------------------------------------------------- /第2章/2-6.py: -------------------------------------------------------------------------------- 1 | def request(flow): 2 | if ".png" in flow.request.url: 3 | # 判断 .png 是否在请求 URL 中 4 | with open("image.txt", "a+") as file: 5 | # 保存 URL 6 | file.write(flow.request.url) 7 | file.write("\n") -------------------------------------------------------------------------------- /第5章/5-1.html: -------------------------------------------------------------------------------- 1 |
2 | article
3 |
4 | hello world
5 |
6 |
7 | pre
8 | next
9 |
10 |
-------------------------------------------------------------------------------- /第1章/1-7.py: -------------------------------------------------------------------------------- 1 | import parsel 2 | 3 | 4 | sel = parsel.Selector(html) 5 | publisher = sel.css(".publisher::text").extract_first() 6 | pub_time = sel.css(".pubTime::text").extract_first() 7 | content = "\n".join(sel.css(".content p::text").extract()) 8 | 9 | print(publisher, "\n", pub_time, "\n", content) -------------------------------------------------------------------------------- /第4章/4-3.command: -------------------------------------------------------------------------------- 1 | # MySQL 2 | > insert into WaitCrawl (id, name, url) VALUES (1, "exam", "http://exam.com"); 3 | Query OK, 1 row affected (0.01 sec) 4 | > insert into WaitCrawl (id, name, url) VALUES (2, "exam", "http://exam.com"); 5 | ERROR 1062 (23000): Duplicate entry 'http://exam.com' for key 'url' -------------------------------------------------------------------------------- /第5章/5-7.code: -------------------------------------------------------------------------------- 1 | def calc_new_score(self, std): 2 | for node_hash, node_info in self.node_info.items(): 3 | score = np.log(std) * node_info['density'] * np.log10(node_info['text_tag_count'] + 2) * np.log( 4 | node_info['sbdi']) 5 | self.node_info[node_hash]['score'] = score -------------------------------------------------------------------------------- /第6章/6-3.code: -------------------------------------------------------------------------------- 1 | def startService(self): 2 | for slot in range(self.max_proc): 3 | self._wait_for_project(slot) 4 | log.msg(format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r', 5 | version=__version__, max_proc=self.max_proc, 6 | runner=self.runner, system='Launcher') -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /第1章/1-8.py: -------------------------------------------------------------------------------- 1 | import parsel 2 | 3 | 4 | sel = parsel.Selector(html) 5 | publisher = sel.xpath("//span[@class='publisher']/text()").extract_first() 6 | pub_time = sel.xpath("//span[@class='pubTime']/text()").extract_first() 7 | content = "\n".join(sel.xpath("//div[@class='content']/p/text()").extract()) 8 | 9 | print(publisher, "\n", pub_time, "\n", content) -------------------------------------------------------------------------------- /第2章/2-4.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | 4 | async def main(): 5 | browser = await launch() 6 | page = await browser.newPage() 7 | await page.goto('http://example.com') 8 | await page.screenshot({'path': 'example.png'}) 9 | await browser.close() 10 | 11 | asyncio.get_event_loop().run_until_complete(main()) -------------------------------------------------------------------------------- /第5章/5-5.code: -------------------------------------------------------------------------------- 1 | def calc_sbdi(self, text, ti, lti): 2 | """ 3 | Ti - LTi 4 | SbDi = -------------- 5 | Sbi + 1 6 | 7 | SbDi:符号密度 8 | Sbi:符号数量 9 | 10 | :return: 11 | """ 12 | sbi = self.count_punctuation_num(text) 13 | sbdi = (ti - lti) / (sbi + 1) 14 | return sbdi or 1 # sbdi 不能为0,否则会导致求对数时报错。 
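
下面用一组假设的数值快速验证 calc_sbdi 中的 SbDi 公式(数值仅为演示用的假设,并非取自真实页面):

# 假设某节点统计得到:字符串字数 Ti=120,带链接的字符串字数 LTi=20,符号数量 Sbi=9
ti, lti, sbi = 120, 20, 9
# 套用公式 SbDi = (Ti - LTi) / (Sbi + 1)
sbdi = (ti - lti) / (sbi + 1)
print(sbdi)  # 输出 10.0;若计算结果为 0,calc_sbdi 会借助 "or 1" 返回 1,避免后续 np.log(0)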
-------------------------------------------------------------------------------- /第4章/4-4.command: -------------------------------------------------------------------------------- 1 | # Redis 2 | # 插入数据 3 | > SADD WaitCrawl mysql 4 | (integer) 1 5 | > SADD WaitCrawl redis 6 | (integer) 1 7 | > SADD WaitCrawl mongodb 8 | (integer) 1 9 | > SADD WaitCrawl sqlite 10 | (integer) 1 11 | > SADD WaitCrawl redis 12 | (integer) 0 13 | # 查询集合 14 | > SMEMBERS WaitCrawl 15 | 1) "redis" 16 | 2) "sqlite" 17 | 3) "mongodb" 18 | 4) "mysql" -------------------------------------------------------------------------------- /第3章/3-6.py: -------------------------------------------------------------------------------- 1 | from pybloom_live import BloomFilter 2 | 3 | # 初始化 BloomFilter 对象,设定容量为 1000,误判几率 0.001 4 | f = BloomFilter(capacity=1000, error_rate=0.001) 5 | # 循环将 0~4 的数字添加到 vector 中,并打印返回结果 6 | res = [f.add(x) for x in range(5)] 7 | print(res) 8 | # 单独将数字 4 添加到 vector 中,并打印返回结果 9 | print(f.add(3)) 10 | # 判断数字 10 和数字 5 是否在 vector 中,并打印判断结果 11 | print(10 in f) 12 | print(5 in f) -------------------------------------------------------------------------------- /第4章/4-12.py: -------------------------------------------------------------------------------- 1 | import pika 2 | from pymongo import MongoClient 3 | 4 | # 连接 RabbitMQ 5 | auth = pika.PlainCredentials("books", "spider") 6 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth)) 7 | channel = connection.channel() 8 | queue = "dcs" 9 | 10 | 11 | # 连接 MongoDB 12 | client = MongoClient('localhost', 27017) 13 | db = client.news 14 | detail = db.detail -------------------------------------------------------------------------------- /第4章/4-6.code: -------------------------------------------------------------------------------- 1 | import requests 2 | # 假设页码 id 为递增数字 3 | for i in range(20): 4 | # 构造列表页单页 URL 5 | page_url = "http://example.com?page=%s" % i 6 | # 向列表页发出请求 7 | resp = requests.get(page_url) 8 | # 从返回结果中抽取详情页 URL 9 | url_list = [x for x in resp.text] 10 | for url in url_list: 11 | # 向详情页发出请求 12 | article = requests.get(url) 13 | # 拿到目标数据 14 | text = article.text -------------------------------------------------------------------------------- /第5章/5-8.code: -------------------------------------------------------------------------------- 1 | class TimeExtractor: 2 | def __init__(self): 3 | self.time_pattern = DATETIME_PATTERN 4 | 5 | def extractor(self, element: HtmlElement): 6 | text = ''.join(element.xpath('.//text()')) 7 | for dt in self.time_pattern: 8 | dt_obj = re.search(dt, text) 9 | if dt_obj: 10 | return dt_obj.group(1) 11 | else: 12 | return '' -------------------------------------------------------------------------------- /第5章/5-9.code: -------------------------------------------------------------------------------- 1 | class AuthorExtractor: 2 | def __init__(self): 3 | self.author_pattern = AUTHOR_PATTERN 4 | 5 | def extractor(self, element: HtmlElement): 6 | text = ''.join(element.xpath('.//text()')) 7 | for pattern in self.author_pattern: 8 | author_obj = re.search(pattern, text) 9 | if author_obj: 10 | return author_obj.group(1) 11 | return '' -------------------------------------------------------------------------------- /第2章/2-3.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.common.action_chains import ActionChains 4 | 5 | 6 | with webdriver.Chrome() as driver: 7 | # 访问指定网址 8 
| driver.get("https://www.phei.com.cn/module/goods/wssd_index.jsp") 9 | # 定位版权信息 10 | footer = driver.find_element_by_class_name("web_book_footer") 11 | # 移动到指定位置 12 | ActionChains(driver).move_to_element(footer).perform() 13 | time.sleep(10) -------------------------------------------------------------------------------- /第1章/1-4.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import asyncio 3 | 4 | 5 | async def wait(): 6 | await asyncio.sleep(5) 7 | print("等我 5 秒钟") 8 | 9 | 10 | async def print_time(word): 11 | print(word, datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 12 | 13 | 14 | async def main(): 15 | await print_time("开始") 16 | await wait() 17 | await print_time("结束") 18 | 19 | loop = asyncio.get_event_loop() 20 | loop.run_until_complete(main()) 21 | loop.close() -------------------------------------------------------------------------------- /第1章/1-2.py: -------------------------------------------------------------------------------- 1 | import re 2 | import parsel 3 | import requests 4 | 5 | 6 | url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml" 7 | req = requests.get(url) 8 | text = req.content.decode("utf8") 9 | title = re.search("
(.*)
", text).group(1) 10 | sel = parsel.Selector(text) 11 | content = "\n".join(sel.css(".column_content_inner p font::text").extract()) 12 | with open("about.txt", "a") as file: 13 | file.write(title) 14 | file.write("\n") 15 | file.write(content) -------------------------------------------------------------------------------- /第4章/4-11.py: -------------------------------------------------------------------------------- 1 | import pika 2 | 3 | 4 | def callback(ch, method, properties, body): 5 | print(" [x] Received %r" % body) 6 | 7 | 8 | auth = pika.PlainCredentials("books", "spider") 9 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth)) 10 | channel = connection.channel() 11 | channel.basic_consume( 12 | queue='message_box', on_message_callback=callback, auto_ack=True) 13 | 14 | print(' [*] Waiting for messages. To exit press CTRL+C') 15 | channel.start_consuming() -------------------------------------------------------------------------------- /第4章/4-10.py: -------------------------------------------------------------------------------- 1 | import pika 2 | 3 | 4 | auth = pika.PlainCredentials("books", "spider") 5 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth)) 6 | 7 | channel = connection.channel() 8 | channel.queue_declare(queue='message_box') 9 | for i in range(5): 10 | channel.basic_publish(exchange='', 11 | routing_key='message_box', 12 | body='Hello World-{}'.format(i)) 13 | print(" [x] Sent 'Hello World-{}'".format(i)) 14 | connection.close() -------------------------------------------------------------------------------- /第1章/1-1.py: -------------------------------------------------------------------------------- 1 | import re 2 | import parsel 3 | from urllib import request 4 | 5 | 6 | url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml" 7 | with request.urlopen(url) as req: 8 | text = req.read().decode("utf8") 9 | title = re.search("
(.*)
", text).group(1) 10 | sel = parsel.Selector(text) 11 | content = "\n".join(sel.css(".column_content_inner p font::text").extract()) 12 | with open("about.txt", "a") as file: 13 | file.write(title) 14 | file.write("\n") 15 | file.write(content) -------------------------------------------------------------------------------- /.idea/spiderbook.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /第1章/1-6.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
<h1>新溪大桥早高峰报道:堵成一窝蜂</h1>
4 | 是否让白沙大桥帮助每小时前进300米的新溪大桥分流呢
5 | <div>
6 |     <div>
7 |         <span class="publisher">发布者:今日新闻</span>|<span class="pubTime">发布时间:2020-1-29</span>
8 |     </div>
9 |     <div class="content">
10 |         <p>新溪大桥于 2018 年 6 月正式启用通车……</p>
11 |         <p>……</p>
12 |         <p>……</p>
13 |         <p>……</p>
14 |         <p>记者:王大力、陈小七(实习)</p>
15 |     </div>
16 | </div>
17 | 18 | -------------------------------------------------------------------------------- /第3章/3-2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # 创建一个集合,作为增量池 4 | after = set() 5 | # 设定 URL 列表 6 | urls = ["http://www.example.com?page=1&id=2r9l74hjng", 7 | "http://www.example.com?page=1&id=9kiujamzj6", 8 | "http://www.example.com?page=1&id=77274jnasf", 9 | "http://www.example.com?page=1&id=9kiujamzj6" 10 | ] 11 | # 循环 URL 列表 12 | for url in urls: 13 | # 条件判断 14 | if url not in after: 15 | # 如果 URL 不在增量池中则向目标网页发出请求 16 | resp = requests.get(url) 17 | # 发出请求后,将 URL 添加到增量池 18 | after.add(url) 19 | else: 20 | # 不作处理 21 | pass 22 | print(len(after), after) -------------------------------------------------------------------------------- /第6章/6-4.code: -------------------------------------------------------------------------------- 1 | def _spawn_process(self, message, slot): 2 | msg = native_stringify_dict(message, keys_only=False) 3 | project = msg['_project'] 4 | args = [sys.executable, '-m', self.runner, 'crawl'] 5 | args += get_crawl_args(msg) 6 | e = self.app.getComponent(IEnvironment) 7 | env = e.get_environment(msg, slot) 8 | env = native_stringify_dict(env, keys_only=False) 9 | pp = ScrapyProcessProtocol(slot, project, msg['_spider'], \ 10 | msg['_job'], env) 11 | pp.deferred.addBoth(self._process_finished, slot) 12 | reactor.spawnProcess(pp, sys.executable, args=args, env=env) 13 | self.processes[slot] = pp -------------------------------------------------------------------------------- /第6章/6-2.code: -------------------------------------------------------------------------------- 1 | class AddVersion(WsResource): 2 | 3 | def render_POST(self, txrequest): 4 | eggf = BytesIO(txrequest.args.pop(b'egg')[0]) 5 | args = native_stringify_dict(copy(txrequest.args), keys_only=False) 6 | project = args['project'][0] 7 | version = args['version'][0] 8 | self.root.eggstorage.put(eggf, project, version) 9 | spiders = get_spider_list(project, version=version) 10 | self.root.update_projects() 11 | UtilsCache.invalid_cache(project) 12 | return {"node_name": self.root.nodename, "status": "ok", "project": project, "version": version, \ 13 | "spiders": len(spiders)} -------------------------------------------------------------------------------- /第3章/3-4.py: -------------------------------------------------------------------------------- 1 | import time 2 | import string 3 | import random 4 | import pymongo 5 | 6 | 7 | # 连接 MongoDB 8 | client = pymongo.MongoClient("localhost", 27017) 9 | # 使用 test 数据库 10 | db = client.test 11 | 12 | 13 | for i in range(500000): 14 | base_url = "http://www.******.com" 15 | # 生成 6 位的随机小写字母组合 16 | article = ''.join(random.choices(string.ascii_lowercase, k=6)) 17 | # 生成时间戳 18 | timestamp = int(time.time()) 19 | # 生成 sign 参数 20 | sign = article + str(timestamp * 3) 21 | # 拼接成常见的 URL 22 | url = "%s?page=1&article=%s&sign=%s×=%s" % (base_url, article, sign, timestamp) 23 | # 往 mongodb 集合中插入数据 24 | db.sfhfpc.insert_one({article: url}) -------------------------------------------------------------------------------- /第2章/2-1.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support.expected_conditions import presence_of_element_located 6 | 7 | #This example 
requires Selenium WebDriver 3.13 or newer 8 | with webdriver.Firefox() as driver: 9 | wait = WebDriverWait(driver, 10) 10 | driver.get("https://***.com") 11 | driver.find_element_by_name("q").send_keys("cheese" + Keys.RETURN) 12 | first_result = wait.until(presence_of_element_located((By.CSS_SELECTOR, "h3>div"))) 13 | print(first_result.get_attribute("textContent")) -------------------------------------------------------------------------------- /第2章/2-2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | with webdriver.Chrome() as driver: 4 | # 访问指定网址 5 | driver.get("https://www.phei.com.cn/module/goods/wssd_index.jsp") 6 | # 定位图书列表 7 | lis = driver.find_elements_by_css_selector("#book_sort_area > ul:nth-child(1) > li") 8 | # 循环图书列表并从中提取图书信息 9 | for i in lis: 10 | image = i.find_element_by_css_selector("p > a > img").get_attribute("src") 11 | book = i.find_element_by_css_selector("p.li_title > a").text 12 | author = i.find_element_by_css_selector("p.li_author").text.split("\n")[0] 13 | price = i.find_element_by_css_selector("p.li_author > i").text 14 | print([book, price, author, image]) -------------------------------------------------------------------------------- /第4章/4-1.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | # 假设几个爬取目标的 URL 5 | url1 = "http://example.com?x=1" 6 | url2 = "http://example.com?x=2" 7 | url3 = "http://example.com?x=3" 8 | 9 | # 初始化待爬队列 before 和已爬队列 after 10 | before = set() 11 | after = set() 12 | 13 | # 模拟爬虫程序将 URL 存储到待爬队列 14 | before.add(url1) 15 | before.add(url2) 16 | before.add(url3) 17 | 18 | # 打印队列长度 19 | print("未向目标 URL 发出请求时,待爬队列的长度为 %s,已爬队列的长度为 %s" % (len(before), len(after))) 20 | 21 | while len(before): 22 | # 模拟爬虫程序从待爬队列中取出 URL 23 | request_url = before.pop() 24 | # 模拟爬虫程序请求 URL 25 | resp = requests.get(request_url) 26 | # 模拟爬虫程序将 URL 放入已爬队列 27 | after.add(request_url) 28 | 29 | # 打印队列长度 30 | print("完成请求后,待爬队列的长度为 %s,已爬队列的长度为 %s" % (len(before), len(after))) -------------------------------------------------------------------------------- /第6章/6-11.code: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Monitor(ABC): 5 | """异常监控器""" 6 | 7 | @abstractmethod 8 | def push(self): 9 | """接收器 10 | 被捕获到的异常信息将会送到这里""" 11 | 12 | @abstractmethod 13 | def extractor(self): 14 | """拆分车间 15 | 根据需求拆分异常信息""" 16 | 17 | @abstractmethod 18 | def recombination(self): 19 | """重组车间 20 | 异常信息将在这里重组""" 21 | 22 | 23 | class Alarm(ABC): 24 | """警报器""" 25 | 26 | @abstractmethod 27 | def __init__(self): 28 | """初始配置""" 29 | 30 | def receive(self): 31 | """接收者 32 | 接收异常信息,将其进行处理后交给发送者""" 33 | 34 | @abstractmethod 35 | def sender(self): 36 | """发送者 37 | 将重组后的信息发送到端""" -------------------------------------------------------------------------------- /第3章/3-1.code: -------------------------------------------------------------------------------- 1 | # 伪代码 2 | import requests 3 | 4 | # 向目标网页发出请求,假设页面 id 为 3376 5 | now_html_data = requests.get("http://example.com/article=3376") 6 | # 解析页面 7 | data = parse(now_html_data) 8 | # 抽取页面内容 9 | article = 3376 10 | title = data.title 11 | salay = data.salay 12 | place = data.place 13 | edu = data.edu 14 | # 从数据库中查询与页面 id 相同的数据 15 | mysql_data = query("select * from info where aid=3376") 16 | # 判断,如果页面内容与数据库存储的数据不同,则更新数据库 17 | if ([article != mysql_data.id, title != mysql_data.title, 18 | salay != mysql_data.salay, 
place != mysql_data.place, 19 | edu != mysql_data.edu 20 | ]): 21 | # 更新数据库 22 | query("update info set title=%s, salay=%s, place=%s, edu=%s where aid=3376" 23 | % (title, salay, place, edu)) 24 | else: 25 | # 如果页面内容与数据库存储的数据相同,则不作处理 26 | pass -------------------------------------------------------------------------------- /第3章/3-5.code: -------------------------------------------------------------------------------- 1 | >>> import pybloom_live 2 | >>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001) 3 | >>> [f.add(x) for x in range(10)] 4 | [False, False, False, False, False, False, False, False, False, False] 5 | >>> all([(x in f) for x in range(10)]) 6 | True 7 | >>> 10 in f 8 | False 9 | >>> 5 in f 10 | True 11 | >>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001) 12 | >>> for i in xrange(0, f.capacity): 13 | ... _ = f.add(i) 14 | >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 15 | True 16 | 17 | >>> sbf = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH) 18 | >>> count = 10000 19 | >>> for i in range(0, count): 20 | _ = sbf.add(i) 21 | 22 | >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 23 | True -------------------------------------------------------------------------------- /第5章/5-4.code: -------------------------------------------------------------------------------- 1 | def calc_text_density(self, element): 2 | """ 3 | 根据公式: 4 | 5 | Ti - LTi 6 | TDi = ----------- 7 | TGi - LTGi 8 | 9 | 10 | Ti:节点 i 的字符串字数 11 | LTi:节点 i 的带链接的字符串字数 12 | TGi:节点 i 的标签数 13 | LTGi:节点 i 的带连接的标签数 14 | 15 | 16 | :return: 17 | """ 18 | ti_text = '\n'.join(self.get_all_text_of_element(element)) 19 | ti = len(ti_text) 20 | lti = len(''.join(self.get_all_text_of_element(element.xpath('.//a')))) 21 | tgi = len(element.xpath('.//*')) 22 | ltgi = len(element.xpath('.//a')) 23 | if (tgi - ltgi) == 0: 24 | return {'density': 0, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi} 25 | density = (ti - lti) / (tgi - ltgi) 26 | return {'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi} -------------------------------------------------------------------------------- /第1章/1-5.py: -------------------------------------------------------------------------------- 1 | import re 2 | import aiohttp 3 | import asyncio 4 | import parsel 5 | 6 | 7 | async def fetch(session, url): 8 | async with session.get(url) as response: 9 | return await response.text() 10 | 11 | 12 | async def main(): 13 | async with aiohttp.ClientSession() as session: 14 | html = await fetch(session, 'https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml') 15 | title = re.search("
(.*)
", html).group(1) 16 | sel = parsel.Selector(html) 17 | content = "\n".join(sel.css(".column_content_inner p font::text").extract()) 18 | with open("about.txt", "a") as file: 19 | file.write(title) 20 | file.write("\n") 21 | file.write(content) 22 | 23 | if __name__ == '__main__': 24 | loop = asyncio.get_event_loop() 25 | loop.run_until_complete(main()) -------------------------------------------------------------------------------- /第4章/4-14.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import parsel 4 | from urllib.parse import urljoin 5 | from common import channel, queue 6 | from common import detail 7 | 8 | 9 | def callback(ch, method, properties, body): 10 | url = str(body, "utf8") 11 | print(url) 12 | resp = requests.get(url) 13 | sel = parsel.Selector(resp.content.decode("utf8")) 14 | the_time = sel.css(".news_date::text").extract_first() 15 | pub_time = re.search("(\d+-\d+-\d+)", the_time).group(1) 16 | # 为保持文章排版和样式,保留标签 17 | contents = sel.css(".news_content p").extract() 18 | content = "\n".join(contents) 19 | # 将文章数据存入 MongoDB 20 | detail.insert_one({"pubTime": pub_time, "url": url, "content": content}) 21 | 22 | 23 | channel.basic_consume( 24 | queue=queue, on_message_callback=callback, auto_ack=True) 25 | 26 | channel.start_consuming() -------------------------------------------------------------------------------- /第4章/4-7.code: -------------------------------------------------------------------------------- 1 | def request_fingerprint(request, include_headers=None): 2 | if include_headers: 3 | include_headers = tuple(to_bytes(h.lower()) 4 | for h in sorted(include_headers)) 5 | cache = _fingerprint_cache.setdefault(request, {}) 6 | if include_headers not in cache: 7 | fp = hashlib.sha1() 8 | fp.update(to_bytes(request.method)) 9 | fp.update(to_bytes(canonicalize_url(request.url))) 10 | fp.update(request.body or b'') 11 | if include_headers: 12 | for hdr in include_headers: 13 | if hdr in request.headers: 14 | fp.update(hdr) 15 | for v in request.headers.getlist(hdr): 16 | fp.update(v) 17 | cache[include_headers] = fp.hexdigest() 18 | return cache[include_headers] -------------------------------------------------------------------------------- /第6章/6-1.code: -------------------------------------------------------------------------------- 1 | class Schedule(WsResource): 2 | 3 | def render_POST(self, txrequest): 4 | args = native_stringify_dict(copy(txrequest.args), keys_only=False) 5 | settings = args.pop('setting', []) 6 | settings = dict(x.split('=', 1) for x in settings) 7 | args = dict((k, v[0]) for k, v in args.items()) 8 | project = args.pop('project') 9 | spider = args.pop('spider') 10 | version = args.get('_version', '') 11 | spiders = get_spider_list(project, version=version) 12 | if not spider in spiders: 13 | return {"status": "error", "message": "spider '%s' not found" % spider} 14 | args['settings'] = settings 15 | jobid = args.pop('jobid', uuid.uuid1().hex) 16 | args['_job'] = jobid 17 | self.root.scheduler.schedule(project, spider, **args) 18 | return {"node_name": self.root.nodename, "status": "ok", "jobid": jobid} -------------------------------------------------------------------------------- /第4章/4-13.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import parsel 3 | from urllib.parse import urljoin 4 | from common import channel, queue 5 | 6 | 7 | urls = ["https://www.phei.com.cn/xwxx/index_{}.shtml".format(i) for i in 
range(1, 46)] 8 | urls.append("https://www.phei.com.cn/xwxx/index.shtml") 9 | 10 | for url in urls: 11 | # 翻页爬取 12 | resp = requests.get(url) 13 | sel = parsel.Selector(resp.content.decode("utf8")) 14 | li = sel.css(".web_news_list ul li.li_b60") 15 | for news in li: 16 | link = news.css("a:first-child::attr('href')").extract_first() 17 | full_link = urljoin(url, link) # 拼接完整 URL 18 | # 将新闻资讯详情页 URL 发布到 RabbitMQ 队列 19 | channel.queue_declare(queue=queue) 20 | channel.basic_publish(exchange='', 21 | routing_key=queue, 22 | body='{}'.format(full_link)) 23 | print("[x] Sent '{}'".format(urljoin(url, link))) -------------------------------------------------------------------------------- /第4章/4-9.command: -------------------------------------------------------------------------------- 1 | # 步骤 1,更改 settings.py 中的配置 2 | # 设置调度器 3 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 4 | # 设置去重器 5 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 6 | # 更改管道器 7 | ITEM_PIPELINES = { 8 | 'scrapy_redis.pipelines.RedisPipeline': 300 9 | } 10 | # 设置队列 11 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 12 | # 设置 Redis 连接参数,其中包括用户名、密码、地址和端口号 13 | REDIS_HOST = 'localhost' 14 | REDIS_URL = 'redis://user:pass@hostname:9001' 15 | 16 | # 步骤 2,在终端执行 17 | $ scrapy startproject Example 18 | $ cd Example 19 | $ scrapy genspider example example.com 20 | 21 | # 步骤 3 22 | from scrapy_redis.spiders import RedisSpider 23 | class ExampleSpider(RedisSpider): 24 | name = 'example' 25 | allowed_domains = ['example.com'] 26 | def parse(self, response): 27 | # do stuff 28 | pass 29 | 30 | # 步骤 4,在终端执行 31 | $ scrapy runspider example.py 32 | 33 | # 步骤 5,在 Redis-Client 执行 34 | > lpush example:start_urls http://example.com -------------------------------------------------------------------------------- /第5章/5-2.code: -------------------------------------------------------------------------------- 1 | from .utils import pre_parse, remove_noise_node 2 | from gne.extractor import ContentExtractor, TitleExtractor, TimeExtractor, AuthorExtractor 3 | 4 | 5 | class GeneralNewsExtractor: 6 | def extract(self, html, title_xpath='', noise_node_list=None, with_body_html=False): 7 | element = pre_parse(html) 8 | remove_noise_node(element, noise_node_list) 9 | content = ContentExtractor().extract(element, with_body_html) 10 | title = TitleExtractor().extract(element, title_xpath=title_xpath) 11 | publish_time = TimeExtractor().extractor(element) 12 | author = AuthorExtractor().extractor(element) 13 | result = {'title': title, 14 | 'author': author, 15 | 'publish_time': publish_time, 16 | 'content': content[0][1]['text'], 17 | 'images': content[0][1]['images']} 18 | if with_body_html: 19 | result['body_html'] = content[0][1]['body_html'] 20 | return result -------------------------------------------------------------------------------- /第4章/4-5.command: -------------------------------------------------------------------------------- 1 | # MongoDB 2 | # 为集合 WaitCrawl 中的 url 创建 unique 约束 3 | > db.WaitCrawl.ensureIndex({"url": 1}, {"unique": true}); 4 | { 5 | "createdCollectionAutomatically" : true, 6 | "numIndexesBefore" : 1, 7 | "numIndexesAfter" : 2, 8 | "ok" : 1 9 | } 10 | # 插入第 1 条数据 11 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com"}); 12 | WriteResult({ "nInserted" : 1 }) 13 | # 插入第 2 条数据 14 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com"}); 15 | WriteResult({ 16 | "nInserted" : 0, 17 | "writeError" : { 18 | "code" : 11000, 19 | "errmsg" : "E11000 duplicate key error collection: 
WaitCrawl.WaitCrawl index: url_1 dup key: { : \"http://exam.com\" }" 20 | } 21 | }) 22 | # 插入第 3 条数据 23 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com2"}); 24 | WriteResult({ "nInserted" : 1 }) 25 | # 查看集合 WaitCrawl 中的文档 26 | > db.WaitCrawl.find(); 27 | { "_id" : ObjectId("5dc3cd3cba05dc8f5eeac929"), "name" : "exam", "url" : "http://exam.com" } 28 | { "_id" : ObjectId("5dc3cdb7ba05dc8f5eeac92b"), "name" : "exam", "url" : "http://exam.com2" } -------------------------------------------------------------------------------- /第1章/1-9.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import parsel 3 | from urllib.parse import urljoin 4 | from pymongo import MongoClient 5 | 6 | # 连接数据库并指定数据库和集合 7 | client = MongoClient('localhost', 27017) 8 | db = client.news 9 | collection = db.phei 10 | 11 | 12 | urls = ["https://www.phei.com.cn/xwxx/index_{}.shtml".format(i) for i in range(45)] 13 | urls.append("https://www.phei.com.cn/xwxx/index.shtml") 14 | 15 | for url in urls: 16 | # 翻页爬取 17 | resp = requests.get(url) 18 | sel = parsel.Selector(resp.content.decode("utf8")) 19 | li = sel.css(".web_news_list ul li.li_b60") 20 | for news in li: 21 | # 从单页中提取资讯信息 22 | title = news.css("p.li_news_title::text").extract_first() 23 | pub_time = news.css("span::text").extract_first() 24 | desc = news.css("p.li_news_summary::text").extract_first() 25 | image = news.css("div.li_news_line img::attr('src')").extract_first() 26 | full_image = urljoin(url, image) # 完整图片链接 27 | # 将数据存入 MongoDB 数据库中 28 | collection.insert_one({"title": title, "pubTime": pub_time, 29 | "image": full_image, "desc": desc}) -------------------------------------------------------------------------------- /第6章/6-7.code: -------------------------------------------------------------------------------- 1 | class LoginHandler(MethodView): 2 | 3 | def post(self): 4 | username = request.json.get("username") 5 | pwd = request.json.get("password") 6 | password = md5_encode(pwd) 7 | # 支持用户名或邮箱登录 8 | query = {"username": username, "password": password} 9 | name_exit = databases.user.count_documents(query) 10 | # 校验用户是否存在 11 | if not name_exit: 12 | query = {"email": username, "password": password} 13 | result = databases.user.find_one(query) 14 | if not result: 15 | return {"message": StatusCode.NotFound.value[0], 16 | "data": {}, 17 | "code": StatusCode.NotFound.value[1] 18 | }, 400 19 | # 校验用户状态 20 | status = result.get("status") 21 | if not status: 22 | return {"message": StatusCode.UserStatusOff.value[0], 23 | "data": {}, 24 | "code": StatusCode.UserStatusOff.value[1] 25 | }, 400 26 | # 返回登录结果 27 | return {"message": "success", 28 | "data": {"username": username}, 29 | "code": 200} -------------------------------------------------------------------------------- /第2章/2-5.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import re 3 | from pyppeteer import launch 4 | 5 | async def main(): 6 | browser = await launch() 7 | page = await browser.newPage() 8 | await page.goto('https://www.phei.com.cn/module/goods/wssd_index.jsp') 9 | lis = await page.querySelectorAll("#book_sort_area ul:nth-child(1) li") 10 | for i in lis: 11 | image_element = await i.querySelector("p a img") 12 | image = await (await image_element.getProperty("src")).jsonValue() 13 | book_element = await i.querySelector("p.li_title a") 14 | book = await (await book_element.getProperty("textContent")).jsonValue() 15 | author_price_element = await 
i.querySelector("p.li_author") 16 | author_price = await (await author_price_element.getProperty("textContent")).jsonValue() 17 | try: 18 | author = re.search("作译者:(.*)定价", str(author_price)).group(1) 19 | price = re.search(r"(\d+.\d+)", str(author_price)).group(1) 20 | except Exception as exc: 21 | author, price = "", "" 22 | print(exc) 23 | print([book, price, author, image]) 24 | await browser.close() 25 | 26 | asyncio.get_event_loop().run_until_complete(main()) -------------------------------------------------------------------------------- /第6章/6-10.code: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from importlib import import_module 4 | 5 | from component.storage import FileStorages 6 | 7 | 8 | storages = FileStorages() 9 | 10 | 11 | class Helmsman: 12 | """为文件导入和执行创造条件的上下文管理器""" 13 | def __init__(self, project, version): 14 | self.project = project 15 | self.version = version 16 | self.storage = storages 17 | self.temp_file = "" 18 | 19 | def __enter__(self): 20 | """上文""" 21 | # 将文件拷贝到临时区 22 | target = self.storage.copy_to_temporary(project, version) 23 | self.temp_file = target 24 | if target: 25 | # 将文件路径添加到 sys.path 26 | sys.path.insert(0, target) 27 | 28 | def __exit__(self, exc_type, exc_val, exc_tb): 29 | """下文""" 30 | if os.path.exists(self.temp_file): 31 | # 清理临时区中对应的文件 32 | os.remove(self.temp_file) 33 | 34 | 35 | def main(project, version): 36 | helmsman = Helmsman(project, version) 37 | with helmsman: 38 | # 从指定的文件中导入模块并调用指定方法 39 | spider = import_module("sail") 40 | spider.main() 41 | 42 | 43 | if __name__ == "__main__": 44 | project, version = sys.argv[-2], sys.argv[-1] 45 | main(project, version) -------------------------------------------------------------------------------- /第3章/3-3.py: -------------------------------------------------------------------------------- 1 | import time 2 | import string 3 | import random 4 | import asyncio 5 | import aiomysql 6 | 7 | 8 | async def test_example_execute(loop): 9 | # 填写参数,以连接数据库 10 | conn = await aiomysql.connect(host='127.0.0.1', port=3306, 11 | user='root', password='******', 12 | db='football', loop=loop, 13 | autocommit=True) 14 | async with conn.cursor() as cur: 15 | # 循环 50 万次 16 | for i in range(500000): 17 | base_url = "http://www.******.com" 18 | # 生成 6 位的随机小写字母组合 19 | article = ''.join(random.choices(string.ascii_lowercase, k=6)) 20 | # 生成时间戳 21 | timestamp = int(time.time()) 22 | # 生成 sign 参数 23 | sign = article + str(timestamp * 3) 24 | # 拼接成常见的 URL 25 | url = "%s?page=1&article=%s&sign=%s×=%s" % (base_url, article, sign, timestamp) 26 | # SQL 语句 27 | query = "INSERT INTO player(url) VALUES ('%s');" % url 28 | # 执行指定的 SQL 语句 29 | await cur.execute(query) 30 | conn.close() 31 | 32 | 33 | loop = asyncio.get_event_loop() 34 | loop.run_until_complete(test_example_execute(loop)) -------------------------------------------------------------------------------- /第6章/6-5.code: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import shutil 4 | import tempfile 5 | from contextlib import contextmanager 6 | 7 | from scrapyd import get_application 8 | from scrapyd.interfaces import IEggStorage 9 | from scrapyd.eggutils import activate_egg 10 | 11 | @contextmanager 12 | def project_environment(project): 13 | app = get_application() 14 | eggstorage = app.getComponent(IEggStorage) 15 | eggversion = os.environ.get('SCRAPY_EGG_VERSION', None) 16 | version, eggfile = eggstorage.get(project, 
eggversion) 17 | if eggfile: 18 | prefix = '%s-%s-' % (project, version) 19 | fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg') 20 | lf = os.fdopen(fd, 'wb') 21 | shutil.copyfileobj(eggfile, lf) 22 | lf.close() 23 | activate_egg(eggpath) 24 | else: 25 | eggpath = None 26 | try: 27 | assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded" 28 | yield 29 | finally: 30 | if eggpath: 31 | os.remove(eggpath) 32 | 33 | def main(): 34 | project = os.environ['SCRAPY_PROJECT'] 35 | with project_environment(project): 36 | from scrapy.cmdline import execute 37 | execute() 38 | 39 | if __name__ == '__main__': 40 | main() -------------------------------------------------------------------------------- /第5章/5-3.code: -------------------------------------------------------------------------------- 1 | def extract(self, selector, with_body_html=False): 2 | body = selector.xpath('//body')[0] 3 | for node in iter_node(body): 4 | node_hash = hash(node) 5 | density_info = self.calc_text_density(node) 6 | text_density = density_info['density'] 7 | ti_text = density_info['ti_text'] 8 | text_tag_count = self.count_text_tag(node, tag='p') 9 | sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti']) 10 | images_list = node.xpath('.//img/@src') 11 | node_info = {'ti': density_info['ti'], 12 | 'lti': density_info['lti'], 13 | 'tgi': density_info['tgi'], 14 | 'ltgi': density_info['ltgi'], 15 | 'node': node, 16 | 'density': text_density, 17 | 'text': ti_text, 18 | 'images': images_list, 19 | 'text_tag_count': text_tag_count, 20 | 'sbdi': sbdi} 21 | if with_body_html: 22 | body_source_code = unescape(etree.tostring(node).decode()) 23 | node_info['body_html'] = body_source_code 24 | self.node_info[node_hash] = node_info 25 | std = self.calc_standard_deviation() 26 | self.calc_new_score(std) 27 | result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True) 28 | return result -------------------------------------------------------------------------------- /第6章/6-6.code: -------------------------------------------------------------------------------- 1 | class RegisterHandler(MethodView): 2 | 3 | def post(self): 4 | username = request.json.get("username") 5 | pwd = request.json.get("password") 6 | nick = request.json.get("nick") 7 | email = request.json.get("email") 8 | if not username or not pwd or not nick or not email or "@" not in email: 9 | return {"message": StatusCode.ParameterError.value[0], 10 | "data": {}, 11 | "code": StatusCode.ParameterError.value[1] 12 | }, 400 13 | password = pwd 14 | count = databases.user.count_documents({}) 15 | if not count: 16 | # 首次注册的账户为超级管理员,启动激活 17 | role = Role.SuperUser.value 18 | message = {"username": username, "password": password, 19 | "nick": nick, "email": email, 20 | "role": role, "status": Status.On.value} 21 | else: 22 | # 非首次注册账户默认为开发者,且未激活 23 | role = Role.Developer.value 24 | message = {"username": username, "password": password, 25 | "nick": nick, "email": email, 26 | "role": role, "status": Status.Off.value} 27 | message["create"] = datetime.now() 28 | # 将信息写入数据库并将相应信息返回给用户 29 | inserted = databases.user.insert_one(message).inserted_id 30 | message["id"] = str(inserted) 31 | message["username"] = username 32 | message["email"] = email 33 | message["role"] = role 34 | message.pop("_id") 35 | return {"message": "success", "data": message, "code": 201}, 201 -------------------------------------------------------------------------------- /第6章/6-9.code: 
-------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | from flask.views import MethodView 4 | from flask import request 5 | 6 | from component.enums import StatusCode 7 | from component.storage import FileStorages 8 | from connect import databases 9 | 10 | 11 | storages = FileStorages() 12 | 13 | 14 | class DeployHandler(MethodView): 15 | 16 | def post(self): 17 | """项目部署接口""" 18 | project = request.form.get('project') 19 | remark = request.form.get('remark') 20 | file = request.files.get('file') 21 | if not project or not file: 22 | # 确保参数和值存在 23 | return {"message": StatusCode.MissingParameter.value[0], 24 | "data": {}, 25 | "code": StatusCode.MissingParameter.value[1] 26 | }, 400 27 | filename = file.filename 28 | if not filename.endswith('.egg'): 29 | # 确保文件类型正确 30 | return {"message": StatusCode.NotFound.value[0], 31 | "data": {}, 32 | "code": StatusCode.NotFound.value[1] 33 | }, 400 34 | version = int(time.time()) 35 | content = file.stream.read() 36 | # 将文件存储到服务端 37 | result = storages.put(project, version, content) 38 | if not result: 39 | # 存储失败则返回相关提示 40 | return {"message": StatusCode.OperationError.value[0], 41 | "data": {}, 42 | "code": StatusCode.OperationError.value[1] 43 | }, 400 44 | message = {"project": project, 45 | "version": str(version), 46 | "remark": remark or "Nothing", 47 | "create": datetime.now()} 48 | databases.deploy.insert_one(message).inserted_id 49 | message["_id"] = str(message.pop("_id")) 50 | return {"message": "success", 51 | "data": message, 52 | "code": 201}, 201 -------------------------------------------------------------------------------- /第6章/6-8.code: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import shutil 4 | from settings import FILEPATH, TEMPATH 5 | 6 | 7 | class FileStorages: 8 | 9 | @staticmethod 10 | def put(project, version, content): 11 | """文件存储 12 | """ 13 | # 根据项目名称生成路径 14 | room = os.path.join(FILEPATH, project) 15 | if not os.path.exists(room): 16 | # 如果目录不存在则创建 17 | os.makedirs(room) 18 | # 拼接文件完整路径,以时间戳作为文件名 19 | filename = os.path.join(room, "%s.egg" % str(version)) 20 | try: 21 | with open(filename, 'wb') as file: 22 | # 写入文件 23 | file.write(content) 24 | except Exception as exc: 25 | # 异常处理,打印异常信息 26 | logging.warning(exc) 27 | return False 28 | return True 29 | 30 | def get(self): 31 | pass 32 | 33 | @staticmethod 34 | def delete(project, version): 35 | """文件删除状态 36 | A - 文件或目录存在且成功删除 37 | B - 文件或目录不存在,无需删除 38 | """ 39 | sign = 'B' 40 | room = os.path.join(FILEPATH, project) 41 | if project and version: 42 | # 删除指定文件 43 | filename = os.path.join(room, "%s.egg" % str(version)) 44 | if os.path.exists(filename): 45 | sign = 'A' 46 | os.remove(filename) 47 | if project and not version: 48 | # 删除指定目录 49 | if os.path.exists(room): 50 | sign = 'A' 51 | shutil.rmtree(room) 52 | return sign 53 | 54 | @staticmethod 55 | def copy_to_temporary(project, version): 56 | """根据参数将指定文件拷贝到指定目录 57 | """ 58 | before = os.path.join(FILEPATH, project, "%s.egg" % version) 59 | after = os.path.join(TEMPATH, "%s.egg" % version) 60 | if not os.path.exists(before): 61 | logging.warning("File %s Not Exists" % before) 62 | return None 63 | if not os.path.exists(TEMPATH): 64 | os.makedirs(TEMPATH) 65 | # 文件拷贝 66 | shutil.copyfile(before, after) 67 | return after 68 | 69 | @staticmethod 70 | def exists(project, version): 71 | """检查指定项目名称和版本号的文件是否存在""" 72 | file = os.path.join(FILEPATH, project, "%s.egg" 
% version) 73 | if not os.path.exists(file): 74 | return False 75 | return True -------------------------------------------------------------------------------- /第6章/6-13.code: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import hmac 3 | import time 4 | import base64 5 | import json 6 | import logging 7 | from urllib.parse import quote_plus 8 | import requests 9 | from interface import Alarm 10 | 11 | from supervise.monitors import MarkdownMonitor 12 | 13 | 14 | class DingAlarm(Alarm): 15 | 16 | def __init__(self): 17 | self.access_key = "xxx" 18 | self.secret = "GQSxx" 19 | self.token = "https://oapi.dingtalk.com/robot/send?access_token=xxx" 20 | self.header = {"Content-Type": "application/json;charset=UTF-8"} 21 | self.monitor = MarkdownMonitor() 22 | 23 | def receive(self, txt, occurrence, timer): 24 | """接收者 25 | 接收异常信息,将其进行处理后交给发送者""" 26 | content = self.monitor.push(txt, occurrence, timer) 27 | self.sender(content) 28 | 29 | @staticmethod 30 | def _sign(timestamps, secret, mode=False): 31 | """钉钉签名计算 32 | 根据钉钉文档指引计算签名信息 33 | 文档参考 34 | https://docs.python.org/3.6/library/hmac.html 35 | https://docs.python.org/3.6/library/urllib.parse.html#urllib.parse.quote 36 | https://ding-doc.dingtalk.com/doc#/faquestions/hxs5v9 37 | """ 38 | if not isinstance(timestamps, str): 39 | # 如果钉钉机器人的安全措施为密钥,那么按照文档指引传入的是字符串,反之为数字 40 | # 加密时需要转成字节,所以这里要确保时间戳为字符串 41 | timestamps = str(timestamps) 42 | mav = hmac.new(secret.encode("utf8"), digestmod=hashlib.sha256) 43 | mav.update(timestamps.encode("utf8")) 44 | result = mav.digest() 45 | # 对签名值进行 Base64 编码 46 | signature = base64.b64encode(result).decode("utf8") 47 | if mode: 48 | # 可选择是否将签名值进行 URL 编码 49 | signature = quote_plus(signature) 50 | return signature 51 | 52 | def sender(self, message): 53 | """发送者 54 | 将重组后的信息发送到端""" 55 | timestamps = int(time.time()) * 1000 56 | # sign = self._sign(timestamps, self.secret, True) 57 | # 根据钉钉文档构造链接 58 | url = self.token # + "×tamp=%s&sign=%s" % (timestamps, sign) 59 | # 通过钉钉机器人将消息发送到钉钉群 60 | resp = requests.post(url, headers=self.header, json=message) 61 | # 根据返回的错误码判断消息发送状态 62 | err = json.loads(resp.text) 63 | if err.get("errcode"): 64 | logging.warning(err) 65 | return False 66 | else: 67 | logging.info("Message Sender Success") 68 | return True -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /第6章/6-12.code: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from interface import Monitor 3 | 4 | 5 | class MarkdownMonitor(Monitor): 6 | 7 | def __init__(self): 8 | self.keyword = "Alarm" 9 | self.err_image = "http://can.sfhfpc.com/sfhfpc/20191210133853.png" 10 | self.traceback_image = "http://can.sfhfpc.com/sfhfpc/20191210133616.png" 11 | 12 | def push(self, txt, occurrence, timer): 13 | """接收器 14 | 被捕获到的异常信息将会送到这里""" 15 | 16 | # 将信息按行分割 17 | message = [] 18 | line = "" 19 | for i in txt: 20 | if i != "\n": 21 | line += i 22 | else: 23 | message.append(line) 24 | line = "" 25 | err, traceback, res = self.extractor(message) 26 | content = self.recombination(err, traceback, res, occurrence, timer) 27 | return content 28 | 29 | def extractor(self, message): 30 | """拆分车间 31 | 根据需求拆分异常信息""" 32 | result = [] 33 | err_number = 0 34 | traceback_number = 0 35 | for k, v in enumerate(message): 36 | # 异常分类 37 | if "ERROR" in v: 38 | # 列别数量统计 39 | err_number += 1 40 | # 放入信息队列 41 | result.append(v) 42 | if "Traceback" in v: 43 | # 类别数量统计 44 | traceback_number += 1 45 | # 放入信息队列 46 | result += message[k:] 47 | return err_number, traceback_number, result 48 | 49 | def recombination(self, err, traceback, res, occurrence, timer): 50 | """重组车间 51 | 异常信息将在这里重组""" 52 | title = "Traceback" if traceback else "Error" 53 | image = self.traceback_image if traceback else self.err_image 54 | err_message = "\n\n > ".join(res) 55 | now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 56 | # 按照钉钉文档中的 MarkDown 格式示例构造信息 57 | article = "#### TOTAL -- Error Number: {}, Traceback 
Number: {} \n".format(err, traceback) + \ 58 | "> ![screenshot]({}) \n\n".format(image) + \ 59 | "> **Error message** \n\n" + \ 60 | "> {} \n\n".format(err_message) + \ 61 | "> -------- \n\n" + \ 62 | "> **Timer**\n\n> {} \n\n".format(timer) +\ 63 | "> -------- \n\n" + \ 64 | "> **Other information** \n\n" + \ 65 | "> Occurrence Time: {} \n\n".format(occurrence) + \ 66 | "> Send Time: {} \n\n".format(now) + \ 67 | "> Message Type: {}".format(self.keyword) 68 | 69 | content = { 70 | "msgtype": "markdown", 71 | "markdown": {"title": title, "text": article} 72 | } 73 | return content --------------------------------------------------------------------------------
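
下面补充一个调用示意,展示 6-12 中的 MarkdownMonitor 与 6-13 中的 DingAlarm 如何配合:receive 会先把日志文本交给 MarkdownMonitor.push 拆分、重组,再由 sender 推送到钉钉群。示意中的模块路径 supervise/alarms.py、日志内容和 timer 取值均为演示用的假设,access_key、secret、token 需替换为真实的钉钉机器人配置:

from datetime import datetime
from supervise.alarms import DingAlarm  # 假设 DingAlarm 保存在该模块,请按实际项目结构调整

# 模拟一段包含 ERROR 的爬虫运行日志
log_text = "2020-01-29 10:00:00 [scrapy.core.scraper] ERROR: Spider error processing request\n"
# 异常发生时间与调度周期,仅作演示
occurrence = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
alarm = DingAlarm()
alarm.receive(log_text, occurrence, timer="*/5 * * * *")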