├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── spiderbook.iml
├── 第3章
│   ├── 3-7.conf
│   ├── 3-6.py
│   ├── 3-2.py
│   ├── 3-4.py
│   ├── 3-1.code
│   ├── 3-5.code
│   └── 3-3.py
├── 第5章
│   ├── 5-6.code
│   ├── 5-1.html
│   ├── 5-7.code
│   ├── 5-5.code
│   ├── 5-8.code
│   ├── 5-9.code
│   ├── 5-4.code
│   ├── 5-2.code
│   └── 5-3.code
├── 第4章
│   ├── 4-2.command
│   ├── 4-8.code
│   ├── 4-3.command
│   ├── 4-4.command
│   ├── 4-12.py
│   ├── 4-6.code
│   ├── 4-11.py
│   ├── 4-10.py
│   ├── 4-1.py
│   ├── 4-14.py
│   ├── 4-7.code
│   ├── 4-13.py
│   ├── 4-9.command
│   └── 4-5.command
├── 第1章
│   ├── 1-3.py
│   ├── 1-7.py
│   ├── 1-8.py
│   ├── 1-4.py
│   ├── 1-2.py
│   ├── 1-1.py
│   ├── 1-6.html
│   ├── 1-5.py
│   └── 1-9.py
├── README.md
├── 第2章
│   ├── 2-6.py
│   ├── 2-4.py
│   ├── 2-3.py
│   ├── 2-1.py
│   ├── 2-2.py
│   └── 2-5.py
├── 第6章
│   ├── 6-3.code
│   ├── 6-4.code
│   ├── 6-2.code
│   ├── 6-11.code
│   ├── 6-1.code
│   ├── 6-7.code
│   ├── 6-10.code
│   ├── 6-5.code
│   ├── 6-6.code
│   ├── 6-9.code
│   ├── 6-8.code
│   ├── 6-13.code
│   └── 6-12.code
└── .gitignore
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Default ignored files
3 | /workspace.xml
--------------------------------------------------------------------------------
/第3章/3-7.conf:
--------------------------------------------------------------------------------
1 | # save 900 1
2 | # save 300 10
3 | # save 60 10000
4 | save ""
--------------------------------------------------------------------------------
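
The three commented-out `save` lines plus `save ""` disable RDB snapshotting entirely. If editing redis.conf is inconvenient, the same setting can be applied at runtime; a minimal redis-py sketch, assuming a local Redis on the default port (a value set this way is not persisted across restarts):

    import redis

    r = redis.Redis(host="localhost", port=6379)
    # Equivalent to `save ""` in redis.conf: remove every RDB snapshot rule.
    r.config_set("save", "")
    print(r.config_get("save"))  # expect {'save': ''}
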
/第5章/5-6.code:
--------------------------------------------------------------------------------
1 | def calc_standard_deviation(self):
2 | score_list = [x['density'] for x in self.node_info.values()]
3 | std = np.std(score_list, ddof=1)
4 | return std
--------------------------------------------------------------------------------
/第4章/4-2.command:
--------------------------------------------------------------------------------
1 | # MySQL
2 | CREATE TABLE WaitCrawl
3 | (
4 | id int NOT NULL,
5 | name varchar(255) NOT NULL,
6 | url varchar(255) NOT NULL,
7 | UNIQUE (url)
8 | );
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/第1章/1-3.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 | def wait():
4 | time.sleep(5)
5 | print("开始", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
6 | wait()
7 | print("结束", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/第4章/4-8.code:
--------------------------------------------------------------------------------
1 | def request_seen(self, request):
2 | fp = self.request_fingerprint(request)
3 | if fp in self.fingerprints:
4 | return True
5 | self.fingerprints.add(fp)
6 | if self.file:
7 | self.file.write(fp + os.linesep)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spiderbook
2 |
3 | Companion code for the book 《Python3 网络爬虫宝典》.
4 |
5 |
6 |
7 |
8 | Every code snippet used in the chapters can be found in this project. For example,
9 |
10 | `代码片段 5-6` (code snippet 5-6) from Chapter `5` is stored at `spiderbook/第5章/5-6.code`,
11 |
12 | and the remaining snippets follow the same pattern.
13 |
--------------------------------------------------------------------------------
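
Since every snippet follows the `第N章/N-M.<ext>` naming rule described above, locating one from code is straightforward; a small sketch (the helper name and the assumption that the repository root is the working directory are illustrative, not part of the book):

    from pathlib import Path

    def snippet_path(snippet_id: str, root: str = ".") -> Path:
        """Resolve an ID such as '5-6' to 第5章/5-6.* (the extension varies by snippet)."""
        chapter = snippet_id.split("-")[0]
        matches = sorted(Path(root, "第%s章" % chapter).glob(snippet_id + ".*"))
        if not matches:
            raise FileNotFoundError(snippet_id)
        return matches[0]

    print(snippet_path("5-6"))  # 第5章/5-6.code
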
/第2章/2-6.py:
--------------------------------------------------------------------------------
1 | def request(flow):
2 | if ".png" in flow.request.url:
3 | # 判断 .png 是否在请求 URL 中
4 | with open("image.txt", "a+") as file:
5 | # 保存 URL
6 | file.write(flow.request.url)
7 | file.write("\n")
--------------------------------------------------------------------------------
/第5章/5-1.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/第1章/1-7.py:
--------------------------------------------------------------------------------
1 | import parsel
2 |
3 |
4 | sel = parsel.Selector(html)
5 | publisher = sel.css(".publisher::text").extract_first()
6 | pub_time = sel.css(".pubTime::text").extract_first()
7 | content = "\n".join(sel.css(".content p::text").extract())
8 |
9 | print(publisher, "\n", pub_time, "\n", content)
--------------------------------------------------------------------------------
/第4章/4-3.command:
--------------------------------------------------------------------------------
1 | # MySQL
2 | > insert into WaitCrawl (id, name, url) VALUES (1, "exam", "http://exam.com");
3 | Query OK, 1 row affected (0.01 sec)
4 | > insert into WaitCrawl (id, name, url) VALUES (2, "exam", "http://exam.com");
5 | ERROR 1062 (23000): Duplicate entry 'http://exam.com' for key 'url'
--------------------------------------------------------------------------------
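
The same `UNIQUE (url)` constraint can be relied on from Python: insert unconditionally and treat MySQL error 1062 as "this URL is already queued". A minimal PyMySQL sketch; the connection parameters and database name are placeholders:

    import pymysql

    conn = pymysql.connect(host="localhost", user="root", password="******", database="test")
    try:
        with conn.cursor() as cur:
            cur.execute(
                "INSERT INTO WaitCrawl (id, name, url) VALUES (%s, %s, %s)",
                (2, "exam", "http://exam.com"),
            )
        conn.commit()
    except pymysql.err.IntegrityError:
        # Duplicate entry for key 'url' (error 1062): the URL is already stored, skip it.
        pass
    finally:
        conn.close()
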
/第5章/5-7.code:
--------------------------------------------------------------------------------
1 | def calc_new_score(self, std):
2 | for node_hash, node_info in self.node_info.items():
3 | score = np.log(std) * node_info['density'] * np.log10(node_info['text_tag_count'] + 2) * np.log(
4 | node_info['sbdi'])
5 | self.node_info[node_hash]['score'] = score
--------------------------------------------------------------------------------
/第6章/6-3.code:
--------------------------------------------------------------------------------
1 | def startService(self):
2 | for slot in range(self.max_proc):
3 | self._wait_for_project(slot)
4 | log.msg(format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r',
5 | version=__version__, max_proc=self.max_proc,
6 | runner=self.runner, system='Launcher')
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/第1章/1-8.py:
--------------------------------------------------------------------------------
1 | import parsel
2 |
3 |
4 | sel = parsel.Selector(html)
5 | publisher = sel.xpath("//span[@class='publisher']/text()").extract_first()
6 | pub_time = sel.xpath("//span[@class='pubTime']/text()").extract_first()
7 | content = "\n".join(sel.xpath("//div[@class='content']/p/text()").extract())
8 |
9 | print(publisher, "\n", pub_time, "\n", content)
--------------------------------------------------------------------------------
/第2章/2-4.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from pyppeteer import launch
3 |
4 | async def main():
5 | browser = await launch()
6 | page = await browser.newPage()
7 | await page.goto('http://example.com')
8 | await page.screenshot({'path': 'example.png'})
9 | await browser.close()
10 |
11 | asyncio.get_event_loop().run_until_complete(main())
--------------------------------------------------------------------------------
/第5章/5-5.code:
--------------------------------------------------------------------------------
1 | def calc_sbdi(self, text, ti, lti):
2 | """
3 |           Ti - LTi
4 | SbDi = --------------
5 |            Sbi + 1
6 |
7 | SbDi:符号密度
8 | Sbi:符号数量
9 |
10 | :return:
11 | """
12 | sbi = self.count_punctuation_num(text)
13 | sbdi = (ti - lti) / (sbi + 1)
14 | return sbdi or 1 # sbdi 不能为0,否则会导致求对数时报错。
--------------------------------------------------------------------------------
/第4章/4-4.command:
--------------------------------------------------------------------------------
1 | # Redis
2 | # 插入数据
3 | > SADD WaitCrawl mysql
4 | (integer) 1
5 | > SADD WaitCrawl redis
6 | (integer) 1
7 | > SADD WaitCrawl mongodb
8 | (integer) 1
9 | > SADD WaitCrawl sqlite
10 | (integer) 1
11 | > SADD WaitCrawl redis
12 | (integer) 0
13 | # 查询集合
14 | > SMEMBERS WaitCrawl
15 | 1) "redis"
16 | 2) "sqlite"
17 | 3) "mongodb"
18 | 4) "mysql"
--------------------------------------------------------------------------------
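
What makes `SADD` usable as a de-duplication pool is its return value: 1 when the member is new, 0 when it already exists. The same check from redis-py, as a minimal sketch assuming a local Redis:

    import redis

    r = redis.Redis(host="localhost", port=6379)
    for url in ["http://exam.com/a", "http://exam.com/b", "http://exam.com/a"]:
        if r.sadd("WaitCrawl", url):
            print("new URL, worth crawling:", url)
        else:
            print("already seen, skip:", url)
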
/第3章/3-6.py:
--------------------------------------------------------------------------------
1 | from pybloom_live import BloomFilter
2 |
3 | # 初始化 BloomFilter 对象,设定容量为 1000,误判几率 0.001
4 | f = BloomFilter(capacity=1000, error_rate=0.001)
5 | # 循环将 0~4 的数字添加到 vector 中,并打印返回结果
6 | res = [f.add(x) for x in range(5)]
7 | print(res)
8 | # 单独将已存在的数字 3 再次添加到 vector 中,并打印返回结果
9 | print(f.add(3))
10 | # 判断数字 10 和数字 5 是否在 vector 中,并打印判断结果
11 | print(10 in f)
12 | print(5 in f)
--------------------------------------------------------------------------------
/第4章/4-12.py:
--------------------------------------------------------------------------------
1 | import pika
2 | from pymongo import MongoClient
3 |
4 | # 连接 RabbitMQ
5 | auth = pika.PlainCredentials("books", "spider")
6 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth))
7 | channel = connection.channel()
8 | queue = "dcs"
9 |
10 |
11 | # 连接 MongoDB
12 | client = MongoClient('localhost', 27017)
13 | db = client.news
14 | detail = db.detail
--------------------------------------------------------------------------------
/第4章/4-6.code:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | # 假设页码 id 为递增数字
4 | for i in range(20):
5 |     # 构造列表页单页 URL
6 |     page_url = "http://example.com?page=%s" % i
7 |     # 向列表页发出请求
8 |     resp = requests.get(page_url)
9 |     # 从返回结果中抽取详情页 URL(此处假设详情页链接可以用正则直接匹配)
10 |     url_list = re.findall(r'href="(.*?)"', resp.text)
11 |     for url in url_list:
12 |         # 向详情页发出请求
13 |         article = requests.get(url)
14 |         # 拿到目标数据
15 |         text = article.text
--------------------------------------------------------------------------------
/第5章/5-8.code:
--------------------------------------------------------------------------------
1 | class TimeExtractor:
2 | def __init__(self):
3 | self.time_pattern = DATETIME_PATTERN
4 |
5 | def extractor(self, element: HtmlElement):
6 | text = ''.join(element.xpath('.//text()'))
7 | for dt in self.time_pattern:
8 | dt_obj = re.search(dt, text)
9 | if dt_obj:
10 | return dt_obj.group(1)
11 | else:
12 | return ''
--------------------------------------------------------------------------------
/第5章/5-9.code:
--------------------------------------------------------------------------------
1 | class AuthorExtractor:
2 | def __init__(self):
3 | self.author_pattern = AUTHOR_PATTERN
4 |
5 | def extractor(self, element: HtmlElement):
6 | text = ''.join(element.xpath('.//text()'))
7 | for pattern in self.author_pattern:
8 | author_obj = re.search(pattern, text)
9 | if author_obj:
10 | return author_obj.group(1)
11 | return ''
--------------------------------------------------------------------------------
/第2章/2-3.py:
--------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.action_chains import ActionChains
4 |
5 |
6 | with webdriver.Chrome() as driver:
7 | # 访问指定网址
8 | driver.get("https://www.phei.com.cn/module/goods/wssd_index.jsp")
9 | # 定位版权信息
10 | footer = driver.find_element_by_class_name("web_book_footer")
11 | # 移动到指定位置
12 | ActionChains(driver).move_to_element(footer).perform()
13 | time.sleep(10)
--------------------------------------------------------------------------------
/第1章/1-4.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import asyncio
3 |
4 |
5 | async def wait():
6 | await asyncio.sleep(5)
7 | print("等我 5 秒钟")
8 |
9 |
10 | async def print_time(word):
11 | print(word, datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
12 |
13 |
14 | async def main():
15 | await print_time("开始")
16 | await wait()
17 | await print_time("结束")
18 |
19 | loop = asyncio.get_event_loop()
20 | loop.run_until_complete(main())
21 | loop.close()
--------------------------------------------------------------------------------
/第1章/1-2.py:
--------------------------------------------------------------------------------
1 | import re
2 | import parsel
3 | import requests
4 |
5 |
6 | url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml"
7 | req = requests.get(url)
8 | text = req.content.decode("utf8")
9 | title = re.search("(.*)", text).group(1)
10 | sel = parsel.Selector(text)
11 | content = "\n".join(sel.css(".column_content_inner p font::text").extract())
12 | with open("about.txt", "a") as file:
13 | file.write(title)
14 | file.write("\n")
15 | file.write(content)
--------------------------------------------------------------------------------
/第4章/4-11.py:
--------------------------------------------------------------------------------
1 | import pika
2 |
3 |
4 | def callback(ch, method, properties, body):
5 | print(" [x] Received %r" % body)
6 |
7 |
8 | auth = pika.PlainCredentials("books", "spider")
9 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth))
10 | channel = connection.channel()
11 | channel.basic_consume(
12 | queue='message_box', on_message_callback=callback, auto_ack=True)
13 |
14 | print(' [*] Waiting for messages. To exit press CTRL+C')
15 | channel.start_consuming()
--------------------------------------------------------------------------------
/第4章/4-10.py:
--------------------------------------------------------------------------------
1 | import pika
2 |
3 |
4 | auth = pika.PlainCredentials("books", "spider")
5 | connection = pika.BlockingConnection(pika.ConnectionParameters('148.70.6*.5*', 5672, "/", auth))
6 |
7 | channel = connection.channel()
8 | channel.queue_declare(queue='message_box')
9 | for i in range(5):
10 | channel.basic_publish(exchange='',
11 | routing_key='message_box',
12 | body='Hello World-{}'.format(i))
13 | print(" [x] Sent 'Hello World-{}'".format(i))
14 | connection.close()
--------------------------------------------------------------------------------
/第1章/1-1.py:
--------------------------------------------------------------------------------
1 | import re
2 | import parsel
3 | from urllib import request
4 |
5 |
6 | url = "https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml"
7 | with request.urlopen(url) as req:
8 | text = req.read().decode("utf8")
9 | title = re.search("(.*)", text).group(1)
10 | sel = parsel.Selector(text)
11 | content = "\n".join(sel.css(".column_content_inner p font::text").extract())
12 | with open("about.txt", "a") as file:
13 | file.write(title)
14 | file.write("\n")
15 | file.write(content)
--------------------------------------------------------------------------------
/.idea/spiderbook.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/第1章/1-6.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | 新溪大桥早高峰报道:堵成一窝蜂
4 | 是否让白沙大桥帮助每小时前进300米的新溪大桥分流呢
5 |
6 |
7 | 发布者:今日新闻|发布时间:2020-1-29
8 |
9 |
10 | 新溪大桥于 2018 年 6 月正式启用通车……
11 | ……
12 | ……
13 | ……
14 | 记者:王大力、陈小七(实习)
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/第3章/3-2.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | # 创建一个集合,作为增量池
4 | after = set()
5 | # 设定 URL 列表
6 | urls = ["http://www.example.com?page=1&id=2r9l74hjng",
7 | "http://www.example.com?page=1&id=9kiujamzj6",
8 | "http://www.example.com?page=1&id=77274jnasf",
9 | "http://www.example.com?page=1&id=9kiujamzj6"
10 | ]
11 | # 循环 URL 列表
12 | for url in urls:
13 | # 条件判断
14 | if url not in after:
15 | # 如果 URL 不在增量池中则向目标网页发出请求
16 | resp = requests.get(url)
17 | # 发出请求后,将 URL 添加到增量池
18 | after.add(url)
19 | else:
20 | # 不作处理
21 | pass
22 | print(len(after), after)
--------------------------------------------------------------------------------
/第6章/6-4.code:
--------------------------------------------------------------------------------
1 | def _spawn_process(self, message, slot):
2 | msg = native_stringify_dict(message, keys_only=False)
3 | project = msg['_project']
4 | args = [sys.executable, '-m', self.runner, 'crawl']
5 | args += get_crawl_args(msg)
6 | e = self.app.getComponent(IEnvironment)
7 | env = e.get_environment(msg, slot)
8 | env = native_stringify_dict(env, keys_only=False)
9 | pp = ScrapyProcessProtocol(slot, project, msg['_spider'], \
10 | msg['_job'], env)
11 | pp.deferred.addBoth(self._process_finished, slot)
12 | reactor.spawnProcess(pp, sys.executable, args=args, env=env)
13 | self.processes[slot] = pp
--------------------------------------------------------------------------------
/第6章/6-2.code:
--------------------------------------------------------------------------------
1 | class AddVersion(WsResource):
2 |
3 | def render_POST(self, txrequest):
4 | eggf = BytesIO(txrequest.args.pop(b'egg')[0])
5 | args = native_stringify_dict(copy(txrequest.args), keys_only=False)
6 | project = args['project'][0]
7 | version = args['version'][0]
8 | self.root.eggstorage.put(eggf, project, version)
9 | spiders = get_spider_list(project, version=version)
10 | self.root.update_projects()
11 | UtilsCache.invalid_cache(project)
12 | return {"node_name": self.root.nodename, "status": "ok", "project": project, "version": version, \
13 | "spiders": len(spiders)}
--------------------------------------------------------------------------------
/第3章/3-4.py:
--------------------------------------------------------------------------------
1 | import time
2 | import string
3 | import random
4 | import pymongo
5 |
6 |
7 | # 连接 MongoDB
8 | client = pymongo.MongoClient("localhost", 27017)
9 | # 使用 test 数据库
10 | db = client.test
11 |
12 |
13 | for i in range(500000):
14 | base_url = "http://www.******.com"
15 | # 生成 6 位的随机小写字母组合
16 | article = ''.join(random.choices(string.ascii_lowercase, k=6))
17 | # 生成时间戳
18 | timestamp = int(time.time())
19 | # 生成 sign 参数
20 | sign = article + str(timestamp * 3)
21 | # 拼接成常见的 URL
22 | url = "%s?page=1&article=%s&sign=%s&times=%s" % (base_url, article, sign, timestamp)
23 | # 往 mongodb 集合中插入数据
24 | db.sfhfpc.insert_one({article: url})
--------------------------------------------------------------------------------
/第2章/2-1.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.common.keys import Keys
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support.expected_conditions import presence_of_element_located
6 |
7 | #This example requires Selenium WebDriver 3.13 or newer
8 | with webdriver.Firefox() as driver:
9 | wait = WebDriverWait(driver, 10)
10 | driver.get("https://***.com")
11 | driver.find_element_by_name("q").send_keys("cheese" + Keys.RETURN)
12 | first_result = wait.until(presence_of_element_located((By.CSS_SELECTOR, "h3>div")))
13 | print(first_result.get_attribute("textContent"))
--------------------------------------------------------------------------------
/第2章/2-2.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 | with webdriver.Chrome() as driver:
4 | # 访问指定网址
5 | driver.get("https://www.phei.com.cn/module/goods/wssd_index.jsp")
6 | # 定位图书列表
7 | lis = driver.find_elements_by_css_selector("#book_sort_area > ul:nth-child(1) > li")
8 | # 循环图书列表并从中提取图书信息
9 | for i in lis:
10 | image = i.find_element_by_css_selector("p > a > img").get_attribute("src")
11 | book = i.find_element_by_css_selector("p.li_title > a").text
12 | author = i.find_element_by_css_selector("p.li_author").text.split("\n")[0]
13 | price = i.find_element_by_css_selector("p.li_author > i").text
14 | print([book, price, author, image])
--------------------------------------------------------------------------------
/第4章/4-1.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | # 假设几个爬取目标的 URL
5 | url1 = "http://example.com?x=1"
6 | url2 = "http://example.com?x=2"
7 | url3 = "http://example.com?x=3"
8 |
9 | # 初始化待爬队列 before 和已爬队列 after
10 | before = set()
11 | after = set()
12 |
13 | # 模拟爬虫程序将 URL 存储到待爬队列
14 | before.add(url1)
15 | before.add(url2)
16 | before.add(url3)
17 |
18 | # 打印队列长度
19 | print("未向目标 URL 发出请求时,待爬队列的长度为 %s,已爬队列的长度为 %s" % (len(before), len(after)))
20 |
21 | while len(before):
22 | # 模拟爬虫程序从待爬队列中取出 URL
23 | request_url = before.pop()
24 | # 模拟爬虫程序请求 URL
25 | resp = requests.get(request_url)
26 | # 模拟爬虫程序将 URL 放入已爬队列
27 | after.add(request_url)
28 |
29 | # 打印队列长度
30 | print("完成请求后,待爬队列的长度为 %s,已爬队列的长度为 %s" % (len(before), len(after)))
--------------------------------------------------------------------------------
/第6章/6-11.code:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class Monitor(ABC):
5 | """异常监控器"""
6 |
7 | @abstractmethod
8 | def push(self):
9 | """接收器
10 | 被捕获到的异常信息将会送到这里"""
11 |
12 | @abstractmethod
13 | def extractor(self):
14 | """拆分车间
15 | 根据需求拆分异常信息"""
16 |
17 | @abstractmethod
18 | def recombination(self):
19 | """重组车间
20 | 异常信息将在这里重组"""
21 |
22 |
23 | class Alarm(ABC):
24 | """警报器"""
25 |
26 | @abstractmethod
27 | def __init__(self):
28 | """初始配置"""
29 |
30 | def receive(self):
31 | """接收者
32 | 接收异常信息,将其进行处理后交给发送者"""
33 |
34 | @abstractmethod
35 | def sender(self):
36 | """发送者
37 | 将重组后的信息发送到端"""
--------------------------------------------------------------------------------
/第3章/3-1.code:
--------------------------------------------------------------------------------
1 | # 伪代码
2 | import requests
3 |
4 | # 向目标网页发出请求,假设页面 id 为 3376
5 | now_html_data = requests.get("http://example.com/article=3376")
6 | # 解析页面
7 | data = parse(now_html_data)
8 | # 抽取页面内容
9 | article = 3376
10 | title = data.title
11 | salay = data.salay
12 | place = data.place
13 | edu = data.edu
14 | # 从数据库中查询与页面 id 相同的数据
15 | mysql_data = query("select * from info where aid=3376")
16 | # 判断,如果页面内容与数据库存储的数据不同,则更新数据库
17 | if any([article != mysql_data.id, title != mysql_data.title,
18 | salay != mysql_data.salay, place != mysql_data.place,
19 | edu != mysql_data.edu
20 | ]):
21 | # 更新数据库
22 | query("update info set title=%s, salay=%s, place=%s, edu=%s where aid=3376"
23 | % (title, salay, place, edu))
24 | else:
25 | # 如果页面内容与数据库存储的数据相同,则不作处理
26 | pass
--------------------------------------------------------------------------------
/第3章/3-5.code:
--------------------------------------------------------------------------------
1 | >>> import pybloom_live
2 | >>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
3 | >>> [f.add(x) for x in range(10)]
4 | [False, False, False, False, False, False, False, False, False, False]
5 | >>> all([(x in f) for x in range(10)])
6 | True
7 | >>> 10 in f
8 | False
9 | >>> 5 in f
10 | True
11 | >>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
12 | >>> for i in range(0, f.capacity):
13 | ... _ = f.add(i)
14 | >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
15 | True
16 |
17 | >>> sbf = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH)
18 | >>> count = 10000
19 | >>> for i in range(0, count):
20 | ...     _ = sbf.add(i)
21 |
22 | >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
23 | True
--------------------------------------------------------------------------------
/第5章/5-4.code:
--------------------------------------------------------------------------------
1 | def calc_text_density(self, element):
2 | """
3 | 根据公式:
4 |
5 |         Ti - LTi
6 | TDi = -----------
7 |        TGi - LTGi
8 |
9 |
10 | Ti:节点 i 的字符串字数
11 | LTi:节点 i 的带链接的字符串字数
12 | TGi:节点 i 的标签数
13 | LTGi:节点 i 的带连接的标签数
14 |
15 |
16 | :return:
17 | """
18 | ti_text = '\n'.join(self.get_all_text_of_element(element))
19 | ti = len(ti_text)
20 | lti = len(''.join(self.get_all_text_of_element(element.xpath('.//a'))))
21 | tgi = len(element.xpath('.//*'))
22 | ltgi = len(element.xpath('.//a'))
23 | if (tgi - ltgi) == 0:
24 | return {'density': 0, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}
25 | density = (ti - lti) / (tgi - ltgi)
26 | return {'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}
--------------------------------------------------------------------------------
/第1章/1-5.py:
--------------------------------------------------------------------------------
1 | import re
2 | import aiohttp
3 | import asyncio
4 | import parsel
5 |
6 |
7 | async def fetch(session, url):
8 | async with session.get(url) as response:
9 | return await response.text()
10 |
11 |
12 | async def main():
13 | async with aiohttp.ClientSession() as session:
14 | html = await fetch(session, 'https://www.phei.com.cn/gywm/cbsjj/2010-11-19/47.shtml')
15 | title = re.search("(.*)", html).group(1)
16 | sel = parsel.Selector(html)
17 | content = "\n".join(sel.css(".column_content_inner p font::text").extract())
18 | with open("about.txt", "a") as file:
19 | file.write(title)
20 | file.write("\n")
21 | file.write(content)
22 |
23 | if __name__ == '__main__':
24 | loop = asyncio.get_event_loop()
25 | loop.run_until_complete(main())
--------------------------------------------------------------------------------
/第4章/4-14.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | import parsel
4 | from urllib.parse import urljoin
5 | from common import channel, queue
6 | from common import detail
7 |
8 |
9 | def callback(ch, method, properties, body):
10 | url = str(body, "utf8")
11 | print(url)
12 | resp = requests.get(url)
13 | sel = parsel.Selector(resp.content.decode("utf8"))
14 | the_time = sel.css(".news_date::text").extract_first()
15 | pub_time = re.search("(\d+-\d+-\d+)", the_time).group(1)
16 | # 为保持文章排版和样式,保留标签
17 | contents = sel.css(".news_content p").extract()
18 | content = "\n".join(contents)
19 | # 将文章数据存入 MongoDB
20 | detail.insert_one({"pubTime": pub_time, "url": url, "content": content})
21 |
22 |
23 | channel.basic_consume(
24 | queue=queue, on_message_callback=callback, auto_ack=True)
25 |
26 | channel.start_consuming()
--------------------------------------------------------------------------------
/第4章/4-7.code:
--------------------------------------------------------------------------------
1 | def request_fingerprint(request, include_headers=None):
2 | if include_headers:
3 | include_headers = tuple(to_bytes(h.lower())
4 | for h in sorted(include_headers))
5 | cache = _fingerprint_cache.setdefault(request, {})
6 | if include_headers not in cache:
7 | fp = hashlib.sha1()
8 | fp.update(to_bytes(request.method))
9 | fp.update(to_bytes(canonicalize_url(request.url)))
10 | fp.update(request.body or b'')
11 | if include_headers:
12 | for hdr in include_headers:
13 | if hdr in request.headers:
14 | fp.update(hdr)
15 | for v in request.headers.getlist(hdr):
16 | fp.update(v)
17 | cache[include_headers] = fp.hexdigest()
18 | return cache[include_headers]
--------------------------------------------------------------------------------
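
A quick way to see what the `canonicalize_url` step buys: two requests whose URLs differ only in query-parameter order hash to the same fingerprint. A small sketch using Scrapy's public helper (present in Scrapy 1.x/2.x, though newer releases may flag it as deprecated):

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint

    # Same parameters, different order: canonicalization sorts them before hashing.
    r1 = Request("http://example.com/?a=1&b=2")
    r2 = Request("http://example.com/?b=2&a=1")
    print(request_fingerprint(r1) == request_fingerprint(r2))  # True
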
/第6章/6-1.code:
--------------------------------------------------------------------------------
1 | class Schedule(WsResource):
2 |
3 | def render_POST(self, txrequest):
4 | args = native_stringify_dict(copy(txrequest.args), keys_only=False)
5 | settings = args.pop('setting', [])
6 | settings = dict(x.split('=', 1) for x in settings)
7 | args = dict((k, v[0]) for k, v in args.items())
8 | project = args.pop('project')
9 | spider = args.pop('spider')
10 | version = args.get('_version', '')
11 | spiders = get_spider_list(project, version=version)
12 | if not spider in spiders:
13 | return {"status": "error", "message": "spider '%s' not found" % spider}
14 | args['settings'] = settings
15 | jobid = args.pop('jobid', uuid.uuid1().hex)
16 | args['_job'] = jobid
17 | self.root.scheduler.schedule(project, spider, **args)
18 | return {"node_name": self.root.nodename, "status": "ok", "jobid": jobid}
--------------------------------------------------------------------------------
/第4章/4-13.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import parsel
3 | from urllib.parse import urljoin
4 | from common import channel, queue
5 |
6 |
7 | urls = ["https://www.phei.com.cn/xwxx/index_{}.shtml".format(i) for i in range(1, 46)]
8 | urls.append("https://www.phei.com.cn/xwxx/index.shtml")
9 |
10 | for url in urls:
11 | # 翻页爬取
12 | resp = requests.get(url)
13 | sel = parsel.Selector(resp.content.decode("utf8"))
14 | li = sel.css(".web_news_list ul li.li_b60")
15 | for news in li:
16 | link = news.css("a:first-child::attr('href')").extract_first()
17 | full_link = urljoin(url, link) # 拼接完整 URL
18 | # 将新闻资讯详情页 URL 发布到 RabbitMQ 队列
19 | channel.queue_declare(queue=queue)
20 | channel.basic_publish(exchange='',
21 | routing_key=queue,
22 | body='{}'.format(full_link))
23 | print("[x] Sent '{}'".format(urljoin(url, link)))
--------------------------------------------------------------------------------
/第4章/4-9.command:
--------------------------------------------------------------------------------
1 | # 步骤 1,更改 settings.py 中的配置
2 | # 设置调度器
3 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
4 | # 设置去重器
5 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
6 | # 更改管道器
7 | ITEM_PIPELINES = {
8 | 'scrapy_redis.pipelines.RedisPipeline': 300
9 | }
10 | # 设置队列
11 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
12 | # 设置 Redis 连接参数,其中包括用户名、密码、地址和端口号
13 | REDIS_HOST = 'localhost'
14 | REDIS_URL = 'redis://user:pass@hostname:9001'
15 |
16 | # 步骤 2,在终端执行
17 | $ scrapy startproject Example
18 | $ cd Example
19 | $ scrapy genspider example example.com
20 |
21 | # 步骤 3
22 | from scrapy_redis.spiders import RedisSpider
23 | class ExampleSpider(RedisSpider):
24 | name = 'example'
25 | allowed_domains = ['example.com']
26 | def parse(self, response):
27 | # do stuff
28 | pass
29 |
30 | # 步骤 4,在终端执行
31 | $ scrapy runspider example.py
32 |
33 | # 步骤 5,在 Redis-Client 执行
34 | > lpush example:start_urls http://example.com
--------------------------------------------------------------------------------
/第5章/5-2.code:
--------------------------------------------------------------------------------
1 | from .utils import pre_parse, remove_noise_node
2 | from gne.extractor import ContentExtractor, TitleExtractor, TimeExtractor, AuthorExtractor
3 |
4 |
5 | class GeneralNewsExtractor:
6 | def extract(self, html, title_xpath='', noise_node_list=None, with_body_html=False):
7 | element = pre_parse(html)
8 | remove_noise_node(element, noise_node_list)
9 | content = ContentExtractor().extract(element, with_body_html)
10 | title = TitleExtractor().extract(element, title_xpath=title_xpath)
11 | publish_time = TimeExtractor().extractor(element)
12 | author = AuthorExtractor().extractor(element)
13 | result = {'title': title,
14 | 'author': author,
15 | 'publish_time': publish_time,
16 | 'content': content[0][1]['text'],
17 | 'images': content[0][1]['images']}
18 | if with_body_html:
19 | result['body_html'] = content[0][1]['body_html']
20 | return result
--------------------------------------------------------------------------------
/第4章/4-5.command:
--------------------------------------------------------------------------------
1 | # MongoDB
2 | # 为集合 WaitCrawl 中的 url 创建 unique 约束
3 | > db.WaitCrawl.ensureIndex({"url": 1}, {"unique": true});
4 | {
5 | "createdCollectionAutomatically" : true,
6 | "numIndexesBefore" : 1,
7 | "numIndexesAfter" : 2,
8 | "ok" : 1
9 | }
10 | # 插入第 1 条数据
11 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com"});
12 | WriteResult({ "nInserted" : 1 })
13 | # 插入第 2 条数据
14 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com"});
15 | WriteResult({
16 | "nInserted" : 0,
17 | "writeError" : {
18 | "code" : 11000,
19 | "errmsg" : "E11000 duplicate key error collection: WaitCrawl.WaitCrawl index: url_1 dup key: { : \"http://exam.com\" }"
20 | }
21 | })
22 | # 插入第 3 条数据
23 | > db.WaitCrawl.insert({"name": "exam", "url": "http://exam.com2"});
24 | WriteResult({ "nInserted" : 1 })
25 | # 查看集合 WaitCrawl 中的文档
26 | > db.WaitCrawl.find();
27 | { "_id" : ObjectId("5dc3cd3cba05dc8f5eeac929"), "name" : "exam", "url" : "http://exam.com" }
28 | { "_id" : ObjectId("5dc3cdb7ba05dc8f5eeac92b"), "name" : "exam", "url" : "http://exam.com2" }
--------------------------------------------------------------------------------
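
Seen from Python, the unique index turns the E11000 error into a `DuplicateKeyError`, which a crawler can simply swallow; a minimal pymongo sketch assuming a local MongoDB (the database name is a placeholder, and `create_index` is the current spelling of the `ensureIndex` call above):

    from pymongo import MongoClient
    from pymongo.errors import DuplicateKeyError

    collection = MongoClient("localhost", 27017).test.WaitCrawl
    collection.create_index("url", unique=True)
    for url in ["http://exam.com", "http://exam.com", "http://exam.com2"]:
        try:
            collection.insert_one({"name": "exam", "url": url})
        except DuplicateKeyError:
            # E11000 duplicate key: this URL was inserted before, skip it.
            pass
    print(collection.count_documents({}))  # 2
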
/第1章/1-9.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import parsel
3 | from urllib.parse import urljoin
4 | from pymongo import MongoClient
5 |
6 | # 连接数据库并指定数据库和集合
7 | client = MongoClient('localhost', 27017)
8 | db = client.news
9 | collection = db.phei
10 |
11 |
12 | urls = ["https://www.phei.com.cn/xwxx/index_{}.shtml".format(i) for i in range(1, 46)]
13 | urls.append("https://www.phei.com.cn/xwxx/index.shtml")
14 |
15 | for url in urls:
16 | # 翻页爬取
17 | resp = requests.get(url)
18 | sel = parsel.Selector(resp.content.decode("utf8"))
19 | li = sel.css(".web_news_list ul li.li_b60")
20 | for news in li:
21 | # 从单页中提取资讯信息
22 | title = news.css("p.li_news_title::text").extract_first()
23 | pub_time = news.css("span::text").extract_first()
24 | desc = news.css("p.li_news_summary::text").extract_first()
25 | image = news.css("div.li_news_line img::attr('src')").extract_first()
26 | full_image = urljoin(url, image) # 完整图片链接
27 | # 将数据存入 MongoDB 数据库中
28 | collection.insert_one({"title": title, "pubTime": pub_time,
29 | "image": full_image, "desc": desc})
--------------------------------------------------------------------------------
/第6章/6-7.code:
--------------------------------------------------------------------------------
1 | class LoginHandler(MethodView):
2 |
3 | def post(self):
4 | username = request.json.get("username")
5 | pwd = request.json.get("password")
6 | password = md5_encode(pwd)
7 | # 支持用户名或邮箱登录
8 | query = {"username": username, "password": password}
9 | name_exit = databases.user.count_documents(query)
10 | # 校验用户是否存在
11 | if not name_exit:
12 | query = {"email": username, "password": password}
13 | result = databases.user.find_one(query)
14 | if not result:
15 | return {"message": StatusCode.NotFound.value[0],
16 | "data": {},
17 | "code": StatusCode.NotFound.value[1]
18 | }, 400
19 | # 校验用户状态
20 | status = result.get("status")
21 | if not status:
22 | return {"message": StatusCode.UserStatusOff.value[0],
23 | "data": {},
24 | "code": StatusCode.UserStatusOff.value[1]
25 | }, 400
26 | # 返回登录结果
27 | return {"message": "success",
28 | "data": {"username": username},
29 | "code": 200}
--------------------------------------------------------------------------------
/第2章/2-5.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import re
3 | from pyppeteer import launch
4 |
5 | async def main():
6 | browser = await launch()
7 | page = await browser.newPage()
8 | await page.goto('https://www.phei.com.cn/module/goods/wssd_index.jsp')
9 | lis = await page.querySelectorAll("#book_sort_area ul:nth-child(1) li")
10 | for i in lis:
11 | image_element = await i.querySelector("p a img")
12 | image = await (await image_element.getProperty("src")).jsonValue()
13 | book_element = await i.querySelector("p.li_title a")
14 | book = await (await book_element.getProperty("textContent")).jsonValue()
15 | author_price_element = await i.querySelector("p.li_author")
16 | author_price = await (await author_price_element.getProperty("textContent")).jsonValue()
17 | try:
18 | author = re.search("作译者:(.*)定价", str(author_price)).group(1)
19 | price = re.search(r"(\d+.\d+)", str(author_price)).group(1)
20 | except Exception as exc:
21 | author, price = "", ""
22 | print(exc)
23 | print([book, price, author, image])
24 | await browser.close()
25 |
26 | asyncio.get_event_loop().run_until_complete(main())
--------------------------------------------------------------------------------
/第6章/6-10.code:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from importlib import import_module
4 |
5 | from component.storage import FileStorages
6 |
7 |
8 | storages = FileStorages()
9 |
10 |
11 | class Helmsman:
12 | """为文件导入和执行创造条件的上下文管理器"""
13 | def __init__(self, project, version):
14 | self.project = project
15 | self.version = version
16 | self.storage = storages
17 | self.temp_file = ""
18 |
19 | def __enter__(self):
20 | """上文"""
21 | # 将文件拷贝到临时区
22 | target = self.storage.copy_to_temporary(self.project, self.version)
23 | self.temp_file = target
24 | if target:
25 | # 将文件路径添加到 sys.path
26 | sys.path.insert(0, target)
27 |
28 | def __exit__(self, exc_type, exc_val, exc_tb):
29 | """下文"""
30 | if os.path.exists(self.temp_file):
31 | # 清理临时区中对应的文件
32 | os.remove(self.temp_file)
33 |
34 |
35 | def main(project, version):
36 | helmsman = Helmsman(project, version)
37 | with helmsman:
38 | # 从指定的文件中导入模块并调用指定方法
39 | spider = import_module("sail")
40 | spider.main()
41 |
42 |
43 | if __name__ == "__main__":
44 | project, version = sys.argv[-2], sys.argv[-1]
45 | main(project, version)
--------------------------------------------------------------------------------
/第3章/3-3.py:
--------------------------------------------------------------------------------
1 | import time
2 | import string
3 | import random
4 | import asyncio
5 | import aiomysql
6 |
7 |
8 | async def test_example_execute(loop):
9 | # 填写参数,以连接数据库
10 | conn = await aiomysql.connect(host='127.0.0.1', port=3306,
11 | user='root', password='******',
12 | db='football', loop=loop,
13 | autocommit=True)
14 | async with conn.cursor() as cur:
15 | # 循环 50 万次
16 | for i in range(500000):
17 | base_url = "http://www.******.com"
18 | # 生成 6 位的随机小写字母组合
19 | article = ''.join(random.choices(string.ascii_lowercase, k=6))
20 | # 生成时间戳
21 | timestamp = int(time.time())
22 | # 生成 sign 参数
23 | sign = article + str(timestamp * 3)
24 | # 拼接成常见的 URL
25 | url = "%s?page=1&article=%s&sign=%s&times=%s" % (base_url, article, sign, timestamp)
26 | # SQL 语句
27 | query = "INSERT INTO player(url) VALUES ('%s');" % url
28 | # 执行指定的 SQL 语句
29 | await cur.execute(query)
30 | conn.close()
31 |
32 |
33 | loop = asyncio.get_event_loop()
34 | loop.run_until_complete(test_example_execute(loop))
--------------------------------------------------------------------------------
/第6章/6-5.code:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import shutil
4 | import tempfile
5 | from contextlib import contextmanager
6 |
7 | from scrapyd import get_application
8 | from scrapyd.interfaces import IEggStorage
9 | from scrapyd.eggutils import activate_egg
10 |
11 | @contextmanager
12 | def project_environment(project):
13 | app = get_application()
14 | eggstorage = app.getComponent(IEggStorage)
15 | eggversion = os.environ.get('SCRAPY_EGG_VERSION', None)
16 | version, eggfile = eggstorage.get(project, eggversion)
17 | if eggfile:
18 | prefix = '%s-%s-' % (project, version)
19 | fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
20 | lf = os.fdopen(fd, 'wb')
21 | shutil.copyfileobj(eggfile, lf)
22 | lf.close()
23 | activate_egg(eggpath)
24 | else:
25 | eggpath = None
26 | try:
27 | assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded"
28 | yield
29 | finally:
30 | if eggpath:
31 | os.remove(eggpath)
32 |
33 | def main():
34 | project = os.environ['SCRAPY_PROJECT']
35 | with project_environment(project):
36 | from scrapy.cmdline import execute
37 | execute()
38 |
39 | if __name__ == '__main__':
40 | main()
--------------------------------------------------------------------------------
/第5章/5-3.code:
--------------------------------------------------------------------------------
1 | def extract(self, selector, with_body_html=False):
2 | body = selector.xpath('//body')[0]
3 | for node in iter_node(body):
4 | node_hash = hash(node)
5 | density_info = self.calc_text_density(node)
6 | text_density = density_info['density']
7 | ti_text = density_info['ti_text']
8 | text_tag_count = self.count_text_tag(node, tag='p')
9 | sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
10 | images_list = node.xpath('.//img/@src')
11 | node_info = {'ti': density_info['ti'],
12 | 'lti': density_info['lti'],
13 | 'tgi': density_info['tgi'],
14 | 'ltgi': density_info['ltgi'],
15 | 'node': node,
16 | 'density': text_density,
17 | 'text': ti_text,
18 | 'images': images_list,
19 | 'text_tag_count': text_tag_count,
20 | 'sbdi': sbdi}
21 | if with_body_html:
22 | body_source_code = unescape(etree.tostring(node).decode())
23 | node_info['body_html'] = body_source_code
24 | self.node_info[node_hash] = node_info
25 | std = self.calc_standard_deviation()
26 | self.calc_new_score(std)
27 | result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
28 | return result
--------------------------------------------------------------------------------
/第6章/6-6.code:
--------------------------------------------------------------------------------
1 | class RegisterHandler(MethodView):
2 |
3 | def post(self):
4 | username = request.json.get("username")
5 | pwd = request.json.get("password")
6 | nick = request.json.get("nick")
7 | email = request.json.get("email")
8 | if not username or not pwd or not nick or not email or "@" not in email:
9 | return {"message": StatusCode.ParameterError.value[0],
10 | "data": {},
11 | "code": StatusCode.ParameterError.value[1]
12 | }, 400
13 | password = md5_encode(pwd)
14 | count = databases.user.count_documents({})
15 | if not count:
16 | # 首次注册的账户为超级管理员,启动激活
17 | role = Role.SuperUser.value
18 | message = {"username": username, "password": password,
19 | "nick": nick, "email": email,
20 | "role": role, "status": Status.On.value}
21 | else:
22 | # 非首次注册账户默认为开发者,且未激活
23 | role = Role.Developer.value
24 | message = {"username": username, "password": password,
25 | "nick": nick, "email": email,
26 | "role": role, "status": Status.Off.value}
27 | message["create"] = datetime.now()
28 | # 将信息写入数据库并将相应信息返回给用户
29 | inserted = databases.user.insert_one(message).inserted_id
30 | message["id"] = str(inserted)
31 | message["username"] = username
32 | message["email"] = email
33 | message["role"] = role
34 | message.pop("_id")
35 | return {"message": "success", "data": message, "code": 201}, 201
--------------------------------------------------------------------------------
/第6章/6-9.code:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 | from flask.views import MethodView
4 | from flask import request
5 |
6 | from component.enums import StatusCode
7 | from component.storage import FileStorages
8 | from connect import databases
9 |
10 |
11 | storages = FileStorages()
12 |
13 |
14 | class DeployHandler(MethodView):
15 |
16 | def post(self):
17 | """项目部署接口"""
18 | project = request.form.get('project')
19 | remark = request.form.get('remark')
20 | file = request.files.get('file')
21 | if not project or not file:
22 | # 确保参数和值存在
23 | return {"message": StatusCode.MissingParameter.value[0],
24 | "data": {},
25 | "code": StatusCode.MissingParameter.value[1]
26 | }, 400
27 | filename = file.filename
28 | if not filename.endswith('.egg'):
29 | # 确保文件类型正确
30 | return {"message": StatusCode.NotFound.value[0],
31 | "data": {},
32 | "code": StatusCode.NotFound.value[1]
33 | }, 400
34 | version = int(time.time())
35 | content = file.stream.read()
36 | # 将文件存储到服务端
37 | result = storages.put(project, version, content)
38 | if not result:
39 | # 存储失败则返回相关提示
40 | return {"message": StatusCode.OperationError.value[0],
41 | "data": {},
42 | "code": StatusCode.OperationError.value[1]
43 | }, 400
44 | message = {"project": project,
45 | "version": str(version),
46 | "remark": remark or "Nothing",
47 | "create": datetime.now()}
48 | databases.deploy.insert_one(message).inserted_id
49 | message["_id"] = str(message.pop("_id"))
50 | return {"message": "success",
51 | "data": message,
52 | "code": 201}, 201
--------------------------------------------------------------------------------
/第6章/6-8.code:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import shutil
4 | from settings import FILEPATH, TEMPATH
5 |
6 |
7 | class FileStorages:
8 |
9 | @staticmethod
10 | def put(project, version, content):
11 | """文件存储
12 | """
13 | # 根据项目名称生成路径
14 | room = os.path.join(FILEPATH, project)
15 | if not os.path.exists(room):
16 | # 如果目录不存在则创建
17 | os.makedirs(room)
18 | # 拼接文件完整路径,以时间戳作为文件名
19 | filename = os.path.join(room, "%s.egg" % str(version))
20 | try:
21 | with open(filename, 'wb') as file:
22 | # 写入文件
23 | file.write(content)
24 | except Exception as exc:
25 | # 异常处理,打印异常信息
26 | logging.warning(exc)
27 | return False
28 | return True
29 |
30 | def get(self):
31 | pass
32 |
33 | @staticmethod
34 | def delete(project, version):
35 | """文件删除状态
36 | A - 文件或目录存在且成功删除
37 | B - 文件或目录不存在,无需删除
38 | """
39 | sign = 'B'
40 | room = os.path.join(FILEPATH, project)
41 | if project and version:
42 | # 删除指定文件
43 | filename = os.path.join(room, "%s.egg" % str(version))
44 | if os.path.exists(filename):
45 | sign = 'A'
46 | os.remove(filename)
47 | if project and not version:
48 | # 删除指定目录
49 | if os.path.exists(room):
50 | sign = 'A'
51 | shutil.rmtree(room)
52 | return sign
53 |
54 | @staticmethod
55 | def copy_to_temporary(project, version):
56 | """根据参数将指定文件拷贝到指定目录
57 | """
58 | before = os.path.join(FILEPATH, project, "%s.egg" % version)
59 | after = os.path.join(TEMPATH, "%s.egg" % version)
60 | if not os.path.exists(before):
61 | logging.warning("File %s Not Exists" % before)
62 | return None
63 | if not os.path.exists(TEMPATH):
64 | os.makedirs(TEMPATH)
65 | # 文件拷贝
66 | shutil.copyfile(before, after)
67 | return after
68 |
69 | @staticmethod
70 | def exists(project, version):
71 | """检查指定项目名称和版本号的文件是否存在"""
72 | file = os.path.join(FILEPATH, project, "%s.egg" % version)
73 | if not os.path.exists(file):
74 | return False
75 | return True
--------------------------------------------------------------------------------
/第6章/6-13.code:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import hmac
3 | import time
4 | import base64
5 | import json
6 | import logging
7 | from urllib.parse import quote_plus
8 | import requests
9 | from interface import Alarm
10 |
11 | from supervise.monitors import MarkdownMonitor
12 |
13 |
14 | class DingAlarm(Alarm):
15 |
16 | def __init__(self):
17 | self.access_key = "xxx"
18 | self.secret = "GQSxx"
19 | self.token = "https://oapi.dingtalk.com/robot/send?access_token=xxx"
20 | self.header = {"Content-Type": "application/json;charset=UTF-8"}
21 | self.monitor = MarkdownMonitor()
22 |
23 | def receive(self, txt, occurrence, timer):
24 | """接收者
25 | 接收异常信息,将其进行处理后交给发送者"""
26 | content = self.monitor.push(txt, occurrence, timer)
27 | self.sender(content)
28 |
29 | @staticmethod
30 | def _sign(timestamps, secret, mode=False):
31 | """钉钉签名计算
32 | 根据钉钉文档指引计算签名信息
33 | 文档参考
34 | https://docs.python.org/3.6/library/hmac.html
35 | https://docs.python.org/3.6/library/urllib.parse.html#urllib.parse.quote
36 | https://ding-doc.dingtalk.com/doc#/faquestions/hxs5v9
37 | """
38 | if not isinstance(timestamps, str):
39 | # 如果钉钉机器人的安全措施为密钥,那么按照文档指引传入的是字符串,反之为数字
40 | # 加密时需要转成字节,所以这里要确保时间戳为字符串
41 | timestamps = str(timestamps)
42 | mav = hmac.new(secret.encode("utf8"), digestmod=hashlib.sha256)
43 | mav.update(timestamps.encode("utf8"))
44 | result = mav.digest()
45 | # 对签名值进行 Base64 编码
46 | signature = base64.b64encode(result).decode("utf8")
47 | if mode:
48 | # 可选择是否将签名值进行 URL 编码
49 | signature = quote_plus(signature)
50 | return signature
51 |
52 | def sender(self, message):
53 | """发送者
54 | 将重组后的信息发送到端"""
55 | timestamps = int(time.time()) * 1000
56 | # sign = self._sign(timestamps, self.secret, True)
57 | # 根据钉钉文档构造链接
58 | url = self.token  # + "&timestamp=%s&sign=%s" % (timestamps, sign)
59 | # 通过钉钉机器人将消息发送到钉钉群
60 | resp = requests.post(url, headers=self.header, json=message)
61 | # 根据返回的错误码判断消息发送状态
62 | err = json.loads(resp.text)
63 | if err.get("errcode"):
64 | logging.warning(err)
65 | return False
66 | else:
67 | logging.info("Message Sender Success")
68 | return True
--------------------------------------------------------------------------------
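
Snippets 6-12 and 6-13 define the monitor and the alarm, but no driver code appears in the listing. A usage sketch under the assumption that the captured log text arrives as a plain string; the import path and the sample values are illustrative:

    from datetime import datetime

    from supervise.alarms import DingAlarm  # hypothetical module path; the class is shown in 6-13

    log_text = (
        "2020-01-29 09:00:00 [spider] ERROR: download failed\n"
        "Traceback (most recent call last):\n"
        "...\n"
    )
    alarm = DingAlarm()
    # receive() feeds the text to MarkdownMonitor.push() (split -> extractor -> recombination)
    # and hands the resulting markdown payload to sender(), which posts it to the DingTalk robot.
    alarm.receive(log_text,
                  occurrence="2020-01-29 09:00:00",
                  timer=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
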
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/第6章/6-12.code:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from interface import Monitor
3 |
4 |
5 | class MarkdownMonitor(Monitor):
6 |
7 | def __init__(self):
8 | self.keyword = "Alarm"
9 | self.err_image = "http://can.sfhfpc.com/sfhfpc/20191210133853.png"
10 | self.traceback_image = "http://can.sfhfpc.com/sfhfpc/20191210133616.png"
11 |
12 | def push(self, txt, occurrence, timer):
13 | """接收器
14 | 被捕获到的异常信息将会送到这里"""
15 |
16 | # 将信息按行分割
17 | message = []
18 | line = ""
19 | for i in txt:
20 | if i != "\n":
21 | line += i
22 | else:
23 | message.append(line)
24 | line = ""
25 | err, traceback, res = self.extractor(message)
26 | content = self.recombination(err, traceback, res, occurrence, timer)
27 | return content
28 |
29 | def extractor(self, message):
30 | """拆分车间
31 | 根据需求拆分异常信息"""
32 | result = []
33 | err_number = 0
34 | traceback_number = 0
35 | for k, v in enumerate(message):
36 | # 异常分类
37 | if "ERROR" in v:
38 | # 类别数量统计
39 | err_number += 1
40 | # 放入信息队列
41 | result.append(v)
42 | if "Traceback" in v:
43 | # 类别数量统计
44 | traceback_number += 1
45 | # 放入信息队列
46 | result += message[k:]
47 | return err_number, traceback_number, result
48 |
49 | def recombination(self, err, traceback, res, occurrence, timer):
50 | """重组车间
51 | 异常信息将在这里重组"""
52 | title = "Traceback" if traceback else "Error"
53 | image = self.traceback_image if traceback else self.err_image
54 | err_message = "\n\n > ".join(res)
55 | now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
56 | # 按照钉钉文档中的 MarkDown 格式示例构造信息
57 | article = "#### TOTAL -- Error Number: {}, Traceback Number: {} \n".format(err, traceback) + \
58 | ">  \n\n".format(image) + \
59 | "> **Error message** \n\n" + \
60 | "> {} \n\n".format(err_message) + \
61 | "> -------- \n\n" + \
62 | "> **Timer**\n\n> {} \n\n".format(timer) +\
63 | "> -------- \n\n" + \
64 | "> **Other information** \n\n" + \
65 | "> Occurrence Time: {} \n\n".format(occurrence) + \
66 | "> Send Time: {} \n\n".format(now) + \
67 | "> Message Type: {}".format(self.keyword)
68 |
69 | content = {
70 | "msgtype": "markdown",
71 | "markdown": {"title": title, "text": article}
72 | }
73 | return content
--------------------------------------------------------------------------------