├── src
├── __init__.py
├── tools.py
├── save_to_excel.py
├── base_spider.py
├── all_process.py
└── wechat_funcs.py
├── README
├── qrcode_1749894334903.jpg
├── wechat_article_drawio.png
├── image-20251203185742977.png
├── image-20251203185757196.png
├── image-20251203185810439.png
└── image-20251203185822659.png
├── requirements.txt
├── main.py
├── README.md
└── LICENSE
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/README/qrcode_1749894334903.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/qrcode_1749894334903.jpg
--------------------------------------------------------------------------------
/README/wechat_article_drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/wechat_article_drawio.png
--------------------------------------------------------------------------------
/README/image-20251203185742977.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185742977.png
--------------------------------------------------------------------------------
/README/image-20251203185757196.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185757196.png
--------------------------------------------------------------------------------
/README/image-20251203185810439.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185810439.png
--------------------------------------------------------------------------------
/README/image-20251203185822659.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185822659.png
--------------------------------------------------------------------------------
/src/tools.py:
--------------------------------------------------------------------------------
1 | """
2 | 工具模块,包含一些常用的工具函数,如保存内容到缓存文件等
3 | 功能1:
4 | save_cache(content) # 保存内容到缓存文件
5 |
6 |
7 | """
8 |
9 |
# Save content to a cache file, used for debugging
def save_cache(content, cache_path=r'src/cache/test_cache.txt'):
    """Write ``content`` to a cache file, overwriting any previous content.

    Args:
        content: Text to store.
        cache_path: Destination file. Defaults to ``src/cache/test_cache.txt``,
            resolved against the current working directory.
    """
    import os  # local import: this module has no top-level imports

    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        # open() does not create missing parent directories, so do it here
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(content)
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.13.3
2 | certifi==2025.1.31
3 | charset-normalizer==3.4.1
4 | et_xmlfile==2.0.0
5 | fake-useragent==2.0.3
6 | idna==3.10
7 | jsonpath==0.82
8 | lxml==6.0.2
9 | numpy==2.2.3
10 | openpyxl==3.1.5
11 | pandas==2.2.3
12 | python-dateutil==2.9.0.post0
13 | pytz==2025.1
14 | requests==2.32.3
15 | six==1.17.0
16 | soupsieve==2.6
17 | typing_extensions==4.12.2
18 | tzdata==2025.1
19 | urllib3==2.3.0
20 |
--------------------------------------------------------------------------------
/src/save_to_excel.py:
--------------------------------------------------------------------------------
1 | """
2 | Excel storage module.
3 | Reads the saved article list and appends article records to Excel workbooks.
4 | """
5 |
6 | import pandas as pd # 修改excel
7 | import os
8 | import time
9 |
10 | from src.tools import *
11 |
12 |
class SaveToExcel:
    """Read and write the Excel workbooks that belong to one official account.

    All workbooks of an account are stored in ``<data_path>/公众号----<nickname>``.
    """

    # Zero-based position of the direct article link inside an article-list row
    LINK_COLUMN_INDEX = 6

    def __init__(self, data_path, nickname):
        """Create the account directory and derive the workbook paths.

        Args:
            data_path: Root directory for all saved data.
            nickname: Name of the official account; used in the directory name.
        """
        self.nickname = nickname
        # Directory that holds every workbook of this account
        self.excel_save_path = os.path.join(data_path, '公众号----' + nickname)
        os.makedirs(self.excel_save_path, exist_ok=True)

        self.article_raw_path = os.path.join(self.excel_save_path, '文章列表 (article_list).xlsx')
        self.article_contents_path = os.path.join(self.excel_save_path, '文章内容 (article_contents).xlsx')
        self.article_details_path = os.path.join(self.excel_save_path, '文章详情 (article_detiles).xlsx')
        self.article_error_path = os.path.join(self.excel_save_path, '问题链接 (error_links).xlsx')

    @classmethod
    def _rows_with_link(cls, frame):
        """Return the rows of ``frame`` as lists, skipping rows without a link.

        A row is skipped when its link cell is empty or when the sheet has too
        few columns to contain a link at all.
        """
        rows = []
        for _, row in frame.iterrows():
            if len(row) <= cls.LINK_COLUMN_INDEX or pd.isna(row.iloc[cls.LINK_COLUMN_INDEX]):
                print('检测到存在空数据,跳过')
                continue
            rows.append(row.to_list())
        return rows

    def read_article_list(self, article_list_path):
        """Load the article-list workbook.

        Args:
            article_list_path: Path of the article-list workbook.

        Returns:
            A list of rows (each a list of cell values) that have a link.
        """
        return self._rows_with_link(pd.read_excel(article_list_path))

    def save_article_content(self, file_path, columns, content_info):
        """Append rows to a workbook, creating the workbook when it is missing.

        Args:
            file_path: Full path of the target workbook.
            columns: Header names, e.g. ``['本地存储时间', '文章发布时间']``.
            content_info: Two-dimensional list holding the rows to append.
        """
        # Start from the existing sheet if there is one; an empty frame
        # otherwise (no placeholder file has to be written and read back)
        if os.path.exists(file_path):
            frame_df = pd.read_excel(file_path)
        else:
            frame_df = pd.DataFrame()

        # Append the new rows after the existing ones and write everything back
        new_data_df = pd.DataFrame(content_info, columns=columns)
        df = pd.concat([frame_df, new_data_df], ignore_index=True)
        df.to_excel(file_path, index=False)

        local_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print(local_time + ' 存储路径>>>> ' + file_path)
75 |
76 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from src.all_process import AccessWechatArticle
2 |
3 |
4 | if __name__=="__main__":
5 | AWA = AccessWechatArticle()
6 | screen_text = '''请输入数字键!
7 | 数字键1: 获取公众号主页链接
8 | 数字键2: 获取公众号已发布的文章列表
9 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
10 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
11 | (请注意请求间隔,若请求太多太快可能会触发封禁!!)
12 | 输入其他任意字符退出!'''
13 | print('欢迎使用, ' + screen_text)
14 | while True:
15 | text = str(input('请输入功能数字: '))
16 |
17 | if text == '1':
18 | screen_text1 = '########## 默认公众号主页链接为“研招网资讯”,按回车键使用。##########\n' + \
19 | '########## 若需获取其他公众号主页链接,请输入公众号下任意一篇已发布的文章链接。##########\n' + \
20 | '请输入文章链接:'
21 | temporary_url = (input(screen_text1) or 'https://mp.weixin.qq.com/s/4r_LKJu0mOeUc70ZZXK9LA')
22 | AWA.get_public_main_link(temporary_url)
23 |
24 | input("按回车键继续...")
25 | print('\n' + screen_text)
26 |
27 | elif text == '2':
28 | screen_text21 = '\n########## 以下内容需要用到fiddler工具 ##########\n' + \
29 | ' (1) 在微信客户端打开步骤1获取到的链接,\n' + \
30 | ' (2) 在fiddler中查看——主机地址为https://mp.weixin.qq.com, URL地址为: /mp/profile_ext?acti\n' + \
31 | ' (3) 选中此项后按快捷键: Ctrl+U 复制该网址到剪贴板, 将内容粘贴到此处\n' + \
32 | '请输入复制的链接(づ ̄ 3 ̄)づ:'
33 | access_token = input(screen_text21)
34 | screen_text22 = '\n########## 获取指定页数的文章列表 ##########\n' + \
35 | '一页文章数量约 15 篇, 请根据实际情况估算 (即: input * 15 = 文章数量)\n' + \
36 | '例如: 获取前3页的文章列表, 请输入 3 \n' + \
37 | ' 公众号下全部文章列表, 请输入: 0 (注意: 若输入0, 全部列表可能需要较长时间, 视文章数量而定)\n' + \
38 | ' 公众号下第2页到第5页的文章列表, 请输入 2-5 \n' + \
39 | '请输入需要下载的页数(默认: 1): '
40 | pages = input(screen_text22) or '1'
41 | if '-' in pages:
42 | pages = pages.split('-')
43 | pages_start = int(pages[0])
44 | pages_end = int(pages[1])
45 | AWA.get_article_list(access_token, pages_start, pages_end)
46 | else:
47 | pages = int(pages)
48 | AWA.get_article_list(access_token, pages)
49 |
50 | input("按回车键继续...")
51 | print('\n' + screen_text)
52 |
53 | elif text == '3': # 该功能不需要token
54 | screen_text31 = '\n########## 保存公众号文章内容 ##########\n' + \
55 | '输入: 已下载文章列表的公众号名称 (例如: 研招网资讯) 或 公众号的一篇文章链接 \n' + \
56 | '(若当前会话已执行过步骤2, 可按回车跳过)\n' + \
57 | '请输入: '
58 | nickname = input(screen_text31)
59 | screen_text32 = '\n########## 是否保存图片 ##########\n' + \
60 | '是否保存图片? 是(输入任意值), 否(默认,按回车跳过): '
61 | save_img = input(screen_text32)
62 | AWA.save_article_content(str(nickname), save_img)
63 |
64 | input("按回车键继续...")
65 | print('\n' + screen_text)
66 |
67 | elif text == '4':
68 | screen_text41 = '\n########## 保存公众号文章详情 ##########\n' + \
69 | '以下内容需要用到fiddler工具, 参考步骤2将 URL地址 粘贴到此处\n' + \
70 | '请输入复制的链接(づ ̄ 3 ̄)づ: '
71 | access_token = input(screen_text41)
72 | AWA.save_article_details(access_token)
73 |
74 | input("按回车键继续...")
75 | print('\n未成功获取的链接已保存到本地。' + '\n' + screen_text)
76 |
77 | else:
78 | print('\n已成功退出!')
79 | break
80 |
81 |
82 |
--------------------------------------------------------------------------------
/src/base_spider.py:
--------------------------------------------------------------------------------
1 | """
2 | 爬虫基类模块
3 | 主要通过单篇文章获取信息
4 | """
5 | import requests
6 | requests.packages.urllib3.disable_warnings() # 去除网络请求警告
7 |
8 | from fake_useragent import UserAgent # 生成随机浏览器代理
9 | User_Agent = UserAgent().chrome # 获取chrome浏览器标识
10 |
11 | from bs4 import BeautifulSoup
12 | import lxml
13 | import os
14 | import re
15 | import time
16 | import random
17 |
18 |
19 | from src.tools import *
20 |
21 |
class BaseSpider:
    """Base crawler that downloads and parses a single WeChat article.

    Other crawler classes inherit from it to share the HTTP session, the
    request headers and the throttling helpers.
    """

    # Characters that Windows does not allow in file or directory names
    _ILLEGAL_NAME_CHARS = re.compile(r'[\\/*?:"<>|]')
    # Image formats recognised when naming downloaded pictures
    _IMAGE_EXTENSIONS = ('gif', 'jpg', 'jpeg', 'png', 'webp')
    # Host prefix used to locate image URLs inside the article HTML
    _IMAGE_HOST = 'https://mmbiz.qpic.cn/'

    def __init__(self):
        self.session = requests.Session()  # HTTP session shared by all requests
        self.timeout = 10  # request timeout in seconds
        self.headers = {'User-Agent': User_Agent}
        self.data = {}
        self.cookies = {}
        self.nickname = ''  # name of the official account
        self.public_main_link = ''  # home page link of the official account
        self.biz = ''  # account identifier (__biz) taken from the article page

    def _sleep_random(self, lower, upper, message):
        """Sleep for a random time in ``[lower, upper]`` seconds and report it."""
        seconds = round(random.uniform(lower, upper), 3)  # keep three decimals
        print(message + str(seconds) + '秒')
        time.sleep(seconds)

    def delay_time(self):
        """Pause 3-7 seconds between requests to reduce the risk of being blocked."""
        self._sleep_random(3, 7, '为预防被封禁,开始延时操作,延时时间:')

    def delay_short_time(self):
        """Pause 0.1-1.5 seconds between requests to reduce the risk of being blocked."""
        self._sleep_random(0.1, 1.5, '为预防被封禁, 短延时:')

    def get_an_article(self, content_url):
        """Download one article page and report whether it looks valid.

        Args:
            content_url: Article link (permanent or short link).

        Returns:
            ``{'content_flag': 1, 'content': html}`` on success, otherwise
            ``{'content_flag': 0, 'current_url': content_url}``.
        """
        try:
            res = self.session.get(
                url=content_url,
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.timeout,  # without a timeout a stalled server blocks forever
                verify=False)
        except requests.RequestException as exc:
            # Network problems are reported as an ordinary failed article so
            # that batch downloads keep going
            print(f'请求文章失败: {exc}')
            return {'content_flag': 0, 'current_url': content_url}
        self.delay_short_time()

        page = res.text
        if 'var createTime = ' in page:  # marker found in a normal article page
            print('正常获取到文章内容')
            return {'content_flag': 1, 'content': page}
        if '>当前环境异常, 完成验证后即可继续访问 <' in page:
            # The request hit a human-verification page
            print('当前环境异常, 请检查链接后访问!!!')
        elif '操作频繁, 请稍后再试' in page:
            # Rate limited
            print('操作频繁了, 等会再弄或换ip弄!!!')
        else:
            print('出现其他问题, 请查找原因后再试!!!!\n'
                  '************************************\n'
                  '一般情况下, 这篇文章可能是一整页的图片, 没有文本内容, 具体原因待讨论.该文章链接为:\n'
                  + content_url + '\n'
                  '************************************\n')
        return {'content_flag': 0, 'current_url': content_url}

    def format_content(self, content):
        """Extract the key fields and the plain text from an article page.

        Also updates ``self.nickname`` and, when the page exposes the account
        identifier, ``self.biz`` and ``self.public_main_link``.

        Args:
            content: HTML of the article page.

        Returns:
            dict with the keys ``nickname``, ``author``, ``article_link``,
            ``article_title``, ``createTime``, ``content`` (the raw HTML) and
            ``format_texts`` (the non-blank text lines of the page).

        Raises:
            AttributeError: If the page lacks one of the expected elements
                (account name, author, link, title or creation time).
        """
        soup = BeautifulSoup(content, 'lxml')
        self.nickname = soup.find("a", id="js_name").get_text().strip()  # account name
        author = soup.find("meta", {"name": "author"}).get("content").strip()
        article_link = soup.find("meta", property="og:url").get("content")
        article_title = soup.find("h1", id="activity-name").get_text().strip()
        print('当前文章为>>>> ' + article_title)

        # All visible text of the page, one entry per non-blank line
        format_texts = [line for line in soup.getText().split('\n') if line.strip()]

        # Creation time as written in the page script
        createTime = re.search(r"var createTime = '(.*?)'.*", content).group(1)

        # The account identifier is the last non-empty quoted value of "appuin"
        appuin_match = re.search(r"var appuin = (.*?);", content)
        if appuin_match:
            for value in re.findall(r'["\']([^"\']*)["\']', appuin_match.group(1)):
                if value:
                    self.biz = value
        if self.biz:
            # Only build the home page link when an identifier is known
            self.public_main_link = ('https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz='
                                     + self.biz + '&scene=124#wechat_redirect')

        return {
            'nickname': self.nickname,
            'author': author,
            'article_link': article_link,
            'article_title': article_title,
            'createTime': createTime,
            'content': content,
            'format_texts': format_texts,
        }

    @classmethod
    def _safe_dir_name(cls, title):
        """Return ``title`` with characters unsuitable for a Windows path removed.

        Forbidden characters become ``_``; dots are dropped.
        """
        return cls._ILLEGAL_NAME_CHARS.sub('_', title).replace('.', '')

    @classmethod
    def _guess_image_ext(cls, image_url):
        """Pick a file extension for ``image_url``; defaults to ``jpg``.

        An explicit ``wx_fmt=<format>`` value in the URL wins. Otherwise the
        last known extension that occurs anywhere in the URL is used.
        """
        declared = re.search(r'wx_fmt=([A-Za-z0-9]+)', image_url)
        if declared and declared.group(1).lower() in cls._IMAGE_EXTENSIONS:
            return declared.group(1).lower()
        ext = 'jpg'
        for candidate in cls._IMAGE_EXTENSIONS:
            if candidate in image_url:
                ext = candidate
        return ext

    def save_article_img(self, data_path, content_info):
        """Download every image of one article into a per-article directory.

        Args:
            data_path: Root directory for all saved data.
            content_info: Result of ``format_content`` for the article; the
                keys ``article_title``, ``createTime`` and ``content`` are read.
        """
        nickname_path = os.path.join(data_path, '公众号----' + self.nickname)
        os.makedirs(nickname_path, exist_ok=True)

        article_title = content_info['article_title']
        # A colon is not allowed in Windows file names, so the time is adapted too
        title_time = content_info['createTime'].replace(':', '_')
        img_save_path = os.path.join(
            nickname_path, title_time + ' ---- ' + self._safe_dir_name(article_title))
        os.makedirs(img_save_path, exist_ok=True)
        print('设置文章图片存储路径>>>> ' + img_save_path)

        # Every fragment after the first starts right behind an image host prefix
        fragments = content_info['content'].split(self._IMAGE_HOST)
        for index, fragment in enumerate(fragments[1:], start=1):
            image_url = self._IMAGE_HOST + fragment.split('"')[0]
            try:
                # Random pause so that images are not requested too quickly
                time.sleep(0.5 + random.random())
                response = self.session.get(image_url, verify=False, timeout=self.timeout)
                if response.status_code == 200:
                    image_name = str(index) + '.' + self._guess_image_ext(image_url)
                    file_path = os.path.join(img_save_path, image_name)
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    print(f"已成功下载图片: {file_path}")
                else:
                    print(f"无法下载图片,状态码: {response.status_code}")
            except Exception as e:
                # Best effort: one broken image must not abort the article
                print(f"下载图片时出错:{str(e)}")
                time.sleep(1)  # brief pause before moving on to the next image
        print('已保存文章图片>>>> ' + article_title)
220 |
221 |
--------------------------------------------------------------------------------
/src/all_process.py:
--------------------------------------------------------------------------------
1 | """
2 | 汇总所有流程
3 | """
4 | from src.base_spider import BaseSpider
5 | from src.wechat_funcs import ArticleDetail
6 | from src.save_to_excel import SaveToExcel
7 |
8 | import os
9 |
10 |
data_path = r'all_data'  # root directory for all saved data


class AccessWechatArticle:
    """Combine the spider, the token client and the Excel writer into the menu features."""

    # Headers of the article-list workbook; index 6 holds the direct link
    _LIST_COLUMNS = ['临时页码', '本地保存时间', '文章发布时间', '文章标题',
                     '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接']
    # Article-content workbook: the list columns plus the extracted text
    _CONTENT_COLUMNS = _LIST_COLUMNS + ['文章内容']
    # Article-detail workbook: the content columns plus the statistics
    _DETAIL_COLUMNS = _CONTENT_COLUMNS + ['阅读量', '点赞数', '转发数', '在看数', '评论数', '评论点赞数']

    def __init__(self):
        self.base_spider = BaseSpider()  # used to resolve the home page link
        self.article_detail = ArticleDetail()  # holds the shared WeChat token
        self.nickname = None
        self.public_token_link = None

    def get_public_main_link(self, article_url):
        """Print the account name and home page link that belong to an article."""
        content = self.base_spider.get_an_article(article_url)
        if content['content_flag'] == 1:
            self.base_spider.format_content(content['content'])
            self.nickname = self.base_spider.nickname
            public_main_link = self.base_spider.public_main_link
            print(f'公众号名称:{self.nickname}\n公众号主页: ↘ ↘ ↘ ↘\n{public_main_link}')
            print('将此链接 ( ̄︶ ̄)↗ ↗ ↗ ↗ 粘贴发送到 "微信PC端-文件传输助手"')
        else:
            print('获取文章内容失败')
        return None

    def _nickname_from_article(self, article_url):
        """Return the account name of the article at ``article_url`` or None."""
        content = self.base_spider.get_an_article(article_url)
        if content['content_flag'] != 1:
            return None
        self.base_spider.format_content(content['content'])
        return self.base_spider.nickname or None

    def _fetch_articles(self, article_list, enrich=None):
        """Download every article of ``article_list`` and split by outcome.

        Successful rows get their publish time (index 2) replaced by the time
        found in the page and the extracted text appended. ``enrich``, when
        given, is then called as ``enrich(row, parsed_article, raw_html)``.

        Returns:
            ``(fetched_rows, failed_rows)``. Two new lists are built because
            removing items from a list while iterating over it skips the
            element that follows each removed one.
        """
        fetched, failed = [], []
        for article in article_list:
            content = self.base_spider.get_an_article(article[6])
            if content['content_flag'] != 1:
                failed.append(article)
                continue
            article_content = self.base_spider.format_content(content['content'])
            article[2] = article_content['createTime']
            article.append(str(article_content['format_texts']))
            if enrich is not None:
                enrich(article, article_content, content['content'])
            fetched.append(article)
        return fetched, failed

    def get_article_list(self, public_token_link, page_start, page_end=1):
        """Download the article list of an account and save it to Excel.

        Args:
            public_token_link: Captured URL that carries the token parameters.
            page_start: First page; ``0`` requests every page. A single number
                ``N`` (with ``page_end`` left at 1) requests pages 1 to N.
            page_end: Last page. A reversed range is swapped automatically.
        """
        if not self.article_detail.format_raw_link(public_token_link):
            print('请检查输入参数是否正确')
            return None
        print('参数齐全,开始获取文章信息,默认状态获取全部文章')
        self.public_token_link = public_token_link  # kept for the other features

        if page_start == 0 and page_end == 1:
            page_start, page_end = 0, 0
        elif page_start > page_end:
            print('防呆输入,已自动交换页码')
            page_start, page_end = page_end, page_start

        # Rows look like [page, local_time, create_time, title, cover, raw_url, direct_url]
        list_info = None
        try:
            list_info = self.article_detail.whole_article_list(page_start, page_end)
        except Exception as exc:
            print('获取文章列表失败')
            print(f'失败原因: {exc}')

        if not list_info:
            print('获取到 0 篇文章, 请检查!!!')
            return None

        if self.nickname is None:
            # The account name is needed for the directory name; read it from
            # the first article of the list
            self.nickname = self._nickname_from_article(list_info[0][6])
            if self.nickname is None:
                print('未能获取公众号名称, 文章列表未保存, 请先执行功能1后重试')
                return None
        else:
            print('已检测到公众号名称: ' + self.nickname + '\n')

        save_to_excel = SaveToExcel(data_path, self.nickname)
        save_to_excel.save_article_content(save_to_excel.article_raw_path, self._LIST_COLUMNS, list_info)
        print('文章列表保存成功')
        return None

    def save_article_content(self, nickname=None, save_img=False):
        """Download the text (and optionally images) of every listed article.

        Args:
            nickname: Account name whose article list was saved before, or the
                link of one of its articles. Empty or None falls back to the
                name known from the current session.
            save_img: Any truthy value also downloads the article images.
        """
        if not nickname:
            if self.nickname is None:
                print('检测到当前会话未涉及公众号信息获取操作!!!')
                print('请输入需要保存的公众号名称')
                return None
            print('已检测到公众号名称: ' + self.nickname + '\n')
            nickname = self.nickname
        else:
            if nickname.startswith(('http://', 'https://')):
                # The prompt allows an article link in place of the name
                nickname = self._nickname_from_article(nickname)
                if nickname is None:
                    print('未能通过文章链接识别公众号名称, 请直接输入公众号名称')
                    return None
            print('当前输入公众号名称: ' + nickname + '\n')
            self.nickname = nickname

        save_to_excel = SaveToExcel(data_path, nickname)
        article_list_path = save_to_excel.article_raw_path
        if not os.path.exists(article_list_path):
            print('请先获取文章列表, 并确认已保存文章列表到Excel文件中, 再执行此操作')
            return None
        article_list = save_to_excel.read_article_list(article_list_path)

        def keep_images(article, article_content, raw_html):
            # format_content() replaces the spider's nickname with the one
            # found in the page; restore the requested one so that images end
            # up next to the workbooks
            self.base_spider.nickname = nickname
            if save_img:
                self.base_spider.save_article_img(data_path, article_content)

        fetched, failed = self._fetch_articles(article_list, keep_images)
        save_to_excel.save_article_content(save_to_excel.article_contents_path, self._CONTENT_COLUMNS, fetched)
        save_to_excel.save_article_content(save_to_excel.article_error_path, self._LIST_COLUMNS, failed)

    def save_article_details(self, public_token_link):
        """Download every listed article together with its statistics.

        Args:
            public_token_link: Captured URL that carries the token parameters.
        """
        if not self.article_detail.format_raw_link(public_token_link):
            print('请检查输入参数是否正确')
            return None
        print('参数齐全,开始获取文章信息,默认状态获取全部文章')

        # Resolve the account name with the token
        self.article_detail.get_detail_nickname()
        nickname = self.article_detail.nickname
        if not nickname:
            print('未能获取公众号名称, 请检查链接后重试')
            return None

        save_to_excel = SaveToExcel(data_path, nickname)
        article_list_path = save_to_excel.article_raw_path
        if not os.path.exists(article_list_path):
            print('请先获取文章列表, 并确认已保存文章列表到Excel文件中, 再执行此操作')
            return None
        article_list = save_to_excel.read_article_list(article_list_path)

        def add_details(article, article_content, raw_html):
            # Statistics are requested only for articles whose page was fetched
            article_detail = self.article_detail.get_detail_content(article[5], article[3], raw_html)
            if article_detail is None:
                article.append('******文章详情获取失败!!!*******')
                # Pad the row so that it always matches the header width
                article.extend([None] * (len(self._DETAIL_COLUMNS) - len(article)))
            else:
                article.extend(article_detail)

        fetched, failed = self._fetch_articles(article_list, add_details)
        save_to_excel.save_article_content(save_to_excel.article_details_path, self._DETAIL_COLUMNS, fetched)
        save_to_excel.save_article_content(save_to_excel.article_error_path, self._LIST_COLUMNS, failed)
208 |
209 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 微信公众号/文章 获取(Access_wechat_article)
2 |
3 | 更新时间:2025-12-03
4 |
5 | 本项目是基于Python语言的爬虫程序,支持对微信公众号文章内容获取
6 |
7 | 目前支持 Windows / Linux 开箱即用,**建议使用虚拟环境运行项目**
8 |
9 | 如果感兴趣,请 **Fork** 项目后自行研究使用
10 |
11 | 使用过程中如遇到错误,欢迎提交 [issues](https://github.com/yeximm/Access_wechat_article/issues) 来讨论
12 |
13 | **注**:请在 [GitHub](https://github.com/) 平台提交 [issues](https://github.com/yeximm/Access_wechat_article/issues)
14 |
15 | ## 一、主要功能
16 |
17 | 1. 获取**公众号主页链接**,通过微信内置浏览器可直接打开
18 | 2. 获取公众号**已发布**的文章列表(**微信公众号**下的历史文章)
19 | 3. 批量下载公众号文章的**网页文本数据**
20 | 4. 获取微信公众号文章的**所有信息**,如阅读量、点赞数、转发数、评论、评论点赞等信息。
21 |
22 | ## 二、项目开发环境及工具
23 |
24 | 1. 系统环境:Windows 11 ×64
25 | 2. 程序运行环境:python 3.13
26 | 3. 涉及应用:微信**PC版**,当前项目已适配的微信版本:**`4.1.5.16`**
27 | 4. 使用工具:[Fiddler Classic](https://www.telerik.com/fiddler/fiddler-classic),当前项目适配的Fiddler Classic版本:**`v5.0.20253.3311`**
28 |
29 | **目录架构**
30 |
31 | ```bash
32 | Access_wechat_article/
33 | ├── .venv/ # 虚拟环境目录
34 | ├── src/ # 源代码目录
35 | │ ├── all_process.py # 流程汇总
36 | │ ├── base_spider.py # 基础爬虫模块
37 | │ ├── save_to_excel.py # 存储模块
38 | │ ├── tools.py # 其他工具模块
39 | │ └── wechat_funcs.py # 微信token模块
40 | ├── LICENSE # 许可凭证
41 | ├── main.py # 项目主文件
42 | ├── README/ # 项目说明文档资源(图片、文件)
43 | ├── README.md # 项目说明文档
44 | └── requirements.txt # 项目依赖列表
45 | ```
46 |
47 | ## 三、程序使用
48 |
49 | ### 3.1下载 / Download
50 |
51 | - 下载地址:[https://github.com/yeximm/Access_wechat_article/releases](https://github.com/yeximm/Access_wechat_article/releases)
52 | - 👆👆👆以上为本项目发布页地址,选取所需版本下载即可。
53 |
54 |
55 | - 存储库快照:[Github_master](https://github.com/yeximm/Access_wechat_article/archive/refs/heads/master.zip)
56 | - 存储库快照等同于 [Releases](https://github.com/yeximm/Access_wechat_article/releases) 中的 [Source Code (zip)](https://github.com/yeximm/Access_wechat_article/archive/refs/heads/master.zip) 等,包含 `README` 等内容
57 |
58 | ### 3.2 Python环境配置
59 |
60 | (1)创建虚拟环境
61 |
62 | ```bash
63 | python -m venv .venv
64 | ```
65 |
66 | `.venv` 指定存放虚拟环境的目录,一般命名为 `.venv` 或 `venv`,这是约定俗成的做法。
67 |
68 | (2)**激活**环境
69 |
70 | - Windows
71 |
72 | ```bash
73 | .\.venv\Scripts\activate
74 | ```
75 |
76 | - Unix/macOS
77 |
78 | ```bash
79 | source .venv/bin/activate
80 | ```
81 |
82 | (3)退出环境
83 |
84 | ```bash
85 | deactivate
86 | ```
87 |
88 | ### 3.3 安装项目依赖包
89 |
90 | `requirements.txt`中包含所需python包文件名称,用来批量安装python包文件
91 |
92 | 安装命令:
93 |
94 | ```bash
95 | pip install -r requirements.txt
96 | ```
97 |
98 | ### 3.4 运行参数
99 |
100 | 1. 项目主文件为:`main.py`,其功能调用方式详见于此。
101 | 项目中**生成文件的存储路径**为:`./all_data`(该目录由程序**自动创建**)
102 | 2. 运行命令:
103 |
104 | 1. 首先进入**虚拟环境**(详见**激活**虚拟环境)
105 |
106 | 2. 安装python包文件(如已安装则进行下一步)
107 |
108 | 3. 在项目目录运行:
109 |
110 | - ```bash
111 | python main.py
112 | ```
113 |
114 | 4. 根据控制台提示输入
115 |
116 | 5. 如需**自定义功能**,参照`main.py`中的函数调用方式自行编写。
117 |
118 | ## 四、功能示例
119 |
120 | ### 4.1 功能1
121 |
122 | ```bash
123 | 欢迎使用, 请输入数字键!
124 | 数字键1: 获取公众号主页链接
125 | 数字键2: 获取公众号已发布的文章列表
126 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
127 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
128 | (请注意请求间隔,若请求太多太快可能会触发封禁!!)
129 | 输入其他任意字符退出!
130 | 请输入功能数字: 1
131 | ```
132 |
133 | **程序执行结果**
134 |
135 | ```bash
136 | ########## 默认公众号主页链接为“研招网资讯”,按回车键使用。##########
137 | ########## 若需获取其他公众号主页链接,请输入公众号下任意一篇已发布的文章链接。##########
138 | 请输入文章链接:https://mp.weixin.qq.com/s/ZNXDr2ErJno9-NdS4RYDCg
139 | 为预防被封禁, 短延时:0.906秒
140 | 正常获取到文章内容
141 | 当前文章为>>>> 法国总统马克龙抵达北京开始访华
142 | 公众号名称:新华网
143 | 公众号主页: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzA4MjQxNjQzMA==&scene=124#wechat_redirect
144 | 将此链接 ( ̄︶ ̄)↗ ↗ ↗ ↗ 粘贴发送到 "微信PC端-文件传输助手"
145 | 按回车键继续...
146 | ```
147 |
148 | ### 4.2 功能2
149 |
150 | ```bash
151 | 请输入数字键!
152 | 数字键1: 获取公众号主页链接
153 | 数字键2: 获取公众号已发布的文章列表
154 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
155 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
156 | (请注意请求间隔,若请求太多太快可能会触发封禁!!)
157 | 输入其他任意字符退出!
158 | 请输入功能数字: 2
159 | ```
160 |
161 | **输入参数**
162 |
163 | ```bash
164 | ########## 以下内容需要用到fiddler工具 ##########
165 | (1) 在微信客户端打开步骤1获取到的链接,
166 | (2) 在fiddler中查看——主机地址为https://mp.weixin.qq.com, URL地址为: /mp/profile_ext?acti
167 | (3) 选中此项后按快捷键: Ctrl+U 复制该网址到剪贴板, 将内容粘贴到此处
168 | 请输入复制的链接(づ ̄ 3 ̄)づ:https://mp.weixin.qq.com/mp/profile_ext?xxxxxx...
169 | ```
170 |
171 | ```bash
172 | ########## 获取指定页数的文章列表 ##########
173 | 一页文章数量约 15 篇, 请根据实际情况估算 (即: input * 15 = 文章数量)
174 | 例如: 获取前3页的文章列表, 请输入 3
175 | 公众号下全部文章列表, 请输入: 0 (注意: 若输入0, 全部列表可能需要较长时间, 视文章数量而定)
176 | 公众号下第2页到第5页的文章列表, 请输入 2-5
177 | 请输入需要下载的页数(默认: 1): 2-5
178 | ```
179 |
180 | **程序执行结果**
181 |
182 | ```bash
183 | 参数齐全,开始获取文章信息,默认状态获取全部文章
184 | 获取 2 至 5 页的文章列表
185 | 正在获取第 2 页文章列表
186 | 该页包含 15 篇文章
187 | 为预防被封禁,开始延时操作,延时时间:4.962秒
188 | 正在获取第 3 页文章列表
189 | 该页包含 13 篇文章
190 | 为预防被封禁,开始延时操作,延时时间:3.599秒
191 | 正在获取第 4 页文章列表
192 | 该页包含 14 篇文章
193 | 为预防被封禁,开始延时操作,延时时间:6.705秒
194 | 正在获取第 5 页文章列表
195 | 该页包含 12 篇文章
196 | 为预防被封禁,开始延时操作,延时时间:3.075秒
197 | 已检测到公众号名称: 新华网
198 |
199 | 2025-12-03 17:37:16 存储路径>>>> all_data\公众号----新华网\文章列表 (article_list).xlsx
200 | 文章列表保存成功
201 | 按回车键继续...
202 | ```
203 |
204 | ### 4.3 功能3
205 |
206 | ```bash
207 | 请输入数字键!
208 | 数字键1: 获取公众号主页链接
209 | 数字键2: 获取公众号已发布的文章列表
210 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
211 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
212 | (请注意请求间隔,若请求太多太快可能会触发封禁!!)
213 | 输入其他任意字符退出!
214 | 请输入功能数字: 3
215 | ```
216 |
217 | **输入参数**
218 |
219 | ```bash
220 | ########## 保存公众号文章内容 ##########
221 | 输入: 已下载文章列表的公众号名称 (例如: 研招网资讯) 或 公众号的一篇文章链接
222 | (若当前会话已执行过步骤2, 可按回车跳过)
223 | 请输入: 新华网
224 | ```
225 |
226 | ```bash
227 | ########## 是否保存图片 ##########
228 | 是否保存图片? 是(输入任意值), 否(默认,按回车跳过):y
229 | ```
230 |
231 | **程序执行结果**
232 |
233 | ```bash
234 | 为预防被封禁, 短延时:1.043秒
235 | 正常获取到文章内容
236 | 当前文章为>>>> “时速能破150公里”?这种“爆改”太吓人!
237 | 为预防被封禁, 短延时:0.988秒
238 | 正常获取到文章内容
239 | 当前文章为>>>> 流感季,发烧了怎么办?
240 | ...
241 | 正常获取到文章内容
242 | 当前文章为>>>> 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→
243 | 2025-12-03 17:40:43 存储路径>>>> all_data\公众号----新华网\文章内容 (article_contents).xlsx
244 | 2025-12-03 17:40:43 存储路径>>>> all_data\公众号----新华网\问题链接 (error_links).xlsx
245 | 按回车键继续...
246 | ```
247 |
248 | ### 4.4 功能4
249 |
250 | ```bash
251 | 请输入数字键!
252 | 数字键1: 获取公众号主页链接
253 | 数字键2: 获取公众号已发布的文章列表
254 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
255 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
256 | (请注意请求间隔,若请求太多太快可能会触发封禁!!)
257 | 输入其他任意字符退出!
258 | 请输入功能数字: 4
259 | ```
260 |
261 | **输入参数**
262 |
263 | ```bash
264 | ########## 保存公众号文章详情 ##########
265 | 以下内容需要用到fiddler工具, 参考步骤2将 URL地址 粘贴到此处
266 | 请输入复制的链接(づ ̄ 3 ̄)づ: https://mp.weixin.qq.com/mp/profile_ext?xxxxxx...
267 | ```
268 |
269 | **程序执行结果**
270 |
271 | ```bash
272 | 参数齐全,开始获取文章信息,默认状态获取全部文章
273 | 获取 1 至 1 页的文章列表
274 | 正在获取第 1 页文章列表
275 | 该页包含 13 篇文章
276 | 为预防被封禁,开始延时操作,延时时间:5.049秒
277 | 为预防被封禁, 短延时:0.148秒
278 | 正常获取到文章内容
279 | 当前文章为>>>> 湖南省人大常委会原党组成员、副主任叶红专被查
280 | 为预防被封禁, 短延时:0.702秒
281 | ...
282 | 正常获取到文章内容
283 | 当前文章为>>>> 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→
284 | 为预防被封禁,开始延时操作,延时时间:5.352秒
285 | 2025-12-03 17:48:43请求完成, 文章标题为: 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→
286 | 2025-12-03 17:48:44 存储路径>>>> all_data\公众号----新华网\文章详情 (article_detiles).xlsx
287 | 2025-12-03 17:48:44 存储路径>>>> all_data\公众号----新华网\问题链接 (error_links).xlsx
288 | 按回车键继续...
289 | ```
290 |
291 | ## 五、鼓励一下
292 |
293 | 开源不易,若此项目有帮到你,望你能动用你的发财小手**Star**☆一下。
294 |
295 | 如有遇到代码方面的问题,欢迎一起讨论,你的鼓励是这个项目继续更新的最大动力!
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 | 另外,十分感谢大家对于本项目的关注。
304 |
305 | [Stargazers](https://github.com/yeximm/Access_wechat_article/stargazers)
306 | [Forks](https://github.com/yeximm/Access_wechat_article/network/members)
307 |
308 | ## 六、程序流程图
309 |
310 | ![程序流程图](README/wechat_article_drawio.png)
311 |
312 | ### 6.1 基础爬虫模块
313 |
314 | 
315 |
316 | ### 6.2 获取文章列表模块(需token)
317 |
318 | 
319 |
320 | ### 6.3 文章内容获取
321 |
322 | 
323 |
324 | ### 6.4 文章详细信息获取(需token)
325 |
326 | 
327 |
328 | ## LICENSE
329 |
330 | 本作品采用许可协议 Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ,简称 **[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)**。
331 |
332 | 所有以任何方式查看本仓库内容的人、或直接或间接使用本仓库内容的使用者都应仔细阅读此声明。本仓库管理者保留随时更改或补充此免责声明的权利。一旦使用、复制、修改了本仓库内容,则视为您已接受此免责声明。
333 |
334 | 项目内容仅供学习研究,请勿用于商业用途。如对本仓库内容的功能有需求,应自行开发相关功能。所有基于本仓库内容的源代码,进行的任何修改,为其他个人或组织的自发行为,与本仓库内容没有任何直接或间接的关系,所造成的一切后果亦与本仓库内容和本仓库管理者无关。
335 |
336 | 本仓库内容中涉及的第三方硬件、软件等,与本仓库内容没有任何直接或间接的关系。本仓库内容仅对部署和使用过程进行客观描述,不代表支持使用任何第三方硬件、软件。使用任何第三方硬件、软件,所造成的一切后果由使用的个人或组织承担,与本仓库内容无关。
337 |
338 | ## Star History
339 |
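340 | [![Star History Chart](https://api.star-history.com/svg?repos=yeximm/Access_wechat_article&type=Date)](https://www.star-history.com/#yeximm/Access_wechat_article&Date)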
341 |
342 |
--------------------------------------------------------------------------------
/src/wechat_funcs.py:
--------------------------------------------------------------------------------
1 | """
2 | 该模块包含了微信公众号token相关的函数
3 | """
4 | from urllib import parse # 用于解析获取url参数
5 | import re
6 | import time
7 | import json
8 | import random
9 | import jsonpath
10 |
11 | from src.base_spider import BaseSpider
12 |
13 |
class ArticleDetail(BaseSpider):
    """Fetch WeChat official-account data that requires token parameters.

    The token parameters (``__biz``, ``uin``, ``key`` and ``pass_ticket``) are
    extracted from a captured ``profile_ext`` URL by :meth:`format_raw_link`
    and are then reused to request article lists, article statistics and
    comments.
    """

    # Query parameters that make up the token; all four must be present.
    _TOKEN_PARAMS = ('__biz', 'uin', 'key', 'pass_ticket')
    # Seconds to wait for any single HTTP request before giving up.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        super().__init__()
        self.biz = None
        self.uin = None
        self.key = None
        self.pass_ticket = None
        # No longer read inside this class; kept so that external code which
        # reads ``.text`` on an instance keeps working.
        self.text = 'website'

    @staticmethod
    def _quote(value):
        """Percent-encode *value* for safe use as a URL query value.

        ``parse_qs`` decodes the token values, so they have to be re-encoded
        before being concatenated into a URL; otherwise characters such as
        ``+`` or ``/`` would reach the server with a different meaning.
        """
        return parse.quote(value, safe='')

    @staticmethod
    def _first_value(document, field):
        """Return the first value of *field* found anywhere in *document*.

        ``jsonpath.jsonpath`` returns ``False`` (not an empty list) when
        nothing matches, so that case is mapped to ``None`` here.
        """
        matches = jsonpath.jsonpath(document, '$..' + field)
        return matches[0] if matches else None

    @staticmethod
    def _article_row(page_number, fetched_at, create_time, article):
        """Build one output row from an article dict of the list response.

        The row layout is ``[page, fetch time, publish date, title, cover,
        content_url, format_url]``. ``content_url`` has the trailing
        ``#wechat_redirect`` removed; ``format_url`` additionally has every
        ``amp;`` removed, which turns HTML-escaped ``&amp;`` into ``&``.
        """
        content_url = article['content_url'].replace('#wechat_redirect', '')
        format_url = content_url.replace('amp;', '')
        return [page_number, fetched_at, create_time, article['title'],
                article['cover'], content_url, format_url]

    def format_raw_link(self, token_url):
        """Validate a captured URL and store its token parameters.

        Args:
            token_url: The raw URL containing ``__biz``, ``uin``, ``key`` and
                ``pass_ticket`` as query parameters.

        Returns:
            ``True`` when all four parameters are present and non-empty,
            otherwise ``False`` (after printing a hint). The values are stored
            on the instance; missing ones are stored as ``''``.
        """
        query_dict = parse.parse_qs(parse.urlparse(token_url).query)
        # parse_qs drops blank parameters entirely, so a missing or empty one
        # must be looked up with a default instead of raising KeyError.
        self.biz, self.uin, self.key, self.pass_ticket = (
            query_dict.get(name, [''])[0] for name in self._TOKEN_PARAMS
        )

        if self.biz and self.uin and self.pass_ticket and self.key:
            return True
        print('\n※※※ 参数有误,请重新输入')
        return False

    def whole_article_list(self, pages_start, pages_end):
        """Collect the article rows of a range of list pages.

        Args:
            pages_start: First page to fetch (1-based). ``0`` means "fetch
                every page until the server returns no more articles".
            pages_end: Last page to fetch (1-based, inclusive). Ignored when
                ``pages_start`` is ``0``.

        Returns:
            A list of article rows (see :meth:`get_next_list`), or ``None``
            when nothing could be fetched.
        """
        fetch_all = pages_start == 0
        if fetch_all:
            print('开始获取公众号下所有的文章列表')
            page = 0
        else:
            print('获取 ' + str(pages_start) + ' 至 ' + str(pages_end) + ' 页的文章列表')
            page = pages_start - 1  # get_next_list counts pages from 0

        all_article_link = []
        while fetch_all or page < pages_end:
            p_data = self.get_next_list(page)
            if p_data['m_flag'] != 1:
                print('请求结束,文章列表获取完毕!')
                break
            all_article_link.extend(p_data['one_page_list'])
            page += 1
            self.delay_time()  # random delay between pages to avoid being blocked

        if not all_article_link:
            print('获取到文章列表为空,请注意检查!!!!')
            return None
        return all_article_link

    def get_next_list(self, page):
        """Fetch one page of the account's article list.

        Args:
            page: Zero-based page index; each page covers 10 list entries
                (``offset = page * 10``).

        Returns:
            On success ``{'m_flag': 1, 'one_page_list': rows, 'length': n}``
            where every row is ``[page number (1-based), fetch time, publish
            date, title, cover URL, content_url, format_url]``. On any failure
            ``{'m_flag': 0}``.

        The decoded ``general_msg_list['list']`` entries are expected to look
        like::

            {
                'comm_msg_info': {'datetime': 1722467332, ...},
                'app_msg_ext_info': {
                    'title': ..., 'cover': ..., 'content_url': ...,
                    'multi_app_msg_item_list': [
                        {'title': ..., 'cover': ..., 'content_url': ...},
                        ...
                    ],
                },
            }
        """
        page = int(page)
        offset = page * 10
        page_label = str(page + 1)
        print('正在获取第 ' + page_label + ' 页文章列表')
        url = ('https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=' + self._quote(self.biz)
               + '&f=json&offset=' + str(offset) + '&count=10&is_ok=1&scene=124&uin=' + self._quote(self.uin)
               + '&key=' + self._quote(self.key) + '&pass_ticket=' + self._quote(self.pass_ticket)
               + '&wxtoken=&appmsg_token=&x5=0&f=json')
        try:
            body = self.session.get(url=url, headers=self.headers,
                                    timeout=self._REQUEST_TIMEOUT, verify=False).text
        except Exception:
            # Best effort: a failed request is reported and then handled like
            # an empty response by the final branch below.
            print('失败!!!获取第 ' + page_label + ' 页文章列表失败!!!')
            body = ''

        if 'app_msg_ext_info' in body:
            # 'general_msg_list' is itself a JSON string inside the JSON body.
            messages = json.loads(json.loads(body)['general_msg_list'])['list']
            one_page_list = []
            for message in messages:
                main_article = message.get('app_msg_ext_info')
                if not main_article:
                    # Entry carries no article data; indexing it would raise.
                    continue
                create_time = time.strftime(
                    "%Y-%m-%d", time.localtime(message['comm_msg_info']['datetime']))
                fetched_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                # The headline article first, then the other articles that
                # were published in the same push.
                sub_articles = main_article.get('multi_app_msg_item_list') or []
                for article in [main_article] + list(sub_articles):
                    one_page_list.append(
                        self._article_row(page + 1, fetched_at, create_time, article))
            print('该页包含 ' + str(len(one_page_list)) + ' 篇文章')
            return {
                'm_flag': 1,
                'one_page_list': one_page_list,
                'length': len(one_page_list),
            }
        if '"home_page_list":[]' in body:
            print('\n出现:操作频繁,请稍后再试\n该号已被封禁,请解封后再来!!!\n')
            return {'m_flag': 0}
        print('请求结束!未获取到第 ' + page_label + ' 页文章列表')
        return {'m_flag': 0}

    def get_detail_nickname(self):
        """Determine the account name from the first fetchable article.

        Fetches the first list page and tries its articles in order; the
        content of the first article that can be downloaded is passed to
        ``format_content``, which stores the account name on the instance.

        Returns:
            Always ``None``; the result is stored as a side effect.
        """
        first_page_list = self.whole_article_list(1, 1)
        if not first_page_list:
            print('获取文章列表失败')
            return None
        for article_index, article in enumerate(first_page_list):
            first_content = self.get_an_article(article[6])  # index 6 is format_url
            if first_content['content_flag'] != 0:
                self.format_content(first_content['content'])
                return None
            print('提取第 ' + str(article_index + 1) + ' 篇文章的公众号名称失败')
        return None

    def get_detail_content(self, source_url, article_title, one_content):
        """Fetch statistics and comments of a single article.

        Args:
            source_url: Article URL; must contain ``mid=``, ``sn=`` and
                ``idx=`` query parameters.
            article_title: Article title, sent with the request and used in
                log messages.
            one_content: Page source of the article, searched for
                ``var comment_id`` and ``var req_id``.

        Returns:
            ``None`` when the response contains no ``read_num``; otherwise the
            tuple ``(read_num, old_like_num, share_num, show_read, comments,
            comment_like_nums)``. A statistic that is absent from the response
            is ``None``; the last two items are the raw ``jsonpath`` results
            (a list, or ``False`` when nothing matched).

        Raises:
            IndexError: If ``source_url`` lacks ``mid=``, ``sn=`` or ``idx=``.
        """
        self.delay_time()
        # Random 16-digit fraction sent as the 'r' form field.
        r = '0.' + ''.join(str(random.randint(0, 9)) for _ in range(16))
        appmsg_type = "9"
        url_text = str(source_url)
        mid = url_text.split('mid=')[1].split('&')[0]
        sn = url_text.split('sn=')[1].split('&')[0]
        idx = url_text.split('idx=')[1].split('&')[0]

        comment_match = re.search("var comment_id = '(.*?)'.*", one_content)
        if comment_match:
            comment_id = comment_match.group(1)
        else:
            print('没有匹配到comment_id, 文章标题为: ' + article_title)
            comment_id = ''
        if 'var req_id = ' in one_content:
            req_id = one_content.split('var req_id = ')[1].split(';')[0].replace("'", "").replace('"', '')
        else:
            print('没有匹配到req_id, 文章标题为: ' + article_title)
            req_id = ''

        # Article statistics (read / like / share / show_read counts).
        detail_url = ('https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&fasttmplajax=1&f=json'
                      + '&uin=' + self._quote(self.uin) + '&key=' + self._quote(self.key)
                      + '&pass_ticket=' + self._quote(self.pass_ticket) + '&__biz=' + self._quote(self.biz))
        data = {
            'r': r,
            'sn': sn,
            'mid': mid,
            'idx': idx,
            'req_id': req_id,
            'title': article_title,
            'comment_id': comment_id,
            'appmsg_type': appmsg_type,
            '__biz': self.biz,
            'pass_ticket': self.pass_ticket,
            'abtest_cookie': '', 'devicetype': 'Windows 10 x64', 'version': '63090b13', 'is_need_ticket': '0',
            'is_need_ad': '0', 'is_need_reward': '0', 'both_ad': '0', 'reward_uin_count': '0', 'send_time': '',
            'msg_daily_idx': '1', 'is_original': '0', 'is_only_read': '1', 'scene': '38', 'is_temp_url': '0',
            'item_show_type': '0', 'tmp_version': '1', 'more_read_type': '0', 'appmsg_like_type': '2',
            'related_video_sn': '', 'related_video_num': '5', 'vid': '', 'is_pay_subscribe': '0',
            'pay_subscribe_uin_count': '0', 'has_red_packet_cover': '0', 'album_id': '1296223588617486300',
            'album_video_num': '5', 'cur_album_id': 'undefined', 'is_public_related_video': 'NaN',
            'encode_info_by_base64': 'undefined', 'exptype': '', 'export_key_extinfo': '', 'business_type': '0',
        }
        res = self.session.post(url=detail_url, data=data, headers=self.headers, cookies=self.cookies,
                                timeout=self._REQUEST_TIMEOUT, verify=False)
        stats = json.loads(res.text)  # parse once, query several times
        read_num = self._first_value(stats, 'read_num')
        has_read_num = bool(jsonpath.jsonpath(stats, '$..read_num'))
        like_num = self._first_value(stats, 'old_like_num')
        share_num = self._first_value(stats, 'share_num')
        show_read = self._first_value(stats, 'show_read')

        # Comments and their like counts; the article is identified by its own
        # mid/idx rather than a fixed sample value.
        comment_url = ('https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=' + self._quote(self.biz)
                       + '&appmsgid=' + mid + '&idx=' + idx + '&comment_id=' + comment_id
                       + '&offset=0&limit=100&uin=' + self._quote(self.uin) + '&key=' + self._quote(self.key)
                       + '&pass_ticket=' + self._quote(self.pass_ticket)
                       + '&wxtoken=&devicetype=Windows+10&clientversion=62060833&appmsg_token=')
        response = self.session.get(comment_url, headers=self.headers, cookies=self.cookies,
                                    timeout=self._REQUEST_TIMEOUT, verify=False)
        json_content = json.loads(response.text)
        comments = jsonpath.jsonpath(json_content, '$..content')
        comments_star_nums = jsonpath.jsonpath(json_content, '$..like_num')

        local_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print(str(local_time) + '请求完成, 文章标题为: ' + article_title)
        if not has_read_num:
            # No statistics in the response (e.g. the token was rejected).
            return None
        return (read_num, like_num, share_num, show_read,
                comments, comments_star_nums)
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution-NonCommercial-ShareAlike 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58 | Public License
59 |
60 | By exercising the Licensed Rights (defined below), You accept and agree
61 | to be bound by the terms and conditions of this Creative Commons
62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
63 | ("Public License"). To the extent this Public License may be
64 | interpreted as a contract, You are granted the Licensed Rights in
65 | consideration of Your acceptance of these terms and conditions, and the
66 | Licensor grants You such rights in consideration of benefits the
67 | Licensor receives from making the Licensed Material available under
68 | these terms and conditions.
69 |
70 |
71 | Section 1 -- Definitions.
72 |
73 | a. Adapted Material means material subject to Copyright and Similar
74 | Rights that is derived from or based upon the Licensed Material
75 | and in which the Licensed Material is translated, altered,
76 | arranged, transformed, or otherwise modified in a manner requiring
77 | permission under the Copyright and Similar Rights held by the
78 | Licensor. For purposes of this Public License, where the Licensed
79 | Material is a musical work, performance, or sound recording,
80 | Adapted Material is always produced where the Licensed Material is
81 | synched in timed relation with a moving image.
82 |
83 | b. Adapter's License means the license You apply to Your Copyright
84 | and Similar Rights in Your contributions to Adapted Material in
85 | accordance with the terms and conditions of this Public License.
86 |
87 | c. BY-NC-SA Compatible License means a license listed at
88 | creativecommons.org/compatiblelicenses, approved by Creative
89 | Commons as essentially the equivalent of this Public License.
90 |
91 | d. Copyright and Similar Rights means copyright and/or similar rights
92 | closely related to copyright including, without limitation,
93 | performance, broadcast, sound recording, and Sui Generis Database
94 | Rights, without regard to how the rights are labeled or
95 | categorized. For purposes of this Public License, the rights
96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
97 | Rights.
98 |
99 | e. Effective Technological Measures means those measures that, in the
100 | absence of proper authority, may not be circumvented under laws
101 | fulfilling obligations under Article 11 of the WIPO Copyright
102 | Treaty adopted on December 20, 1996, and/or similar international
103 | agreements.
104 |
105 | f. Exceptions and Limitations means fair use, fair dealing, and/or
106 | any other exception or limitation to Copyright and Similar Rights
107 | that applies to Your use of the Licensed Material.
108 |
109 | g. License Elements means the license attributes listed in the name
110 | of a Creative Commons Public License. The License Elements of this
111 | Public License are Attribution, NonCommercial, and ShareAlike.
112 |
113 | h. Licensed Material means the artistic or literary work, database,
114 | or other material to which the Licensor applied this Public
115 | License.
116 |
117 | i. Licensed Rights means the rights granted to You subject to the
118 | terms and conditions of this Public License, which are limited to
119 | all Copyright and Similar Rights that apply to Your use of the
120 | Licensed Material and that the Licensor has authority to license.
121 |
122 | j. Licensor means the individual(s) or entity(ies) granting rights
123 | under this Public License.
124 |
125 | k. NonCommercial means not primarily intended for or directed towards
126 | commercial advantage or monetary compensation. For purposes of
127 | this Public License, the exchange of the Licensed Material for
128 | other material subject to Copyright and Similar Rights by digital
129 | file-sharing or similar means is NonCommercial provided there is
130 | no payment of monetary compensation in connection with the
131 | exchange.
132 |
133 | l. Share means to provide material to the public by any means or
134 | process that requires permission under the Licensed Rights, such
135 | as reproduction, public display, public performance, distribution,
136 | dissemination, communication, or importation, and to make material
137 | available to the public including in ways that members of the
138 | public may access the material from a place and at a time
139 | individually chosen by them.
140 |
141 | m. Sui Generis Database Rights means rights other than copyright
142 | resulting from Directive 96/9/EC of the European Parliament and of
143 | the Council of 11 March 1996 on the legal protection of databases,
144 | as amended and/or succeeded, as well as other essentially
145 | equivalent rights anywhere in the world.
146 |
147 | n. You means the individual or entity exercising the Licensed Rights
148 | under this Public License. Your has a corresponding meaning.
149 |
150 |
151 | Section 2 -- Scope.
152 |
153 | a. License grant.
154 |
155 | 1. Subject to the terms and conditions of this Public License,
156 | the Licensor hereby grants You a worldwide, royalty-free,
157 | non-sublicensable, non-exclusive, irrevocable license to
158 | exercise the Licensed Rights in the Licensed Material to:
159 |
160 | a. reproduce and Share the Licensed Material, in whole or
161 | in part, for NonCommercial purposes only; and
162 |
163 | b. produce, reproduce, and Share Adapted Material for
164 | NonCommercial purposes only.
165 |
166 | 2. Exceptions and Limitations. For the avoidance of doubt, where
167 | Exceptions and Limitations apply to Your use, this Public
168 | License does not apply, and You do not need to comply with
169 | its terms and conditions.
170 |
171 | 3. Term. The term of this Public License is specified in Section
172 | 6(a).
173 |
174 | 4. Media and formats; technical modifications allowed. The
175 | Licensor authorizes You to exercise the Licensed Rights in
176 | all media and formats whether now known or hereafter created,
177 | and to make technical modifications necessary to do so. The
178 | Licensor waives and/or agrees not to assert any right or
179 | authority to forbid You from making technical modifications
180 | necessary to exercise the Licensed Rights, including
181 | technical modifications necessary to circumvent Effective
182 | Technological Measures. For purposes of this Public License,
183 | simply making modifications authorized by this Section 2(a)
184 | (4) never produces Adapted Material.
185 |
186 | 5. Downstream recipients.
187 |
188 | a. Offer from the Licensor -- Licensed Material. Every
189 | recipient of the Licensed Material automatically
190 | receives an offer from the Licensor to exercise the
191 | Licensed Rights under the terms and conditions of this
192 | Public License.
193 |
194 | b. Additional offer from the Licensor -- Adapted Material.
195 | Every recipient of Adapted Material from You
196 | automatically receives an offer from the Licensor to
197 | exercise the Licensed Rights in the Adapted Material
198 | under the conditions of the Adapter's License You apply.
199 |
200 | c. No downstream restrictions. You may not offer or impose
201 | any additional or different terms or conditions on, or
202 | apply any Effective Technological Measures to, the
203 | Licensed Material if doing so restricts exercise of the
204 | Licensed Rights by any recipient of the Licensed
205 | Material.
206 |
207 | 6. No endorsement. Nothing in this Public License constitutes or
208 | may be construed as permission to assert or imply that You
209 | are, or that Your use of the Licensed Material is, connected
210 | with, or sponsored, endorsed, or granted official status by,
211 | the Licensor or others designated to receive attribution as
212 | provided in Section 3(a)(1)(A)(i).
213 |
214 | b. Other rights.
215 |
216 | 1. Moral rights, such as the right of integrity, are not
217 | licensed under this Public License, nor are publicity,
218 | privacy, and/or other similar personality rights; however, to
219 | the extent possible, the Licensor waives and/or agrees not to
220 | assert any such rights held by the Licensor to the limited
221 | extent necessary to allow You to exercise the Licensed
222 | Rights, but not otherwise.
223 |
224 | 2. Patent and trademark rights are not licensed under this
225 | Public License.
226 |
227 | 3. To the extent possible, the Licensor waives any right to
228 | collect royalties from You for the exercise of the Licensed
229 | Rights, whether directly or through a collecting society
230 | under any voluntary or waivable statutory or compulsory
231 | licensing scheme. In all other cases the Licensor expressly
232 | reserves any right to collect such royalties, including when
233 | the Licensed Material is used other than for NonCommercial
234 | purposes.
235 |
236 |
237 | Section 3 -- License Conditions.
238 |
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 |
242 | a. Attribution.
243 |
244 | 1. If You Share the Licensed Material (including in modified
245 | form), You must:
246 |
247 | a. retain the following if it is supplied by the Licensor
248 | with the Licensed Material:
249 |
250 | i. identification of the creator(s) of the Licensed
251 | Material and any others designated to receive
252 | attribution, in any reasonable manner requested by
253 | the Licensor (including by pseudonym if
254 | designated);
255 |
256 | ii. a copyright notice;
257 |
258 | iii. a notice that refers to this Public License;
259 |
260 | iv. a notice that refers to the disclaimer of
261 | warranties;
262 |
263 | v. a URI or hyperlink to the Licensed Material to the
264 | extent reasonably practicable;
265 |
266 | b. indicate if You modified the Licensed Material and
267 | retain an indication of any previous modifications; and
268 |
269 | c. indicate the Licensed Material is licensed under this
270 | Public License, and include the text of, or the URI or
271 | hyperlink to, this Public License.
272 |
273 | 2. You may satisfy the conditions in Section 3(a)(1) in any
274 | reasonable manner based on the medium, means, and context in
275 | which You Share the Licensed Material. For example, it may be
276 | reasonable to satisfy the conditions by providing a URI or
277 | hyperlink to a resource that includes the required
278 | information.
279 | 3. If requested by the Licensor, You must remove any of the
280 | information required by Section 3(a)(1)(A) to the extent
281 | reasonably practicable.
282 |
283 | b. ShareAlike.
284 |
285 | In addition to the conditions in Section 3(a), if You Share
286 | Adapted Material You produce, the following conditions also apply.
287 |
288 | 1. The Adapter's License You apply must be a Creative Commons
289 | license with the same License Elements, this version or
290 | later, or a BY-NC-SA Compatible License.
291 |
292 | 2. You must include the text of, or the URI or hyperlink to, the
293 | Adapter's License You apply. You may satisfy this condition
294 | in any reasonable manner based on the medium, means, and
295 | context in which You Share Adapted Material.
296 |
297 | 3. You may not offer or impose any additional or different terms
298 | or conditions on, or apply any Effective Technological
299 | Measures to, Adapted Material that restrict exercise of the
300 | rights granted under the Adapter's License You apply.
301 |
302 |
303 | Section 4 -- Sui Generis Database Rights.
304 |
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 |
308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 | to extract, reuse, reproduce, and Share all or a substantial
310 | portion of the contents of the database for NonCommercial purposes
311 | only;
312 |
313 | b. if You include all or a substantial portion of the database
314 | contents in a database in which You have Sui Generis Database
315 | Rights, then the database in which You have Sui Generis Database
316 | Rights (but not its individual contents) is Adapted Material,
317 | including for purposes of Section 3(b); and
318 |
319 | c. You must comply with the conditions in Section 3(a) if You Share
320 | all or a substantial portion of the contents of the database.
321 |
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 |
326 |
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 |
329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 |
340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 |
350 | c. The disclaimer of warranties and limitation of liability provided
351 | above shall be interpreted in a manner that, to the extent
352 | possible, most closely approximates an absolute disclaimer and
353 | waiver of all liability.
354 |
355 |
356 | Section 6 -- Term and Termination.
357 |
358 | a. This Public License applies for the term of the Copyright and
359 | Similar Rights licensed here. However, if You fail to comply with
360 | this Public License, then Your rights under this Public License
361 | terminate automatically.
362 |
363 | b. Where Your right to use the Licensed Material has terminated under
364 | Section 6(a), it reinstates:
365 |
366 | 1. automatically as of the date the violation is cured, provided
367 | it is cured within 30 days of Your discovery of the
368 | violation; or
369 |
370 | 2. upon express reinstatement by the Licensor.
371 |
372 | For the avoidance of doubt, this Section 6(b) does not affect any
373 | right the Licensor may have to seek remedies for Your violations
374 | of this Public License.
375 |
376 | c. For the avoidance of doubt, the Licensor may also offer the
377 | Licensed Material under separate terms or conditions or stop
378 | distributing the Licensed Material at any time; however, doing so
379 | will not terminate this Public License.
380 |
381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 | License.
383 |
384 |
385 | Section 7 -- Other Terms and Conditions.
386 |
387 | a. The Licensor shall not be bound by any additional or different
388 | terms or conditions communicated by You unless expressly agreed.
389 |
390 | b. Any arrangements, understandings, or agreements regarding the
391 | Licensed Material not stated herein are separate from and
392 | independent of the terms and conditions of this Public License.
393 |
394 |
395 | Section 8 -- Interpretation.
396 |
397 | a. For the avoidance of doubt, this Public License does not, and
398 | shall not be interpreted to, reduce, limit, restrict, or impose
399 | conditions on any use of the Licensed Material that could lawfully
400 | be made without permission under this Public License.
401 |
402 | b. To the extent possible, if any provision of this Public License is
403 | deemed unenforceable, it shall be automatically reformed to the
404 | minimum extent necessary to make it enforceable. If the provision
405 | cannot be reformed, it shall be severed from this Public License
406 | without affecting the enforceability of the remaining terms and
407 | conditions.
408 |
409 | c. No term or condition of this Public License will be waived and no
410 | failure to comply consented to unless expressly agreed to by the
411 | Licensor.
412 |
413 | d. Nothing in this Public License constitutes or may be interpreted
414 | as a limitation upon, or waiver of, any privileges and immunities
415 | that apply to the Licensor or You, including from the legal
416 | processes of any jurisdiction or authority.
417 |
418 | =======================================================================
419 |
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 |
437 | Creative Commons may be contacted at creativecommons.org.
438 |
439 |
440 |
--------------------------------------------------------------------------------