├── src ├── __init__.py ├── tools.py ├── save_to_excel.py ├── base_spider.py ├── all_process.py └── wechat_funcs.py ├── README ├── qrcode_1749894334903.jpg ├── wechat_article_drawio.png ├── image-20251203185742977.png ├── image-20251203185757196.png ├── image-20251203185810439.png └── image-20251203185822659.png ├── requirements.txt ├── main.py ├── README.md └── LICENSE /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README/qrcode_1749894334903.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/qrcode_1749894334903.jpg -------------------------------------------------------------------------------- /README/wechat_article_drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/wechat_article_drawio.png -------------------------------------------------------------------------------- /README/image-20251203185742977.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185742977.png -------------------------------------------------------------------------------- /README/image-20251203185757196.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185757196.png -------------------------------------------------------------------------------- /README/image-20251203185810439.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeximm/Access_wechat_article/HEAD/README/image-20251203185810439.png 
# Save content to a cache file; used for debugging.
def save_cache(content, cache_path=r'src/cache/test_cache.txt'):
    """Write ``content`` to a debug cache file, creating its directory if needed.

    Args:
        content: Text to store. The file is overwritten on every call.
        cache_path: Destination file. Defaults to the original hard-coded
            location so existing ``save_cache(text)`` callers are unaffected.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import os  # local import: this module has no top-level import block

    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        # The cache directory is not shipped with the repository, so a plain
        # open() would fail with FileNotFoundError on a fresh checkout.
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(content)
| os.makedirs(self.excel_save_path, exist_ok=True) # 创建数据存储目录 24 | 25 | self.article_raw_path = os.path.join(self.excel_save_path, '文章列表 (article_list).xlsx') 26 | self.article_contents_path = os.path.join(self.excel_save_path, '文章内容 (article_contents).xlsx') 27 | self.article_details_path = os.path.join(self.excel_save_path, '文章详情 (article_detiles).xlsx') 28 | self.article_error_path = os.path.join(self.excel_save_path, '问题链接 (error_links).xlsx') 29 | 30 | def read_article_list(self, article_list_path): 31 | """ 32 | 功能描述: 33 | 读取文章列表 34 | 输入: 35 | 文章列表文件路径 36 | 输出: 37 | 文章列表 38 | """ 39 | # 读取文章列表 40 | article_list = pd.read_excel(article_list_path) 41 | all_article_list = [] 42 | for index, row in article_list.iterrows(): 43 | if pd.isna(row.iloc[6]): 44 | print('检测到存在空数据,跳过') 45 | continue 46 | else: 47 | all_article_list.append(row.to_list()) 48 | return all_article_list 49 | 50 | def save_article_content(self, file_path, columns,content_info): 51 | """ 52 | 功能描述: 53 | 保存单篇文章信息到excel文件 54 | 输入: 55 | 文件路径(已经合并好的路径) 56 | 表头 (一维数组) columns = ['本地存储时间', '文章发布时间'] # 列名 57 | 文章信息 (二维数组) content_info 58 | 输出: 59 | None 60 | """ 61 | # 创建 or 打开表格,检查文件是否存在,判断不存在时创建表格文件 62 | article_contents_path = file_path # 文章内容文件路径 63 | if not os.path.exists(article_contents_path): 64 | pd.DataFrame().to_excel(article_contents_path, index=False) 65 | frame_df = pd.read_excel(article_contents_path) # 读取表格内容,默认打开DataFrame对象包含第一个工作表中的数据 66 | 67 | # 将新数据转换为 DataFrame 并添加到现有 DataFrame 的末尾 68 | new_data_df = pd.DataFrame(content_info, columns=columns) 69 | df = pd.concat([frame_df, new_data_df], ignore_index=True) 70 | 71 | # 将更新后的数据写入 Excel 文件 72 | df.to_excel(article_contents_path, index=False) 73 | local_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) # 本地时间 74 | print(local_time + ' 存储路径>>>> ' + article_contents_path) 75 | 76 | -------------------------------------------------------------------------------- /main.py: 
def parse_page_range(raw_pages):
    """Parse the page selection typed by the user.

    Accepted forms are a single integer (``"3"``, ``"0"`` for "all pages"),
    a ``start-end`` range (``"2-5"``), or an empty string, which means the
    default of one page.

    Args:
        raw_pages: The raw text returned by ``input()``.

    Returns:
        ``(pages,)`` for a single number, ``(start, end)`` for a range, or
        ``None`` when the text is not a valid selection. The tuple is meant to
        be splatted into ``AccessWechatArticle.get_article_list``.
    """
    text = raw_pages.strip() or '1'
    try:
        if '-' in text:
            start_text, end_text = text.split('-', 1)
            return int(start_text), int(end_text)
        return (int(text),)
    except ValueError:
        # Covers "abc", "2-", "-3", "2-5-7" and similar typos.
        return None


if __name__=="__main__":
    # Imported here so that importing this module (e.g. to reuse
    # parse_page_range) does not pull in the whole spider stack.
    from src.all_process import AccessWechatArticle

    AWA = AccessWechatArticle()
    screen_text = '''请输入数字键!
数字键1: 获取公众号主页链接
数字键2: 获取公众号已发布的文章列表
数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章)
数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息
(请注意请求间隔,若请求太多太快可能会触发封禁!!)
输入其他任意字符退出!'''
    print('欢迎使用, ' + screen_text)
    while True:
        text = str(input('请输入功能数字: ')).strip()

        if text == '1':
            # Step 1: derive the account home-page link from any article link.
            screen_text1 = ('########## 默认公众号主页链接为“研招网资讯”,按回车键使用。##########\n'
                            '########## 若需获取其他公众号主页链接,请输入公众号下任意一篇已发布的文章链接。##########\n'
                            '请输入文章链接:')
            temporary_url = (input(screen_text1) or 'https://mp.weixin.qq.com/s/4r_LKJu0mOeUc70ZZXK9LA')
            AWA.get_public_main_link(temporary_url)

            input("按回车键继续...")
            print('\n' + screen_text)

        elif text == '2':
            # Step 2: fetch the published-article list; needs a captured token URL.
            screen_text21 = ('\n########## 以下内容需要用到fiddler工具 ##########\n'
                             ' (1) 在微信客户端打开步骤1获取到的链接,\n'
                             ' (2) 在fiddler中查看——主机地址为https://mp.weixin.qq.com, URL地址为: /mp/profile_ext?acti\n'
                             ' (3) 选中此项后按快捷键: Ctrl+U 复制该网址到剪贴板, 将内容粘贴到此处\n'
                             '请输入复制的链接(づ ̄ 3 ̄)づ:')
            access_token = input(screen_text21)
            screen_text22 = ('\n########## 获取指定页数的文章列表 ##########\n'
                             '一页文章数量约 15 篇, 请根据实际情况估算 (即: input * 15 = 文章数量)\n'
                             '例如: 获取前3页的文章列表, 请输入 3 \n'
                             ' 公众号下全部文章列表, 请输入: 0 (注意: 若输入0, 全部列表可能需要较长时间, 视文章数量而定)\n'
                             ' 公众号下第2页到第5页的文章列表, 请输入 2-5 \n'
                             '请输入需要下载的页数(默认: 1): ')
            page_args = parse_page_range(input(screen_text22))
            if page_args is None:
                # Previously a typo here raised ValueError and killed the session.
                print('页数输入有误, 请输入数字或形如 2-5 的范围!')
            else:
                AWA.get_article_list(access_token, *page_args)

            input("按回车键继续...")
            print('\n' + screen_text)

        elif text == '3':  # this step does not need a token
            screen_text31 = ('\n########## 保存公众号文章内容 ##########\n'
                             '输入: 已下载文章列表的公众号名称 (例如: 研招网资讯) 或 公众号的一篇文章链接 \n'
                             '(若当前会话已执行过步骤2, 可按回车跳过)\n'
                             '请输入: ')
            nickname = input(screen_text31)
            screen_text32 = ('\n########## 是否保存图片 ##########\n'
                             '是否保存图片? 是(输入任意值), 否(默认,按回车跳过): ')
            # Any non-empty answer is truthy and therefore enables image saving.
            save_img = input(screen_text32)
            AWA.save_article_content(str(nickname), save_img)

            input("按回车键继续...")
            print('\n' + screen_text)

        elif text == '4':
            # Step 4: same as step 3 plus per-article statistics; needs a token URL.
            screen_text41 = ('\n########## 保存公众号文章详情 ##########\n'
                             '以下内容需要用到fiddler工具, 参考步骤2将 URL地址 粘贴到此处\n'
                             '请输入复制的链接(づ ̄ 3 ̄)づ: ')
            access_token = input(screen_text41)
            AWA.save_article_details(access_token)

            input("按回车键继续...")
            print('\n未成功获取的链接已保存到本地。' + '\n' + screen_text)

        else:
            print('\n已成功退出!')
            break
3) # 保留3位小数 49 | print('为预防被封禁,开始延时操作,延时时间:' + str(second_num) + '秒') 50 | 51 | time.sleep(second_num) 52 | 53 | def delay_short_time(self): 54 | """ 55 | 功能描述: 56 | 延时函数, 用于避免频繁请求导致的IP被封禁 57 | 输入: 58 | 无 59 | 输出: 60 | 无 61 | """ 62 | second_max_num = 1.5 63 | second_min_num = 0.1 64 | second_num = random.uniform(second_min_num, second_max_num) 65 | second_num = round(second_num, 3) # 保留3位小数 66 | print('为预防被封禁, 短延时:' + str(second_num) + '秒') 67 | 68 | time.sleep(second_num) 69 | 70 | def get_an_article(self, content_url): 71 | """ 72 | 功能描述: 73 | 单独获取一篇文章, 只负责是否获取成功, 成功则返回文章内容, 失败则返回空字符串 74 | 输入: 75 | 微信文章链接(永久链接或短链接) 76 | 输出: 77 | 1.状态码 78 | 2.文章内容 79 | """ 80 | res = self.session.get( 81 | url=content_url, 82 | headers=self.headers, 83 | cookies=self.cookies, 84 | verify=False) 85 | self.delay_short_time() 86 | # 验证请求 87 | if 'var createTime = ' in res.text: # 正常获取到文章内容 88 | print('正常获取到文章内容') 89 | # save_cache(res.text) # 保存文章内容到缓存文件,方便后续检查内容 90 | return {'content_flag': 1, 'content': res.text} 91 | elif '>当前环境异常, 完成验证后即可继续访问 <' in res.text: 92 | print('当前环境异常, 请检查链接后访问!!!') # 代码访问遇到人机验证,需进行验证操作 93 | return {'content_flag': 0, 'current_url': content_url} 94 | elif '操作频繁, 请稍后再试' in res.text: 95 | print('操作频繁了, 等会再弄或换ip弄!!!') # 遇到次数较少,如有遇到请前往GitHub留言 96 | return {'content_flag': 0, 'current_url': content_url} 97 | else: 98 | print('出现其他问题, 请查找原因后再试!!!!\n' 99 | '************************************\n' 100 | '一般情况下, 这篇文章可能是一整页的图片, 没有文本内容, 具体原因待讨论.该文章链接为:\n' 101 | + content_url + '\n' 102 | '************************************\n') # 出现错误信息,如有遇到请前往GitHub留言 103 | return {'content_flag': 0, 'current_url': content_url} 104 | 105 | def format_content(self, content): 106 | """ 107 | 功能描述: 108 | 格式化文章内容,提取出文章中的文本内容 109 | 输入: 110 | 文章内容 111 | 输出: 112 | 格式化后的文章内容 113 | """ 114 | # 整理文章关键信息 115 | # nickname = re.search(r'var nickname.*"(.*?)".*', article_content).group(1) # 公众号名称 116 | # article_link = re.search(r'var msg_link = .*"(.*?)".*', article_content).group(1) # 文章链接 
117 | # createTime = re.search(r"var createTime = '(.*?)'.*", article_content).group(1) # 文章创建时间 118 | # # year, month, day = createTime.split(" ")[0].split("-") # 年,月,日 119 | # # hour, minute = createTime.split(" ")[1].split(":") # 小时,分钟 120 | # author = re.search(r'var author = "(.*?)".*', article_content).group(1) # 文章作者 121 | # print(article_content) 122 | 123 | # 整理文章关键信息 124 | soup = BeautifulSoup(content, 'lxml') 125 | self.nickname = soup.find("a", id="js_name").get_text().strip() # 公众号名称 126 | author = soup.find("meta", {"name": "author"}).get("content").strip() # 文章作者 127 | article_link = soup.find("meta", property="og:url").get("content") # 文章链接 128 | article_title = soup.find("h1", id="activity-name").get_text().strip() # 文章标题 129 | print('当前文章为>>>> ' + article_title) 130 | 131 | # 将文字内容转换为列表形式存储 132 | original_texts = soup.getText().split('\n') # 将页面所有的文本内容提取,并转为列表形式 133 | format_texts = list(filter(lambda x: bool(x.strip()), original_texts)) # filter() 函数可以根据指定的函数对可迭代对象进行过滤 134 | 135 | # 正则方式 136 | createTime = re.search(r"var createTime = '(.*?)'.*", content).group(1) # 文章创建时间 137 | year, month, day = createTime.split(" ")[0].split("-") # 年,月,日 138 | hour, minute = createTime.split(" ")[1].split(":") 139 | 140 | # 提取公众号biz值, 拼凑主页链接 141 | appuin = re.search(r"var appuin = (.*?);", content).group(1) # 公众号biz值 142 | quoted_values = re.findall(r'["\']([^"\']*)["\']', appuin) 143 | for value in quoted_values: 144 | if value: 145 | self.biz = value 146 | # 公众号主页链接 147 | self.public_main_link = ('https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=' 148 | + self.biz + '&scene=124#wechat_redirect') 149 | 150 | return { 151 | 'nickname': self.nickname, 152 | 'author': author, 153 | 'article_link': article_link, 154 | 'article_title': article_title, 155 | 'createTime': createTime, 156 | 'content': content, 157 | 'format_texts': format_texts, # 格式化后的文章内容 158 | } 159 | 160 | def save_article_img(self, data_path, content_info): 161 | """ 162 | 功能描述: 163 | 
保存单篇文章的图片内容到本地目录 164 | 输入: 165 | 文章信息 166 | 输出: 167 | None 168 | """ 169 | # 文章图片保存目录 170 | nickname_path = os.path.join(data_path, '公众号----' + self.nickname) 171 | os.makedirs(nickname_path, exist_ok=True) # 创建数据存储目录 172 | 173 | # 适配Windows系统路径 174 | article_title = content_info['article_title'] # 文章标题 175 | article_title_win = re.sub(r'[\\/*?:"<>|].', '_', article_title) # Windows下标题 176 | article_title_win = article_title_win.replace('.', '') # Windows下标题,去除小数点,防止自动省略报错 177 | title_time = content_info['createTime'].replace(':', '_') # 文章发布时间,Windows下文件名不能包含冒号 178 | 179 | # 创建图片保存目录 180 | img_save_path = os.path.join(nickname_path, title_time + ' ---- ' + article_title_win) 181 | os.makedirs(img_save_path, exist_ok=True) 182 | print('设置文章图片存储路径>>>> ' + img_save_path) 183 | 184 | # 保存该文章图片内容 185 | images = content_info['content'].split('https://mmbiz.qpic.cn/') 186 | # print(images) 187 | 188 | for i in range(0, len(images) - 1): 189 | image_url = 'https://mmbiz.qpic.cn/' + images[i + 1].split('"')[0] 190 | # print('正在获取图片:' + image_url) 191 | image_name = '' 192 | 193 | try: 194 | # 添加随机延迟,避免请求过快 195 | time.sleep(0.5 + random.random()) 196 | 197 | # 使用session发送请求,设置超时 198 | response = self.session.get(image_url, verify=False, timeout=self.timeout) 199 | 200 | # 检查响应状态码 201 | if response.status_code == 200: 202 | # 图片命名 203 | img_hz = ['gif', 'jpg', 'jpeg', 'png', 'webp'] 204 | for imghz in img_hz: 205 | if imghz in image_url: 206 | image_name = str(i + 1) + '.' 
+ imghz 207 | if image_name == '': # 如果链接中没有标明图片属性 208 | image_name = str(i + 1) + '.jpg' 209 | file_path = os.path.join(img_save_path, image_name) 210 | # 保存图片 211 | with open(file_path, 'wb') as f: 212 | f.write(response.content) 213 | print(f"已成功下载图片: {file_path}") 214 | else: 215 | print(f"无法下载图片,状态码: {response.status_code}") 216 | except Exception as e: 217 | print(f"下载图片时出错:{str(e)}") 218 | time.sleep(1) # 重试前等待 219 | print('已保存文章图片>>>> ' + article_title) 220 | 221 | -------------------------------------------------------------------------------- /src/all_process.py: -------------------------------------------------------------------------------- 1 | """ 2 | 汇总所有流程 3 | """ 4 | from src.base_spider import BaseSpider 5 | from src.wechat_funcs import ArticleDetail 6 | from src.save_to_excel import SaveToExcel 7 | 8 | import os 9 | 10 | 11 | data_path = r'all_data' # 数据存储目录 12 | 13 | class AccessWechatArticle: 14 | def __init__(self): 15 | self.base_spider = BaseSpider() # 获取主页链接 16 | self.article_detail = ArticleDetail() # 共用微信token 17 | self.nickname = None 18 | self.public_token_link = None 19 | 20 | def get_public_main_link(self, article_url): 21 | """ 22 | 获取文章的公共号主页链接 23 | """ 24 | content = self.base_spider.get_an_article(article_url) 25 | if content['content_flag'] == 1: 26 | self.base_spider.format_content(content['content']) 27 | self.nickname = self.base_spider.nickname 28 | public_main_link = self.base_spider.public_main_link 29 | print(f'公众号名称:{self.nickname}\n公众号主页: ↘ ↘ ↘ ↘\n{public_main_link}') 30 | print('将此链接 ( ̄︶ ̄)↗ ↗ ↗ ↗ 粘贴发送到 "微信PC端-文件传输助手"') 31 | else: 32 | print('获取文章内容失败') 33 | return None 34 | 35 | def get_article_list(self, public_token_link, page_start, page_end=1): 36 | """ 37 | 获取文章列表 38 | """ 39 | # 检查输入参数是否合法 40 | access_token = self.article_detail.format_raw_link(public_token_link) 41 | if not access_token: 42 | print('请检查输入参数是否正确') 43 | return None 44 | print('参数齐全,开始获取文章信息,默认状态获取全部文章') 45 | self.public_token_link = 
public_token_link # 供其他功能使用 46 | # 获取文章列表 [[temproary_page, local_time, create_time, article_title, content_cover, content_url, format_url]] 47 | list_info = None 48 | try: 49 | if page_start == 0 and page_end == 1: 50 | list_info = self.article_detail.whole_article_list(0,0) 51 | elif page_start > page_end and page_end == 1: 52 | print('防呆输入,已自动交换页码') 53 | list_info = self.article_detail.whole_article_list(page_end, page_start) 54 | else: 55 | list_info = self.article_detail.whole_article_list(page_start, page_end) 56 | except: 57 | print('获取文章列表失败') 58 | 59 | # 保存操作, 先获取公众号名称 60 | if self.nickname is None and list_info is not None: 61 | # 获取公众号名称 62 | article_url = list_info[0][6] 63 | content = self.base_spider.get_an_article(article_url) 64 | if content['content_flag'] == 1: 65 | self.base_spider.format_content(content['content']) 66 | self.nickname = self.base_spider.nickname 67 | elif self.nickname is not None: 68 | print('已检测到公众号名称: ' + self.nickname + '\n') 69 | else: 70 | print('未获取到文章列表, 请检查!!!') 71 | return None 72 | 73 | # 实例化存储对象 74 | save_to_excel = SaveToExcel(data_path, self.nickname) 75 | if list_info is None: 76 | print('获取到 0 篇文章, 请检查!!!') 77 | return None 78 | else: 79 | # 保存文章列表 80 | article_list_columns = ['临时页码', '本地保存时间', '文章发布时间', '文章标题', \ 81 | '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接'] # 列名 82 | article_list_savepath = save_to_excel.article_raw_path 83 | save_to_excel.save_article_content(article_list_savepath, article_list_columns, list_info) 84 | print('文章列表保存成功') 85 | return None 86 | 87 | def save_article_content(self, nickname=None, save_img=False): 88 | """ 89 | 保存已有的文章列表内容 90 | 输入: 91 | 公众号名称, 如已获取过主页链接, 则跳过输入 92 | 默认认为已获取文章列表 93 | 输出: 94 | 无(文章内容保存到Excel文件中) 95 | """ 96 | if nickname == '' and self.nickname is None: 97 | print('检测到当前会话未涉及公众号信息获取操作!!!') 98 | print('请输入需要保存的公众号名称') 99 | return None 100 | elif nickname == '' and self.nickname is not None: 101 | print('已检测到公众号名称: ' + self.nickname + '\n') 102 | nickname = self.nickname 
103 | else: 104 | print('当前输入公众号名称: ' + nickname + '\n') 105 | self.nickname = nickname 106 | 107 | # 实例化存储对象 108 | save_to_excel = SaveToExcel(data_path, nickname) 109 | article_list_path = save_to_excel.article_raw_path 110 | article_error_list = [] 111 | if not os.path.exists(article_list_path): # 检查文件是否存在 112 | print('请先获取文章列表, 并确认已保存文章列表到Excel文件中, 再执行此操作') 113 | return None 114 | # 读取文章列表 115 | article_list = save_to_excel.read_article_list(article_list_path) 116 | # 遍历文章列表, 保存文章内容 117 | for article in article_list: 118 | # 获取文章内容 119 | content = self.base_spider.get_an_article(article[6]) 120 | if content['content_flag'] == 1: # 检查文章内容是否获取成功 121 | article_content = self.base_spider.format_content(content['content']) 122 | # 修改文章创建时间 123 | article[2] = article_content['createTime'] 124 | # 添加格式化后的文章内容 125 | article.append(str(article_content['format_texts'])) 126 | # 保存单篇文章图片 127 | self.base_spider.nickname = nickname 128 | if save_img: self.base_spider.save_article_img(data_path, article_content) 129 | else: 130 | # print(f'获取文章内容失败, 文章链接: {article[6]}') 131 | article_list.remove(article) # 删除当前文章 132 | article_error_list.append(article) 133 | 134 | 135 | # 保存文章内容 136 | article_list_columns = ['临时页码', '本地保存时间', '文章发布时间', '文章标题', \ 137 | '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接', '文章内容'] # 列名 138 | article_list_savepath = save_to_excel.article_contents_path 139 | save_to_excel.save_article_content(article_list_savepath, article_list_columns, article_list) 140 | 141 | # 保存错误文章列表 142 | article_list_columns = ['临时页码', '本地保存时间', '文章发布时间', '文章标题', \ 143 | '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接'] # 列名 144 | article_list_savepath = save_to_excel.article_error_path 145 | save_to_excel.save_article_content(article_list_savepath, article_list_columns, article_error_list) 146 | 147 | 148 | def save_article_details(self, public_token_link): 149 | """ 150 | 功能描述: 151 | 保存文章的详情数据 152 | 输入: 153 | 微信客户端token 154 | 输出: 155 | 无(文章详情保存到Excel文件中) 156 | """ 157 | # 检查输入参数是否合法 158 | 
access_token = self.article_detail.format_raw_link(public_token_link) 159 | if not access_token: 160 | print('请检查输入参数是否正确') 161 | return None 162 | print('参数齐全,开始获取文章信息,默认状态获取全部文章') 163 | 164 | # 使用token获取公众号名称 165 | self.article_detail.get_detail_nickname() 166 | 167 | # 实例化存储对象 168 | save_to_excel = SaveToExcel(data_path, self.article_detail.nickname) 169 | article_list_path = save_to_excel.article_raw_path # 文章列表路径 170 | article_error_list = [] 171 | if not os.path.exists(article_list_path): # 检查文件是否存在 172 | print('请先获取文章列表, 并确认已保存文章列表到Excel文件中, 再执行此操作') 173 | return None 174 | # 读取文章列表 175 | article_list = save_to_excel.read_article_list(article_list_path) 176 | # 遍历文章列表, 保存文章内容 177 | for article in article_list: 178 | # 获取文章内容 179 | content = self.base_spider.get_an_article(article[6]) 180 | if content['content_flag'] == 1: # 检查文章内容是否获取成功 181 | article_content = self.base_spider.format_content(content['content']) 182 | # 修改文章创建时间 183 | article[2] = article_content['createTime'] 184 | # 添加格式化后的文章内容 185 | article.append(str(article_content['format_texts'])) 186 | # 获取文章详情, 仅当文章内容没问题时执行 187 | article_detail = self.article_detail.get_detail_content(article[5], article[3], content['content']) 188 | if article_detail is None: article.append('******文章详情获取失败!!!*******') 189 | else: article.extend(article_detail) # 批量添加文章详情 190 | else: 191 | # print(f'获取文章内容失败, 文章链接: {article[6]}') 192 | article_list.remove(article) # 删除当前文章 193 | article_error_list.append(article) 194 | 195 | 196 | # 保存文章内容 197 | article_list_columns = ['临时页码', '本地保存时间', '文章发布时间', '文章标题', \ 198 | '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接', '文章内容', \ 199 | '阅读量', '点赞数', '转发数', '在看数', '评论数', '评论点赞数'] # 列名 200 | article_list_savepath = save_to_excel.article_details_path 201 | save_to_excel.save_article_content(article_list_savepath, article_list_columns, article_list) 202 | 203 | # 保存错误文章列表 204 | article_list_columns = ['临时页码', '本地保存时间', '文章发布时间', '文章标题', \ 205 | '文章封面链接', '文章原始链接(直接访问会提示验证)', '文章直连链接'] # 
列名 206 | article_list_savepath = save_to_excel.article_error_path 207 | save_to_excel.save_article_content(article_list_savepath, article_list_columns, article_error_list) 208 | 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 微信公众号/文章 获取(Access_wechat_article) 2 | 3 | 更新时间:2025-12-03 4 | 5 | 本项目是基于Python语言的爬虫程序,支持对微信公众号文章内容获取 6 | 7 | 目前支持 Windows / Linux 开箱即用,**建议使用虚拟环境运行项目** 8 | 9 | 如果感兴趣,请 **Fork** 项目后自行研究使用 10 | 11 | 使用过程中如遇到错误,欢迎提交 [issues](https://github.com/yeximm/Access_wechat_article/issues) 来讨论 12 | 13 | **注**:请在 [GitHub](https://github.com/) 平台提交 [issues](https://github.com/yeximm/Access_wechat_article/issues) 14 | 15 | ## 一、主要功能 16 | 17 | 1. 获取**公众号主页链接**,通过微信内置浏览器可直接打开 18 | 2. 获取公众号**已发布**的文章列表(**微信公众号**下的历史文章) 19 | 3. 批量下载公众号文章的**网页文本数据** 20 | 4. 获取微信公众号文章的**所有信息**,如阅读量、点赞数、转发数、评论、评论点赞等信息。 21 | 22 | ## 二、项目开发环境及工具 23 | 24 | 1. 系统环境:Windows 11 ×64 25 | 2. 程序运行环境:python 3.13 26 | 3. 涉及应用:微信**PC版**,当前项目已适配的微信版本:**`4.1.5.16`** 27 | 4. 
使用工具:[Fiddler Classic](https://www.telerik.com/fiddler/fiddler-classic),当前项目适配的Fiddler Classic版本:**`v5.0.20253.3311`** 28 | 29 | **目录架构** 30 | 31 | ```bash 32 | Access_wechat_article/ 33 | ├── .venv/ # 虚拟环境目录 34 | ├── src/ # 源代码目录 35 | │ ├── all_process.py # 流程汇总 36 | │ ├── base_spider.py # 基础爬虫模块 37 | │ ├── save_to_excel.py # 存储模块 38 | │ ├── tools.py # 其他工具模块 39 | │ └── wechat_funcs.py # 微信token模块 40 | ├── LICENSE # 许可凭证 41 | ├── main.py # 项目主文件 42 | ├── README/ # 项目说明文档资源(图片、文件) 43 | ├── README.md # 项目说明文档 44 | └── requirements.txt # 项目依赖列表 45 | ``` 46 | 47 | ## 三、程序使用 48 | 49 | ### 3.1下载 / Download 50 | 51 | - 下载地址:[https://github.com/yeximm/Access_wechat_article/releases](https://github.com/yeximm/Access_wechat_article/releases) 52 | - 👆👆👆以上为本项目发布页地址,选取所需版本下载即可。 53 | 54 | 55 | - 存储库快照:[Github_master](https://github.com/yeximm/Access_wechat_article/archive/refs/heads/master.zip) 56 | - 存储库快照等同于 [Releases](https://github.com/yeximm/Access_wechat_article/releases) 中的 [Source Code (zip)](https://github.com/yeximm/Access_wechat_article/archive/refs/heads/master.zip) 等,包含 `README` 等内容 57 | 58 | ### 3.2 Python环境配置 59 | 60 | (1)创建虚拟环境 61 | 62 | ```bash 63 | python -m venv .venv 64 | ``` 65 | 66 | `venv`指定存放环境的目录,一般使用 `venv`,这是一个不成文的规定。 67 | 68 | (2)**激活**环境 69 | 70 | - Windows 71 | 72 | ```bash 73 | .\.venv\Scripts\activate 74 | ``` 75 | 76 | - Unix/macOS 77 | 78 | ```bash 79 | source .venv/bin/activate 80 | ``` 81 | 82 | (3)退出环境 83 | 84 | ```bash 85 | deactivate 86 | ``` 87 | 88 | ### 3.3 安装项目依赖包 89 | 90 | `requirements.txt`中包含所需python包文件名称,用来批量安装python包文件 91 | 92 | 安装命令: 93 | 94 | ```bash 95 | pip install -r requirements.txt 96 | ``` 97 | 98 | ### 3.4 运行参数 99 | 100 | 1. 项目主文件为:`main.py`,其功能调用方式详见于此。 101 | 项目中**生成文件的存储路径**为:`./all_data`(该目录由程序**自动创建**) 102 | 2. 运行命令: 103 | 104 | 1. 首先进入**虚拟环境**(详见**激活**虚拟环境) 105 | 106 | 2. 安装python包文件(如已安装则进行下一步) 107 | 108 | 3. 在项目目录运行: 109 | 110 | - ```bash 111 | python main.py 112 | ``` 113 | 114 | 4. 根据控制台提示输入 115 | 116 | 5. 
如需**自定义功能**,参照`main.py`中的函数调用方式自行编写。 117 | 118 | ## 四、功能示例 119 | 120 | ### 4.1 功能1 121 | 122 | ```bash 123 | 欢迎使用, 请输入数字键! 124 | 数字键1: 获取公众号主页链接 125 | 数字键2: 获取公众号已发布的文章列表 126 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章) 127 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息 128 | (请注意请求间隔,若请求太多太快可能会触发封禁!!) 129 | 输入其他任意字符退出! 130 | 请输入功能数字: 1 131 | ``` 132 | 133 | **程序执行结果** 134 | 135 | ```bash 136 | ########## 默认公众号主页链接为“研招网资讯”,按回车键使用。########## 137 | ########## 若需获取其他公众号主页链接,请输入公众号下任意一篇已发布的文章链接。########## 138 | 请输入文章链接:https://mp.weixin.qq.com/s/ZNXDr2ErJno9-NdS4RYDCg 139 | 为预防被封禁, 短延时:0.906秒 140 | 正常获取到文章内容 141 | 当前文章为>>>> 法国总统马克龙抵达北京开始访华 142 | 公众号名称:新华网 143 | 公众号主页: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzA4MjQxNjQzMA==&scene=124#wechat_redirect 144 | 将此链接 ( ̄︶ ̄)↗ ↗ ↗ ↗ 粘贴发送到 "微信PC端-文件传输助手" 145 | 按回车键继续... 146 | ``` 147 | 148 | ### 4.2 功能2 149 | 150 | ```bash 151 | 请输入数字键! 152 | 数字键1: 获取公众号主页链接 153 | 数字键2: 获取公众号已发布的文章列表 154 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章) 155 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息 156 | (请注意请求间隔,若请求太多太快可能会触发封禁!!) 157 | 输入其他任意字符退出! 158 | 请输入功能数字: 2 159 | ``` 160 | 161 | **输入参数** 162 | 163 | ```bash 164 | ########## 以下内容需要用到fiddler工具 ########## 165 | (1) 在微信客户端打开步骤1获取到的链接, 166 | (2) 在fiddler中查看——主机地址为https://mp.weixin.qq.com, URL地址为: /mp/profile_ext?acti 167 | (3) 选中此项后按快捷键: Ctrl+U 复制该网址到剪贴板, 将内容粘贴到此处 168 | 请输入复制的链接(づ ̄ 3 ̄)づ:https://mp.weixin.qq.com/mp/profile_ext?xxxxxx... 
169 | ``` 170 | 171 | ```bash 172 | ########## 获取指定页数的文章列表 ########## 173 | 一页文章数量约 15 篇, 请根据实际情况估算 (即: input * 15 = 文章数量) 174 | 例如: 获取前3页的文章列表, 请输入 3 175 | 公众号下全部文章列表, 请输入: 0 (注意: 若输入0, 全部列表可能需要较长时间, 视文章数量而定) 176 | 公众号下第2页到第5页的文章列表, 请输入 2-5 177 | 请输入需要下载的页数(默认: 1): 2-5 178 | ``` 179 | 180 | **程序执行结果** 181 | 182 | ```bash 183 | 参数齐全,开始获取文章信息,默认状态获取全部文章 184 | 获取 2 至 5 页的文章列表 185 | 正在获取第 2 页文章列表 186 | 该页包含 15 篇文章 187 | 为预防被封禁,开始延时操作,延时时间:4.962秒 188 | 正在获取第 3 页文章列表 189 | 该页包含 13 篇文章 190 | 为预防被封禁,开始延时操作,延时时间:3.599秒 191 | 正在获取第 4 页文章列表 192 | 该页包含 14 篇文章 193 | 为预防被封禁,开始延时操作,延时时间:6.705秒 194 | 正在获取第 5 页文章列表 195 | 该页包含 12 篇文章 196 | 为预防被封禁,开始延时操作,延时时间:3.075秒 197 | 已检测到公众号名称: 新华网 198 | 199 | 2025-12-03 17:37:16 存储路径>>>> all_data\公众号----新华网\文章列表 (article_list).xlsx 200 | 文章列表保存成功 201 | 按回车键继续... 202 | ``` 203 | 204 | ### 4.3 功能3 205 | 206 | ```bash 207 | 请输入数字键! 208 | 数字键1: 获取公众号主页链接 209 | 数字键2: 获取公众号已发布的文章列表 210 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章) 211 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息 212 | (请注意请求间隔,若请求太多太快可能会触发封禁!!) 213 | 输入其他任意字符退出! 214 | 请输入功能数字: 3 215 | ``` 216 | 217 | **输入参数** 218 | 219 | ```bash 220 | ########## 保存公众号文章内容 ########## 221 | 输入: 已下载文章列表的公众号名称 (例如: 研招网资讯) 或 公众号的一篇文章链接 222 | (若当前会话已执行过步骤2, 可按回车跳过) 223 | 请输入: 新华网 224 | ``` 225 | 226 | ```bash 227 | ########## 是否保存图片 ########## 228 | 是否保存图片? 是(输入任意值), 否(默认,按回车跳过):y 229 | ``` 230 | 231 | **程序执行结果** 232 | 233 | ```bash 234 | 为预防被封禁, 短延时:1.043秒 235 | 正常获取到文章内容 236 | 当前文章为>>>> “时速能破150公里”?这种“爆改”太吓人! 237 | 为预防被封禁, 短延时:0.988秒 238 | 正常获取到文章内容 239 | 当前文章为>>>> 流感季,发烧了怎么办? 240 | ... 241 | 正常获取到文章内容 242 | 当前文章为>>>> 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→ 243 | 2025-12-03 17:40:43 存储路径>>>> all_data\公众号----新华网\文章内容 (article_contents).xlsx 244 | 2025-12-03 17:40:43 存储路径>>>> all_data\公众号----新华网\问题链接 (error_links).xlsx 245 | 按回车键继续... 246 | ``` 247 | 248 | ### 4.4 功能4 249 | 250 | ```bash 251 | 请输入数字键! 
252 | 数字键1: 获取公众号主页链接 253 | 数字键2: 获取公众号已发布的文章列表 254 | 数字键3: 下载公众号文章内容 (默认下载 "文章列表" 中的所有文章) 255 | 数字键4: 同功能3, 另外获取每篇文章的 "阅读量"、"点赞数"等信息 256 | (请注意请求间隔,若请求太多太快可能会触发封禁!!) 257 | 输入其他任意字符退出! 258 | 请输入功能数字: 4 259 | ``` 260 | 261 | **输入参数** 262 | 263 | ```bash 264 | ########## 保存公众号文章详情 ########## 265 | 以下内容需要用到fiddler工具, 参考步骤2将 URL地址 粘贴到此处 266 | 请输入复制的链接(づ ̄ 3 ̄)づ: https://mp.weixin.qq.com/mp/profile_ext?xxxxxx... 267 | ``` 268 | 269 | **程序执行结果** 270 | 271 | ```bash 272 | 参数齐全,开始获取文章信息,默认状态获取全部文章 273 | 获取 1 至 1 页的文章列表 274 | 正在获取第 1 页文章列表 275 | 该页包含 13 篇文章 276 | 为预防被封禁,开始延时操作,延时时间:5.049秒 277 | 为预防被封禁, 短延时:0.148秒 278 | 正常获取到文章内容 279 | 当前文章为>>>> 湖南省人大常委会原党组成员、副主任叶红专被查 280 | 为预防被封禁, 短延时:0.702秒 281 | ... 282 | 正常获取到文章内容 283 | 当前文章为>>>> 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→ 284 | 为预防被封禁,开始延时操作,延时时间:5.352秒 285 | 2025-12-03 17:48:43请求完成, 文章标题为: 武装袭击事件,中国公民3死1伤!我使馆紧急提醒→ 286 | 2025-12-03 17:48:44 存储路径>>>> all_data\公众号----新华网\文章详情 (article_detiles).xlsx 287 | 2025-12-03 17:48:44 存储路径>>>> all_data\公众号----新华网\问题链接 (error_links).xlsx 288 | 按回车键继续... 289 | ``` 290 | 291 | ## 五、鼓励一下 292 | 293 | 开源不易,若此项目有帮到你,望你能动用你的发财小手**Star**☆一下。 294 | 295 | 如有遇到代码方面的问题,欢迎一起讨论,你的鼓励是这个项目继续更新的最大动力! 296 | 297 |

298 | 299 |

300 | 301 | 302 | 303 | 另外,十分感谢大家对于本项目的关注。 304 | 305 | [![Stargazers repo roster for @yeximm/Access_wechat_article](https://reporoster.com/stars/yeximm/Access_wechat_article)](https://github.com/yeximm/Access_wechat_article/stargazers) 306 | [![Forkers repo roster for @yeximm/Access_wechat_article](https://reporoster.com/forks/yeximm/Access_wechat_article)](https://github.com/yeximm/Access_wechat_article/network/members) 307 | 308 | ## 六、程序流程图 309 | 310 | ![wechat_article_drawio](./README/wechat_article_drawio.png) 311 | 312 | ### 6.1 基础爬虫模块 313 | 314 | ![image-20251203185742977](README/image-20251203185742977.png) 315 | 316 | ### 6.2 获取文章列表模块(需token) 317 | 318 | ![image-20251203185757196](README/image-20251203185757196.png) 319 | 320 | ### 6.3 文章内容获取 321 | 322 | ![image-20251203185810439](README/image-20251203185810439.png) 323 | 324 | ### 6.4 文章详细信息获取(需token) 325 | 326 | ![image-20251203185822659](README/image-20251203185822659.png) 327 | 328 | ## LICENSE 329 | 330 | 本作品采用许可协议 Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ,简称 **[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)**。 331 | 332 | 所有以任何方式查看本仓库内容的人、或直接或间接使用本仓库内容的使用者都应仔细阅读此声明。本仓库管理者保留随时更改或补充此免责声明的权利。一旦使用、复制、修改了本仓库内容,则视为您已接受此免责声明。 333 | 334 | 项目内容仅供学习研究,请勿用于商业用途。如对本仓库内容的功能有需求,应自行开发相关功能。所有基于本仓库内容的源代码,进行的任何修改,为其他个人或组织的自发行为,与本仓库内容没有任何直接或间接的关系,所造成的一切后果亦与本仓库内容和本仓库管理者无关。 335 | 336 | 本仓库内容中涉及的第三方硬件、软件等,与本仓库内容没有任何直接或间接的关系。本仓库内容仅对部署和使用过程进行客观描述,不代表支持使用任何第三方硬件、软件。使用任何第三方硬件、软件,所造成的一切后果由使用的个人或组织承担,与本仓库内容无关。 337 | 338 | ## Star History 339 | 340 | [![Star History Chart](https://api.star-history.com/svg?repos=yeximm/Access_wechat_article&type=Date)](https://www.star-history.com/#yeximm/Access_wechat_article&Date) 341 | 342 | -------------------------------------------------------------------------------- /src/wechat_funcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 该模块包含了微信公众号token相关的函数 3 | """ 4 | from urllib import parse 
# urllib.parse (imported above) is used to split the token URL into its query parameters
import re
import time
import json
import random
import jsonpath

from src.base_spider import BaseSpider


class ArticleDetail(BaseSpider):
    """Fetch article lists and per-article statistics of a WeChat official account.

    All requests are authenticated with the credentials (``__biz``, ``uin``,
    ``key``, ``pass_ticket``) extracted from a token URL by
    :meth:`format_raw_link`.

    The class relies on members that are not defined in this module and are
    presumably supplied by ``BaseSpider``: ``session``, ``headers``,
    ``cookies``, ``delay_time()``, ``get_an_article()`` and
    ``format_content()``.
    """

    def __init__(self):
        super().__init__()
        self.biz = None
        self.uin = None
        self.key = None
        self.pass_ticket = None
        # Historical placeholder attribute; kept so external code that reads
        # ``.text`` from an instance keeps working.
        self.text = 'website'

    @staticmethod
    def _build_url(base_url, params):
        """Return ``base_url`` followed by ``params`` as a percent-encoded query string."""
        return base_url + '?' + parse.urlencode(params)

    @staticmethod
    def _url_param(url, name):
        """Return the raw value of query parameter ``name`` in ``url``, or '' if absent.

        ``;`` is accepted as a delimiter so that HTML-escaped links
        (``&amp;mid=...``) are handled as well.
        """
        match = re.search(r'(?:^|[?&;])' + re.escape(name) + r'=([^&#]*)', str(url))
        return match.group(1) if match else ''

    @staticmethod
    def _first_match(values):
        """Return the first jsonpath match, or None (jsonpath yields False on no match)."""
        return values[0] if values else None

    @staticmethod
    def _article_row(page_number, local_time, create_time, info):
        """Build one article record from an ``app_msg_ext_info``-style dict."""
        content_url = info['content_url'].replace('#wechat_redirect', '')
        format_url = content_url.replace('amp;', '')  # undo HTML escaping of '&'
        return [page_number, local_time, create_time, info['title'], info['cover'],
                content_url, format_url]

    def format_raw_link(self, token_url):
        """Validate a token URL and store its credentials on the instance.

        Args:
            token_url: URL whose query string carries ``__biz``, ``uin``,
                ``key`` and ``pass_ticket``.

        Returns:
            True when all four parameters are present and non-empty,
            otherwise False (a message is printed and missing values are
            stored as None).
        """
        query_dict = parse.parse_qs(parse.urlparse(token_url).query)

        def first_value(name):
            # parse_qs omits absent/blank parameters, so never index blindly.
            values = query_dict.get(name)
            return values[0] if values else None

        self.biz = first_value('__biz')
        self.uin = first_value('uin')
        self.key = first_value('key')
        self.pass_ticket = first_value('pass_ticket')

        if self.biz and self.uin and self.pass_ticket and self.key:
            return True
        print('\n※※※ 参数有误,请重新输入')
        return False

    def whole_article_list(self, pages_start, pages_end):
        """Collect article records for a range of list pages.

        Args:
            pages_start: first page to fetch (1-based). ``0`` means "fetch
                every page until the server returns no more articles".
            pages_end: last page to fetch (inclusive); ignored when
                ``pages_start`` is 0.

        Returns:
            A list of article records (see :meth:`get_next_list`), or None
            when nothing could be fetched.
        """
        fetch_all = pages_start == 0
        if fetch_all:
            print('开始获取公众号下所有的文章列表')
            page = 0
        else:
            print('获取 ' + str(pages_start) + ' 至 ' + str(pages_end) + ' 页的文章列表')
            page = pages_start - 1  # get_next_list counts pages from 0

        all_article_link = []
        while fetch_all or page < pages_end:
            p_data = self.get_next_list(page)
            if p_data['m_flag'] != 1:
                print('请求结束,文章列表获取完毕!')
                break
            all_article_link.extend(p_data['one_page_list'])
            page += 1
            self.delay_time()  # random pause to imitate manual browsing and avoid a ban

        if not all_article_link:
            print('获取到文章列表为空,请注意检查!!!!')
            return None
        return all_article_link

    def get_next_list(self, page):
        """Fetch one page (10 messages) of the account's article list.

        The response's ``general_msg_list`` field is itself a JSON string
        whose ``list`` entries look like::

            {'comm_msg_info': {'datetime': 1722467332, ...},
             'app_msg_ext_info': {'title': ..., 'cover': ..., 'content_url': ...,
                                  'multi_app_msg_item_list': [{'title': ...,
                                                               'cover': ...,
                                                               'content_url': ...}, ...]}}

        Args:
            page: zero-based page index.

        Returns:
            ``{'m_flag': 1, 'one_page_list': rows, 'length': len(rows)}`` on
            success, where each row is ``[page_number, fetch_time,
            publish_date, title, cover_url, content_url, format_url]``;
            ``{'m_flag': 0}`` when the request failed, the account is rate
            limited, or no articles were returned.
        """
        page = int(page)
        offset = page * 10  # the API pages by message offset, 10 per page
        print('正在获取第 ' + str(page + 1) + ' 页文章列表')
        url = self._build_url('https://mp.weixin.qq.com/mp/profile_ext', [
            ('action', 'getmsg'), ('__biz', self.biz), ('f', 'json'), ('offset', offset),
            ('count', 10), ('is_ok', 1), ('scene', 124), ('uin', self.uin), ('key', self.key),
            ('pass_ticket', self.pass_ticket), ('wxtoken', ''), ('appmsg_token', ''),
            ('x5', 0), ('f', 'json'),
        ])
        try:
            body = self.session.get(url=url, headers=self.headers, timeout=10, verify=False).text
        except Exception:  # network-level failure; treated as "no data" below
            print('失败!!!获取第 ' + str(page + 1) + ' 页文章列表失败!!!')
            body = ''

        if 'app_msg_ext_info' in body:
            try:
                get_page = json.loads(json.loads(body)['general_msg_list'])['list']
            except (ValueError, KeyError, TypeError):
                print('请求结束!未获取到第 ' + str(page + 1) + ' 页文章列表')
                return {'m_flag': 0}

            page_number = page + 1  # 1-based page, recorded so an interrupted crawl can resume
            one_page_list = []  # every article found on this page
            for item in get_page:
                ext_info = item.get('app_msg_ext_info')
                if not ext_info:
                    # Plain text/image messages carry no article.
                    continue
                time_tuple = time.localtime(item['comm_msg_info']['datetime'])
                create_time = time.strftime("%Y-%m-%d", time_tuple)
                local_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

                # Headline article of the push, then the secondary articles.
                one_page_list.append(self._article_row(page_number, local_time, create_time, ext_info))
                for sub_info in ext_info.get('multi_app_msg_item_list') or []:
                    one_page_list.append(self._article_row(page_number, local_time, create_time, sub_info))
            print('该页包含 ' + str(len(one_page_list)) + ' 篇文章')
            return {
                'm_flag': 1,
                'one_page_list': one_page_list,
                'length': len(one_page_list)
            }
        elif '"home_page_list":[]' in body:
            print('\n出现:操作频繁,请稍后再试\n该号已被封禁,请解封后再来!!!\n')
            return {'m_flag': 0}
        else:
            print('请求结束!未获取到第 ' + str(page + 1) + ' 页文章列表')
            return {'m_flag': 0}

    def get_detail_nickname(self):
        """Resolve the account's nickname from the first readable article.

        Fetches the first list page, then downloads its articles one by one
        until one succeeds and passes its content to ``format_content``
        (which stores the nickname on the instance).

        Returns:
            None in every case; failures are reported with printed messages.
        """
        first_page_list = self.whole_article_list(1, 1)
        if not first_page_list:
            print('获取文章列表失败')
            return None
        for article_index, article in enumerate(first_page_list):
            first_content = self.get_an_article(article[6])  # index 6 is the cleaned URL
            if first_content['content_flag'] != 0:
                self.format_content(first_content['content'])
                return None
            print('提取第 ' + str(article_index + 1) + ' 篇文章的公众号名称失败')
        return None

    def get_detail_content(self, source_url, article_title, one_content):
        """Fetch read/like/share statistics and comments for one article.

        Args:
            source_url: article URL containing ``mid``, ``sn`` and ``idx``.
            article_title: article title (sent to the API and used in logs).
            one_content: HTML source of the article page, searched for
                ``comment_id`` and ``req_id``.

        Returns:
            ``(read_num, like_num, share_num, show_read, comments,
            comment_like_nums)`` or None when the statistics are
            unavailable. ``comments`` / ``comment_like_nums`` are lists, or
            False when there are none (jsonpath's "no match" value).
        """
        self.delay_time()

        # Random 16-digit fraction, mimicking the client's cache-busting value.
        r = '0.' + ''.join(str(random.randint(0, 9)) for _ in range(16))
        appmsg_type = "9"
        mid = self._url_param(source_url, 'mid')
        sn = self._url_param(source_url, 'sn')
        idx = self._url_param(source_url, 'idx')

        comment_match = re.search("var comment_id = '(.*?)'.*", one_content)
        if comment_match:
            comment_id = comment_match.group(1)
        else:
            print('没有匹配到comment_id, 文章标题为: ' + article_title)
            comment_id = ''
        if 'var req_id = ' in one_content:
            req_id = one_content.split('var req_id = ')[1].split(';')[0].replace("'", "").replace('"', '')
        else:
            print('没有匹配到req_id, 文章标题为: ' + article_title)
            req_id = ''

        # --- article statistics ---
        detail_url = self._build_url('https://mp.weixin.qq.com/mp/getappmsgext', [
            ('f', 'json'), ('mock', ''), ('fasttmplajax', 1), ('f', 'json'), ('uin', self.uin),
            ('key', self.key), ('pass_ticket', self.pass_ticket), ('__biz', self.biz),
        ])
        data = {
            'r': r,
            'sn': sn,
            'mid': mid,
            'idx': idx,
            'req_id': req_id,
            'title': article_title,
            'comment_id': comment_id,
            'appmsg_type': appmsg_type,
            '__biz': self.biz,
            'pass_ticket': self.pass_ticket,
            'abtest_cookie': '', 'devicetype': 'Windows 10 x64', 'version': '63090b13', 'is_need_ticket': '0',
            'is_need_ad': '0', 'is_need_reward': '0', 'both_ad': '0', 'reward_uin_count': '0', 'send_time': '',
            'msg_daily_idx': '1', 'is_original': '0', 'is_only_read': '1', 'scene': '38', 'is_temp_url': '0',
            'item_show_type': '0', 'tmp_version': '1', 'more_read_type': '0', 'appmsg_like_type': '2',
            'related_video_sn': '', 'related_video_num': '5', 'vid': '', 'is_pay_subscribe': '0',
            'pay_subscribe_uin_count': '0', 'has_red_packet_cover': '0', 'album_id': '1296223588617486300',
            'album_video_num': '5', 'cur_album_id': 'undefined', 'is_public_related_video': 'NaN',
            'encode_info_by_base64': 'undefined', 'exptype': '', 'export_key_extinfo': '', 'business_type': '0',
        }
        try:
            res = self.session.post(url=detail_url, data=data, headers=self.headers,
                                    cookies=self.cookies, timeout=10, verify=False)
            detail = json.loads(res.text)
        except Exception:  # network failure or non-JSON answer
            print('请求文章详情失败, 文章标题为: ' + article_title)
            return None
        read_num = jsonpath.jsonpath(detail, '$..read_num')
        like_num = jsonpath.jsonpath(detail, '$..old_like_num')
        share_num = jsonpath.jsonpath(detail, '$..share_num')
        show_read = jsonpath.jsonpath(detail, '$..show_read')

        # --- comments and their like counts ---
        comments = False
        comments_star_nums = False
        if comment_id:  # without a comment_id there is nothing to query
            comment_url = self._build_url('https://mp.weixin.qq.com/mp/appmsg_comment', [
                ('action', 'getcomment'), ('__biz', self.biz), ('appmsgid', mid), ('idx', idx),
                ('comment_id', comment_id), ('offset', 0), ('limit', 100), ('uin', self.uin),
                ('key', self.key), ('pass_ticket', self.pass_ticket), ('wxtoken', ''),
                ('devicetype', 'Windows 10'), ('clientversion', '62060833'), ('appmsg_token', ''),
            ])
            try:
                response = self.session.get(comment_url, headers=self.headers,
                                            cookies=self.cookies, timeout=10, verify=False)
                json_content = json.loads(response.text)
            except Exception:  # comments are optional; keep the statistics
                print('请求文章评论失败, 文章标题为: ' + article_title)
            else:
                comments = jsonpath.jsonpath(json_content, '$..content')
                comments_star_nums = jsonpath.jsonpath(json_content, '$..like_num')

        local_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print(str(local_time) + '请求完成, 文章标题为: ' + article_title)
        if not read_num:
            # jsonpath returns False when the key is absent (e.g. expired token).
            return None
        return (read_num[0], self._first_match(like_num), self._first_match(share_num),
                self._first_match(show_read),  # reads, likes, shares, "wow" count
                comments, comments_star_nums)  # comment texts, comment like counts
| 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. 
Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). 
To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. 
Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. 
Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. 
The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. 
Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. 
indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. 
Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. 
Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 438 | 439 | 440 | --------------------------------------------------------------------------------