├── .gitignore ├── LICENSE ├── README.md ├── fake_ids.py ├── get_article.py ├── requirements └── tags.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | wechatarticlevenv/ 55 | chromedriver 56 | .idea/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged 
into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Bowen Hu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 微信公众号文章抓取与筛选项目 2 | 3 | ## 项目概述 4 | 该项目旨在使用Python和Selenium从微信公众号抓取文章,并根据特定的标签和发布日期进行筛选。此工具对于需要自动化获取和筛选微信公众号内容的研究者或营销专业人士来说非常有用。 5 | 6 | ## 环境要求 7 | - Python 3.12(作者所使用的版本) 8 | - Selenium 9 | - Requests 10 | - Chrome WebDriver 11 | 12 | ## 安装步骤 13 | 1. 
安装Python: 14 | 确保您的系统中已安装Python 3.12。可以从 [Python官网](https://www.python.org/downloads/) 下载。 15 | 16 | 2. 安装依赖库: 17 | 依赖项已在 `requirements` 文件中列出(注意文件名没有 `.txt` 后缀),运行以下命令安装: 18 | ```bash 19 | pip install -r requirements 20 | ``` 21 | 22 | 3. 安装Chrome WebDriver: 23 | 根据您的Chrome版本下载对应的Chrome WebDriver。代码默认使用项目目录下的 `./chromedriver`,请将可执行文件放在该位置,或在代码中修改为您的实际路径。下载链接 [Chrome WebDriver](https://sites.google.com/a/chromium.org/chromedriver/). 24 | 25 | ## 使用说明 26 | 1. **设置项目**: 27 | 克隆或下载本项目代码到本地目录。 28 | 29 | 2. **配置参数**: 30 | 在代码中,您可以修改以下几个关键参数: 31 | - `fake_id`:目标公众号的唯一标识,在 `fake_ids.py` 的 `PublicAccount` 枚举中配置。 32 | - `num`:`fetch_page(account, num)` 的第二个参数(在 `main()` 中调用时传入),用于指定抓取的页数。 33 | 34 | 3. **运行项目**: 35 | 在终端或命令行窗口中,导航到包含代码的目录,并运行以下命令: 36 | ```bash 37 | python get_article.py 38 | ``` 39 | 40 | 4. **查看结果**: 41 | 筛选后的文章标题和链接将在命令行中打印出来。 42 | 43 | ## 功能描述 44 | - **抓取文章**:从指定微信公众号抓取文章列表。 45 | - **筛选文章**:根据发布日期和标签筛选文章。文章必须在最近一周内发布,并且至少带有一个相关标签,或标题中包含相关关键词(关键词列表见 `tags.py`)。 46 | 47 | ## 注意事项 48 | - 本项目使用了Selenium进行网页模拟访问,可能受到目标网站反爬虫策略的影响。 49 | - 确保在使用过程中遵守相关网站的服务条款,避免过于频繁的请求。 50 | 51 | ## 贡献 52 | 欢迎对项目进行改进和优化的相关建议和贡献。您可以通过GitHub提交Pull Requests或开设Issues。 53 | 54 | ## 许可证 55 | 本项目采用MIT许可证。使用本项目之前,请确保您已阅读并同意许可证条款。 56 | 57 | --- 58 | 59 | 确保在使用本工具时遵守相关法律法规以及微信公众平台的规定。此代码仅供学习和研究使用,不得用于任何非法用途。 60 | -------------------------------------------------------------------------------- /fake_ids.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class PublicAccount(Enum): 4 | """ 5 | 微信公众号的枚举类,用于存储不同公众号的fake_id。 6 | """ 7 | ACCOUNT_1 = 'MzIzMjA2NTg3Mw%3D%3D' # Vehicle 8 | ACCOUNT_2 = 'MzkwNjI0MDY4OA%3D%3D' # 九章智驾 9 | ACCOUNT_3 = 'MzIzOTAzNzcwMg%3D%3D' # 汽车之心 10 | ACCOUNT_4 = 'MzI3MDc3ODI5MA%3D%3D' # 元戎启行Deep Route 11 | ACCOUNT_5 = 'MjM5OTQzOTE0MA%3D%3D' # 汽车商业评论 12 | ACCOUNT_6 = 'MzkzOTE3Nzc5MA%3D%3D' # 智能车参考 13 | ACCOUNT_7 = 'MzIyNDI4NTM3Mg%3D%3D' # 高工智能汽车 14 | ACCOUNT_8 = 'MzkzNzMzMjg5MA%3D%3D' # 智车星球 15 | ACCOUNT_9 = 'MjM5NTAxMTg0MA%3D%3D' # 腾讯汽车 16 | ACCOUNT_10 = 'MzAxNjk5NjIzNw%3D%3D' # 新智驾 17 | ACCOUNT_11 = 'MzA5OTE1MjcxMw%3D%3D' # 
# -*- coding: utf-8 -*-
"""Fetch recent articles from WeChat official accounts and keep the relevant ones.

For every account in ``fake_ids.PublicAccount`` the script lists the newest
articles through the ``mp.weixin.qq.com`` backend API, opens each article in
headless Chrome to read its tags and publish time, and prints the articles
that are recent and match one of the keywords declared in ``tags.Tag``.
"""
import logging
import os
import time
from datetime import datetime, timedelta
from urllib.parse import unquote

# Third-party packages (selenium, requests) and the project-local enums
# (tags, fake_ids) are imported inside the functions that use them, so the
# filtering logic can be imported and unit-tested on a machine that has no
# browser stack installed.

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Number of articles requested per API page.
PAGE_SIZE = 5
# Upper bound for a single HTTP request so a stalled connection cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30

# NOTE(review): a session cookie and token are committed in source. They are
# kept only as a fallback so existing behaviour is unchanged; prefer supplying
# fresh values through the WECHAT_COOKIE / WECHAT_TOKEN environment variables.
_DEFAULT_COOKIE = "appmsglist_action_3296508395=card; ua_id=lTGgGdyLybxOEU7YAAAAAMJOVJrBBZLXBApnARbsQFI=; _clck=1k8cupo|1|flj|0; wxuin=15005178482107; uuid=018b1cbb7cf1176ec22d471c71ea96ff; rand_info=CAESIHL0xgA81PdNaKTTdVYOA7HekCl6TYSbEYxj4FcdDWSq; slave_bizuin=3296508395; data_bizuin=3296508395; bizuin=3296508395; data_ticket=U+vcWHVEg0+pcO0Bmoz26ny44jJXfIReAqSdTjoB5oI6sRlPRPfPoxonsIfQ9FCp; slave_sid=OWs4M05iald4ZE1GMEZJaXNrR19YMTlOcnR3SDEySmJnWlZ3R0pGZGlBak9tNUF1WUl1cXlyUWcwSW1ac1U3RjVnWEk3YU9SNURJb2lNTlE5bHZHTXZnc2ZIb2tXd3BpRFlPcjROOHp4NHhPMl9CVGhrVTh6T1RWOWF6eVMwZkVwS09NQnRZa2QwWjYxMjRm; slave_user=gh_a5060f4cf8ae; xid=d6c06582ccb272f04b4cb58e0ca2f4cf; mm_lang=zh_CN; cert=rGzj7GlVHTknNEW4BcJxk84Nrw56rphG; rewardsn=; wxtokenkey=777; _clsk=19aql5s|1715008216255|3|1|mp.weixin.qq.com/weheat-agent/payload/record"
_DEFAULT_TOKEN = '1132247621'


def fetch_article_details(article_url):
    """Open one article in headless Chrome and read its tags and publish time.

    Args:
        article_url: URL of the article page.

    Returns:
        A ``(tags, publish_date)`` tuple. ``tags`` is the set of stripped texts
        of all elements with class ``article-tag__item``; ``publish_date`` is
        a ``datetime`` parsed from the ``#publish_time`` element using the
        ``%Y-%m-%d %H:%M`` format, or ``None`` when that element is missing or
        empty. If anything fails while loading or parsing the page, the error
        is logged and ``(set(), None)`` is returned, so callers can always
        unpack the result.

    Raises:
        Whatever ``webdriver.Chrome`` raises if the browser cannot be started;
        that is a configuration problem and is deliberately not swallowed.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    logging.info(f"开始抓取文章: {article_url}")
    options = Options()
    # `Options.headless` was removed from Selenium 4; the command-line switch
    # is the supported way to request headless mode.
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")  # disable GPU acceleration
    # Chrome expects "width,height"; the "1920x1080" form is not parsed.
    options.add_argument("--window-size=1920,1080")
    service = Service(executable_path='./chromedriver')
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(article_url)
        time.sleep(10)  # deliberate pause: slows the crawl to reduce load on the server
        tags = {element.text.strip() for element in driver.find_elements(By.CLASS_NAME, 'article-tag__item')}
        # find_elements returns an empty list when nothing matches, whereas
        # find_element would raise and discard the tags collected above.
        publish_elements = driver.find_elements(By.ID, 'publish_time')
        publish_date_str = publish_elements[0].text.strip() if publish_elements else ''
        publish_date = datetime.strptime(publish_date_str, '%Y-%m-%d %H:%M') if publish_date_str else None
        logging.info(f"文章抓取完成: 标签 - {tags}, 发布日期 - {publish_date}")
        return tags, publish_date
    except Exception as e:
        # Per-article boundary: one broken page must not abort the whole run.
        logging.error(f"在抓取文章信息时发生错误: {e}", exc_info=True)
        return set(), None
    finally:
        driver.quit()


def filter_articles(articles, valid_tags=None, max_age=timedelta(days=7), delay_seconds=20, fetch_details=None):
    """Keep the articles that are recent and match at least one keyword.

    An article is kept when its publish date is known, is not older than
    ``max_age``, and either one of its page tags is in ``valid_tags`` or its
    title contains one of ``valid_tags`` as a substring.

    Args:
        articles: iterable of ``(title, link)`` pairs.
        valid_tags: iterable of keyword strings. Defaults to the values of the
            ``tags.Tag`` enum.
        max_age: maximum article age as a ``timedelta`` (default: 7 days).
        delay_seconds: pause after each article fetch (default: 20 seconds).
        fetch_details: callable ``link -> (tags, publish_date)``. Defaults to
            ``fetch_article_details``. A ``None`` result is treated as
            "details unavailable" and the article is skipped.

    Returns:
        List of the ``(title, link)`` pairs that passed, in input order.
    """
    logging.info("开始根据特定条件筛选文章")
    if valid_tags is None:
        from tags import Tag
        valid_tags = {tag.value for tag in Tag}
    else:
        valid_tags = set(valid_tags)
    if fetch_details is None:
        fetch_details = fetch_article_details

    filtered_articles = []
    cutoff = datetime.now() - max_age
    for title, link in articles:
        logging.info(f"抓取并处理文章: {title}")
        details = fetch_details(link)
        if delay_seconds > 0:
            time.sleep(delay_seconds)  # throttle between article fetches
        tags, publish_date = details if details is not None else (set(), None)

        is_recent = publish_date is not None and publish_date >= cutoff
        has_valid_tag = bool(valid_tags.intersection(tags))
        title_contains_valid_tag = any(tag in title for tag in valid_tags)
        if is_recent and (has_valid_tag or title_contains_valid_tag):
            filtered_articles.append((title, link))
            logging.info(f"文章 '{title}' 符合条件并被添加到过滤列表")
        else:
            logging.info(f"文章 '{title}' 不符合条件,被跳过")
    logging.info("文章筛选完成")
    return filtered_articles


def fetch_page(account, num):
    """List the newest articles of one official account through the backend API.

    Args:
        account: object whose ``.value`` is the account's fake ID (a member of
            ``fake_ids.PublicAccount`` in this project).
        num: number of pages to request; each page asks for ``PAGE_SIZE``
            articles.

    Returns:
        List of ``(title, link)`` pairs collected from every page that
        returned a usable ``app_msg_list``. Pages that fail (network error,
        non-200 status, invalid JSON, missing key) are logged and skipped.
    """
    import requests

    logging.info("开始从微信公众号API抓取文章页面")
    headers = {
        "cookie": os.environ.get("WECHAT_COOKIE", _DEFAULT_COOKIE),
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
    # Only the fake ID changes between accounts. The IDs are stored already
    # percent-encoded ("...%3D%3D"); decode them first because `requests`
    # encodes query parameters itself and would otherwise encode the "%" again.
    fake_id = unquote(account.value)
    token = os.environ.get("WECHAT_TOKEN", _DEFAULT_TOKEN)
    titles_links = []
    for page_number in range(num):
        params = {
            'action': 'list_ex',
            'begin': page_number * PAGE_SIZE,  # pagination offset
            'count': str(PAGE_SIZE),
            'fakeid': fake_id,
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as e:
            logging.error(f"第 {page_number} 页: 请求失败 {e}")
            continue
        if response.status_code != 200:
            logging.error(f"第 {page_number} 页: HTTP错误 {response.status_code}")
            continue
        # Report success only after the status code has actually been checked.
        logging.info(f"第 {page_number} 页: 请求成功,状态码 200")
        try:
            data = response.json()
        except ValueError:
            logging.error(f"第 {page_number} 页: JSON解码失败")
            continue
        if 'app_msg_list' not in data:
            logging.warning(f"第 {page_number} 页: 'app_msg_list' 键不存在于响应中")
            continue
        for article in data['app_msg_list']:
            titles_links.append((article['title'], article['link']))
    return titles_links


def main():
    """Crawl every configured account and print the articles that pass the filter."""
    from fake_ids import PublicAccount

    logging.info("主程序开始执行")
    final_output = []
    for account in PublicAccount:
        logging.info(f"处理公众号: {account.name}")
        articles = fetch_page(account, 1)
        logging.info(f"获取到的文章数量: {len(articles)}")
        filtered_articles = filter_articles(articles)
        for title, link in filtered_articles:
            logging.info(f"符合条件的文章标题: {title} 链接: {link}")
        final_output.extend(filtered_articles)
    for title, link in final_output:
        print(f"标题: {title}, 链接: {link}")
    logging.info("主程序执行结束")


if __name__ == '__main__':
    main()
"大模型" 13 | GPT = "GPT" 14 | FSD = "FSD" 15 | NOA = "NOA" 16 | SENSOR_TECHNOLOGY = "传感器技术" 17 | ARTIFICIAL_INTELLIGENCE = "人工智能" 18 | UPPER_AI = "AI" 19 | AI = "ai" 20 | LIDAR = "激光雷达" 21 | AI_CHIP = "AI芯片" 22 | CHIP = "芯片" 23 | DEEP_LEARNING = "深度学习" 24 | COMPUTER_VISION = "计算机视觉" 25 | 26 | # 2、智驾衍生名词 27 | AUTO_DRIVING = "自动驾驶" 28 | SMART_DRIVING = "智能驾驶" 29 | S_D = "智驾" 30 | DRIVING = "行车" 31 | PARKING = "泊车" 32 | SMART_CAR = "智能汽车" 33 | AUTONOMOUS_VEHICLES = "自动驾驶车辆" 34 | INTELLIGENT = "智能化" 35 | 36 | # 3、智驾相关公司 37 | TESLA = "特斯拉" 38 | HORIZON = "地平线" 39 | CAMBRIAN = "寒武纪" 40 | MOMENTA = "Momenta" 41 | WE_RIDE = "文远" 42 | PONY = "小马" 43 | WE_RIDE_AI = "文远知行" 44 | PONY_AI = "小马智行" --------------------------------------------------------------------------------