├── .gitignore
├── LICENSE
├── README.md
├── fake_ids.py
├── get_article.py
├── requirements
└── tags.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | wechatarticlevenv/
 55 | chromedriver
 56 | .idea/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | .pybuilder/
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # IPython
 86 | profile_default/
 87 | ipython_config.py
 88 | 
 89 | # pyenv
 90 | #   For a library or package, you might want to ignore these files since the code is
 91 | #   intended to run in multiple environments; otherwise, check them in:
 92 | # .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # poetry
102 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
104 | #   commonly ignored for libraries.
105 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 | 
108 | # pdm
109 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | #   in version control.
113 | #   https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Bowen Hu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 微信公众号文章抓取与筛选项目
 2 | 
 3 | ## 项目概述
 4 | 该项目旨在使用Python和Selenium从微信公众号抓取文章，并根据特定的标签和发布日期进行筛选。此工具对于需要自动化获取和筛选微信公众号内容的研究者或营销专业人士来说非常有用。
 5 | 
 6 | ## 环境要求
 7 | - Python 3.12（作者所使用的版本）
 8 | - Selenium
 9 | - Requests
10 | - Chrome WebDriver
11 | 
12 | ## 安装步骤
13 | 1. 安装Python：
14 |    确保您的系统中已安装Python 3.12。可以从 [Python官网](https://www.python.org/downloads/) 下载。
15 | 
16 | 2. 安装依赖库：
17 |    依赖项已在项目根目录的 requirements 文件中列出（注意文件名没有 .txt 后缀），运行以下命令安装:
18 |    ```bash
19 |    pip install -r requirements
20 |    ```
21 | 
22 | 3. 安装Chrome WebDriver：
23 |    根据您的Chrome版本下载对应的Chrome WebDriver。代码默认从项目根目录加载 `./chromedriver`，请将下载的可执行文件放在该位置，或修改 `get_article.py` 中 `Service(executable_path=...)` 的路径。下载链接 [Chrome WebDriver](https://sites.google.com/a/chromium.org/chromedriver/).
24 | 
25 | ## 使用说明
26 | 1. **设置项目**：
27 |    克隆或下载本项目代码到本地目录。
28 | 
29 | 2. **配置参数**：
30 |    在代码中，您可以修改以下几个关键参数：
31 |    - `fake_id`：目标公众号的唯一标识，在 `fake_ids.py` 的 `PublicAccount` 枚举中配置；程序会依次处理枚举中的每个公众号。
32 |    - `num`：`fetch_page(account, num)` 的第二个参数，用于指定每个公众号抓取的页数（每页5篇，`main()` 中当前设置为1）。
33 | 
34 | 3. **运行项目**：
35 |    在终端或命令行窗口中，导航到包含代码的目录，并运行以下命令：
36 |    ```bash
37 |    python get_article.py
38 |    ```
39 | 
40 | 4. **查看结果**：
41 |    筛选后的文章标题和链接将在命令行中打印出来。
42 | 
43 | ## 功能描述
44 | - **抓取文章**：从指定微信公众号抓取文章列表。
45 | - **筛选文章**：根据发布日期和标签筛选文章。文章必须在最近7天内发布，并且至少带有一个相关标签，或者标题中包含相关关键词。
46 | 
47 | ## 注意事项
48 | - 本项目使用了Selenium进行网页模拟访问，可能受到目标网站反爬虫策略的影响。
49 | - 确保在使用过程中遵守相关网站的服务条款，避免过于频繁的请求。
50 | 
51 | ## 贡献
52 | 欢迎对项目进行改进和优化的相关建议和贡献。您可以通过GitHub提交Pull Requests或开设Issues。
53 | 
54 | ## 许可证
55 | 本项目采用MIT许可证。使用本项目之前，请确保您已阅读并同意许可证条款。
56 | 
57 | ---
58 | 
59 | 确保在使用本工具时遵守相关法律法规以及微信公众平台的规定。此代码仅供学习和研究使用，不得用于任何非法用途。
60 | 


--------------------------------------------------------------------------------
/fake_ids.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
class PublicAccount(Enum):
    """
    Enumeration of WeChat official accounts.

    Each member's value is that account's fake_id, stored in percent-encoded
    form (the trailing ``%3D%3D`` is an encoded ``==``). The inline comment
    after each member gives the account's display name.
    """
    ACCOUNT_1 = 'MzIzMjA2NTg3Mw%3D%3D'   # account name: Vehicle
    ACCOUNT_2 = 'MzkwNjI0MDY4OA%3D%3D'   # account name: 九章智驾
    ACCOUNT_3 = 'MzIzOTAzNzcwMg%3D%3D'   # account name: 汽车之心
    ACCOUNT_4 = 'MzI3MDc3ODI5MA%3D%3D'   # account name: 元戎启行 Deep Route
    ACCOUNT_5 = 'MjM5OTQzOTE0MA%3D%3D'   # account name: 汽车商业评论
    ACCOUNT_6 = 'MzkzOTE3Nzc5MA%3D%3D'   # account name: 智能车参考
    ACCOUNT_7 = 'MzIyNDI4NTM3Mg%3D%3D'   # account name: 高工智能汽车
    ACCOUNT_8 = 'MzkzNzMzMjg5MA%3D%3D'   # account name: 智车星球
    ACCOUNT_9 = 'MjM5NTAxMTg0MA%3D%3D'   # account name: 腾讯汽车
    ACCOUNT_10 = 'MzAxNjk5NjIzNw%3D%3D'   # account name: 新智驾
    ACCOUNT_11 = 'MzA5OTE1MjcxMw%3D%3D'   # account name: 盖世汽车每日速递
18 | 
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/get_article.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import time
  3 | from tags import Tag
  4 | from datetime import datetime, timedelta
  5 | from selenium import webdriver
  6 | from selenium.webdriver.common.by import By
  7 | import requests
  8 | from selenium.webdriver.chrome.service import Service
  9 | from selenium.webdriver.chrome.options import Options
 10 | import logging
 11 | from fake_ids import PublicAccount
 12 | 
# Configure the root logger: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 15 | 
 16 | def fetch_article_details(article_url):
 17 |     """ 使用Selenium抓取文章的详细信息，包括标签和发布日期 """
 18 |     logging.info(f"开始抓取文章: {article_url}")
 19 |     options = Options()
 20 |     options.headless = True  # 使用无头模式
 21 |     options.add_argument("--disable-gpu")  # 禁用GPU加速
 22 |     options.add_argument("--window-size=1920x1080")  # 指定浏览器分辨率
 23 |     service = Service(executable_path='./chromedriver')
 24 |     driver = webdriver.Chrome(service=service, options=options)
 25 |     try:
 26 |         driver.get(article_url)
 27 |         time.sleep(10)  # 增加等待时间以减少爬取速度，减轻服务器压力
 28 |         tags = {tag.text.strip() for tag in driver.find_elements(By.CLASS_NAME, 'article-tag__item')}
 29 |         publish_em = driver.find_element(By.ID, 'publish_time')
 30 |         publish_date_str = publish_em.text.strip() if publish_em else ''
 31 |         publish_date = datetime.strptime(publish_date_str, '%Y-%m-%d %H:%M') if publish_date_str else None
 32 |         logging.info(f"文章抓取完成: 标签 - {tags}, 发布日期 - {publish_date}")
 33 |         return tags, publish_date
 34 |     except Exception as e:
 35 |         logging.error(f"在抓取文章信息时发生错误: {e}", exc_info=True)
 36 |     finally:
 37 |         driver.quit()
 38 | 
 39 | def filter_articles(articles):
 40 |     """ 根据特定条件筛选文章 """
 41 |     logging.info("开始根据特定条件筛选文章")
 42 |     filtered_articles = []
 43 |     one_week_ago = datetime.now() - timedelta(days=7)
 44 |     valid_tags = {tag.value for tag in Tag}
 45 |     for title, link in articles:
 46 |         logging.info(f"抓取并处理文章: {title}")
 47 |         tags, publish_date = fetch_article_details(link)
 48 |         time.sleep(20)  # 每次抓取后休息20秒
 49 |         # 检查标题是否包含任何有效标签
 50 |         title_contains_valid_tag = any(tag in title for tag in valid_tags)
 51 |         # 确保日期有效且至少包含一个标签或标题包含有效标签
 52 |         if publish_date and publish_date >= one_week_ago and (len(tags.intersection(valid_tags)) >= 1 or title_contains_valid_tag):
 53 |             filtered_articles.append((title, link))
 54 |             logging.info(f"文章 '{title}' 符合条件并被添加到过滤列表")
 55 |         else:
 56 |             logging.info(f"文章 '{title}' 不符合条件，被跳过")
 57 |         logging.info("文章筛选完成")
 58 |     return filtered_articles
 59 | 
 60 | def fetch_page(account, num):
 61 |     """ 从微信公众号API抓取文章页面 """
 62 |     logging.info("开始从微信公众号API抓取文章页面")
 63 |     headers = {
 64 |         "cookie": "appmsglist_action_3296508395=card; ua_id=lTGgGdyLybxOEU7YAAAAAMJOVJrBBZLXBApnARbsQFI=; _clck=1k8cupo|1|flj|0; wxuin=15005178482107; uuid=018b1cbb7cf1176ec22d471c71ea96ff; rand_info=CAESIHL0xgA81PdNaKTTdVYOA7HekCl6TYSbEYxj4FcdDWSq; slave_bizuin=3296508395; data_bizuin=3296508395; bizuin=3296508395; data_ticket=U+vcWHVEg0+pcO0Bmoz26ny44jJXfIReAqSdTjoB5oI6sRlPRPfPoxonsIfQ9FCp; slave_sid=OWs4M05iald4ZE1GMEZJaXNrR19YMTlOcnR3SDEySmJnWlZ3R0pGZGlBak9tNUF1WUl1cXlyUWcwSW1ac1U3RjVnWEk3YU9SNURJb2lNTlE5bHZHTXZnc2ZIb2tXd3BpRFlPcjROOHp4NHhPMl9CVGhrVTh6T1RWOWF6eVMwZkVwS09NQnRZa2QwWjYxMjRm; slave_user=gh_a5060f4cf8ae; xid=d6c06582ccb272f04b4cb58e0ca2f4cf; mm_lang=zh_CN; cert=rGzj7GlVHTknNEW4BcJxk84Nrw56rphG; rewardsn=; wxtokenkey=777; _clsk=19aql5s|1715008216255|3|1|mp.weixin.qq.com/weheat-agent/payload/record",
 65 |         "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
 66 |     }
 67 |     url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
 68 |     # 爬不同公众号只需要更改fakeid
 69 |     fake_id = account.value
 70 |     titles_links = []
 71 |     for page_number in range(num):
 72 |         params = {
 73 |             'action': 'list_ex',
 74 |             'begin': page_number * 5,  # 为分页计算偏移量
 75 |             'count': '5',
 76 |             'fakeid': fake_id,
 77 |             'type': '9',
 78 |             'query': '',
 79 |             'token': '1132247621',
 80 |             'lang': 'zh_CN',
 81 |             'f': 'json',
 82 |             'ajax': '1',
 83 |         }
 84 |         response = requests.get(url, headers=headers, params=params)
 85 |         logging.info(f"第 {page_number} 页: 请求成功，状态码 200")
 86 |         if response.status_code == 200:
 87 |             try:
 88 |                 data = response.json()
 89 |                 if 'app_msg_list' in data:
 90 |                     for article in data['app_msg_list']:
 91 |                         titles_links.append((article['title'], article['link']))
 92 |                 else:
 93 |                     logging.warning(f"第 {page_number} 页: 'app_msg_list' 键不存在于响应中")
 94 |             except ValueError:
 95 |                 logging.error(f"第 {page_number} 页: JSON解码失败")
 96 |         else:
 97 |             logging.error(f"第 {page_number} 页: HTTP错误 {response.status_code}")
 98 |     return titles_links
 99 | 
def main():
    """Scan every account in ``PublicAccount`` and print the articles that pass the filter.

    For each account one page of articles is fetched, filtered with
    ``filter_articles`` and logged; once all accounts are done, every kept
    ``(title, link)`` pair is printed to stdout.
    """
    logging.info("主程序开始执行")
    matched = []
    for account in PublicAccount:
        logging.info(f"处理公众号: {account.name}")
        candidates = fetch_page(account, 1)
        logging.info(f"获取到的文章数量: {len(candidates)}")
        # Narrow the fetched candidates down to the ones worth reporting.
        kept = filter_articles(candidates)
        for article_title, article_link in kept:
            logging.info(f"符合条件的文章标题: {article_title} 链接: {article_link}")
        matched += kept
    for article_title, article_link in matched:
        print(f"标题: {article_title}, 链接: {article_link}")
    logging.info("主程序执行结束")


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/requirements:
--------------------------------------------------------------------------------
1 | selenium
2 | requests


--------------------------------------------------------------------------------
/tags.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
class Tag(Enum):
    """
    Enumeration of the WeChat article tags of interest.

    Each member's value is the tag text that articles are matched against.
    """
    # 1. Intelligent-driving technologies
    BEV = "BEV"
    TRANSFORMER = "Transformer"
    BEV_TRANSFORMER = "BEV与Transformer"
    END_TO_END = "端到端"
    LARGE_MODEL = "大模型"
    GPT = "GPT"
    FSD = "FSD"
    NOA = "NOA"
    SENSOR_TECHNOLOGY = "传感器技术"
    ARTIFICIAL_INTELLIGENCE = "人工智能"
    UPPER_AI = "AI"
    AI = "ai"
    LIDAR = "激光雷达"
    AI_CHIP = "AI芯片"
    CHIP = "芯片"
    DEEP_LEARNING = "深度学习"
    COMPUTER_VISION = "计算机视觉"

    # 2. Terms derived from intelligent driving
    AUTO_DRIVING = "自动驾驶"
    SMART_DRIVING = "智能驾驶"
    S_D = "智驾"
    DRIVING = "行车"
    PARKING = "泊车"
    SMART_CAR = "智能汽车"
    AUTONOMOUS_VEHICLES = "自动驾驶车辆"
    INTELLIGENT = "智能化"

    # 3. Companies related to intelligent driving
    TESLA = "特斯拉"
    HORIZON = "地平线"
    CAMBRIAN = "寒武纪"
    MOMENTA = "Momenta"
    WE_RIDE = "文远"
    PONY = "小马"
    WE_RIDE_AI = "文远知行"
    PONY_AI = "小马智行"


--------------------------------------------------------------------------------