├── .gitignore ├── LICENSE ├── README.md ├── comment.css └── geek_crawler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 zhengxiaotian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # geek_crawler 2 | 3 | 最近极客时间有个活动,企业可以为每位员工免费领取3门课程。刚好我们公司领导也给我们申请了这个权益(没有领取的可以找领导说说帮忙弄一下,[活动地址](https://account.geekbang.org/biz/signin?redirect=https%3A%2F%2Fservice.geekbang.org%2Fdashboard%2Fhome%2F%3Futm_source%3Dfrontshow%26utm_medium%3Dwechat%26utm_campaign%3D316%26utm_term%3Dfrontend%26gk_source%3Dfrontshowwechat&gk_source=frontshowwechat&utm_source=frontshow&utm_medium=wechat&utm_campaign=316&utm_term=frontend))。 4 | 5 | 免费领取的课程只有30天有效期,因为工作日白天要正常上班,30天之内没法学完3门课程。所以就写了个脚本,将账号下所有可以看到的专栏课程自动保存到本地。 6 | 7 | ### :boom: 该项目仅限学习交流使用,请勿用于任何商业行为和损害其它人利益的行为。 :boom: 8 | 9 | ### 如何使用 10 | 11 | 1. 将代码 clone 到本地 12 | 13 | ```shell 14 | git clone git@github.com:zhengxiaotian/geek_crawler.git 15 | ``` 16 | 17 | 2. 直接在终端或者 Pycharm 中运行脚本(ps: 代码是在 Python3 下编写的,需要使用 Python3 运行) 18 | 19 | ```shell 20 | # 运行前需安装一个第三方库 requests 21 | python geek_crawler.py 22 | ``` 23 | 24 | 3. 输入账号密码 25 | 26 | ```shell 27 | E:\geek_crawler (master -> origin) 28 | λ python geek_crawler.py 29 | 请输入你的极客时间账号(手机号): ************* 30 | 请输入你的极客时间密码: ************ 31 | ``` 32 | 33 | 4. 
抓取完成 34 | 35 | ```shell 36 | 2020-04-28 19:32:41,624 - geek_crawler.py[line:307] - INFO: 请求获取文章信息接口: 37 | 2020-04-28 19:32:41,633 - geek_crawler.py[line:320] - INFO: 接口请求参数:{'id': 225554, 'include_neighbors': 'tru 38 | e', 'is_freelyread': 'true'} 39 | 2020-04-28 19:32:42,047 - geek_crawler.py[line:349] - INFO: ---------------------------------------- 40 | 2020-04-28 19:32:47,131 - geek_crawler.py[line:478] - INFO: 正常抓取完成。 41 | ``` 42 | 43 | ![Snipaste_2020-04-29_08-55-08.png](http://ww1.sinaimg.cn/large/655c061fgy1geacsajgz4j20pk04lmxq.jpg) 44 | 45 | *PS:如果抓取过程中有接口报错导致抓取中断,可以查看日志中对应的报错信息,然后直接重新跑脚本继续抓取(之前抓取成功的文章会在本地有文档记录,后续不会重复抓取的)* 46 | 47 | 48 | 49 | ### 成果展示 50 | 51 | ![Snipaste_2020-04-29_08-44-44.png](http://ww1.sinaimg.cn/large/655c061fgy1geacmd7a5fj20nq035mxa.jpg) 52 | 53 | ![Snipaste_2020-04-28_19-31-52.png](http://ww1.sinaimg.cn/large/655c061fgy1ge9plld31oj20nd0h6gqf.jpg) 54 | 55 | 56 | 57 | ### 功能清单 58 | 59 | - [x] 输入账号密码后自动将该账号下所有可以看到的专栏(图文+音频),保存到本地; 60 | 61 | - [x] 可以支持选择保存成 Markdown 文档或者 HTML 文档; 62 | 63 | - [x] 支持配置排除某些课程的拉取(比如已经有的课程不再下载); 64 | 65 | - [ ] 抓取指定名称的课程; 66 | 67 | - [ ] 将每篇文章的评论与正文一起保存到本地; 68 | 69 | - [ ] 将视频拉取下来保存成 MP4 文件; 70 | 71 | 72 | -------------------------------------------------------------------------------- /comment.css: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /geek_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/python 3 | """========================================= 4 | @author: Tenma 5 | @file: geek_crawler.py 6 | @create_time: 2020/4/23 12:29 7 | @file specification: 极客时间课程爬取脚本 8 | 极客时间官网地址:https://time.geekbang.org/ 9 | 流程: 登录账号 -- 获取课程列表(专栏) -- 循环读取单个专栏的内容 -- 将内容保存成 md 文件) 10 | =========================================""" 11 | import time 12 | import datetime 13 | import requests 14 | import re 15 | from 
import time
import datetime
import requests
import re
import sys
import traceback
from copy import deepcopy
import logging
import os
import pathlib


# Logging: basicConfig installs a stream handler; a FileHandler mirrors
# everything to geek_crawler.log so an interrupted run can be diagnosed later.
logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                    level=logging.INFO)
handler = logging.FileHandler(filename='geek_crawler.log', mode='w', encoding='utf-8')
log = logging.getLogger(__name__)
log.addHandler(handler)

# Global state shared between run() and the helpers below.
FINISH_ARTICLES = []  # article IDs already crawled (loaded from / flushed to disk)
ALL_ARTICLES = []     # every article ID discovered during this run


class RequestError(Exception):
    """ An HTTP request failed or returned an error status/code. """
    pass


class NotValueError(Exception):
    """ A response came back without the expected payload. """
    pass


def _load_finish_article():
    """
    Load the IDs of already-crawled articles from the bookkeeping file in the
    current directory, so previously saved articles are not fetched again.
    Returns:
        De-duplicated list of article IDs (as strings, one per file line).
    """
    result = []
    # pathlib.Path (not PurePosixPath): the README shows Windows usage, and a
    # POSIX pure path is the wrong flavour for local filesystem access there.
    file_path = os.path.abspath(pathlib.Path('finish_crawler_article.txt'))
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for article_id in f.readlines():
                article_id = article_id.strip('\n')
                if article_id:
                    result.append(article_id)
    return list(set(result))


def _save_finish_article_id_to_file():
    """ Append the IDs of finished articles to the bookkeeping file so later
    runs can skip them. Called on success, on every error path, and in the
    __main__ finally-clause, so duplicates in the file are expected and are
    collapsed by _load_finish_article(). """
    global FINISH_ARTICLES
    file_path = os.path.abspath(pathlib.Path('finish_crawler_article.txt'))
    with open(file_path, 'a+', encoding='utf-8') as f:
        for i in FINISH_ARTICLES:
            f.write(str(i) + '\n')


def check_filename(file_name):
    """
    Sanitize a file name: on Windows a file name may not contain
    ('\\','/','*','?','<','>','|'), and control characters break paths.
    Args:
        file_name: proposed file name
    Returns:
        the sanitized file name
    """
    return file_name.replace('\\', '') \
        .replace('/', '') \
        .replace('*', 'x') \
        .replace('?', '') \
        .replace('<', '《') \
        .replace('>', '》') \
        .replace('|', '_') \
        .replace('\n', '') \
        .replace('\b', '') \
        .replace('\f', '') \
        .replace('\t', '') \
        .replace('\r', '')


class Cookie:
    """ A small cookie jar that can be (de)serialized to the HTTP header form. """

    def __init__(self, cookie_string=None):
        self._cookies = {}
        if cookie_string:
            self.load_string_cookie(cookie_string)

    @property
    def cookie_string(self):
        """
        Render the stored cookies as a single request-header string.
        Returns:
            'k1=v1;k2=v2;...' suitable for a Cookie header.
        """
        return ';'.join([f'{k}={v}' for k, v in self._cookies.items()])

    def set_cookie(self, key, value):
        self._cookies[key] = value

    @staticmethod
    def list_to_dict(lis):
        """
        Turn a list of 'key=value' strings into a dict, skipping malformed items.
        Args:
            lis: list of 'key=value' strings
        Returns:
            the resulting dict
        """
        result = {}
        for ind in lis:
            try:
                ind = ind.split('=')
                result[ind[0]] = ind[1]
            except IndexError:
                # Entry had no '=' (e.g. empty fragment) — ignore it.
                continue
        return result

    def load_string_cookie(self, cookie_str):
        """
        Load cookies from a header-style string (inverse of cookie_string),
        typically copied from a captured request.
        Args:
            cookie_str: e.g. 'gksskpitn=cc66...; LF_ID=1587783958277-...;'
        Returns:
            None (updates the internal cookie dict in place)
        """
        cookie_list = cookie_str.split(';')
        res = self.list_to_dict(cookie_list)
        self._cookies = {**self._cookies, **res}

    def load_set_cookie(self, set_cookie):
        """
        Merge cookies from a response's Set-Cookie header value.
        Args:
            set_cookie: the raw Set-Cookie header string
        Returns:
            None (updates the internal cookie dict in place)
        """
        # Strip '[Ee]xpires=...;' attributes first: their values contain commas
        # which would otherwise break the comma-split between cookies below.
        set_cookie = re.sub(".xpires=.*?;", "", set_cookie)
        cookies_list = set_cookie.split(',')
        cookie_list = []
        for cookie in cookies_list:
            # keep only the 'name=value' part, drop attributes like Path/Domain
            cookie_list.append(cookie.split(';')[0])
        res = self.list_to_dict(cookie_list)
        self._cookies = {**self._cookies, **res}

    def __repr__(self):
        return f'The cookies is : {self._cookies}'


class GeekCrawler:
    """ All operations against the GeekTime (time.geekbang.org) private API. """

    def __init__(self, cellphone=None, passwd=None, exclude=None):
        self.cellphone = cellphone
        self.password = passwd
        self._check()
        # Seed cookies copied from a captured session; stale values are
        # harmless — the server refreshes them via Set-Cookie after _login().
        self.cookie = Cookie("LF_ID=1587783958277-6056470-8195597;_ga=GA1.2.880710184.1587783959;"
                             "_gid=GA1.2.1020649675.1587783959; SERVERID=1fa1f330efedec1559b3abbc"
                             "b6e30f50|1587784166|1587783958; _gat=1;Hm_lvt_022f847c4e3acd44d4a24"
                             "81d9187f1e6=1587775851,1587775917,1587783916,1587784202; Hm_lpvt_02"
                             "2f847c4e3acd44d4a2481d9187f1e6=1587784202;")
        self.common_headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
                          "AppleWebKit/537.36 (KHTML, like Gecko)Chrome/81.0.4044.122 Safari/537.36"
        }
        self.products = []
        # Guard against exclude=None so membership tests in _parser_products
        # cannot raise TypeError.
        self.exclude = exclude or []

    def _check(self):
        """ Prompt for any credential that was not supplied programmatically. """
        if not self.cellphone:
            self.cellphone = str(input('请输入你要登录的手机号: '))
        if not self.password:
            self.password = str(input('请输入你极客账号的登录密码: '))

    def _login(self):
        """ Log in with cellphone + password and absorb the session cookies.
        Raises RequestError on HTTP failure or an API code of -1. """
        log.info("请求登录接口:")
        url = "https://account.geekbang.org/account/ticket/login"
        method = "POST"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "account.geekbang.org"
        headers["Origin"] = "https://account.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string
        params = {
            "country": 86,
            "cellphone": self.cellphone,
            "password": self.password,
            "captcha": "",
            "remember": 1,
            "platform": 3,
            "appid": 1,
            "source": ""
        }

        log.info(f"接口请求参数:{params}")
        res = requests.request(method, url, headers=headers, json=params)

        if (res.status_code != 200) or (str(res.json().get('code', '')) == '-1'):
            # Flush progress before aborting so a re-run can resume.
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"登录接口请求出错,返回内容为:{res.content.decode()}")
            raise RequestError(f"登录接口请求出错,返回内容为:{res.content.decode()}")
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])
        log.info('-' * 40)

    def _user_auth(self):
        """ Refresh the session via the user-auth endpoint (called periodically
        during long crawls). Raises RequestError on failure. """
        log.info("请求用户认证接口:")
        now_time = int(time.time() * 1000)
        url = f"https://account.geekbang.org/serv/v1/user/auth?t={now_time}"
        method = "GET"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "account.geekbang.org"
        headers["Origin"] = "https://time.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string

        res = requests.request(method, url, headers=headers)

        if (res.status_code != 200) or (str(res.json().get('code', '')) != '0'):
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"用户认证接口请求出错,返回内容为:{res.json()}")
            raise RequestError(f"用户认证接口请求出错,返回内容为:{res.json()}")
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])
        log.info('-' * 40)

    def _product(self, _type='c1'):
        """ Fetch the account's course (product) list and store the parsed
        result in self.products. Raises RequestError / NotValueError. """
        log.info("请求获取课程列表接口:")
        url = "https://time.geekbang.org/serv/v3/learn/product"
        method = "POST"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "time.geekbang.org"
        headers["Origin"] = "https://time.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string
        # TODO(review): size=20 with no paging — accounts with more than 20
        # courses will only see the first page.
        params = {
            "desc": 'true',
            "expire": 1,
            "last_learn": 0,
            "learn_status": 0,
            "prev": 0,
            "size": 20,
            "sort": 1,
            "type": "",
            "with_learn_count": 1
        }

        log.info(f"接口请求参数:{params}")
        res = requests.request(method, url, headers=headers, json=params)

        if res.status_code != 200:
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"课程列表接口请求出错,返回内容为:{res.content.decode()}")
            raise RequestError(f"课程列表接口请求出错,返回内容为:{res.content.decode()}")
        data = res.json().get('data', {})
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])

        if data:
            self.products += self._parser_products(data, _type)
        else:
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"课程列表接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
            raise NotValueError(f"课程列表接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
        log.info('-' * 40)

    def _parser_products(self, data, _type='c1'):
        """
        Extract the interesting fields from the product-list payload.
        Args:
            data: the 'data' object returned by the product endpoint
            _type: course type filter — 'c1' means column courses (the default),
                   'all' keeps every type
        Returns:
            list of dicts with keys: title, type, id, aid, articles, article_ids
        """
        result = []
        keys = ['title', 'type', 'id']  # fields to copy from each product
        products = data.get('products', [])
        lists = data.get('list', [])
        for product in products:
            # Skip courses the user asked to exclude (e.g. already downloaded).
            if product.get('title', '') in self.exclude:
                continue

            new_product = {key: value for key, value in product.items() if key in keys}
            new_product['articles'] = []     # filled later with article info
            new_product['article_ids'] = []  # filled later with article IDs
            for pro in lists:
                if new_product['id'] == pro['pid']:
                    new_product['aid'] = pro['aid']
            if _type.lower() == 'all' or new_product['type'] == _type:
                result.append(new_product)
        return result

    def _article(self, aid, pro, file_type=None, get_comments=False):
        """ Fetch one article by ID and save it to disk.
        Args:
            aid: article ID
            pro: the product dict this article belongs to (mutated: 'cid' set)
            file_type: extension passed through to save_to_file
            get_comments: also fetch and embed the article's comments
        Raises RequestError / NotValueError. """
        global FINISH_ARTICLES
        log.info("请求获取文章信息接口:")
        url = "https://time.geekbang.org/serv/v1/article"
        method = "POST"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "time.geekbang.org"
        headers["Origin"] = "https://time.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string
        params = {
            "id": aid,
            "include_neighbors": "true",
            "is_freelyread": "true"
        }

        log.info(f"接口请求参数:{params}")
        res = requests.request(method, url, headers=headers, json=params)

        if res.status_code != 200:
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"获取文章信息接口请求出错,返回内容为:{res.content.decode()}")
            raise RequestError(f"获取文章信息接口请求出错,返回内容为:{res.content.decode()}")
        data = res.json().get('data', {})
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])

        if data:
            comments = self._comments(aid) if get_comments else None
            keys = ['article_content', 'article_title', 'id', 'audio_download_url']  # fields to copy
            article = {key: value for key, value in data.items() if key in keys}
            self.save_to_file(
                pro['title'],
                article['article_title'],
                article['article_content'],
                audio=article['audio_download_url'],
                file_type=file_type,
                comments=comments
            )

            FINISH_ARTICLES.append(article['id'])  # mark this article as done
            pro['cid'] = data['cid']
        else:
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"获取文章信息接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
            raise NotValueError(f"获取文章信息接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
        log.info('-' * 40)

    def _comments(self, aid):
        """ Fetch the comment list for one article.
        Returns a list of comment dicts, or None on failure / no comments
        (comments are best-effort, so errors do not abort the crawl). """
        log.info("请求获取文章评论详情接口:")
        url = "https://time.geekbang.org/serv/v1/comments"
        method = "POST"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "time.geekbang.org"
        headers["Origin"] = "https://time.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string
        params = {
            "aid": aid,
            "prev": "0"
        }

        log.info(f"接口请求参数:{params}")
        res = requests.request(method, url, headers=headers, json=params)

        if res.status_code != 200:
            log.error(f"获取文章评论接口请求出错,返回内容为:{res.content.decode()}")
            return None
        data = res.json().get('data', {}).get('list', [])
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])

        if data:
            keys = ['comment_content', 'comment_ctime', 'user_header', 'user_name', 'replies']  # fields to copy
            comments = [{key: value for key, value in comment.items() if key in keys} for comment in data]
            return comments
        else:
            return None

    def _articles(self, cid, pro):
        """ Fetch the article-ID list of one column course.
        Args:
            cid: column ID
            pro: the product dict (mutated: 'article_ids' extended)
        Raises RequestError / NotValueError. """
        global ALL_ARTICLES
        log.info("请求获取文章列表接口:")
        url = "https://time.geekbang.org/serv/v1/column/articles"
        method = "POST"
        headers = deepcopy(self.common_headers)
        headers["Host"] = "time.geekbang.org"
        headers["Origin"] = "https://time.geekbang.org"
        headers["Cookie"] = self.cookie.cookie_string
        params = {
            "cid": cid,
            "size": 100,
            "prev": 0,
            "order": "earliest",
            "sample": "false"
        }

        log.info(f"接口请求参数:{params}")
        res = requests.request(method, url, headers=headers, json=params)

        if res.status_code != 200:
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"获取文章列表接口请求出错,返回内容为:{res.json()}")
            raise RequestError(f"获取文章列表接口请求出错,返回内容为:{res.json()}")
        data = res.json().get('data', {})
        self.cookie.load_set_cookie(res.headers['Set-Cookie'])

        if data:
            ids = [article['id'] for article in data.get('list', [])]
            ALL_ARTICLES += ids
            pro['article_ids'] += ids
        else:
            _save_finish_article_id_to_file()
            log.info(f"此时 products 的数据为:{self.products}")
            log.error(f"获取文章列表接口没有获取到内容,请检查请求。返回结果为:{res.json()}")
            raise NotValueError(f"获取文章列表接口没有获取到内容,请检查请求。返回结果为:{res.json()}")
        log.info('-' * 40)

    @staticmethod
    def save_to_file(dir_name, filename, content, audio=None, file_type=None, comments=None):
        """
        Save one article to a file under the current directory.
        Args:
            dir_name: target directory name (created if missing)
            filename: target file name (sanitized for Windows)
            content: article body text to save
            audio: optional audio URL embedded at the top of the file
            file_type: file extension; defaults to '.md' (Markdown)
            comments: optional comment data appended after the body
        Returns:
            None
        """
        if not file_type:
            file_type = '.md'
        dir_path = pathlib.Path(dir_name)
        # makedirs + exist_ok is race-safe, unlike the isdir/mkdir pair.
        os.makedirs(dir_path, exist_ok=True)
        filename = check_filename(filename)
        file_path = os.path.abspath(dir_path / (filename + file_type))

        # Build the comment section, styled by comment.css.
        temp = ""
        if comments:
            with open('comment.css', 'r', encoding='utf-8') as f:
                comment_style = f.read()
            # NOTE(review): the original HTML template here was lost when the
            # source was extracted (raw tags were stripped). Rebuilt below from
            # the fields collected by _comments() — verify the markup against
            # the upstream project before relying on it.
            parts = [comment_style]
            for comment in comments:
                # assumes comment_ctime is an epoch timestamp in seconds — TODO confirm
                ctime = datetime.datetime.fromtimestamp(comment.get('comment_ctime', 0))
                parts.append(
                    f'<div class="comment">'
                    f'<img class="avatar" src="{comment.get("user_header", "")}" alt=""/>'
                    f'<span class="username">{comment.get("user_name", "")}</span>'
                    f'<span class="ctime">{ctime}</span>'
                    f'<p>{comment.get("comment_content", "")}</p>'
                    f'</div>'
                )
            temp = ''.join(parts)

        # Write audio link (if any), article body, then comments.
        with open(file_path, 'w', encoding='utf-8') as f:
            if audio:
                # NOTE(review): the original audio markup was also stripped in
                # extraction; a plain <audio> tag is used here.
                audio_text = f'<audio controls="controls" src="{audio}"></audio>\n'
                f.write(audio_text)
            f.write(content + temp)


def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments=False):
    """ End-to-end flow: login -> list courses -> list articles -> save each
    article, with throttling and periodic session refresh. """
    global FINISH_ARTICLES
    global ALL_ARTICLES

    geek = GeekCrawler(cellphone, passwd, exclude=exclude)
    geek._login()     # log in first
    geek._product()   # then fetch the course list

    number = 0

    for pro in geek.products:
        geek._articles(pro['id'], pro)  # fetch the article-ID list

        article_ids = pro['article_ids']
        for aid in article_ids:
            # Bug fix: FINISH_ARTICLES holds strings (loaded from disk) while
            # ALL_ARTICLES holds ints (from the API); the original == on raw
            # sets could never match across runs. Compare as strings, and use
            # subset since the file may contain IDs from excluded courses.
            if {str(i) for i in ALL_ARTICLES} <= {str(i) for i in FINISH_ARTICLES}:
                log.info("正常抓取完成啦,不用再继续跑脚本了。")
                sys.exit(0)  # bug fix: clean completion exits 0, not 1

            if str(aid) in FINISH_ARTICLES:
                continue
            geek._article(aid, pro, file_type=file_type, get_comments=get_comments)  # fetch one article
            time.sleep(5)  # throttle so the API does not rate-limit us
            number += 1
            # After 37 consecutive fetches, pause 10s and refresh the session.
            if number == 37:
                log.info("抓取达到37次了,先暂停 10s 再继续。")
                time.sleep(10)
                number = 0  # restart the counter
                geek._user_auth()
    _save_finish_article_id_to_file()
    log.info("正常抓取完成。")


if __name__ == "__main__":
    # Option 1: hard-code the credentials here.
    # cellphone = ""
    # pwd = ""

    # Option 2 (default): prompt for the credentials on every run.
    cellphone = str(input("请输入你的极客时间账号(手机号): "))
    pwd = str(input("请输入你的极客时间密码: "))

    # Courses to skip, e.g. ones already downloaded.
    # exclude = ['左耳听风', '趣谈网络协议']
    exclude = []

    # Output extension; prefer .md or .html.
    file_type = '.md'

    # Whether to fetch comments; they render better in HTML than in Markdown.
    get_comments = False  # True

    try:
        FINISH_ARTICLES = _load_finish_article()
        # Bug fix: file_type was previously defined but never passed to run(),
        # so the .md/.html choice had no effect.
        run(cellphone, pwd, exclude=exclude, file_type=file_type, get_comments=get_comments)
    except Exception:
        log.error(f"请求过程中出错了,出错信息为:{traceback.format_exc()}")
    finally:
        # Always persist progress so a re-run resumes where this one stopped.
        _save_finish_article_id_to_file()