├── .gitignore ├── LICENSE ├── Pipfile ├── README.md ├── bin ├── .config │ └── systemd │ │ └── user │ │ ├── newspaper_spider.service │ │ ├── newspaper_spider.timer │ │ └── newspaper_web.service ├── git-sync.sh ├── obsoleted │ ├── restart.sh │ ├── start.sh │ └── stop.sh └── update_systemd_config.py ├── crawl_history.py ├── crawl_online.py ├── crawl_test.py ├── db_backup.py ├── db_sql.py ├── deploy.md ├── newspaper ├── api.py ├── config.py ├── crawler │ ├── main.py │ ├── sources.py │ └── spiders.py ├── loggers.py ├── models.py ├── server.py ├── static │ └── favicon.ico ├── templates │ ├── articles.html │ └── daily_python.html ├── utils.py └── views.py └── run_server.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # local test python script 107 | temp.py 108 | .vscode 109 | logs/ 110 | *.sqlite 111 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 pyld 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = false 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | uvloop = {version = "*",sys_platform = "!= 'win32'"} 10 | torequests = ">=5.0.10" 11 | starlette = "*" 12 | uvicorn = ">=0.11.8" 13 | aiomysql = "*" 14 | lxml = "*" 15 | cssselect = "*" 16 | async-lru = "*" 17 | aiofiles = "*" 18 | jinja2 = "*" 19 | 20 | [requires] 21 | python_version = "3.7" 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-newspaper 2 | 3 | ## 服务器将与 2021.4.1 关停不再续费. sqlite 备份在 Release 里, 备份时间 2021-03-07. 之后会考虑通过另一个项目 Watchdogs 开展订阅. 4 | 5 | ### Timeline 6 | - [For English-only reader](https://www.clericpy.top/newspaper/articles.query.html?lang=EN) 7 | - [中文读者](https://www.clericpy.top/newspaper/articles.query.html?lang=CN) 8 | - [中英文读者](https://www.clericpy.top/newspaper/articles.query.html) 9 | 10 | ### RSS 日报 11 | - [For English-only reader](https://www.clericpy.top/newspaper/daily.python.list.rss.en) 12 | - [中文读者](https://www.clericpy.top/newspaper/daily.python.list.rss.cn) 13 | - [中英文读者](https://www.clericpy.top/newspaper/daily.python.list.rss.any) 14 | 15 | ### 当前进度: 16 | 17 | - [x] 购买服务器 18 | - [x] 准备备用服务器 19 | - [x] 确认内容源 20 | - [x] 准备服务器 21 | - [x] 开发 22 | - [x] 测试 23 | - [x] 上线 24 | - [x] 补充内容源 25 | - [ ] Python 日报静态页面 github pages + rss 26 | - [ ] 人工筛选生成日报/周报, 公众号推送 27 | - [ ] 实现订阅源过滤功能 28 | 29 | ### 内容源列表 30 | 31 | **内容源高分标准**: 32 | 33 | 1. 原创为主 34 | 2. 更新频率较低 35 | 3. 文章质量好 36 | 4. 信息量大, 周报 37 | 5. 
广为人知 38 | 39 | 40 | 41 | * 收录进度: 37 / 37 42 | 43 | > = 待收录 | √ 已收录 | X 不收录 | - 入库不追更 44 | 45 | | 序号 | 名称 | 评分 | 语言 | 收录 | 描述 | 46 | | ---- | ---- | ---- | ---- | ---- | ---- | 47 | | 1 | [Python Software Foundation News](https://pyfound.blogspot.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Software+Foundation+News) | [墙] 来自 Python 软件基金会的消息 | 48 | | 2 | [Python Weekly](https://www.pythonweekly.com/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Weekly) | 必备周报 | 49 | | 3 | [PyCoder's Weekly](https://pycoders.com/issues) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=PyCoder%27s+Weekly) | 必备周报 | 50 | | 4 | [Import Python](https://importpython.com/newsletter/archive/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Import+Python) | 必备周报, 2019.1.11 停更了, 希望早日康复~ | 51 | | 5 | [Awesome Python Newsletter](https://python.libhunt.com/newsletter/archive) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Awesome+Python+Newsletter) | 必备周报 | 52 | | 6 | [Real Python](https://realpython.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Real+Python) | 文章质量高, 更新较少 | 53 | | 7 | [Planet Python](https://planetpython.org) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Planet+Python) | 官方推荐的博客, 收录了很多博主 | 54 | | 8 | [Julien Danjou](https://julien.danjou.info) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Julien+Danjou) | 文章质量不错, 保持更新 | 55 | | 9 | [Doug Hellmann](https://doughellmann.com/blog/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Doug+Hellmann) | 大名鼎鼎, 文章质量很高 | 56 | | 10 | [The Mouse Vs. The Python](https://www.blog.pythonlibrary.org) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Mouse+Vs.+The+Python) | 文章质量不错 | 57 | | 11 | [InfoQ](https://www.infoq.cn/topic/python) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=InfoQ) | 原创/译文的质量不错 | 58 | | 12 | [Jeff Knupp](https://jeffknupp.com/) | 4 | EN | X | [墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录 | 59 | | 13 | [Hacker News](https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Hacker+News) | 大名鼎鼎的 HN | 60 | | 14 | [Python Insider](https://blog.python.org/) | 3 | EN | X | 官方开发进度, 被官博和 planetPython 包含, 所以不需要收录. 
| 61 | | 15 | [Brett Cannon](https://snarky.ca/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Brett+Cannon) | 核心开发者 | 62 | | 16 | [Encode](https://www.encode.io/) | 3 | EN | X | 知名 Python 开源组织, 文章太少, 暂不收录 | 63 | | 17 | [机器之心](https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%9C%BA%E5%99%A8%E4%B9%8B%E5%BF%83) | 知名公众号 | 64 | | 18 | [依云's Blog](https://blog.lilydjwg.me/tag/python?page=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E4%BE%9D%E4%BA%91%27s+Blog) | 文章质量很高 | 65 | | 19 | [DEV Community](https://dev.to/t/python/latest) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=DEV+Community) | 算是个挺好的社区, post 也都不太水 | 66 | | 20 | [Python猫](https://zhuanlan.zhihu.com/pythonCat) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%8C%AB) | 2018 年末比较热情的博主, 原创 + 优质译文 | 67 | | 21 | [Python之美](https://zhuanlan.zhihu.com/python-cn) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%BE%8E) | 早期文章较多, 创业以后更新不太多了 | 68 | | 22 | [静觅](https://cuiqingcai.com/category/technique/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E9%9D%99%E8%A7%85) | 崔庆才的个人博客, 保持更新的原创博主 | 69 | | 23 | [推酷(中文)](https://www.tuicool.com/topics/11130000?st=0&lang=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E4%B8%AD%E6%96%87%29) | 推文类站点. 按热门排序 | 70 | | 24 | [推酷(英文)](https://www.tuicool.com/topics/11130000?st=0&lang=2) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E8%8B%B1%E6%96%87%29) | 推文类站点. 按热门排序 | 71 | | 25 | [开发者头条](https://toutiao.io/tags/python?type=latest) | 3 | CN | X | 推文类站点, 但是没有发布时间, 暂不收录 | 72 | | 26 | [稀土掘金](https://juejin.im/tag/Python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%A8%80%E5%9C%9F%E6%8E%98%E9%87%91) | 推文类站点. 
按热门排序 | 73 | | 27 | [Python部落](https://python.freelycode.com/contribution/list/0?page_no=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E9%83%A8%E8%90%BD) | 推文+译文 | 74 | | 28 | [miguelgrinberg](https://blog.miguelgrinberg.com/index) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=miguelgrinberg) | Web 开发相关的内容挺多, 质量较高 | 75 | | 29 | [Ned Batchelder](https://nedbatchelder.com/blog/tag/python.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Ned+Batchelder) | 热门博主。planetpython 也有 | 76 | | 30 | [Full Stack Python](https://www.fullstackpython.com/blog.html) | 3 | EN | X | 热门博主。planetpython 有了, 文章比较少, 暂不收录 | 77 | | 31 | [Eli Bendersky's website](https://eli.thegreenplace.net/tag/python) | 3 | EN | X | 值得一看,planetpython 有, 暂不收录 | 78 | | 32 | [Manjusaka](https://manjusaka.itscoder.com/tags/Python/) | 3 | CN | X | 原创还不错, 但是文章较少, 暂不收录 | 79 | | 33 | [Python程序员](https://zhuanlan.zhihu.com/pythoncxy) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%A8%8B%E5%BA%8F%E5%91%98) | 关注破万的知乎专栏 | 80 | | 34 | [Python头条](https://zhuanlan.zhihu.com/c_111369541) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E5%A4%B4%E6%9D%A1) | 关注破万的知乎专栏 | 81 | | 35 | [the5fire的技术博客](https://www.the5fire.com/category/python/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=the5fire%E7%9A%84%E6%8A%80%E6%9C%AF%E5%8D%9A%E5%AE%A2) | 保持更新的热门中文博主 | 82 | | 36 | [Python之禅](https://foofish.net/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%A6%85) | 文章较基础, 质量不错 | 83 | | 37 | [V2EX](https://www.v2ex.com/go/python) | 3 | CN | X | 社区类, api 失效, web 端乱七八糟的, 不收录 | 84 | | 38 | [伯乐在线](http://python.jobbole.com/all-posts/) | 3 | CN | X | 有点类似推酷, 质量参差不齐. 
HTTP ERROR 503 | 85 | | 39 | [Python 3 Module of the Week](https://pymotw.com/3/) | 3 | EN | X | 看起来不怎么更新了, 暂不收录 | 86 | | 40 | [The Invent with Python Blog](https://inventwithpython.com/blog/index.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Invent+with+Python+Blog) | 感觉不错 | 87 | | 41 | [Armin Ronacher's Thoughts and Writings](http://lucumr.pocoo.org/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Armin+Ronacher%27s+Thoughts+and+Writings) | Flask 作者 Armin Ronacher | 88 | | 42 | [aio-libs](https://groups.google.com/forum/#!forum/aio-libs) | 3 | EN | X | 知名 Python 开源组织, 不过没有文章类的 post | 89 | | 43 | [码农周刊](https://weekly.manong.io/issues/) | 3 | CN | X | 课外读物, 非 Python 主题, 暂不收录 | 90 | | 44 | [编程派](http://codingpy.com/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%BC%96%E7%A8%8B%E6%B4%BE) | 原创+译文 | 91 | | 45 | [峰云就她了](http://xiaorui.cc/archives/category/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E5%B3%B0%E4%BA%91%E5%B0%B1%E5%A5%B9%E4%BA%86) | 原创博客, 质量比较不错 | 92 | | 46 | [Dan Bader](https://dbader.org/blog/) | 3 | EN | X | 一年不更新了, 先不收录了 | 93 | | 47 | [Pythonic Perambulations](https://jakevdp.github.io/) | 3 | EN | X | 最后更新 Thu 13 September 2018, 暂不收录 | 94 | | 48 | [开源中国翻译](https://www.oschina.net/translate/tag/python) | 3 | CN | X | 入库留着吧, 估计不更了, 暂不收录 | 95 | | 49 | [Trey Hunner](https://treyhunner.com/blog/archives/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Trey+Hunner) | Help developers level-up their Python skills | 96 | | 50 | [Python Central](https://www.pythoncentral.io/) | 3 | EN | X | 不更新了, 暂不收录 | 97 | | 51 | [Inside the Head of PyDanny](https://www.pydanny.com/) | 3 | EN | X | 不更新了, 暂不收录 | 98 | | 52 | [华蟒用户组,CPyUG](https://groups.google.com/forum/#!forum/python-cn) | 3 | EN | X | [墙] 社区类, 自己看看就好, 暂不收录 | 99 | | 53 | [Treehl](https://family-treesy.github.io/tags/PYTHON/) | 3 | CN | X | 文章较基础, 久不更新, 暂不收录 | 100 | | 54 | [蠎周刊](http://weekly.pychina.org) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E8%A0%8E%E5%91%A8%E5%88%8A) | 各种 weekly 中译版 | 101 | | 55 | [zzzeek](https://techspot.zzzeek.org/) | 3 | EN | X | 2016 年后停更了 | 102 | | 56 | [Yu’s blog](https://gofisher.github.io/) | 3 | CN | X | 原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了 | 103 | | 57 | [程序师](http://www.techug.com/tag/python) | 3 | CN | X | 原创较少, 文章较旧 | 104 | | 58 | [一根笨茄子](http://blog.guoyb.com/tags/Python/) | 3 | CN | X | 文章更新较少, 质量参差 | 105 | | 59 | [追梦人物](https://www.zmrenwu.com/) | 2 | CN | X | 像个学习博客 | 106 | | 60 | [anshengme](https://blog.ansheng.me/) | 2 | CN | X | 质量一般 | 107 | | 61 | [Pegasus](http://ningning.today/categories/python/) | 2 | CN | X | 不怎么更新 | 108 | | 62 | [FunHacks](https://funhacks.net/categories/Python/) | 2 | CN | X | 太久不更新了, 不过python 之旅还行 | 109 | | 63 | [Peter Norvig's essays](http://norvig.com/) | 2 | EN | X | 这排版驾驭不了... 
| 110 | | 64 | [Peterbe.com](https://www.peterbe.com/plog/) | 2 | EN | X | 不是太值得收录 | 111 | | 65 | [Python Tips](https://pythontips.com/) | 2 | EN | X | 很火, 但我不喜欢 | 112 | | 66 | [脚本之家](https://www.jb51.net/list/list_97_1.htm) | 2 | CN | X | 文章的质量啊~~~ | 113 | | 67 | [开源中国搜索](https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1) | 2 | CN | X | 质量不太高 | 114 | | 68 | [伯乐在线头条](http://top.jobbole.com/tag/python/?sort=latest) | 2 | CN | X | 停更 | 115 | | 69 | [代码片段](http://www.phpxs.com/code/python) | 2 | CN | X | 文章太老了, 停更了 | 116 | | 70 | [segmentfault](https://segmentfault.com/t/python/blogs) | 2 | CN | X | 文章质量 | 117 | | 71 | [Python China](http://python-china.org/api/topics/timeline) | 2 | CN | X | 欠费网站挂了 | 118 | | 72 | [麦穗技术](http://www.58maisui.com/category/python/) | 2 | CN | X | 网站挂了 | 119 | | 73 | [CSDN](https://so.csdn.net/so/search/s.do?q=python&t=blog&u=) | 1 | CN | X | 文章质量啊~~~ | 120 | | 74 | [Stack Overflow](https://stackoverflow.com/?tab=hot) | 3 | EN | X | 已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录 | 121 | | 75 | [Reddit](https://www.reddit.com/r/Python/top/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Reddit) | 知名社区. 质量参差, 收录每日 ups>=20 | 122 | | 76 | [码天狗](https://weekly.codetengu.com/issues) | 4 | CN | X | 综合类周报, 2018-11-23 之后不更了. 挂了, 下线. | 123 | | 77 | [Medium](https://medium.com/tag/python) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Medium) | Medium 的 Python tag, 收录 RSS | 124 | 125 | 126 | 127 | 128 | ### 声明 129 | 130 | 1. 非盈利项目, 主要动机是 Python 语言学以致用, 并给大家提供学习 Python 的渠道 131 | 2. 若有侵权行为, 请在 Issues 留言, 将进行下线处理 132 | 3. 欢迎提交 PR, 欢迎在 Issues 留言提供优质内容源 133 | 4. 纯中文项目, 精力有限, 暂时先不管老外了 134 | -------------------------------------------------------------------------------- /bin/.config/systemd/user/newspaper_spider.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=newspaper spider service 3 | 4 | [Service] 5 | Type=simple 6 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py 7 | WorkingDirectory=/root/newspaper 8 | 9 | [Install] 10 | WantedBy=multi-user.target 11 | WantedBy=network-online.target 12 | -------------------------------------------------------------------------------- /bin/.config/systemd/user/newspaper_spider.timer: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=newspaper spider timer 3 | 4 | [Timer] 5 | OnBootSec=10min 6 | OnUnitActiveSec=15min 7 | Unit=newspaper_spider.service 8 | 9 | [Install] 10 | WantedBy=multi-user.target 11 | WantedBy=network-online.target 12 | -------------------------------------------------------------------------------- /bin/.config/systemd/user/newspaper_web.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=newspaper web service 3 | 4 | [Service] 5 | Type=simple 6 | ExecStart=/usr/local/bin/pipenv run python run_server.py 7 | WorkingDirectory=/root/newspaper 8 | [Install] 9 | WantedBy=multi-user.target 10 | WantedBy=network-online.target 11 | -------------------------------------------------------------------------------- /bin/git-sync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git fetch; git reset --hard origin/master 3 | -------------------------------------------------------------------------------- /bin/obsoleted/restart.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$(cd `dirname $0`; pwd) 3 | cd $DIR 4 | sh stop.sh 5 | sh start.sh > /dev/null 6 | -------------------------------------------------------------------------------- /bin/obsoleted/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$(cd `dirname $0`/..; pwd) 3 | cd $DIR 4 | nohup pipenv run python run_server.py & 5 | echo "server started" 6 | echo 7 | -------------------------------------------------------------------------------- /bin/obsoleted/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux|grep 'newspaper-'|grep 'run_server.py'|awk '{print $2}'|xargs kill 3 | echo "server stoped" 4 | echo 5 | -------------------------------------------------------------------------------- /bin/update_systemd_config.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | user_systemd_dir = pathlib.Path.home() / '.config/systemd/user' 3 | if not user_systemd_dir.is_dir(): 4 | user_systemd_dir.mkdir() 5 | 6 | newspaper_product_dir = pathlib.Path( 7 | __file__).absolute().parent.parent.absolute() 8 | 9 | # web 服务启动 10 | 11 | newspaper_web_service = fr''' 12 | [Unit] 13 | Description=newspaper web service 14 | 15 | [Service] 16 | Type=simple 17 | ExecStart=/usr/local/bin/pipenv run python run_server.py 18 | WorkingDirectory={newspaper_product_dir} 19 | [Install] 20 | WantedBy=multi-user.target 21 | WantedBy=network-online.target 22 | ''' 23 | newspaper_web_service_fp = user_systemd_dir / 'newspaper_web.service' 24 | newspaper_web_service_fp.write_text(newspaper_web_service, encoding='utf-8') 25 | 26 | # 爬虫服务 27 | 28 | newspaper_spider_service = fr''' 29 | [Unit] 30 | Description=newspaper spider service 31 | 32 | [Service] 33 | Type=simple 34 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py 35 | WorkingDirectory={newspaper_product_dir} 36 | 37 | [Install] 38 | WantedBy=multi-user.target 39 | WantedBy=network-online.target 40 | ''' 41 | newspaper_spider_service_fp = user_systemd_dir / 'newspaper_spider.service' 42 | newspaper_spider_service_fp.write_text(newspaper_spider_service, 43 | encoding='utf-8') 44 | 45 | # 爬虫定时器 46 | 47 | newspaper_spider_timer = r''' 48 | [Unit] 49 | Description=newspaper spider timer 50 | 51 | [Timer] 52 | OnBootSec=10min 53 | OnUnitActiveSec=15min 54 | Unit=newspaper_spider.service 55 | 56 | [Install] 57 | WantedBy=multi-user.target 58 | WantedBy=network-online.target 59 | ''' 60 | newspaper_spider_timer_fp = user_systemd_dir / 'newspaper_spider.timer' 61 | newspaper_spider_timer_fp.write_text(newspaper_spider_timer, encoding='utf-8') 62 | -------------------------------------------------------------------------------- /crawl_history.py: -------------------------------------------------------------------------------- 1 | from newspaper.crawler.main import history_workflow 2 | import asyncio 3 | """ 4 | 采集历史文章脚本 5 | 1. 本地脚本 6 | 2. 执行历史文章抓取任务, 并将文章入库 7 | 3. 将需要抓历史文章的内容源的函数加装饰器 register_history, 就会被自动调用 8 | 4. 
一般抓历史文章的任务只在第一次收录时候使用, 后期使用 online_spiders 保持更新 9 | """ 10 | 11 | if __name__ == "__main__": 12 | loop = asyncio.get_event_loop() 13 | loop.run_until_complete(history_workflow()) 14 | -------------------------------------------------------------------------------- /crawl_online.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | 采集线上爬虫脚本 4 | 1. 本地脚本 / 线上脚本 皆可. crontab 任务 5 | 2. 执行执行常规抓取任务, 并将文章入库 6 | 3. 将需要文章的内容源的函数加装饰器 register_online, 就会被自动调用 7 | """ 8 | from newspaper.crawler.main import online_workflow 9 | import asyncio 10 | loop = asyncio.get_event_loop() 11 | loop.run_until_complete(online_workflow()) 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /crawl_test.py: -------------------------------------------------------------------------------- 1 | def test(): 2 | from newspaper.crawler.main import test_spider_workflow 3 | import asyncio 4 | loop = asyncio.get_event_loop() 5 | loop.run_until_complete(test_spider_workflow()) 6 | 7 | 8 | if __name__ == "__main__": 9 | test() 10 | -------------------------------------------------------------------------------- /db_backup.py: -------------------------------------------------------------------------------- 1 | #! pipenv run python 2 | """ 3 | 从线上拉数据到本地备份 sqlite 4 | """ 5 | import re 6 | 7 | from torequests import tPool 8 | from torequests.utils import ttime, time 9 | 10 | from newspaper.models import Sqlite3Storage, logger 11 | from newspaper.config import ONLINE_HOST 12 | 13 | 14 | def fetch_artcles(ts_start): 15 | req = tPool() 16 | api = f'https://{ONLINE_HOST}/newspaper/articles.query.json' 17 | next_url = '' 18 | start_params = { 19 | 'query': '', 20 | 'start_time': ts_start, 21 | 'end_time': '', 22 | 'source': '', 23 | 'lang': 'ANY', 24 | 'order_by': 'ts_update', 25 | 'sorting': 'asc', 26 | 'limit': '100', 27 | 'offset': '0', 28 | } 29 | 30 | while 1: 31 | params = {} if next_url else start_params 32 | # 没有 next_url 的时候访问第一页, 有的时候访问 next_url 33 | url = next_url or api 34 | r = req.get(url, params=params, retry=2, timeout=10) 35 | if not r.x: 36 | logger.error(f'req init failed: {r.x}, {r.text}') 37 | raise IOError 38 | rj = r.json() 39 | articles = rj.get('articles', []) 40 | if articles: 41 | yield articles 42 | next_url = rj.get('next_url', '') 43 | if not (articles and next_url): 44 | # 没有文章, 并没有下一页 45 | logger.info(f'fetch_artcles finished, last url: {url}') 46 | return 47 | next_url = re.sub('^/', f'https://{ONLINE_HOST}/', next_url) 48 | 49 | 50 | def get_ts_latest(cursor): 51 | cursor.execute('select max(ts_update) from articles') 52 | result = cursor.fetchone()[0] 53 | if result: 54 | return result 55 | else: 56 | return ttime(0) 57 | 58 | 59 | def main(): 60 | db = Sqlite3Storage(file_path='backup.sqlite') 61 | db._ensure_article_table_exists() 62 | ts_latest = get_ts_latest(db.cursor) 63 | logger.info(f'sync articles from online api: ts_latest={ts_latest}') 64 | article_cnt = 0 65 | for articles in fetch_artcles(ts_latest): 66 | db.add_articles(articles) 67 | article_cnt += len(articles) 68 | logger.info(f'+ {len(articles)} articles => {article_cnt}') 69 | logger.info(f'+ {article_cnt} new articles.') 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | time.sleep(3) 75 | -------------------------------------------------------------------------------- /db_sql.py: -------------------------------------------------------------------------------- 1 | #! 
pipenv run python 2 | """ 3 | 同时执行线上先下数据库 4 | """ 5 | import asyncio 6 | import traceback 7 | import logging 8 | 9 | from newspaper.config import init_db 10 | from newspaper.models import Sqlite3Storage, logger 11 | 12 | 13 | async def main(): 14 | db = Sqlite3Storage(file_path='backup.sqlite') 15 | db._ensure_article_table_exists() 16 | mysql = init_db() 17 | logger.setLevel(logging.WARNING) 18 | while 1: 19 | # select count(*) from articles 20 | # select count(*) from articles where `desc` like '%本文分享 方巍%' 21 | sql = input('Input SQL:\n') 22 | if not sql: 23 | break 24 | try: 25 | print(sql) 26 | db.cursor.execute(sql) 27 | logger.warning(f'Sqlite3Storage: {db.cursor.fetchall()}') 28 | result = await mysql.execute(sql) 29 | logger.warning(f'MysqlStorage: {result}') 30 | except KeyboardInterrupt: 31 | break 32 | except Exception: 33 | traceback.print_exc() 34 | 35 | 36 | if __name__ == "__main__": 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /deploy.md: -------------------------------------------------------------------------------- 1 | 2 | ## 首次部署 3 | 0. install python 3.7+ 4 | 1. git clone ... 5 | 2. pipenv install 6 | 3. python3.7 update_systemd_config.py 7 | 4. 新建 JSON 格式配置文件 /var/newspaper.conf 8 | 1. {"anti_gfw": {"url": "这里填写翻墙服务的地址, 如果没有则使用 http://localhost"}, "mysql_config": {"mysql_host": "", "mysql_port": 3306, "mysql_user": "", "mysql_password": "", "mysql_db": "db"}} 9 | 2. 当然环境变量 export newspaper_config 上面的 JSON 也是可以的 10 | 5. systemctl --user enable newspaper_web.service; systemctl --user start newspaper_web.service 11 | 6. systemctl --user enable newspaper_spider.timer; systemctl --user start newspaper_spider.timer 12 | 7. 绑定域名, 并配置 nginx 托管相关端口, 支持 SSL 13 | 14 | 15 | 16 | 17 | ### vscode task 升级更新脚本 18 | ```git co master ; git merge dev; git push; git co dev;ssh aliyun 'cd newspaper/bin;sh git-sync.sh;python3.7 update_systemd_config.py;systemctl daemon-reload;systemctl --user restart newspaper_web.service'``` 19 | -------------------------------------------------------------------------------- /newspaper/api.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | 3 | import pathlib 4 | 5 | from starlette.applications import Starlette 6 | from starlette.staticfiles import StaticFiles 7 | from starlette.templating import Jinja2Templates 8 | 9 | from .config import init_db, global_configs 10 | from .loggers import logger 11 | 12 | static_dir = pathlib.Path(__file__).absolute().parent / 'static' 13 | templates_dir = pathlib.Path(__file__).absolute().parent / 'templates' 14 | 15 | app = Starlette() 16 | app.mount('/static', StaticFiles(directory=str(static_dir)), name='static') 17 | app.config = global_configs 18 | app.logger = logger 19 | app.db = init_db() 20 | app.templates = Jinja2Templates(directory=str(templates_dir)) 21 | 22 | 23 | @app.on_event('startup') 24 | async def _ensure_article_table_exists(): 25 | await app.db._ensure_article_table_exists() 26 | -------------------------------------------------------------------------------- /newspaper/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | 6 | 7 | def init_config(): 8 | global_configs = os.getenv( 9 | 'newspaper_config', 10 | None) or pathlib.Path('/var/newspaper.conf').read_text() 11 | if global_configs: 12 | global_configs = json.loads(global_configs) 13 | else: 14 | newspaper_config_template = '{"anti_gfw": {"url": "xxx"}, "mysql_config": {"mysql_host": "xxx", "mysql_port": 0, "mysql_user": "xxx", "mysql_password": "xxx", "mysql_db": "xxx"}}' 15 | logging.error( 16 | f'environment variable `newspaper_config` not found, it should be set as json like: {newspaper_config_template}' 17 | ) 18 | raise RuntimeError('environment variable `newspaper_config` not found') 19 | return global_configs 20 | 21 | 22 | def init_db(): 23 | from .models import MySQLStorage 24 | db = MySQLStorage(global_configs['mysql_config']) 25 | return db 26 | 27 | 28 | global_configs = init_config() 29 | ONLINE_HOST = 'www.clericpy.top' 30 | GA_ID = 'UA-150991415-2' 31 | BEIAN_ID = '鲁ICP备19021778号-1' 32 | -------------------------------------------------------------------------------- /newspaper/crawler/main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from torequests.dummy import Requests 4 | 5 | from ..config import init_db 6 | from ..loggers import spider_logger 7 | from .spiders import history_spiders, online_spiders 8 | 9 | db = init_db() 10 | 11 | 12 | async def test_spider_workflow(): 13 | from .spiders import test_spiders 14 | from ..models import Storage 15 | from pprint import pprint 16 | 17 | for func in test_spiders: 18 | print('test start:', func.__doc__) 19 | articles = await func() 20 | articles = Storage.ensure_articles(articles) 21 | # check schema 22 | for item in articles: 23 | assert (not item.get('desc')) or isinstance(item['desc'], str) 24 | assert (not item.get('ts_publish')) or isinstance( 25 | item['ts_publish'], str) 26 | assert (not item.get('cover')) or isinstance(item['cover'], str) 27 | assert isinstance(item.get('source'), str) 28 | assert isinstance(item.get('title'), str) 29 | assert isinstance(item.get('url'), str) 30 | pprint(articles) 31 | 32 | 33 | async def clear_cache(): 34 | url = 'http://127.0.0.1:9001/newspaper/articles.cache.clear' 35 | req = Requests() 36 | r = await req.get(url, timeout=2) 37 | spider_logger.info(f'clear_cache {r.text}') 38 | 39 | 40 | async def online_workflow(): 41 | if not online_spiders: 42 | spider_logger.info('no online_spiders online.') 43 | return 44 | # 确认 
articles 表存在, 否则建表 45 | await db._ensure_article_table_exists() 46 | # 生成一个 function name → source name 的映射 47 | function_sources = {func.__name__: func.__doc__ for func in online_spiders} 48 | coros = [func() for func in online_spiders] 49 | done, fail = await asyncio.wait(coros, timeout=120) 50 | spider_logger.info(f'{"=" * 30}') 51 | if fail: 52 | fail_names = [ 53 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}' 54 | for idx, task in enumerate(fail, 1) 55 | ] 56 | spider_logger.warn( 57 | f'timeout spiders({len(fail)}): {fail_names}') 58 | pool = await db.get_pool() 59 | async with pool.acquire() as conn: 60 | async with conn.cursor() as cursor: 61 | for idx, task in enumerate(done, 1): 62 | articles = task.result() 63 | func_name = task._coro.__name__ 64 | source_name = function_sources.get(func_name, func_name) 65 | if articles: 66 | insert_result = await db.add_articles(articles, 67 | cursor=cursor) 68 | else: 69 | insert_result = 0 70 | spider_logger.info( 71 | f'{idx: 3}. {"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]' 72 | ) 73 | await clear_cache() 74 | 75 | 76 | async def history_workflow(): 77 | if not history_spiders: 78 | spider_logger.info('ignore for no history_spiders online.') 79 | return 80 | # 确认 articles 表存在, 否则建表 81 | await db._ensure_article_table_exists() 82 | # 生成一个 function name → source name 的映射 83 | function_sources = {func.__name__: func.__doc__ for func in history_spiders} 84 | coros = [func() for func in history_spiders] 85 | done, fail = await asyncio.wait(coros, timeout=9999) 86 | spider_logger.info(f'{"=" * 30}') 87 | if fail: 88 | fail_names = [ 89 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}' 90 | for idx, task in enumerate(fail, 1) 91 | ] 92 | spider_logger.warn( 93 | f'timeout spiders({len(fail)}): {fail_names}') 94 | pool = await db.get_pool() 95 | async with pool.acquire() as conn: 96 | async with conn.cursor() as cursor: 97 | for idx, task in enumerate(done, 1): 98 | articles = task.result() 99 | func_name = task._coro.__name__ 100 | source_name = function_sources.get(func_name, func_name) 101 | if articles: 102 | insert_result = await db.add_articles(articles, 103 | cursor=cursor) 104 | else: 105 | insert_result = 0 106 | spider_logger.info( 107 | f'{idx: 3}. 
{"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]' 108 | ) 109 | await clear_cache() 110 | -------------------------------------------------------------------------------- /newspaper/crawler/sources.py: -------------------------------------------------------------------------------- 1 | from torequests.utils import quote_plus 2 | import sys 3 | import pathlib 4 | sys.path.append(str(pathlib.Path(__file__).absolute().parent.parent)) 5 | from config import ONLINE_HOST 6 | 7 | content_sources = [ 8 | { 9 | "title": "Python Software Foundation News", 10 | "url": "https://pyfound.blogspot.com/", 11 | "level": 4, 12 | "lang": "EN", 13 | "status": "√", 14 | "desc": "[墙] 来自 Python 软件基金会的消息" 15 | }, 16 | { 17 | "title": "Python Weekly", 18 | "url": "https://www.pythonweekly.com/", 19 | "level": 5, 20 | "lang": "EN", 21 | "status": "√", 22 | "desc": "必备周报" 23 | }, 24 | { 25 | "title": "PyCoder's Weekly", 26 | "url": "https://pycoders.com/issues", 27 | "level": 5, 28 | "lang": "EN", 29 | "status": "√", 30 | "desc": "必备周报" 31 | }, 32 | { 33 | "title": "Import Python", 34 | "url": "https://importpython.com/newsletter/archive/", 35 | "level": 5, 36 | "lang": "EN", 37 | "status": "√", 38 | "desc": "必备周报, 2019.1.11 停更了, 希望早日康复~" 39 | }, 40 | { 41 | "title": "Awesome Python Newsletter", 42 | "url": "https://python.libhunt.com/newsletter/archive", 43 | "level": 5, 44 | "lang": "EN", 45 | "status": "√", 46 | "desc": "必备周报" 47 | }, 48 | { 49 | "title": "Real Python", 50 | "url": "https://realpython.com/", 51 | "level": 4, 52 | "lang": "EN", 53 | "status": "√", 54 | "desc": "文章质量高, 更新较少" 55 | }, 56 | { 57 | "title": "Planet Python", 58 | "url": "https://planetpython.org", 59 | "level": 3, 60 | "lang": "EN", 61 | "status": "√", 62 | "desc": "官方推荐的博客, 收录了很多博主" 63 | }, 64 | { 65 | "title": "Julien Danjou", 66 | "url": "https://julien.danjou.info", 67 | "level": 4, 68 | "lang": "EN", 69 | "status": "√", 70 | "desc": "文章质量不错, 保持更新" 71 | }, 72 | { 73 | "title": "Doug Hellmann", 74 | "url": "https://doughellmann.com/blog/", 75 | "level": 4, 76 | "lang": "EN", 77 | "status": "√", 78 | "desc": "大名鼎鼎, 文章质量很高" 79 | }, 80 | { 81 | "title": "The Mouse Vs. The Python", 82 | "url": "https://www.blog.pythonlibrary.org", 83 | "level": 4, 84 | "lang": "EN", 85 | "status": "√", 86 | "desc": "文章质量不错" 87 | }, 88 | { 89 | "title": "InfoQ", 90 | "url": "https://www.infoq.cn/topic/python", 91 | "level": 4, 92 | "lang": "CN", 93 | "status": "√", 94 | "desc": "原创/译文的质量不错" 95 | }, 96 | { 97 | "title": "Jeff Knupp", 98 | "url": "https://jeffknupp.com/", 99 | "level": 4, 100 | "lang": "EN", 101 | "status": "X", 102 | "desc": "[墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录" 103 | }, 104 | { 105 | "title": "Hacker News", 106 | "url": "https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story", 107 | "level": 4, 108 | "lang": "EN", 109 | "status": "√", 110 | "desc": "大名鼎鼎的 HN" 111 | }, 112 | { 113 | "title": "Python Insider", 114 | "url": "https://blog.python.org/", 115 | "level": 3, 116 | "lang": "EN", 117 | "status": "X", 118 | "desc": "官方开发进度, 被官博和 planetPython 包含, 所以不需要收录." 
119 | }, 120 | { 121 | "title": "Brett Cannon", 122 | "url": "https://snarky.ca/", 123 | "level": 3, 124 | "lang": "EN", 125 | "status": "√", 126 | "desc": "核心开发者" 127 | }, 128 | { 129 | "title": "Encode", 130 | "url": "https://www.encode.io/", 131 | "level": 3, 132 | "lang": "EN", 133 | "status": "X", 134 | "desc": "知名 Python 开源组织, 文章太少, 暂不收录" 135 | }, 136 | { 137 | "title": "机器之心", 138 | "url": "https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time", 139 | "level": 3, 140 | "lang": "CN", 141 | "status": "√", 142 | "desc": "知名公众号" 143 | }, 144 | { 145 | "title": "依云's Blog", 146 | "url": "https://blog.lilydjwg.me/tag/python?page=1", 147 | "level": 3, 148 | "lang": "CN", 149 | "status": "√", 150 | "desc": "文章质量很高" 151 | }, 152 | { 153 | "title": "DEV Community", 154 | "url": "https://dev.to/t/python/latest", 155 | "level": 3, 156 | "lang": "EN", 157 | "status": "√", 158 | "desc": "算是个挺好的社区, post 也都不太水" 159 | }, 160 | { 161 | "title": "Python猫", 162 | "url": "https://zhuanlan.zhihu.com/pythonCat", 163 | "level": 3, 164 | "lang": "CN", 165 | "status": "√", 166 | "desc": "2018 年末比较热情的博主, 原创 + 优质译文" 167 | }, 168 | { 169 | "title": "Python之美", 170 | "url": "https://zhuanlan.zhihu.com/python-cn", 171 | "level": 3, 172 | "lang": "CN", 173 | "status": "√", 174 | "desc": "早期文章较多, 创业以后更新不太多了" 175 | }, 176 | { 177 | "title": "静觅", 178 | "url": "https://cuiqingcai.com/category/technique/python", 179 | "level": 3, 180 | "lang": "CN", 181 | "status": "√", 182 | "desc": " 崔庆才的个人博客, 保持更新的原创博主" 183 | }, 184 | { 185 | "title": "推酷(中文)", 186 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=1", 187 | "level": 3, 188 | "lang": "CN", 189 | "status": "√", 190 | "desc": "推文类站点. 按热门排序" 191 | }, 192 | { 193 | "title": "推酷(英文)", 194 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=2", 195 | "level": 3, 196 | "lang": "EN", 197 | "status": "√", 198 | "desc": "推文类站点. 按热门排序" 199 | }, 200 | { 201 | "title": "开发者头条", 202 | "url": "https://toutiao.io/tags/python?type=latest", 203 | "level": 3, 204 | "lang": "CN", 205 | "status": "X", 206 | "desc": "推文类站点, 但是没有发布时间, 暂不收录" 207 | }, 208 | { 209 | "title": "稀土掘金", 210 | "url": "https://juejin.im/tag/Python", 211 | "level": 3, 212 | "lang": "CN", 213 | "status": "√", 214 | "desc": "推文类站点. 
按热门排序" 215 | }, 216 | { 217 | "title": "Python部落", 218 | "url": "https://python.freelycode.com/contribution/list/0?page_no=1", 219 | "level": 3, 220 | "lang": "CN", 221 | "status": "√", 222 | "desc": "推文+译文" 223 | }, 224 | { 225 | "title": "miguelgrinberg", 226 | "url": "https://blog.miguelgrinberg.com/index", 227 | "level": 3, 228 | "lang": "EN", 229 | "status": "√", 230 | "desc": "Web 开发相关的内容挺多, 质量较高" 231 | }, 232 | { 233 | "title": "Ned Batchelder", 234 | "url": "https://nedbatchelder.com/blog/tag/python.html", 235 | "level": 3, 236 | "lang": "EN", 237 | "status": "√", 238 | "desc": "热门博主。planetpython 也有" 239 | }, 240 | { 241 | "title": "Full Stack Python", 242 | "url": "https://www.fullstackpython.com/blog.html", 243 | "level": 3, 244 | "lang": "EN", 245 | "status": "X", 246 | "desc": "热门博主。planetpython 有了, 文章比较少, 暂不收录" 247 | }, 248 | { 249 | "title": "Eli Bendersky's website", 250 | "url": "https://eli.thegreenplace.net/tag/python", 251 | "level": 3, 252 | "lang": "EN", 253 | "status": "X", 254 | "desc": "值得一看,planetpython 有, 暂不收录" 255 | }, 256 | { 257 | "title": "Manjusaka", 258 | "url": "https://manjusaka.itscoder.com/tags/Python/", 259 | "level": 3, 260 | "lang": "CN", 261 | "status": "X", 262 | "desc": "原创还不错, 但是文章较少, 暂不收录" 263 | }, 264 | { 265 | "title": "Python程序员", 266 | "url": "https://zhuanlan.zhihu.com/pythoncxy", 267 | "level": 3, 268 | "lang": "CN", 269 | "status": "√", 270 | "desc": "关注破万的知乎专栏" 271 | }, 272 | { 273 | "title": "Python头条", 274 | "url": "https://zhuanlan.zhihu.com/c_111369541", 275 | "level": 3, 276 | "lang": "CN", 277 | "status": "√", 278 | "desc": "关注破万的知乎专栏" 279 | }, 280 | { 281 | "title": "the5fire的技术博客", 282 | "url": "https://www.the5fire.com/category/python/", 283 | "level": 3, 284 | "lang": "CN", 285 | "status": "√", 286 | "desc": "保持更新的热门中文博主" 287 | }, 288 | { 289 | "title": "Python之禅", 290 | "url": "https://foofish.net/", 291 | "level": 3, 292 | "lang": "CN", 293 | "status": "√", 294 | "desc": "文章较基础, 质量不错" 295 | }, 296 | { 297 | "title": "V2EX", 298 | "url": "https://www.v2ex.com/go/python", 299 | "level": 3, 300 | "lang": "CN", 301 | "status": "X", 302 | "desc": "社区类, api 失效, web 端乱七八糟的, 不收录" 303 | }, 304 | { 305 | "title": "伯乐在线", 306 | "url": "http://python.jobbole.com/all-posts/", 307 | "level": 3, 308 | "lang": "CN", 309 | "status": "X", 310 | "desc": "有点类似推酷, 质量参差不齐. 
HTTP ERROR 503" 311 | }, 312 | { 313 | "title": "Python 3 Module of the Week", 314 | "url": "https://pymotw.com/3/", 315 | "level": 3, 316 | "lang": "EN", 317 | "status": "X", 318 | "desc": "看起来不怎么更新了, 暂不收录" 319 | }, 320 | { 321 | "title": "The Invent with Python Blog", 322 | "url": "https://inventwithpython.com/blog/index.html", 323 | "level": 3, 324 | "lang": "EN", 325 | "status": "√", 326 | "desc": "感觉不错" 327 | }, 328 | { 329 | "title": "Armin Ronacher's Thoughts and Writings", 330 | "url": "http://lucumr.pocoo.org/", 331 | "level": 3, 332 | "lang": "EN", 333 | "status": "√", 334 | "desc": "Flask 作者 Armin Ronacher" 335 | }, 336 | { 337 | "title": "aio-libs", 338 | "url": "https://groups.google.com/forum/#!forum/aio-libs", 339 | "level": 3, 340 | "lang": "EN", 341 | "status": "X", 342 | "desc": "知名 Python 开源组织, 不过没有文章类的 post" 343 | }, 344 | { 345 | "title": "码农周刊", 346 | "url": "https://weekly.manong.io/issues/", 347 | "level": 3, 348 | "lang": "CN", 349 | "status": "X", 350 | "desc": "课外读物, 非 Python 主题, 暂不收录" 351 | }, 352 | { 353 | "title": "编程派", 354 | "url": "http://codingpy.com/", 355 | "level": 3, 356 | "lang": "CN", 357 | "status": "√", 358 | "desc": "原创+译文" 359 | }, 360 | { 361 | "title": "峰云就她了", 362 | "url": "http://xiaorui.cc/archives/category/python", 363 | "level": 3, 364 | "lang": "CN", 365 | "status": "√", 366 | "desc": "原创博客, 质量比较不错" 367 | }, 368 | { 369 | "title": "Dan Bader", 370 | "url": "https://dbader.org/blog/", 371 | "level": 3, 372 | "lang": "EN", 373 | "status": "X", 374 | "desc": "一年不更新了, 先不收录了" 375 | }, 376 | { 377 | "title": "Pythonic Perambulations", 378 | "url": "https://jakevdp.github.io/", 379 | "level": 3, 380 | "lang": "EN", 381 | "status": "X", 382 | "desc": "最后更新 Thu 13 September 2018, 暂不收录" 383 | }, 384 | { 385 | "title": "开源中国翻译", 386 | "url": "https://www.oschina.net/translate/tag/python", 387 | "level": 3, 388 | "lang": "CN", 389 | "status": "X", 390 | "desc": "入库留着吧, 估计不更了, 暂不收录" 391 | }, 392 | { 393 | "title": "Trey Hunner", 394 | "url": "https://treyhunner.com/blog/archives/", 395 | "level": 3, 396 | "lang": "EN", 397 | "status": "√", 398 | "desc": "Help developers level-up their Python skills" 399 | }, 400 | { 401 | "title": "Python Central", 402 | "url": "https://www.pythoncentral.io/", 403 | "level": 3, 404 | "lang": "EN", 405 | "status": "X", 406 | "desc": "不更新了, 暂不收录" 407 | }, 408 | { 409 | "title": "Inside the Head of PyDanny", 410 | "url": "https://www.pydanny.com/", 411 | "level": 3, 412 | "lang": "EN", 413 | "status": "X", 414 | "desc": "不更新了, 暂不收录" 415 | }, 416 | { 417 | "title": "华蟒用户组,CPyUG", 418 | "url": "https://groups.google.com/forum/#!forum/python-cn", 419 | "level": 3, 420 | "lang": "EN", 421 | "status": "X", 422 | "desc": "[墙] 社区类, 自己看看就好, 暂不收录" 423 | }, 424 | { 425 | "title": "Treehl", 426 | "url": "https://family-treesy.github.io/tags/PYTHON/", 427 | "level": 3, 428 | "lang": "CN", 429 | "status": "X", 430 | "desc": "文章较基础, 久不更新, 暂不收录" 431 | }, 432 | { 433 | "title": "蠎周刊", 434 | "url": "http://weekly.pychina.org", 435 | "level": 4, 436 | "lang": "CN", 437 | "status": "√", 438 | "desc": "各种 weekly 中译版" 439 | }, 440 | { 441 | "title": "zzzeek", 442 | "url": "https://techspot.zzzeek.org/", 443 | "level": 3, 444 | "lang": "EN", 445 | "status": "X", 446 | "desc": "2016 年后停更了" 447 | }, 448 | { 449 | "title": "Yu’s blog", 450 | "url": "https://gofisher.github.io/", 451 | "level": 3, 452 | "lang": "CN", 453 | "status": "X", 454 | "desc": "原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了" 455 | }, 456 | { 457 | "title": "程序师", 458 | "url": 
"http://www.techug.com/tag/python", 459 | "level": 3, 460 | "lang": "CN", 461 | "status": "X", 462 | "desc": "原创较少, 文章较旧" 463 | }, 464 | { 465 | "title": "一根笨茄子", 466 | "url": "http://blog.guoyb.com/tags/Python/", 467 | "level": 3, 468 | "lang": "CN", 469 | "status": "X", 470 | "desc": "文章更新较少, 质量参差" 471 | }, 472 | { 473 | "title": "追梦人物", 474 | "url": "https://www.zmrenwu.com/", 475 | "level": 2, 476 | "lang": "CN", 477 | "status": "X", 478 | "desc": "像个学习博客" 479 | }, 480 | { 481 | "title": "anshengme", 482 | "url": "https://blog.ansheng.me/", 483 | "level": 2, 484 | "lang": "CN", 485 | "status": "X", 486 | "desc": "质量一般" 487 | }, 488 | { 489 | "title": "Pegasus", 490 | "url": "http://ningning.today/categories/python/", 491 | "level": 2, 492 | "lang": "CN", 493 | "status": "X", 494 | "desc": "不怎么更新" 495 | }, 496 | { 497 | "title": "FunHacks", 498 | "url": "https://funhacks.net/categories/Python/", 499 | "level": 2, 500 | "lang": "CN", 501 | "status": "X", 502 | "desc": "太久不更新了, 不过python 之旅还行" 503 | }, 504 | { 505 | "title": "Peter Norvig's essays", 506 | "url": "http://norvig.com/", 507 | "level": 2, 508 | "lang": "EN", 509 | "status": "X", 510 | "desc": "这排版驾驭不了..." 511 | }, 512 | { 513 | "title": "Peterbe.com", 514 | "url": "https://www.peterbe.com/plog/", 515 | "level": 2, 516 | "lang": "EN", 517 | "status": "X", 518 | "desc": "不是太值得收录" 519 | }, 520 | { 521 | "title": "Python Tips", 522 | "url": "https://pythontips.com/", 523 | "level": 2, 524 | "lang": "EN", 525 | "status": "X", 526 | "desc": "很火, 但我不喜欢" 527 | }, 528 | { 529 | "title": "脚本之家", 530 | "url": "https://www.jb51.net/list/list_97_1.htm", 531 | "level": 2, 532 | "lang": "CN", 533 | "status": "X", 534 | "desc": "文章的质量啊~~~" 535 | }, 536 | { 537 | "title": "开源中国搜索", 538 | "url": "https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1", 539 | "level": 2, 540 | "lang": "CN", 541 | "status": "X", 542 | "desc": "质量不太高" 543 | }, 544 | { 545 | "title": "伯乐在线头条", 546 | "url": "http://top.jobbole.com/tag/python/?sort=latest", 547 | "level": 2, 548 | "lang": "CN", 549 | "status": "X", 550 | "desc": "停更" 551 | }, 552 | { 553 | "title": "代码片段", 554 | "url": "http://www.phpxs.com/code/python", 555 | "level": 2, 556 | "lang": "CN", 557 | "status": "X", 558 | "desc": "文章太老了, 停更了" 559 | }, 560 | { 561 | "title": "segmentfault", 562 | "url": "https://segmentfault.com/t/python/blogs", 563 | "level": 2, 564 | "lang": "CN", 565 | "status": "X", 566 | "desc": "文章质量" 567 | }, 568 | { 569 | "title": "Python China", 570 | "url": "http://python-china.org/api/topics/timeline", 571 | "level": 2, 572 | "lang": "CN", 573 | "status": "X", 574 | "desc": "欠费网站挂了" 575 | }, 576 | { 577 | "title": "麦穗技术", 578 | "url": "http://www.58maisui.com/category/python/", 579 | "level": 2, 580 | "lang": "CN", 581 | "status": "X", 582 | "desc": "网站挂了" 583 | }, 584 | { 585 | "title": "CSDN", 586 | "url": "https://so.csdn.net/so/search/s.do?q=python&t=blog&u=", 587 | "level": 1, 588 | "lang": "CN", 589 | "status": "X", 590 | "desc": "文章质量啊~~~" 591 | }, 592 | { 593 | "title": "Stack Overflow", 594 | "url": "https://stackoverflow.com/?tab=hot", 595 | "level": 3, 596 | "lang": "EN", 597 | "status": "X", 598 | "desc": "已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录" 599 | }, 600 | { 601 | "title": "Reddit", 602 | "url": "https://www.reddit.com/r/Python/top/", 603 | "level": 3, 604 | "lang": "EN", 605 | "status": "√", 606 | "desc": "知名社区. 
质量参差, 收录每日 ups>=20" 607 | }, 608 | { 609 | "title": "码天狗", 610 | "url": "https://weekly.codetengu.com/issues", 611 | "level": 4, 612 | "lang": "CN", 613 | "status": "X", 614 | "desc": "综合类周报, 2018-11-23 之后不更了. 挂了, 下线." 615 | }, 616 | { 617 | "title": "Medium", 618 | "url": "https://medium.com/tag/python", 619 | "level": 3, 620 | "lang": "EN", 621 | "status": "√", 622 | "desc": "Medium 的 Python tag, 收录 RSS" 623 | }, 624 | ] 625 | 626 | content_sources_dict = {i['title']: i for i in content_sources} 627 | 628 | 629 | def main(): 630 | import pathlib 631 | import re 632 | # =: 待收录, √: 已收录, X: 不收录, -: 入库不追更 633 | 634 | titles = [i['title'] for i in content_sources] 635 | # 确保没有重复的 636 | if len(titles) != len(set(titles)): 637 | raise RuntimeError('不能有重复的 title') 638 | if '|' in str(content_sources): 639 | raise RuntimeError('尽量不要有 |') 640 | 641 | providers = '' 642 | providers += '| 序号 | 名称 | 评分 | 语言 | 收录 | 描述 |\n' 643 | providers += '| ---- | ---- | ---- | ---- | ---- | ---- |\n' 644 | todo_counts = 0 645 | finish_counts = 0 646 | for x, item in enumerate(content_sources, 1): 647 | data = [str(x)] 648 | title_link = f'[{item["title"]}]({item["url"]})' 649 | data.append(title_link) 650 | data.append(str(item['level'])) 651 | data.append(item['lang']) 652 | status = item['status'] 653 | if item['status'] == '√': 654 | finish_counts += 1 655 | status = f'[√](https://{ONLINE_HOST}/newspaper/articles.query.html?source={quote_plus(item["title"])})' 656 | elif item['status'] == '=': 657 | todo_counts += 1 658 | data.append(status) 659 | data.append(item['desc']) 660 | string = ' | '.join(data) 661 | providers += '| ' + string + ' |\n' 662 | proc = f'* 收录进度: {finish_counts} / {finish_counts + todo_counts}\n\n\t> = 待收录 | √ 已收录 | X 不收录 | - 入库不追更\n\n' 663 | README_FP = pathlib.Path( 664 | __file__).absolute().parent.parent.parent / 'README.md' 665 | with README_FP.open('r', encoding='u8') as f: 666 | old = f.read() 667 | new = re.sub( 668 | '[\s\S]*?', 669 | f'\n\n{proc}{providers}\n\n', 670 | old) 671 | print(new) 672 | with README_FP.open('w', encoding='u8') as f: 673 | f.write(new) 674 | 675 | 676 | if __name__ == "__main__": 677 | main() 678 | -------------------------------------------------------------------------------- /newspaper/crawler/spiders.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import traceback 4 | import typing 5 | import zlib 6 | 7 | from lxml.etree import ElementBase, XMLParser 8 | from lxml.html import fromstring, tostring 9 | from torequests.dummy import Requests 10 | from torequests.utils import (curlparse, escape, find_one, md5, parse_qsl, 11 | ptime, re, time, timeago, ttime, unparse_qsl, 12 | urlparse, urlunparse) 13 | 14 | from ..config import global_configs 15 | from ..loggers import spider_logger as logger 16 | from ..utils import ensure_cn_en 17 | 18 | START_TIME = time.time() 19 | test_spiders = [] 20 | online_spiders = [] 21 | history_spiders = [] 22 | CHROME_PC_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 23 | friendly_crawling_interval = 1 24 | outlands_req = Requests() 25 | # default_host_frequency 是默认的单域名并发控制: 每 3 秒一次请求 26 | req = Requests(default_host_frequency=(1, 3)) 27 | # 多次请求时的友好抓取频率 28 | # req.set_frequency('zhuanlan.zhihu.com', 1, 3) 29 | req.set_frequency('www.tuicool.com', 1, 3) 30 | # 免费代理 31 | proxy = 'http://218.60.8.99:3129' 32 | 33 | 34 | class null_tree: 35 | text = '' 36 | 37 | @classmethod 38 
| def text_content(cls): 39 | return '' 40 | 41 | def get(self, key, default=''): 42 | return default 43 | 44 | @classmethod 45 | def css(cls, item, csspath, idx=0): 46 | return (item.cssselect(csspath) or [cls])[idx] 47 | 48 | @classmethod 49 | def tostring(cls, doc, **kwargs): 50 | if isinstance(doc, ElementBase): 51 | return tostring(doc, **kwargs) 52 | else: 53 | return '' 54 | 55 | 56 | def sort_url_query(url, reverse=False, _replace_kwargs=None): 57 | """sort url query args. 58 | _replace_kwargs is a dict to update attributes before sorting (such as scheme / netloc...). 59 | http://www.google.com?b=2&z=26&a=1 => http://www.google.com?a=1&b=2&z=26 60 | """ 61 | parsed = urlparse(url) 62 | if _replace_kwargs: 63 | parsed = parsed._replace(**_replace_kwargs) 64 | sorted_parsed = parsed._replace( 65 | query=unparse_qsl(parse_qsl(parsed.query), sort=True, reverse=reverse)) 66 | return urlunparse(sorted_parsed) 67 | 68 | 69 | def get_url_key(url) -> str: 70 | """通过 url 来计算 key, 一方面计算 md5, 另一方面净化无用参数. 71 | 以后再考虑要不要纯数字... 72 | import hashlib 73 | a = hashlib.md5(b'url') 74 | b = a.hexdigest() 75 | as_int = int(b, 16) 76 | url_key = str(as_int)[5:][:20] 77 | print(url_key) 78 | """ 79 | if url: 80 | key = md5(sort_url_query(url, _replace_kwargs={'scheme': 'https'})) 81 | return key 82 | return "" 83 | 84 | 85 | def add_host(url: str, host: str) -> str: 86 | if not url: 87 | return '' 88 | if re.match('^https?://', url): 89 | return url 90 | if url.startswith('//'): 91 | return f'https:{url}' 92 | if not host.endswith('/'): 93 | host = f'{host}/' 94 | return re.sub('^/', host, url) 95 | 96 | 97 | def shorten_desc(desc: str) -> str: 98 | """Shorten the desc too long (more than 50).""" 99 | if not desc: 100 | return '' 101 | # remain sentence before ./\n/。/! 102 | desc = re.sub(r'(.{50,})(\n|\.|。|!|!|?|\?)\s?[\s\S]+', r'\1\2', desc) 103 | # remove html tag 104 | desc = re.sub('<[^>]+>', '', desc).strip() 105 | return escape(desc) 106 | 107 | 108 | async def outlands_request(request_dict: dict = None, 109 | encoding: str = 'u8', 110 | **request_args) -> str: 111 | """小水管不开源, 无法用来 FQ. 112 | 113 | 例: 114 | async def test(): 115 | text = await outlands_request({ 116 | 'method': 'get', 117 | 'url': 'https://pyfound.blogspot.com/' 118 | }, 'u8') 119 | print(text) 120 | return text 121 | """ 122 | request_dict = request_dict or {} 123 | request_dict.update(request_args) 124 | request_dict.setdefault('method', 'get') 125 | request_dict.setdefault('ssl', False) 126 | request_dict.setdefault('headers', {}) 127 | request_dict['headers'].setdefault('User-Agent', CHROME_PC_UA) 128 | json_data = json.dumps(request_dict) 129 | data = zlib.compress(json_data.encode('u8')) 130 | url = global_configs['anti_gfw']['url'] 131 | r = await outlands_req.post(url, timeout=60, data=data) 132 | if r: 133 | return zlib.decompress(r.content).decode(encoding) 134 | else: 135 | return r.text 136 | 137 | 138 | def register_test(function: typing.Callable) -> typing.Callable: 139 | """把爬虫注册到测试列表 140 | 141 | :param function: 爬虫函数, 一般没有参数. 142 | :type function: typing.Callable 143 | :return: 爬虫函数, 一般没有参数. 144 | :rtype: typing.Callable 145 | """ 146 | 147 | test_spiders.append(function) 148 | return function 149 | 150 | 151 | def register_online(function: typing.Callable) -> typing.Callable: 152 | """把爬虫注册到线上可用列表 153 | 154 | :param function: 爬虫函数, 一般没有参数. 155 | :type function: typing.Callable 156 | :return: 爬虫函数, 一般没有参数. 
157 | :rtype: typing.Callable 158 | """ 159 | 160 | online_spiders.append(function) 161 | return function 162 | 163 | 164 | def register_history(function: typing.Callable) -> typing.Callable: 165 | """把爬虫注册到历史文章抓取任务列表 166 | 167 | :param function: 爬虫函数, 一般没有参数. 168 | :type function: typing.Callable 169 | :return: 爬虫函数, 一般没有参数. 170 | :rtype: typing.Callable 171 | """ 172 | 173 | history_spiders.append(function) 174 | return function 175 | 176 | 177 | async def common_spider_zhihu_zhuanlan(name, source, limit=10): 178 | articles = [] 179 | offset: int = 0 180 | # 分页 181 | chunk_size: int = 50 182 | # 最多只要 2000 篇,再多没意义 183 | for _ in range(2000 // chunk_size): 184 | _limit = min((limit - offset, chunk_size)) 185 | # or limit == offset 186 | if not _limit: 187 | break 188 | api: str = f'https://zhuanlan.zhihu.com/api/columns/{name}/articles?limit={_limit}&offset={offset}' 189 | r = await req.get( 190 | api, 191 | ssl=False, 192 | headers={ 193 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 194 | }) 195 | if not r: 196 | logger.info( 197 | f'crawl zhihu_zhuanlan {name} limit={limit} failed: {r}') 198 | return articles 199 | items = r.json()['data'] 200 | if not items: 201 | break 202 | for item in items: 203 | if not (item['type'] == 'article' and item['state'] == 'published'): 204 | continue 205 | article: dict = {'source': source} 206 | article['ts_publish'] = ttime(item['created']) 207 | article['cover'] = item['image_url'] 208 | article['title'] = item['title'] 209 | article['desc'] = re.sub('<[^>]+>', ' ', item.get('excerpt') or '') 210 | article['url'] = item['url'] 211 | article['url_key'] = get_url_key(article['url']) 212 | articles.append(article) 213 | offset += _limit 214 | 215 | return articles 216 | 217 | 218 | async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None): 219 | articles = [] 220 | langs = {'CN': 1, 'EN': 2} 221 | lang_num = langs[lang] 222 | host = 'https://www.tuicool.com/' 223 | this_year = ttime()[:4] 224 | ignore_descs = ignore_descs or set() 225 | # 非登录用户只能采集前两页, 想采集更多需要 `_tuicool_session` cookie. 
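    # The `_tuicool_session` value in the Cookie header below is intentionally left empty,
    # so requests go out as an anonymous visitor and only the first two listing pages are
    # reachable; to crawl deeper, paste the cookie value from a logged-in browser session
    # here (and only then raise max_page above 2).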
226 | headers = { 227 | 'Connection': 'keep-alive', 228 | 'Upgrade-Insecure-Requests': '1', 229 | 'If-None-Match': 'W/"41a6894d66c0f07fcfac6ec1d84446a3"', 230 | 'Dnt': '1', 231 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 232 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 233 | 'Referer': 'https://www.tuicool.com/', 234 | 'Host': 'www.tuicool.com', 235 | 'Accept-Encoding': 'gzip, deflate, br', 236 | 'Accept-Language': 'zh-CN,zh;q=0.9', 237 | 'Cookie': '_tuicool_session=', 238 | } 239 | proxy = None 240 | for page in range(0, max_page): 241 | # st 参数: 0 是按时间顺序, 1 是热门文章 242 | api: str = f'https://www.tuicool.com/ah/0?st=1&lang={lang_num}&pn={page}' 243 | r = await req.get(api, 244 | ssl=False, 245 | proxy=proxy, 246 | retry=1, 247 | timeout=5, 248 | headers=headers) 249 | # print(r.text) 250 | if not r: 251 | logger.info(f'crawl tuicool {lang} page={page} failed: {r}') 252 | return articles 253 | items = fromstring( 254 | r.text).cssselect('#list_article>div.list_article_item') 255 | if max_page > 1: 256 | logger.info( 257 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 258 | ) 259 | if not items: 260 | break 261 | for item in items: 262 | article: dict = {'source': source} 263 | url = null_tree.css(item, 264 | '.aricle_item_info>.title>a').get('href', '') 265 | url = add_host(url, host) 266 | title = null_tree.css(item, '.aricle_item_info>.title>a').text 267 | cover = null_tree.css(item, 268 | '.article_thumb_image>img').get('src', '') 269 | cover = cover.replace( 270 | 'https://static0.tuicool.com/images/abs_img_no_small.jpg', '') 271 | time_span = null_tree.css(item, 272 | '.aricle_item_info>.tip').text_content() 273 | raw_time = find_one(r'\d\d-\d\d \d\d:\d\d', time_span)[0] 274 | if raw_time: 275 | # 避免是个怪异的时间, ensure 一下 276 | article['ts_publish'] = ttime( 277 | ptime(f'{this_year}-{raw_time}:00')) 278 | desc = null_tree.css( 279 | item, 280 | '.aricle_item_info>div.tip>span:nth-of-type(1)').text.strip() 281 | if not re.search('Python|python', f'{title}{desc}'): 282 | continue 283 | if desc in ignore_descs: 284 | continue 285 | article['cover'] = cover 286 | article['title'] = title 287 | article['desc'] = desc 288 | article['url'] = url 289 | article['url_key'] = get_url_key(article['url']) 290 | articles.append(article) 291 | return articles 292 | 293 | 294 | async def common_spider_juejin(user, source, max_page=1): 295 | articles = [] 296 | host = 'https://juejin.im/' 297 | now = ttime(fmt="%Y-%m-%dT%H:%M:%S.000Z") 298 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_entry_by_self' 299 | params: dict = { 300 | 'src': 'web', 301 | 'targetUid': user, 302 | 'type': 'post', 303 | 'before': now, 304 | 'limit': 20, 305 | 'order': 'createdAt' 306 | } 307 | for page in range(max_page): 308 | try: 309 | params['before'] = now 310 | r = await req.get(api, 311 | ssl=False, 312 | params=params, 313 | retry=1, 314 | timeout=5, 315 | headers={"User-Agent": CHROME_PC_UA}) 316 | if not r: 317 | logger.info(f'crawl juejin page={page} failed: {r}') 318 | return articles 319 | items = r.json()['d']['entrylist'] 320 | if max_page > 1: 321 | logger.info( 322 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 323 | ) 324 | if not items: 325 | break 326 | for item in items: 327 | article: dict = {'source': source} 328 | url = item['originalUrl'] 329 | url 
= add_host(url, host) 330 | title = item['title'] 331 | cover = item.get('screenshot') or '' 332 | now = item['createdAt'] 333 | if now: 334 | ts_publish = re.sub('\..*', '', now) 335 | article['ts_publish'] = ts_publish.replace('T', ' ') 336 | desc = item.get('summaryInfo') or '' 337 | article['cover'] = cover 338 | article['title'] = title 339 | article['desc'] = desc 340 | article['url'] = url 341 | article['url_key'] = get_url_key(article['url']) 342 | articles.append(article) 343 | if not now: 344 | break 345 | except Exception: 346 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 347 | return articles 348 | 349 | 350 | @register_online 351 | async def python_news() -> list: 352 | """Python Software Foundation News""" 353 | source: str = 'Python Software Foundation News' 354 | articles: list = [] 355 | seed = 'https://pyfound.blogspot.com/search?max-results=10' 356 | scode = await outlands_request({ 357 | 'method': 'get', 358 | 'url': seed, 359 | }, 'u8') 360 | if scode: 361 | tree = fromstring(scode) 362 | for item in tree.cssselect('.blog-posts>.date-outer'): 363 | try: 364 | article: dict = {'source': source} 365 | raw_pub_time = item.cssselect('.published')[0].get('title', '') 366 | ts_publish = ttime( 367 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z')) 368 | article['ts_publish'] = ts_publish 369 | article['title'] = item.cssselect( 370 | '.post-title.entry-title>a')[0].text 371 | # 兼容下没有 desc 的情况 372 | node = item.cssselect('.post-body.entry-content') or [null_tree] 373 | desc = node[0].text_content() 374 | article['desc'] = desc.split('\n\n\n', 375 | 1)[0].strip().replace('\n', ' ') 376 | article['url'] = item.cssselect( 377 | '.post-title.entry-title>a')[0].get('href', '') 378 | article['url_key'] = get_url_key(article['url']) 379 | articles.append(article) 380 | except Exception: 381 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 382 | logger.info( 383 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 384 | ) 385 | return articles 386 | 387 | 388 | # @register_history 389 | async def python_news_history() -> list: 390 | """Python Software Foundation News""" 391 | source: str = 'Python Software Foundation News' 392 | articles: list = [] 393 | current_year = int(time.strftime('%Y')) 394 | for year in range(2006, current_year + 1): 395 | seed = f'https://pyfound.blogspot.com/{year}/' 396 | scode = await outlands_request({ 397 | 'method': 'get', 398 | 'url': seed, 399 | }, 'u8') 400 | await asyncio.sleep(3) 401 | if not scode: 402 | continue 403 | tree = fromstring(scode) 404 | for item in tree.cssselect('.blog-posts>.date-outer'): 405 | try: 406 | article: dict = {'source': source} 407 | raw_pub_time = item.cssselect('.published')[0].get('title', '') 408 | ts_publish = ttime( 409 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z')) 410 | article['ts_publish'] = ts_publish 411 | article['title'] = item.cssselect( 412 | '.post-title.entry-title>a')[0].text 413 | # 兼容下没有 desc 的情况 414 | node = item.cssselect('.post-body.entry-content') or [null_tree] 415 | desc = node[0].text_content() 416 | article['desc'] = desc.split('\n\n\n', 417 | 1)[0].strip().replace('\n', ' ') 418 | article['url'] = item.cssselect( 419 | '.post-title.entry-title>a')[0].get('href', '') 420 | article['url_key'] = get_url_key(article['url']) 421 | articles.append(article) 422 | except Exception: 423 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 424 | logger.info( 425 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 426 | ) 427 | return articles 428 | 429 | 430 | def _python_weekly_calculate_date(issue_id): 431 | diff = 396 - int(issue_id) 432 | return ttime(1557331200 - diff * 86400 * 7) 433 | 434 | 435 | @register_online 436 | # @register_history 437 | # @register_test 438 | async def python_weekly() -> list: 439 | """Python Weekly""" 440 | source: str = 'Python Weekly' 441 | articles: list = [] 442 | # 一周一更, 所以只取第一个就可以了 443 | limit = 1 444 | seed = 'https://us2.campaign-archive.com/home/?u=e2e180baf855ac797ef407fc7&id=9e26887fc5' 445 | scode = await outlands_request({ 446 | 'method': 'get', 447 | 'url': seed, 448 | }, 'u8') 449 | box = find_one( 450 | r'(?:
)(
)(?:
)', 451 | scode)[1] 452 | items = re.findall(r'(
)', box) 453 | for item in items[:limit]: 454 | try: 455 | article: dict = {'source': source} 456 | # Take ts_publish and issue_id from the list page; fetch everything else from the detail page. 457 | # e.g. 05/09/2019 - Python Weekly - Issue 396
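# NOTE: illustrative sketch, not part of the original code. When the archive list does
# not yield a usable mm/dd/yyyy date, the code below falls back to
# _python_weekly_calculate_date(), which anchors issue 396 at epoch 1557331200
# (2019-05-09 00:00:00 in UTC+8) and shifts one week (86400 * 7 seconds) per issue:
#   _python_weekly_calculate_date(396) -> the anchor date itself
#   _python_weekly_calculate_date(395) -> one week earlier
#   _python_weekly_calculate_date(397) -> one week later
# (the exact string returned is whatever ttime() renders for those timestamps).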
  • 458 | title = find_one('title="(.*?)"', item)[1] 459 | issue_id = find_one(r' - Issue (\d+)', title)[1] 460 | pub_dates = find_one(r'class="campaign">(\d\d)/(\d\d)/(\d\d\d\d)', 461 | item)[1] 462 | if not issue_id: 463 | continue 464 | if len(pub_dates) == 3: 465 | ts_publish = f'{pub_dates[2]}-{pub_dates[0]}-{pub_dates[1]} 00:00:00' 466 | else: 467 | ts_publish = _python_weekly_calculate_date(issue_id) 468 | article['ts_publish'] = ts_publish 469 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}' 470 | r = await req.get( 471 | detail_url, 472 | ssl=False, 473 | headers={ 474 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 475 | }) 476 | if not r: 477 | logger.error(f'fetch {detail_url} failed: {r}') 478 | continue 479 | scode = r.text 480 | title = find_one('(.*?)', r.text)[1] 481 | title = title.strip(' ') 482 | translate_url = find_one( 483 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1] 484 | backup_url = dict( 485 | parse_qsl(translate_url))['u'] if translate_url else '' 486 | backup_url_desc = f'View this email in your browser
    ' if backup_url else '' 487 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a') 488 | all_links = [ 489 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」" 490 | for i in nodes 491 | ] 492 | all_links_desc = '
    '.join(all_links) 493 | article['title'] = title 494 | article['desc'] = f'{backup_url_desc}{all_links_desc}' 495 | article['url'] = detail_url 496 | article['url_key'] = get_url_key(article['url']) 497 | articles.append(article) 498 | except Exception: 499 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 500 | break 501 | logger.info( 502 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 503 | ) 504 | return articles 505 | 506 | 507 | # @register_history 508 | async def python_weekly_history() -> list: 509 | """Python Weekly""" 510 | source: str = 'Python Weekly' 511 | articles: list = [] 512 | for issue_id in range(324, 1000): 513 | try: 514 | article: dict = {'source': source} 515 | article['ts_publish'] = _python_weekly_calculate_date(issue_id) 516 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}' 517 | r = await req.get( 518 | detail_url, 519 | ssl=False, 520 | headers={ 521 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 522 | }) 523 | if '404: Page Not Found' in r.text: 524 | logger.warn('python_weekly_history break for 404 page') 525 | break 526 | if not r: 527 | logger.error(f'python_weekly_history break for {r}') 528 | break 529 | scode = r.text 530 | title = find_one('<title>(.*?)', r.text)[1] 531 | title = title.strip(' ') 532 | translate_url = find_one( 533 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1] 534 | backup_url = dict( 535 | parse_qsl(translate_url))['u'] if translate_url else '' 536 | backup_url_desc = f'View this email in your browser
    ' if backup_url else '' 537 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a') 538 | all_links = [ 539 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」" 540 | for i in nodes 541 | ] 542 | all_links_desc = '
    '.join(all_links) 543 | article['title'] = title 544 | article['desc'] = f'{backup_url_desc}{all_links_desc}' 545 | article['url'] = detail_url 546 | article['url_key'] = get_url_key(article['url']) 547 | articles.append(article) 548 | except Exception: 549 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 550 | break 551 | logger.info( 552 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 553 | ) 554 | return articles 555 | 556 | 557 | @register_online 558 | async def pycoder_weekly() -> list: 559 | """PyCoder's Weekly""" 560 | # 把 limit 改 999 就可以抓历史了 561 | source: str = "PyCoder's Weekly" 562 | articles: list = [] 563 | # 一周一更, 所以只取第一个就可以了 564 | limit = 1 565 | seed = 'https://pycoders.com/issues' 566 | base_url = find_one('^https?://[^/]+', seed)[0] 567 | r = await req.get(seed, headers={'User-Agent': CHROME_PC_UA}) 568 | if not r: 569 | logger.error(f'{source} crawl failed: {r}, {r.text}') 570 | return articles 571 | items = re.findall(r'Issue #\d+ .*?', r.text) 572 | for item in items[:limit]: 573 | try: 574 | article: dict = {'source': source} 575 | # Issue #368 (May 14, 2019) 576 | title = find_one('>(Issue.*?)<', item)[1] 577 | article['title'] = f"PyCoder's Weekly | {title}" 578 | month, day, year = re.findall(r'\((.*?) (\d+), (\d+)\)', 579 | article['title'])[0] 580 | month = month[:3] 581 | raw_time = f'{year}-{month}-{day}' 582 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d')) 583 | article['ts_publish'] = ts_publish 584 | article['desc'] = '' 585 | url = find_one(r'href="(/issues/\d+)"', item)[1] 586 | article['url'] = base_url + url 587 | article['url_key'] = get_url_key(article['url']) 588 | articles.append(article) 589 | except Exception: 590 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 591 | break 592 | logger.info( 593 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 594 | ) 595 | return articles 596 | 597 | 598 | @register_online 599 | # @register_test 600 | async def importpython() -> list: 601 | """Import Python""" 602 | source: str = 'Import Python' 603 | articles: list = [] 604 | # 一周一更, 所以只取第一个就可以了 605 | limit = 1 606 | seed = 'https://importpython.com/newsletter/archive/' 607 | r = await req.get(seed, 608 | timeout=15, 609 | ssl=False, 610 | headers={"User-Agent": CHROME_PC_UA}) 611 | if not r: 612 | logger.error(f'{source} crawl failed: {r}, {r.text}') 613 | return articles 614 | items = fromstring(r.text).cssselect('#tourpackages-carousel>.row>div') 615 | for item in items[:limit]: 616 | try: 617 | article: dict = {'source': source} 618 | href = item.cssselect('div.caption>a')[0].get('href', '') 619 | if not href: 620 | continue 621 | url = add_host(href, 'https://importpython.com/') 622 | title = item.cssselect('div.caption>.well-add-card>h4')[0].text 623 | desc_node = item.cssselect('div.caption>div[class="col-lg-12"]')[0] 624 | desc = tostring(desc_node, 625 | method='html', 626 | with_tail=0, 627 | encoding='unicode') 628 | day, month, year = re.findall(r'- (\d+) (\S+) (\d+)', title)[0] 629 | month = month[:3] 630 | raw_time = f'{year}-{month}-{day}' 631 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d')) 632 | article['ts_publish'] = ts_publish 633 | clean_title = re.sub(' - .*', '', title) 634 | title = f"{source} - {clean_title}" 635 | article['title'] = title 636 | article['desc'] = desc.replace('\n ', ' ') 637 | article['url'] = url 638 | article['url_key'] = get_url_key(article['url']) 639 | articles.append(article) 640 | except Exception: 641 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 642 | break 643 | logger.info( 644 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 645 | ) 646 | return articles 647 | 648 | 649 | @register_online 650 | # @register_test 651 | async def awesome_python() -> list: 652 | """Awesome Python Newsletter""" 653 | source: str = 'Awesome Python Newsletter' 654 | articles: list = [] 655 | # 一周一更, 所以只取第一个就可以了 656 | limit = 1 657 | seed = 'https://python.libhunt.com/newsletter/archive' 658 | scode = await outlands_request({ 659 | 'method': 'get', 660 | 'url': seed, 661 | }, 'u8') 662 | hrefs = re.findall( 663 | r'\s*', scode) 664 | for href in hrefs[:limit]: 665 | try: 666 | article: dict = {'source': source} 667 | url = add_host(href, 'https://python.libhunt.com/') 668 | r = await req.get(url, 669 | retry=2, 670 | timeout=15, 671 | headers={"User-Agent": CHROME_PC_UA}) 672 | if not r: 673 | logger.error(f'fetch {url} failed: {r}') 674 | break 675 | tree = fromstring(r.text) 676 | raw_title = tree.cssselect('title')[0].text 677 | title = re.sub(', .*', '', raw_title) 678 | raw_pub_date = find_one(r', (.*?) \|', raw_title)[1] 679 | # May 17, 2019 680 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y')) 681 | nodes = tree.cssselect( 682 | 'li[class="story row"]>div[class="column"]>a') 683 | descs = [ 684 | tostring(i, method='html', with_tail=0, encoding='unicode') 685 | for i in nodes 686 | ] 687 | desc = '
    '.join(descs) 688 | article['ts_publish'] = ts_publish 689 | article['title'] = title 690 | article['desc'] = desc 691 | article['url'] = url 692 | article['url_key'] = get_url_key(article['url']) 693 | articles.append(article) 694 | except Exception: 695 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 696 | break 697 | logger.info( 698 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 699 | ) 700 | return articles 701 | 702 | 703 | @register_online 704 | async def real_python() -> list: 705 | """Real Python""" 706 | source: str = 'Real Python' 707 | articles: list = [] 708 | limit = 20 709 | seed = 'https://realpython.com/' 710 | r = await req.get(seed, 711 | retry=1, 712 | timeout=20, 713 | headers={"User-Agent": CHROME_PC_UA}) 714 | if not r: 715 | logger.error(f'{source} crawl failed: {r}, {r.text}') 716 | return articles 717 | items = fromstring(r.text).cssselect('div[class="card border-0"]') 718 | for item in items[:limit]: 719 | try: 720 | article: dict = {'source': source} 721 | href = item.cssselect('a')[0].get('href', '') 722 | url = add_host(href, 'https://realpython.com/') 723 | title = item.cssselect('h2.card-title')[0].text 724 | pub_date_node = item.cssselect('.mr-2') or [null_tree] 725 | raw_pub_date = pub_date_node[0].text 726 | # May 16, 2019 727 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y')) 728 | cover_item = item.cssselect('img.card-img-top') 729 | if cover_item: 730 | cover = cover_item[0].get('src', '') 731 | if cover: 732 | article['cover'] = cover 733 | article['ts_publish'] = ts_publish 734 | article['title'] = title 735 | article['desc'] = '' 736 | article['url'] = url 737 | article['url_key'] = get_url_key(article['url']) 738 | articles.append(article) 739 | except Exception: 740 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 741 | break 742 | logger.info( 743 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 744 | ) 745 | return articles 746 | 747 | 748 | @register_online 749 | async def planet_python() -> list: 750 | """Planet Python""" 751 | source: str = 'Planet Python' 752 | articles: list = [] 753 | limit = 100 754 | seed = 'https://planetpython.org/rss20.xml' 755 | # 避免超时, 用外部访问 756 | scode = await outlands_request({ 757 | 'method': 'get', 758 | 'url': seed, 759 | }, 'u8') 760 | items = fromstring(scode).xpath('//channel/item') 761 | now = ttime() 762 | for item in items[:limit]: 763 | try: 764 | article: dict = {'source': source} 765 | guid = item.xpath('./guid/text()') 766 | title = item.xpath('./title/text()') 767 | description = item.xpath('./description/text()') 768 | pubDate = item.xpath('./pubdate/text()') 769 | if not (guid and title): 770 | continue 771 | url = guid[0] 772 | title = title[0] 773 | if 'بايثون العربي' in title: 774 | continue 775 | if 'Python Software Foundation: ' in title: 776 | # 已经单独收录过, 不需要再收录一次 777 | continue 778 | if description: 779 | desc = fromstring(description[0]).text_content() 780 | # 去掉 <> 781 | desc = re.sub('<[^>]*>', ' ', desc) 782 | # 只保留第一个换行前面的 783 | desc = shorten_desc(desc) 784 | else: 785 | desc = '' 786 | if pubDate: 787 | raw_pub_date = pubDate[0] 788 | # Wed, 22 May 2019 01:47:44 +0000 789 | raw_pub_date = re.sub('^.*?, ', '', raw_pub_date).strip() 790 | ts_publish = ttime( 791 | ptime(raw_pub_date, fmt='%d %b %Y %H:%M:%S %z')) 792 | else: 793 | ts_publish = now 794 | article['ts_publish'] = ts_publish 795 | article['title'] = title 796 | article['desc'] = desc 797 | article['url'] = url 798 | article['url_key'] = get_url_key(article['url']) 799 | articles.append(article) 800 | except Exception: 801 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 802 | break 803 | logger.info( 804 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 805 | ) 806 | return articles 807 | 808 | 809 | @register_online 810 | # @register_test 811 | async def julien_danjou() -> list: 812 | """Julien Danjou""" 813 | # 历史文章只要不断改页码迭代就好了 814 | source: str = 'Julien Danjou' 815 | articles: list = [] 816 | seed = 'https://julien.danjou.info/page/1/' 817 | scode = await outlands_request( 818 | { 819 | 'method': 'get', 820 | 'timeout': 5, 821 | 'retry': 2, 822 | 'url': seed, 823 | }, 'u8') 824 | items = fromstring(scode).cssselect('.post-feed>article.post-card') 825 | # 判断发布时间如果是 1 小时前就 break 826 | break_time = ttime(time.time() - 60 * 60) 827 | host = 'https://julien.danjou.info/' 828 | for item in items: 829 | try: 830 | article: dict = {'source': source} 831 | href = item.cssselect('a.post-card-content-link')[0].get('href', '') 832 | if not href: 833 | raise ValueError(f'{source} not found href from {seed}') 834 | url = add_host(href, host) 835 | title = (item.cssselect('h2.post-card-title') or 836 | [null_tree])[0].text 837 | desc = (item.cssselect('.post-card-excerpt>p') or 838 | [null_tree])[0].text 839 | if not (title and url): 840 | raise ValueError(f'{source} no title {url}') 841 | detail_scode = await outlands_request( 842 | { 843 | 'method': 'get', 844 | 'timeout': 5, 845 | 'retry': 2, 846 | 'url': url, 847 | }, 'u8') 848 | if not detail_scode: 849 | raise ValueError(f'{source} has no detail_scode {url}') 850 | raw_pub_time = find_one( 851 | 'property="article:published_time" content="(.+?)"', 852 | detail_scode)[1] 853 | # 2019-05-06T08:58:00.000Z 854 | ts_publish = ttime(ptime(raw_pub_time, 855 | fmt='%Y-%m-%dT%H:%M:%S.000Z')) 856 | cover_item = item.cssselect('img.post-card-image') 857 | if cover_item: 858 | cover = cover_item[0].get('src', '') 859 | if cover: 860 | article['cover'] = add_host(cover, host) 861 | article['ts_publish'] = ts_publish 862 | article['title'] = title 863 | article['desc'] = desc 864 | article['url'] = url 865 | article['url_key'] = get_url_key(article['url']) 866 | articles.append(article) 867 | if ts_publish < break_time: 868 | # 文章的发布时间超过抓取间隔, 则 break 869 | break 870 | except Exception: 871 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 872 | break 873 | logger.info( 874 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 875 | ) 876 | return articles 877 | 878 | 879 | @register_online 880 | async def doughellmann() -> list: 881 | """Doug Hellmann""" 882 | source: str = 'Doug Hellmann' 883 | articles: list = [] 884 | max_page: int = 1 885 | seed = 'https://doughellmann.com/blog/page/{page}/' 886 | for page in range(1, max_page + 1): 887 | r = await req.get(seed.format(page=page), 888 | retry=1, 889 | timeout=20, 890 | headers={"User-Agent": CHROME_PC_UA}) 891 | if not r: 892 | logger.error(f'{source} crawl failed: {r}, {r.text}') 893 | return articles 894 | scode = r.text 895 | items = fromstring(scode).cssselect('#main>article') 896 | if max_page > 1: 897 | logger.info( 898 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 899 | ) 900 | if not items and page > 1: 901 | logger.info(f'{source} break for page {page} has no items') 902 | break 903 | for item in items: 904 | try: 905 | article: dict = {'source': source} 906 | title = item.cssselect('.entry-title>a')[0].text 907 | url = item.cssselect('.entry-title>a')[0].get('href') 908 | desc = item.cssselect('.entry-content')[0].text_content() 909 | pub_time = item.cssselect('time.entry-date')[0].get('datetime') 910 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z')) 911 | article['ts_publish'] = ts_publish 912 | article['title'] = title 913 | article['desc'] = shorten_desc(desc) 914 | article['url'] = url 915 | article['url_key'] = get_url_key(article['url']) 916 | articles.append(article) 917 | except Exception: 918 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 919 | break 920 | logger.info( 921 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 922 | ) 923 | return articles 924 | 925 | 926 | @register_online 927 | # @register_history 928 | # @register_test 929 | async def mouse_vs_python() -> list: 930 | """The Mouse Vs. The Python""" 931 | source: str = 'The Mouse Vs. The Python' 932 | articles: list = [] 933 | max_page: int = 1 934 | # max_page:int = 101 935 | seed = 'https://www.blog.pythonlibrary.org/page/{page}/' 936 | for page in range(1, max_page + 1): 937 | api = seed.format(page=page) 938 | scode = await outlands_request( 939 | { 940 | 'method': 'get', 941 | 'timeout': 5, 942 | 'retry': 2, 943 | 'url': api, 944 | }, 'u8') 945 | items = fromstring(scode).cssselect('#content>article') 946 | if max_page > 1: 947 | logger.info( 948 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 949 | ) 950 | if not items: 951 | if page > 1: 952 | logger.info(f'{source} break for page {page} has no items') 953 | break 954 | for item in items: 955 | try: 956 | article: dict = {'source': source} 957 | title = item.cssselect('.entry-title>a')[0].text 958 | url = item.cssselect('.entry-title>a')[0].get('href') 959 | desc = item.cssselect('.entry-content')[0].text_content() 960 | pub_time = item.cssselect('time.entry-date')[0].get('datetime') 961 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z')) 962 | article['ts_publish'] = ts_publish 963 | article['title'] = title 964 | article['desc'] = shorten_desc(desc) 965 | article['url'] = url 966 | article['url_key'] = get_url_key(article['url']) 967 | articles.append(article) 968 | except Exception: 969 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 970 | break 971 | logger.info( 972 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. 
{" ?????????" if not articles else ""}' 973 | ) 974 | return articles 975 | 976 | 977 | @register_online 978 | # @register_history 979 | # @register_test 980 | async def infoq_python() -> list: 981 | """InfoQ""" 982 | source: str = 'InfoQ' 983 | articles: list = [] 984 | max_page: int = 1 985 | # max_page:int = 101 986 | curl_string = r'''curl 'https://www.infoq.cn/public/v1/article/getList' -H 'Origin: https://www.infoq.cn' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Content-Type: application/json' -H 'Accept: application/json, text/plain, */*' -H 'Referer: https://www.infoq.cn/topic/python' -H 'Cookie: SERVERID=0|0|0' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary '{"type":1,"size":12,"id":50,"score":0}' --compressed''' 987 | request_args = curlparse(curl_string) 988 | for page in range(1, max_page + 1): 989 | r = await req.request(retry=2, timeout=5, **request_args) 990 | if not r: 991 | logger.error(f'{source} crawl failed: {r}, {r.text}') 992 | return articles 993 | items = r.json().get('data') or [] 994 | if max_page > 1: 995 | logger.info( 996 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 997 | ) 998 | if items: 999 | # 调整上一页最后一个 score 实现翻页 1000 | data = json.loads(request_args['data']) 1001 | data['score'] = items[-1]['score'] 1002 | request_args['data'] = json.dumps(data).encode('u8') 1003 | elif page > 1: 1004 | logger.info(f'{source} break for page {page} has no items') 1005 | break 1006 | for item in items: 1007 | try: 1008 | article: dict = {'source': source} 1009 | desc = shorten_desc(item['article_summary']) 1010 | if '本文分享 方巍' in desc: 1011 | continue 1012 | title = item['article_title'] 1013 | url = f"https://www.infoq.cn/article/{item['uuid']}" 1014 | ts_publish = ttime(item['publish_time']) 1015 | article['ts_publish'] = ts_publish 1016 | article['title'] = title 1017 | article['desc'] = desc 1018 | article['url'] = url 1019 | article['url_key'] = get_url_key(article['url']) 1020 | articles.append(article) 1021 | except Exception: 1022 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1023 | break 1024 | logger.info( 1025 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1026 | ) 1027 | return articles 1028 | 1029 | 1030 | @register_online 1031 | # @register_history 1032 | # @register_test 1033 | async def hn_python() -> list: 1034 | """Hacker News""" 1035 | source: str = 'Hacker News' 1036 | articles: list = [] 1037 | max_page = 999 1038 | # 默认收录 24 小时内的 3 points 以上 1039 | min_points = 3 1040 | now_ts = int(time.time()) 1041 | ts_start = now_ts - 86400 1042 | ts_end = now_ts 1043 | # 历史文章收录 90 天内的历史文章, 对方有个每次 query 1000 的上限配置 paginationLimitedTo 1044 | # 如果需要更久的, 不断修改起止时间就可以了 1045 | # ts_start = now_ts - 86400 * 90 1046 | # ts_end = now_ts 1047 | per_page: int = 100 1048 | api: str = 'https://hn.algolia.com/api/v1/search_by_date' 1049 | # tags=story&query=python&numericFilters=created_at_i%3E1553174400,points%3E1&page=2&hitsPerPage=10 1050 | params: dict = { 1051 | 'tags': 'story', 1052 | 'query': 'python', 1053 | 'numericFilters': f'created_at_i>={ts_start},created_at_i<={ts_end},points>={min_points}', 1054 | 'page': 0, 1055 | 'hitsPerPage': per_page, 1056 | } 1057 | for page in range(max_page): 1058 | params['page'] = page 1059 | r = await req.get(api, 1060 | params=params, 1061 | retry=2, 1062 | timeout=10, 1063 | headers={"User-Agent": CHROME_PC_UA}) 1064 | if not r: 1065 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1066 | return articles 1067 | items = r.json().get('hits') or [] 1068 | if not items: 1069 | break 1070 | if page > 0: 1071 | logger.info( 1072 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1073 | ) 1074 | if not items and page > 0: 1075 | logger.info(f'{source} break for page {page} has no items') 1076 | break 1077 | for item in items: 1078 | try: 1079 | article: dict = {'source': source} 1080 | title = item['title'] 1081 | url = item['url'] or '' 1082 | if not url: 1083 | url = f'https://news.ycombinator.com/item?id={item["objectID"]}' 1084 | desc = item['story_text'] or '' 1085 | ts_publish = ttime(item['created_at_i']) 1086 | article['ts_publish'] = ts_publish 1087 | article['title'] = title 1088 | article['desc'] = shorten_desc(desc) 1089 | article['url'] = url 1090 | article['url_key'] = get_url_key(article['url']) 1091 | articles.append(article) 1092 | except Exception: 1093 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1094 | break 1095 | logger.info( 1096 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1097 | ) 1098 | return articles 1099 | 1100 | 1101 | @register_online 1102 | # @register_history 1103 | # @register_test 1104 | async def snarky() -> list: 1105 | """Brett Cannon""" 1106 | source: str = 'Brett Cannon' 1107 | articles: list = [] 1108 | max_page: int = 1 1109 | api: str = 'https://snarky.ca/page/{page}/' 1110 | # 判断发布时间如果是 1 小时前就 break 1111 | break_time = ttime(time.time() - 60 * 60) 1112 | host = 'https://snarky.ca/' 1113 | for page in range(1, max_page + 1): 1114 | seed = api.format(page=page) 1115 | scode = await outlands_request(url=seed, retry=1, timeout=20) 1116 | if not scode: 1117 | logger.error(f'{source} crawl failed: {scode}') 1118 | return articles 1119 | items = fromstring(scode).cssselect('.post-feed>article.post-card') 1120 | if not items: 1121 | break 1122 | for item in items: 1123 | try: 1124 | article: dict = {'source': source} 1125 | href = item.cssselect('a.post-card-content-link')[0].get( 1126 | 'href', '') 1127 | if not href: 1128 | raise ValueError(f'{source} not found href from {seed}') 1129 | url = add_host(href, host) 1130 | title = (item.cssselect('h2.post-card-title') or 1131 | [null_tree])[0].text 1132 | desc = (item.cssselect('.post-card-excerpt>p') or 1133 | [null_tree])[0].text 1134 | if not (title and url): 1135 | raise ValueError(f'{source} no title {url}') 1136 | detail_resp = await req.get( 1137 | url, 1138 | ssl=False, 1139 | headers={ 1140 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 1141 | }) 1142 | if not detail_resp: 1143 | raise ValueError( 1144 | f'{source} request href failed {detail_resp}') 1145 | detail_scode = detail_resp.text 1146 | raw_pub_time = find_one( 1147 | 'property="article:published_time" content="(.+?)"', 1148 | detail_scode)[1] 1149 | # 2019-05-06T08:58:00.000Z 1150 | ts_publish = ttime( 1151 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S.000Z')) 1152 | cover_item = item.cssselect('img.post-card-image') 1153 | if cover_item: 1154 | cover = cover_item[0].get('src', '') 1155 | if cover: 1156 | article['cover'] = add_host(cover, host) 1157 | article['ts_publish'] = ts_publish 1158 | article['title'] = title 1159 | article['desc'] = desc 1160 | article['url'] = url 1161 | article['url_key'] = get_url_key(article['url']) 1162 | articles.append(article) 1163 | if ts_publish < break_time: 1164 | # 文章的发布时间超过抓取间隔, 则 break 1165 | break 1166 | except Exception: 1167 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1168 | break 1169 | logger.info( 1170 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1171 | ) 1172 | return articles 1173 | 1174 | 1175 | @register_online 1176 | # @register_history 1177 | # @register_test 1178 | async def jiqizhixin() -> list: 1179 | """机器之心""" 1180 | source: str = '机器之心' 1181 | articles: list = [] 1182 | max_page: int = 1 1183 | # 有 cookie 和 防跨域验证 1184 | curl_string = r'''curl 'https://www.jiqizhixin.com/api/v1/search?type=articles&page=1&keywords=python&published=0&is_exact_match=false&search_internet=true&sort=time' -H 'Cookie: ahoy_visitor=1; _Synced_session=2' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Accept: */*' -H 'Referer: https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time' -H 'X-Requested-With: XMLHttpRequest' -H 'If-None-Match: W/"3e034aa5e8cb79dd92652f5ba70a65a5"' -H 'Connection: keep-alive' --compressed''' 1185 | request_args = curlparse(curl_string) 1186 | for page in range(1, max_page + 1): 1187 | # 部分时候请求返回结果为空, 需要重试 1188 | for _ in range(2, 5): 1189 | r = await req.request(retry=1, timeout=20, **request_args) 1190 | if not r: 1191 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1192 | return articles 1193 | try: 1194 | items = r.json().get('articles', {}).get('nodes', []) 1195 | if not items: 1196 | continue 1197 | break 1198 | except json.decoder.JSONDecodeError: 1199 | await asyncio.sleep(_) 1200 | continue 1201 | else: 1202 | # 试了 3 次都没 break, 放弃 1203 | return articles 1204 | if max_page > 1: 1205 | logger.info( 1206 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1207 | ) 1208 | # 翻页, 修改 page 1209 | curl_string = re.sub(r'&page=\d+', f'&page={page + 1}', curl_string) 1210 | request_args = curlparse(curl_string) 1211 | if not r.json().get('articles', {}).get('hasNextPage'): 1212 | break 1213 | for item in items: 1214 | try: 1215 | article: dict = {'source': source} 1216 | desc = item['content'] 1217 | # 2019/05/27 00:09 1218 | article['ts_publish'] = ttime( 1219 | ptime(item['published_at'], fmt='%Y/%m/%d %H:%M')) 1220 | title = item.get('title') or '' 1221 | title = title.replace('Python', 1222 | 'Python').replace('python', 1223 | 'Python') 1224 | article['title'] = title 1225 | article['cover'] = item.get('cover_image_url') or '' 1226 | article['desc'] = f'「{item["author"]}」 {shorten_desc(desc)}' 1227 | article['url'] = item['path'] 1228 | article['url_key'] = get_url_key(article['url']) 1229 | articles.append(article) 1230 | except Exception: 1231 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1232 | break 1233 | logger.info( 1234 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1235 | ) 1236 | return articles 1237 | 1238 | 1239 | @register_online 1240 | # @register_history 1241 | # @register_test 1242 | async def lilydjwg() -> list: 1243 | """依云's Blog""" 1244 | source: str = "依云's Blog" 1245 | articles: list = [] 1246 | max_page: int = 1 1247 | seed = 'https://blog.lilydjwg.me/tag/python?page={page}' 1248 | for page in range(1, max_page + 1): 1249 | r = await req.get(seed.format(page=page), 1250 | retry=1, 1251 | timeout=20, 1252 | headers={"User-Agent": CHROME_PC_UA}) 1253 | if not r: 1254 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1255 | return articles 1256 | scode = r.content.decode('u8') 1257 | items = fromstring(scode).cssselect('#content>.posttotal') 1258 | if not items: 1259 | break 1260 | host = 'https://blog.lilydjwg.me/' 1261 | if max_page > 1: 1262 | logger.info( 1263 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1264 | ) 1265 | for item in items: 1266 | try: 1267 | article: dict = {'source': source} 1268 | title = item.cssselect('.storytitle>a')[0].text 1269 | href = item.cssselect('.storytitle>a')[0].get('href', '') 1270 | url = add_host(href, host).replace( 1271 | 'https://lilydjwg.is-programmer.com/', host) 1272 | desc = shorten_desc((item.cssselect('.post_brief>p') or 1273 | [null_tree])[0].text_content()) 1274 | cover = (item.cssselect('img') or [null_tree])[0].get('src', '') 1275 | month, day, year = item.cssselect( 1276 | '.date')[0].text_content().strip().split() 1277 | month = f'0{month}'[-2:] 1278 | day = f'0{day}'[-2:] 1279 | article['ts_publish'] = ttime( 1280 | ptime(f'{year}/{month}/{day}', fmt='%Y/%m/%d')) 1281 | article['title'] = title 1282 | article['cover'] = cover 1283 | article['desc'] = desc 1284 | article['url'] = url 1285 | article['url_key'] = get_url_key(article['url']) 1286 | articles.append(article) 1287 | except Exception: 1288 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1289 | break 1290 | logger.info( 1291 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1292 | ) 1293 | return articles 1294 | 1295 | 1296 | @register_online 1297 | # @register_history 1298 | # @register_test 1299 | async def dev_io() -> list: 1300 | """DEV Community""" 1301 | source: str = "DEV Community" 1302 | articles: list = [] 1303 | max_page: int = 1 1304 | per_page: int = 30 1305 | filt_score: int = 10 1306 | for page in range(0, max_page): 1307 | r = await req.get( 1308 | f'https://dev.to/search/feed_content?per_page={per_page}&page={page}&tag=python&sort_by=published_at&sort_direction=desc&tag_names%5B%5D=python&approved=&class_name=Article', 1309 | headers={ 1310 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36', 1311 | 'Referer': 'https://dev.to/t/python/latest' 1312 | }, 1313 | retry=1, 1314 | timeout=20) 1315 | if not r: 1316 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1317 | return articles 1318 | items = r.json().get('result') or [] 1319 | if not items: 1320 | break 1321 | host = 'https://dev.to/' 1322 | if max_page > 1: 1323 | logger.info( 1324 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1325 | ) 1326 | for item in items: 1327 | try: 1328 | if item['public_reactions_count'] + item[ 1329 | 'comments_count'] < filt_score: 1330 | # filt by min score 1331 | continue 1332 | article: dict = {'source': source} 1333 | title = item['title'] 1334 | path = item['path'] 1335 | url = add_host(path, host) 1336 | desc = item['user']['name'] 1337 | article['ts_publish'] = ttime(item['published_at_int']) 1338 | article['title'] = title 1339 | article['desc'] = desc 1340 | article['url'] = url 1341 | article['url_key'] = get_url_key(article['url']) 1342 | articles.append(article) 1343 | except Exception: 1344 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1345 | break 1346 | logger.info( 1347 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1348 | ) 1349 | return articles 1350 | 1351 | 1352 | # @register_online 1353 | # @register_history 1354 | # @register_test 1355 | async def pythoncat() -> list: 1356 | """Python猫""" 1357 | # 采集掘金的, 知乎专栏的更新太慢了 1358 | source: str = "Python猫" 1359 | user: str = '57b26118a341310060fa74da' 1360 | max_page = 1 1361 | articles: list = await common_spider_juejin(user, source, max_page=max_page) 1362 | logger.info( 1363 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1364 | ) 1365 | return articles 1366 | 1367 | 1368 | @register_online 1369 | # @register_history 1370 | # @register_test 1371 | async def zhihu_zhuanlan_python_cn() -> list: 1372 | """Python之美""" 1373 | source: str = "Python之美" 1374 | name: str = 'python-cn' 1375 | articles: list = [] 1376 | limit = 10 1377 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit) 1378 | logger.info( 1379 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1380 | ) 1381 | return articles 1382 | 1383 | 1384 | @register_online 1385 | # @register_history 1386 | # @register_test 1387 | async def zhihu_zhuanlan_python_cat() -> list: 1388 | """Python猫""" 1389 | source: str = "Python猫" 1390 | name: str = 'pythonCat' 1391 | articles: list = [] 1392 | limit = 10 1393 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit) 1394 | logger.info( 1395 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1396 | ) 1397 | return articles 1398 | 1399 | 1400 | @register_online 1401 | # @register_history 1402 | # @register_test 1403 | async def zhihu_zhuanlan_pythoncxy() -> list: 1404 | """Python程序员""" 1405 | source: str = "Python程序员" 1406 | name: str = 'pythoncxy' 1407 | articles: list = [] 1408 | limit = 10 1409 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit) 1410 | logger.info( 1411 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1412 | ) 1413 | return articles 1414 | 1415 | 1416 | @register_online 1417 | # @register_history 1418 | # @register_test 1419 | async def zhihu_zhuanlan_c_111369541() -> list: 1420 | """Python头条""" 1421 | source: str = "Python头条" 1422 | name: str = 'c_111369541' 1423 | articles: list = [] 1424 | limit = 10 1425 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit) 1426 | logger.info( 1427 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1428 | ) 1429 | return articles 1430 | 1431 | 1432 | @register_online 1433 | # @register_history 1434 | # @register_test 1435 | async def cuiqingcai() -> list: 1436 | """静觅""" 1437 | source: str = "静觅" 1438 | articles: list = [] 1439 | max_page: int = 1 1440 | # max_page = 20 1441 | api: str = 'https://cuiqingcai.com/category/technique/python/page/' 1442 | now = ttime() 1443 | this_date = now[5:10] 1444 | this_year = now[:4] 1445 | last_year_int = int(this_year) - 1 1446 | timestamp_today_0 = ptime(now[:10] + ' 00:00:00') 1447 | 1448 | def translate_time_text(raw_time): 1449 | if not raw_time: 1450 | return '' 1451 | raw_time = raw_time.strip() 1452 | # 针对每种情况做时间转换 1453 | # 4个月前 (02-21) 1454 | # 2天前 1455 | # 4年前 (2015-02-12) 1456 | # 先尝试取得横线/:分割的时间, 取不到的应该是 n 天前的情况 1457 | date = find_one(r'([\d:\- ]+)', raw_time)[1] 1458 | if date: 1459 | if re.match(r'^\d\d-\d\d$', date): 1460 | # 只有月日 1461 | # 这里有可能遇到的是去年的月份, 所以先判断 1462 | if date >= this_date: 1463 | date = f'{last_year_int}-{date}' 1464 | else: 1465 | date = f'{this_year}-{date}' 1466 | result = f'{date} 00:00:00' 1467 | elif re.match(r'^\d\d\d\d-\d\d-\d\d$', date): 1468 | # 有年月日 1469 | result = f'{date} 00:00:00' 1470 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d$', date): 1471 | # 有年月日时分 1472 | result = f'{date}:00' 1473 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d:\d\d$', date): 1474 | # 有年月日时分 1475 | result = f'{date[:11]}0{date[11:]}:00' 1476 | else: 1477 | raise ValueError(f'bad time pattern {raw_time}') 1478 | elif re.match(r'^\d+小时前$', raw_time): 1479 | n_hour = int(find_one(r'\d+', raw_time)[0]) 1480 | result = ttime(timestamp_today_0 - n_hour * 3600) 1481 | elif re.match(r'^\d+天前$', raw_time): 1482 | n_day = int(find_one(r'\d+', raw_time)[0]) 1483 | result = ttime(timestamp_today_0 - n_day * 86400) 1484 | else: 1485 | raise ValueError(f'bad time pattern {raw_time}') 1486 | return result 1487 
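# NOTE: illustrative sketch, not part of the original code. What translate_time_text()
# above returns for the absolute-date forms, assuming the crawl runs on 2019-06-15
# (so this_year == '2019' and this_date == '06-15'):
#   '2015-02-12'        -> '2015-02-12 00:00:00'
#   '02-21'             -> '2019-02-21 00:00:00'  (month-day already passed this year)
#   '12-01'             -> '2018-12-01 00:00:00'  (not reached yet, so assumed last year)
#   '2019-03-01 9:30'   -> '2019-03-01 09:30:00'  (single-digit hour gets zero-padded)
# Inputs that match none of the recognised patterns raise ValueError.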
| 1488 | for page in range(1, max_page + 1): 1489 | seed = f'{api}{page}' 1490 | r = await req.get( 1491 | seed, 1492 | retry=1, 1493 | timeout=20, 1494 | ssl=False, 1495 | headers={ 1496 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' 1497 | }) 1498 | if not r: 1499 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1500 | return articles 1501 | items = fromstring( 1502 | r.content.decode('u8')).cssselect('div.content>article') 1503 | if not items: 1504 | break 1505 | if max_page > 1: 1506 | logger.info( 1507 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1508 | ) 1509 | for item in items: 1510 | try: 1511 | article: dict = {'source': source} 1512 | title = null_tree.css(item, 'header>h2>a').text 1513 | url = null_tree.css(item, 'header>h2>a').get('href', '') 1514 | desc = null_tree.css(item, '.note').text_content() 1515 | cover = null_tree.css(item, 'img.thumb').get('src', '') 1516 | raw_time_text = null_tree.css( 1517 | item, 'p > span:nth-child(2)').text_content() 1518 | article['ts_publish'] = translate_time_text(raw_time_text) 1519 | article['title'] = title 1520 | article['cover'] = cover 1521 | article['desc'] = desc 1522 | article['url'] = url 1523 | article['url_key'] = get_url_key(article['url']) 1524 | articles.append(article) 1525 | except Exception: 1526 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1527 | break 1528 | logger.info( 1529 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1530 | ) 1531 | return articles 1532 | 1533 | 1534 | @register_online 1535 | # @register_history 1536 | # @register_test 1537 | async def tuicool_cn() -> list: 1538 | """推酷(中文)""" 1539 | source: str = "推酷(中文)" 1540 | articles: list = [] 1541 | max_page: int = 1 1542 | articles = await common_spider_tuicool( 1543 | 'CN', 1544 | source, 1545 | max_page=max_page, 1546 | ignore_descs={'稀土掘金', 'Python猫', 'InfoQ'}) 1547 | logger.info( 1548 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1549 | ) 1550 | return articles 1551 | 1552 | 1553 | @register_online 1554 | # @register_history 1555 | # @register_test 1556 | async def tuicool_en() -> list: 1557 | """推酷(英文)""" 1558 | source: str = "推酷(英文)" 1559 | articles: list = [] 1560 | max_page: int = 1 1561 | articles = await common_spider_tuicool('EN', 1562 | source, 1563 | max_page=max_page, 1564 | ignore_descs={'Real Python'}) 1565 | logger.info( 1566 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1567 | ) 1568 | return articles 1569 | 1570 | 1571 | # @register_online 1572 | # @register_history 1573 | # @register_test 1574 | async def kf_toutiao() -> list: 1575 | """稀土掘金""" 1576 | source: str = "稀土掘金" 1577 | articles: list = [] 1578 | max_page: int = 1 1579 | per_page: int = 20 1580 | sort_by = 'rankIndex' # 'createdAt' 是按时间顺序 1581 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_tag_entry' 1582 | params: dict = { 1583 | 'src': 'web', 1584 | 'tagId': '559a7227e4b08a686d25744f', 1585 | 'page': 0, 1586 | 'pageSize': per_page, 1587 | 'sort': sort_by 1588 | } 1589 | # 豌豆花下猫 单独收录了 1590 | ignore_usernames: set = {'豌豆花下猫'} 1591 | for page in range(0, max_page): 1592 | params['page'] = page 1593 | scode = await outlands_request( 1594 | { 1595 | 'method': 'get', 1596 | 'params': params, 1597 | 'url': api, 1598 | 'ssl': False, 1599 | 'retry': 1, 1600 | 'headers': { 1601 | 'Referer': 'https://juejin.im/tag/Python?sort=popular', 1602 | 'Origin': 'https://juejin.im', 1603 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 1604 | } 1605 | }, 'u8') 1606 | if not scode: 1607 | logger.error(f'{source} crawl failed: {scode}') 1608 | return articles 1609 | items = json.loads(scode).get('d', {}).get('entrylist', []) 1610 | if not items: 1611 | break 1612 | if max_page > 1: 1613 | logger.info( 1614 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1615 | ) 1616 | for item in items: 1617 | try: 1618 | article: dict = {'source': source} 1619 | # 过滤一下已收录过的源 1620 | if item.get('user', {}).get('username', '') in ignore_usernames: 1621 | continue 1622 | # 2019-05-05T03:51:12.886Z 1623 | gmt_time = re.sub(r'\..*', '', 1624 | item['createdAt']).replace('T', ' ') 1625 | ts_publish = ttime(ptime(gmt_time, tzone=0)) 1626 | article['ts_publish'] = ts_publish 1627 | article['lang'] = 'en' if item['english'] else 'CN' 1628 | article['title'] = item['title'] 1629 | article['cover'] = item['screenshot'] 1630 | article['desc'] = item['summaryInfo'] 1631 | article['url'] = item['originalUrl'] 1632 | article['url_key'] = get_url_key(article['url']) 1633 | articles.append(article) 1634 | except Exception: 1635 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1636 | break 1637 | logger.info( 1638 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1639 | ) 1640 | return articles 1641 | 1642 | 1643 | @register_online 1644 | # @register_history 1645 | # @register_test 1646 | async def freelycode() -> list: 1647 | """Python部落""" 1648 | source: str = "Python部落" 1649 | articles: list = [] 1650 | max_page: int = 1 1651 | api: str = 'https://python.freelycode.com/contribution/list/0' 1652 | params: dict = { 1653 | 'page_no': 1, 1654 | } 1655 | host: str = 'https://python.freelycode.com/' 1656 | 1657 | def fix_time(raw_time): 1658 | # 2019-03-27 7:02 a.m. 1659 | # 2019-03-22 9:27 a.m. 1660 | # 2019-07-17 9 a.m. 
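# NOTE: illustrative walk-through, not part of the original code. The steps below
# normalize the site's "a.m./p.m." strings (and the literal '中午', i.e. noon, which the
# first replace() turns into '12:01 p.m.') so strptime can parse them with
# '%Y-%m-%d %I:%M %p'. Tracing one sample through the three steps:
#   '2019-07-17 9 a.m.'  --(no ':', insert minutes)-->   '2019-07-17 9:00 a.m.'
#                        --(drop dots: '.m.' -> 'm')-->  '2019-07-17 9:00 am'
#                        --(ptime + ttime)-->            '2019-07-17 09:00:00'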
1661 | raw_time = raw_time.replace('中午', '12:01 p.m.') 1662 | if ':' not in raw_time: 1663 | raw_time = f'{raw_time[:-5]}:00{raw_time[-5:]}' 1664 | raw_time = raw_time.replace('.m.', 'm') 1665 | formated_time = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p')) 1666 | return formated_time 1667 | 1668 | for page in range(1, max_page + 1): 1669 | params['page_no'] = page 1670 | r = await req.get( 1671 | api, 1672 | ssl=False, 1673 | params=params, 1674 | # proxy=proxy, 1675 | retry=2, 1676 | timeout=5, 1677 | headers={ 1678 | 'Referer': api, 1679 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 1680 | }, 1681 | ) 1682 | if not r: 1683 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1684 | return articles 1685 | scode: str = r.content.decode('u8', 'ignore') 1686 | items: list = fromstring(scode).cssselect( 1687 | '.table-bordered tr:nth-child(n+2)') 1688 | if not items: 1689 | break 1690 | if max_page > 1: 1691 | logger.info( 1692 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1693 | ) 1694 | for item in items: 1695 | try: 1696 | article: dict = {'source': source} 1697 | title_href = item.cssselect('td:nth-child(2)>a') 1698 | if not title_href: 1699 | continue 1700 | title: str = title_href[0].text 1701 | href: str = title_href[0].get('href', '') 1702 | url: str = add_host(href, host) 1703 | desc: str = null_tree.css(item, 'td:nth-child(3)').text 1704 | if desc: 1705 | desc = f'作者: {desc}' 1706 | raw_time: str = null_tree.css(item, 'td:nth-child(4)').text 1707 | ts_publish = fix_time(raw_time) 1708 | article['ts_publish'] = ts_publish 1709 | article['title'] = title 1710 | article['desc'] = desc 1711 | article['url'] = url 1712 | article['url_key'] = get_url_key(article['url']) 1713 | articles.append(article) 1714 | except Exception: 1715 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1716 | break 1717 | logger.info( 1718 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1719 | ) 1720 | return articles 1721 | 1722 | 1723 | @register_online 1724 | # @register_history 1725 | # @register_test 1726 | async def miguelgrinberg() -> list: 1727 | """miguelgrinberg""" 1728 | source: str = "miguelgrinberg" 1729 | articles: list = [] 1730 | start_page: int = 1 1731 | max_page: int = 1 1732 | api: str = 'https://blog.miguelgrinberg.com/index/page/' 1733 | host: str = 'https://blog.miguelgrinberg.com/' 1734 | 1735 | for page in range(start_page, max_page + 1): 1736 | page_url = f'{api}{page}' 1737 | scode = await outlands_request({'url': page_url}, retry=1) 1738 | if not scode: 1739 | logger.error(f'{source} crawl failed: {scode}') 1740 | return articles 1741 | scode = re.sub(r'', '', scode) 1742 | items: list = fromstring(scode).cssselect('#main>.post') 1743 | if not items: 1744 | break 1745 | if max_page > 1: 1746 | logger.info( 1747 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1748 | ) 1749 | for item in items: 1750 | try: 1751 | article: dict = {'source': source} 1752 | title_href = item.cssselect('h1.post-title>a') 1753 | if not title_href: 1754 | continue 1755 | title: str = title_href[0].text 1756 | href: str = title_href[0].get('href', '') 1757 | url: str = add_host(href, host) 1758 | desc: str = null_tree.css(item, '.post_body>p').text_content() 1759 | raw_time: str = null_tree.css(item, '.date>span').get( 1760 | 'data-timestamp', '').replace('T', ' ').replace('Z', '') 1761 | ts_publish = ttime(ptime(raw_time, tzone=0)) 1762 | article['ts_publish'] = ts_publish 1763 | article['title'] = title 1764 | article['desc'] = shorten_desc(desc) 1765 | article['url'] = url 1766 | article['url_key'] = get_url_key(article['url']) 1767 | articles.append(article) 1768 | except Exception: 1769 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1770 | break 1771 | logger.info( 1772 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1773 | ) 1774 | return articles 1775 | 1776 | 1777 | @register_online 1778 | # @register_history 1779 | # @register_test 1780 | async def codingpy() -> list: 1781 | """编程派""" 1782 | source: str = "编程派" 1783 | articles: list = [] 1784 | start_page: int = 1 1785 | max_page: int = 1 1786 | api: str = 'https://codingpy.com/article/' 1787 | params: dict = {'page': 1} 1788 | host: str = 'https://codingpy.com/' 1789 | 1790 | for page in range(start_page, max_page + 1): 1791 | params['page'] = page 1792 | r = await req.get( 1793 | api, 1794 | params=params, 1795 | ssl=False, 1796 | # proxy=proxy, 1797 | retry=2, 1798 | timeout=5, 1799 | headers={ 1800 | 'Referer': api, 1801 | 'User-Agent': CHROME_PC_UA 1802 | }, 1803 | ) 1804 | if not r: 1805 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1806 | return articles 1807 | scode: str = r.content.decode('u8', 'ignore') 1808 | items: list = fromstring(scode).cssselect('.archive-main>article') 1809 | if not items: 1810 | break 1811 | if max_page > 1: 1812 | logger.info( 1813 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1814 | ) 1815 | for item in items: 1816 | try: 1817 | article: dict = {'source': source} 1818 | title_href = item.cssselect('.list-item-title>a') 1819 | title: str = title_href[0].text 1820 | href: str = title_href[0].get('href', '') 1821 | bg: str = null_tree.css(item, '.lim-cover').get('style', '') 1822 | # background-image:url(/media/articles/why-python-for-startups.jpg) 1823 | cover: str = find_one(r'background-image:url\((.*?)\)', bg)[1] 1824 | cover = add_host(cover, host) 1825 | url: str = add_host(href, host) 1826 | desc: str = null_tree.css( 1827 | item, '.list-item-summary>p').text_content() 1828 | raw_time: str = null_tree.css(item, 1829 | '.list-item-meta>p>span').text 1830 | # 2015.11.03 1831 | ts_publish = ttime(ptime(raw_time, fmt='%Y.%m.%d')) 1832 | article['ts_publish'] = ts_publish 1833 | article['title'] = title 1834 | article['cover'] = cover 1835 | article['desc'] = shorten_desc(desc) 1836 | article['url'] = url 1837 | article['url_key'] = get_url_key(article['url']) 1838 | articles.append(article) 1839 | except Exception: 1840 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1841 | break 1842 | logger.info( 1843 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1844 | ) 1845 | return articles 1846 | 1847 | 1848 | @register_online 1849 | # @register_history 1850 | # @register_test 1851 | async def nedbatchelder() -> list: 1852 | """Ned Batchelder""" 1853 | source: str = "Ned Batchelder" 1854 | articles: list = [] 1855 | limit: int = 5 1856 | api: str = 'https://nedbatchelder.com/blog/tag/python.html' 1857 | host: str = 'https://nedbatchelder.com/' 1858 | scode = await outlands_request( 1859 | { 1860 | 'method': 'get', 1861 | 'timeout': 5, 1862 | 'headers': { 1863 | 'Referer': api, 1864 | 'User-Agent': CHROME_PC_UA, 1865 | }, 1866 | 'url': api, 1867 | }, 'u8') 1868 | container_html = null_tree.tostring( 1869 | null_tree.css(fromstring(scode), '.category')).decode('utf-8') 1870 | if not container_html: 1871 | logger.error(f'{source} not found container_html.') 1872 | return articles 1873 | split_by: str = '' 1874 | container_html = container_html.replace( 1875 | '

    ', f'{split_by}

    ').replace( 1876 | '', '').replace('

    ', '') 1877 | items: list = container_html.split(split_by)[1:limit + 1] 1878 | if not items: 1879 | return articles 1880 | for item in items: 1881 | try: 1882 | article: dict = {'source': source} 1883 | title_href = find_one(r'

    \s*([^<]+?)', item) 1884 | title: str = title_href[2] 1885 | href: str = title_href[1] 1886 | url: str = add_host(href, host) 1887 | raw_time: str = find_one(r'

    (\d+ .*?\d+):

    ', 1888 | item)[1] 1889 | ts_publish = ttime(ptime(raw_time, fmt='%d %b %Y')) 1890 | article['ts_publish'] = ts_publish 1891 | article['title'] = title 1892 | article['url'] = url 1893 | article['url_key'] = get_url_key(article['url']) 1894 | articles.append(article) 1895 | except Exception: 1896 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1897 | break 1898 | logger.info( 1899 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 1900 | ) 1901 | return articles 1902 | 1903 | 1904 | @register_online 1905 | # @register_history 1906 | # @register_test 1907 | async def the5fire() -> list: 1908 | """the5fire的技术博客""" 1909 | source: str = "the5fire的技术博客" 1910 | articles: list = [] 1911 | start_page: int = 1 1912 | max_page: int = 1 1913 | api: str = 'https://www.the5fire.com/category/python/' 1914 | host: str = 'https://www.the5fire.com/' 1915 | params: dict = {'page': 1} 1916 | 1917 | for page in range(start_page, max_page + 1): 1918 | params['page'] = page 1919 | r = await req.get( 1920 | api, 1921 | params=params, 1922 | ssl=False, 1923 | # proxy=proxy, 1924 | retry=1, 1925 | headers={ 1926 | 'Referer': api, 1927 | 'User-Agent': CHROME_PC_UA 1928 | }, 1929 | ) 1930 | if not r: 1931 | logger.error(f'{source} crawl failed: {r}, {r.text}') 1932 | return articles 1933 | scode: str = r.content.decode('u8', 'ignore') 1934 | items: list = fromstring(scode).cssselect('#main>.caption') 1935 | if not items: 1936 | break 1937 | if max_page > 1: 1938 | logger.info( 1939 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 1940 | ) 1941 | for item in items: 1942 | try: 1943 | article: dict = {'source': source} 1944 | title_href = item.cssselect('h3>a') 1945 | title: str = title_href[0].text 1946 | href: str = title_href[0].get('href', '') 1947 | url: str = add_host(href, host) 1948 | desc: str = null_tree.css(item, '.caption>p').text_content() 1949 | raw_time: str = null_tree.css(item, '.info').text_content() 1950 | # 发布:2019-02-22 9:47 p.m. 1951 | raw_time = find_one(r'发布:(\d\d\d\d-\d{1,2}-\d{1,2}.*)', 1952 | raw_time)[1].replace('.', '') 1953 | # 2019-03-20 10:07 p.m. 1954 | # 2011-05-28 10 a.m. 1955 | # 2011-12-08 午夜 1956 | if ':' not in raw_time: 1957 | if 'm' in raw_time: 1958 | raw_time = re.sub('m.*', 'm', raw_time) 1959 | ts_publish = ttime(ptime(raw_time, 1960 | fmt='%Y-%m-%d %I %p')) 1961 | else: 1962 | raw_time = raw_time[:10] 1963 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d')) 1964 | else: 1965 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p')) 1966 | article['ts_publish'] = ts_publish 1967 | article['title'] = title 1968 | article['desc'] = shorten_desc(desc) 1969 | article['url'] = url 1970 | article['url_key'] = get_url_key(article['url']) 1971 | articles.append(article) 1972 | except Exception: 1973 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 1974 | break 1975 | logger.info( 1976 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 1977 | ) 1978 | return articles 1979 | 1980 | 1981 | @register_online 1982 | # @register_history 1983 | # @register_test 1984 | async def foofish() -> list: 1985 | """Python之禅""" 1986 | source: str = "Python之禅" 1987 | articles: list = [] 1988 | start_page: int = 1 1989 | max_page: int = 1 1990 | api: str = 'https://foofish.net/index.html' 1991 | host: str = 'https://foofish.net/' 1992 | 1993 | for page in range(start_page, max_page + 1): 1994 | if page == 1: 1995 | seed = api 1996 | else: 1997 | seed = api.replace('index.html', f'index{page}.html') 1998 | r = await req.get( 1999 | seed, 2000 | ssl=False, 2001 | # proxy=proxy, 2002 | retry=1, 2003 | headers={ 2004 | 'Referer': api, 2005 | 'User-Agent': CHROME_PC_UA 2006 | }, 2007 | ) 2008 | if not r: 2009 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2010 | return articles 2011 | scode: str = r.content.decode('u8', 'ignore') 2012 | container: str = find_one(r'
    [\s\S]*?
    ', 2013 | scode)[0] 2014 | if not container: 2015 | logger.error('container not found') 2016 | return articles 2017 | items: list = re.findall(r'
    [\S\s]*?', container) 2018 | if not items: 2019 | break 2020 | if max_page > 1: 2021 | logger.info( 2022 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 2023 | ) 2024 | for item_html in items: 2025 | try: 2026 | article: dict = {'source': source} 2027 | item = fromstring(item_html) 2028 | title_href = item.cssselect('a') 2029 | title: str = title_href[0].text 2030 | href: str = title_href[0].get('href', '') 2031 | url: str = add_host(href, host) 2032 | raw_time: str = null_tree.css(item, 'dt').text 2033 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d')) 2034 | article['ts_publish'] = ts_publish 2035 | article['title'] = title 2036 | article['url'] = url 2037 | article['url_key'] = get_url_key(article['url']) 2038 | articles.append(article) 2039 | except Exception: 2040 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2041 | break 2042 | logger.info( 2043 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 2044 | ) 2045 | return articles 2046 | 2047 | 2048 | @register_online 2049 | # @register_history 2050 | # @register_test 2051 | async def inventwithpython() -> list: 2052 | """The Invent with Python Blog""" 2053 | source: str = "The Invent with Python Blog" 2054 | articles: list = [] 2055 | start_page: int = 1 2056 | max_page: int = 1 2057 | api: str = 'https://inventwithpython.com/blog/index.html' 2058 | host: str = 'https://inventwithpython.com/' 2059 | 2060 | for page in range(start_page, max_page + 1): 2061 | if page == 1: 2062 | seed = api 2063 | else: 2064 | seed = api.replace('index.html', f'index{page}.html') 2065 | r = await req.get( 2066 | seed, 2067 | ssl=False, 2068 | # proxy=proxy, 2069 | retry=1, 2070 | headers={ 2071 | 'Referer': api, 2072 | 'User-Agent': CHROME_PC_UA 2073 | }, 2074 | ) 2075 | if not r: 2076 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2077 | return articles 2078 | scode: str = r.content.decode('u8', 'ignore') 2079 | items: list = fromstring(scode).cssselect('#content>article') 2080 | if not items: 2081 | break 2082 | if max_page > 1: 2083 | logger.info( 2084 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 2085 | ) 2086 | for item in items: 2087 | try: 2088 | article: dict = {'source': source} 2089 | title_href = null_tree.css(item, 'h1>a') 2090 | title: str = title_href.text 2091 | href: str = title_href.get('href', '') 2092 | url: str = add_host(href, host) 2093 | raw_time: str = null_tree.css( 2094 | item, '.article-header-date').text.strip() 2095 | # Wed 05 June 2019 2096 | ts_publish = ttime(ptime(raw_time, fmt='%a %d %B %Y')) 2097 | article['ts_publish'] = ts_publish 2098 | article['title'] = title 2099 | article['url'] = url 2100 | article['url_key'] = get_url_key(article['url']) 2101 | articles.append(article) 2102 | except Exception: 2103 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2104 | break 2105 | logger.info( 2106 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 2107 | ) 2108 | return articles 2109 | 2110 | 2111 | @register_online 2112 | # @register_history 2113 | # @register_test 2114 | async def lucumr() -> list: 2115 | """Armin Ronacher's Thoughts and Writings""" 2116 | source: str = "Armin Ronacher's Thoughts and Writings" 2117 | articles: list = [] 2118 | start_page: int = 1 2119 | max_page: int = 1 2120 | api: str = 'http://lucumr.pocoo.org/' 2121 | host: str = 'http://lucumr.pocoo.org/' 2122 | 2123 | for page in range(start_page, max_page + 1): 2124 | if page == 1: 2125 | seed = api 2126 | else: 2127 | seed = add_host(f'/page/{page}/', host) 2128 | r = await req.get( 2129 | seed, 2130 | ssl=False, 2131 | # proxy=proxy, 2132 | retry=1, 2133 | headers={ 2134 | 'Referer': api, 2135 | 'User-Agent': CHROME_PC_UA 2136 | }, 2137 | ) 2138 | if not r: 2139 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2140 | return articles 2141 | scode: str = r.content.decode('u8', 'ignore') 2142 | items: list = fromstring(scode).cssselect( 2143 | '.entry-wrapper>.entry-overview') 2144 | if not items: 2145 | break 2146 | if max_page > 1: 2147 | logger.info( 2148 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 2149 | ) 2150 | for item in items: 2151 | try: 2152 | article: dict = {'source': source} 2153 | title_href = null_tree.css(item, 'h1>a') 2154 | title: str = title_href.text 2155 | href: str = title_href.get('href', '') 2156 | url: str = add_host(href, host) 2157 | desc: str = null_tree.css(item, '.summary>p').text 2158 | raw_time: str = null_tree.css(item, '.date').text.strip() 2159 | # Jun 5, 2017 2160 | ts_publish = ttime(ptime(raw_time, fmt='%b %d, %Y')) 2161 | article['ts_publish'] = ts_publish 2162 | article['title'] = title 2163 | article['desc'] = desc 2164 | article['url'] = url 2165 | article['url_key'] = get_url_key(article['url']) 2166 | articles.append(article) 2167 | except Exception: 2168 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2169 | break 2170 | logger.info( 2171 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 2172 | ) 2173 | return articles 2174 | 2175 | 2176 | @register_online 2177 | # @register_history 2178 | # @register_test 2179 | async def treyhunner() -> list: 2180 | """Trey Hunner""" 2181 | source: str = "Trey Hunner" 2182 | articles: list = [] 2183 | limit: int = 5 2184 | api: str = 'https://treyhunner.com/blog/categories/python/' 2185 | host: str = 'https://treyhunner.com/' 2186 | 2187 | r = await req.get( 2188 | api, 2189 | ssl=False, 2190 | # proxy=proxy, 2191 | retry=1, 2192 | headers={ 2193 | 'Referer': api, 2194 | 'User-Agent': CHROME_PC_UA 2195 | }, 2196 | ) 2197 | if not r: 2198 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2199 | return articles 2200 | scode: str = r.content.decode('u8', 'ignore') 2201 | items: list = fromstring(scode).cssselect('#blog-archives>article') 2202 | for item in items[:limit]: 2203 | try: 2204 | article: dict = {'source': source} 2205 | title_href = null_tree.css(item, 'h1>a') 2206 | title: str = title_href.text 2207 | href: str = title_href.get('href', '') 2208 | url: str = add_host(href, host) 2209 | raw_time: str = null_tree.css(item, 'time').get('datetime') 2210 | # 2019-06-18T09:15:00-07:00 2211 | ts_publish = ttime(ptime(raw_time.replace('T', ' ')[:19], tzone=-7)) 2212 | article['ts_publish'] = ts_publish 2213 | article['title'] = title 2214 | article['url'] = url 2215 | article['url_key'] = get_url_key(article['url']) 2216 | articles.append(article) 2217 | except Exception: 2218 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2219 | break 2220 | logger.info( 2221 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 2222 | ) 2223 | return articles 2224 | 2225 | 2226 | @register_online 2227 | # @register_history 2228 | # @register_test 2229 | async def reddit() -> list: 2230 | """Reddit""" 2231 | source: str = "Reddit" 2232 | articles: list = [] 2233 | limit: int = 22 2234 | # 有 20 赞以上的才收录 2235 | min_ups: int = 20 2236 | # 或者 10 评论的才收录 2237 | min_cmts: int = 10 2238 | # api doc: https://www.reddit.com/dev/api/#GET_top 2239 | api: str = f'https://api.reddit.com/r/Python/top/?t=day&limit={limit}' 2240 | host: str = 'https://www.reddit.com/' 2241 | for _ in range(2): 2242 | scode = await outlands_request( 2243 | { 2244 | 'method': 'get', 2245 | 'url': api, 2246 | 'headers': { 2247 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' 2248 | } 2249 | }, 'u8') 2250 | # print(scode) 2251 | if scode: 2252 | break 2253 | else: 2254 | logger.error(f'{source} crawl failed') 2255 | return articles 2256 | rj: dict = json.loads(scode) 2257 | items: list = rj['data']['children'] 2258 | for item in items: 2259 | try: 2260 | if item['kind'] != 't3': 2261 | continue 2262 | data = item['data'] 2263 | if (data.get('ups') or data.get('score') or 2264 | 0) < min_ups and (data.get('num_comments') or 0) < min_cmts: 2265 | continue 2266 | article: dict = {'source': source} 2267 | title: str = data['title'] 2268 | href: str = data['permalink'] 2269 | url: str = add_host(href, host) 2270 | raw_time: str = data['created_utc'] 2271 | # 1564420248 2272 | ts_publish = ttime(raw_time, tzone=0) 2273 | desc: str = data.get('author') or '' 2274 | article['ts_publish'] = ts_publish 2275 | article['title'] = title 2276 | article['url'] = url 2277 | article['desc'] = desc 2278 | article['url_key'] = get_url_key(article['url']) 2279 | articles.append(article) 2280 | 
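# Illustrative sketch, not part of the original source: one Reddit listing item
# as consumed above (field names are the ones the code reads; the values here
# are made up, except created_utc which echoes the inline comment):
#   {'kind': 't3',
#    'data': {'title': 'What are you working on this week?',
#             'permalink': '/r/Python/comments/abc123/what_are_you_working_on/',
#             'ups': 42, 'score': 42, 'num_comments': 17,
#             'created_utc': 1564420248, 'author': 'someone'}}
# Posts below both thresholds (min_ups and min_cmts) are skipped, and
# created_utc is a UTC unix timestamp, hence ttime(raw_time, tzone=0).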
except Exception: 2281 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2282 | break 2283 | logger.info( 2284 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 2285 | ) 2286 | return articles 2287 | 2288 | 2289 | # @register_online 2290 | # @register_history 2291 | # @register_test 2292 | async def codetengu() -> list: 2293 | """码天狗""" 2294 | source: str = "码天狗" 2295 | articles: list = [] 2296 | start_page: int = 1 2297 | # max_page: int = 999 2298 | max_page: int = 1 2299 | api: str = 'https://weekly.codetengu.com/issues' 2300 | params: dict = {'page': 1} 2301 | host: str = 'https://weekly.codetengu.com/' 2302 | 2303 | for page in range(start_page, max_page + 1): 2304 | params['page'] = page 2305 | r = await req.get( 2306 | api, 2307 | params=params, 2308 | ssl=False, 2309 | # proxy=proxy, 2310 | retry=1, 2311 | timeout=10, 2312 | headers={ 2313 | 'Referer': api, 2314 | 'User-Agent': CHROME_PC_UA 2315 | }, 2316 | ) 2317 | if not r: 2318 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2319 | return articles 2320 | scode: str = r.content.decode('u8', 'ignore') 2321 | items: list = fromstring(scode).cssselect('.item__list > li.item') 2322 | if not items: 2323 | break 2324 | if max_page > 1: 2325 | logger.info( 2326 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 2327 | ) 2328 | for item in items: 2329 | try: 2330 | article: dict = {'source': source} 2331 | title: str = item.cssselect('.item__title')[0].text 2332 | href: str = item.cssselect('a')[0].get('href', '') 2333 | cover: str = null_tree.css(item, 'img').get('src', '') 2334 | cover = add_host(cover, host) 2335 | url: str = add_host(href, host) 2336 | desc: str = null_tree.css(item, '.item__title').text_content() 2337 | raw_time: str = null_tree.css(item, 'time.published').get( 2338 | 'datetime', '1970-01-01') 2339 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d')) 2340 | article['ts_publish'] = ts_publish 2341 | article['title'] = title 2342 | article['cover'] = cover 2343 | article['desc'] = shorten_desc(desc) 2344 | article['url'] = url 2345 | article['url_key'] = get_url_key(article['url']) 2346 | articles.append(article) 2347 | except Exception: 2348 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2349 | break 2350 | logger.info( 2351 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 2352 | ) 2353 | return articles 2354 | 2355 | 2356 | @register_online 2357 | # @register_history 2358 | # @register_test 2359 | async def pychina() -> list: 2360 | """蠎周刊""" 2361 | source: str = "蠎周刊" 2362 | articles: list = [] 2363 | limit: int = 5 2364 | api: str = 'http://weekly.pychina.org/archives.html' 2365 | host: str = 'http://weekly.pychina.org/' 2366 | 2367 | r = await req.get( 2368 | api, 2369 | ssl=False, 2370 | # proxy=proxy, 2371 | retry=1, 2372 | timeout=10, 2373 | headers={ 2374 | 'Referer': '', 2375 | 'User-Agent': CHROME_PC_UA 2376 | }, 2377 | ) 2378 | if not r: 2379 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2380 | return articles 2381 | scode: str = r.content.decode('u8', 'ignore') 2382 | items: list = fromstring(scode).cssselect('#content li') 2383 | for item in items[:limit]: 2384 | try: 2385 | article: dict = {'source': source} 2386 | title_href = item.cssselect('a[title]') 2387 | if not title_href: 2388 | continue 2389 | title: str = title_href[0].text.strip() 2390 | href: str = title_href[0].get('href', '') 2391 | url: str = add_host(href, host) 2392 | raw_time: str = null_tree.css(item, 'sup').text 2393 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %H:%M')) 2394 | article['ts_publish'] = ts_publish 2395 | article['title'] = title 2396 | article['cover'] = '' 2397 | article['desc'] = '' 2398 | article['url'] = url 2399 | article['url_key'] = get_url_key(article['url']) 2400 | articles.append(article) 2401 | except Exception: 2402 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2403 | break 2404 | logger.info( 2405 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 2406 | ) 2407 | return articles 2408 | 2409 | 2410 | @register_online 2411 | # @register_history 2412 | # @register_test 2413 | async def xiaoruicc() -> list: 2414 | """峰云就她了""" 2415 | source: str = "峰云就她了" 2416 | articles: list = [] 2417 | start_page: int = 1 2418 | max_page: int = 1 2419 | # max_page: int = 999 2420 | api: str = 'http://xiaorui.cc/archives/category/python' 2421 | host: str = 'http://xiaorui.cc/' 2422 | 2423 | for page in range(start_page, max_page + 1): 2424 | api_url = f'{api}{"/page/" + str(page) if page != 1 else ""}/' 2425 | r = await req.get( 2426 | api_url, 2427 | ssl=False, 2428 | # proxy=proxy, 2429 | retry=2, 2430 | timeout=8, 2431 | headers={ 2432 | 'Referer': api_url, 2433 | 'User-Agent': CHROME_PC_UA 2434 | }, 2435 | ) 2436 | if not r: 2437 | if getattr(r, 'status_code', None) != 404: 2438 | logger.error(f'{source} crawl failed: {r}, {r.text}') 2439 | return articles 2440 | scode: str = r.content.decode('u8', 'ignore') 2441 | items: list = fromstring(scode).cssselect('.content-area>article') 2442 | if not items: 2443 | break 2444 | if max_page > 1: 2445 | logger.info( 2446 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles' 2447 | ) 2448 | for item in items: 2449 | try: 2450 | article: dict = {'source': source} 2451 | title_href = item.cssselect('.entry-title>a') 2452 | if not title_href: 2453 | continue 2454 | title: str = title_href[0].text.strip() 2455 | href: str = title_href[0].get('href', '') 2456 | url: str = add_host(href, host) 2457 | desc: str = null_tree.css( 2458 | item, '.entry-summary>*:first-child').text_content() 2459 | raw_time: str = null_tree.css(item, 'time.published').get( 2460 | 'datetime', '1970-01-01') 2461 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%dT%H:%M:%S%z')) 
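# Illustrative note, not part of the original source: the `datetime` attribute
# read above is an ISO-8601 string with a UTC offset, which is what the
# trailing %z in the format string matches; the concrete value below is a
# hypothetical example. A stdlib-only equivalent (Python 3.7+) would be roughly:
#   from datetime import datetime
#   dt = datetime.strptime('2019-06-18T09:15:00+08:00', '%Y-%m-%dT%H:%M:%S%z')
#   ts_publish = dt.astimezone().strftime('%Y-%m-%d %H:%M:%S')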
2462 | article['ts_publish'] = ts_publish 2463 | article['title'] = title 2464 | article['cover'] = '' 2465 | article['desc'] = shorten_desc(desc) 2466 | article['url'] = url 2467 | article['url_key'] = get_url_key(article['url']) 2468 | articles.append(article) 2469 | except Exception: 2470 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2471 | break 2472 | logger.info( 2473 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}' 2474 | ) 2475 | return articles 2476 | 2477 | 2478 | @register_online 2479 | # @register_history 2480 | # @register_test 2481 | async def medium_python() -> list: 2482 | """Medium""" 2483 | source: str = 'Medium' 2484 | articles: list = [] 2485 | limit = 10 2486 | seed = 'https://medium.com/feed/tag/python' 2487 | # 避免超时, 用外部访问 2488 | scode = await outlands_request({ 2489 | 'method': 'get', 2490 | 'url': seed, 2491 | }, 'u8') 2492 | items = fromstring(scode.encode('utf-8'), 2493 | parser=XMLParser()).xpath('//channel/item') 2494 | now = ttime() 2495 | for item in items[:limit]: 2496 | try: 2497 | article: dict = {'source': source} 2498 | guid = item.xpath('./guid/text()') 2499 | title = item.xpath('./title/text()') 2500 | description = item.xpath('./description/text()') 2501 | author = item.xpath("./*[local-name()='creator']/text()") 2502 | pubDate = item.xpath("./*[local-name()='updated']/text()") 2503 | if not (guid and title): 2504 | continue 2505 | url = guid[0] 2506 | title = title[0] 2507 | if description: 2508 | desc = fromstring(description[0]).text_content() 2509 | # 去掉 <> 2510 | desc = re.sub('<[^>]*>', ' ', desc) 2511 | # 只保留第一个换行前面的 2512 | desc = shorten_desc(desc) 2513 | else: 2514 | desc = '' 2515 | if 'Continue reading on' in desc: 2516 | continue 2517 | if author: 2518 | desc = f'[{author[0]}] {desc}' 2519 | if not ensure_cn_en(f'{title}{desc}'): 2520 | continue 2521 | if pubDate: 2522 | raw_pub_date = pubDate[0] 2523 | # Wed, 22 May 2019 01:47:44 +0000 2524 | raw_pub_date = re.sub(r'\..*', '', raw_pub_date).strip() 2525 | ts_publish = ttime( 2526 | ptime(raw_pub_date, fmt='%Y-%m-%dT%H:%M:%S') + 3600 * 8) 2527 | else: 2528 | ts_publish = now 2529 | article['ts_publish'] = ts_publish 2530 | article['title'] = title 2531 | article['desc'] = desc 2532 | article['url'] = url 2533 | article['url_key'] = get_url_key(article['url']) 2534 | articles.append(article) 2535 | except Exception: 2536 | logger.error(f'{source} crawl failed: {traceback.format_exc()}') 2537 | break 2538 | logger.info( 2539 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" 
if not articles else ""}' 2540 | ) 2541 | return articles 2542 | -------------------------------------------------------------------------------- /newspaper/loggers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | from logging.handlers import RotatingFileHandler 4 | 5 | log_dir = pathlib.Path(__file__).absolute().parent.parent / 'logs' 6 | 7 | 8 | def init_logger(logger_name=None, 9 | file_name='server.log', 10 | max_mb=50, 11 | backupCount=1): 12 | if not log_dir.is_dir(): 13 | log_dir.mkdir() 14 | formatter_str = ( 15 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s" 16 | ) 17 | datefmt = "%Y-%m-%d %H:%M:%S" 18 | formatter = logging.Formatter(formatter_str, datefmt=datefmt) 19 | logger = logging.getLogger(logger_name) 20 | logger.setLevel(logging.INFO) 21 | stream_hl = logging.StreamHandler() 22 | stream_hl.setFormatter(formatter) 23 | stream_hl.setLevel(logging.INFO) 24 | logger.addHandler(stream_hl) 25 | 26 | file_hl = RotatingFileHandler(filename=log_dir / file_name, 27 | maxBytes=1024 * 1024 * max_mb, 28 | backupCount=backupCount, 29 | encoding='utf-8') 30 | file_hl.setFormatter(formatter) 31 | file_hl.setLevel(logging.INFO) 32 | logger.addHandler(file_hl) 33 | return logger 34 | 35 | 36 | logger = init_logger('server', 'server.log') 37 | spider_logger = init_logger('spider_logger', 38 | 'spider.log', 39 | max_mb=5, 40 | backupCount=1) 41 | -------------------------------------------------------------------------------- /newspaper/models.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import re 3 | import sqlite3 4 | import typing 5 | import warnings 6 | from datetime import datetime 7 | 8 | import aiomysql 9 | from async_lru import alru_cache 10 | from torequests.utils import ptime, time, ttime 11 | 12 | from .loggers import logger 13 | from .crawler.sources import content_sources_dict 14 | 15 | # 用了 insert ignore 还总是 warning, 又不想 insert try, 只好全禁掉了... 16 | warnings.filterwarnings('ignore', category=aiomysql.Warning) 17 | 18 | 19 | class Storage(object, metaclass=abc.ABCMeta): 20 | """存储器抽象. 
统一参数对文章数据库进行增删改查.""" 21 | max_limit = 100 # 避免 limit 设置的太大一次提取太多导致拥堵 22 | articles_table_columns = ('url_key', 'title', 'url', 'cover', 'desc', 23 | 'source', 'level', 'review', 'ts_publish', 24 | 'ts_create', 'ts_update') 25 | 26 | def format_output_articles(self, articles: typing.Sequence[dict]): 27 | for article in articles: 28 | for key, value in article.items(): 29 | if isinstance(value, datetime): 30 | article[key] = str(value) 31 | return articles 32 | 33 | @staticmethod 34 | def ensure_articles(articles: typing.Sequence[dict]) -> list: 35 | valid_articles = [] 36 | # ensure_keys = ("url_key", "title", "cover", "desc", "source", 37 | # "review", "ts_publish", "lang") 38 | keys_set = None 39 | now = ttime() 40 | before_3_day_0_0 = f'{ttime(time.time() - 86400*3)[:10]} 00:00:00' 41 | for article in articles: 42 | if not isinstance(article, dict): 43 | continue 44 | if not keys_set: 45 | keys_set = set(article.keys()) 46 | else: 47 | # 如果 keys 和第一个不一样, 就没法使用 executemany, 所以跳过 48 | if set(article.keys()) != keys_set: 49 | continue 50 | # 这些 key 必须都存在才能入库 51 | source = content_sources_dict.get(article['source']) 52 | if not source: 53 | continue 54 | for ensure_key in ('url_key', 'title'): 55 | if not article.get(ensure_key): 56 | continue 57 | article.setdefault('cover', '') 58 | article.setdefault('desc', '') 59 | article.setdefault('source', 'unknown') 60 | article.setdefault('review', '') 61 | article.setdefault('level', source.get('level', 3)) 62 | article.setdefault('lang', source.get('lang', 'CN')) 63 | article.setdefault('ts_publish', '1970-01-01 08:00:01') 64 | article['lang'] = article['lang'].upper() 65 | article['desc'] = re.sub( 66 | r'|', '', 67 | article['desc']).strip() 68 | article['title'] = article['title'].strip() 69 | # mysql 会报错 0000-00-00 00:00:00 格式错误; 顺便尝试转换掉错误的发布时间 70 | if ttime(ptime(article['ts_publish'])) == '1970-01-01 08:00:00': 71 | article['ts_publish'] = '1970-01-01 08:00:01' 72 | if not article.get('ts_create'): 73 | # 最近 3 天发布的, 使用当前时间做抓取时间 74 | # 如果发布时间不存在, 也使用当前时间做抓取时间 75 | if article['ts_publish'] >= before_3_day_0_0 or article[ 76 | 'ts_publish'] == '1970-01-01 08:00:01': 77 | article['ts_create'] = now 78 | else: 79 | # 不是 3 天内发布的, 使用发布时间做抓取时间 80 | article['ts_create'] = article['ts_publish'] 81 | valid_articles.append(article) 82 | return valid_articles 83 | 84 | @abc.abstractmethod 85 | async def add_articles(self, *args, **kwargs): 86 | raise NotImplementedError 87 | 88 | @abc.abstractmethod 89 | async def del_articles(self, *args, **kwargs): 90 | raise NotImplementedError 91 | 92 | @abc.abstractmethod 93 | async def update_articles(self, *args, **kwargs): 94 | raise NotImplementedError 95 | 96 | @abc.abstractmethod 97 | async def query_articles(self, *args, **kwargs): 98 | raise NotImplementedError 99 | 100 | 101 | class MySQLStorage(Storage): 102 | """连接 mysql 线上数据库, 目前不需要读写分离, 因为只初始化一次, 所以不需要单例.""" 103 | 104 | def __init__(self, mysql_config): 105 | self.host = mysql_config['mysql_host'] 106 | self.port = mysql_config['mysql_port'] 107 | self.user = mysql_config['mysql_user'] 108 | self.password = mysql_config['mysql_password'] 109 | self.db = mysql_config['mysql_db'] 110 | self.autocommit = True 111 | self.pool_recycle = 600 112 | self.connect_args = dict(host=self.host, 113 | port=self.port, 114 | user=self.user, 115 | password=self.password, 116 | db=self.db, 117 | autocommit=self.autocommit, 118 | pool_recycle=self.pool_recycle) 119 | self.pool = None 120 | 121 | async def get_pool(self): 122 | if self.pool and not self.pool._closed: 123 
| return self.pool 124 | self.pool = await aiomysql.create_pool(**self.connect_args) 125 | return self.pool 126 | 127 | async def _execute(self, 128 | cursor, 129 | execute_cmd: str, 130 | sql: str, 131 | args: typing.Union[list, dict] = None, 132 | fetchall: typing.Union[bool, None] = True, 133 | cursor_class: aiomysql.Cursor = aiomysql.DictCursor): 134 | """用来在指定 cursor 对象的时候执行语句""" 135 | result = await getattr(cursor, execute_cmd)(sql, args) 136 | if isinstance(cursor._executed, str): 137 | # 有时候是 bytesarray, 没什么必要看 138 | logger.info(f'[Execute SQL]: {cursor._executed[:256]}') 139 | if fetchall: 140 | result = await cursor.fetchall() 141 | elif fetchall is False: 142 | result = await cursor.fetchone() 143 | elif fetchall is None: 144 | result = result 145 | return result 146 | 147 | async def execute(self, 148 | sql: str, 149 | args: typing.Union[list, dict] = None, 150 | fetchall: typing.Union[bool, None] = True, 151 | cursor_class: aiomysql.Cursor = aiomysql.DictCursor, 152 | cursor: aiomysql.Cursor = None) -> typing.Any: 153 | """简单的通过 sql 获取数据. 154 | 155 | :param sql: query 的 sql 语句 156 | :type sql: str 157 | :param args: query 语句的参数, defaults to None 158 | :type args: typing.Union[list, dict], optional 159 | :param fetchall: 是否全部取出来, 默认为 True, 调用 fetchall; 如果设为 None(默认), 只返回受影响的行数; 如果设为 False, 则调用 fetchone 160 | :type fetchall: bool, optional 161 | :param cursor_class: 默认使用字典表示一行数据, defaults to aiomysql.DictCursor 162 | :type cursor_class: aiomysql.Cursor, optional 163 | :param cursor: 现成的 cursor, 如果没有指定, 则去连接池里创建 164 | :type cursor_class: aiomysql.Cursor 165 | :return: 返回 fetchmany / fetchone 的结果 166 | :rtype: typing.Any 167 | """ 168 | if cursor: 169 | return await self._execute(cursor, 170 | 'execute', 171 | sql=sql, 172 | args=args, 173 | fetchall=fetchall, 174 | cursor_class=cursor_class) 175 | conn_pool = await self.get_pool() 176 | async with conn_pool.acquire() as conn: 177 | async with conn.cursor(cursor_class) as cursor: 178 | return await self._execute(cursor, 179 | 'execute', 180 | sql=sql, 181 | args=args, 182 | fetchall=fetchall, 183 | cursor_class=cursor_class) 184 | 185 | async def executemany(self, 186 | sql: str, 187 | args: list = None, 188 | fetchall: typing.Union[bool, None] = True, 189 | cursor_class: aiomysql.Cursor = aiomysql.DictCursor, 190 | cursor: aiomysql.Cursor = None) -> typing.Any: 191 | """简单的通过 sql 获取数据. 
192 | 193 | :param sql: query 的 sql 语句 194 | :type sql: str 195 | :param args: query 语句的参数, 只能为 list, defaults to None 196 | :type args: list, optional 197 | :param fetchall: 是否全部取出来, 默认为 True, 调用 fetchall; 如果设为 None(默认), 只返回受影响的行数; 如果设为 False, 则调用 fetchone 198 | :type fetchall: bool, optional 199 | :param cursor_class: 默认使用字典表示一行数据, defaults to aiomysql.DictCursor 200 | :type cursor_class: aiomysql.Cursor, optional 201 | :return: 返回 fetchmany / fetchone 的结果 202 | :rtype: typing.Any 203 | """ 204 | if cursor: 205 | return await self._execute(cursor, 206 | 'executemany', 207 | sql=sql, 208 | args=args, 209 | fetchall=fetchall, 210 | cursor_class=cursor_class) 211 | conn_pool = await self.get_pool() 212 | async with conn_pool.acquire() as conn: 213 | async with conn.cursor(cursor_class) as cursor: 214 | return await self._execute(cursor, 215 | 'executemany', 216 | sql=sql, 217 | args=args, 218 | fetchall=fetchall, 219 | cursor_class=cursor_class) 220 | 221 | async def _ensure_article_table_exists(self): 222 | is_exists = await self.execute( 223 | "SELECT table_name FROM information_schema.TABLES WHERE table_name ='articles'", 224 | fetchall=False) 225 | if is_exists: 226 | logger.info('`articles` table exists.') 227 | return 228 | logger.info('start creating `articles` table for table missing.') 229 | #! 每次修改这里要确定好和下面的 sqlite 部分是一致的 230 | sql = '''CREATE TABLE `articles` ( 231 | `url_key` char(32) NOT NULL COMMENT '通过 url 计算的 md5', 232 | `title` varchar(128) NOT NULL DEFAULT '无题' COMMENT '文章标题', 233 | `url` varchar(255) NOT NULL COMMENT '文章地址', 234 | `cover` varchar(255) NOT NULL DEFAULT '' COMMENT '文章封面图片', 235 | `desc` text COMMENT '文章描述, 如果是周报, 则包含所有文字', 236 | `source` varchar(32) NOT NULL DEFAULT '未知' COMMENT '文章来源', 237 | `level` tinyint(4) NOT NULL COMMENT '来源评分', 238 | `lang` char(2) DEFAULT NULL COMMENT '语言类型 cn, en', 239 | `review` varchar(255) NOT NULL DEFAULT '' COMMENT '点评评语', 240 | `ts_publish` timestamp NOT NULL DEFAULT '1970-01-01 08:00:01' COMMENT '发布时间', 241 | `ts_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间', 242 | `ts_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', 243 | PRIMARY KEY (`url_key`), 244 | KEY `ts_create_index` (`ts_create`) USING BTREE, 245 | KEY `ts_publish_index` (`ts_publish`), 246 | FULLTEXT KEY `full_text_index` (`title`,`desc`,`url`) /*!50100 WITH PARSER `ngram` */ 247 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='存放文章数据.' 248 | ''' 249 | await self.execute(sql, fetchall=None) 250 | logger.info('`articles` table created.') 251 | 252 | async def add_articles(self, articles, cursor=None): 253 | """事先要注意保证 articles list 的每个 dict keys 是一样的""" 254 | old_articles_length = len(articles) 255 | articles = self.ensure_articles(articles) 256 | if not articles: 257 | return 258 | # 拿到第一个 article 的 keys 拼凑 sql 259 | keys = ', '.join([f'`{key}`' for key in articles[0].keys()]) 260 | value_keys = ','.join([f'%({key})s' for key in articles[0].keys()]) 261 | sql = f'''insert ignore into `articles` ({keys}) values ({value_keys})''' 262 | result = await self.executemany(sql, 263 | articles, 264 | fetchall=None, 265 | cursor=cursor) 266 | source = articles[0]['source'] 267 | if result: 268 | logger.info( 269 | f'[{source}]: crawled {old_articles_length} articles, inserted {result}.' 
270 | ) 271 | return result 272 | 273 | async def del_articles(self, *args, **kwargs): 274 | raise NotImplementedError 275 | 276 | async def update_articles(self, *args, **kwargs): 277 | raise NotImplementedError 278 | 279 | @alru_cache(maxsize=30) 280 | async def query_articles( 281 | self, 282 | query: str = None, 283 | start_time: str = "", 284 | end_time: str = "", 285 | source: str = "", 286 | order_by: str = 'ts_create', 287 | sorting: str = 'desc', 288 | limit: int = 30, 289 | offset: int = 0, 290 | date: str = '', 291 | lang: str = 'ANY', 292 | ) -> dict: 293 | args: list = [] 294 | where_list: list = [] 295 | result: dict = {} 296 | source = str(source) 297 | order_by = order_by.strip(' `') 298 | limit = min((self.max_limit, int(limit))) 299 | offset = int(offset) 300 | lang = str(lang).upper() 301 | extra_select_words: str = '' 302 | 303 | if query: 304 | # 带检索词的,添加上字段方便排序 305 | extra_select_words = ', MATCH (`title`, `desc`, `url`) AGAINST (%s IN BOOLEAN MODE) as relevance' 306 | args.append(query) 307 | where_list.append( 308 | 'MATCH (`title`, `desc`, `url`) AGAINST (%s in BOOLEAN MODE)') 309 | args.append(query) 310 | if order_by not in self.articles_table_columns and order_by != 'relevance': 311 | order_by = 'ts_create' 312 | order_by_sorting = f'order by {order_by} {sorting}' 313 | if date: 314 | if date == 'today': 315 | date = ttime()[:10] 316 | elif date == 'yesterday': 317 | date = ttime(time.time() - 86400)[:10] 318 | # 将 date 换算成起止时间并覆盖 319 | date = str(date) 320 | if not re.match('\\d\\d\\d\\d-\\d\\d-\\d\\d', date): 321 | raise ValueError(f'日期参数的格式不对 {date}, 例: 2019-05-14') 322 | start_time = f'{date} 00:00:00' 323 | end_time = f'{date} 23:59:59' 324 | limit = 9999 325 | if sorting.lower() not in ('desc', 'asc'): 326 | sorting = 'desc' 327 | if start_time: 328 | where_list.append("`ts_publish` >= %s") 329 | args.append(start_time) 330 | result['start_time'] = start_time 331 | if end_time: 332 | where_list.append("`ts_publish` <= %s") 333 | args.append(end_time) 334 | result['end_time'] = end_time 335 | if source: 336 | where_list.append("`source` = %s") 337 | args.append(source) 338 | result['source'] = source 339 | 340 | if lang in {'CN', 'EN'}: 341 | where_list.append("`lang` = %s") 342 | args.append(lang) 343 | else: 344 | lang = 'ANY' 345 | 346 | result['order_by'] = order_by 347 | result['query'] = query or '' 348 | result['sorting'] = sorting 349 | result['limit'] = limit 350 | result['offset'] = offset 351 | result['date'] = date 352 | args.extend([limit + 1, offset]) 353 | if where_list: 354 | where_string = 'where ' + ' and '.join(where_list) 355 | else: 356 | where_string = '' 357 | sql = f"SELECT *{extra_select_words} from articles {where_string} {order_by_sorting} limit %s offset %s" 358 | logger.info(f'fetching articles sql: {sql}, args: {args}') 359 | items = await self.execute(sql, args) 360 | result['has_more'] = 1 if len(items) > limit else 0 361 | articles = self.format_output_articles(items[:limit]) 362 | result['articles'] = articles 363 | result['lang'] = lang 364 | return result 365 | 366 | 367 | class Sqlite3Storage(Storage): 368 | """本地数据库, 主要用来备份线上数据避免阿里云翻车或者迁移的时候用.""" 369 | 370 | def __init__(self, file_path): 371 | self.db = sqlite3.connect(file_path) 372 | self.cursor = self.db.cursor() 373 | 374 | def __del__(self): 375 | self.db.close() 376 | 377 | def add_articles(self, articles): 378 | articles = self.ensure_articles(articles) 379 | if not articles: 380 | return 381 | for article in articles: 382 | keys = list(article.keys()) 383 | 
keys_str = ', '.join([f'`{key}`' for key in keys]) 384 | values = [article[key] for key in keys] 385 | value_keys = ','.join([f'?' for key in keys]) 386 | sql = f'''insert or ignore into `articles` ({keys_str}) values ({value_keys})''' 387 | result = self.cursor.execute(sql, values) 388 | self.db.commit() 389 | return result 390 | 391 | def del_articles(self, *args, **kwargs): 392 | pass 393 | 394 | def update_articles(self, *args, **kwargs): 395 | pass 396 | 397 | def query_articles(self, *args, **kwargs): 398 | pass 399 | 400 | def _ensure_article_table_exists(self): 401 | self.cursor.execute( 402 | "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='articles'" 403 | ) 404 | is_exists = bool(self.cursor.fetchone()[0]) 405 | if is_exists: 406 | logger.info('`articles` table exists. [sqlite]') 407 | return 408 | logger.info( 409 | 'start creating `articles` table for table missing. [sqlite]') 410 | #! sqlite 只用来备份, 所以不建索引, 不支持 mysql 的 ENGINE, INDEX, COMMENT 411 | self.cursor.execute("""CREATE TABLE `articles` ( 412 | `url_key` char(32) NOT NULL , 413 | `title` varchar(128) NOT NULL DEFAULT '无题' , 414 | `url` varchar(255) NOT NULL , 415 | `cover` varchar(255) NOT NULL DEFAULT '' , 416 | `desc` text , 417 | `source` varchar(32) NOT NULL DEFAULT '未知' , 418 | `level` tinyint(4) NOT NULL , 419 | `lang` char(2) DEFAULT NULL , 420 | `review` varchar(255) NOT NULL DEFAULT '' , 421 | `ts_publish` timestamp NOT NULL DEFAULT '1970-01-01 08:00:01' , 422 | `ts_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP , 423 | `ts_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 424 | PRIMARY KEY (`url_key`) 425 | )""") 426 | 427 | 428 | class MongoDBStorage(Storage): 429 | """连接免费的 mongolab 数据库, 之后迁移到 heroku + mlab(免费 mongodb) 的时候使用它.""" 430 | -------------------------------------------------------------------------------- /newspaper/server.py: -------------------------------------------------------------------------------- 1 | from .views import app 2 | import uvicorn 3 | 4 | 5 | def main(): 6 | uvicorn.run( 7 | 'newspaper.views:app', 8 | host='127.0.0.1', 9 | port=9001, 10 | proxy_headers=True, 11 | ) 12 | # logger=app.logger) 13 | -------------------------------------------------------------------------------- /newspaper/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClericPy/newspaper/e7826da716aec72dce60da345058337f8bc7726a/newspaper/static/favicon.ico -------------------------------------------------------------------------------- /newspaper/templates/articles.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Python Timeline 11 | 12 | 13 | 23 | 24 | 222 | 223 | 224 | 225 |
    [articles.html body text (markup not preserved): page title "Python Timeline"; query form fields query, date, start_time, end_time, source, lang, order_by, sorting, limit, offset; links Github / RSS / Sources; a "Next Page" control; {{BEIAN_ID}} footer]
    293 | 294 | 406 | 407 | 408 | 409 | -------------------------------------------------------------------------------- /newspaper/templates/daily_python.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Python Daily: {{title|safe}} 11 | 12 | 13 | 23 | 24 | 174 | 175 | 176 | 177 |
    [daily_python.html body text (markup not preserved): only the {{BEIAN_ID}} footer remains]
    194 | 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /newspaper/utils.py: -------------------------------------------------------------------------------- 1 | from re import compile as re_compile 2 | 3 | import aiofiles 4 | from torequests.utils import escape 5 | 6 | com = re_compile(r'[\u4e00-\u9fa5\Wa-zA-Z0-9]+') 7 | 8 | 9 | def ensure_cn_en(string): 10 | new_string = com.sub('', string) 11 | return new_string.encode('u8') == new_string.encode('gb18030') 12 | 13 | 14 | async def tail_file(fp, size=100): 15 | current_seek = 0 16 | async with aiofiles.open(fp, encoding='u8', errors='ignore') as f: 17 | while 1: 18 | await f.seek(current_seek) 19 | text = await f.read(1) 20 | if not text: 21 | stop_pos = current_seek - size * 2 22 | break 23 | current_seek += size 24 | if stop_pos < 0: 25 | stop_pos = 0 26 | await f.seek(stop_pos) 27 | text = (await f.read())[-size:] 28 | return text 29 | 30 | 31 | def gen_rss(data): 32 | nodes = [] 33 | channel = data['channel'] 34 | channel_title = channel['title'] 35 | channel_desc = channel['description'] 36 | channel_link = channel['link'] 37 | channel_language = channel.get('language', 'zh-cn') 38 | item_keys = ['title', 'description', 'link', 'guid', 'pubDate'] 39 | for item in data['items']: 40 | item_nodes = [] 41 | for key in item_keys: 42 | value = item.get(key) 43 | if value: 44 | item_nodes.append(f'<{key}>{escape(value)}') 45 | nodes.append(''.join(item_nodes)) 46 | items_string = ''.join((f'{tmp}' for tmp in nodes)) 47 | return rf''' 48 | 49 | 50 | {channel_title} 51 | {channel_link} 52 | {channel_desc} 53 | {channel_language} 54 | {items_string} 55 | 56 | 57 | ''' 58 | -------------------------------------------------------------------------------- /newspaper/views.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import traceback 4 | from urllib.parse import urlencode 5 | 6 | from starlette.responses import (HTMLResponse, JSONResponse, PlainTextResponse, 7 | RedirectResponse, Response) 8 | from torequests.utils import ptime, time, ttime 9 | 10 | from .api import app 11 | from .config import BEIAN_ID, GA_ID, ONLINE_HOST 12 | from .crawler.sources import content_sources_dict 13 | from .loggers import log_dir 14 | from .utils import gen_rss, tail_file 15 | 16 | 17 | class APIError(Exception): 18 | pass 19 | 20 | 21 | def handle_pagination_response(url: str, result: dict) -> dict: 22 | base_url = re.sub(r'^https?://[^/]+|\?.*', '', url) 23 | result['ok'] = True 24 | params = { 25 | k: v 26 | for k, v in sorted(result.items(), key=lambda x: x[0]) 27 | if k not in {'articles', 'has_more', 'next_url', 'prev_url', 'ok'} 28 | } 29 | prev_offset = max((result['offset'] - result['limit'], 0)) 30 | next_offset = result['offset'] + result['limit'] 31 | if result['offset'] > 0: 32 | # 前页地址 33 | params['offset'] = prev_offset 34 | result['prev_url'] = f'{base_url}?{urlencode(params)}' 35 | if result.get('has_more'): 36 | # 后页地址 37 | params['offset'] = next_offset 38 | result['next_url'] = f'{base_url}?{urlencode(params)}' 39 | return result 40 | 41 | 42 | @app.exception_handler(Exception) 43 | def handle_default_exception(req, error): 44 | """非 API 错误的捕获, 会被下面的 API 错误覆盖""" 45 | err_string = f'{error.__class__.__name__}: {str(error)}' 46 | app.logger.error(f"{str(req.url)}, {err_string}.\n{traceback.format_exc()}") 47 | # 避免泄漏信息, 只输出错误类型 48 | return JSONResponse({"ok": False, "error": error.__class__.__name__}) 49 | 50 | 51 | 
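# Illustrative example, not part of the original source, tracing
# handle_pagination_response() defined above; the values are made up:
#
#   result = {'limit': 10, 'offset': 10, 'has_more': 1, 'lang': 'ANY',
#             'query': 'asyncio', 'articles': []}
#   url = 'https://example.com/newspaper/articles.query.json?limit=10&offset=10&query=asyncio'
#
# base_url becomes '/newspaper/articles.query.json'; the params other than
# articles/has_more/ok are sorted and re-encoded, so the helper fills in
#   prev_url = '/newspaper/articles.query.json?lang=ANY&limit=10&offset=0&query=asyncio'
#   next_url = '/newspaper/articles.query.json?lang=ANY&limit=10&offset=20&query=asyncio'
# prev_url is only added when offset > 0, next_url only when has_more is set.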
@app.exception_handler(APIError) 52 | def handle_api_error(req, error): 53 | """只捕获主动 raise 出来的 API error""" 54 | err_string = str(error) 55 | app.logger.error( 56 | f"{str(req.url)}, APIError: {err_string}.\n{traceback.format_exc()}") 57 | # APIError 一般不会带上敏感信息 58 | return JSONResponse({"ok": False, "error": err_string}) 59 | 60 | 61 | @app.route('/') 62 | async def index(req): 63 | beian = '
    {BEIAN_ID}'.format( 64 | BEIAN_ID=BEIAN_ID) 65 | return HTMLResponse(beian) 66 | 67 | 68 | @app.route("/newspaper/articles.cache.clear") 69 | async def articles_query_cache_clear(req): 70 | if req.client.host == '127.0.0.1': 71 | app.db.query_articles.cache_clear() 72 | return PlainTextResponse('ok') 73 | else: 74 | return PlainTextResponse('fail') 75 | 76 | 77 | @app.route("/newspaper/logs/spider") 78 | async def spider_log(req): 79 | """只允许查看 spider log, 其他的信息不对外开放""" 80 | fp = log_dir / 'spider.log' 81 | size = req.query_params.get('size') or req.query_params.get('s') 82 | if size: 83 | size = int(size) 84 | else: 85 | size = len([ 86 | i for i in content_sources_dict.values() if i['status'] == '√' 87 | ]) * 120 88 | text = await tail_file(fp, size) 89 | return PlainTextResponse(text) 90 | 91 | 92 | @app.route('/favicon.ico') 93 | async def redirect_ico(req): 94 | return RedirectResponse('/static/favicon.ico', 301) 95 | 96 | 97 | @app.route("/newspaper/articles.query.{output}") 98 | async def articles_query(req): 99 | """搜索文章 100 | output 支持: html(默认), json, rss 101 | 102 | 支持参数: 103 | query: str = None, 104 | start_time: str = "", 105 | end_time: str = "", 106 | source: str = "", 107 | order_by: str = 'ts_create', 108 | sorting: str = 'desc', 109 | limit: int = 10, 110 | offset: int = 0 111 | """ 112 | output = req.path_params['output'] 113 | if output == 'json': 114 | params = dict(req.query_params) 115 | params['limit'] = params.get('limit') or 10 116 | params['offset'] = params.get('offset') or 0 117 | result = await app.db.query_articles(**params) 118 | return JSONResponse(handle_pagination_response(req.url._url, result)) 119 | elif output == 'html': 120 | return app.templates.TemplateResponse('articles.html', { 121 | "request": req, 122 | "GA_ID": GA_ID, 123 | "BEIAN_ID": BEIAN_ID, 124 | }) 125 | elif output == 'rss': 126 | # 只保留日报的 RSS 接口, 不再对 Timeline 做 rss 了, 没有必要 127 | # https://www.clericpy.top/newspaper/daily.python.list.rss.any 128 | params = dict(req.query_params) 129 | lang = params.get('lang', 'any').lower() 130 | return RedirectResponse(f'/newspaper/daily.python.list.rss.{lang}', 302) 131 | else: 132 | return PlainTextResponse("NotImplemented") 133 | 134 | 135 | @app.route("/newspaper/daily.python/{date}") 136 | async def daily_python(req): 137 | """Python 日报, 按 date 取文章, 以后考虑支持更多参数(过滤订阅源, 过滤 level, 过滤中英文)""" 138 | date = req.path_params['date'] 139 | params = dict(req.query_params) 140 | # 默认按发布时间 141 | params.setdefault('order_by', 'ts_publish') 142 | result = await app.db.query_articles(date=date, **params) 143 | return app.templates.TemplateResponse( 144 | 'daily_python.html', { 145 | "request": req, 146 | "articles": json.dumps(result).replace('`', "'"), 147 | "title": date, 148 | "GA_ID": GA_ID, 149 | "BEIAN_ID": BEIAN_ID, 150 | }) 151 | 152 | 153 | @app.route("/newspaper/daily.python.list.rss.{language}") 154 | async def daily_python_list(req): 155 | """Python 日报列表, 其实就是个按照日期伪造的页面, 用来订阅 rss""" 156 | language = req.path_params['language'].lower() 157 | if language not in {'cn', 'en', 'any'}: 158 | return PlainTextResponse('language should be cn / en / any.') 159 | limit: int = int(req.query_params.get('limit') or 10) 160 | xml_data: dict = { 161 | 'channel': { 162 | 'title': 'Python Daily', 163 | 'description': 'Python Daily Newspaper', 164 | 'link': f'https://{ONLINE_HOST}/newspaper/daily.python.list.rss.{language}', 165 | 'language': { 166 | 'cn': 'zh-cn', 167 | 'any': 'zh-cn' 168 | }.get(language, 'en'), 169 | }, 170 | 'items': [] 171 | } 172 | for 
date_delta in range(1, limit): 173 | title_date: str = ttime(time.time() - 86400 * date_delta)[:10] 174 | # 当日 0 点发布前一天的结果 175 | pubDate: str = ttime( 176 | ptime( 177 | ttime(time.time() - 86400 * (date_delta - 1))[:10], 178 | fmt='%Y-%m-%d'), 179 | fmt='%a, %d %b %Y') 180 | link: str = f'https://{ONLINE_HOST}/newspaper/daily.python/{title_date}?lang={language}' 181 | item: dict = { 182 | 'title': f'Python Daily [{title_date}]', 183 | 'link': link, 184 | 'guid': link, 185 | 'pubDate': pubDate 186 | } 187 | xml_data['items'].append(item) 188 | xml: str = gen_rss(xml_data) 189 | return Response(xml, media_type='text/xml') 190 | 191 | 192 | @app.route("/newspaper/source.redirect") 193 | async def source_redirect(req): 194 | """Python 日报, 按 date 取文章, 以后考虑支持更多参数(过滤订阅源, 过滤 level, 过滤中英文)""" 195 | name = req.query_params['name'] 196 | return RedirectResponse( 197 | content_sources_dict.get(name, {}).get( 198 | 'url', '/newspaper/articles.query.html'), 302) 199 | -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | #! pipenv run python 2 | """ 3 | 启动后端 API 服务 4 | """ 5 | 6 | from newspaper.server import main 7 | 8 | if __name__ == "__main__": 9 | main() 10 | --------------------------------------------------------------------------------
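Taken together, the pieces above wire a Starlette app (newspaper/views.py) to the MySQL-backed query layer (newspaper/models.py) and serve it with uvicorn on 127.0.0.1:9001 (newspaper/server.py). The snippet below is a minimal local sketch, not part of the repository: it assumes the MySQL config is filled in and the API has been started with `python run_server.py`, and it queries the JSON endpoint using only the standard library, with parameter names taken from the articles_query() docstring.

import json
from urllib.parse import urlencode
from urllib.request import urlopen

# Build a query against the local JSON endpoint.
params = {'query': 'asyncio', 'lang': 'EN', 'limit': 10, 'offset': 0}
url = f'http://127.0.0.1:9001/newspaper/articles.query.json?{urlencode(params)}'
with urlopen(url) as resp:
    result = json.loads(resp.read().decode('utf-8'))

# The payload mirrors MySQLStorage.query_articles(): an 'articles' list plus the
# echoed pagination params and, when more rows exist, a ready-made 'next_url'.
for article in result.get('articles', []):
    print(article['ts_publish'], article['title'], article['url'])
if result.get('next_url'):
    print('next page:', result['next_url'])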