├── .gitignore
├── LICENSE
├── Pipfile
├── README.md
├── bin
│   ├── .config
│   │   └── systemd
│   │       └── user
│   │           ├── newspaper_spider.service
│   │           ├── newspaper_spider.timer
│   │           └── newspaper_web.service
│   ├── git-sync.sh
│   ├── obsoleted
│   │   ├── restart.sh
│   │   ├── start.sh
│   │   └── stop.sh
│   └── update_systemd_config.py
├── crawl_history.py
├── crawl_online.py
├── crawl_test.py
├── db_backup.py
├── db_sql.py
├── deploy.md
├── newspaper
│   ├── api.py
│   ├── config.py
│   ├── crawler
│   │   ├── main.py
│   │   ├── sources.py
│   │   └── spiders.py
│   ├── loggers.py
│   ├── models.py
│   ├── server.py
│   ├── static
│   │   └── favicon.ico
│   ├── templates
│   │   ├── articles.html
│   │   └── daily_python.html
│   ├── utils.py
│   └── views.py
└── run_server.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # local test python script
107 | temp.py
108 | .vscode
109 | logs/
110 | *.sqlite
111 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 pyld
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = false
4 | name = "pypi"
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | uvloop = {version = "*",sys_platform = "!= 'win32'"}
10 | torequests = ">=5.0.10"
11 | starlette = "*"
12 | uvicorn = ">=0.11.8"
13 | aiomysql = "*"
14 | lxml = "*"
15 | cssselect = "*"
16 | async-lru = "*"
17 | aiofiles = "*"
18 | jinja2 = "*"
19 |
20 | [requires]
21 | python_version = "3.7"
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # python-newspaper
2 |
3 | ## The server will be shut down on 2021-04-01 and not renewed. A sqlite backup (taken 2021-03-07) is available in Releases. Subscriptions may later continue through another project, Watchdogs.
4 |
5 | ### Timeline
6 | - [For English-only readers](https://www.clericpy.top/newspaper/articles.query.html?lang=EN)
7 | - [For Chinese readers](https://www.clericpy.top/newspaper/articles.query.html?lang=CN)
8 | - [For readers of both](https://www.clericpy.top/newspaper/articles.query.html)
9 |
10 | ### RSS daily digest
11 | - [For English-only readers](https://www.clericpy.top/newspaper/daily.python.list.rss.en)
12 | - [For Chinese readers](https://www.clericpy.top/newspaper/daily.python.list.rss.cn)
13 | - [For readers of both](https://www.clericpy.top/newspaper/daily.python.list.rss.any)
14 |
15 | ### Current progress:
16 |
17 | - [x] Buy a server
18 | - [x] Prepare a backup server
19 | - [x] Confirm content sources
20 | - [x] Set up the server
21 | - [x] Development
22 | - [x] Testing
23 | - [x] Launch
24 | - [x] Add more content sources
25 | - [ ] Static Python daily page on GitHub Pages + RSS
26 | - [ ] Hand-curated daily/weekly digest pushed to the WeChat official account
27 | - [ ] Filtering by subscription source
28 |
29 | ### Content sources
30 |
31 | **What earns a source a high score**:
32 |
33 | 1. Mostly original content
34 | 2. Relatively low posting frequency
35 | 3. High article quality
36 | 4. Information-dense, e.g. weeklies
37 | 5. Widely known
38 |
39 |
40 |
41 | * 收录进度: 37 / 37
42 |
43 | > = 待收录 | √ 已收录 | X 不收录 | - 入库不追更
44 |
45 | | 序号 | 名称 | 评分 | 语言 | 收录 | 描述 |
46 | | ---- | ---- | ---- | ---- | ---- | ---- |
47 | | 1 | [Python Software Foundation News](https://pyfound.blogspot.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Software+Foundation+News) | [墙] 来自 Python 软件基金会的消息 |
48 | | 2 | [Python Weekly](https://www.pythonweekly.com/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Weekly) | 必备周报 |
49 | | 3 | [PyCoder's Weekly](https://pycoders.com/issues) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=PyCoder%27s+Weekly) | 必备周报 |
50 | | 4 | [Import Python](https://importpython.com/newsletter/archive/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Import+Python) | 必备周报, 2019.1.11 停更了, 希望早日康复~ |
51 | | 5 | [Awesome Python Newsletter](https://python.libhunt.com/newsletter/archive) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Awesome+Python+Newsletter) | 必备周报 |
52 | | 6 | [Real Python](https://realpython.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Real+Python) | 文章质量高, 更新较少 |
53 | | 7 | [Planet Python](https://planetpython.org) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Planet+Python) | 官方推荐的博客, 收录了很多博主 |
54 | | 8 | [Julien Danjou](https://julien.danjou.info) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Julien+Danjou) | 文章质量不错, 保持更新 |
55 | | 9 | [Doug Hellmann](https://doughellmann.com/blog/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Doug+Hellmann) | 大名鼎鼎, 文章质量很高 |
56 | | 10 | [The Mouse Vs. The Python](https://www.blog.pythonlibrary.org) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Mouse+Vs.+The+Python) | 文章质量不错 |
57 | | 11 | [InfoQ](https://www.infoq.cn/topic/python) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=InfoQ) | 原创/译文的质量不错 |
58 | | 12 | [Jeff Knupp](https://jeffknupp.com/) | 4 | EN | X | [墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录 |
59 | | 13 | [Hacker News](https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Hacker+News) | 大名鼎鼎的 HN |
60 | | 14 | [Python Insider](https://blog.python.org/) | 3 | EN | X | 官方开发进度, 被官博和 planetPython 包含, 所以不需要收录. |
61 | | 15 | [Brett Cannon](https://snarky.ca/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Brett+Cannon) | 核心开发者 |
62 | | 16 | [Encode](https://www.encode.io/) | 3 | EN | X | 知名 Python 开源组织, 文章太少, 暂不收录 |
63 | | 17 | [机器之心](https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%9C%BA%E5%99%A8%E4%B9%8B%E5%BF%83) | 知名公众号 |
64 | | 18 | [依云's Blog](https://blog.lilydjwg.me/tag/python?page=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E4%BE%9D%E4%BA%91%27s+Blog) | 文章质量很高 |
65 | | 19 | [DEV Community](https://dev.to/t/python/latest) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=DEV+Community) | 算是个挺好的社区, post 也都不太水 |
66 | | 20 | [Python猫](https://zhuanlan.zhihu.com/pythonCat) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%8C%AB) | 2018 年末比较热情的博主, 原创 + 优质译文 |
67 | | 21 | [Python之美](https://zhuanlan.zhihu.com/python-cn) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%BE%8E) | 早期文章较多, 创业以后更新不太多了 |
68 | | 22 | [静觅](https://cuiqingcai.com/category/technique/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E9%9D%99%E8%A7%85) | 崔庆才的个人博客, 保持更新的原创博主 |
69 | | 23 | [推酷(中文)](https://www.tuicool.com/topics/11130000?st=0&lang=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E4%B8%AD%E6%96%87%29) | 推文类站点. 按热门排序 |
70 | | 24 | [推酷(英文)](https://www.tuicool.com/topics/11130000?st=0&lang=2) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E8%8B%B1%E6%96%87%29) | 推文类站点. 按热门排序 |
71 | | 25 | [开发者头条](https://toutiao.io/tags/python?type=latest) | 3 | CN | X | 推文类站点, 但是没有发布时间, 暂不收录 |
72 | | 26 | [稀土掘金](https://juejin.im/tag/Python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%A8%80%E5%9C%9F%E6%8E%98%E9%87%91) | 推文类站点. 按热门排序 |
73 | | 27 | [Python部落](https://python.freelycode.com/contribution/list/0?page_no=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E9%83%A8%E8%90%BD) | 推文+译文 |
74 | | 28 | [miguelgrinberg](https://blog.miguelgrinberg.com/index) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=miguelgrinberg) | Web 开发相关的内容挺多, 质量较高 |
75 | | 29 | [Ned Batchelder](https://nedbatchelder.com/blog/tag/python.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Ned+Batchelder) | 热门博主。planetpython 也有 |
76 | | 30 | [Full Stack Python](https://www.fullstackpython.com/blog.html) | 3 | EN | X | 热门博主。planetpython 有了, 文章比较少, 暂不收录 |
77 | | 31 | [Eli Bendersky's website](https://eli.thegreenplace.net/tag/python) | 3 | EN | X | 值得一看,planetpython 有, 暂不收录 |
78 | | 32 | [Manjusaka](https://manjusaka.itscoder.com/tags/Python/) | 3 | CN | X | 原创还不错, 但是文章较少, 暂不收录 |
79 | | 33 | [Python程序员](https://zhuanlan.zhihu.com/pythoncxy) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%A8%8B%E5%BA%8F%E5%91%98) | 关注破万的知乎专栏 |
80 | | 34 | [Python头条](https://zhuanlan.zhihu.com/c_111369541) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E5%A4%B4%E6%9D%A1) | 关注破万的知乎专栏 |
81 | | 35 | [the5fire的技术博客](https://www.the5fire.com/category/python/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=the5fire%E7%9A%84%E6%8A%80%E6%9C%AF%E5%8D%9A%E5%AE%A2) | 保持更新的热门中文博主 |
82 | | 36 | [Python之禅](https://foofish.net/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%A6%85) | 文章较基础, 质量不错 |
83 | | 37 | [V2EX](https://www.v2ex.com/go/python) | 3 | CN | X | 社区类, api 失效, web 端乱七八糟的, 不收录 |
84 | | 38 | [伯乐在线](http://python.jobbole.com/all-posts/) | 3 | CN | X | 有点类似推酷, 质量参差不齐. HTTP ERROR 503 |
85 | | 39 | [Python 3 Module of the Week](https://pymotw.com/3/) | 3 | EN | X | 看起来不怎么更新了, 暂不收录 |
86 | | 40 | [The Invent with Python Blog](https://inventwithpython.com/blog/index.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Invent+with+Python+Blog) | 感觉不错 |
87 | | 41 | [Armin Ronacher's Thoughts and Writings](http://lucumr.pocoo.org/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Armin+Ronacher%27s+Thoughts+and+Writings) | Flask 作者 Armin Ronacher |
88 | | 42 | [aio-libs](https://groups.google.com/forum/#!forum/aio-libs) | 3 | EN | X | 知名 Python 开源组织, 不过没有文章类的 post |
89 | | 43 | [码农周刊](https://weekly.manong.io/issues/) | 3 | CN | X | 课外读物, 非 Python 主题, 暂不收录 |
90 | | 44 | [编程派](http://codingpy.com/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%BC%96%E7%A8%8B%E6%B4%BE) | 原创+译文 |
91 | | 45 | [峰云就她了](http://xiaorui.cc/archives/category/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E5%B3%B0%E4%BA%91%E5%B0%B1%E5%A5%B9%E4%BA%86) | 原创博客, 质量比较不错 |
92 | | 46 | [Dan Bader](https://dbader.org/blog/) | 3 | EN | X | 一年不更新了, 先不收录了 |
93 | | 47 | [Pythonic Perambulations](https://jakevdp.github.io/) | 3 | EN | X | 最后更新 Thu 13 September 2018, 暂不收录 |
94 | | 48 | [开源中国翻译](https://www.oschina.net/translate/tag/python) | 3 | CN | X | 入库留着吧, 估计不更了, 暂不收录 |
95 | | 49 | [Trey Hunner](https://treyhunner.com/blog/archives/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Trey+Hunner) | Help developers level-up their Python skills |
96 | | 50 | [Python Central](https://www.pythoncentral.io/) | 3 | EN | X | 不更新了, 暂不收录 |
97 | | 51 | [Inside the Head of PyDanny](https://www.pydanny.com/) | 3 | EN | X | 不更新了, 暂不收录 |
98 | | 52 | [华蟒用户组,CPyUG](https://groups.google.com/forum/#!forum/python-cn) | 3 | EN | X | [墙] 社区类, 自己看看就好, 暂不收录 |
99 | | 53 | [Treehl](https://family-treesy.github.io/tags/PYTHON/) | 3 | CN | X | 文章较基础, 久不更新, 暂不收录 |
100 | | 54 | [蠎周刊](http://weekly.pychina.org) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E8%A0%8E%E5%91%A8%E5%88%8A) | 各种 weekly 中译版 |
101 | | 55 | [zzzeek](https://techspot.zzzeek.org/) | 3 | EN | X | 2016 年后停更了 |
102 | | 56 | [Yu’s blog](https://gofisher.github.io/) | 3 | CN | X | 原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了 |
103 | | 57 | [程序师](http://www.techug.com/tag/python) | 3 | CN | X | 原创较少, 文章较旧 |
104 | | 58 | [一根笨茄子](http://blog.guoyb.com/tags/Python/) | 3 | CN | X | 文章更新较少, 质量参差 |
105 | | 59 | [追梦人物](https://www.zmrenwu.com/) | 2 | CN | X | 像个学习博客 |
106 | | 60 | [anshengme](https://blog.ansheng.me/) | 2 | CN | X | 质量一般 |
107 | | 61 | [Pegasus](http://ningning.today/categories/python/) | 2 | CN | X | 不怎么更新 |
108 | | 62 | [FunHacks](https://funhacks.net/categories/Python/) | 2 | CN | X | 太久不更新了, 不过python 之旅还行 |
109 | | 63 | [Peter Norvig's essays](http://norvig.com/) | 2 | EN | X | 这排版驾驭不了... |
110 | | 64 | [Peterbe.com](https://www.peterbe.com/plog/) | 2 | EN | X | 不是太值得收录 |
111 | | 65 | [Python Tips](https://pythontips.com/) | 2 | EN | X | 很火, 但我不喜欢 |
112 | | 66 | [脚本之家](https://www.jb51.net/list/list_97_1.htm) | 2 | CN | X | 文章的质量啊~~~ |
113 | | 67 | [开源中国搜索](https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1) | 2 | CN | X | 质量不太高 |
114 | | 68 | [伯乐在线头条](http://top.jobbole.com/tag/python/?sort=latest) | 2 | CN | X | 停更 |
115 | | 69 | [代码片段](http://www.phpxs.com/code/python) | 2 | CN | X | 文章太老了, 停更了 |
116 | | 70 | [segmentfault](https://segmentfault.com/t/python/blogs) | 2 | CN | X | 文章质量 |
117 | | 71 | [Python China](http://python-china.org/api/topics/timeline) | 2 | CN | X | 欠费网站挂了 |
118 | | 72 | [麦穗技术](http://www.58maisui.com/category/python/) | 2 | CN | X | 网站挂了 |
119 | | 73 | [CSDN](https://so.csdn.net/so/search/s.do?q=python&t=blog&u=) | 1 | CN | X | 文章质量啊~~~ |
120 | | 74 | [Stack Overflow](https://stackoverflow.com/?tab=hot) | 3 | EN | X | 已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录 |
121 | | 75 | [Reddit](https://www.reddit.com/r/Python/top/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Reddit) | 知名社区. 质量参差, 收录每日 ups>=20 |
122 | | 76 | [码天狗](https://weekly.codetengu.com/issues) | 4 | CN | X | 综合类周报, 2018-11-23 之后不更了. 挂了, 下线. |
123 | | 77 | [Medium](https://medium.com/tag/python) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Medium) | Medium 的 Python tag, 收录 RSS |
124 |
125 |
126 |
127 |
128 | ### Disclaimer
129 |
130 | 1. Non-profit project; the main motivation is to put Python to practical use and to give people a channel for learning Python
131 | 2. If any content infringes your rights, please leave a note in Issues and it will be taken down
132 | 3. PRs are welcome, as are Issues recommending high-quality content sources
133 | 4. For now this is a Chinese-language project; with limited time, English-speaking readers are not a priority yet
134 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_spider.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper spider service
3 |
4 | [Service]
5 | Type=simple
6 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py
7 | WorkingDirectory=/root/newspaper
8 |
9 | [Install]
10 | WantedBy=multi-user.target
11 | WantedBy=network-online.target
12 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_spider.timer:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper spider timer
3 |
4 | [Timer]
5 | OnBootSec=10min
6 | OnUnitActiveSec=15min
7 | Unit=newspaper_spider.service
8 |
9 | [Install]
10 | WantedBy=multi-user.target
11 | WantedBy=network-online.target
12 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_web.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper web service
3 |
4 | [Service]
5 | Type=simple
6 | ExecStart=/usr/local/bin/pipenv run python run_server.py
7 | WorkingDirectory=/root/newspaper
8 | [Install]
9 | WantedBy=multi-user.target
10 | WantedBy=network-online.target
11 |
--------------------------------------------------------------------------------
/bin/git-sync.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | git fetch; git reset --hard origin/master
3 |
--------------------------------------------------------------------------------
/bin/obsoleted/restart.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$(cd `dirname $0`; pwd)
3 | cd $DIR
4 | sh stop.sh
5 | sh start.sh > /dev/null
6 |
--------------------------------------------------------------------------------
/bin/obsoleted/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$(cd `dirname $0`/..; pwd)
3 | cd $DIR
4 | nohup pipenv run python run_server.py &
5 | echo "server started"
6 | echo
7 |
--------------------------------------------------------------------------------
/bin/obsoleted/stop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ps aux|grep 'newspaper-'|grep 'run_server.py'|awk '{print $2}'|xargs kill
3 | echo "server stopped"
4 | echo
5 |
--------------------------------------------------------------------------------
/bin/update_systemd_config.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | user_systemd_dir = pathlib.Path.home() / '.config/systemd/user'
3 | if not user_systemd_dir.is_dir():
4 |     user_systemd_dir.mkdir(parents=True)
5 |
6 | newspaper_product_dir = pathlib.Path(
7 | __file__).absolute().parent.parent.absolute()
8 |
9 | # web service unit
10 |
11 | newspaper_web_service = fr'''
12 | [Unit]
13 | Description=newspaper web service
14 |
15 | [Service]
16 | Type=simple
17 | ExecStart=/usr/local/bin/pipenv run python run_server.py
18 | WorkingDirectory={newspaper_product_dir}
19 | [Install]
20 | WantedBy=multi-user.target
21 | WantedBy=network-online.target
22 | '''
23 | newspaper_web_service_fp = user_systemd_dir / 'newspaper_web.service'
24 | newspaper_web_service_fp.write_text(newspaper_web_service, encoding='utf-8')
25 |
26 | # spider service unit
27 |
28 | newspaper_spider_service = fr'''
29 | [Unit]
30 | Description=newspaper spider service
31 |
32 | [Service]
33 | Type=simple
34 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py
35 | WorkingDirectory={newspaper_product_dir}
36 |
37 | [Install]
38 | WantedBy=multi-user.target
39 | WantedBy=network-online.target
40 | '''
41 | newspaper_spider_service_fp = user_systemd_dir / 'newspaper_spider.service'
42 | newspaper_spider_service_fp.write_text(newspaper_spider_service,
43 | encoding='utf-8')
44 |
45 | # spider timer unit
46 |
47 | newspaper_spider_timer = r'''
48 | [Unit]
49 | Description=newspaper spider timer
50 |
51 | [Timer]
52 | OnBootSec=10min
53 | OnUnitActiveSec=15min
54 | Unit=newspaper_spider.service
55 |
56 | [Install]
57 | WantedBy=multi-user.target
58 | WantedBy=network-online.target
59 | '''
60 | newspaper_spider_timer_fp = user_systemd_dir / 'newspaper_spider.timer'
61 | newspaper_spider_timer_fp.write_text(newspaper_spider_timer, encoding='utf-8')
62 |
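63 | # The unit files written above still have to be enabled and started by hand;
64 | # deploy.md lists the commands, roughly:
65 | #
66 | #   systemctl --user enable newspaper_web.service; systemctl --user start newspaper_web.service
67 | #   systemctl --user enable newspaper_spider.timer; systemctl --user start newspaper_spider.timer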
--------------------------------------------------------------------------------
/crawl_history.py:
--------------------------------------------------------------------------------
1 | from newspaper.crawler.main import history_workflow
2 | import asyncio
3 | """
4 | Script for crawling historical articles
5 | 1. Meant to be run locally
6 | 2. Runs the history-crawling tasks and stores the articles in the database
7 | 3. Any content-source function decorated with register_history is picked up and called automatically
8 | 4. A history crawl is usually only needed when a source is first added; after that online_spiders keeps it up to date
9 | """
10 |
11 | if __name__ == "__main__":
12 | loop = asyncio.get_event_loop()
13 | loop.run_until_complete(history_workflow())
14 |
--------------------------------------------------------------------------------
/crawl_online.py:
--------------------------------------------------------------------------------
1 | def main():
2 | """
3 |     Online crawling script
4 |     1. Can run either locally or on the server; intended as a crontab task
5 |     2. Runs the regular crawling tasks and stores the articles in the database
6 |     3. Any content-source function decorated with register_online is picked up and called automatically
7 | """
8 | from newspaper.crawler.main import online_workflow
9 | import asyncio
10 | loop = asyncio.get_event_loop()
11 | loop.run_until_complete(online_workflow())
12 |
13 |
14 | if __name__ == "__main__":
15 | main()
16 |
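17 | # A rough sketch of what a registered spider looks like (real spiders live in
18 | # newspaper/crawler/spiders.py; `example_weekly` below is a made-up name, and the
19 | # dict keys follow the schema checked by main.test_spider_workflow):
20 | #
21 | #   @register_online
22 | #   async def example_weekly():
23 | #       """Example Weekly"""  # __doc__ is used as the source name in the logs
24 | #       url = 'https://example.com/issues/1'
25 | #       return [{
26 | #           'source': 'Example Weekly',
27 | #           'title': 'Issue 1',
28 | #           'url': url,
29 | #           'url_key': get_url_key(url),
30 | #           'desc': '',
31 | #       }]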
--------------------------------------------------------------------------------
/crawl_test.py:
--------------------------------------------------------------------------------
1 | def test():
2 | from newspaper.crawler.main import test_spider_workflow
3 | import asyncio
4 | loop = asyncio.get_event_loop()
5 | loop.run_until_complete(test_spider_workflow())
6 |
7 |
8 | if __name__ == "__main__":
9 | test()
10 |
--------------------------------------------------------------------------------
/db_backup.py:
--------------------------------------------------------------------------------
1 | #! pipenv run python
2 | """
3 | Pull data from the online API into a local sqlite backup
4 | """
5 | import re
6 |
7 | from torequests import tPool
8 | from torequests.utils import ttime, time
9 |
10 | from newspaper.models import Sqlite3Storage, logger
11 | from newspaper.config import ONLINE_HOST
12 |
13 |
14 | def fetch_articles(ts_start):
15 | req = tPool()
16 | api = f'https://{ONLINE_HOST}/newspaper/articles.query.json'
17 | next_url = ''
18 | start_params = {
19 | 'query': '',
20 | 'start_time': ts_start,
21 | 'end_time': '',
22 | 'source': '',
23 | 'lang': 'ANY',
24 | 'order_by': 'ts_update',
25 | 'sorting': 'asc',
26 | 'limit': '100',
27 | 'offset': '0',
28 | }
29 |
30 | while 1:
31 | params = {} if next_url else start_params
32 |         # request the first page when there is no next_url, otherwise follow next_url
33 | url = next_url or api
34 | r = req.get(url, params=params, retry=2, timeout=10)
35 | if not r.x:
36 |             logger.error(f'request failed: {r.x}, {r.text}')
37 | raise IOError
38 | rj = r.json()
39 | articles = rj.get('articles', [])
40 | if articles:
41 | yield articles
42 | next_url = rj.get('next_url', '')
43 | if not (articles and next_url):
44 |             # stop when there are no articles or no next page
45 |             logger.info(f'fetch_articles finished, last url: {url}')
46 | return
47 | next_url = re.sub('^/', f'https://{ONLINE_HOST}/', next_url)
48 |
49 |
50 | def get_ts_latest(cursor):
51 | cursor.execute('select max(ts_update) from articles')
52 | result = cursor.fetchone()[0]
53 | if result:
54 | return result
55 | else:
56 | return ttime(0)
57 |
58 |
59 | def main():
60 | db = Sqlite3Storage(file_path='backup.sqlite')
61 | db._ensure_article_table_exists()
62 | ts_latest = get_ts_latest(db.cursor)
63 | logger.info(f'sync articles from online api: ts_latest={ts_latest}')
64 | article_cnt = 0
65 |     for articles in fetch_articles(ts_latest):
66 | db.add_articles(articles)
67 | article_cnt += len(articles)
68 | logger.info(f'+ {len(articles)} articles => {article_cnt}')
69 | logger.info(f'+ {article_cnt} new articles.')
70 |
71 |
72 | if __name__ == "__main__":
73 | main()
74 | time.sleep(3)
75 |
--------------------------------------------------------------------------------
/db_sql.py:
--------------------------------------------------------------------------------
1 | #! pipenv run python
2 | """
3 | Run SQL against both the online (MySQL) and the local (sqlite) databases at the same time
4 | """
5 | import asyncio
6 | import traceback
7 | import logging
8 |
9 | from newspaper.config import init_db
10 | from newspaper.models import Sqlite3Storage, logger
11 |
12 |
13 | async def main():
14 | db = Sqlite3Storage(file_path='backup.sqlite')
15 | db._ensure_article_table_exists()
16 | mysql = init_db()
17 | logger.setLevel(logging.WARNING)
18 | while 1:
19 | # select count(*) from articles
20 | # select count(*) from articles where `desc` like '%本文分享 方巍%'
21 | sql = input('Input SQL:\n')
22 | if not sql:
23 | break
24 | try:
25 | print(sql)
26 | db.cursor.execute(sql)
27 | logger.warning(f'Sqlite3Storage: {db.cursor.fetchall()}')
28 | result = await mysql.execute(sql)
29 | logger.warning(f'MysqlStorage: {result}')
30 | except KeyboardInterrupt:
31 | break
32 | except Exception:
33 | traceback.print_exc()
34 |
35 |
36 | if __name__ == "__main__":
37 | asyncio.run(main())
38 |
--------------------------------------------------------------------------------
/deploy.md:
--------------------------------------------------------------------------------
1 |
2 | ## First deployment
3 | 0. install python 3.7+
4 | 1. git clone ...
5 | 2. pipenv install
6 | 3. python3.7 update_systemd_config.py
7 | 4. Create a JSON config file at /var/newspaper.conf (see the sketch at the end of this file)
8 |     1. {"anti_gfw": {"url": "fill in the URL of your firewall-bypass relay here; use http://localhost if you have none"}, "mysql_config": {"mysql_host": "", "mysql_port": 3306, "mysql_user": "", "mysql_password": "", "mysql_db": "db"}}
9 |     2. Alternatively, export the same JSON in the `newspaper_config` environment variable
10 | 5. systemctl --user enable newspaper_web.service; systemctl --user start newspaper_web.service
11 | 6. systemctl --user enable newspaper_spider.timer; systemctl --user start newspaper_spider.timer
12 | 7. Bind a domain name and configure nginx to front the relevant port, with SSL support
13 |
14 |
15 |
16 |
17 | ### vscode task: deploy-update script
18 | ```git co master ; git merge dev; git push; git co dev;ssh aliyun 'cd newspaper/bin;sh git-sync.sh;python3.7 update_systemd_config.py;systemctl daemon-reload;systemctl --user restart newspaper_web.service'```
19 |
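20 | ### Config sketch
21 | 
22 | A minimal sketch (not part of the repo's own scripts) for creating the /var/newspaper.conf described in step 4; every value below is a placeholder to replace with your own:
23 | 
24 | ```python
25 | import json
26 | import pathlib
27 | 
28 | # Placeholder values -- replace with real credentials for your environment.
29 | config = {
30 |     "anti_gfw": {"url": "http://localhost"},
31 |     "mysql_config": {
32 |         "mysql_host": "127.0.0.1",
33 |         "mysql_port": 3306,
34 |         "mysql_user": "root",
35 |         "mysql_password": "",
36 |         "mysql_db": "db",
37 |     },
38 | }
39 | # newspaper/config.py reads the `newspaper_config` env var first, then this file.
40 | pathlib.Path('/var/newspaper.conf').write_text(json.dumps(config))
41 | ```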
--------------------------------------------------------------------------------
/newspaper/api.py:
--------------------------------------------------------------------------------
1 | #! python3
2 |
3 | import pathlib
4 |
5 | from starlette.applications import Starlette
6 | from starlette.staticfiles import StaticFiles
7 | from starlette.templating import Jinja2Templates
8 |
9 | from .config import init_db, global_configs
10 | from .loggers import logger
11 |
12 | static_dir = pathlib.Path(__file__).absolute().parent / 'static'
13 | templates_dir = pathlib.Path(__file__).absolute().parent / 'templates'
14 |
15 | app = Starlette()
16 | app.mount('/static', StaticFiles(directory=str(static_dir)), name='static')
17 | app.config = global_configs
18 | app.logger = logger
19 | app.db = init_db()
20 | app.templates = Jinja2Templates(directory=str(templates_dir))
21 |
22 |
23 | @app.on_event('startup')
24 | async def _ensure_article_table_exists():
25 | await app.db._ensure_article_table_exists()
26 |
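27 | 
28 | # run_server.py is not shown in this section; a minimal sketch of a launcher,
29 | # assuming it simply hands `app` to uvicorn on the address that
30 | # newspaper/crawler/main.py's clear_cache() expects (127.0.0.1:9001), could be:
31 | #
32 | #   import uvicorn
33 | #   from newspaper.api import app
34 | #   uvicorn.run(app, host='127.0.0.1', port=9001)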
--------------------------------------------------------------------------------
/newspaper/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import pathlib
5 |
6 |
7 | def init_config():
8 |     conf_path = pathlib.Path('/var/newspaper.conf')
9 |     global_configs = os.getenv('newspaper_config', None) or (
10 |         conf_path.read_text() if conf_path.is_file() else None)
11 | if global_configs:
12 | global_configs = json.loads(global_configs)
13 | else:
14 | newspaper_config_template = '{"anti_gfw": {"url": "xxx"}, "mysql_config": {"mysql_host": "xxx", "mysql_port": 0, "mysql_user": "xxx", "mysql_password": "xxx", "mysql_db": "xxx"}}'
15 |         logging.error(
16 |             f'`newspaper_config` env variable not set and /var/newspaper.conf not found, config should be JSON like: {newspaper_config_template}'
17 |         )
18 |         raise RuntimeError('`newspaper_config` not set and /var/newspaper.conf missing')
19 | return global_configs
20 |
21 |
22 | def init_db():
23 | from .models import MySQLStorage
24 | db = MySQLStorage(global_configs['mysql_config'])
25 | return db
26 |
27 |
28 | global_configs = init_config()
29 | ONLINE_HOST = 'www.clericpy.top'
30 | GA_ID = 'UA-150991415-2'
31 | BEIAN_ID = '鲁ICP备19021778号-1'
32 |
--------------------------------------------------------------------------------
/newspaper/crawler/main.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from torequests.dummy import Requests
4 |
5 | from ..config import init_db
6 | from ..loggers import spider_logger
7 | from .spiders import history_spiders, online_spiders
8 |
9 | db = init_db()
10 |
11 |
12 | async def test_spider_workflow():
13 | from .spiders import test_spiders
14 | from ..models import Storage
15 | from pprint import pprint
16 |
17 | for func in test_spiders:
18 | print('test start:', func.__doc__)
19 | articles = await func()
20 | articles = Storage.ensure_articles(articles)
21 | # check schema
22 | for item in articles:
23 | assert (not item.get('desc')) or isinstance(item['desc'], str)
24 | assert (not item.get('ts_publish')) or isinstance(
25 | item['ts_publish'], str)
26 | assert (not item.get('cover')) or isinstance(item['cover'], str)
27 | assert isinstance(item.get('source'), str)
28 | assert isinstance(item.get('title'), str)
29 | assert isinstance(item.get('url'), str)
30 | pprint(articles)
31 |
32 |
33 | async def clear_cache():
34 | url = 'http://127.0.0.1:9001/newspaper/articles.cache.clear'
35 | req = Requests()
36 | r = await req.get(url, timeout=2)
37 | spider_logger.info(f'clear_cache {r.text}')
38 |
39 |
40 | async def online_workflow():
41 | if not online_spiders:
42 | spider_logger.info('no online_spiders online.')
43 | return
44 |     # make sure the articles table exists, create it if missing
45 | await db._ensure_article_table_exists()
46 |     # build a mapping from function name to source name
47 | function_sources = {func.__name__: func.__doc__ for func in online_spiders}
48 | coros = [func() for func in online_spiders]
49 | done, fail = await asyncio.wait(coros, timeout=120)
50 | spider_logger.info(f'{"=" * 30}')
51 | if fail:
52 | fail_names = [
53 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}'
54 | for idx, task in enumerate(fail, 1)
55 | ]
56 |         spider_logger.warning(
57 | f'timeout spiders({len(fail)}): {fail_names}')
58 | pool = await db.get_pool()
59 | async with pool.acquire() as conn:
60 | async with conn.cursor() as cursor:
61 | for idx, task in enumerate(done, 1):
62 | articles = task.result()
63 | func_name = task._coro.__name__
64 | source_name = function_sources.get(func_name, func_name)
65 | if articles:
66 | insert_result = await db.add_articles(articles,
67 | cursor=cursor)
68 | else:
69 | insert_result = 0
70 | spider_logger.info(
71 | f'{idx: 3}. {"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]'
72 | )
73 | await clear_cache()
74 |
75 |
76 | async def history_workflow():
77 | if not history_spiders:
78 |         spider_logger.info('skipped: no history_spiders registered.')
79 | return
80 |     # make sure the articles table exists, create it if missing
81 | await db._ensure_article_table_exists()
82 |     # build a mapping from function name to source name
83 | function_sources = {func.__name__: func.__doc__ for func in history_spiders}
84 | coros = [func() for func in history_spiders]
85 | done, fail = await asyncio.wait(coros, timeout=9999)
86 | spider_logger.info(f'{"=" * 30}')
87 | if fail:
88 | fail_names = [
89 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}'
90 | for idx, task in enumerate(fail, 1)
91 | ]
92 |         spider_logger.warning(
93 | f'timeout spiders({len(fail)}): {fail_names}')
94 | pool = await db.get_pool()
95 | async with pool.acquire() as conn:
96 | async with conn.cursor() as cursor:
97 | for idx, task in enumerate(done, 1):
98 | articles = task.result()
99 | func_name = task._coro.__name__
100 | source_name = function_sources.get(func_name, func_name)
101 | if articles:
102 | insert_result = await db.add_articles(articles,
103 | cursor=cursor)
104 | else:
105 | insert_result = 0
106 | spider_logger.info(
107 | f'{idx: 3}. {"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]'
108 | )
109 | await clear_cache()
110 |
--------------------------------------------------------------------------------
/newspaper/crawler/sources.py:
--------------------------------------------------------------------------------
1 | from torequests.utils import quote_plus
2 | import sys
3 | import pathlib
4 | sys.path.append(str(pathlib.Path(__file__).absolute().parent.parent))
5 | from config import ONLINE_HOST
6 |
7 | content_sources = [
8 | {
9 | "title": "Python Software Foundation News",
10 | "url": "https://pyfound.blogspot.com/",
11 | "level": 4,
12 | "lang": "EN",
13 | "status": "√",
14 | "desc": "[墙] 来自 Python 软件基金会的消息"
15 | },
16 | {
17 | "title": "Python Weekly",
18 | "url": "https://www.pythonweekly.com/",
19 | "level": 5,
20 | "lang": "EN",
21 | "status": "√",
22 | "desc": "必备周报"
23 | },
24 | {
25 | "title": "PyCoder's Weekly",
26 | "url": "https://pycoders.com/issues",
27 | "level": 5,
28 | "lang": "EN",
29 | "status": "√",
30 | "desc": "必备周报"
31 | },
32 | {
33 | "title": "Import Python",
34 | "url": "https://importpython.com/newsletter/archive/",
35 | "level": 5,
36 | "lang": "EN",
37 | "status": "√",
38 | "desc": "必备周报, 2019.1.11 停更了, 希望早日康复~"
39 | },
40 | {
41 | "title": "Awesome Python Newsletter",
42 | "url": "https://python.libhunt.com/newsletter/archive",
43 | "level": 5,
44 | "lang": "EN",
45 | "status": "√",
46 | "desc": "必备周报"
47 | },
48 | {
49 | "title": "Real Python",
50 | "url": "https://realpython.com/",
51 | "level": 4,
52 | "lang": "EN",
53 | "status": "√",
54 | "desc": "文章质量高, 更新较少"
55 | },
56 | {
57 | "title": "Planet Python",
58 | "url": "https://planetpython.org",
59 | "level": 3,
60 | "lang": "EN",
61 | "status": "√",
62 | "desc": "官方推荐的博客, 收录了很多博主"
63 | },
64 | {
65 | "title": "Julien Danjou",
66 | "url": "https://julien.danjou.info",
67 | "level": 4,
68 | "lang": "EN",
69 | "status": "√",
70 | "desc": "文章质量不错, 保持更新"
71 | },
72 | {
73 | "title": "Doug Hellmann",
74 | "url": "https://doughellmann.com/blog/",
75 | "level": 4,
76 | "lang": "EN",
77 | "status": "√",
78 | "desc": "大名鼎鼎, 文章质量很高"
79 | },
80 | {
81 | "title": "The Mouse Vs. The Python",
82 | "url": "https://www.blog.pythonlibrary.org",
83 | "level": 4,
84 | "lang": "EN",
85 | "status": "√",
86 | "desc": "文章质量不错"
87 | },
88 | {
89 | "title": "InfoQ",
90 | "url": "https://www.infoq.cn/topic/python",
91 | "level": 4,
92 | "lang": "CN",
93 | "status": "√",
94 | "desc": "原创/译文的质量不错"
95 | },
96 | {
97 | "title": "Jeff Knupp",
98 | "url": "https://jeffknupp.com/",
99 | "level": 4,
100 | "lang": "EN",
101 | "status": "X",
102 | "desc": "[墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录"
103 | },
104 | {
105 | "title": "Hacker News",
106 | "url": "https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story",
107 | "level": 4,
108 | "lang": "EN",
109 | "status": "√",
110 | "desc": "大名鼎鼎的 HN"
111 | },
112 | {
113 | "title": "Python Insider",
114 | "url": "https://blog.python.org/",
115 | "level": 3,
116 | "lang": "EN",
117 | "status": "X",
118 | "desc": "官方开发进度, 被官博和 planetPython 包含, 所以不需要收录."
119 | },
120 | {
121 | "title": "Brett Cannon",
122 | "url": "https://snarky.ca/",
123 | "level": 3,
124 | "lang": "EN",
125 | "status": "√",
126 | "desc": "核心开发者"
127 | },
128 | {
129 | "title": "Encode",
130 | "url": "https://www.encode.io/",
131 | "level": 3,
132 | "lang": "EN",
133 | "status": "X",
134 | "desc": "知名 Python 开源组织, 文章太少, 暂不收录"
135 | },
136 | {
137 | "title": "机器之心",
138 | "url": "https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time",
139 | "level": 3,
140 | "lang": "CN",
141 | "status": "√",
142 | "desc": "知名公众号"
143 | },
144 | {
145 | "title": "依云's Blog",
146 | "url": "https://blog.lilydjwg.me/tag/python?page=1",
147 | "level": 3,
148 | "lang": "CN",
149 | "status": "√",
150 | "desc": "文章质量很高"
151 | },
152 | {
153 | "title": "DEV Community",
154 | "url": "https://dev.to/t/python/latest",
155 | "level": 3,
156 | "lang": "EN",
157 | "status": "√",
158 | "desc": "算是个挺好的社区, post 也都不太水"
159 | },
160 | {
161 | "title": "Python猫",
162 | "url": "https://zhuanlan.zhihu.com/pythonCat",
163 | "level": 3,
164 | "lang": "CN",
165 | "status": "√",
166 | "desc": "2018 年末比较热情的博主, 原创 + 优质译文"
167 | },
168 | {
169 | "title": "Python之美",
170 | "url": "https://zhuanlan.zhihu.com/python-cn",
171 | "level": 3,
172 | "lang": "CN",
173 | "status": "√",
174 | "desc": "早期文章较多, 创业以后更新不太多了"
175 | },
176 | {
177 | "title": "静觅",
178 | "url": "https://cuiqingcai.com/category/technique/python",
179 | "level": 3,
180 | "lang": "CN",
181 | "status": "√",
182 | "desc": " 崔庆才的个人博客, 保持更新的原创博主"
183 | },
184 | {
185 | "title": "推酷(中文)",
186 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=1",
187 | "level": 3,
188 | "lang": "CN",
189 | "status": "√",
190 | "desc": "推文类站点. 按热门排序"
191 | },
192 | {
193 | "title": "推酷(英文)",
194 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=2",
195 | "level": 3,
196 | "lang": "EN",
197 | "status": "√",
198 | "desc": "推文类站点. 按热门排序"
199 | },
200 | {
201 | "title": "开发者头条",
202 | "url": "https://toutiao.io/tags/python?type=latest",
203 | "level": 3,
204 | "lang": "CN",
205 | "status": "X",
206 | "desc": "推文类站点, 但是没有发布时间, 暂不收录"
207 | },
208 | {
209 | "title": "稀土掘金",
210 | "url": "https://juejin.im/tag/Python",
211 | "level": 3,
212 | "lang": "CN",
213 | "status": "√",
214 | "desc": "推文类站点. 按热门排序"
215 | },
216 | {
217 | "title": "Python部落",
218 | "url": "https://python.freelycode.com/contribution/list/0?page_no=1",
219 | "level": 3,
220 | "lang": "CN",
221 | "status": "√",
222 | "desc": "推文+译文"
223 | },
224 | {
225 | "title": "miguelgrinberg",
226 | "url": "https://blog.miguelgrinberg.com/index",
227 | "level": 3,
228 | "lang": "EN",
229 | "status": "√",
230 | "desc": "Web 开发相关的内容挺多, 质量较高"
231 | },
232 | {
233 | "title": "Ned Batchelder",
234 | "url": "https://nedbatchelder.com/blog/tag/python.html",
235 | "level": 3,
236 | "lang": "EN",
237 | "status": "√",
238 | "desc": "热门博主。planetpython 也有"
239 | },
240 | {
241 | "title": "Full Stack Python",
242 | "url": "https://www.fullstackpython.com/blog.html",
243 | "level": 3,
244 | "lang": "EN",
245 | "status": "X",
246 | "desc": "热门博主。planetpython 有了, 文章比较少, 暂不收录"
247 | },
248 | {
249 | "title": "Eli Bendersky's website",
250 | "url": "https://eli.thegreenplace.net/tag/python",
251 | "level": 3,
252 | "lang": "EN",
253 | "status": "X",
254 | "desc": "值得一看,planetpython 有, 暂不收录"
255 | },
256 | {
257 | "title": "Manjusaka",
258 | "url": "https://manjusaka.itscoder.com/tags/Python/",
259 | "level": 3,
260 | "lang": "CN",
261 | "status": "X",
262 | "desc": "原创还不错, 但是文章较少, 暂不收录"
263 | },
264 | {
265 | "title": "Python程序员",
266 | "url": "https://zhuanlan.zhihu.com/pythoncxy",
267 | "level": 3,
268 | "lang": "CN",
269 | "status": "√",
270 | "desc": "关注破万的知乎专栏"
271 | },
272 | {
273 | "title": "Python头条",
274 | "url": "https://zhuanlan.zhihu.com/c_111369541",
275 | "level": 3,
276 | "lang": "CN",
277 | "status": "√",
278 | "desc": "关注破万的知乎专栏"
279 | },
280 | {
281 | "title": "the5fire的技术博客",
282 | "url": "https://www.the5fire.com/category/python/",
283 | "level": 3,
284 | "lang": "CN",
285 | "status": "√",
286 | "desc": "保持更新的热门中文博主"
287 | },
288 | {
289 | "title": "Python之禅",
290 | "url": "https://foofish.net/",
291 | "level": 3,
292 | "lang": "CN",
293 | "status": "√",
294 | "desc": "文章较基础, 质量不错"
295 | },
296 | {
297 | "title": "V2EX",
298 | "url": "https://www.v2ex.com/go/python",
299 | "level": 3,
300 | "lang": "CN",
301 | "status": "X",
302 | "desc": "社区类, api 失效, web 端乱七八糟的, 不收录"
303 | },
304 | {
305 | "title": "伯乐在线",
306 | "url": "http://python.jobbole.com/all-posts/",
307 | "level": 3,
308 | "lang": "CN",
309 | "status": "X",
310 | "desc": "有点类似推酷, 质量参差不齐. HTTP ERROR 503"
311 | },
312 | {
313 | "title": "Python 3 Module of the Week",
314 | "url": "https://pymotw.com/3/",
315 | "level": 3,
316 | "lang": "EN",
317 | "status": "X",
318 | "desc": "看起来不怎么更新了, 暂不收录"
319 | },
320 | {
321 | "title": "The Invent with Python Blog",
322 | "url": "https://inventwithpython.com/blog/index.html",
323 | "level": 3,
324 | "lang": "EN",
325 | "status": "√",
326 | "desc": "感觉不错"
327 | },
328 | {
329 | "title": "Armin Ronacher's Thoughts and Writings",
330 | "url": "http://lucumr.pocoo.org/",
331 | "level": 3,
332 | "lang": "EN",
333 | "status": "√",
334 | "desc": "Flask 作者 Armin Ronacher"
335 | },
336 | {
337 | "title": "aio-libs",
338 | "url": "https://groups.google.com/forum/#!forum/aio-libs",
339 | "level": 3,
340 | "lang": "EN",
341 | "status": "X",
342 | "desc": "知名 Python 开源组织, 不过没有文章类的 post"
343 | },
344 | {
345 | "title": "码农周刊",
346 | "url": "https://weekly.manong.io/issues/",
347 | "level": 3,
348 | "lang": "CN",
349 | "status": "X",
350 | "desc": "课外读物, 非 Python 主题, 暂不收录"
351 | },
352 | {
353 | "title": "编程派",
354 | "url": "http://codingpy.com/",
355 | "level": 3,
356 | "lang": "CN",
357 | "status": "√",
358 | "desc": "原创+译文"
359 | },
360 | {
361 | "title": "峰云就她了",
362 | "url": "http://xiaorui.cc/archives/category/python",
363 | "level": 3,
364 | "lang": "CN",
365 | "status": "√",
366 | "desc": "原创博客, 质量比较不错"
367 | },
368 | {
369 | "title": "Dan Bader",
370 | "url": "https://dbader.org/blog/",
371 | "level": 3,
372 | "lang": "EN",
373 | "status": "X",
374 | "desc": "一年不更新了, 先不收录了"
375 | },
376 | {
377 | "title": "Pythonic Perambulations",
378 | "url": "https://jakevdp.github.io/",
379 | "level": 3,
380 | "lang": "EN",
381 | "status": "X",
382 | "desc": "最后更新 Thu 13 September 2018, 暂不收录"
383 | },
384 | {
385 | "title": "开源中国翻译",
386 | "url": "https://www.oschina.net/translate/tag/python",
387 | "level": 3,
388 | "lang": "CN",
389 | "status": "X",
390 | "desc": "入库留着吧, 估计不更了, 暂不收录"
391 | },
392 | {
393 | "title": "Trey Hunner",
394 | "url": "https://treyhunner.com/blog/archives/",
395 | "level": 3,
396 | "lang": "EN",
397 | "status": "√",
398 | "desc": "Help developers level-up their Python skills"
399 | },
400 | {
401 | "title": "Python Central",
402 | "url": "https://www.pythoncentral.io/",
403 | "level": 3,
404 | "lang": "EN",
405 | "status": "X",
406 | "desc": "不更新了, 暂不收录"
407 | },
408 | {
409 | "title": "Inside the Head of PyDanny",
410 | "url": "https://www.pydanny.com/",
411 | "level": 3,
412 | "lang": "EN",
413 | "status": "X",
414 | "desc": "不更新了, 暂不收录"
415 | },
416 | {
417 | "title": "华蟒用户组,CPyUG",
418 | "url": "https://groups.google.com/forum/#!forum/python-cn",
419 | "level": 3,
420 | "lang": "EN",
421 | "status": "X",
422 | "desc": "[墙] 社区类, 自己看看就好, 暂不收录"
423 | },
424 | {
425 | "title": "Treehl",
426 | "url": "https://family-treesy.github.io/tags/PYTHON/",
427 | "level": 3,
428 | "lang": "CN",
429 | "status": "X",
430 | "desc": "文章较基础, 久不更新, 暂不收录"
431 | },
432 | {
433 | "title": "蠎周刊",
434 | "url": "http://weekly.pychina.org",
435 | "level": 4,
436 | "lang": "CN",
437 | "status": "√",
438 | "desc": "各种 weekly 中译版"
439 | },
440 | {
441 | "title": "zzzeek",
442 | "url": "https://techspot.zzzeek.org/",
443 | "level": 3,
444 | "lang": "EN",
445 | "status": "X",
446 | "desc": "2016 年后停更了"
447 | },
448 | {
449 | "title": "Yu’s blog",
450 | "url": "https://gofisher.github.io/",
451 | "level": 3,
452 | "lang": "CN",
453 | "status": "X",
454 | "desc": "原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了"
455 | },
456 | {
457 | "title": "程序师",
458 | "url": "http://www.techug.com/tag/python",
459 | "level": 3,
460 | "lang": "CN",
461 | "status": "X",
462 | "desc": "原创较少, 文章较旧"
463 | },
464 | {
465 | "title": "一根笨茄子",
466 | "url": "http://blog.guoyb.com/tags/Python/",
467 | "level": 3,
468 | "lang": "CN",
469 | "status": "X",
470 | "desc": "文章更新较少, 质量参差"
471 | },
472 | {
473 | "title": "追梦人物",
474 | "url": "https://www.zmrenwu.com/",
475 | "level": 2,
476 | "lang": "CN",
477 | "status": "X",
478 | "desc": "像个学习博客"
479 | },
480 | {
481 | "title": "anshengme",
482 | "url": "https://blog.ansheng.me/",
483 | "level": 2,
484 | "lang": "CN",
485 | "status": "X",
486 | "desc": "质量一般"
487 | },
488 | {
489 | "title": "Pegasus",
490 | "url": "http://ningning.today/categories/python/",
491 | "level": 2,
492 | "lang": "CN",
493 | "status": "X",
494 | "desc": "不怎么更新"
495 | },
496 | {
497 | "title": "FunHacks",
498 | "url": "https://funhacks.net/categories/Python/",
499 | "level": 2,
500 | "lang": "CN",
501 | "status": "X",
502 | "desc": "太久不更新了, 不过python 之旅还行"
503 | },
504 | {
505 | "title": "Peter Norvig's essays",
506 | "url": "http://norvig.com/",
507 | "level": 2,
508 | "lang": "EN",
509 | "status": "X",
510 | "desc": "这排版驾驭不了..."
511 | },
512 | {
513 | "title": "Peterbe.com",
514 | "url": "https://www.peterbe.com/plog/",
515 | "level": 2,
516 | "lang": "EN",
517 | "status": "X",
518 | "desc": "不是太值得收录"
519 | },
520 | {
521 | "title": "Python Tips",
522 | "url": "https://pythontips.com/",
523 | "level": 2,
524 | "lang": "EN",
525 | "status": "X",
526 | "desc": "很火, 但我不喜欢"
527 | },
528 | {
529 | "title": "脚本之家",
530 | "url": "https://www.jb51.net/list/list_97_1.htm",
531 | "level": 2,
532 | "lang": "CN",
533 | "status": "X",
534 | "desc": "文章的质量啊~~~"
535 | },
536 | {
537 | "title": "开源中国搜索",
538 | "url": "https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1",
539 | "level": 2,
540 | "lang": "CN",
541 | "status": "X",
542 | "desc": "质量不太高"
543 | },
544 | {
545 | "title": "伯乐在线头条",
546 | "url": "http://top.jobbole.com/tag/python/?sort=latest",
547 | "level": 2,
548 | "lang": "CN",
549 | "status": "X",
550 | "desc": "停更"
551 | },
552 | {
553 | "title": "代码片段",
554 | "url": "http://www.phpxs.com/code/python",
555 | "level": 2,
556 | "lang": "CN",
557 | "status": "X",
558 | "desc": "文章太老了, 停更了"
559 | },
560 | {
561 | "title": "segmentfault",
562 | "url": "https://segmentfault.com/t/python/blogs",
563 | "level": 2,
564 | "lang": "CN",
565 | "status": "X",
566 | "desc": "文章质量"
567 | },
568 | {
569 | "title": "Python China",
570 | "url": "http://python-china.org/api/topics/timeline",
571 | "level": 2,
572 | "lang": "CN",
573 | "status": "X",
574 | "desc": "欠费网站挂了"
575 | },
576 | {
577 | "title": "麦穗技术",
578 | "url": "http://www.58maisui.com/category/python/",
579 | "level": 2,
580 | "lang": "CN",
581 | "status": "X",
582 | "desc": "网站挂了"
583 | },
584 | {
585 | "title": "CSDN",
586 | "url": "https://so.csdn.net/so/search/s.do?q=python&t=blog&u=",
587 | "level": 1,
588 | "lang": "CN",
589 | "status": "X",
590 | "desc": "文章质量啊~~~"
591 | },
592 | {
593 | "title": "Stack Overflow",
594 | "url": "https://stackoverflow.com/?tab=hot",
595 | "level": 3,
596 | "lang": "EN",
597 | "status": "X",
598 | "desc": "已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录"
599 | },
600 | {
601 | "title": "Reddit",
602 | "url": "https://www.reddit.com/r/Python/top/",
603 | "level": 3,
604 | "lang": "EN",
605 | "status": "√",
606 | "desc": "知名社区. 质量参差, 收录每日 ups>=20"
607 | },
608 | {
609 | "title": "码天狗",
610 | "url": "https://weekly.codetengu.com/issues",
611 | "level": 4,
612 | "lang": "CN",
613 | "status": "X",
614 | "desc": "综合类周报, 2018-11-23 之后不更了. 挂了, 下线."
615 | },
616 | {
617 | "title": "Medium",
618 | "url": "https://medium.com/tag/python",
619 | "level": 3,
620 | "lang": "EN",
621 | "status": "√",
622 | "desc": "Medium 的 Python tag, 收录 RSS"
623 | },
624 | ]
625 |
626 | content_sources_dict = {i['title']: i for i in content_sources}
627 |
628 |
629 | def main():
630 | import pathlib
631 | import re
632 |     # =: to be included, √: included, X: not included, -: archived without follow-up
633 |
634 | titles = [i['title'] for i in content_sources]
635 |     # make sure there are no duplicate titles
636 | if len(titles) != len(set(titles)):
637 |         raise RuntimeError('duplicate titles are not allowed')
638 | if '|' in str(content_sources):
639 |         raise RuntimeError('avoid using | in source fields, it breaks the markdown table')
640 |
641 | providers = ''
642 | providers += '| 序号 | 名称 | 评分 | 语言 | 收录 | 描述 |\n'
643 | providers += '| ---- | ---- | ---- | ---- | ---- | ---- |\n'
644 | todo_counts = 0
645 | finish_counts = 0
646 | for x, item in enumerate(content_sources, 1):
647 | data = [str(x)]
648 | title_link = f'[{item["title"]}]({item["url"]})'
649 | data.append(title_link)
650 | data.append(str(item['level']))
651 | data.append(item['lang'])
652 | status = item['status']
653 | if item['status'] == '√':
654 | finish_counts += 1
655 | status = f'[√](https://{ONLINE_HOST}/newspaper/articles.query.html?source={quote_plus(item["title"])})'
656 | elif item['status'] == '=':
657 | todo_counts += 1
658 | data.append(status)
659 | data.append(item['desc'])
660 | string = ' | '.join(data)
661 | providers += '| ' + string + ' |\n'
662 | proc = f'* 收录进度: {finish_counts} / {finish_counts + todo_counts}\n\n\t> = 待收录 | √ 已收录 | X 不收录 | - 入库不追更\n\n'
663 | README_FP = pathlib.Path(
664 | __file__).absolute().parent.parent.parent / 'README.md'
665 | with README_FP.open('r', encoding='u8') as f:
666 | old = f.read()
667 | new = re.sub(
668 | '[\s\S]*?',
669 | f'\n\n{proc}{providers}\n\n',
670 | old)
671 | print(new)
672 | with README_FP.open('w', encoding='u8') as f:
673 | f.write(new)
674 |
675 |
676 | if __name__ == "__main__":
677 | main()
678 |
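679 | # Running this module directly regenerates the source table in the repository
680 | # README.md: main() rewrites that file in place based on content_sources above.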
--------------------------------------------------------------------------------
/newspaper/crawler/spiders.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import traceback
4 | import typing
5 | import zlib
6 |
7 | from lxml.etree import ElementBase, XMLParser
8 | from lxml.html import fromstring, tostring
9 | from torequests.dummy import Requests
10 | from torequests.utils import (curlparse, escape, find_one, md5, parse_qsl,
11 | ptime, re, time, timeago, ttime, unparse_qsl,
12 | urlparse, urlunparse)
13 |
14 | from ..config import global_configs
15 | from ..loggers import spider_logger as logger
16 | from ..utils import ensure_cn_en
17 |
18 | START_TIME = time.time()
19 | test_spiders = []
20 | online_spiders = []
21 | history_spiders = []
22 | CHROME_PC_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
23 | friendly_crawling_interval = 1
24 | outlands_req = Requests()
25 | # default_host_frequency is the default per-host rate limit: 1 request every 3 seconds
26 | req = Requests(default_host_frequency=(1, 3))
27 | # friendly crawl frequency for hosts that receive many requests
28 | # req.set_frequency('zhuanlan.zhihu.com', 1, 3)
29 | req.set_frequency('www.tuicool.com', 1, 3)
30 | # free public proxy
31 | proxy = 'http://218.60.8.99:3129'
32 |
33 |
34 | class null_tree:
35 | text = ''
36 |
37 | @classmethod
38 | def text_content(cls):
39 | return ''
40 |
41 | def get(self, key, default=''):
42 | return default
43 |
44 | @classmethod
45 | def css(cls, item, csspath, idx=0):
46 | return (item.cssselect(csspath) or [cls])[idx]
47 |
48 | @classmethod
49 | def tostring(cls, doc, **kwargs):
50 | if isinstance(doc, ElementBase):
51 | return tostring(doc, **kwargs)
52 | else:
53 | return ''
54 |
55 |
56 | def sort_url_query(url, reverse=False, _replace_kwargs=None):
57 | """sort url query args.
58 | _replace_kwargs is a dict to update attributes before sorting (such as scheme / netloc...).
59 | http://www.google.com?b=2&z=26&a=1 => http://www.google.com?a=1&b=2&z=26
60 | """
61 | parsed = urlparse(url)
62 | if _replace_kwargs:
63 | parsed = parsed._replace(**_replace_kwargs)
64 | sorted_parsed = parsed._replace(
65 | query=unparse_qsl(parse_qsl(parsed.query), sort=True, reverse=reverse))
66 | return urlunparse(sorted_parsed)
67 |
68 |
69 | def get_url_key(url) -> str:
70 |     """Compute a key from the url: take its md5 while normalizing away useless query-arg differences.
71 |     Maybe switch to a purely numeric key later...
72 | import hashlib
73 | a = hashlib.md5(b'url')
74 | b = a.hexdigest()
75 | as_int = int(b, 16)
76 | url_key = str(as_int)[5:][:20]
77 | print(url_key)
78 | """
79 | if url:
80 | key = md5(sort_url_query(url, _replace_kwargs={'scheme': 'https'}))
81 | return key
82 | return ""
83 |
84 |
85 | def add_host(url: str, host: str) -> str:
86 | if not url:
87 | return ''
88 | if re.match('^https?://', url):
89 | return url
90 | if url.startswith('//'):
91 | return f'https:{url}'
92 | if not host.endswith('/'):
93 | host = f'{host}/'
94 | return re.sub('^/', host, url)
95 |
96 |
97 | def shorten_desc(desc: str) -> str:
98 | """Shorten the desc too long (more than 50)."""
99 | if not desc:
100 | return ''
101 | # remain sentence before ./\n/。/!
102 | desc = re.sub(r'(.{50,})(\n|\.|。|!|!|?|\?)\s?[\s\S]+', r'\1\2', desc)
103 | # remove html tag
104 | desc = re.sub('<[^>]+>', '', desc).strip()
105 | return escape(desc)
106 |
107 |
108 | async def outlands_request(request_dict: dict = None,
109 | encoding: str = 'u8',
110 | **request_args) -> str:
111 |     """The relay's bandwidth is tiny and it is not open-sourced, so it cannot be shared as a general firewall-bypass proxy.
112 |
113 |     Example:
114 | async def test():
115 | text = await outlands_request({
116 | 'method': 'get',
117 | 'url': 'https://pyfound.blogspot.com/'
118 | }, 'u8')
119 | print(text)
120 | return text
121 | """
122 | request_dict = request_dict or {}
123 | request_dict.update(request_args)
124 | request_dict.setdefault('method', 'get')
125 | request_dict.setdefault('ssl', False)
126 | request_dict.setdefault('headers', {})
127 | request_dict['headers'].setdefault('User-Agent', CHROME_PC_UA)
128 | json_data = json.dumps(request_dict)
129 | data = zlib.compress(json_data.encode('u8'))
130 | url = global_configs['anti_gfw']['url']
131 | r = await outlands_req.post(url, timeout=60, data=data)
132 | if r:
133 | return zlib.decompress(r.content).decode(encoding)
134 | else:
135 | return r.text
136 |
137 |
138 | def register_test(function: typing.Callable) -> typing.Callable:
139 |     """Register the spider into the test list.
140 | 
141 |     :param function: spider function, usually takes no arguments.
142 |     :type function: typing.Callable
143 |     :return: the same spider function, unchanged.
144 | :rtype: typing.Callable
145 | """
146 |
147 | test_spiders.append(function)
148 | return function
149 |
150 |
151 | def register_online(function: typing.Callable) -> typing.Callable:
152 |     """Register the spider into the list of production (online) spiders.
153 | 
154 |     :param function: spider function, usually takes no arguments.
155 |     :type function: typing.Callable
156 |     :return: the same spider function, unchanged.
157 | :rtype: typing.Callable
158 | """
159 |
160 | online_spiders.append(function)
161 | return function
162 |
163 |
164 | def register_history(function: typing.Callable) -> typing.Callable:
165 |     """Register the spider into the list of history-crawling spiders.
166 | 
167 |     :param function: spider function, usually takes no arguments.
168 |     :type function: typing.Callable
169 |     :return: the same spider function, unchanged.
170 | :rtype: typing.Callable
171 | """
172 |
173 | history_spiders.append(function)
174 | return function
175 |
176 |
177 | async def common_spider_zhihu_zhuanlan(name, source, limit=10):
178 | articles = []
179 | offset: int = 0
180 |     # page size for the paginated API
181 |     chunk_size: int = 50
182 |     # at most 2000 articles, anything more is pointless
183 | for _ in range(2000 // chunk_size):
184 | _limit = min((limit - offset, chunk_size))
185 | # or limit == offset
186 | if not _limit:
187 | break
188 | api: str = f'https://zhuanlan.zhihu.com/api/columns/{name}/articles?limit={_limit}&offset={offset}'
189 | r = await req.get(
190 | api,
191 | ssl=False,
192 | headers={
193 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
194 | })
195 | if not r:
196 | logger.info(
197 | f'crawl zhihu_zhuanlan {name} limit={limit} failed: {r}')
198 | return articles
199 | items = r.json()['data']
200 | if not items:
201 | break
202 | for item in items:
203 | if not (item['type'] == 'article' and item['state'] == 'published'):
204 | continue
205 | article: dict = {'source': source}
206 | article['ts_publish'] = ttime(item['created'])
207 | article['cover'] = item['image_url']
208 | article['title'] = item['title']
209 | article['desc'] = re.sub('<[^>]+>', ' ', item.get('excerpt') or '')
210 | article['url'] = item['url']
211 | article['url_key'] = get_url_key(article['url'])
212 | articles.append(article)
213 | offset += _limit
214 |
215 | return articles
216 |
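# Usage sketch, mirroring the zhihu_zhuanlan_* spiders further down:
#   articles = await common_spider_zhihu_zhuanlan('python-cn', 'Python之美', limit=10)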
217 |
218 | async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None):
219 | articles = []
220 | langs = {'CN': 1, 'EN': 2}
221 | lang_num = langs[lang]
222 | host = 'https://www.tuicool.com/'
223 | this_year = ttime()[:4]
224 | ignore_descs = ignore_descs or set()
225 | # Anonymous users can only crawl the first two pages; crawling more needs a valid `_tuicool_session` cookie.
226 | headers = {
227 | 'Connection': 'keep-alive',
228 | 'Upgrade-Insecure-Requests': '1',
229 | 'If-None-Match': 'W/"41a6894d66c0f07fcfac6ec1d84446a3"',
230 | 'Dnt': '1',
231 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
232 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
233 | 'Referer': 'https://www.tuicool.com/',
234 | 'Host': 'www.tuicool.com',
235 | 'Accept-Encoding': 'gzip, deflate, br',
236 | 'Accept-Language': 'zh-CN,zh;q=0.9',
237 | 'Cookie': '_tuicool_session=',
238 | }
239 | proxy = None
240 | for page in range(0, max_page):
241 | # st param: 0 = chronological order, 1 = hot articles
242 | api: str = f'https://www.tuicool.com/ah/0?st=1&lang={lang_num}&pn={page}'
243 | r = await req.get(api,
244 | ssl=False,
245 | proxy=proxy,
246 | retry=1,
247 | timeout=5,
248 | headers=headers)
249 | # print(r.text)
250 | if not r:
251 | logger.info(f'crawl tuicool {lang} page={page} failed: {r}')
252 | return articles
253 | items = fromstring(
254 | r.text).cssselect('#list_article>div.list_article_item')
255 | if max_page > 1:
256 | logger.info(
257 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
258 | )
259 | if not items:
260 | break
261 | for item in items:
262 | article: dict = {'source': source}
263 | url = null_tree.css(item,
264 | '.aricle_item_info>.title>a').get('href', '')
265 | url = add_host(url, host)
266 | title = null_tree.css(item, '.aricle_item_info>.title>a').text
267 | cover = null_tree.css(item,
268 | '.article_thumb_image>img').get('src', '')
269 | cover = cover.replace(
270 | 'https://static0.tuicool.com/images/abs_img_no_small.jpg', '')
271 | time_span = null_tree.css(item,
272 | '.aricle_item_info>.tip').text_content()
273 | raw_time = find_one(r'\d\d-\d\d \d\d:\d\d', time_span)[0]
274 | if raw_time:
275 | # guard against a weird/malformed time value
276 | article['ts_publish'] = ttime(
277 | ptime(f'{this_year}-{raw_time}:00'))
278 | desc = null_tree.css(
279 | item,
280 | '.aricle_item_info>div.tip>span:nth-of-type(1)').text.strip()
281 | if not re.search('Python|python', f'{title}{desc}'):
282 | continue
283 | if desc in ignore_descs:
284 | continue
285 | article['cover'] = cover
286 | article['title'] = title
287 | article['desc'] = desc
288 | article['url'] = url
289 | article['url_key'] = get_url_key(article['url'])
290 | articles.append(article)
291 | return articles
292 |
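# Usage sketch, mirroring the tuicool_cn spider further down:
#   articles = await common_spider_tuicool('CN', '推酷(中文)', max_page=1,
#                                           ignore_descs={'稀土掘金', 'Python猫', 'InfoQ'})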
293 |
294 | async def common_spider_juejin(user, source, max_page=1):
295 | articles = []
296 | host = 'https://juejin.im/'
297 | now = ttime(fmt="%Y-%m-%dT%H:%M:%S.000Z")
298 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_entry_by_self'
299 | params: dict = {
300 | 'src': 'web',
301 | 'targetUid': user,
302 | 'type': 'post',
303 | 'before': now,
304 | 'limit': 20,
305 | 'order': 'createdAt'
306 | }
307 | for page in range(max_page):
308 | try:
309 | params['before'] = now
310 | r = await req.get(api,
311 | ssl=False,
312 | params=params,
313 | retry=1,
314 | timeout=5,
315 | headers={"User-Agent": CHROME_PC_UA})
316 | if not r:
317 | logger.info(f'crawl juejin page={page} failed: {r}')
318 | return articles
319 | items = r.json()['d']['entrylist']
320 | if max_page > 1:
321 | logger.info(
322 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
323 | )
324 | if not items:
325 | break
326 | for item in items:
327 | article: dict = {'source': source}
328 | url = item['originalUrl']
329 | url = add_host(url, host)
330 | title = item['title']
331 | cover = item.get('screenshot') or ''
332 | now = item['createdAt']
333 | if now:
334 | ts_publish = re.sub(r'\..*', '', now)
335 | article['ts_publish'] = ts_publish.replace('T', ' ')
336 | desc = item.get('summaryInfo') or ''
337 | article['cover'] = cover
338 | article['title'] = title
339 | article['desc'] = desc
340 | article['url'] = url
341 | article['url_key'] = get_url_key(article['url'])
342 | articles.append(article)
343 | if not now:
344 | break
345 | except Exception:
346 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
347 | return articles
348 |
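# Usage sketch, mirroring the pythoncat spider further down:
#   articles = await common_spider_juejin('57b26118a341310060fa74da', 'Python猫', max_page=1)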
349 |
350 | @register_online
351 | async def python_news() -> list:
352 | """Python Software Foundation News"""
353 | source: str = 'Python Software Foundation News'
354 | articles: list = []
355 | seed = 'https://pyfound.blogspot.com/search?max-results=10'
356 | scode = await outlands_request({
357 | 'method': 'get',
358 | 'url': seed,
359 | }, 'u8')
360 | if scode:
361 | tree = fromstring(scode)
362 | for item in tree.cssselect('.blog-posts>.date-outer'):
363 | try:
364 | article: dict = {'source': source}
365 | raw_pub_time = item.cssselect('.published')[0].get('title', '')
366 | ts_publish = ttime(
367 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
368 | article['ts_publish'] = ts_publish
369 | article['title'] = item.cssselect(
370 | '.post-title.entry-title>a')[0].text
371 | # fall back gracefully when there is no desc
372 | node = item.cssselect('.post-body.entry-content') or [null_tree]
373 | desc = node[0].text_content()
374 | article['desc'] = desc.split('\n\n\n',
375 | 1)[0].strip().replace('\n', ' ')
376 | article['url'] = item.cssselect(
377 | '.post-title.entry-title>a')[0].get('href', '')
378 | article['url_key'] = get_url_key(article['url'])
379 | articles.append(article)
380 | except Exception:
381 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
382 | logger.info(
383 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
384 | )
385 | return articles
386 |
387 |
388 | # @register_history
389 | async def python_news_history() -> list:
390 | """Python Software Foundation News"""
391 | source: str = 'Python Software Foundation News'
392 | articles: list = []
393 | current_year = int(time.strftime('%Y'))
394 | for year in range(2006, current_year + 1):
395 | seed = f'https://pyfound.blogspot.com/{year}/'
396 | scode = await outlands_request({
397 | 'method': 'get',
398 | 'url': seed,
399 | }, 'u8')
400 | await asyncio.sleep(3)
401 | if not scode:
402 | continue
403 | tree = fromstring(scode)
404 | for item in tree.cssselect('.blog-posts>.date-outer'):
405 | try:
406 | article: dict = {'source': source}
407 | raw_pub_time = item.cssselect('.published')[0].get('title', '')
408 | ts_publish = ttime(
409 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
410 | article['ts_publish'] = ts_publish
411 | article['title'] = item.cssselect(
412 | '.post-title.entry-title>a')[0].text
413 | # fall back gracefully when there is no desc
414 | node = item.cssselect('.post-body.entry-content') or [null_tree]
415 | desc = node[0].text_content()
416 | article['desc'] = desc.split('\n\n\n',
417 | 1)[0].strip().replace('\n', ' ')
418 | article['url'] = item.cssselect(
419 | '.post-title.entry-title>a')[0].get('href', '')
420 | article['url_key'] = get_url_key(article['url'])
421 | articles.append(article)
422 | except Exception:
423 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
424 | logger.info(
425 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
426 | )
427 | return articles
428 |
429 |
430 | def _python_weekly_calculate_date(issue_id):
431 | diff = 396 - int(issue_id)
432 | return ttime(1557331200 - diff * 86400 * 7)
433 |
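# How the fallback date works: issue 396 is pinned to timestamp 1557331200
# (2019-05-09, see the sample title below) and every issue is assumed to be exactly
# one week apart, e.g. _python_weekly_calculate_date(395) gives the date one week earlier.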
434 |
435 | @register_online
436 | # @register_history
437 | # @register_test
438 | async def python_weekly() -> list:
439 | """Python Weekly"""
440 | source: str = 'Python Weekly'
441 | articles: list = []
442 | # Updated once a week, so only the first item is needed
443 | limit = 1
444 | seed = 'https://us2.campaign-archive.com/home/?u=e2e180baf855ac797ef407fc7&id=9e26887fc5'
445 | scode = await outlands_request({
446 | 'method': 'get',
447 | 'url': seed,
448 | }, 'u8')
449 | box = find_one(
450 | r'(?:
)(
)(?:)',
451 | scode)[1]
452 | items = re.findall(r'()', box)
453 | for item in items[:limit]:
454 | try:
455 | article: dict = {'source': source}
456 | # take ts_publish and issue_id from the list page; collect everything else from the detail page
457 | # 05/09/2019 - Python Weekly - Issue 396
458 | title = find_one('title="(.*?)"', item)[1]
459 | issue_id = find_one(r' - Issue (\d+)', title)[1]
460 | pub_dates = find_one(r'class="campaign">(\d\d)/(\d\d)/(\d\d\d\d)',
461 | item)[1]
462 | if not issue_id:
463 | continue
464 | if len(pub_dates) == 3:
465 | ts_publish = f'{pub_dates[2]}-{pub_dates[0]}-{pub_dates[1]} 00:00:00'
466 | else:
467 | ts_publish = _python_weekly_calculate_date(issue_id)
468 | article['ts_publish'] = ts_publish
469 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}'
470 | r = await req.get(
471 | detail_url,
472 | ssl=False,
473 | headers={
474 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
475 | })
476 | if not r:
477 | logger.error(f'fetch {detail_url} failed: {r}')
478 | continue
479 | scode = r.text
480 | title = find_one('<title>(.*?)</title>', r.text)[1]
481 | title = title.strip('Â ')
482 | translate_url = find_one(
483 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1]
484 | backup_url = dict(
485 | parse_qsl(translate_url))['u'] if translate_url else ''
486 | backup_url_desc = f'<a href="{backup_url}">View this email in your browser</a><br>' if backup_url else ''
487 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a')
488 | all_links = [
489 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」"
490 | for i in nodes
491 | ]
492 | all_links_desc = '<br>'.join(all_links)
493 | article['title'] = title
494 | article['desc'] = f'{backup_url_desc}{all_links_desc}'
495 | article['url'] = detail_url
496 | article['url_key'] = get_url_key(article['url'])
497 | articles.append(article)
498 | except Exception:
499 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
500 | break
501 | logger.info(
502 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
503 | )
504 | return articles
505 |
506 |
507 | # @register_history
508 | async def python_weekly_history() -> list:
509 | """Python Weekly"""
510 | source: str = 'Python Weekly'
511 | articles: list = []
512 | for issue_id in range(324, 1000):
513 | try:
514 | article: dict = {'source': source}
515 | article['ts_publish'] = _python_weekly_calculate_date(issue_id)
516 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}'
517 | r = await req.get(
518 | detail_url,
519 | ssl=False,
520 | headers={
521 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
522 | })
523 | if '404: Page Not Found' in r.text:
524 | logger.warn('python_weekly_history break for 404 page')
525 | break
526 | if not r:
527 | logger.error(f'python_weekly_history break for {r}')
528 | break
529 | scode = r.text
530 | title = find_one('<title>(.*?)</title>', r.text)[1]
531 | title = title.strip('Â ')
532 | translate_url = find_one(
533 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1]
534 | backup_url = dict(
535 | parse_qsl(translate_url))['u'] if translate_url else ''
536 | backup_url_desc = f'<a href="{backup_url}">View this email in your browser</a><br>' if backup_url else ''
537 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a')
538 | all_links = [
539 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」"
540 | for i in nodes
541 | ]
542 | all_links_desc = '<br>'.join(all_links)
543 | article['title'] = title
544 | article['desc'] = f'{backup_url_desc}{all_links_desc}'
545 | article['url'] = detail_url
546 | article['url_key'] = get_url_key(article['url'])
547 | articles.append(article)
548 | except Exception:
549 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
550 | break
551 | logger.info(
552 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
553 | )
554 | return articles
555 |
556 |
557 | @register_online
558 | async def pycoder_weekly() -> list:
559 | """PyCoder's Weekly"""
560 | # Change limit to 999 to crawl the full history
561 | source: str = "PyCoder's Weekly"
562 | articles: list = []
563 | # Updated once a week, so only the first item is needed
564 | limit = 1
565 | seed = 'https://pycoders.com/issues'
566 | base_url = find_one('^https?://[^/]+', seed)[0]
567 | r = await req.get(seed, headers={'User-Agent': CHROME_PC_UA})
568 | if not r:
569 | logger.error(f'{source} crawl failed: {r}, {r.text}')
570 | return articles
571 | items = re.findall(r'Issue #\d+ .*?', r.text)
572 | for item in items[:limit]:
573 | try:
574 | article: dict = {'source': source}
575 | # Issue #368 (May 14, 2019)
576 | title = find_one('>(Issue.*?)<', item)[1]
577 | article['title'] = f"PyCoder's Weekly | {title}"
578 | month, day, year = re.findall(r'\((.*?) (\d+), (\d+)\)',
579 | article['title'])[0]
580 | month = month[:3]
581 | raw_time = f'{year}-{month}-{day}'
582 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d'))
583 | article['ts_publish'] = ts_publish
584 | article['desc'] = ''
585 | url = find_one(r'href="(/issues/\d+)"', item)[1]
586 | article['url'] = base_url + url
587 | article['url_key'] = get_url_key(article['url'])
588 | articles.append(article)
589 | except Exception:
590 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
591 | break
592 | logger.info(
593 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
594 | )
595 | return articles
596 |
597 |
598 | @register_online
599 | # @register_test
600 | async def importpython() -> list:
601 | """Import Python"""
602 | source: str = 'Import Python'
603 | articles: list = []
604 | # Updated once a week, so only the first item is needed
605 | limit = 1
606 | seed = 'https://importpython.com/newsletter/archive/'
607 | r = await req.get(seed,
608 | timeout=15,
609 | ssl=False,
610 | headers={"User-Agent": CHROME_PC_UA})
611 | if not r:
612 | logger.error(f'{source} crawl failed: {r}, {r.text}')
613 | return articles
614 | items = fromstring(r.text).cssselect('#tourpackages-carousel>.row>div')
615 | for item in items[:limit]:
616 | try:
617 | article: dict = {'source': source}
618 | href = item.cssselect('div.caption>a')[0].get('href', '')
619 | if not href:
620 | continue
621 | url = add_host(href, 'https://importpython.com/')
622 | title = item.cssselect('div.caption>.well-add-card>h4')[0].text
623 | desc_node = item.cssselect('div.caption>div[class="col-lg-12"]')[0]
624 | desc = tostring(desc_node,
625 | method='html',
626 | with_tail=0,
627 | encoding='unicode')
628 | day, month, year = re.findall(r'- (\d+) (\S+) (\d+)', title)[0]
629 | month = month[:3]
630 | raw_time = f'{year}-{month}-{day}'
631 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d'))
632 | article['ts_publish'] = ts_publish
633 | clean_title = re.sub(' - .*', '', title)
634 | title = f"{source} - {clean_title}"
635 | article['title'] = title
636 | article['desc'] = desc.replace('\n ', ' ')
637 | article['url'] = url
638 | article['url_key'] = get_url_key(article['url'])
639 | articles.append(article)
640 | except Exception:
641 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
642 | break
643 | logger.info(
644 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
645 | )
646 | return articles
647 |
648 |
649 | @register_online
650 | # @register_test
651 | async def awesome_python() -> list:
652 | """Awesome Python Newsletter"""
653 | source: str = 'Awesome Python Newsletter'
654 | articles: list = []
655 | # Updated once a week, so only the first item is needed
656 | limit = 1
657 | seed = 'https://python.libhunt.com/newsletter/archive'
658 | scode = await outlands_request({
659 | 'method': 'get',
660 | 'url': seed,
661 | }, 'u8')
662 | hrefs = re.findall(
663 | r'\s*', scode)
664 | for href in hrefs[:limit]:
665 | try:
666 | article: dict = {'source': source}
667 | url = add_host(href, 'https://python.libhunt.com/')
668 | r = await req.get(url,
669 | retry=2,
670 | timeout=15,
671 | headers={"User-Agent": CHROME_PC_UA})
672 | if not r:
673 | logger.error(f'fetch {url} failed: {r}')
674 | break
675 | tree = fromstring(r.text)
676 | raw_title = tree.cssselect('title')[0].text
677 | title = re.sub(', .*', '', raw_title)
678 | raw_pub_date = find_one(r', (.*?) \|', raw_title)[1]
679 | # May 17, 2019
680 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y'))
681 | nodes = tree.cssselect(
682 | 'li[class="story row"]>div[class="column"]>a')
683 | descs = [
684 | tostring(i, method='html', with_tail=0, encoding='unicode')
685 | for i in nodes
686 | ]
687 | desc = ' '.join(descs)
688 | article['ts_publish'] = ts_publish
689 | article['title'] = title
690 | article['desc'] = desc
691 | article['url'] = url
692 | article['url_key'] = get_url_key(article['url'])
693 | articles.append(article)
694 | except Exception:
695 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
696 | break
697 | logger.info(
698 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
699 | )
700 | return articles
701 |
702 |
703 | @register_online
704 | async def real_python() -> list:
705 | """Real Python"""
706 | source: str = 'Real Python'
707 | articles: list = []
708 | limit = 20
709 | seed = 'https://realpython.com/'
710 | r = await req.get(seed,
711 | retry=1,
712 | timeout=20,
713 | headers={"User-Agent": CHROME_PC_UA})
714 | if not r:
715 | logger.error(f'{source} crawl failed: {r}, {r.text}')
716 | return articles
717 | items = fromstring(r.text).cssselect('div[class="card border-0"]')
718 | for item in items[:limit]:
719 | try:
720 | article: dict = {'source': source}
721 | href = item.cssselect('a')[0].get('href', '')
722 | url = add_host(href, 'https://realpython.com/')
723 | title = item.cssselect('h2.card-title')[0].text
724 | pub_date_node = item.cssselect('.mr-2') or [null_tree]
725 | raw_pub_date = pub_date_node[0].text
726 | # May 16, 2019
727 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y'))
728 | cover_item = item.cssselect('img.card-img-top')
729 | if cover_item:
730 | cover = cover_item[0].get('src', '')
731 | if cover:
732 | article['cover'] = cover
733 | article['ts_publish'] = ts_publish
734 | article['title'] = title
735 | article['desc'] = ''
736 | article['url'] = url
737 | article['url_key'] = get_url_key(article['url'])
738 | articles.append(article)
739 | except Exception:
740 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
741 | break
742 | logger.info(
743 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
744 | )
745 | return articles
746 |
747 |
748 | @register_online
749 | async def planet_python() -> list:
750 | """Planet Python"""
751 | source: str = 'Planet Python'
752 | articles: list = []
753 | limit = 100
754 | seed = 'https://planetpython.org/rss20.xml'
755 | # request through the external relay to avoid timeouts
756 | scode = await outlands_request({
757 | 'method': 'get',
758 | 'url': seed,
759 | }, 'u8')
760 | items = fromstring(scode).xpath('//channel/item')
761 | now = ttime()
762 | for item in items[:limit]:
763 | try:
764 | article: dict = {'source': source}
765 | guid = item.xpath('./guid/text()')
766 | title = item.xpath('./title/text()')
767 | description = item.xpath('./description/text()')
768 | pubDate = item.xpath('./pubdate/text()')
769 | if not (guid and title):
770 | continue
771 | url = guid[0]
772 | title = title[0]
773 | if 'بايثون العربي' in title:
774 | continue
775 | if 'Python Software Foundation: ' in title:
776 | # already collected by its own spider; no need to include it again
777 | continue
778 | if description:
779 | desc = fromstring(description[0]).text_content()
780 | # strip any remaining <...> tags
781 | desc = re.sub('<[^>]*>', ' ', desc)
782 | # keep only the part before the first line break
783 | desc = shorten_desc(desc)
784 | else:
785 | desc = ''
786 | if pubDate:
787 | raw_pub_date = pubDate[0]
788 | # Wed, 22 May 2019 01:47:44 +0000
789 | raw_pub_date = re.sub('^.*?, ', '', raw_pub_date).strip()
790 | ts_publish = ttime(
791 | ptime(raw_pub_date, fmt='%d %b %Y %H:%M:%S %z'))
792 | else:
793 | ts_publish = now
794 | article['ts_publish'] = ts_publish
795 | article['title'] = title
796 | article['desc'] = desc
797 | article['url'] = url
798 | article['url_key'] = get_url_key(article['url'])
799 | articles.append(article)
800 | except Exception:
801 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
802 | break
803 | logger.info(
804 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
805 | )
806 | return articles
807 |
808 |
809 | @register_online
810 | # @register_test
811 | async def julien_danjou() -> list:
812 | """Julien Danjou"""
813 | # For history articles, just keep incrementing the page number
814 | source: str = 'Julien Danjou'
815 | articles: list = []
816 | seed = 'https://julien.danjou.info/page/1/'
817 | scode = await outlands_request(
818 | {
819 | 'method': 'get',
820 | 'timeout': 5,
821 | 'retry': 2,
822 | 'url': seed,
823 | }, 'u8')
824 | items = fromstring(scode).cssselect('.post-feed>article.post-card')
825 | # break once an article was published more than 1 hour ago
826 | break_time = ttime(time.time() - 60 * 60)
827 | host = 'https://julien.danjou.info/'
828 | for item in items:
829 | try:
830 | article: dict = {'source': source}
831 | href = item.cssselect('a.post-card-content-link')[0].get('href', '')
832 | if not href:
833 | raise ValueError(f'{source} not found href from {seed}')
834 | url = add_host(href, host)
835 | title = (item.cssselect('h2.post-card-title') or
836 | [null_tree])[0].text
837 | desc = (item.cssselect('.post-card-excerpt>p') or
838 | [null_tree])[0].text
839 | if not (title and url):
840 | raise ValueError(f'{source} no title {url}')
841 | detail_scode = await outlands_request(
842 | {
843 | 'method': 'get',
844 | 'timeout': 5,
845 | 'retry': 2,
846 | 'url': url,
847 | }, 'u8')
848 | if not detail_scode:
849 | raise ValueError(f'{source} has no detail_scode {url}')
850 | raw_pub_time = find_one(
851 | 'property="article:published_time" content="(.+?)"',
852 | detail_scode)[1]
853 | # 2019-05-06T08:58:00.000Z
854 | ts_publish = ttime(ptime(raw_pub_time,
855 | fmt='%Y-%m-%dT%H:%M:%S.000Z'))
856 | cover_item = item.cssselect('img.post-card-image')
857 | if cover_item:
858 | cover = cover_item[0].get('src', '')
859 | if cover:
860 | article['cover'] = add_host(cover, host)
861 | article['ts_publish'] = ts_publish
862 | article['title'] = title
863 | article['desc'] = desc
864 | article['url'] = url
865 | article['url_key'] = get_url_key(article['url'])
866 | articles.append(article)
867 | if ts_publish < break_time:
868 | # the article is older than the crawl interval, so stop here
869 | break
870 | except Exception:
871 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
872 | break
873 | logger.info(
874 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
875 | )
876 | return articles
877 |
878 |
879 | @register_online
880 | async def doughellmann() -> list:
881 | """Doug Hellmann"""
882 | source: str = 'Doug Hellmann'
883 | articles: list = []
884 | max_page: int = 1
885 | seed = 'https://doughellmann.com/blog/page/{page}/'
886 | for page in range(1, max_page + 1):
887 | r = await req.get(seed.format(page=page),
888 | retry=1,
889 | timeout=20,
890 | headers={"User-Agent": CHROME_PC_UA})
891 | if not r:
892 | logger.error(f'{source} crawl failed: {r}, {r.text}')
893 | return articles
894 | scode = r.text
895 | items = fromstring(scode).cssselect('#main>article')
896 | if max_page > 1:
897 | logger.info(
898 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
899 | )
900 | if not items and page > 1:
901 | logger.info(f'{source} break for page {page} has no items')
902 | break
903 | for item in items:
904 | try:
905 | article: dict = {'source': source}
906 | title = item.cssselect('.entry-title>a')[0].text
907 | url = item.cssselect('.entry-title>a')[0].get('href')
908 | desc = item.cssselect('.entry-content')[0].text_content()
909 | pub_time = item.cssselect('time.entry-date')[0].get('datetime')
910 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
911 | article['ts_publish'] = ts_publish
912 | article['title'] = title
913 | article['desc'] = shorten_desc(desc)
914 | article['url'] = url
915 | article['url_key'] = get_url_key(article['url'])
916 | articles.append(article)
917 | except Exception:
918 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
919 | break
920 | logger.info(
921 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
922 | )
923 | return articles
924 |
925 |
926 | @register_online
927 | # @register_history
928 | # @register_test
929 | async def mouse_vs_python() -> list:
930 | """The Mouse Vs. The Python"""
931 | source: str = 'The Mouse Vs. The Python'
932 | articles: list = []
933 | max_page: int = 1
934 | # max_page:int = 101
935 | seed = 'https://www.blog.pythonlibrary.org/page/{page}/'
936 | for page in range(1, max_page + 1):
937 | api = seed.format(page=page)
938 | scode = await outlands_request(
939 | {
940 | 'method': 'get',
941 | 'timeout': 5,
942 | 'retry': 2,
943 | 'url': api,
944 | }, 'u8')
945 | items = fromstring(scode).cssselect('#content>article')
946 | if max_page > 1:
947 | logger.info(
948 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
949 | )
950 | if not items:
951 | if page > 1:
952 | logger.info(f'{source} break for page {page} has no items')
953 | break
954 | for item in items:
955 | try:
956 | article: dict = {'source': source}
957 | title = item.cssselect('.entry-title>a')[0].text
958 | url = item.cssselect('.entry-title>a')[0].get('href')
959 | desc = item.cssselect('.entry-content')[0].text_content()
960 | pub_time = item.cssselect('time.entry-date')[0].get('datetime')
961 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
962 | article['ts_publish'] = ts_publish
963 | article['title'] = title
964 | article['desc'] = shorten_desc(desc)
965 | article['url'] = url
966 | article['url_key'] = get_url_key(article['url'])
967 | articles.append(article)
968 | except Exception:
969 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
970 | break
971 | logger.info(
972 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
973 | )
974 | return articles
975 |
976 |
977 | @register_online
978 | # @register_history
979 | # @register_test
980 | async def infoq_python() -> list:
981 | """InfoQ"""
982 | source: str = 'InfoQ'
983 | articles: list = []
984 | max_page: int = 1
985 | # max_page:int = 101
986 | curl_string = r'''curl 'https://www.infoq.cn/public/v1/article/getList' -H 'Origin: https://www.infoq.cn' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Content-Type: application/json' -H 'Accept: application/json, text/plain, */*' -H 'Referer: https://www.infoq.cn/topic/python' -H 'Cookie: SERVERID=0|0|0' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary '{"type":1,"size":12,"id":50,"score":0}' --compressed'''
987 | request_args = curlparse(curl_string)
988 | for page in range(1, max_page + 1):
989 | r = await req.request(retry=2, timeout=5, **request_args)
990 | if not r:
991 | logger.error(f'{source} crawl failed: {r}, {r.text}')
992 | return articles
993 | items = r.json().get('data') or []
994 | if max_page > 1:
995 | logger.info(
996 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
997 | )
998 | if items:
999 | # paginate by reusing the score of the last item on the previous page
1000 | data = json.loads(request_args['data'])
1001 | data['score'] = items[-1]['score']
1002 | request_args['data'] = json.dumps(data).encode('u8')
1003 | elif page > 1:
1004 | logger.info(f'{source} break for page {page} has no items')
1005 | break
1006 | for item in items:
1007 | try:
1008 | article: dict = {'source': source}
1009 | desc = shorten_desc(item['article_summary'])
1010 | if '本文分享 方巍' in desc:
1011 | continue
1012 | title = item['article_title']
1013 | url = f"https://www.infoq.cn/article/{item['uuid']}"
1014 | ts_publish = ttime(item['publish_time'])
1015 | article['ts_publish'] = ts_publish
1016 | article['title'] = title
1017 | article['desc'] = desc
1018 | article['url'] = url
1019 | article['url_key'] = get_url_key(article['url'])
1020 | articles.append(article)
1021 | except Exception:
1022 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1023 | break
1024 | logger.info(
1025 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1026 | )
1027 | return articles
1028 |
1029 |
1030 | @register_online
1031 | # @register_history
1032 | # @register_test
1033 | async def hn_python() -> list:
1034 | """Hacker News"""
1035 | source: str = 'Hacker News'
1036 | articles: list = []
1037 | max_page = 999
1038 | # By default, collect articles from the last 24 hours with at least 3 points
1039 | min_points = 3
1040 | now_ts = int(time.time())
1041 | ts_start = now_ts - 86400
1042 | ts_end = now_ts
1043 | # For history, collect articles from the last 90 days; the API caps each query at 1000 hits (paginationLimitedTo)
1044 | # If older articles are needed, just keep shifting the start/end timestamps
1045 | # ts_start = now_ts - 86400 * 90
1046 | # ts_end = now_ts
1047 | per_page: int = 100
1048 | api: str = 'https://hn.algolia.com/api/v1/search_by_date'
1049 | # tags=story&query=python&numericFilters=created_at_i%3E1553174400,points%3E1&page=2&hitsPerPage=10
1050 | params: dict = {
1051 | 'tags': 'story',
1052 | 'query': 'python',
1053 | 'numericFilters': f'created_at_i>={ts_start},created_at_i<={ts_end},points>={min_points}',
1054 | 'page': 0,
1055 | 'hitsPerPage': per_page,
1056 | }
1057 | for page in range(max_page):
1058 | params['page'] = page
1059 | r = await req.get(api,
1060 | params=params,
1061 | retry=2,
1062 | timeout=10,
1063 | headers={"User-Agent": CHROME_PC_UA})
1064 | if not r:
1065 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1066 | return articles
1067 | items = r.json().get('hits') or []
1068 | if not items:
1069 | break
1070 | if page > 0:
1071 | logger.info(
1072 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1073 | )
1074 | if not items and page > 0:
1075 | logger.info(f'{source} break for page {page} has no items')
1076 | break
1077 | for item in items:
1078 | try:
1079 | article: dict = {'source': source}
1080 | title = item['title']
1081 | url = item['url'] or ''
1082 | if not url:
1083 | url = f'https://news.ycombinator.com/item?id={item["objectID"]}'
1084 | desc = item['story_text'] or ''
1085 | ts_publish = ttime(item['created_at_i'])
1086 | article['ts_publish'] = ts_publish
1087 | article['title'] = title
1088 | article['desc'] = shorten_desc(desc)
1089 | article['url'] = url
1090 | article['url_key'] = get_url_key(article['url'])
1091 | articles.append(article)
1092 | except Exception:
1093 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1094 | break
1095 | logger.info(
1096 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1097 | )
1098 | return articles
1099 |
1100 |
1101 | @register_online
1102 | # @register_history
1103 | # @register_test
1104 | async def snarky() -> list:
1105 | """Brett Cannon"""
1106 | source: str = 'Brett Cannon'
1107 | articles: list = []
1108 | max_page: int = 1
1109 | api: str = 'https://snarky.ca/page/{page}/'
1110 | # break once an article was published more than 1 hour ago
1111 | break_time = ttime(time.time() - 60 * 60)
1112 | host = 'https://snarky.ca/'
1113 | for page in range(1, max_page + 1):
1114 | seed = api.format(page=page)
1115 | scode = await outlands_request(url=seed, retry=1, timeout=20)
1116 | if not scode:
1117 | logger.error(f'{source} crawl failed: {scode}')
1118 | return articles
1119 | items = fromstring(scode).cssselect('.post-feed>article.post-card')
1120 | if not items:
1121 | break
1122 | for item in items:
1123 | try:
1124 | article: dict = {'source': source}
1125 | href = item.cssselect('a.post-card-content-link')[0].get(
1126 | 'href', '')
1127 | if not href:
1128 | raise ValueError(f'{source} not found href from {seed}')
1129 | url = add_host(href, host)
1130 | title = (item.cssselect('h2.post-card-title') or
1131 | [null_tree])[0].text
1132 | desc = (item.cssselect('.post-card-excerpt>p') or
1133 | [null_tree])[0].text
1134 | if not (title and url):
1135 | raise ValueError(f'{source} no title {url}')
1136 | detail_resp = await req.get(
1137 | url,
1138 | ssl=False,
1139 | headers={
1140 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
1141 | })
1142 | if not detail_resp:
1143 | raise ValueError(
1144 | f'{source} request href failed {detail_resp}')
1145 | detail_scode = detail_resp.text
1146 | raw_pub_time = find_one(
1147 | 'property="article:published_time" content="(.+?)"',
1148 | detail_scode)[1]
1149 | # 2019-05-06T08:58:00.000Z
1150 | ts_publish = ttime(
1151 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S.000Z'))
1152 | cover_item = item.cssselect('img.post-card-image')
1153 | if cover_item:
1154 | cover = cover_item[0].get('src', '')
1155 | if cover:
1156 | article['cover'] = add_host(cover, host)
1157 | article['ts_publish'] = ts_publish
1158 | article['title'] = title
1159 | article['desc'] = desc
1160 | article['url'] = url
1161 | article['url_key'] = get_url_key(article['url'])
1162 | articles.append(article)
1163 | if ts_publish < break_time:
1164 | # the article is older than the crawl interval, so stop here
1165 | break
1166 | except Exception:
1167 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1168 | break
1169 | logger.info(
1170 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1171 | )
1172 | return articles
1173 |
1174 |
1175 | @register_online
1176 | # @register_history
1177 | # @register_test
1178 | async def jiqizhixin() -> list:
1179 | """机器之心"""
1180 | source: str = '机器之心'
1181 | articles: list = []
1182 | max_page: int = 1
1183 | # requires cookies and an anti-CSRF check
1184 | curl_string = r'''curl 'https://www.jiqizhixin.com/api/v1/search?type=articles&page=1&keywords=python&published=0&is_exact_match=false&search_internet=true&sort=time' -H 'Cookie: ahoy_visitor=1; _Synced_session=2' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Accept: */*' -H 'Referer: https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time' -H 'X-Requested-With: XMLHttpRequest' -H 'If-None-Match: W/"3e034aa5e8cb79dd92652f5ba70a65a5"' -H 'Connection: keep-alive' --compressed'''
1185 | request_args = curlparse(curl_string)
1186 | for page in range(1, max_page + 1):
1187 | # the response is sometimes empty, so retry a few times
1188 | for _ in range(2, 5):
1189 | r = await req.request(retry=1, timeout=20, **request_args)
1190 | if not r:
1191 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1192 | return articles
1193 | try:
1194 | items = r.json().get('articles', {}).get('nodes', [])
1195 | if not items:
1196 | continue
1197 | break
1198 | except json.decoder.JSONDecodeError:
1199 | await asyncio.sleep(_)
1200 | continue
1201 | else:
1202 | # still no break after 3 attempts, give up
1203 | return articles
1204 | if max_page > 1:
1205 | logger.info(
1206 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1207 | )
1208 | # paginate by modifying the page parameter
1209 | curl_string = re.sub(r'&page=\d+', f'&page={page + 1}', curl_string)
1210 | request_args = curlparse(curl_string)
1211 | if not r.json().get('articles', {}).get('hasNextPage'):
1212 | break
1213 | for item in items:
1214 | try:
1215 | article: dict = {'source': source}
1216 | desc = item['content']
1217 | # 2019/05/27 00:09
1218 | article['ts_publish'] = ttime(
1219 | ptime(item['published_at'], fmt='%Y/%m/%d %H:%M'))
1220 | title = item.get('title') or ''
1221 | title = title.replace('Python',
1222 | 'Python').replace('python',
1223 | 'Python')
1224 | article['title'] = title
1225 | article['cover'] = item.get('cover_image_url') or ''
1226 | article['desc'] = f'「{item["author"]}」 {shorten_desc(desc)}'
1227 | article['url'] = item['path']
1228 | article['url_key'] = get_url_key(article['url'])
1229 | articles.append(article)
1230 | except Exception:
1231 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1232 | break
1233 | logger.info(
1234 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1235 | )
1236 | return articles
1237 |
1238 |
1239 | @register_online
1240 | # @register_history
1241 | # @register_test
1242 | async def lilydjwg() -> list:
1243 | """依云's Blog"""
1244 | source: str = "依云's Blog"
1245 | articles: list = []
1246 | max_page: int = 1
1247 | seed = 'https://blog.lilydjwg.me/tag/python?page={page}'
1248 | for page in range(1, max_page + 1):
1249 | r = await req.get(seed.format(page=page),
1250 | retry=1,
1251 | timeout=20,
1252 | headers={"User-Agent": CHROME_PC_UA})
1253 | if not r:
1254 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1255 | return articles
1256 | scode = r.content.decode('u8')
1257 | items = fromstring(scode).cssselect('#content>.posttotal')
1258 | if not items:
1259 | break
1260 | host = 'https://blog.lilydjwg.me/'
1261 | if max_page > 1:
1262 | logger.info(
1263 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1264 | )
1265 | for item in items:
1266 | try:
1267 | article: dict = {'source': source}
1268 | title = item.cssselect('.storytitle>a')[0].text
1269 | href = item.cssselect('.storytitle>a')[0].get('href', '')
1270 | url = add_host(href, host).replace(
1271 | 'https://lilydjwg.is-programmer.com/', host)
1272 | desc = shorten_desc((item.cssselect('.post_brief>p') or
1273 | [null_tree])[0].text_content())
1274 | cover = (item.cssselect('img') or [null_tree])[0].get('src', '')
1275 | month, day, year = item.cssselect(
1276 | '.date')[0].text_content().strip().split()
1277 | month = f'0{month}'[-2:]
1278 | day = f'0{day}'[-2:]
1279 | article['ts_publish'] = ttime(
1280 | ptime(f'{year}/{month}/{day}', fmt='%Y/%m/%d'))
1281 | article['title'] = title
1282 | article['cover'] = cover
1283 | article['desc'] = desc
1284 | article['url'] = url
1285 | article['url_key'] = get_url_key(article['url'])
1286 | articles.append(article)
1287 | except Exception:
1288 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1289 | break
1290 | logger.info(
1291 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1292 | )
1293 | return articles
1294 |
1295 |
1296 | @register_online
1297 | # @register_history
1298 | # @register_test
1299 | async def dev_io() -> list:
1300 | """DEV Community"""
1301 | source: str = "DEV Community"
1302 | articles: list = []
1303 | max_page: int = 1
1304 | per_page: int = 30
1305 | filt_score: int = 10
1306 | for page in range(0, max_page):
1307 | r = await req.get(
1308 | f'https://dev.to/search/feed_content?per_page={per_page}&page={page}&tag=python&sort_by=published_at&sort_direction=desc&tag_names%5B%5D=python&approved=&class_name=Article',
1309 | headers={
1310 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
1311 | 'Referer': 'https://dev.to/t/python/latest'
1312 | },
1313 | retry=1,
1314 | timeout=20)
1315 | if not r:
1316 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1317 | return articles
1318 | items = r.json().get('result') or []
1319 | if not items:
1320 | break
1321 | host = 'https://dev.to/'
1322 | if max_page > 1:
1323 | logger.info(
1324 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1325 | )
1326 | for item in items:
1327 | try:
1328 | if item['public_reactions_count'] + item[
1329 | 'comments_count'] < filt_score:
1330 | # filt by min score
1331 | continue
1332 | article: dict = {'source': source}
1333 | title = item['title']
1334 | path = item['path']
1335 | url = add_host(path, host)
1336 | desc = item['user']['name']
1337 | article['ts_publish'] = ttime(item['published_at_int'])
1338 | article['title'] = title
1339 | article['desc'] = desc
1340 | article['url'] = url
1341 | article['url_key'] = get_url_key(article['url'])
1342 | articles.append(article)
1343 | except Exception:
1344 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1345 | break
1346 | logger.info(
1347 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1348 | )
1349 | return articles
1350 |
1351 |
1352 | # @register_online
1353 | # @register_history
1354 | # @register_test
1355 | async def pythoncat() -> list:
1356 | """Python猫"""
1357 | # Crawl from Juejin; the Zhihu column updates too slowly
1358 | source: str = "Python猫"
1359 | user: str = '57b26118a341310060fa74da'
1360 | max_page = 1
1361 | articles: list = await common_spider_juejin(user, source, max_page=max_page)
1362 | logger.info(
1363 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1364 | )
1365 | return articles
1366 |
1367 |
1368 | @register_online
1369 | # @register_history
1370 | # @register_test
1371 | async def zhihu_zhuanlan_python_cn() -> list:
1372 | """Python之美"""
1373 | source: str = "Python之美"
1374 | name: str = 'python-cn'
1375 | articles: list = []
1376 | limit = 10
1377 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1378 | logger.info(
1379 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1380 | )
1381 | return articles
1382 |
1383 |
1384 | @register_online
1385 | # @register_history
1386 | # @register_test
1387 | async def zhihu_zhuanlan_python_cat() -> list:
1388 | """Python猫"""
1389 | source: str = "Python猫"
1390 | name: str = 'pythonCat'
1391 | articles: list = []
1392 | limit = 10
1393 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1394 | logger.info(
1395 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1396 | )
1397 | return articles
1398 |
1399 |
1400 | @register_online
1401 | # @register_history
1402 | # @register_test
1403 | async def zhihu_zhuanlan_pythoncxy() -> list:
1404 | """Python程序员"""
1405 | source: str = "Python程序员"
1406 | name: str = 'pythoncxy'
1407 | articles: list = []
1408 | limit = 10
1409 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1410 | logger.info(
1411 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1412 | )
1413 | return articles
1414 |
1415 |
1416 | @register_online
1417 | # @register_history
1418 | # @register_test
1419 | async def zhihu_zhuanlan_c_111369541() -> list:
1420 | """Python头条"""
1421 | source: str = "Python头条"
1422 | name: str = 'c_111369541'
1423 | articles: list = []
1424 | limit = 10
1425 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1426 | logger.info(
1427 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1428 | )
1429 | return articles
1430 |
1431 |
1432 | @register_online
1433 | # @register_history
1434 | # @register_test
1435 | async def cuiqingcai() -> list:
1436 | """静觅"""
1437 | source: str = "静觅"
1438 | articles: list = []
1439 | max_page: int = 1
1440 | # max_page = 20
1441 | api: str = 'https://cuiqingcai.com/category/technique/python/page/'
1442 | now = ttime()
1443 | this_date = now[5:10]
1444 | this_year = now[:4]
1445 | last_year_int = int(this_year) - 1
1446 | timestamp_today_0 = ptime(now[:10] + ' 00:00:00')
1447 |
1448 | def translate_time_text(raw_time):
1449 | if not raw_time:
1450 | return ''
1451 | raw_time = raw_time.strip()
1452 | # convert each kind of raw time text
1453 | # e.g. "4个月前 (02-21)" (4 months ago)
1454 | # e.g. "2天前" (2 days ago)
1455 | # e.g. "4年前 (2015-02-12)" (4 years ago)
1456 | # first try a dash/colon separated date; if none is found, it should be the "n days/hours ago" case
1457 | date = find_one(r'([\d:\- ]+)', raw_time)[1]
1458 | if date:
1459 | if re.match(r'^\d\d-\d\d$', date):
1460 | # only month and day
1461 | # the month may actually belong to last year, so check first
1462 | if date >= this_date:
1463 | date = f'{last_year_int}-{date}'
1464 | else:
1465 | date = f'{this_year}-{date}'
1466 | result = f'{date} 00:00:00'
1467 | elif re.match(r'^\d\d\d\d-\d\d-\d\d$', date):
1468 | # year, month and day
1469 | result = f'{date} 00:00:00'
1470 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d$', date):
1471 | # year, month, day, hour and minute
1472 | result = f'{date}:00'
1473 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d:\d\d$', date):
1474 | # year, month, day, hour and minute
1475 | result = f'{date[:11]}0{date[11:]}:00'
1476 | else:
1477 | raise ValueError(f'bad time pattern {raw_time}')
1478 | elif re.match(r'^\d+小时前$', raw_time):
1479 | n_hour = int(find_one(r'\d+', raw_time)[0])
1480 | result = ttime(timestamp_today_0 - n_hour * 3600)
1481 | elif re.match(r'^\d+天前$', raw_time):
1482 | n_day = int(find_one(r'\d+', raw_time)[0])
1483 | result = ttime(timestamp_today_0 - n_day * 86400)
1484 | else:
1485 | raise ValueError(f'bad time pattern {raw_time}')
1486 | return result
1487 |
1488 | for page in range(1, max_page + 1):
1489 | seed = f'{api}{page}'
1490 | r = await req.get(
1491 | seed,
1492 | retry=1,
1493 | timeout=20,
1494 | ssl=False,
1495 | headers={
1496 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
1497 | })
1498 | if not r:
1499 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1500 | return articles
1501 | items = fromstring(
1502 | r.content.decode('u8')).cssselect('div.content>article')
1503 | if not items:
1504 | break
1505 | if max_page > 1:
1506 | logger.info(
1507 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1508 | )
1509 | for item in items:
1510 | try:
1511 | article: dict = {'source': source}
1512 | title = null_tree.css(item, 'header>h2>a').text
1513 | url = null_tree.css(item, 'header>h2>a').get('href', '')
1514 | desc = null_tree.css(item, '.note').text_content()
1515 | cover = null_tree.css(item, 'img.thumb').get('src', '')
1516 | raw_time_text = null_tree.css(
1517 | item, 'p > span:nth-child(2)').text_content()
1518 | article['ts_publish'] = translate_time_text(raw_time_text)
1519 | article['title'] = title
1520 | article['cover'] = cover
1521 | article['desc'] = desc
1522 | article['url'] = url
1523 | article['url_key'] = get_url_key(article['url'])
1524 | articles.append(article)
1525 | except Exception:
1526 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1527 | break
1528 | logger.info(
1529 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1530 | )
1531 | return articles
1532 |
1533 |
1534 | @register_online
1535 | # @register_history
1536 | # @register_test
1537 | async def tuicool_cn() -> list:
1538 | """推酷(中文)"""
1539 | source: str = "推酷(中文)"
1540 | articles: list = []
1541 | max_page: int = 1
1542 | articles = await common_spider_tuicool(
1543 | 'CN',
1544 | source,
1545 | max_page=max_page,
1546 | ignore_descs={'稀土掘金', 'Python猫', 'InfoQ'})
1547 | logger.info(
1548 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1549 | )
1550 | return articles
1551 |
1552 |
1553 | @register_online
1554 | # @register_history
1555 | # @register_test
1556 | async def tuicool_en() -> list:
1557 | """推酷(英文)"""
1558 | source: str = "推酷(英文)"
1559 | articles: list = []
1560 | max_page: int = 1
1561 | articles = await common_spider_tuicool('EN',
1562 | source,
1563 | max_page=max_page,
1564 | ignore_descs={'Real Python'})
1565 | logger.info(
1566 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1567 | )
1568 | return articles
1569 |
1570 |
1571 | # @register_online
1572 | # @register_history
1573 | # @register_test
1574 | async def kf_toutiao() -> list:
1575 | """稀土掘金"""
1576 | source: str = "稀土掘金"
1577 | articles: list = []
1578 | max_page: int = 1
1579 | per_page: int = 20
1580 | sort_by = 'rankIndex'  # 'createdAt' sorts by creation time
1581 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_tag_entry'
1582 | params: dict = {
1583 | 'src': 'web',
1584 | 'tagId': '559a7227e4b08a686d25744f',
1585 | 'page': 0,
1586 | 'pageSize': per_page,
1587 | 'sort': sort_by
1588 | }
1589 | # 豌豆花下猫 is already collected separately
1590 | ignore_usernames: set = {'豌豆花下猫'}
1591 | for page in range(0, max_page):
1592 | params['page'] = page
1593 | scode = await outlands_request(
1594 | {
1595 | 'method': 'get',
1596 | 'params': params,
1597 | 'url': api,
1598 | 'ssl': False,
1599 | 'retry': 1,
1600 | 'headers': {
1601 | 'Referer': 'https://juejin.im/tag/Python?sort=popular',
1602 | 'Origin': 'https://juejin.im',
1603 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
1604 | }
1605 | }, 'u8')
1606 | if not scode:
1607 | logger.error(f'{source} crawl failed: {scode}')
1608 | return articles
1609 | items = json.loads(scode).get('d', {}).get('entrylist', [])
1610 | if not items:
1611 | break
1612 | if max_page > 1:
1613 | logger.info(
1614 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1615 | )
1616 | for item in items:
1617 | try:
1618 | article: dict = {'source': source}
1619 | # skip sources that are already collected elsewhere
1620 | if item.get('user', {}).get('username', '') in ignore_usernames:
1621 | continue
1622 | # 2019-05-05T03:51:12.886Z
1623 | gmt_time = re.sub(r'\..*', '',
1624 | item['createdAt']).replace('T', ' ')
1625 | ts_publish = ttime(ptime(gmt_time, tzone=0))
1626 | article['ts_publish'] = ts_publish
1627 | article['lang'] = 'en' if item['english'] else 'CN'
1628 | article['title'] = item['title']
1629 | article['cover'] = item['screenshot']
1630 | article['desc'] = item['summaryInfo']
1631 | article['url'] = item['originalUrl']
1632 | article['url_key'] = get_url_key(article['url'])
1633 | articles.append(article)
1634 | except Exception:
1635 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1636 | break
1637 | logger.info(
1638 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1639 | )
1640 | return articles
1641 |
1642 |
1643 | @register_online
1644 | # @register_history
1645 | # @register_test
1646 | async def freelycode() -> list:
1647 | """Python部落"""
1648 | source: str = "Python部落"
1649 | articles: list = []
1650 | max_page: int = 1
1651 | api: str = 'https://python.freelycode.com/contribution/list/0'
1652 | params: dict = {
1653 | 'page_no': 1,
1654 | }
1655 | host: str = 'https://python.freelycode.com/'
1656 |
1657 | def fix_time(raw_time):
1658 | # 2019-03-27 7:02 a.m.
1659 | # 2019-03-22 9:27 a.m.
1660 | # 2019-07-17 9 a.m.
1661 | raw_time = raw_time.replace('中午', '12:01 p.m.')
1662 | if ':' not in raw_time:
1663 | raw_time = f'{raw_time[:-5]}:00{raw_time[-5:]}'
1664 | raw_time = raw_time.replace('.m.', 'm')
1665 | formated_time = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p'))
1666 | return formated_time
1667 |
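# Worked examples for fix_time(), following the sample timestamps above (hedged, since
# ttime/ptime round-trip with their default timezone):
#   fix_time('2019-03-27 7:02 a.m.')  # -> '2019-03-27 07:02:00'
#   fix_time('2019-07-17 9 a.m.')     # a ':00' is padded in first, then parsed the same way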
1668 | for page in range(1, max_page + 1):
1669 | params['page_no'] = page
1670 | r = await req.get(
1671 | api,
1672 | ssl=False,
1673 | params=params,
1674 | # proxy=proxy,
1675 | retry=2,
1676 | timeout=5,
1677 | headers={
1678 | 'Referer': api,
1679 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
1680 | },
1681 | )
1682 | if not r:
1683 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1684 | return articles
1685 | scode: str = r.content.decode('u8', 'ignore')
1686 | items: list = fromstring(scode).cssselect(
1687 | '.table-bordered tr:nth-child(n+2)')
1688 | if not items:
1689 | break
1690 | if max_page > 1:
1691 | logger.info(
1692 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1693 | )
1694 | for item in items:
1695 | try:
1696 | article: dict = {'source': source}
1697 | title_href = item.cssselect('td:nth-child(2)>a')
1698 | if not title_href:
1699 | continue
1700 | title: str = title_href[0].text
1701 | href: str = title_href[0].get('href', '')
1702 | url: str = add_host(href, host)
1703 | desc: str = null_tree.css(item, 'td:nth-child(3)').text
1704 | if desc:
1705 | desc = f'作者: {desc}'
1706 | raw_time: str = null_tree.css(item, 'td:nth-child(4)').text
1707 | ts_publish = fix_time(raw_time)
1708 | article['ts_publish'] = ts_publish
1709 | article['title'] = title
1710 | article['desc'] = desc
1711 | article['url'] = url
1712 | article['url_key'] = get_url_key(article['url'])
1713 | articles.append(article)
1714 | except Exception:
1715 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1716 | break
1717 | logger.info(
1718 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1719 | )
1720 | return articles
1721 |
1722 |
1723 | @register_online
1724 | # @register_history
1725 | # @register_test
1726 | async def miguelgrinberg() -> list:
1727 | """miguelgrinberg"""
1728 | source: str = "miguelgrinberg"
1729 | articles: list = []
1730 | start_page: int = 1
1731 | max_page: int = 1
1732 | api: str = 'https://blog.miguelgrinberg.com/index/page/'
1733 | host: str = 'https://blog.miguelgrinberg.com/'
1734 |
1735 | for page in range(start_page, max_page + 1):
1736 | page_url = f'{api}{page}'
1737 | scode = await outlands_request({'url': page_url}, retry=1)
1738 | if not scode:
1739 | logger.error(f'{source} crawl failed: {scode}')
1740 | return articles
1741 | scode = re.sub(r'', '', scode)
1742 | items: list = fromstring(scode).cssselect('#main>.post')
1743 | if not items:
1744 | break
1745 | if max_page > 1:
1746 | logger.info(
1747 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1748 | )
1749 | for item in items:
1750 | try:
1751 | article: dict = {'source': source}
1752 | title_href = item.cssselect('h1.post-title>a')
1753 | if not title_href:
1754 | continue
1755 | title: str = title_href[0].text
1756 | href: str = title_href[0].get('href', '')
1757 | url: str = add_host(href, host)
1758 | desc: str = null_tree.css(item, '.post_body>p').text_content()
1759 | raw_time: str = null_tree.css(item, '.date>span').get(
1760 | 'data-timestamp', '').replace('T', ' ').replace('Z', '')
1761 | ts_publish = ttime(ptime(raw_time, tzone=0))
1762 | article['ts_publish'] = ts_publish
1763 | article['title'] = title
1764 | article['desc'] = shorten_desc(desc)
1765 | article['url'] = url
1766 | article['url_key'] = get_url_key(article['url'])
1767 | articles.append(article)
1768 | except Exception:
1769 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1770 | break
1771 | logger.info(
1772 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1773 | )
1774 | return articles
1775 |
1776 |
1777 | @register_online
1778 | # @register_history
1779 | # @register_test
1780 | async def codingpy() -> list:
1781 | """编程派"""
1782 | source: str = "编程派"
1783 | articles: list = []
1784 | start_page: int = 1
1785 | max_page: int = 1
1786 | api: str = 'https://codingpy.com/article/'
1787 | params: dict = {'page': 1}
1788 | host: str = 'https://codingpy.com/'
1789 |
1790 | for page in range(start_page, max_page + 1):
1791 | params['page'] = page
1792 | r = await req.get(
1793 | api,
1794 | params=params,
1795 | ssl=False,
1796 | # proxy=proxy,
1797 | retry=2,
1798 | timeout=5,
1799 | headers={
1800 | 'Referer': api,
1801 | 'User-Agent': CHROME_PC_UA
1802 | },
1803 | )
1804 | if not r:
1805 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1806 | return articles
1807 | scode: str = r.content.decode('u8', 'ignore')
1808 | items: list = fromstring(scode).cssselect('.archive-main>article')
1809 | if not items:
1810 | break
1811 | if max_page > 1:
1812 | logger.info(
1813 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1814 | )
1815 | for item in items:
1816 | try:
1817 | article: dict = {'source': source}
1818 | title_href = item.cssselect('.list-item-title>a')
1819 | title: str = title_href[0].text
1820 | href: str = title_href[0].get('href', '')
1821 | bg: str = null_tree.css(item, '.lim-cover').get('style', '')
1822 | # background-image:url(/media/articles/why-python-for-startups.jpg)
1823 | cover: str = find_one(r'background-image:url\((.*?)\)', bg)[1]
1824 | cover = add_host(cover, host)
1825 | url: str = add_host(href, host)
1826 | desc: str = null_tree.css(
1827 | item, '.list-item-summary>p').text_content()
1828 | raw_time: str = null_tree.css(item,
1829 | '.list-item-meta>p>span').text
1830 | # 2015.11.03
1831 | ts_publish = ttime(ptime(raw_time, fmt='%Y.%m.%d'))
1832 | article['ts_publish'] = ts_publish
1833 | article['title'] = title
1834 | article['cover'] = cover
1835 | article['desc'] = shorten_desc(desc)
1836 | article['url'] = url
1837 | article['url_key'] = get_url_key(article['url'])
1838 | articles.append(article)
1839 | except Exception:
1840 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1841 | break
1842 | logger.info(
1843 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1844 | )
1845 | return articles
1846 |
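# A standalone sketch (not part of the original module, stdlib only): the cover
# URL above comes from an inline style attribute, and a plain re.search does the
# same job as find_one(r'background-image:url\((.*?)\)', bg)[1].
import re

style = 'background-image:url(/media/articles/why-python-for-startups.jpg)'
match = re.search(r'background-image:url\((.*?)\)', style)
cover_path = match.group(1) if match else ''
# cover_path == '/media/articles/why-python-for-startups.jpg'; add_host() then
# prepends the site host to make it an absolute URL.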
1847 |
1848 | @register_online
1849 | # @register_history
1850 | # @register_test
1851 | async def nedbatchelder() -> list:
1852 | """Ned Batchelder"""
1853 | source: str = "Ned Batchelder"
1854 | articles: list = []
1855 | limit: int = 5
1856 | api: str = 'https://nedbatchelder.com/blog/tag/python.html'
1857 | host: str = 'https://nedbatchelder.com/'
1858 | scode = await outlands_request(
1859 | {
1860 | 'method': 'get',
1861 | 'timeout': 5,
1862 | 'headers': {
1863 | 'Referer': api,
1864 | 'User-Agent': CHROME_PC_UA,
1865 | },
1866 | 'url': api,
1867 | }, 'u8')
1868 | container_html = null_tree.tostring(
1869 | null_tree.css(fromstring(scode), '.category')).decode('utf-8')
1870 | if not container_html:
1871 | logger.error(f'{source} not found container_html.')
1872 | return articles
1873 | split_by: str = ''
1874 | container_html = container_html.replace(
1875 | '', f'{split_by} ').replace(
1876 | '', '').replace(' ', '')
1877 | items: list = container_html.split(split_by)[1:limit + 1]
1878 | if not items:
1879 | return articles
1880 | for item in items:
1881 | try:
1882 | article: dict = {'source': source}
1883 | title_href = find_one(r' \s*([^<]+?)', item)
1884 | title: str = title_href[2]
1885 | href: str = title_href[1]
1886 | url: str = add_host(href, host)
1887 | raw_time: str = find_one(r' (\d+ .*?\d+): ',
1888 | item)[1]
1889 | ts_publish = ttime(ptime(raw_time, fmt='%d %b %Y'))
1890 | article['ts_publish'] = ts_publish
1891 | article['title'] = title
1892 | article['url'] = url
1893 | article['url_key'] = get_url_key(article['url'])
1894 | articles.append(article)
1895 | except Exception:
1896 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1897 | break
1898 | logger.info(
1899 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1900 | )
1901 | return articles
1902 |
1903 |
1904 | @register_online
1905 | # @register_history
1906 | # @register_test
1907 | async def the5fire() -> list:
1908 | """the5fire的技术博客"""
1909 | source: str = "the5fire的技术博客"
1910 | articles: list = []
1911 | start_page: int = 1
1912 | max_page: int = 1
1913 | api: str = 'https://www.the5fire.com/category/python/'
1914 | host: str = 'https://www.the5fire.com/'
1915 | params: dict = {'page': 1}
1916 |
1917 | for page in range(start_page, max_page + 1):
1918 | params['page'] = page
1919 | r = await req.get(
1920 | api,
1921 | params=params,
1922 | ssl=False,
1923 | # proxy=proxy,
1924 | retry=1,
1925 | headers={
1926 | 'Referer': api,
1927 | 'User-Agent': CHROME_PC_UA
1928 | },
1929 | )
1930 | if not r:
1931 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1932 | return articles
1933 | scode: str = r.content.decode('u8', 'ignore')
1934 | items: list = fromstring(scode).cssselect('#main>.caption')
1935 | if not items:
1936 | break
1937 | if max_page > 1:
1938 | logger.info(
1939 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1940 | )
1941 | for item in items:
1942 | try:
1943 | article: dict = {'source': source}
1944 | title_href = item.cssselect('h3>a')
1945 | title: str = title_href[0].text
1946 | href: str = title_href[0].get('href', '')
1947 | url: str = add_host(href, host)
1948 | desc: str = null_tree.css(item, '.caption>p').text_content()
1949 | raw_time: str = null_tree.css(item, '.info').text_content()
1950 | # 发布:2019-02-22 9:47 p.m.
1951 | raw_time = find_one(r'发布:(\d\d\d\d-\d{1,2}-\d{1,2}.*)',
1952 | raw_time)[1].replace('.', '')
1953 | # 2019-03-20 10:07 p.m.
1954 | # 2011-05-28 10 a.m.
1955 | # 2011-12-08 午夜
1956 | if ':' not in raw_time:
1957 | if 'm' in raw_time:
1958 | raw_time = re.sub('m.*', 'm', raw_time)
1959 | ts_publish = ttime(ptime(raw_time,
1960 | fmt='%Y-%m-%d %I %p'))
1961 | else:
1962 | raw_time = raw_time[:10]
1963 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
1964 | else:
1965 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p'))
1966 | article['ts_publish'] = ts_publish
1967 | article['title'] = title
1968 | article['desc'] = shorten_desc(desc)
1969 | article['url'] = url
1970 | article['url_key'] = get_url_key(article['url'])
1971 | articles.append(article)
1972 | except Exception:
1973 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1974 | break
1975 | logger.info(
1976 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1977 | )
1978 | return articles
1979 |
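# A standalone sketch (not part of the original module, stdlib only) of the
# three date shapes handled above; ptime/ttime add timezone handling on top,
# which this simplified version ignores.
from datetime import datetime

def parse_the5fire_time(raw_time: str) -> str:
    raw_time = raw_time.replace('.', '')            # '9:47 p.m.' -> '9:47 pm'
    if ':' in raw_time:                             # '2019-03-20 10:07 pm'
        dt = datetime.strptime(raw_time, '%Y-%m-%d %I:%M %p')
    elif 'm' in raw_time:                           # '2011-05-28 10 am'
        dt = datetime.strptime(raw_time, '%Y-%m-%d %I %p')
    else:                                           # '2011-12-08 午夜' (midnight)
        dt = datetime.strptime(raw_time[:10], '%Y-%m-%d')
    return dt.strftime('%Y-%m-%d %H:%M:%S')

# parse_the5fire_time('2019-03-20 10:07 p.m.')  -> '2019-03-20 22:07:00'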
1980 |
1981 | @register_online
1982 | # @register_history
1983 | # @register_test
1984 | async def foofish() -> list:
1985 | """Python之禅"""
1986 | source: str = "Python之禅"
1987 | articles: list = []
1988 | start_page: int = 1
1989 | max_page: int = 1
1990 | api: str = 'https://foofish.net/index.html'
1991 | host: str = 'https://foofish.net/'
1992 |
1993 | for page in range(start_page, max_page + 1):
1994 | if page == 1:
1995 | seed = api
1996 | else:
1997 | seed = api.replace('index.html', f'index{page}.html')
1998 | r = await req.get(
1999 | seed,
2000 | ssl=False,
2001 | # proxy=proxy,
2002 | retry=1,
2003 | headers={
2004 | 'Referer': api,
2005 | 'User-Agent': CHROME_PC_UA
2006 | },
2007 | )
2008 | if not r:
2009 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2010 | return articles
2011 | scode: str = r.content.decode('u8', 'ignore')
2012 | container: str = find_one(r' [\s\S]*? ',
2013 | scode)[0]
2014 | if not container:
2015 | logger.error('container not found')
2016 | return articles
2017 | items: list = re.findall(r' [\S\s]*?', container)
2018 | if not items:
2019 | break
2020 | if max_page > 1:
2021 | logger.info(
2022 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2023 | )
2024 | for item_html in items:
2025 | try:
2026 | article: dict = {'source': source}
2027 | item = fromstring(item_html)
2028 | title_href = item.cssselect('a')
2029 | title: str = title_href[0].text
2030 | href: str = title_href[0].get('href', '')
2031 | url: str = add_host(href, host)
2032 | raw_time: str = null_tree.css(item, 'dt').text
2033 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
2034 | article['ts_publish'] = ts_publish
2035 | article['title'] = title
2036 | article['url'] = url
2037 | article['url_key'] = get_url_key(article['url'])
2038 | articles.append(article)
2039 | except Exception:
2040 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2041 | break
2042 | logger.info(
2043 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2044 | )
2045 | return articles
2046 |
2047 |
2048 | @register_online
2049 | # @register_history
2050 | # @register_test
2051 | async def inventwithpython() -> list:
2052 | """The Invent with Python Blog"""
2053 | source: str = "The Invent with Python Blog"
2054 | articles: list = []
2055 | start_page: int = 1
2056 | max_page: int = 1
2057 | api: str = 'https://inventwithpython.com/blog/index.html'
2058 | host: str = 'https://inventwithpython.com/'
2059 |
2060 | for page in range(start_page, max_page + 1):
2061 | if page == 1:
2062 | seed = api
2063 | else:
2064 | seed = api.replace('index.html', f'index{page}.html')
2065 | r = await req.get(
2066 | seed,
2067 | ssl=False,
2068 | # proxy=proxy,
2069 | retry=1,
2070 | headers={
2071 | 'Referer': api,
2072 | 'User-Agent': CHROME_PC_UA
2073 | },
2074 | )
2075 | if not r:
2076 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2077 | return articles
2078 | scode: str = r.content.decode('u8', 'ignore')
2079 | items: list = fromstring(scode).cssselect('#content>article')
2080 | if not items:
2081 | break
2082 | if max_page > 1:
2083 | logger.info(
2084 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2085 | )
2086 | for item in items:
2087 | try:
2088 | article: dict = {'source': source}
2089 | title_href = null_tree.css(item, 'h1>a')
2090 | title: str = title_href.text
2091 | href: str = title_href.get('href', '')
2092 | url: str = add_host(href, host)
2093 | raw_time: str = null_tree.css(
2094 | item, '.article-header-date').text.strip()
2095 | # Wed 05 June 2019
2096 | ts_publish = ttime(ptime(raw_time, fmt='%a %d %B %Y'))
2097 | article['ts_publish'] = ts_publish
2098 | article['title'] = title
2099 | article['url'] = url
2100 | article['url_key'] = get_url_key(article['url'])
2101 | articles.append(article)
2102 | except Exception:
2103 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2104 | break
2105 | logger.info(
2106 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2107 | )
2108 | return articles
2109 |
2110 |
2111 | @register_online
2112 | # @register_history
2113 | # @register_test
2114 | async def lucumr() -> list:
2115 | """Armin Ronacher's Thoughts and Writings"""
2116 | source: str = "Armin Ronacher's Thoughts and Writings"
2117 | articles: list = []
2118 | start_page: int = 1
2119 | max_page: int = 1
2120 | api: str = 'http://lucumr.pocoo.org/'
2121 | host: str = 'http://lucumr.pocoo.org/'
2122 |
2123 | for page in range(start_page, max_page + 1):
2124 | if page == 1:
2125 | seed = api
2126 | else:
2127 | seed = add_host(f'/page/{page}/', host)
2128 | r = await req.get(
2129 | seed,
2130 | ssl=False,
2131 | # proxy=proxy,
2132 | retry=1,
2133 | headers={
2134 | 'Referer': api,
2135 | 'User-Agent': CHROME_PC_UA
2136 | },
2137 | )
2138 | if not r:
2139 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2140 | return articles
2141 | scode: str = r.content.decode('u8', 'ignore')
2142 | items: list = fromstring(scode).cssselect(
2143 | '.entry-wrapper>.entry-overview')
2144 | if not items:
2145 | break
2146 | if max_page > 1:
2147 | logger.info(
2148 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2149 | )
2150 | for item in items:
2151 | try:
2152 | article: dict = {'source': source}
2153 | title_href = null_tree.css(item, 'h1>a')
2154 | title: str = title_href.text
2155 | href: str = title_href.get('href', '')
2156 | url: str = add_host(href, host)
2157 | desc: str = null_tree.css(item, '.summary>p').text
2158 | raw_time: str = null_tree.css(item, '.date').text.strip()
2159 | # Jun 5, 2017
2160 | ts_publish = ttime(ptime(raw_time, fmt='%b %d, %Y'))
2161 | article['ts_publish'] = ts_publish
2162 | article['title'] = title
2163 | article['desc'] = desc
2164 | article['url'] = url
2165 | article['url_key'] = get_url_key(article['url'])
2166 | articles.append(article)
2167 | except Exception:
2168 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2169 | break
2170 | logger.info(
2171 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2172 | )
2173 | return articles
2174 |
2175 |
2176 | @register_online
2177 | # @register_history
2178 | # @register_test
2179 | async def treyhunner() -> list:
2180 | """Trey Hunner"""
2181 | source: str = "Trey Hunner"
2182 | articles: list = []
2183 | limit: int = 5
2184 | api: str = 'https://treyhunner.com/blog/categories/python/'
2185 | host: str = 'https://treyhunner.com/'
2186 |
2187 | r = await req.get(
2188 | api,
2189 | ssl=False,
2190 | # proxy=proxy,
2191 | retry=1,
2192 | headers={
2193 | 'Referer': api,
2194 | 'User-Agent': CHROME_PC_UA
2195 | },
2196 | )
2197 | if not r:
2198 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2199 | return articles
2200 | scode: str = r.content.decode('u8', 'ignore')
2201 | items: list = fromstring(scode).cssselect('#blog-archives>article')
2202 | for item in items[:limit]:
2203 | try:
2204 | article: dict = {'source': source}
2205 | title_href = null_tree.css(item, 'h1>a')
2206 | title: str = title_href.text
2207 | href: str = title_href.get('href', '')
2208 | url: str = add_host(href, host)
2209 | raw_time: str = null_tree.css(item, 'time').get('datetime')
2210 | # 2019-06-18T09:15:00-07:00
2211 | ts_publish = ttime(ptime(raw_time.replace('T', ' ')[:19], tzone=-7))
2212 | article['ts_publish'] = ts_publish
2213 | article['title'] = title
2214 | article['url'] = url
2215 | article['url_key'] = get_url_key(article['url'])
2216 | articles.append(article)
2217 | except Exception:
2218 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2219 | break
2220 | logger.info(
2221 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2222 | )
2223 | return articles
2224 |
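# A standalone sketch (not part of the original module): the datetime attribute
# above already carries its own UTC offset, so fromisoformat() can normalise it
# without hard-coding tzone=-7 the way the ptime() call does.
from datetime import datetime, timezone

raw = '2019-06-18T09:15:00-07:00'
dt_utc = datetime.fromisoformat(raw).astimezone(timezone.utc)
print(dt_utc.strftime('%Y-%m-%d %H:%M:%S'))  # 2019-06-18 16:15:00 (UTC)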
2225 |
2226 | @register_online
2227 | # @register_history
2228 | # @register_test
2229 | async def reddit() -> list:
2230 | """Reddit"""
2231 | source: str = "Reddit"
2232 | articles: list = []
2233 | limit: int = 22
2234 | # only include posts with 20+ upvotes
2235 | min_ups: int = 20
2236 | # ... or posts with 10+ comments
2237 | min_cmts: int = 10
2238 | # api doc: https://www.reddit.com/dev/api/#GET_top
2239 | api: str = f'https://api.reddit.com/r/Python/top/?t=day&limit={limit}'
2240 | host: str = 'https://www.reddit.com/'
2241 | for _ in range(2):
2242 | scode = await outlands_request(
2243 | {
2244 | 'method': 'get',
2245 | 'url': api,
2246 | 'headers': {
2247 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
2248 | }
2249 | }, 'u8')
2250 | # print(scode)
2251 | if scode:
2252 | break
2253 | else:
2254 | logger.error(f'{source} crawl failed')
2255 | return articles
2256 | rj: dict = json.loads(scode)
2257 | items: list = rj['data']['children']
2258 | for item in items:
2259 | try:
2260 | if item['kind'] != 't3':
2261 | continue
2262 | data = item['data']
2263 | if (data.get('ups') or data.get('score') or
2264 | 0) < min_ups and (data.get('num_comments') or 0) < min_cmts:
2265 | continue
2266 | article: dict = {'source': source}
2267 | title: str = data['title']
2268 | href: str = data['permalink']
2269 | url: str = add_host(href, host)
2270 | raw_time: str = data['created_utc']
2271 | # 1564420248
2272 | ts_publish = ttime(raw_time, tzone=0)
2273 | desc: str = data.get('author') or ''
2274 | article['ts_publish'] = ts_publish
2275 | article['title'] = title
2276 | article['url'] = url
2277 | article['desc'] = desc
2278 | article['url_key'] = get_url_key(article['url'])
2279 | articles.append(article)
2280 | except Exception:
2281 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2282 | break
2283 | logger.info(
2284 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2285 | )
2286 | return articles
2287 |
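# A standalone sketch (not part of the original module): the inclusion rule
# above as a predicate -- keep a post when it has enough upvotes OR enough
# comments.
def worth_keeping(data: dict, min_ups: int = 20, min_cmts: int = 10) -> bool:
    ups = data.get('ups') or data.get('score') or 0
    comments = data.get('num_comments') or 0
    return ups >= min_ups or comments >= min_cmts

# worth_keeping({'ups': 3, 'num_comments': 15})   -> True  (enough comments)
# worth_keeping({'score': 5, 'num_comments': 2})  -> False (filtered out)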
2288 |
2289 | # @register_online
2290 | # @register_history
2291 | # @register_test
2292 | async def codetengu() -> list:
2293 | """码天狗"""
2294 | source: str = "码天狗"
2295 | articles: list = []
2296 | start_page: int = 1
2297 | # max_page: int = 999
2298 | max_page: int = 1
2299 | api: str = 'https://weekly.codetengu.com/issues'
2300 | params: dict = {'page': 1}
2301 | host: str = 'https://weekly.codetengu.com/'
2302 |
2303 | for page in range(start_page, max_page + 1):
2304 | params['page'] = page
2305 | r = await req.get(
2306 | api,
2307 | params=params,
2308 | ssl=False,
2309 | # proxy=proxy,
2310 | retry=1,
2311 | timeout=10,
2312 | headers={
2313 | 'Referer': api,
2314 | 'User-Agent': CHROME_PC_UA
2315 | },
2316 | )
2317 | if not r:
2318 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2319 | return articles
2320 | scode: str = r.content.decode('u8', 'ignore')
2321 | items: list = fromstring(scode).cssselect('.item__list > li.item')
2322 | if not items:
2323 | break
2324 | if max_page > 1:
2325 | logger.info(
2326 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2327 | )
2328 | for item in items:
2329 | try:
2330 | article: dict = {'source': source}
2331 | title: str = item.cssselect('.item__title')[0].text
2332 | href: str = item.cssselect('a')[0].get('href', '')
2333 | cover: str = null_tree.css(item, 'img').get('src', '')
2334 | cover = add_host(cover, host)
2335 | url: str = add_host(href, host)
2336 | desc: str = null_tree.css(item, '.item__title').text_content()
2337 | raw_time: str = null_tree.css(item, 'time.published').get(
2338 | 'datetime', '1970-01-01')
2339 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
2340 | article['ts_publish'] = ts_publish
2341 | article['title'] = title
2342 | article['cover'] = cover
2343 | article['desc'] = shorten_desc(desc)
2344 | article['url'] = url
2345 | article['url_key'] = get_url_key(article['url'])
2346 | articles.append(article)
2347 | except Exception:
2348 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2349 | break
2350 | logger.info(
2351 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2352 | )
2353 | return articles
2354 |
2355 |
2356 | @register_online
2357 | # @register_history
2358 | # @register_test
2359 | async def pychina() -> list:
2360 | """蠎周刊"""
2361 | source: str = "蠎周刊"
2362 | articles: list = []
2363 | limit: int = 5
2364 | api: str = 'http://weekly.pychina.org/archives.html'
2365 | host: str = 'http://weekly.pychina.org/'
2366 |
2367 | r = await req.get(
2368 | api,
2369 | ssl=False,
2370 | # proxy=proxy,
2371 | retry=1,
2372 | timeout=10,
2373 | headers={
2374 | 'Referer': '',
2375 | 'User-Agent': CHROME_PC_UA
2376 | },
2377 | )
2378 | if not r:
2379 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2380 | return articles
2381 | scode: str = r.content.decode('u8', 'ignore')
2382 | items: list = fromstring(scode).cssselect('#content li')
2383 | for item in items[:limit]:
2384 | try:
2385 | article: dict = {'source': source}
2386 | title_href = item.cssselect('a[title]')
2387 | if not title_href:
2388 | continue
2389 | title: str = title_href[0].text.strip()
2390 | href: str = title_href[0].get('href', '')
2391 | url: str = add_host(href, host)
2392 | raw_time: str = null_tree.css(item, 'sup').text
2393 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %H:%M'))
2394 | article['ts_publish'] = ts_publish
2395 | article['title'] = title
2396 | article['cover'] = ''
2397 | article['desc'] = ''
2398 | article['url'] = url
2399 | article['url_key'] = get_url_key(article['url'])
2400 | articles.append(article)
2401 | except Exception:
2402 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2403 | break
2404 | logger.info(
2405 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2406 | )
2407 | return articles
2408 |
2409 |
2410 | @register_online
2411 | # @register_history
2412 | # @register_test
2413 | async def xiaoruicc() -> list:
2414 | """峰云就她了"""
2415 | source: str = "峰云就她了"
2416 | articles: list = []
2417 | start_page: int = 1
2418 | max_page: int = 1
2419 | # max_page: int = 999
2420 | api: str = 'http://xiaorui.cc/archives/category/python'
2421 | host: str = 'http://xiaorui.cc/'
2422 |
2423 | for page in range(start_page, max_page + 1):
2424 | api_url = f'{api}{"/page/" + str(page) if page != 1 else ""}/'
2425 | r = await req.get(
2426 | api_url,
2427 | ssl=False,
2428 | # proxy=proxy,
2429 | retry=2,
2430 | timeout=8,
2431 | headers={
2432 | 'Referer': api_url,
2433 | 'User-Agent': CHROME_PC_UA
2434 | },
2435 | )
2436 | if not r:
2437 | if getattr(r, 'status_code', None) != 404:
2438 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2439 | return articles
2440 | scode: str = r.content.decode('u8', 'ignore')
2441 | items: list = fromstring(scode).cssselect('.content-area>article')
2442 | if not items:
2443 | break
2444 | if max_page > 1:
2445 | logger.info(
2446 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2447 | )
2448 | for item in items:
2449 | try:
2450 | article: dict = {'source': source}
2451 | title_href = item.cssselect('.entry-title>a')
2452 | if not title_href:
2453 | continue
2454 | title: str = title_href[0].text.strip()
2455 | href: str = title_href[0].get('href', '')
2456 | url: str = add_host(href, host)
2457 | desc: str = null_tree.css(
2458 | item, '.entry-summary>*:first-child').text_content()
2459 | raw_time: str = null_tree.css(item, 'time.published').get(
2460 | 'datetime', '1970-01-01')
2461 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
2462 | article['ts_publish'] = ts_publish
2463 | article['title'] = title
2464 | article['cover'] = ''
2465 | article['desc'] = shorten_desc(desc)
2466 | article['url'] = url
2467 | article['url_key'] = get_url_key(article['url'])
2468 | articles.append(article)
2469 | except Exception:
2470 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2471 | break
2472 | logger.info(
2473 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2474 | )
2475 | return articles
2476 |
2477 |
2478 | @register_online
2479 | # @register_history
2480 | # @register_test
2481 | async def medium_python() -> list:
2482 | """Medium"""
2483 | source: str = 'Medium'
2484 | articles: list = []
2485 | limit = 10
2486 | seed = 'https://medium.com/feed/tag/python'
2487 | # fetch via the external relay to avoid timeouts
2488 | scode = await outlands_request({
2489 | 'method': 'get',
2490 | 'url': seed,
2491 | }, 'u8')
2492 | items = fromstring(scode.encode('utf-8'),
2493 | parser=XMLParser()).xpath('//channel/item')
2494 | now = ttime()
2495 | for item in items[:limit]:
2496 | try:
2497 | article: dict = {'source': source}
2498 | guid = item.xpath('./guid/text()')
2499 | title = item.xpath('./title/text()')
2500 | description = item.xpath('./description/text()')
2501 | author = item.xpath("./*[local-name()='creator']/text()")
2502 | pubDate = item.xpath("./*[local-name()='updated']/text()")
2503 | if not (guid and title):
2504 | continue
2505 | url = guid[0]
2506 | title = title[0]
2507 | if description:
2508 | desc = fromstring(description[0]).text_content()
2509 | # strip the <...> tags
2510 | desc = re.sub('<[^>]*>', ' ', desc)
2511 | # keep only the text before the first line break
2512 | desc = shorten_desc(desc)
2513 | else:
2514 | desc = ''
2515 | if 'Continue reading on' in desc:
2516 | continue
2517 | if author:
2518 | desc = f'[{author[0]}] {desc}'
2519 | if not ensure_cn_en(f'{title}{desc}'):
2520 | continue
2521 | if pubDate:
2522 | raw_pub_date = pubDate[0]
2523 | # Wed, 22 May 2019 01:47:44 +0000
2524 | raw_pub_date = re.sub(r'\..*', '', raw_pub_date).strip()
2525 | ts_publish = ttime(
2526 | ptime(raw_pub_date, fmt='%Y-%m-%dT%H:%M:%S') + 3600 * 8)
2527 | else:
2528 | ts_publish = now
2529 | article['ts_publish'] = ts_publish
2530 | article['title'] = title
2531 | article['desc'] = desc
2532 | article['url'] = url
2533 | article['url_key'] = get_url_key(article['url'])
2534 | articles.append(article)
2535 | except Exception:
2536 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2537 | break
2538 | logger.info(
2539 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2540 | )
2541 | return articles
2542 |
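# A standalone sketch (not part of the original module): the description
# cleanup used above, with a hypothetical stand-in for the project's
# shorten_desc helper (which, per the comments, keeps only the text before
# the first line break).
import re

def clean_rss_desc(raw_html: str) -> str:
    text = re.sub('<[^>]*>', ' ', raw_html)                      # drop markup
    first_line = text.strip().splitlines()[0] if text.strip() else ''
    return ' '.join(first_line.split())                          # squeeze spaces

# clean_rss_desc('<p>Async tips<br>for Python</p>\nContinue reading on Medium')
# -> 'Async tips for Python'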
--------------------------------------------------------------------------------
/newspaper/loggers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pathlib
3 | from logging.handlers import RotatingFileHandler
4 |
5 | log_dir = pathlib.Path(__file__).absolute().parent.parent / 'logs'
6 |
7 |
8 | def init_logger(logger_name=None,
9 | file_name='server.log',
10 | max_mb=50,
11 | backupCount=1):
12 | if not log_dir.is_dir():
13 | log_dir.mkdir()
14 | formatter_str = (
15 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s"
16 | )
17 | datefmt = "%Y-%m-%d %H:%M:%S"
18 | formatter = logging.Formatter(formatter_str, datefmt=datefmt)
19 | logger = logging.getLogger(logger_name)
20 | logger.setLevel(logging.INFO)
21 | stream_hl = logging.StreamHandler()
22 | stream_hl.setFormatter(formatter)
23 | stream_hl.setLevel(logging.INFO)
24 | logger.addHandler(stream_hl)
25 |
26 | file_hl = RotatingFileHandler(filename=log_dir / file_name,
27 | maxBytes=1024 * 1024 * max_mb,
28 | backupCount=backupCount,
29 | encoding='utf-8')
30 | file_hl.setFormatter(formatter)
31 | file_hl.setLevel(logging.INFO)
32 | logger.addHandler(file_hl)
33 | return logger
34 |
35 |
36 | logger = init_logger('server', 'server.log')
37 | spider_logger = init_logger('spider_logger',
38 | 'spider.log',
39 | max_mb=5,
40 | backupCount=1)
41 |
--------------------------------------------------------------------------------
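A minimal usage sketch for init_logger above (assuming the `newspaper` package is importable from the project root; the `crawler.log` file name is made up for illustration):

    from newspaper.loggers import init_logger

    # writes to ./logs/crawler.log (rotating at 10 MB, two backups) and to stderr
    crawler_logger = init_logger('crawler', 'crawler.log', max_mb=10, backupCount=2)
    crawler_logger.info('spider run started')

Note that calling init_logger twice with the same logger_name would attach duplicate handlers, which is why the module builds its two loggers once at import time.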
/newspaper/models.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import re
3 | import sqlite3
4 | import typing
5 | import warnings
6 | from datetime import datetime
7 |
8 | import aiomysql
9 | from async_lru import alru_cache
10 | from torequests.utils import ptime, time, ttime
11 |
12 | from .loggers import logger
13 | from .crawler.sources import content_sources_dict
14 |
15 | # INSERT IGNORE still emits warnings, and wrapping every insert in try/except is worse, so just silence them all...
16 | warnings.filterwarnings('ignore', category=aiomysql.Warning)
17 |
18 |
19 | class Storage(object, metaclass=abc.ABCMeta):
20 | """存储器抽象. 统一参数对文章数据库进行增删改查."""
21 | max_limit = 100  # cap limit so a single query cannot fetch too many rows at once
22 | articles_table_columns = ('url_key', 'title', 'url', 'cover', 'desc',
23 | 'source', 'level', 'review', 'ts_publish',
24 | 'ts_create', 'ts_update')
25 |
26 | def format_output_articles(self, articles: typing.Sequence[dict]):
27 | for article in articles:
28 | for key, value in article.items():
29 | if isinstance(value, datetime):
30 | article[key] = str(value)
31 | return articles
32 |
33 | @staticmethod
34 | def ensure_articles(articles: typing.Sequence[dict]) -> list:
35 | valid_articles = []
36 | # ensure_keys = ("url_key", "title", "cover", "desc", "source",
37 | # "review", "ts_publish", "lang")
38 | keys_set = None
39 | now = ttime()
40 | before_3_day_0_0 = f'{ttime(time.time() - 86400*3)[:10]} 00:00:00'
41 | for article in articles:
42 | if not isinstance(article, dict):
43 | continue
44 | if not keys_set:
45 | keys_set = set(article.keys())
46 | else:
47 | # keys that differ from the first article's break executemany, so skip such articles
48 | if set(article.keys()) != keys_set:
49 | continue
50 | # the following fields must all be present before the article can be stored
51 | source = content_sources_dict.get(article['source'])
52 | if not source:
53 | continue
54 | # skip the whole article when a required field is missing or empty
55 | if not all(article.get(ensure_key) for ensure_key in ('url_key', 'title')):
56 | continue
57 | article.setdefault('cover', '')
58 | article.setdefault('desc', '')
59 | article.setdefault('source', 'unknown')
60 | article.setdefault('review', '')
61 | article.setdefault('level', source.get('level', 3))
62 | article.setdefault('lang', source.get('lang', 'CN'))
63 | article.setdefault('ts_publish', '1970-01-01 08:00:01')
64 | article['lang'] = article['lang'].upper()
65 | article['desc'] = re.sub(
66 | r'
--------------------------------------------------------------------------------
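A side note on the key-consistency check in ensure_articles above: executemany can only bind one parameter shape, so every article dict must expose the same keys. A trimmed-down sqlite3 sketch of that constraint (the real table has more columns and is also written through aiomysql):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute(
        'CREATE TABLE articles (url_key TEXT PRIMARY KEY, title TEXT, source TEXT)')
    rows = [
        {'url_key': 'k1', 'title': 'PEP 594', 'source': 'demo'},
        {'url_key': 'k2', 'title': 'asyncio tips', 'source': 'demo'},
    ]  # identical keys in every dict, so one statement covers them all
    conn.executemany(
        'INSERT OR IGNORE INTO articles (url_key, title, source) '
        'VALUES (:url_key, :title, :source)', rows)
    conn.commit()
    print(conn.execute('SELECT COUNT(*) FROM articles').fetchone()[0])  # 2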