├── .gitignore
├── LICENSE
├── Pipfile
├── README.md
├── bin
│   ├── .config
│   │   └── systemd
│   │       └── user
│   │           ├── newspaper_spider.service
│   │           ├── newspaper_spider.timer
│   │           └── newspaper_web.service
│   ├── git-sync.sh
│   ├── obsoleted
│   │   ├── restart.sh
│   │   ├── start.sh
│   │   └── stop.sh
│   └── update_systemd_config.py
├── crawl_history.py
├── crawl_online.py
├── crawl_test.py
├── db_backup.py
├── db_sql.py
├── deploy.md
├── newspaper
│   ├── api.py
│   ├── config.py
│   ├── crawler
│   │   ├── main.py
│   │   ├── sources.py
│   │   └── spiders.py
│   ├── loggers.py
│   ├── models.py
│   ├── server.py
│   ├── static
│   │   └── favicon.ico
│   ├── templates
│   │   ├── articles.html
│   │   └── daily_python.html
│   ├── utils.py
│   └── views.py
└── run_server.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # local test python script
107 | temp.py
108 | .vscode
109 | logs/
110 | *.sqlite
111 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 pyld
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = false
4 | name = "pypi"
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | uvloop = {version = "*",sys_platform = "!= 'win32'"}
10 | torequests = ">=5.0.10"
11 | starlette = "*"
12 | uvicorn = ">=0.11.8"
13 | aiomysql = "*"
14 | lxml = "*"
15 | cssselect = "*"
16 | async-lru = "*"
17 | aiofiles = "*"
18 | jinja2 = "*"
19 |
20 | [requires]
21 | python_version = "3.7"
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # python-newspaper
2 |
3 | ## The server will be shut down on 2021-04-01 and not renewed. A sqlite backup (taken 2021-03-07) is available in Releases. Subscriptions may later continue through another project, Watchdogs.
4 |
5 | ### Timeline
6 | - [For English-only readers](https://www.clericpy.top/newspaper/articles.query.html?lang=EN)
7 | - [For Chinese readers](https://www.clericpy.top/newspaper/articles.query.html?lang=CN)
8 | - [For readers of both](https://www.clericpy.top/newspaper/articles.query.html)
9 |
10 | ### RSS daily digest
11 | - [For English-only readers](https://www.clericpy.top/newspaper/daily.python.list.rss.en)
12 | - [For Chinese readers](https://www.clericpy.top/newspaper/daily.python.list.rss.cn)
13 | - [For readers of both](https://www.clericpy.top/newspaper/daily.python.list.rss.any)
14 |
15 | ### Current progress:
16 |
17 | - [x] Buy a server
18 | - [x] Prepare a backup server
19 | - [x] Confirm content sources
20 | - [x] Set up the server
21 | - [x] Development
22 | - [x] Testing
23 | - [x] Launch
24 | - [x] Add more content sources
25 | - [ ] Static Python daily page on GitHub Pages + RSS
26 | - [ ] Hand-curated daily/weekly digest pushed to the WeChat official account
27 | - [ ] Filtering by subscription source
28 |
29 | ### Content sources
30 |
31 | **What earns a source a high score**:
32 |
33 | 1. Mostly original content
34 | 2. Relatively low posting frequency
35 | 3. High article quality
36 | 4. Information-dense, e.g. weeklies
37 | 5. Widely known
38 |
39 |
40 |
41 | * 收录进度: 37 / 37
42 |
43 | > = 待收录 | √ 已收录 | X 不收录 | - 入库不追更
44 |
45 | | 序号 | 名称 | 评分 | 语言 | 收录 | 描述 |
46 | | ---- | ---- | ---- | ---- | ---- | ---- |
47 | | 1 | [Python Software Foundation News](https://pyfound.blogspot.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Software+Foundation+News) | [墙] 来自 Python 软件基金会的消息 |
48 | | 2 | [Python Weekly](https://www.pythonweekly.com/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python+Weekly) | 必备周报 |
49 | | 3 | [PyCoder's Weekly](https://pycoders.com/issues) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=PyCoder%27s+Weekly) | 必备周报 |
50 | | 4 | [Import Python](https://importpython.com/newsletter/archive/) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Import+Python) | 必备周报, 2019.1.11 停更了, 希望早日康复~ |
51 | | 5 | [Awesome Python Newsletter](https://python.libhunt.com/newsletter/archive) | 5 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Awesome+Python+Newsletter) | 必备周报 |
52 | | 6 | [Real Python](https://realpython.com/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Real+Python) | 文章质量高, 更新较少 |
53 | | 7 | [Planet Python](https://planetpython.org) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Planet+Python) | 官方推荐的博客, 收录了很多博主 |
54 | | 8 | [Julien Danjou](https://julien.danjou.info) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Julien+Danjou) | 文章质量不错, 保持更新 |
55 | | 9 | [Doug Hellmann](https://doughellmann.com/blog/) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Doug+Hellmann) | 大名鼎鼎, 文章质量很高 |
56 | | 10 | [The Mouse Vs. The Python](https://www.blog.pythonlibrary.org) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Mouse+Vs.+The+Python) | 文章质量不错 |
57 | | 11 | [InfoQ](https://www.infoq.cn/topic/python) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=InfoQ) | 原创/译文的质量不错 |
58 | | 12 | [Jeff Knupp](https://jeffknupp.com/) | 4 | EN | X | [墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录 |
59 | | 13 | [Hacker News](https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story) | 4 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Hacker+News) | 大名鼎鼎的 HN |
60 | | 14 | [Python Insider](https://blog.python.org/) | 3 | EN | X | 官方开发进度, 被官博和 planetPython 包含, 所以不需要收录. |
61 | | 15 | [Brett Cannon](https://snarky.ca/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Brett+Cannon) | 核心开发者 |
62 | | 16 | [Encode](https://www.encode.io/) | 3 | EN | X | 知名 Python 开源组织, 文章太少, 暂不收录 |
63 | | 17 | [机器之心](https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%9C%BA%E5%99%A8%E4%B9%8B%E5%BF%83) | 知名公众号 |
64 | | 18 | [依云's Blog](https://blog.lilydjwg.me/tag/python?page=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E4%BE%9D%E4%BA%91%27s+Blog) | 文章质量很高 |
65 | | 19 | [DEV Community](https://dev.to/t/python/latest) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=DEV+Community) | 算是个挺好的社区, post 也都不太水 |
66 | | 20 | [Python猫](https://zhuanlan.zhihu.com/pythonCat) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%8C%AB) | 2018 年末比较热情的博主, 原创 + 优质译文 |
67 | | 21 | [Python之美](https://zhuanlan.zhihu.com/python-cn) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%BE%8E) | 早期文章较多, 创业以后更新不太多了 |
68 | | 22 | [静觅](https://cuiqingcai.com/category/technique/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E9%9D%99%E8%A7%85) | 崔庆才的个人博客, 保持更新的原创博主 |
69 | | 23 | [推酷(中文)](https://www.tuicool.com/topics/11130000?st=0&lang=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E4%B8%AD%E6%96%87%29) | 推文类站点. 按热门排序 |
70 | | 24 | [推酷(英文)](https://www.tuicool.com/topics/11130000?st=0&lang=2) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E8%8B%B1%E6%96%87%29) | 推文类站点. 按热门排序 |
71 | | 25 | [开发者头条](https://toutiao.io/tags/python?type=latest) | 3 | CN | X | 推文类站点, 但是没有发布时间, 暂不收录 |
72 | | 26 | [稀土掘金](https://juejin.im/tag/Python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%A8%80%E5%9C%9F%E6%8E%98%E9%87%91) | 推文类站点. 按热门排序 |
73 | | 27 | [Python部落](https://python.freelycode.com/contribution/list/0?page_no=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E9%83%A8%E8%90%BD) | 推文+译文 |
74 | | 28 | [miguelgrinberg](https://blog.miguelgrinberg.com/index) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=miguelgrinberg) | Web 开发相关的内容挺多, 质量较高 |
75 | | 29 | [Ned Batchelder](https://nedbatchelder.com/blog/tag/python.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Ned+Batchelder) | 热门博主。planetpython 也有 |
76 | | 30 | [Full Stack Python](https://www.fullstackpython.com/blog.html) | 3 | EN | X | 热门博主。planetpython 有了, 文章比较少, 暂不收录 |
77 | | 31 | [Eli Bendersky's website](https://eli.thegreenplace.net/tag/python) | 3 | EN | X | 值得一看,planetpython 有, 暂不收录 |
78 | | 32 | [Manjusaka](https://manjusaka.itscoder.com/tags/Python/) | 3 | CN | X | 原创还不错, 但是文章较少, 暂不收录 |
79 | | 33 | [Python程序员](https://zhuanlan.zhihu.com/pythoncxy) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%A8%8B%E5%BA%8F%E5%91%98) | 关注破万的知乎专栏 |
80 | | 34 | [Python头条](https://zhuanlan.zhihu.com/c_111369541) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E5%A4%B4%E6%9D%A1) | 关注破万的知乎专栏 |
81 | | 35 | [the5fire的技术博客](https://www.the5fire.com/category/python/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=the5fire%E7%9A%84%E6%8A%80%E6%9C%AF%E5%8D%9A%E5%AE%A2) | 保持更新的热门中文博主 |
82 | | 36 | [Python之禅](https://foofish.net/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%A6%85) | 文章较基础, 质量不错 |
83 | | 37 | [V2EX](https://www.v2ex.com/go/python) | 3 | CN | X | 社区类, api 失效, web 端乱七八糟的, 不收录 |
84 | | 38 | [伯乐在线](http://python.jobbole.com/all-posts/) | 3 | CN | X | 有点类似推酷, 质量参差不齐. HTTP ERROR 503 |
85 | | 39 | [Python 3 Module of the Week](https://pymotw.com/3/) | 3 | EN | X | 看起来不怎么更新了, 暂不收录 |
86 | | 40 | [The Invent with Python Blog](https://inventwithpython.com/blog/index.html) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=The+Invent+with+Python+Blog) | 感觉不错 |
87 | | 41 | [Armin Ronacher's Thoughts and Writings](http://lucumr.pocoo.org/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Armin+Ronacher%27s+Thoughts+and+Writings) | Flask 作者 Armin Ronacher |
88 | | 42 | [aio-libs](https://groups.google.com/forum/#!forum/aio-libs) | 3 | EN | X | 知名 Python 开源组织, 不过没有文章类的 post |
89 | | 43 | [码农周刊](https://weekly.manong.io/issues/) | 3 | CN | X | 课外读物, 非 Python 主题, 暂不收录 |
90 | | 44 | [编程派](http://codingpy.com/) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E7%BC%96%E7%A8%8B%E6%B4%BE) | 原创+译文 |
91 | | 45 | [峰云就她了](http://xiaorui.cc/archives/category/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E5%B3%B0%E4%BA%91%E5%B0%B1%E5%A5%B9%E4%BA%86) | 原创博客, 质量比较不错 |
92 | | 46 | [Dan Bader](https://dbader.org/blog/) | 3 | EN | X | 一年不更新了, 先不收录了 |
93 | | 47 | [Pythonic Perambulations](https://jakevdp.github.io/) | 3 | EN | X | 最后更新 Thu 13 September 2018, 暂不收录 |
94 | | 48 | [开源中国翻译](https://www.oschina.net/translate/tag/python) | 3 | CN | X | 入库留着吧, 估计不更了, 暂不收录 |
95 | | 49 | [Trey Hunner](https://treyhunner.com/blog/archives/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Trey+Hunner) | Help developers level-up their Python skills |
96 | | 50 | [Python Central](https://www.pythoncentral.io/) | 3 | EN | X | 不更新了, 暂不收录 |
97 | | 51 | [Inside the Head of PyDanny](https://www.pydanny.com/) | 3 | EN | X | 不更新了, 暂不收录 |
98 | | 52 | [华蟒用户组,CPyUG](https://groups.google.com/forum/#!forum/python-cn) | 3 | EN | X | [墙] 社区类, 自己看看就好, 暂不收录 |
99 | | 53 | [Treehl](https://family-treesy.github.io/tags/PYTHON/) | 3 | CN | X | 文章较基础, 久不更新, 暂不收录 |
100 | | 54 | [蠎周刊](http://weekly.pychina.org) | 4 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E8%A0%8E%E5%91%A8%E5%88%8A) | 各种 weekly 中译版 |
101 | | 55 | [zzzeek](https://techspot.zzzeek.org/) | 3 | EN | X | 2016 年后停更了 |
102 | | 56 | [Yu’s blog](https://gofisher.github.io/) | 3 | CN | X | 原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了 |
103 | | 57 | [程序师](http://www.techug.com/tag/python) | 3 | CN | X | 原创较少, 文章较旧 |
104 | | 58 | [一根笨茄子](http://blog.guoyb.com/tags/Python/) | 3 | CN | X | 文章更新较少, 质量参差 |
105 | | 59 | [追梦人物](https://www.zmrenwu.com/) | 2 | CN | X | 像个学习博客 |
106 | | 60 | [anshengme](https://blog.ansheng.me/) | 2 | CN | X | 质量一般 |
107 | | 61 | [Pegasus](http://ningning.today/categories/python/) | 2 | CN | X | 不怎么更新 |
108 | | 62 | [FunHacks](https://funhacks.net/categories/Python/) | 2 | CN | X | 太久不更新了, 不过python 之旅还行 |
109 | | 63 | [Peter Norvig's essays](http://norvig.com/) | 2 | EN | X | 这排版驾驭不了... |
110 | | 64 | [Peterbe.com](https://www.peterbe.com/plog/) | 2 | EN | X | 不是太值得收录 |
111 | | 65 | [Python Tips](https://pythontips.com/) | 2 | EN | X | 很火, 但我不喜欢 |
112 | | 66 | [脚本之家](https://www.jb51.net/list/list_97_1.htm) | 2 | CN | X | 文章的质量啊~~~ |
113 | | 67 | [开源中国搜索](https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1) | 2 | CN | X | 质量不太高 |
114 | | 68 | [伯乐在线头条](http://top.jobbole.com/tag/python/?sort=latest) | 2 | CN | X | 停更 |
115 | | 69 | [代码片段](http://www.phpxs.com/code/python) | 2 | CN | X | 文章太老了, 停更了 |
116 | | 70 | [segmentfault](https://segmentfault.com/t/python/blogs) | 2 | CN | X | 文章质量 |
117 | | 71 | [Python China](http://python-china.org/api/topics/timeline) | 2 | CN | X | 欠费网站挂了 |
118 | | 72 | [麦穗技术](http://www.58maisui.com/category/python/) | 2 | CN | X | 网站挂了 |
119 | | 73 | [CSDN](https://so.csdn.net/so/search/s.do?q=python&t=blog&u=) | 1 | CN | X | 文章质量啊~~~ |
120 | | 74 | [Stack Overflow](https://stackoverflow.com/?tab=hot) | 3 | EN | X | 已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录 |
121 | | 75 | [Reddit](https://www.reddit.com/r/Python/top/) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Reddit) | 知名社区. 质量参差, 收录每日 ups>=20 |
122 | | 76 | [码天狗](https://weekly.codetengu.com/issues) | 4 | CN | X | 综合类周报, 2018-11-23 之后不更了. 挂了, 下线. |
123 | | 77 | [Medium](https://medium.com/tag/python) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Medium) | Medium 的 Python tag, 收录 RSS |
124 |
125 |
126 |
127 |
128 | ### Disclaimer
129 |
130 | 1. Non-profit project; the main motivation is to put Python to practical use and to give people a channel for learning Python
131 | 2. If any content infringes your rights, please leave a note in Issues and it will be taken down
132 | 3. PRs are welcome, as are Issues recommending high-quality content sources
133 | 4. For now this is a Chinese-language project; with limited time, English-speaking readers are not a priority yet
134 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_spider.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper spider service
3 |
4 | [Service]
5 | Type=simple
6 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py
7 | WorkingDirectory=/root/newspaper
8 |
9 | [Install]
10 | WantedBy=multi-user.target
11 | WantedBy=network-online.target
12 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_spider.timer:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper spider timer
3 |
4 | [Timer]
5 | OnBootSec=10min
6 | OnUnitActiveSec=15min
7 | Unit=newspaper_spider.service
8 |
9 | [Install]
10 | WantedBy=multi-user.target
11 | WantedBy=network-online.target
12 |
--------------------------------------------------------------------------------
/bin/.config/systemd/user/newspaper_web.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=newspaper web service
3 |
4 | [Service]
5 | Type=simple
6 | ExecStart=/usr/local/bin/pipenv run python run_server.py
7 | WorkingDirectory=/root/newspaper
8 | [Install]
9 | WantedBy=multi-user.target
10 | WantedBy=network-online.target
11 |
--------------------------------------------------------------------------------
/bin/git-sync.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | git fetch; git reset --hard origin/master
3 |
--------------------------------------------------------------------------------
/bin/obsoleted/restart.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$(cd `dirname $0`; pwd)
3 | cd $DIR
4 | sh stop.sh
5 | sh start.sh > /dev/null
6 |
--------------------------------------------------------------------------------
/bin/obsoleted/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$(cd `dirname $0`/..; pwd)
3 | cd $DIR
4 | nohup pipenv run python run_server.py &
5 | echo "server started"
6 | echo
7 |
--------------------------------------------------------------------------------
/bin/obsoleted/stop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ps aux|grep 'newspaper-'|grep 'run_server.py'|awk '{print $2}'|xargs kill
3 | echo "server stopped"
4 | echo
5 |
--------------------------------------------------------------------------------
/bin/update_systemd_config.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | user_systemd_dir = pathlib.Path.home() / '.config/systemd/user'
3 | if not user_systemd_dir.is_dir():
4 |     user_systemd_dir.mkdir(parents=True)
5 |
6 | newspaper_product_dir = pathlib.Path(
7 | __file__).absolute().parent.parent.absolute()
8 |
9 | # web service unit
10 |
11 | newspaper_web_service = fr'''
12 | [Unit]
13 | Description=newspaper web service
14 |
15 | [Service]
16 | Type=simple
17 | ExecStart=/usr/local/bin/pipenv run python run_server.py
18 | WorkingDirectory={newspaper_product_dir}
19 | [Install]
20 | WantedBy=multi-user.target
21 | WantedBy=network-online.target
22 | '''
23 | newspaper_web_service_fp = user_systemd_dir / 'newspaper_web.service'
24 | newspaper_web_service_fp.write_text(newspaper_web_service, encoding='utf-8')
25 |
26 | # spider service unit
27 |
28 | newspaper_spider_service = fr'''
29 | [Unit]
30 | Description=newspaper spider service
31 |
32 | [Service]
33 | Type=simple
34 | ExecStart=/usr/local/bin/pipenv run python crawl_online.py
35 | WorkingDirectory={newspaper_product_dir}
36 |
37 | [Install]
38 | WantedBy=multi-user.target
39 | WantedBy=network-online.target
40 | '''
41 | newspaper_spider_service_fp = user_systemd_dir / 'newspaper_spider.service'
42 | newspaper_spider_service_fp.write_text(newspaper_spider_service,
43 | encoding='utf-8')
44 |
45 | # spider timer unit
46 |
47 | newspaper_spider_timer = r'''
48 | [Unit]
49 | Description=newspaper spider timer
50 |
51 | [Timer]
52 | OnBootSec=10min
53 | OnUnitActiveSec=15min
54 | Unit=newspaper_spider.service
55 |
56 | [Install]
57 | WantedBy=multi-user.target
58 | WantedBy=network-online.target
59 | '''
60 | newspaper_spider_timer_fp = user_systemd_dir / 'newspaper_spider.timer'
61 | newspaper_spider_timer_fp.write_text(newspaper_spider_timer, encoding='utf-8')
62 |
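63 | # The unit files written above still have to be enabled and started by hand;
64 | # deploy.md lists the commands, roughly:
65 | #
66 | #   systemctl --user enable newspaper_web.service; systemctl --user start newspaper_web.service
67 | #   systemctl --user enable newspaper_spider.timer; systemctl --user start newspaper_spider.timer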
--------------------------------------------------------------------------------
/crawl_history.py:
--------------------------------------------------------------------------------
1 | from newspaper.crawler.main import history_workflow
2 | import asyncio
3 | """
4 | Script for crawling historical articles
5 | 1. Meant to be run locally
6 | 2. Runs the history-crawling tasks and stores the articles in the database
7 | 3. Any content-source function decorated with register_history is picked up and called automatically
8 | 4. A history crawl is usually only needed when a source is first added; after that online_spiders keeps it up to date
9 | """
10 |
11 | if __name__ == "__main__":
12 | loop = asyncio.get_event_loop()
13 | loop.run_until_complete(history_workflow())
14 |
--------------------------------------------------------------------------------
/crawl_online.py:
--------------------------------------------------------------------------------
1 | def main():
2 | """
3 |     Online crawling script
4 |     1. Can run either locally or on the server; intended as a crontab task
5 |     2. Runs the regular crawling tasks and stores the articles in the database
6 |     3. Any content-source function decorated with register_online is picked up and called automatically
7 | """
8 | from newspaper.crawler.main import online_workflow
9 | import asyncio
10 | loop = asyncio.get_event_loop()
11 | loop.run_until_complete(online_workflow())
12 |
13 |
14 | if __name__ == "__main__":
15 | main()
16 |
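17 | # A rough sketch of what a registered spider looks like (real spiders live in
18 | # newspaper/crawler/spiders.py; `example_weekly` below is a made-up name, and the
19 | # dict keys follow the schema checked by main.test_spider_workflow):
20 | #
21 | #   @register_online
22 | #   async def example_weekly():
23 | #       """Example Weekly"""  # __doc__ is used as the source name in the logs
24 | #       url = 'https://example.com/issues/1'
25 | #       return [{
26 | #           'source': 'Example Weekly',
27 | #           'title': 'Issue 1',
28 | #           'url': url,
29 | #           'url_key': get_url_key(url),
30 | #           'desc': '',
31 | #       }]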
--------------------------------------------------------------------------------
/crawl_test.py:
--------------------------------------------------------------------------------
1 | def test():
2 | from newspaper.crawler.main import test_spider_workflow
3 | import asyncio
4 | loop = asyncio.get_event_loop()
5 | loop.run_until_complete(test_spider_workflow())
6 |
7 |
8 | if __name__ == "__main__":
9 | test()
10 |
--------------------------------------------------------------------------------
/db_backup.py:
--------------------------------------------------------------------------------
1 | #! pipenv run python
2 | """
3 | Pull data from the online API into a local sqlite backup
4 | """
5 | import re
6 |
7 | from torequests import tPool
8 | from torequests.utils import ttime, time
9 |
10 | from newspaper.models import Sqlite3Storage, logger
11 | from newspaper.config import ONLINE_HOST
12 |
13 |
14 | def fetch_articles(ts_start):
15 | req = tPool()
16 | api = f'https://{ONLINE_HOST}/newspaper/articles.query.json'
17 | next_url = ''
18 | start_params = {
19 | 'query': '',
20 | 'start_time': ts_start,
21 | 'end_time': '',
22 | 'source': '',
23 | 'lang': 'ANY',
24 | 'order_by': 'ts_update',
25 | 'sorting': 'asc',
26 | 'limit': '100',
27 | 'offset': '0',
28 | }
29 |
30 | while 1:
31 | params = {} if next_url else start_params
32 |         # request the first page when there is no next_url, otherwise follow next_url
33 | url = next_url or api
34 | r = req.get(url, params=params, retry=2, timeout=10)
35 | if not r.x:
36 |             logger.error(f'request failed: {r.x}, {r.text}')
37 | raise IOError
38 | rj = r.json()
39 | articles = rj.get('articles', [])
40 | if articles:
41 | yield articles
42 | next_url = rj.get('next_url', '')
43 | if not (articles and next_url):
44 |             # stop when there are no articles or no next page
45 |             logger.info(f'fetch_articles finished, last url: {url}')
46 | return
47 | next_url = re.sub('^/', f'https://{ONLINE_HOST}/', next_url)
48 |
49 |
50 | def get_ts_latest(cursor):
51 | cursor.execute('select max(ts_update) from articles')
52 | result = cursor.fetchone()[0]
53 | if result:
54 | return result
55 | else:
56 | return ttime(0)
57 |
58 |
59 | def main():
60 | db = Sqlite3Storage(file_path='backup.sqlite')
61 | db._ensure_article_table_exists()
62 | ts_latest = get_ts_latest(db.cursor)
63 | logger.info(f'sync articles from online api: ts_latest={ts_latest}')
64 | article_cnt = 0
65 |     for articles in fetch_articles(ts_latest):
66 | db.add_articles(articles)
67 | article_cnt += len(articles)
68 | logger.info(f'+ {len(articles)} articles => {article_cnt}')
69 | logger.info(f'+ {article_cnt} new articles.')
70 |
71 |
72 | if __name__ == "__main__":
73 | main()
74 | time.sleep(3)
75 |
--------------------------------------------------------------------------------
/db_sql.py:
--------------------------------------------------------------------------------
1 | #! pipenv run python
2 | """
3 | Run SQL against both the online (MySQL) and the local (sqlite) databases at the same time
4 | """
5 | import asyncio
6 | import traceback
7 | import logging
8 |
9 | from newspaper.config import init_db
10 | from newspaper.models import Sqlite3Storage, logger
11 |
12 |
13 | async def main():
14 | db = Sqlite3Storage(file_path='backup.sqlite')
15 | db._ensure_article_table_exists()
16 | mysql = init_db()
17 | logger.setLevel(logging.WARNING)
18 | while 1:
19 | # select count(*) from articles
20 | # select count(*) from articles where `desc` like '%本文分享 方巍%'
21 | sql = input('Input SQL:\n')
22 | if not sql:
23 | break
24 | try:
25 | print(sql)
26 | db.cursor.execute(sql)
27 | logger.warning(f'Sqlite3Storage: {db.cursor.fetchall()}')
28 | result = await mysql.execute(sql)
29 | logger.warning(f'MysqlStorage: {result}')
30 | except KeyboardInterrupt:
31 | break
32 | except Exception:
33 | traceback.print_exc()
34 |
35 |
36 | if __name__ == "__main__":
37 | asyncio.run(main())
38 |
--------------------------------------------------------------------------------
/deploy.md:
--------------------------------------------------------------------------------
1 |
2 | ## First deployment
3 | 0. install python 3.7+
4 | 1. git clone ...
5 | 2. pipenv install
6 | 3. python3.7 update_systemd_config.py
7 | 4. Create a JSON config file at /var/newspaper.conf (see the sketch at the end of this file)
8 |     1. {"anti_gfw": {"url": "fill in the URL of your firewall-bypass relay here; use http://localhost if you have none"}, "mysql_config": {"mysql_host": "", "mysql_port": 3306, "mysql_user": "", "mysql_password": "", "mysql_db": "db"}}
9 |     2. Alternatively, export the same JSON in the `newspaper_config` environment variable
10 | 5. systemctl --user enable newspaper_web.service; systemctl --user start newspaper_web.service
11 | 6. systemctl --user enable newspaper_spider.timer; systemctl --user start newspaper_spider.timer
12 | 7. Bind a domain name and configure nginx to front the relevant port, with SSL support
13 |
14 |
15 |
16 |
17 | ### vscode task: deploy-update script
18 | ```git co master ; git merge dev; git push; git co dev;ssh aliyun 'cd newspaper/bin;sh git-sync.sh;python3.7 update_systemd_config.py;systemctl daemon-reload;systemctl --user restart newspaper_web.service'```
19 |
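20 | ### Config sketch
21 | 
22 | A minimal sketch (not part of the repo's own scripts) for creating the /var/newspaper.conf described in step 4; every value below is a placeholder to replace with your own:
23 | 
24 | ```python
25 | import json
26 | import pathlib
27 | 
28 | # Placeholder values -- replace with real credentials for your environment.
29 | config = {
30 |     "anti_gfw": {"url": "http://localhost"},
31 |     "mysql_config": {
32 |         "mysql_host": "127.0.0.1",
33 |         "mysql_port": 3306,
34 |         "mysql_user": "root",
35 |         "mysql_password": "",
36 |         "mysql_db": "db",
37 |     },
38 | }
39 | # newspaper/config.py reads the `newspaper_config` env var first, then this file.
40 | pathlib.Path('/var/newspaper.conf').write_text(json.dumps(config))
41 | ```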
--------------------------------------------------------------------------------
/newspaper/api.py:
--------------------------------------------------------------------------------
1 | #! python3
2 |
3 | import pathlib
4 |
5 | from starlette.applications import Starlette
6 | from starlette.staticfiles import StaticFiles
7 | from starlette.templating import Jinja2Templates
8 |
9 | from .config import init_db, global_configs
10 | from .loggers import logger
11 |
12 | static_dir = pathlib.Path(__file__).absolute().parent / 'static'
13 | templates_dir = pathlib.Path(__file__).absolute().parent / 'templates'
14 |
15 | app = Starlette()
16 | app.mount('/static', StaticFiles(directory=str(static_dir)), name='static')
17 | app.config = global_configs
18 | app.logger = logger
19 | app.db = init_db()
20 | app.templates = Jinja2Templates(directory=str(templates_dir))
21 |
22 |
23 | @app.on_event('startup')
24 | async def _ensure_article_table_exists():
25 | await app.db._ensure_article_table_exists()
26 |
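27 | 
28 | # run_server.py is not shown in this section; a minimal sketch of a launcher,
29 | # assuming it simply hands `app` to uvicorn on the address that
30 | # newspaper/crawler/main.py's clear_cache() expects (127.0.0.1:9001), could be:
31 | #
32 | #   import uvicorn
33 | #   from newspaper.api import app
34 | #   uvicorn.run(app, host='127.0.0.1', port=9001)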
--------------------------------------------------------------------------------
/newspaper/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import pathlib
5 |
6 |
7 | def init_config():
8 |     conf_path = pathlib.Path('/var/newspaper.conf')
9 |     global_configs = os.getenv('newspaper_config', None) or (
10 |         conf_path.read_text() if conf_path.is_file() else None)
11 | if global_configs:
12 | global_configs = json.loads(global_configs)
13 | else:
14 | newspaper_config_template = '{"anti_gfw": {"url": "xxx"}, "mysql_config": {"mysql_host": "xxx", "mysql_port": 0, "mysql_user": "xxx", "mysql_password": "xxx", "mysql_db": "xxx"}}'
15 |         logging.error(
16 |             f'`newspaper_config` env variable not set and /var/newspaper.conf not found, config should be JSON like: {newspaper_config_template}'
17 |         )
18 |         raise RuntimeError('`newspaper_config` not set and /var/newspaper.conf missing')
19 | return global_configs
20 |
21 |
22 | def init_db():
23 | from .models import MySQLStorage
24 | db = MySQLStorage(global_configs['mysql_config'])
25 | return db
26 |
27 |
28 | global_configs = init_config()
29 | ONLINE_HOST = 'www.clericpy.top'
30 | GA_ID = 'UA-150991415-2'
31 | BEIAN_ID = '鲁ICP备19021778号-1'
32 |
--------------------------------------------------------------------------------
/newspaper/crawler/main.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from torequests.dummy import Requests
4 |
5 | from ..config import init_db
6 | from ..loggers import spider_logger
7 | from .spiders import history_spiders, online_spiders
8 |
9 | db = init_db()
10 |
11 |
12 | async def test_spider_workflow():
13 | from .spiders import test_spiders
14 | from ..models import Storage
15 | from pprint import pprint
16 |
17 | for func in test_spiders:
18 | print('test start:', func.__doc__)
19 | articles = await func()
20 | articles = Storage.ensure_articles(articles)
21 | # check schema
22 | for item in articles:
23 | assert (not item.get('desc')) or isinstance(item['desc'], str)
24 | assert (not item.get('ts_publish')) or isinstance(
25 | item['ts_publish'], str)
26 | assert (not item.get('cover')) or isinstance(item['cover'], str)
27 | assert isinstance(item.get('source'), str)
28 | assert isinstance(item.get('title'), str)
29 | assert isinstance(item.get('url'), str)
30 | pprint(articles)
31 |
32 |
33 | async def clear_cache():
34 | url = 'http://127.0.0.1:9001/newspaper/articles.cache.clear'
35 | req = Requests()
36 | r = await req.get(url, timeout=2)
37 | spider_logger.info(f'clear_cache {r.text}')
38 |
39 |
40 | async def online_workflow():
41 | if not online_spiders:
42 | spider_logger.info('no online_spiders online.')
43 | return
44 |     # make sure the articles table exists, create it if missing
45 | await db._ensure_article_table_exists()
46 |     # build a mapping from function name to source name
47 | function_sources = {func.__name__: func.__doc__ for func in online_spiders}
48 | coros = [func() for func in online_spiders]
49 | done, fail = await asyncio.wait(coros, timeout=120)
50 | spider_logger.info(f'{"=" * 30}')
51 | if fail:
52 | fail_names = [
53 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}'
54 | for idx, task in enumerate(fail, 1)
55 | ]
56 |         spider_logger.warning(
57 | f'timeout spiders({len(fail)}): {fail_names}')
58 | pool = await db.get_pool()
59 | async with pool.acquire() as conn:
60 | async with conn.cursor() as cursor:
61 | for idx, task in enumerate(done, 1):
62 | articles = task.result()
63 | func_name = task._coro.__name__
64 | source_name = function_sources.get(func_name, func_name)
65 | if articles:
66 | insert_result = await db.add_articles(articles,
67 | cursor=cursor)
68 | else:
69 | insert_result = 0
70 | spider_logger.info(
71 | f'{idx: 3}. {"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]'
72 | )
73 | await clear_cache()
74 |
75 |
76 | async def history_workflow():
77 | if not history_spiders:
78 |         spider_logger.info('skipped: no history_spiders registered.')
79 | return
80 |     # make sure the articles table exists, create it if missing
81 | await db._ensure_article_table_exists()
82 |     # build a mapping from function name to source name
83 | function_sources = {func.__name__: func.__doc__ for func in history_spiders}
84 | coros = [func() for func in history_spiders]
85 | done, fail = await asyncio.wait(coros, timeout=9999)
86 | spider_logger.info(f'{"=" * 30}')
87 | if fail:
88 | fail_names = [
89 | f'{idx}. {function_sources.get(task._coro.__name__, task._coro.__name__)}'
90 | for idx, task in enumerate(fail, 1)
91 | ]
92 |         spider_logger.warning(
93 | f'timeout spiders({len(fail)}): {fail_names}')
94 | pool = await db.get_pool()
95 | async with pool.acquire() as conn:
96 | async with conn.cursor() as cursor:
97 | for idx, task in enumerate(done, 1):
98 | articles = task.result()
99 | func_name = task._coro.__name__
100 | source_name = function_sources.get(func_name, func_name)
101 | if articles:
102 | insert_result = await db.add_articles(articles,
103 | cursor=cursor)
104 | else:
105 | insert_result = 0
106 | spider_logger.info(
107 | f'{idx: 3}. {"+" if articles else "?????????"} {insert_result} / {len(articles)} articles.\t[{source_name}]'
108 | )
109 | await clear_cache()
110 |
--------------------------------------------------------------------------------
/newspaper/crawler/sources.py:
--------------------------------------------------------------------------------
1 | from torequests.utils import quote_plus
2 | import sys
3 | import pathlib
4 | sys.path.append(str(pathlib.Path(__file__).absolute().parent.parent))
5 | from config import ONLINE_HOST
6 |
7 | content_sources = [
8 | {
9 | "title": "Python Software Foundation News",
10 | "url": "https://pyfound.blogspot.com/",
11 | "level": 4,
12 | "lang": "EN",
13 | "status": "√",
14 | "desc": "[墙] 来自 Python 软件基金会的消息"
15 | },
16 | {
17 | "title": "Python Weekly",
18 | "url": "https://www.pythonweekly.com/",
19 | "level": 5,
20 | "lang": "EN",
21 | "status": "√",
22 | "desc": "必备周报"
23 | },
24 | {
25 | "title": "PyCoder's Weekly",
26 | "url": "https://pycoders.com/issues",
27 | "level": 5,
28 | "lang": "EN",
29 | "status": "√",
30 | "desc": "必备周报"
31 | },
32 | {
33 | "title": "Import Python",
34 | "url": "https://importpython.com/newsletter/archive/",
35 | "level": 5,
36 | "lang": "EN",
37 | "status": "√",
38 | "desc": "必备周报, 2019.1.11 停更了, 希望早日康复~"
39 | },
40 | {
41 | "title": "Awesome Python Newsletter",
42 | "url": "https://python.libhunt.com/newsletter/archive",
43 | "level": 5,
44 | "lang": "EN",
45 | "status": "√",
46 | "desc": "必备周报"
47 | },
48 | {
49 | "title": "Real Python",
50 | "url": "https://realpython.com/",
51 | "level": 4,
52 | "lang": "EN",
53 | "status": "√",
54 | "desc": "文章质量高, 更新较少"
55 | },
56 | {
57 | "title": "Planet Python",
58 | "url": "https://planetpython.org",
59 | "level": 3,
60 | "lang": "EN",
61 | "status": "√",
62 | "desc": "官方推荐的博客, 收录了很多博主"
63 | },
64 | {
65 | "title": "Julien Danjou",
66 | "url": "https://julien.danjou.info",
67 | "level": 4,
68 | "lang": "EN",
69 | "status": "√",
70 | "desc": "文章质量不错, 保持更新"
71 | },
72 | {
73 | "title": "Doug Hellmann",
74 | "url": "https://doughellmann.com/blog/",
75 | "level": 4,
76 | "lang": "EN",
77 | "status": "√",
78 | "desc": "大名鼎鼎, 文章质量很高"
79 | },
80 | {
81 | "title": "The Mouse Vs. The Python",
82 | "url": "https://www.blog.pythonlibrary.org",
83 | "level": 4,
84 | "lang": "EN",
85 | "status": "√",
86 | "desc": "文章质量不错"
87 | },
88 | {
89 | "title": "InfoQ",
90 | "url": "https://www.infoq.cn/topic/python",
91 | "level": 4,
92 | "lang": "CN",
93 | "status": "√",
94 | "desc": "原创/译文的质量不错"
95 | },
96 | {
97 | "title": "Jeff Knupp",
98 | "url": "https://jeffknupp.com/",
99 | "level": 4,
100 | "lang": "EN",
101 | "status": "X",
102 | "desc": "[墙] 热门博客, 2018以后不更新了, 并且 planetpython 有, 暂不收录"
103 | },
104 | {
105 | "title": "Hacker News",
106 | "url": "https://hn.algolia.com/?query=python&sort=byPopularity&prefix&page=0&dateRange=last24h&type=story",
107 | "level": 4,
108 | "lang": "EN",
109 | "status": "√",
110 | "desc": "大名鼎鼎的 HN"
111 | },
112 | {
113 | "title": "Python Insider",
114 | "url": "https://blog.python.org/",
115 | "level": 3,
116 | "lang": "EN",
117 | "status": "X",
118 | "desc": "官方开发进度, 被官博和 planetPython 包含, 所以不需要收录."
119 | },
120 | {
121 | "title": "Brett Cannon",
122 | "url": "https://snarky.ca/",
123 | "level": 3,
124 | "lang": "EN",
125 | "status": "√",
126 | "desc": "核心开发者"
127 | },
128 | {
129 | "title": "Encode",
130 | "url": "https://www.encode.io/",
131 | "level": 3,
132 | "lang": "EN",
133 | "status": "X",
134 | "desc": "知名 Python 开源组织, 文章太少, 暂不收录"
135 | },
136 | {
137 | "title": "机器之心",
138 | "url": "https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time",
139 | "level": 3,
140 | "lang": "CN",
141 | "status": "√",
142 | "desc": "知名公众号"
143 | },
144 | {
145 | "title": "依云's Blog",
146 | "url": "https://blog.lilydjwg.me/tag/python?page=1",
147 | "level": 3,
148 | "lang": "CN",
149 | "status": "√",
150 | "desc": "文章质量很高"
151 | },
152 | {
153 | "title": "DEV Community",
154 | "url": "https://dev.to/t/python/latest",
155 | "level": 3,
156 | "lang": "EN",
157 | "status": "√",
158 | "desc": "算是个挺好的社区, post 也都不太水"
159 | },
160 | {
161 | "title": "Python猫",
162 | "url": "https://zhuanlan.zhihu.com/pythonCat",
163 | "level": 3,
164 | "lang": "CN",
165 | "status": "√",
166 | "desc": "2018 年末比较热情的博主, 原创 + 优质译文"
167 | },
168 | {
169 | "title": "Python之美",
170 | "url": "https://zhuanlan.zhihu.com/python-cn",
171 | "level": 3,
172 | "lang": "CN",
173 | "status": "√",
174 | "desc": "早期文章较多, 创业以后更新不太多了"
175 | },
176 | {
177 | "title": "静觅",
178 | "url": "https://cuiqingcai.com/category/technique/python",
179 | "level": 3,
180 | "lang": "CN",
181 | "status": "√",
182 | "desc": " 崔庆才的个人博客, 保持更新的原创博主"
183 | },
184 | {
185 | "title": "推酷(中文)",
186 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=1",
187 | "level": 3,
188 | "lang": "CN",
189 | "status": "√",
190 | "desc": "推文类站点. 按热门排序"
191 | },
192 | {
193 | "title": "推酷(英文)",
194 | "url": "https://www.tuicool.com/topics/11130000?st=0&lang=2",
195 | "level": 3,
196 | "lang": "EN",
197 | "status": "√",
198 | "desc": "推文类站点. 按热门排序"
199 | },
200 | {
201 | "title": "开发者头条",
202 | "url": "https://toutiao.io/tags/python?type=latest",
203 | "level": 3,
204 | "lang": "CN",
205 | "status": "X",
206 | "desc": "推文类站点, 但是没有发布时间, 暂不收录"
207 | },
208 | {
209 | "title": "稀土掘金",
210 | "url": "https://juejin.im/tag/Python",
211 | "level": 3,
212 | "lang": "CN",
213 | "status": "√",
214 | "desc": "推文类站点. 按热门排序"
215 | },
216 | {
217 | "title": "Python部落",
218 | "url": "https://python.freelycode.com/contribution/list/0?page_no=1",
219 | "level": 3,
220 | "lang": "CN",
221 | "status": "√",
222 | "desc": "推文+译文"
223 | },
224 | {
225 | "title": "miguelgrinberg",
226 | "url": "https://blog.miguelgrinberg.com/index",
227 | "level": 3,
228 | "lang": "EN",
229 | "status": "√",
230 | "desc": "Web 开发相关的内容挺多, 质量较高"
231 | },
232 | {
233 | "title": "Ned Batchelder",
234 | "url": "https://nedbatchelder.com/blog/tag/python.html",
235 | "level": 3,
236 | "lang": "EN",
237 | "status": "√",
238 | "desc": "热门博主。planetpython 也有"
239 | },
240 | {
241 | "title": "Full Stack Python",
242 | "url": "https://www.fullstackpython.com/blog.html",
243 | "level": 3,
244 | "lang": "EN",
245 | "status": "X",
246 | "desc": "热门博主。planetpython 有了, 文章比较少, 暂不收录"
247 | },
248 | {
249 | "title": "Eli Bendersky's website",
250 | "url": "https://eli.thegreenplace.net/tag/python",
251 | "level": 3,
252 | "lang": "EN",
253 | "status": "X",
254 | "desc": "值得一看,planetpython 有, 暂不收录"
255 | },
256 | {
257 | "title": "Manjusaka",
258 | "url": "https://manjusaka.itscoder.com/tags/Python/",
259 | "level": 3,
260 | "lang": "CN",
261 | "status": "X",
262 | "desc": "原创还不错, 但是文章较少, 暂不收录"
263 | },
264 | {
265 | "title": "Python程序员",
266 | "url": "https://zhuanlan.zhihu.com/pythoncxy",
267 | "level": 3,
268 | "lang": "CN",
269 | "status": "√",
270 | "desc": "关注破万的知乎专栏"
271 | },
272 | {
273 | "title": "Python头条",
274 | "url": "https://zhuanlan.zhihu.com/c_111369541",
275 | "level": 3,
276 | "lang": "CN",
277 | "status": "√",
278 | "desc": "关注破万的知乎专栏"
279 | },
280 | {
281 | "title": "the5fire的技术博客",
282 | "url": "https://www.the5fire.com/category/python/",
283 | "level": 3,
284 | "lang": "CN",
285 | "status": "√",
286 | "desc": "保持更新的热门中文博主"
287 | },
288 | {
289 | "title": "Python之禅",
290 | "url": "https://foofish.net/",
291 | "level": 3,
292 | "lang": "CN",
293 | "status": "√",
294 | "desc": "文章较基础, 质量不错"
295 | },
296 | {
297 | "title": "V2EX",
298 | "url": "https://www.v2ex.com/go/python",
299 | "level": 3,
300 | "lang": "CN",
301 | "status": "X",
302 | "desc": "社区类, api 失效, web 端乱七八糟的, 不收录"
303 | },
304 | {
305 | "title": "伯乐在线",
306 | "url": "http://python.jobbole.com/all-posts/",
307 | "level": 3,
308 | "lang": "CN",
309 | "status": "X",
310 | "desc": "有点类似推酷, 质量参差不齐. HTTP ERROR 503"
311 | },
312 | {
313 | "title": "Python 3 Module of the Week",
314 | "url": "https://pymotw.com/3/",
315 | "level": 3,
316 | "lang": "EN",
317 | "status": "X",
318 | "desc": "看起来不怎么更新了, 暂不收录"
319 | },
320 | {
321 | "title": "The Invent with Python Blog",
322 | "url": "https://inventwithpython.com/blog/index.html",
323 | "level": 3,
324 | "lang": "EN",
325 | "status": "√",
326 | "desc": "感觉不错"
327 | },
328 | {
329 | "title": "Armin Ronacher's Thoughts and Writings",
330 | "url": "http://lucumr.pocoo.org/",
331 | "level": 3,
332 | "lang": "EN",
333 | "status": "√",
334 | "desc": "Flask 作者 Armin Ronacher"
335 | },
336 | {
337 | "title": "aio-libs",
338 | "url": "https://groups.google.com/forum/#!forum/aio-libs",
339 | "level": 3,
340 | "lang": "EN",
341 | "status": "X",
342 | "desc": "知名 Python 开源组织, 不过没有文章类的 post"
343 | },
344 | {
345 | "title": "码农周刊",
346 | "url": "https://weekly.manong.io/issues/",
347 | "level": 3,
348 | "lang": "CN",
349 | "status": "X",
350 | "desc": "课外读物, 非 Python 主题, 暂不收录"
351 | },
352 | {
353 | "title": "编程派",
354 | "url": "http://codingpy.com/",
355 | "level": 3,
356 | "lang": "CN",
357 | "status": "√",
358 | "desc": "原创+译文"
359 | },
360 | {
361 | "title": "峰云就她了",
362 | "url": "http://xiaorui.cc/archives/category/python",
363 | "level": 3,
364 | "lang": "CN",
365 | "status": "√",
366 | "desc": "原创博客, 质量比较不错"
367 | },
368 | {
369 | "title": "Dan Bader",
370 | "url": "https://dbader.org/blog/",
371 | "level": 3,
372 | "lang": "EN",
373 | "status": "X",
374 | "desc": "一年不更新了, 先不收录了"
375 | },
376 | {
377 | "title": "Pythonic Perambulations",
378 | "url": "https://jakevdp.github.io/",
379 | "level": 3,
380 | "lang": "EN",
381 | "status": "X",
382 | "desc": "最后更新 Thu 13 September 2018, 暂不收录"
383 | },
384 | {
385 | "title": "开源中国翻译",
386 | "url": "https://www.oschina.net/translate/tag/python",
387 | "level": 3,
388 | "lang": "CN",
389 | "status": "X",
390 | "desc": "入库留着吧, 估计不更了, 暂不收录"
391 | },
392 | {
393 | "title": "Trey Hunner",
394 | "url": "https://treyhunner.com/blog/archives/",
395 | "level": 3,
396 | "lang": "EN",
397 | "status": "√",
398 | "desc": "Help developers level-up their Python skills"
399 | },
400 | {
401 | "title": "Python Central",
402 | "url": "https://www.pythoncentral.io/",
403 | "level": 3,
404 | "lang": "EN",
405 | "status": "X",
406 | "desc": "不更新了, 暂不收录"
407 | },
408 | {
409 | "title": "Inside the Head of PyDanny",
410 | "url": "https://www.pydanny.com/",
411 | "level": 3,
412 | "lang": "EN",
413 | "status": "X",
414 | "desc": "不更新了, 暂不收录"
415 | },
416 | {
417 | "title": "华蟒用户组,CPyUG",
418 | "url": "https://groups.google.com/forum/#!forum/python-cn",
419 | "level": 3,
420 | "lang": "EN",
421 | "status": "X",
422 | "desc": "[墙] 社区类, 自己看看就好, 暂不收录"
423 | },
424 | {
425 | "title": "Treehl",
426 | "url": "https://family-treesy.github.io/tags/PYTHON/",
427 | "level": 3,
428 | "lang": "CN",
429 | "status": "X",
430 | "desc": "文章较基础, 久不更新, 暂不收录"
431 | },
432 | {
433 | "title": "蠎周刊",
434 | "url": "http://weekly.pychina.org",
435 | "level": 4,
436 | "lang": "CN",
437 | "status": "√",
438 | "desc": "各种 weekly 中译版"
439 | },
440 | {
441 | "title": "zzzeek",
442 | "url": "https://techspot.zzzeek.org/",
443 | "level": 3,
444 | "lang": "EN",
445 | "status": "X",
446 | "desc": "2016 年后停更了"
447 | },
448 | {
449 | "title": "Yu’s blog",
450 | "url": "https://gofisher.github.io/",
451 | "level": 3,
452 | "lang": "CN",
453 | "status": "X",
454 | "desc": "原创, 但是久不更新了, 网站 http://blog.rainy.im/ 挂了"
455 | },
456 | {
457 | "title": "程序师",
458 | "url": "http://www.techug.com/tag/python",
459 | "level": 3,
460 | "lang": "CN",
461 | "status": "X",
462 | "desc": "原创较少, 文章较旧"
463 | },
464 | {
465 | "title": "一根笨茄子",
466 | "url": "http://blog.guoyb.com/tags/Python/",
467 | "level": 3,
468 | "lang": "CN",
469 | "status": "X",
470 | "desc": "文章更新较少, 质量参差"
471 | },
472 | {
473 | "title": "追梦人物",
474 | "url": "https://www.zmrenwu.com/",
475 | "level": 2,
476 | "lang": "CN",
477 | "status": "X",
478 | "desc": "像个学习博客"
479 | },
480 | {
481 | "title": "anshengme",
482 | "url": "https://blog.ansheng.me/",
483 | "level": 2,
484 | "lang": "CN",
485 | "status": "X",
486 | "desc": "质量一般"
487 | },
488 | {
489 | "title": "Pegasus",
490 | "url": "http://ningning.today/categories/python/",
491 | "level": 2,
492 | "lang": "CN",
493 | "status": "X",
494 | "desc": "不怎么更新"
495 | },
496 | {
497 | "title": "FunHacks",
498 | "url": "https://funhacks.net/categories/Python/",
499 | "level": 2,
500 | "lang": "CN",
501 | "status": "X",
502 | "desc": "太久不更新了, 不过python 之旅还行"
503 | },
504 | {
505 | "title": "Peter Norvig's essays",
506 | "url": "http://norvig.com/",
507 | "level": 2,
508 | "lang": "EN",
509 | "status": "X",
510 | "desc": "这排版驾驭不了..."
511 | },
512 | {
513 | "title": "Peterbe.com",
514 | "url": "https://www.peterbe.com/plog/",
515 | "level": 2,
516 | "lang": "EN",
517 | "status": "X",
518 | "desc": "不是太值得收录"
519 | },
520 | {
521 | "title": "Python Tips",
522 | "url": "https://pythontips.com/",
523 | "level": 2,
524 | "lang": "EN",
525 | "status": "X",
526 | "desc": "很火, 但我不喜欢"
527 | },
528 | {
529 | "title": "脚本之家",
530 | "url": "https://www.jb51.net/list/list_97_1.htm",
531 | "level": 2,
532 | "lang": "CN",
533 | "status": "X",
534 | "desc": "文章的质量啊~~~"
535 | },
536 | {
537 | "title": "开源中国搜索",
538 | "url": "https://www.oschina.net/search?scope=translate&q=python&category=0&onlytitle=0&sort_by_time=1",
539 | "level": 2,
540 | "lang": "CN",
541 | "status": "X",
542 | "desc": "质量不太高"
543 | },
544 | {
545 | "title": "伯乐在线头条",
546 | "url": "http://top.jobbole.com/tag/python/?sort=latest",
547 | "level": 2,
548 | "lang": "CN",
549 | "status": "X",
550 | "desc": "停更"
551 | },
552 | {
553 | "title": "代码片段",
554 | "url": "http://www.phpxs.com/code/python",
555 | "level": 2,
556 | "lang": "CN",
557 | "status": "X",
558 | "desc": "文章太老了, 停更了"
559 | },
560 | {
561 | "title": "segmentfault",
562 | "url": "https://segmentfault.com/t/python/blogs",
563 | "level": 2,
564 | "lang": "CN",
565 | "status": "X",
566 | "desc": "文章质量"
567 | },
568 | {
569 | "title": "Python China",
570 | "url": "http://python-china.org/api/topics/timeline",
571 | "level": 2,
572 | "lang": "CN",
573 | "status": "X",
574 | "desc": "欠费网站挂了"
575 | },
576 | {
577 | "title": "麦穗技术",
578 | "url": "http://www.58maisui.com/category/python/",
579 | "level": 2,
580 | "lang": "CN",
581 | "status": "X",
582 | "desc": "网站挂了"
583 | },
584 | {
585 | "title": "CSDN",
586 | "url": "https://so.csdn.net/so/search/s.do?q=python&t=blog&u=",
587 | "level": 1,
588 | "lang": "CN",
589 | "status": "X",
590 | "desc": "文章质量啊~~~"
591 | },
592 | {
593 | "title": "Stack Overflow",
594 | "url": "https://stackoverflow.com/?tab=hot",
595 | "level": 3,
596 | "lang": "EN",
597 | "status": "X",
598 | "desc": "已解决 + python + vote>=5, 但是问题有点弱智, 暂不收录"
599 | },
600 | {
601 | "title": "Reddit",
602 | "url": "https://www.reddit.com/r/Python/top/",
603 | "level": 3,
604 | "lang": "EN",
605 | "status": "√",
606 | "desc": "知名社区. 质量参差, 收录每日 ups>=20"
607 | },
608 | {
609 | "title": "码天狗",
610 | "url": "https://weekly.codetengu.com/issues",
611 | "level": 4,
612 | "lang": "CN",
613 | "status": "X",
614 | "desc": "综合类周报, 2018-11-23 之后不更了. 挂了, 下线."
615 | },
616 | {
617 | "title": "Medium",
618 | "url": "https://medium.com/tag/python",
619 | "level": 3,
620 | "lang": "EN",
621 | "status": "√",
622 | "desc": "Medium 的 Python tag, 收录 RSS"
623 | },
624 | ]
625 |
626 | content_sources_dict = {i['title']: i for i in content_sources}
627 |
628 |
629 | def main():
630 | import pathlib
631 | import re
632 |     # =: to be included, √: included, X: not included, -: archived without follow-up
633 |
634 | titles = [i['title'] for i in content_sources]
635 |     # make sure there are no duplicate titles
636 | if len(titles) != len(set(titles)):
637 |         raise RuntimeError('duplicate titles are not allowed')
638 | if '|' in str(content_sources):
639 |         raise RuntimeError('avoid using | in source fields, it breaks the markdown table')
640 |
641 | providers = ''
642 | providers += '| 序号 | 名称 | 评分 | 语言 | 收录 | 描述 |\n'
643 | providers += '| ---- | ---- | ---- | ---- | ---- | ---- |\n'
644 | todo_counts = 0
645 | finish_counts = 0
646 | for x, item in enumerate(content_sources, 1):
647 | data = [str(x)]
648 | title_link = f'[{item["title"]}]({item["url"]})'
649 | data.append(title_link)
650 | data.append(str(item['level']))
651 | data.append(item['lang'])
652 | status = item['status']
653 | if item['status'] == '√':
654 | finish_counts += 1
655 | status = f'[√](https://{ONLINE_HOST}/newspaper/articles.query.html?source={quote_plus(item["title"])})'
656 | elif item['status'] == '=':
657 | todo_counts += 1
658 | data.append(status)
659 | data.append(item['desc'])
660 | string = ' | '.join(data)
661 | providers += '| ' + string + ' |\n'
662 | proc = f'* 收录进度: {finish_counts} / {finish_counts + todo_counts}\n\n\t> = 待收录 | √ 已收录 | X 不收录 | - 入库不追更\n\n'
663 | README_FP = pathlib.Path(
664 | __file__).absolute().parent.parent.parent / 'README.md'
665 | with README_FP.open('r', encoding='u8') as f:
666 | old = f.read()
667 | new = re.sub(
668 | '[\s\S]*?',
669 | f'\n\n{proc}{providers}\n\n',
670 | old)
671 | print(new)
672 | with README_FP.open('w', encoding='u8') as f:
673 | f.write(new)
674 |
675 |
676 | if __name__ == "__main__":
677 | main()
678 |
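679 | # Running this module directly regenerates the source table in the repository
680 | # README.md: main() rewrites that file in place based on content_sources above.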
--------------------------------------------------------------------------------
/newspaper/crawler/spiders.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import traceback
4 | import typing
5 | import zlib
6 |
7 | from lxml.etree import ElementBase, XMLParser
8 | from lxml.html import fromstring, tostring
9 | from torequests.dummy import Requests
10 | from torequests.utils import (curlparse, escape, find_one, md5, parse_qsl,
11 | ptime, re, time, timeago, ttime, unparse_qsl,
12 | urlparse, urlunparse)
13 |
14 | from ..config import global_configs
15 | from ..loggers import spider_logger as logger
16 | from ..utils import ensure_cn_en
17 |
18 | START_TIME = time.time()
19 | test_spiders = []
20 | online_spiders = []
21 | history_spiders = []
22 | CHROME_PC_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
23 | friendly_crawling_interval = 1
24 | outlands_req = Requests()
25 | # default_host_frequency is the default per-host rate limit: 1 request every 3 seconds
26 | req = Requests(default_host_frequency=(1, 3))
27 | # friendly crawl frequency for hosts that receive many requests
28 | # req.set_frequency('zhuanlan.zhihu.com', 1, 3)
29 | req.set_frequency('www.tuicool.com', 1, 3)
30 | # free public proxy
31 | proxy = 'http://218.60.8.99:3129'
32 |
33 |
34 | class null_tree:
35 | text = ''
36 |
37 | @classmethod
38 | def text_content(cls):
39 | return ''
40 |
41 | def get(self, key, default=''):
42 | return default
43 |
44 | @classmethod
45 | def css(cls, item, csspath, idx=0):
46 | return (item.cssselect(csspath) or [cls])[idx]
47 |
48 | @classmethod
49 | def tostring(cls, doc, **kwargs):
50 | if isinstance(doc, ElementBase):
51 | return tostring(doc, **kwargs)
52 | else:
53 | return ''
54 |
55 |
56 | def sort_url_query(url, reverse=False, _replace_kwargs=None):
57 | """sort url query args.
58 | _replace_kwargs is a dict to update attributes before sorting (such as scheme / netloc...).
59 | http://www.google.com?b=2&z=26&a=1 => http://www.google.com?a=1&b=2&z=26
60 | """
61 | parsed = urlparse(url)
62 | if _replace_kwargs:
63 | parsed = parsed._replace(**_replace_kwargs)
64 | sorted_parsed = parsed._replace(
65 | query=unparse_qsl(parse_qsl(parsed.query), sort=True, reverse=reverse))
66 | return urlunparse(sorted_parsed)
67 |
68 |
69 | def get_url_key(url) -> str:
70 |     """Compute a key from the url: take its md5 while normalizing away useless query-arg differences.
71 |     Maybe switch to a purely numeric key later...
72 | import hashlib
73 | a = hashlib.md5(b'url')
74 | b = a.hexdigest()
75 | as_int = int(b, 16)
76 | url_key = str(as_int)[5:][:20]
77 | print(url_key)
78 | """
79 | if url:
80 | key = md5(sort_url_query(url, _replace_kwargs={'scheme': 'https'}))
81 | return key
82 | return ""
83 |
84 |
85 | def add_host(url: str, host: str) -> str:
86 | if not url:
87 | return ''
88 | if re.match('^https?://', url):
89 | return url
90 | if url.startswith('//'):
91 | return f'https:{url}'
92 | if not host.endswith('/'):
93 | host = f'{host}/'
94 | return re.sub('^/', host, url)
95 |
96 |
97 | def shorten_desc(desc: str) -> str:
98 | """Shorten the desc too long (more than 50)."""
99 | if not desc:
100 | return ''
101 | # remain sentence before ./\n/。/!
102 | desc = re.sub(r'(.{50,})(\n|\.|。|!|!|?|\?)\s?[\s\S]+', r'\1\2', desc)
103 | # remove html tag
104 | desc = re.sub('<[^>]+>', '', desc).strip()
105 | return escape(desc)
106 |
107 |
108 | async def outlands_request(request_dict: dict = None,
109 | encoding: str = 'u8',
110 | **request_args) -> str:
111 |     """The relay's bandwidth is tiny and it is not open-sourced, so it cannot be shared as a general firewall-bypass proxy.
112 |
113 |     Example:
114 | async def test():
115 | text = await outlands_request({
116 | 'method': 'get',
117 | 'url': 'https://pyfound.blogspot.com/'
118 | }, 'u8')
119 | print(text)
120 | return text
121 | """
122 | request_dict = request_dict or {}
123 | request_dict.update(request_args)
124 | request_dict.setdefault('method', 'get')
125 | request_dict.setdefault('ssl', False)
126 | request_dict.setdefault('headers', {})
127 | request_dict['headers'].setdefault('User-Agent', CHROME_PC_UA)
128 | json_data = json.dumps(request_dict)
129 | data = zlib.compress(json_data.encode('u8'))
130 | url = global_configs['anti_gfw']['url']
131 | r = await outlands_req.post(url, timeout=60, data=data)
132 | if r:
133 | return zlib.decompress(r.content).decode(encoding)
134 | else:
135 | return r.text
136 |
137 |
138 | def register_test(function: typing.Callable) -> typing.Callable:
139 |     """Register the spider into the test list.
140 | 
141 |     :param function: spider function, usually takes no arguments.
142 |     :type function: typing.Callable
143 |     :return: the same spider function, unchanged.
144 | :rtype: typing.Callable
145 | """
146 |
147 | test_spiders.append(function)
148 | return function
149 |
150 |
151 | def register_online(function: typing.Callable) -> typing.Callable:
152 |     """Register the spider into the list of production (online) spiders.
153 | 
154 |     :param function: spider function, usually takes no arguments.
155 |     :type function: typing.Callable
156 |     :return: the same spider function, unchanged.
157 | :rtype: typing.Callable
158 | """
159 |
160 | online_spiders.append(function)
161 | return function
162 |
163 |
164 | def register_history(function: typing.Callable) -> typing.Callable:
165 |     """Register the spider into the list of history-crawling spiders.
166 | 
167 |     :param function: spider function, usually takes no arguments.
168 |     :type function: typing.Callable
169 |     :return: the same spider function, unchanged.
170 | :rtype: typing.Callable
171 | """
172 |
173 | history_spiders.append(function)
174 | return function
175 |
176 |
177 | async def common_spider_zhihu_zhuanlan(name, source, limit=10):
178 | articles = []
179 | offset: int = 0
180 |     # page size for the paginated API
181 |     chunk_size: int = 50
182 |     # at most 2000 articles, anything more is pointless
183 | for _ in range(2000 // chunk_size):
184 | _limit = min((limit - offset, chunk_size))
185 | # or limit == offset
186 | if not _limit:
187 | break
188 | api: str = f'https://zhuanlan.zhihu.com/api/columns/{name}/articles?limit={_limit}&offset={offset}'
189 | r = await req.get(
190 | api,
191 | ssl=False,
192 | headers={
193 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
194 | })
195 | if not r:
196 | logger.info(
197 | f'crawl zhihu_zhuanlan {name} limit={limit} failed: {r}')
198 | return articles
199 | items = r.json()['data']
200 | if not items:
201 | break
202 | for item in items:
203 | if not (item['type'] == 'article' and item['state'] == 'published'):
204 | continue
205 | article: dict = {'source': source}
206 | article['ts_publish'] = ttime(item['created'])
207 | article['cover'] = item['image_url']
208 | article['title'] = item['title']
209 | article['desc'] = re.sub('<[^>]+>', ' ', item.get('excerpt') or '')
210 | article['url'] = item['url']
211 | article['url_key'] = get_url_key(article['url'])
212 | articles.append(article)
213 | offset += _limit
214 |
215 | return articles
216 |
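# Usage sketch, mirroring the zhihu_zhuanlan_* spiders further down:
#   articles = await common_spider_zhihu_zhuanlan('python-cn', 'Python之美', limit=10)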
217 |
218 | async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None):
219 | articles = []
220 | langs = {'CN': 1, 'EN': 2}
221 | lang_num = langs[lang]
222 | host = 'https://www.tuicool.com/'
223 | this_year = ttime()[:4]
224 | ignore_descs = ignore_descs or set()
225 | # Anonymous users can only crawl the first two pages; crawling more needs a valid `_tuicool_session` cookie.
226 | headers = {
227 | 'Connection': 'keep-alive',
228 | 'Upgrade-Insecure-Requests': '1',
229 | 'If-None-Match': 'W/"41a6894d66c0f07fcfac6ec1d84446a3"',
230 | 'Dnt': '1',
231 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
232 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
233 | 'Referer': 'https://www.tuicool.com/',
234 | 'Host': 'www.tuicool.com',
235 | 'Accept-Encoding': 'gzip, deflate, br',
236 | 'Accept-Language': 'zh-CN,zh;q=0.9',
237 | 'Cookie': '_tuicool_session=',
238 | }
239 | proxy = None
240 | for page in range(0, max_page):
241 | # st param: 0 = chronological order, 1 = hot articles
242 | api: str = f'https://www.tuicool.com/ah/0?st=1&lang={lang_num}&pn={page}'
243 | r = await req.get(api,
244 | ssl=False,
245 | proxy=proxy,
246 | retry=1,
247 | timeout=5,
248 | headers=headers)
249 | # print(r.text)
250 | if not r:
251 | logger.info(f'crawl tuicool {lang} page={page} failed: {r}')
252 | return articles
253 | items = fromstring(
254 | r.text).cssselect('#list_article>div.list_article_item')
255 | if max_page > 1:
256 | logger.info(
257 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
258 | )
259 | if not items:
260 | break
261 | for item in items:
262 | article: dict = {'source': source}
263 | url = null_tree.css(item,
264 | '.aricle_item_info>.title>a').get('href', '')
265 | url = add_host(url, host)
266 | title = null_tree.css(item, '.aricle_item_info>.title>a').text
267 | cover = null_tree.css(item,
268 | '.article_thumb_image>img').get('src', '')
269 | cover = cover.replace(
270 | 'https://static0.tuicool.com/images/abs_img_no_small.jpg', '')
271 | time_span = null_tree.css(item,
272 | '.aricle_item_info>.tip').text_content()
273 | raw_time = find_one(r'\d\d-\d\d \d\d:\d\d', time_span)[0]
274 | if raw_time:
275 | # guard against a weird/malformed time value
276 | article['ts_publish'] = ttime(
277 | ptime(f'{this_year}-{raw_time}:00'))
278 | desc = null_tree.css(
279 | item,
280 | '.aricle_item_info>div.tip>span:nth-of-type(1)').text.strip()
281 | if not re.search('Python|python', f'{title}{desc}'):
282 | continue
283 | if desc in ignore_descs:
284 | continue
285 | article['cover'] = cover
286 | article['title'] = title
287 | article['desc'] = desc
288 | article['url'] = url
289 | article['url_key'] = get_url_key(article['url'])
290 | articles.append(article)
291 | return articles
292 |
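# Usage sketch, mirroring the tuicool_cn spider further down:
#   articles = await common_spider_tuicool('CN', '推酷(中文)', max_page=1,
#                                           ignore_descs={'稀土掘金', 'Python猫', 'InfoQ'})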
293 |
294 | async def common_spider_juejin(user, source, max_page=1):
295 | articles = []
296 | host = 'https://juejin.im/'
297 | now = ttime(fmt="%Y-%m-%dT%H:%M:%S.000Z")
298 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_entry_by_self'
299 | params: dict = {
300 | 'src': 'web',
301 | 'targetUid': user,
302 | 'type': 'post',
303 | 'before': now,
304 | 'limit': 20,
305 | 'order': 'createdAt'
306 | }
307 | for page in range(max_page):
308 | try:
309 | params['before'] = now
310 | r = await req.get(api,
311 | ssl=False,
312 | params=params,
313 | retry=1,
314 | timeout=5,
315 | headers={"User-Agent": CHROME_PC_UA})
316 | if not r:
317 | logger.info(f'crawl juejin page={page} failed: {r}')
318 | return articles
319 | items = r.json()['d']['entrylist']
320 | if max_page > 1:
321 | logger.info(
322 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
323 | )
324 | if not items:
325 | break
326 | for item in items:
327 | article: dict = {'source': source}
328 | url = item['originalUrl']
329 | url = add_host(url, host)
330 | title = item['title']
331 | cover = item.get('screenshot') or ''
332 | now = item['createdAt']
333 | if now:
334 | ts_publish = re.sub(r'\..*', '', now)
335 | article['ts_publish'] = ts_publish.replace('T', ' ')
336 | desc = item.get('summaryInfo') or ''
337 | article['cover'] = cover
338 | article['title'] = title
339 | article['desc'] = desc
340 | article['url'] = url
341 | article['url_key'] = get_url_key(article['url'])
342 | articles.append(article)
343 | if not now:
344 | break
345 | except Exception:
346 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
347 | return articles
348 |
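# Usage sketch, mirroring the pythoncat spider further down:
#   articles = await common_spider_juejin('57b26118a341310060fa74da', 'Python猫', max_page=1)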
349 |
350 | @register_online
351 | async def python_news() -> list:
352 | """Python Software Foundation News"""
353 | source: str = 'Python Software Foundation News'
354 | articles: list = []
355 | seed = 'https://pyfound.blogspot.com/search?max-results=10'
356 | scode = await outlands_request({
357 | 'method': 'get',
358 | 'url': seed,
359 | }, 'u8')
360 | if scode:
361 | tree = fromstring(scode)
362 | for item in tree.cssselect('.blog-posts>.date-outer'):
363 | try:
364 | article: dict = {'source': source}
365 | raw_pub_time = item.cssselect('.published')[0].get('title', '')
366 | ts_publish = ttime(
367 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
368 | article['ts_publish'] = ts_publish
369 | article['title'] = item.cssselect(
370 | '.post-title.entry-title>a')[0].text
371 | # fall back gracefully when there is no desc
372 | node = item.cssselect('.post-body.entry-content') or [null_tree]
373 | desc = node[0].text_content()
374 | article['desc'] = desc.split('\n\n\n',
375 | 1)[0].strip().replace('\n', ' ')
376 | article['url'] = item.cssselect(
377 | '.post-title.entry-title>a')[0].get('href', '')
378 | article['url_key'] = get_url_key(article['url'])
379 | articles.append(article)
380 | except Exception:
381 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
382 | logger.info(
383 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
384 | )
385 | return articles
386 |
387 |
388 | # @register_history
389 | async def python_news_history() -> list:
390 | """Python Software Foundation News"""
391 | source: str = 'Python Software Foundation News'
392 | articles: list = []
393 | current_year = int(time.strftime('%Y'))
394 | for year in range(2006, current_year + 1):
395 | seed = f'https://pyfound.blogspot.com/{year}/'
396 | scode = await outlands_request({
397 | 'method': 'get',
398 | 'url': seed,
399 | }, 'u8')
400 | await asyncio.sleep(3)
401 | if not scode:
402 | continue
403 | tree = fromstring(scode)
404 | for item in tree.cssselect('.blog-posts>.date-outer'):
405 | try:
406 | article: dict = {'source': source}
407 | raw_pub_time = item.cssselect('.published')[0].get('title', '')
408 | ts_publish = ttime(
409 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
410 | article['ts_publish'] = ts_publish
411 | article['title'] = item.cssselect(
412 | '.post-title.entry-title>a')[0].text
413 | # fall back gracefully when there is no desc
414 | node = item.cssselect('.post-body.entry-content') or [null_tree]
415 | desc = node[0].text_content()
416 | article['desc'] = desc.split('\n\n\n',
417 | 1)[0].strip().replace('\n', ' ')
418 | article['url'] = item.cssselect(
419 | '.post-title.entry-title>a')[0].get('href', '')
420 | article['url_key'] = get_url_key(article['url'])
421 | articles.append(article)
422 | except Exception:
423 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
424 | logger.info(
425 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
426 | )
427 | return articles
428 |
429 |
430 | def _python_weekly_calculate_date(issue_id):
431 | diff = 396 - int(issue_id)
432 | return ttime(1557331200 - diff * 86400 * 7)
433 |
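# How the fallback date works: issue 396 is pinned to timestamp 1557331200
# (2019-05-09, see the sample title below) and every issue is assumed to be exactly
# one week apart, e.g. _python_weekly_calculate_date(395) gives the date one week earlier.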
434 |
435 | @register_online
436 | # @register_history
437 | # @register_test
438 | async def python_weekly() -> list:
439 | """Python Weekly"""
440 | source: str = 'Python Weekly'
441 | articles: list = []
442 | # Updated once a week, so only the first item is needed
443 | limit = 1
444 | seed = 'https://us2.campaign-archive.com/home/?u=e2e180baf855ac797ef407fc7&id=9e26887fc5'
445 | scode = await outlands_request({
446 | 'method': 'get',
447 | 'url': seed,
448 | }, 'u8')
449 | box = find_one(
450 | r'(?:
)(
)(?:)',
451 | scode)[1]
452 | items = re.findall(r'()', box)
453 | for item in items[:limit]:
454 | try:
455 | article: dict = {'source': source}
456 | # take ts_publish and issue_id from the list page; collect everything else from the detail page
457 | # 05/09/2019 - Python Weekly - Issue 396
458 | title = find_one('title="(.*?)"', item)[1]
459 | issue_id = find_one(r' - Issue (\d+)', title)[1]
460 | pub_dates = find_one(r'class="campaign">(\d\d)/(\d\d)/(\d\d\d\d)',
461 | item)[1]
462 | if not issue_id:
463 | continue
464 | if len(pub_dates) == 3:
465 | ts_publish = f'{pub_dates[2]}-{pub_dates[0]}-{pub_dates[1]} 00:00:00'
466 | else:
467 | ts_publish = _python_weekly_calculate_date(issue_id)
468 | article['ts_publish'] = ts_publish
469 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}'
470 | r = await req.get(
471 | detail_url,
472 | ssl=False,
473 | headers={
474 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
475 | })
476 | if not r:
477 | logger.error(f'fetch {detail_url} failed: {r}')
478 | continue
479 | scode = r.text
480 | title = find_one('<title>(.*?)</title>', r.text)[1]
481 | title = title.strip('Â ')
482 | translate_url = find_one(
483 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1]
484 | backup_url = dict(
485 | parse_qsl(translate_url))['u'] if translate_url else ''
486 | backup_url_desc = f'<a href="{backup_url}">View this email in your browser</a><br>' if backup_url else ''
487 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a')
488 | all_links = [
489 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」"
490 | for i in nodes
491 | ]
492 | all_links_desc = '<br>'.join(all_links)
493 | article['title'] = title
494 | article['desc'] = f'{backup_url_desc}{all_links_desc}'
495 | article['url'] = detail_url
496 | article['url_key'] = get_url_key(article['url'])
497 | articles.append(article)
498 | except Exception:
499 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
500 | break
501 | logger.info(
502 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
503 | )
504 | return articles
505 |
506 |
507 | # @register_history
508 | async def python_weekly_history() -> list:
509 | """Python Weekly"""
510 | source: str = 'Python Weekly'
511 | articles: list = []
512 | for issue_id in range(324, 1000):
513 | try:
514 | article: dict = {'source': source}
515 | article['ts_publish'] = _python_weekly_calculate_date(issue_id)
516 | detail_url = f'https://mailchi.mp/pythonweekly/python-weekly-issue-{issue_id}'
517 | r = await req.get(
518 | detail_url,
519 | ssl=False,
520 | headers={
521 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
522 | })
523 | if '404: Page Not Found' in r.text:
524 | logger.warn('python_weekly_history break for 404 page')
525 | break
526 | if not r:
527 | logger.error(f'python_weekly_history break for {r}')
528 | break
529 | scode = r.text
530 | title = find_one('<title>(.*?)</title>', r.text)[1]
531 | title = title.strip('Â ')
532 | translate_url = find_one(
533 | r'(://translate\.google\.com/translate\?[^"]+)', scode)[1]
534 | backup_url = dict(
535 | parse_qsl(translate_url))['u'] if translate_url else ''
536 | backup_url_desc = f'<a href="{backup_url}">View this email in your browser</a><br>' if backup_url else ''
537 | nodes = fromstring(scode).cssselect('[style="font-size:14px"]>a')
538 | all_links = [
539 | f"「{tostring(i, method='html', with_tail=0, encoding='unicode')} 」"
540 | for i in nodes
541 | ]
542 | all_links_desc = '<br>'.join(all_links)
543 | article['title'] = title
544 | article['desc'] = f'{backup_url_desc}{all_links_desc}'
545 | article['url'] = detail_url
546 | article['url_key'] = get_url_key(article['url'])
547 | articles.append(article)
548 | except Exception:
549 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
550 | break
551 | logger.info(
552 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
553 | )
554 | return articles
555 |
556 |
557 | @register_online
558 | async def pycoder_weekly() -> list:
559 | """PyCoder's Weekly"""
560 | # Change limit to 999 to crawl the full history
561 | source: str = "PyCoder's Weekly"
562 | articles: list = []
563 | # Updated once a week, so only the first item is needed
564 | limit = 1
565 | seed = 'https://pycoders.com/issues'
566 | base_url = find_one('^https?://[^/]+', seed)[0]
567 | r = await req.get(seed, headers={'User-Agent': CHROME_PC_UA})
568 | if not r:
569 | logger.error(f'{source} crawl failed: {r}, {r.text}')
570 | return articles
571 | items = re.findall(r'Issue #\d+ .*?', r.text)
572 | for item in items[:limit]:
573 | try:
574 | article: dict = {'source': source}
575 | # Issue #368 (May 14, 2019)
576 | title = find_one('>(Issue.*?)<', item)[1]
577 | article['title'] = f"PyCoder's Weekly | {title}"
578 | month, day, year = re.findall(r'\((.*?) (\d+), (\d+)\)',
579 | article['title'])[0]
580 | month = month[:3]
581 | raw_time = f'{year}-{month}-{day}'
582 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d'))
583 | article['ts_publish'] = ts_publish
584 | article['desc'] = ''
585 | url = find_one(r'href="(/issues/\d+)"', item)[1]
586 | article['url'] = base_url + url
587 | article['url_key'] = get_url_key(article['url'])
588 | articles.append(article)
589 | except Exception:
590 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
591 | break
592 | logger.info(
593 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
594 | )
595 | return articles
596 |
597 |
598 | @register_online
599 | # @register_test
600 | async def importpython() -> list:
601 | """Import Python"""
602 | source: str = 'Import Python'
603 | articles: list = []
604 | # Updated once a week, so only the first item is needed
605 | limit = 1
606 | seed = 'https://importpython.com/newsletter/archive/'
607 | r = await req.get(seed,
608 | timeout=15,
609 | ssl=False,
610 | headers={"User-Agent": CHROME_PC_UA})
611 | if not r:
612 | logger.error(f'{source} crawl failed: {r}, {r.text}')
613 | return articles
614 | items = fromstring(r.text).cssselect('#tourpackages-carousel>.row>div')
615 | for item in items[:limit]:
616 | try:
617 | article: dict = {'source': source}
618 | href = item.cssselect('div.caption>a')[0].get('href', '')
619 | if not href:
620 | continue
621 | url = add_host(href, 'https://importpython.com/')
622 | title = item.cssselect('div.caption>.well-add-card>h4')[0].text
623 | desc_node = item.cssselect('div.caption>div[class="col-lg-12"]')[0]
624 | desc = tostring(desc_node,
625 | method='html',
626 | with_tail=0,
627 | encoding='unicode')
628 | day, month, year = re.findall(r'- (\d+) (\S+) (\d+)', title)[0]
629 | month = month[:3]
630 | raw_time = f'{year}-{month}-{day}'
631 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%b-%d'))
632 | article['ts_publish'] = ts_publish
633 | clean_title = re.sub(' - .*', '', title)
634 | title = f"{source} - {clean_title}"
635 | article['title'] = title
636 | article['desc'] = desc.replace('\n ', ' ')
637 | article['url'] = url
638 | article['url_key'] = get_url_key(article['url'])
639 | articles.append(article)
640 | except Exception:
641 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
642 | break
643 | logger.info(
644 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
645 | )
646 | return articles
647 |
648 |
649 | @register_online
650 | # @register_test
651 | async def awesome_python() -> list:
652 | """Awesome Python Newsletter"""
653 | source: str = 'Awesome Python Newsletter'
654 | articles: list = []
655 | # Updated once a week, so only the first item is needed
656 | limit = 1
657 | seed = 'https://python.libhunt.com/newsletter/archive'
658 | scode = await outlands_request({
659 | 'method': 'get',
660 | 'url': seed,
661 | }, 'u8')
662 | hrefs = re.findall(
663 | r'\s*', scode)
664 | for href in hrefs[:limit]:
665 | try:
666 | article: dict = {'source': source}
667 | url = add_host(href, 'https://python.libhunt.com/')
668 | r = await req.get(url,
669 | retry=2,
670 | timeout=15,
671 | headers={"User-Agent": CHROME_PC_UA})
672 | if not r:
673 | logger.error(f'fetch {url} failed: {r}')
674 | break
675 | tree = fromstring(r.text)
676 | raw_title = tree.cssselect('title')[0].text
677 | title = re.sub(', .*', '', raw_title)
678 | raw_pub_date = find_one(r', (.*?) \|', raw_title)[1]
679 | # May 17, 2019
680 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y'))
681 | nodes = tree.cssselect(
682 | 'li[class="story row"]>div[class="column"]>a')
683 | descs = [
684 | tostring(i, method='html', with_tail=0, encoding='unicode')
685 | for i in nodes
686 | ]
687 | desc = ' '.join(descs)
688 | article['ts_publish'] = ts_publish
689 | article['title'] = title
690 | article['desc'] = desc
691 | article['url'] = url
692 | article['url_key'] = get_url_key(article['url'])
693 | articles.append(article)
694 | except Exception:
695 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
696 | break
697 | logger.info(
698 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
699 | )
700 | return articles
701 |
702 |
703 | @register_online
704 | async def real_python() -> list:
705 | """Real Python"""
706 | source: str = 'Real Python'
707 | articles: list = []
708 | limit = 20
709 | seed = 'https://realpython.com/'
710 | r = await req.get(seed,
711 | retry=1,
712 | timeout=20,
713 | headers={"User-Agent": CHROME_PC_UA})
714 | if not r:
715 | logger.error(f'{source} crawl failed: {r}, {r.text}')
716 | return articles
717 | items = fromstring(r.text).cssselect('div[class="card border-0"]')
718 | for item in items[:limit]:
719 | try:
720 | article: dict = {'source': source}
721 | href = item.cssselect('a')[0].get('href', '')
722 | url = add_host(href, 'https://realpython.com/')
723 | title = item.cssselect('h2.card-title')[0].text
724 | pub_date_node = item.cssselect('.mr-2') or [null_tree]
725 | raw_pub_date = pub_date_node[0].text
726 | # May 16, 2019
727 | ts_publish = ttime(ptime(raw_pub_date, fmt='%b %d, %Y'))
728 | cover_item = item.cssselect('img.card-img-top')
729 | if cover_item:
730 | cover = cover_item[0].get('src', '')
731 | if cover:
732 | article['cover'] = cover
733 | article['ts_publish'] = ts_publish
734 | article['title'] = title
735 | article['desc'] = ''
736 | article['url'] = url
737 | article['url_key'] = get_url_key(article['url'])
738 | articles.append(article)
739 | except Exception:
740 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
741 | break
742 | logger.info(
743 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
744 | )
745 | return articles
746 |
747 |
748 | @register_online
749 | async def planet_python() -> list:
750 | """Planet Python"""
751 | source: str = 'Planet Python'
752 | articles: list = []
753 | limit = 100
754 | seed = 'https://planetpython.org/rss20.xml'
755 | # request through the external relay to avoid timeouts
756 | scode = await outlands_request({
757 | 'method': 'get',
758 | 'url': seed,
759 | }, 'u8')
760 | items = fromstring(scode).xpath('//channel/item')
761 | now = ttime()
762 | for item in items[:limit]:
763 | try:
764 | article: dict = {'source': source}
765 | guid = item.xpath('./guid/text()')
766 | title = item.xpath('./title/text()')
767 | description = item.xpath('./description/text()')
768 | pubDate = item.xpath('./pubdate/text()')
769 | if not (guid and title):
770 | continue
771 | url = guid[0]
772 | title = title[0]
773 | if 'بايثون العربي' in title:
774 | continue
775 | if 'Python Software Foundation: ' in title:
776 | # already collected by its own spider; no need to include it again
777 | continue
778 | if description:
779 | desc = fromstring(description[0]).text_content()
780 | # strip any remaining <...> tags
781 | desc = re.sub('<[^>]*>', ' ', desc)
782 | # keep only the part before the first line break
783 | desc = shorten_desc(desc)
784 | else:
785 | desc = ''
786 | if pubDate:
787 | raw_pub_date = pubDate[0]
788 | # Wed, 22 May 2019 01:47:44 +0000
789 | raw_pub_date = re.sub('^.*?, ', '', raw_pub_date).strip()
790 | ts_publish = ttime(
791 | ptime(raw_pub_date, fmt='%d %b %Y %H:%M:%S %z'))
792 | else:
793 | ts_publish = now
794 | article['ts_publish'] = ts_publish
795 | article['title'] = title
796 | article['desc'] = desc
797 | article['url'] = url
798 | article['url_key'] = get_url_key(article['url'])
799 | articles.append(article)
800 | except Exception:
801 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
802 | break
803 | logger.info(
804 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
805 | )
806 | return articles
807 |
808 |
809 | @register_online
810 | # @register_test
811 | async def julien_danjou() -> list:
812 | """Julien Danjou"""
813 | # For history articles, just keep incrementing the page number
814 | source: str = 'Julien Danjou'
815 | articles: list = []
816 | seed = 'https://julien.danjou.info/page/1/'
817 | scode = await outlands_request(
818 | {
819 | 'method': 'get',
820 | 'timeout': 5,
821 | 'retry': 2,
822 | 'url': seed,
823 | }, 'u8')
824 | items = fromstring(scode).cssselect('.post-feed>article.post-card')
825 | # break once an article was published more than 1 hour ago
826 | break_time = ttime(time.time() - 60 * 60)
827 | host = 'https://julien.danjou.info/'
828 | for item in items:
829 | try:
830 | article: dict = {'source': source}
831 | href = item.cssselect('a.post-card-content-link')[0].get('href', '')
832 | if not href:
833 | raise ValueError(f'{source} not found href from {seed}')
834 | url = add_host(href, host)
835 | title = (item.cssselect('h2.post-card-title') or
836 | [null_tree])[0].text
837 | desc = (item.cssselect('.post-card-excerpt>p') or
838 | [null_tree])[0].text
839 | if not (title and url):
840 | raise ValueError(f'{source} no title {url}')
841 | detail_scode = await outlands_request(
842 | {
843 | 'method': 'get',
844 | 'timeout': 5,
845 | 'retry': 2,
846 | 'url': url,
847 | }, 'u8')
848 | if not detail_scode:
849 | raise ValueError(f'{source} has no detail_scode {url}')
850 | raw_pub_time = find_one(
851 | 'property="article:published_time" content="(.+?)"',
852 | detail_scode)[1]
853 | # 2019-05-06T08:58:00.000Z
854 | ts_publish = ttime(ptime(raw_pub_time,
855 | fmt='%Y-%m-%dT%H:%M:%S.000Z'))
856 | cover_item = item.cssselect('img.post-card-image')
857 | if cover_item:
858 | cover = cover_item[0].get('src', '')
859 | if cover:
860 | article['cover'] = add_host(cover, host)
861 | article['ts_publish'] = ts_publish
862 | article['title'] = title
863 | article['desc'] = desc
864 | article['url'] = url
865 | article['url_key'] = get_url_key(article['url'])
866 | articles.append(article)
867 | if ts_publish < break_time:
868 | # the article is older than the crawl interval, so stop here
869 | break
870 | except Exception:
871 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
872 | break
873 | logger.info(
874 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
875 | )
876 | return articles
877 |
878 |
879 | @register_online
880 | async def doughellmann() -> list:
881 | """Doug Hellmann"""
882 | source: str = 'Doug Hellmann'
883 | articles: list = []
884 | max_page: int = 1
885 | seed = 'https://doughellmann.com/blog/page/{page}/'
886 | for page in range(1, max_page + 1):
887 | r = await req.get(seed.format(page=page),
888 | retry=1,
889 | timeout=20,
890 | headers={"User-Agent": CHROME_PC_UA})
891 | if not r:
892 | logger.error(f'{source} crawl failed: {r}, {r.text}')
893 | return articles
894 | scode = r.text
895 | items = fromstring(scode).cssselect('#main>article')
896 | if max_page > 1:
897 | logger.info(
898 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
899 | )
900 | if not items and page > 1:
901 | logger.info(f'{source} break for page {page} has no items')
902 | break
903 | for item in items:
904 | try:
905 | article: dict = {'source': source}
906 | title = item.cssselect('.entry-title>a')[0].text
907 | url = item.cssselect('.entry-title>a')[0].get('href')
908 | desc = item.cssselect('.entry-content')[0].text_content()
909 | pub_time = item.cssselect('time.entry-date')[0].get('datetime')
910 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
911 | article['ts_publish'] = ts_publish
912 | article['title'] = title
913 | article['desc'] = shorten_desc(desc)
914 | article['url'] = url
915 | article['url_key'] = get_url_key(article['url'])
916 | articles.append(article)
917 | except Exception:
918 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
919 | break
920 | logger.info(
921 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
922 | )
923 | return articles
924 |
925 |
926 | @register_online
927 | # @register_history
928 | # @register_test
929 | async def mouse_vs_python() -> list:
930 | """The Mouse Vs. The Python"""
931 | source: str = 'The Mouse Vs. The Python'
932 | articles: list = []
933 | max_page: int = 1
934 | # max_page:int = 101
935 | seed = 'https://www.blog.pythonlibrary.org/page/{page}/'
936 | for page in range(1, max_page + 1):
937 | api = seed.format(page=page)
938 | scode = await outlands_request(
939 | {
940 | 'method': 'get',
941 | 'timeout': 5,
942 | 'retry': 2,
943 | 'url': api,
944 | }, 'u8')
945 | items = fromstring(scode).cssselect('#content>article')
946 | if max_page > 1:
947 | logger.info(
948 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
949 | )
950 | if not items:
951 | if page > 1:
952 | logger.info(f'{source} break for page {page} has no items')
953 | break
954 | for item in items:
955 | try:
956 | article: dict = {'source': source}
957 | title = item.cssselect('.entry-title>a')[0].text
958 | url = item.cssselect('.entry-title>a')[0].get('href')
959 | desc = item.cssselect('.entry-content')[0].text_content()
960 | pub_time = item.cssselect('time.entry-date')[0].get('datetime')
961 | ts_publish = ttime(ptime(pub_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
962 | article['ts_publish'] = ts_publish
963 | article['title'] = title
964 | article['desc'] = shorten_desc(desc)
965 | article['url'] = url
966 | article['url_key'] = get_url_key(article['url'])
967 | articles.append(article)
968 | except Exception:
969 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
970 | break
971 | logger.info(
972 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
973 | )
974 | return articles
975 |
976 |
977 | @register_online
978 | # @register_history
979 | # @register_test
980 | async def infoq_python() -> list:
981 | """InfoQ"""
982 | source: str = 'InfoQ'
983 | articles: list = []
984 | max_page: int = 1
985 | # max_page:int = 101
986 | curl_string = r'''curl 'https://www.infoq.cn/public/v1/article/getList' -H 'Origin: https://www.infoq.cn' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Content-Type: application/json' -H 'Accept: application/json, text/plain, */*' -H 'Referer: https://www.infoq.cn/topic/python' -H 'Cookie: SERVERID=0|0|0' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary '{"type":1,"size":12,"id":50,"score":0}' --compressed'''
987 | request_args = curlparse(curl_string)
988 | for page in range(1, max_page + 1):
989 | r = await req.request(retry=2, timeout=5, **request_args)
990 | if not r:
991 | logger.error(f'{source} crawl failed: {r}, {r.text}')
992 | return articles
993 | items = r.json().get('data') or []
994 | if max_page > 1:
995 | logger.info(
996 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
997 | )
998 | if items:
999 | # paginate by reusing the score of the last item on the previous page
1000 | data = json.loads(request_args['data'])
1001 | data['score'] = items[-1]['score']
1002 | request_args['data'] = json.dumps(data).encode('u8')
1003 | elif page > 1:
1004 | logger.info(f'{source} break for page {page} has no items')
1005 | break
1006 | for item in items:
1007 | try:
1008 | article: dict = {'source': source}
1009 | desc = shorten_desc(item['article_summary'])
1010 | if '本文分享 方巍' in desc:
1011 | continue
1012 | title = item['article_title']
1013 | url = f"https://www.infoq.cn/article/{item['uuid']}"
1014 | ts_publish = ttime(item['publish_time'])
1015 | article['ts_publish'] = ts_publish
1016 | article['title'] = title
1017 | article['desc'] = desc
1018 | article['url'] = url
1019 | article['url_key'] = get_url_key(article['url'])
1020 | articles.append(article)
1021 | except Exception:
1022 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1023 | break
1024 | logger.info(
1025 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1026 | )
1027 | return articles
1028 |
1029 |
1030 | @register_online
1031 | # @register_history
1032 | # @register_test
1033 | async def hn_python() -> list:
1034 | """Hacker News"""
1035 | source: str = 'Hacker News'
1036 | articles: list = []
1037 | max_page = 999
1038 | # By default, collect articles from the last 24 hours with at least 3 points
1039 | min_points = 3
1040 | now_ts = int(time.time())
1041 | ts_start = now_ts - 86400
1042 | ts_end = now_ts
1043 | # For history, collect articles from the last 90 days; the API caps each query at 1000 hits (paginationLimitedTo)
1044 | # If older articles are needed, just keep shifting the start/end timestamps
1045 | # ts_start = now_ts - 86400 * 90
1046 | # ts_end = now_ts
1047 | per_page: int = 100
1048 | api: str = 'https://hn.algolia.com/api/v1/search_by_date'
1049 | # tags=story&query=python&numericFilters=created_at_i%3E1553174400,points%3E1&page=2&hitsPerPage=10
1050 | params: dict = {
1051 | 'tags': 'story',
1052 | 'query': 'python',
1053 | 'numericFilters': f'created_at_i>={ts_start},created_at_i<={ts_end},points>={min_points}',
1054 | 'page': 0,
1055 | 'hitsPerPage': per_page,
1056 | }
1057 | for page in range(max_page):
1058 | params['page'] = page
1059 | r = await req.get(api,
1060 | params=params,
1061 | retry=2,
1062 | timeout=10,
1063 | headers={"User-Agent": CHROME_PC_UA})
1064 | if not r:
1065 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1066 | return articles
1067 | items = r.json().get('hits') or []
1068 | if not items:
1069 | break
1070 | if page > 0:
1071 | logger.info(
1072 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1073 | )
1074 | if not items and page > 0:
1075 | logger.info(f'{source} break for page {page} has no items')
1076 | break
1077 | for item in items:
1078 | try:
1079 | article: dict = {'source': source}
1080 | title = item['title']
1081 | url = item['url'] or ''
1082 | if not url:
1083 | url = f'https://news.ycombinator.com/item?id={item["objectID"]}'
1084 | desc = item['story_text'] or ''
1085 | ts_publish = ttime(item['created_at_i'])
1086 | article['ts_publish'] = ts_publish
1087 | article['title'] = title
1088 | article['desc'] = shorten_desc(desc)
1089 | article['url'] = url
1090 | article['url_key'] = get_url_key(article['url'])
1091 | articles.append(article)
1092 | except Exception:
1093 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1094 | break
1095 | logger.info(
1096 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1097 | )
1098 | return articles
1099 |
1100 |
1101 | @register_online
1102 | # @register_history
1103 | # @register_test
1104 | async def snarky() -> list:
1105 | """Brett Cannon"""
1106 | source: str = 'Brett Cannon'
1107 | articles: list = []
1108 | max_page: int = 1
1109 | api: str = 'https://snarky.ca/page/{page}/'
1110 | # break once an article was published more than 1 hour ago
1111 | break_time = ttime(time.time() - 60 * 60)
1112 | host = 'https://snarky.ca/'
1113 | for page in range(1, max_page + 1):
1114 | seed = api.format(page=page)
1115 | scode = await outlands_request(url=seed, retry=1, timeout=20)
1116 | if not scode:
1117 | logger.error(f'{source} crawl failed: {scode}')
1118 | return articles
1119 | items = fromstring(scode).cssselect('.post-feed>article.post-card')
1120 | if not items:
1121 | break
1122 | for item in items:
1123 | try:
1124 | article: dict = {'source': source}
1125 | href = item.cssselect('a.post-card-content-link')[0].get(
1126 | 'href', '')
1127 | if not href:
1128 | raise ValueError(f'{source} not found href from {seed}')
1129 | url = add_host(href, host)
1130 | title = (item.cssselect('h2.post-card-title') or
1131 | [null_tree])[0].text
1132 | desc = (item.cssselect('.post-card-excerpt>p') or
1133 | [null_tree])[0].text
1134 | if not (title and url):
1135 | raise ValueError(f'{source} no title {url}')
1136 | detail_resp = await req.get(
1137 | url,
1138 | ssl=False,
1139 | headers={
1140 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
1141 | })
1142 | if not detail_resp:
1143 | raise ValueError(
1144 | f'{source} request href failed {detail_resp}')
1145 | detail_scode = detail_resp.text
1146 | raw_pub_time = find_one(
1147 | 'property="article:published_time" content="(.+?)"',
1148 | detail_scode)[1]
1149 | # 2019-05-06T08:58:00.000Z
1150 | ts_publish = ttime(
1151 | ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S.000Z'))
1152 | cover_item = item.cssselect('img.post-card-image')
1153 | if cover_item:
1154 | cover = cover_item[0].get('src', '')
1155 | if cover:
1156 | article['cover'] = add_host(cover, host)
1157 | article['ts_publish'] = ts_publish
1158 | article['title'] = title
1159 | article['desc'] = desc
1160 | article['url'] = url
1161 | article['url_key'] = get_url_key(article['url'])
1162 | articles.append(article)
1163 | if ts_publish < break_time:
1164 | # the article is older than the crawl interval, so stop here
1165 | break
1166 | except Exception:
1167 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1168 | break
1169 | logger.info(
1170 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1171 | )
1172 | return articles
1173 |
1174 |
1175 | @register_online
1176 | # @register_history
1177 | # @register_test
1178 | async def jiqizhixin() -> list:
1179 | """机器之心"""
1180 | source: str = '机器之心'
1181 | articles: list = []
1182 | max_page: int = 1
1183 | # requires cookies and an anti-CSRF check
1184 | curl_string = r'''curl 'https://www.jiqizhixin.com/api/v1/search?type=articles&page=1&keywords=python&published=0&is_exact_match=false&search_internet=true&sort=time' -H 'Cookie: ahoy_visitor=1; _Synced_session=2' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'Accept: */*' -H 'Referer: https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time' -H 'X-Requested-With: XMLHttpRequest' -H 'If-None-Match: W/"3e034aa5e8cb79dd92652f5ba70a65a5"' -H 'Connection: keep-alive' --compressed'''
1185 | request_args = curlparse(curl_string)
1186 | for page in range(1, max_page + 1):
1187 | # the response is sometimes empty, so retry a few times
1188 | for _ in range(2, 5):
1189 | r = await req.request(retry=1, timeout=20, **request_args)
1190 | if not r:
1191 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1192 | return articles
1193 | try:
1194 | items = r.json().get('articles', {}).get('nodes', [])
1195 | if not items:
1196 | continue
1197 | break
1198 | except json.decoder.JSONDecodeError:
1199 | await asyncio.sleep(_)
1200 | continue
1201 | else:
1202 | # still no break after 3 attempts, give up
1203 | return articles
1204 | if max_page > 1:
1205 | logger.info(
1206 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1207 | )
1208 | # paginate by modifying the page parameter
1209 | curl_string = re.sub(r'&page=\d+', f'&page={page + 1}', curl_string)
1210 | request_args = curlparse(curl_string)
1211 | if not r.json().get('articles', {}).get('hasNextPage'):
1212 | break
1213 | for item in items:
1214 | try:
1215 | article: dict = {'source': source}
1216 | desc = item['content']
1217 | # 2019/05/27 00:09
1218 | article['ts_publish'] = ttime(
1219 | ptime(item['published_at'], fmt='%Y/%m/%d %H:%M'))
1220 | title = item.get('title') or ''
1221 | title = title.replace('Python',
1222 | 'Python').replace('python',
1223 | 'Python')
1224 | article['title'] = title
1225 | article['cover'] = item.get('cover_image_url') or ''
1226 | article['desc'] = f'「{item["author"]}」 {shorten_desc(desc)}'
1227 | article['url'] = item['path']
1228 | article['url_key'] = get_url_key(article['url'])
1229 | articles.append(article)
1230 | except Exception:
1231 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1232 | break
1233 | logger.info(
1234 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1235 | )
1236 | return articles
1237 |
1238 |
1239 | @register_online
1240 | # @register_history
1241 | # @register_test
1242 | async def lilydjwg() -> list:
1243 | """依云's Blog"""
1244 | source: str = "依云's Blog"
1245 | articles: list = []
1246 | max_page: int = 1
1247 | seed = 'https://blog.lilydjwg.me/tag/python?page={page}'
1248 | for page in range(1, max_page + 1):
1249 | r = await req.get(seed.format(page=page),
1250 | retry=1,
1251 | timeout=20,
1252 | headers={"User-Agent": CHROME_PC_UA})
1253 | if not r:
1254 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1255 | return articles
1256 | scode = r.content.decode('u8')
1257 | items = fromstring(scode).cssselect('#content>.posttotal')
1258 | if not items:
1259 | break
1260 | host = 'https://blog.lilydjwg.me/'
1261 | if max_page > 1:
1262 | logger.info(
1263 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1264 | )
1265 | for item in items:
1266 | try:
1267 | article: dict = {'source': source}
1268 | title = item.cssselect('.storytitle>a')[0].text
1269 | href = item.cssselect('.storytitle>a')[0].get('href', '')
1270 | url = add_host(href, host).replace(
1271 | 'https://lilydjwg.is-programmer.com/', host)
1272 | desc = shorten_desc((item.cssselect('.post_brief>p') or
1273 | [null_tree])[0].text_content())
1274 | cover = (item.cssselect('img') or [null_tree])[0].get('src', '')
1275 | month, day, year = item.cssselect(
1276 | '.date')[0].text_content().strip().split()
1277 | month = f'0{month}'[-2:]
1278 | day = f'0{day}'[-2:]
1279 | article['ts_publish'] = ttime(
1280 | ptime(f'{year}/{month}/{day}', fmt='%Y/%m/%d'))
1281 | article['title'] = title
1282 | article['cover'] = cover
1283 | article['desc'] = desc
1284 | article['url'] = url
1285 | article['url_key'] = get_url_key(article['url'])
1286 | articles.append(article)
1287 | except Exception:
1288 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1289 | break
1290 | logger.info(
1291 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1292 | )
1293 | return articles
1294 |
1295 |
1296 | @register_online
1297 | # @register_history
1298 | # @register_test
1299 | async def dev_io() -> list:
1300 | """DEV Community"""
1301 | source: str = "DEV Community"
1302 | articles: list = []
1303 | max_page: int = 1
1304 | per_page: int = 30
1305 | filt_score: int = 10
1306 | for page in range(0, max_page):
1307 | r = await req.get(
1308 | f'https://dev.to/search/feed_content?per_page={per_page}&page={page}&tag=python&sort_by=published_at&sort_direction=desc&tag_names%5B%5D=python&approved=&class_name=Article',
1309 | headers={
1310 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
1311 | 'Referer': 'https://dev.to/t/python/latest'
1312 | },
1313 | retry=1,
1314 | timeout=20)
1315 | if not r:
1316 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1317 | return articles
1318 | items = r.json().get('result') or []
1319 | if not items:
1320 | break
1321 | host = 'https://dev.to/'
1322 | if max_page > 1:
1323 | logger.info(
1324 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1325 | )
1326 | for item in items:
1327 | try:
1328 | if item['public_reactions_count'] + item[
1329 | 'comments_count'] < filt_score:
1330 | # filt by min score
1331 | continue
1332 | article: dict = {'source': source}
1333 | title = item['title']
1334 | path = item['path']
1335 | url = add_host(path, host)
1336 | desc = item['user']['name']
1337 | article['ts_publish'] = ttime(item['published_at_int'])
1338 | article['title'] = title
1339 | article['desc'] = desc
1340 | article['url'] = url
1341 | article['url_key'] = get_url_key(article['url'])
1342 | articles.append(article)
1343 | except Exception:
1344 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1345 | break
1346 | logger.info(
1347 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1348 | )
1349 | return articles
1350 |
1351 |
1352 | # @register_online
1353 | # @register_history
1354 | # @register_test
1355 | async def pythoncat() -> list:
1356 | """Python猫"""
1357 | # Crawl from Juejin; the Zhihu column updates too slowly
1358 | source: str = "Python猫"
1359 | user: str = '57b26118a341310060fa74da'
1360 | max_page = 1
1361 | articles: list = await common_spider_juejin(user, source, max_page=max_page)
1362 | logger.info(
1363 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1364 | )
1365 | return articles
1366 |
1367 |
1368 | @register_online
1369 | # @register_history
1370 | # @register_test
1371 | async def zhihu_zhuanlan_python_cn() -> list:
1372 | """Python之美"""
1373 | source: str = "Python之美"
1374 | name: str = 'python-cn'
1375 | articles: list = []
1376 | limit = 10
1377 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1378 | logger.info(
1379 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1380 | )
1381 | return articles
1382 |
1383 |
1384 | @register_online
1385 | # @register_history
1386 | # @register_test
1387 | async def zhihu_zhuanlan_python_cat() -> list:
1388 | """Python猫"""
1389 | source: str = "Python猫"
1390 | name: str = 'pythonCat'
1391 | articles: list = []
1392 | limit = 10
1393 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1394 | logger.info(
1395 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1396 | )
1397 | return articles
1398 |
1399 |
1400 | @register_online
1401 | # @register_history
1402 | # @register_test
1403 | async def zhihu_zhuanlan_pythoncxy() -> list:
1404 | """Python程序员"""
1405 | source: str = "Python程序员"
1406 | name: str = 'pythoncxy'
1407 | articles: list = []
1408 | limit = 10
1409 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1410 | logger.info(
1411 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1412 | )
1413 | return articles
1414 |
1415 |
1416 | @register_online
1417 | # @register_history
1418 | # @register_test
1419 | async def zhihu_zhuanlan_c_111369541() -> list:
1420 | """Python头条"""
1421 | source: str = "Python头条"
1422 | name: str = 'c_111369541'
1423 | articles: list = []
1424 | limit = 10
1425 | articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
1426 | logger.info(
1427 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1428 | )
1429 | return articles
1430 |
1431 |
1432 | @register_online
1433 | # @register_history
1434 | # @register_test
1435 | async def cuiqingcai() -> list:
1436 | """静觅"""
1437 | source: str = "静觅"
1438 | articles: list = []
1439 | max_page: int = 1
1440 | # max_page = 20
1441 | api: str = 'https://cuiqingcai.com/category/technique/python/page/'
1442 | now = ttime()
1443 | this_date = now[5:10]
1444 | this_year = now[:4]
1445 | last_year_int = int(this_year) - 1
1446 | timestamp_today_0 = ptime(now[:10] + ' 00:00:00')
1447 |
1448 | def translate_time_text(raw_time):
1449 | if not raw_time:
1450 | return ''
1451 | raw_time = raw_time.strip()
1452 | # convert each kind of raw time text
1453 | # e.g. "4个月前 (02-21)" (4 months ago)
1454 | # e.g. "2天前" (2 days ago)
1455 | # e.g. "4年前 (2015-02-12)" (4 years ago)
1456 | # first try a dash/colon separated date; if none is found, it should be the "n days/hours ago" case
1457 | date = find_one(r'([\d:\- ]+)', raw_time)[1]
1458 | if date:
1459 | if re.match(r'^\d\d-\d\d$', date):
1460 | # only month and day
1461 | # the month may actually belong to last year, so check first
1462 | if date >= this_date:
1463 | date = f'{last_year_int}-{date}'
1464 | else:
1465 | date = f'{this_year}-{date}'
1466 | result = f'{date} 00:00:00'
1467 | elif re.match(r'^\d\d\d\d-\d\d-\d\d$', date):
1468 | # year, month and day
1469 | result = f'{date} 00:00:00'
1470 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d$', date):
1471 | # year, month, day, hour and minute
1472 | result = f'{date}:00'
1473 | elif re.match(r'^\d\d\d\d-\d\d-\d\d \d:\d\d$', date):
1474 | # year, month, day, hour and minute
1475 | result = f'{date[:11]}0{date[11:]}:00'
1476 | else:
1477 | raise ValueError(f'bad time pattern {raw_time}')
1478 | elif re.match(r'^\d+小时前$', raw_time):
1479 | n_hour = int(find_one(r'\d+', raw_time)[0])
1480 | result = ttime(timestamp_today_0 - n_hour * 3600)
1481 | elif re.match(r'^\d+天前$', raw_time):
1482 | n_day = int(find_one(r'\d+', raw_time)[0])
1483 | result = ttime(timestamp_today_0 - n_day * 86400)
1484 | else:
1485 | raise ValueError(f'bad time pattern {raw_time}')
1486 | return result
1487 |
1488 | for page in range(1, max_page + 1):
1489 | seed = f'{api}{page}'
1490 | r = await req.get(
1491 | seed,
1492 | retry=1,
1493 | timeout=20,
1494 | ssl=False,
1495 | headers={
1496 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
1497 | })
1498 | if not r:
1499 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1500 | return articles
1501 | items = fromstring(
1502 | r.content.decode('u8')).cssselect('div.content>article')
1503 | if not items:
1504 | break
1505 | if max_page > 1:
1506 | logger.info(
1507 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1508 | )
1509 | for item in items:
1510 | try:
1511 | article: dict = {'source': source}
1512 | title = null_tree.css(item, 'header>h2>a').text
1513 | url = null_tree.css(item, 'header>h2>a').get('href', '')
1514 | desc = null_tree.css(item, '.note').text_content()
1515 | cover = null_tree.css(item, 'img.thumb').get('src', '')
1516 | raw_time_text = null_tree.css(
1517 | item, 'p > span:nth-child(2)').text_content()
1518 | article['ts_publish'] = translate_time_text(raw_time_text)
1519 | article['title'] = title
1520 | article['cover'] = cover
1521 | article['desc'] = desc
1522 | article['url'] = url
1523 | article['url_key'] = get_url_key(article['url'])
1524 | articles.append(article)
1525 | except Exception:
1526 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1527 | break
1528 | logger.info(
1529 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1530 | )
1531 | return articles
1532 |
1533 |
1534 | @register_online
1535 | # @register_history
1536 | # @register_test
1537 | async def tuicool_cn() -> list:
1538 | """推酷(中文)"""
1539 | source: str = "推酷(中文)"
1540 | articles: list = []
1541 | max_page: int = 1
1542 | articles = await common_spider_tuicool(
1543 | 'CN',
1544 | source,
1545 | max_page=max_page,
1546 | ignore_descs={'稀土掘金', 'Python猫', 'InfoQ'})
1547 | logger.info(
1548 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1549 | )
1550 | return articles
1551 |
1552 |
1553 | @register_online
1554 | # @register_history
1555 | # @register_test
1556 | async def tuicool_en() -> list:
1557 | """推酷(英文)"""
1558 | source: str = "推酷(英文)"
1559 | articles: list = []
1560 | max_page: int = 1
1561 | articles = await common_spider_tuicool('EN',
1562 | source,
1563 | max_page=max_page,
1564 | ignore_descs={'Real Python'})
1565 | logger.info(
1566 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1567 | )
1568 | return articles
1569 |
1570 |
1571 | # @register_online
1572 | # @register_history
1573 | # @register_test
1574 | async def kf_toutiao() -> list:
1575 | """稀土掘金"""
1576 | source: str = "稀土掘金"
1577 | articles: list = []
1578 | max_page: int = 1
1579 | per_page: int = 20
1580 | sort_by = 'rankIndex'  # 'createdAt' sorts by creation time
1581 | api: str = 'https://timeline-merger-ms.juejin.im/v1/get_tag_entry'
1582 | params: dict = {
1583 | 'src': 'web',
1584 | 'tagId': '559a7227e4b08a686d25744f',
1585 | 'page': 0,
1586 | 'pageSize': per_page,
1587 | 'sort': sort_by
1588 | }
1589 | # 豌豆花下猫 is already collected separately
1590 | ignore_usernames: set = {'豌豆花下猫'}
1591 | for page in range(0, max_page):
1592 | params['page'] = page
1593 | scode = await outlands_request(
1594 | {
1595 | 'method': 'get',
1596 | 'params': params,
1597 | 'url': api,
1598 | 'ssl': False,
1599 | 'retry': 1,
1600 | 'headers': {
1601 | 'Referer': 'https://juejin.im/tag/Python?sort=popular',
1602 | 'Origin': 'https://juejin.im',
1603 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
1604 | }
1605 | }, 'u8')
1606 | if not scode:
1607 | logger.error(f'{source} crawl failed: {scode}')
1608 | return articles
1609 | items = json.loads(scode).get('d', {}).get('entrylist', [])
1610 | if not items:
1611 | break
1612 | if max_page > 1:
1613 | logger.info(
1614 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1615 | )
1616 | for item in items:
1617 | try:
1618 | article: dict = {'source': source}
1619 | # skip sources that are already collected elsewhere
1620 | if item.get('user', {}).get('username', '') in ignore_usernames:
1621 | continue
1622 | # 2019-05-05T03:51:12.886Z
1623 | gmt_time = re.sub(r'\..*', '',
1624 | item['createdAt']).replace('T', ' ')
1625 | ts_publish = ttime(ptime(gmt_time, tzone=0))
1626 | article['ts_publish'] = ts_publish
1627 | article['lang'] = 'en' if item['english'] else 'CN'
1628 | article['title'] = item['title']
1629 | article['cover'] = item['screenshot']
1630 | article['desc'] = item['summaryInfo']
1631 | article['url'] = item['originalUrl']
1632 | article['url_key'] = get_url_key(article['url'])
1633 | articles.append(article)
1634 | except Exception:
1635 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1636 | break
1637 | logger.info(
1638 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1639 | )
1640 | return articles
1641 |
1642 |
1643 | @register_online
1644 | # @register_history
1645 | # @register_test
1646 | async def freelycode() -> list:
1647 | """Python部落"""
1648 | source: str = "Python部落"
1649 | articles: list = []
1650 | max_page: int = 1
1651 | api: str = 'https://python.freelycode.com/contribution/list/0'
1652 | params: dict = {
1653 | 'page_no': 1,
1654 | }
1655 | host: str = 'https://python.freelycode.com/'
1656 |
1657 | def fix_time(raw_time):
1658 | # 2019-03-27 7:02 a.m.
1659 | # 2019-03-22 9:27 a.m.
1660 | # 2019-07-17 9 a.m.
1661 | raw_time = raw_time.replace('中午', '12:01 p.m.')
1662 | if ':' not in raw_time:
1663 | raw_time = f'{raw_time[:-5]}:00{raw_time[-5:]}'
1664 | raw_time = raw_time.replace('.m.', 'm')
1665 | formated_time = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p'))
1666 | return formated_time
1667 |
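# Worked examples for fix_time(), following the sample timestamps above (hedged, since
# ttime/ptime round-trip with their default timezone):
#   fix_time('2019-03-27 7:02 a.m.')  # -> '2019-03-27 07:02:00'
#   fix_time('2019-07-17 9 a.m.')     # a ':00' is padded in first, then parsed the same way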
1668 | for page in range(1, max_page + 1):
1669 | params['page_no'] = page
1670 | r = await req.get(
1671 | api,
1672 | ssl=False,
1673 | params=params,
1674 | # proxy=proxy,
1675 | retry=2,
1676 | timeout=5,
1677 | headers={
1678 | 'Referer': api,
1679 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
1680 | },
1681 | )
1682 | if not r:
1683 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1684 | return articles
1685 | scode: str = r.content.decode('u8', 'ignore')
1686 | items: list = fromstring(scode).cssselect(
1687 | '.table-bordered tr:nth-child(n+2)')
1688 | if not items:
1689 | break
1690 | if max_page > 1:
1691 | logger.info(
1692 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1693 | )
1694 | for item in items:
1695 | try:
1696 | article: dict = {'source': source}
1697 | title_href = item.cssselect('td:nth-child(2)>a')
1698 | if not title_href:
1699 | continue
1700 | title: str = title_href[0].text
1701 | href: str = title_href[0].get('href', '')
1702 | url: str = add_host(href, host)
1703 | desc: str = null_tree.css(item, 'td:nth-child(3)').text
1704 | if desc:
1705 | desc = f'作者: {desc}'
1706 | raw_time: str = null_tree.css(item, 'td:nth-child(4)').text
1707 | ts_publish = fix_time(raw_time)
1708 | article['ts_publish'] = ts_publish
1709 | article['title'] = title
1710 | article['desc'] = desc
1711 | article['url'] = url
1712 | article['url_key'] = get_url_key(article['url'])
1713 | articles.append(article)
1714 | except Exception:
1715 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1716 | break
1717 | logger.info(
1718 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1719 | )
1720 | return articles
1721 |
1722 |
1723 | @register_online
1724 | # @register_history
1725 | # @register_test
1726 | async def miguelgrinberg() -> list:
1727 | """miguelgrinberg"""
1728 | source: str = "miguelgrinberg"
1729 | articles: list = []
1730 | start_page: int = 1
1731 | max_page: int = 1
1732 | api: str = 'https://blog.miguelgrinberg.com/index/page/'
1733 | host: str = 'https://blog.miguelgrinberg.com/'
1734 |
1735 | for page in range(start_page, max_page + 1):
1736 | page_url = f'{api}{page}'
1737 | scode = await outlands_request({'url': page_url}, retry=1)
1738 | if not scode:
1739 | logger.error(f'{source} crawl failed: {scode}')
1740 | return articles
1741 | scode = re.sub(r'', '', scode)
1742 | items: list = fromstring(scode).cssselect('#main>.post')
1743 | if not items:
1744 | break
1745 | if max_page > 1:
1746 | logger.info(
1747 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1748 | )
1749 | for item in items:
1750 | try:
1751 | article: dict = {'source': source}
1752 | title_href = item.cssselect('h1.post-title>a')
1753 | if not title_href:
1754 | continue
1755 | title: str = title_href[0].text
1756 | href: str = title_href[0].get('href', '')
1757 | url: str = add_host(href, host)
1758 | desc: str = null_tree.css(item, '.post_body>p').text_content()
1759 | raw_time: str = null_tree.css(item, '.date>span').get(
1760 | 'data-timestamp', '').replace('T', ' ').replace('Z', '')
1761 | ts_publish = ttime(ptime(raw_time, tzone=0))
1762 | article['ts_publish'] = ts_publish
1763 | article['title'] = title
1764 | article['desc'] = shorten_desc(desc)
1765 | article['url'] = url
1766 | article['url_key'] = get_url_key(article['url'])
1767 | articles.append(article)
1768 | except Exception:
1769 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1770 | break
1771 | logger.info(
1772 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1773 | )
1774 | return articles
1775 |
1776 |
1777 | @register_online
1778 | # @register_history
1779 | # @register_test
1780 | async def codingpy() -> list:
1781 | """编程派"""
1782 | source: str = "编程派"
1783 | articles: list = []
1784 | start_page: int = 1
1785 | max_page: int = 1
1786 | api: str = 'https://codingpy.com/article/'
1787 | params: dict = {'page': 1}
1788 | host: str = 'https://codingpy.com/'
1789 |
1790 | for page in range(start_page, max_page + 1):
1791 | params['page'] = page
1792 | r = await req.get(
1793 | api,
1794 | params=params,
1795 | ssl=False,
1796 | # proxy=proxy,
1797 | retry=2,
1798 | timeout=5,
1799 | headers={
1800 | 'Referer': api,
1801 | 'User-Agent': CHROME_PC_UA
1802 | },
1803 | )
1804 | if not r:
1805 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1806 | return articles
1807 | scode: str = r.content.decode('u8', 'ignore')
1808 | items: list = fromstring(scode).cssselect('.archive-main>article')
1809 | if not items:
1810 | break
1811 | if max_page > 1:
1812 | logger.info(
1813 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1814 | )
1815 | for item in items:
1816 | try:
1817 | article: dict = {'source': source}
1818 | title_href = item.cssselect('.list-item-title>a')
1819 | title: str = title_href[0].text
1820 | href: str = title_href[0].get('href', '')
1821 | bg: str = null_tree.css(item, '.lim-cover').get('style', '')
1822 | # background-image:url(/media/articles/why-python-for-startups.jpg)
1823 | cover: str = find_one(r'background-image:url\((.*?)\)', bg)[1]
1824 | cover = add_host(cover, host)
1825 | url: str = add_host(href, host)
1826 | desc: str = null_tree.css(
1827 | item, '.list-item-summary>p').text_content()
1828 | raw_time: str = null_tree.css(item,
1829 | '.list-item-meta>p>span').text
1830 | # 2015.11.03
1831 | ts_publish = ttime(ptime(raw_time, fmt='%Y.%m.%d'))
1832 | article['ts_publish'] = ts_publish
1833 | article['title'] = title
1834 | article['cover'] = cover
1835 | article['desc'] = shorten_desc(desc)
1836 | article['url'] = url
1837 | article['url_key'] = get_url_key(article['url'])
1838 | articles.append(article)
1839 | except Exception:
1840 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1841 | break
1842 | logger.info(
1843 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1844 | )
1845 | return articles
1846 |
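# A standalone sketch (not part of the original module, stdlib only): the cover
# URL above comes from an inline style attribute, and a plain re.search does the
# same job as find_one(r'background-image:url\((.*?)\)', bg)[1].
import re

style = 'background-image:url(/media/articles/why-python-for-startups.jpg)'
match = re.search(r'background-image:url\((.*?)\)', style)
cover_path = match.group(1) if match else ''
# cover_path == '/media/articles/why-python-for-startups.jpg'; add_host() then
# prepends the site host to make it an absolute URL.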
1847 |
1848 | @register_online
1849 | # @register_history
1850 | # @register_test
1851 | async def nedbatchelder() -> list:
1852 | """Ned Batchelder"""
1853 | source: str = "Ned Batchelder"
1854 | articles: list = []
1855 | limit: int = 5
1856 | api: str = 'https://nedbatchelder.com/blog/tag/python.html'
1857 | host: str = 'https://nedbatchelder.com/'
1858 | scode = await outlands_request(
1859 | {
1860 | 'method': 'get',
1861 | 'timeout': 5,
1862 | 'headers': {
1863 | 'Referer': api,
1864 | 'User-Agent': CHROME_PC_UA,
1865 | },
1866 | 'url': api,
1867 | }, 'u8')
1868 | container_html = null_tree.tostring(
1869 | null_tree.css(fromstring(scode), '.category')).decode('utf-8')
1870 | if not container_html:
1871 | logger.error(f'{source} not found container_html.')
1872 | return articles
1873 | split_by: str = ''
1874 | container_html = container_html.replace(
1875 | '', f'{split_by} ').replace(
1876 | '', '').replace(' ', '')
1877 | items: list = container_html.split(split_by)[1:limit + 1]
1878 | if not items:
1879 | return articles
1880 | for item in items:
1881 | try:
1882 | article: dict = {'source': source}
1883 | title_href = find_one(r' \s*([^<]+?)', item)
1884 | title: str = title_href[2]
1885 | href: str = title_href[1]
1886 | url: str = add_host(href, host)
1887 | raw_time: str = find_one(r' (\d+ .*?\d+): ',
1888 | item)[1]
1889 | ts_publish = ttime(ptime(raw_time, fmt='%d %b %Y'))
1890 | article['ts_publish'] = ts_publish
1891 | article['title'] = title
1892 | article['url'] = url
1893 | article['url_key'] = get_url_key(article['url'])
1894 | articles.append(article)
1895 | except Exception:
1896 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1897 | break
1898 | logger.info(
1899 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1900 | )
1901 | return articles
1902 |
1903 |
1904 | @register_online
1905 | # @register_history
1906 | # @register_test
1907 | async def the5fire() -> list:
1908 | """the5fire的技术博客"""
1909 | source: str = "the5fire的技术博客"
1910 | articles: list = []
1911 | start_page: int = 1
1912 | max_page: int = 1
1913 | api: str = 'https://www.the5fire.com/category/python/'
1914 | host: str = 'https://www.the5fire.com/'
1915 | params: dict = {'page': 1}
1916 |
1917 | for page in range(start_page, max_page + 1):
1918 | params['page'] = page
1919 | r = await req.get(
1920 | api,
1921 | params=params,
1922 | ssl=False,
1923 | # proxy=proxy,
1924 | retry=1,
1925 | headers={
1926 | 'Referer': api,
1927 | 'User-Agent': CHROME_PC_UA
1928 | },
1929 | )
1930 | if not r:
1931 | logger.error(f'{source} crawl failed: {r}, {r.text}')
1932 | return articles
1933 | scode: str = r.content.decode('u8', 'ignore')
1934 | items: list = fromstring(scode).cssselect('#main>.caption')
1935 | if not items:
1936 | break
1937 | if max_page > 1:
1938 | logger.info(
1939 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
1940 | )
1941 | for item in items:
1942 | try:
1943 | article: dict = {'source': source}
1944 | title_href = item.cssselect('h3>a')
1945 | title: str = title_href[0].text
1946 | href: str = title_href[0].get('href', '')
1947 | url: str = add_host(href, host)
1948 | desc: str = null_tree.css(item, '.caption>p').text_content()
1949 | raw_time: str = null_tree.css(item, '.info').text_content()
1950 | # 发布:2019-02-22 9:47 p.m.
1951 | raw_time = find_one(r'发布:(\d\d\d\d-\d{1,2}-\d{1,2}.*)',
1952 | raw_time)[1].replace('.', '')
1953 | # 2019-03-20 10:07 p.m.
1954 | # 2011-05-28 10 a.m.
1955 | # 2011-12-08 午夜
1956 | if ':' not in raw_time:
1957 | if 'm' in raw_time:
1958 | raw_time = re.sub('m.*', 'm', raw_time)
1959 | ts_publish = ttime(ptime(raw_time,
1960 | fmt='%Y-%m-%d %I %p'))
1961 | else:
1962 | raw_time = raw_time[:10]
1963 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
1964 | else:
1965 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %I:%M %p'))
1966 | article['ts_publish'] = ts_publish
1967 | article['title'] = title
1968 | article['desc'] = shorten_desc(desc)
1969 | article['url'] = url
1970 | article['url_key'] = get_url_key(article['url'])
1971 | articles.append(article)
1972 | except Exception:
1973 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
1974 | break
1975 | logger.info(
1976 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
1977 | )
1978 | return articles
1979 |
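# A standalone sketch (not part of the original module, stdlib only) of the
# three date shapes handled above; ptime/ttime add timezone handling on top,
# which this simplified version ignores.
from datetime import datetime

def parse_the5fire_time(raw_time: str) -> str:
    raw_time = raw_time.replace('.', '')            # '9:47 p.m.' -> '9:47 pm'
    if ':' in raw_time:                             # '2019-03-20 10:07 pm'
        dt = datetime.strptime(raw_time, '%Y-%m-%d %I:%M %p')
    elif 'm' in raw_time:                           # '2011-05-28 10 am'
        dt = datetime.strptime(raw_time, '%Y-%m-%d %I %p')
    else:                                           # '2011-12-08 午夜' (midnight)
        dt = datetime.strptime(raw_time[:10], '%Y-%m-%d')
    return dt.strftime('%Y-%m-%d %H:%M:%S')

# parse_the5fire_time('2019-03-20 10:07 p.m.')  -> '2019-03-20 22:07:00'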
1980 |
1981 | @register_online
1982 | # @register_history
1983 | # @register_test
1984 | async def foofish() -> list:
1985 | """Python之禅"""
1986 | source: str = "Python之禅"
1987 | articles: list = []
1988 | start_page: int = 1
1989 | max_page: int = 1
1990 | api: str = 'https://foofish.net/index.html'
1991 | host: str = 'https://foofish.net/'
1992 |
1993 | for page in range(start_page, max_page + 1):
1994 | if page == 1:
1995 | seed = api
1996 | else:
1997 | seed = api.replace('index.html', f'index{page}.html')
1998 | r = await req.get(
1999 | seed,
2000 | ssl=False,
2001 | # proxy=proxy,
2002 | retry=1,
2003 | headers={
2004 | 'Referer': api,
2005 | 'User-Agent': CHROME_PC_UA
2006 | },
2007 | )
2008 | if not r:
2009 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2010 | return articles
2011 | scode: str = r.content.decode('u8', 'ignore')
2012 | container: str = find_one(r' [\s\S]*? ',
2013 | scode)[0]
2014 | if not container:
2015 | logger.error('container not found')
2016 | return articles
2017 | items: list = re.findall(r' [\S\s]*?', container)
2018 | if not items:
2019 | break
2020 | if max_page > 1:
2021 | logger.info(
2022 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2023 | )
2024 | for item_html in items:
2025 | try:
2026 | article: dict = {'source': source}
2027 | item = fromstring(item_html)
2028 | title_href = item.cssselect('a')
2029 | title: str = title_href[0].text
2030 | href: str = title_href[0].get('href', '')
2031 | url: str = add_host(href, host)
2032 | raw_time: str = null_tree.css(item, 'dt').text
2033 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
2034 | article['ts_publish'] = ts_publish
2035 | article['title'] = title
2036 | article['url'] = url
2037 | article['url_key'] = get_url_key(article['url'])
2038 | articles.append(article)
2039 | except Exception:
2040 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2041 | break
2042 | logger.info(
2043 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2044 | )
2045 | return articles
2046 |
2047 |
2048 | @register_online
2049 | # @register_history
2050 | # @register_test
2051 | async def inventwithpython() -> list:
2052 | """The Invent with Python Blog"""
2053 | source: str = "The Invent with Python Blog"
2054 | articles: list = []
2055 | start_page: int = 1
2056 | max_page: int = 1
2057 | api: str = 'https://inventwithpython.com/blog/index.html'
2058 | host: str = 'https://inventwithpython.com/'
2059 |
2060 | for page in range(start_page, max_page + 1):
2061 | if page == 1:
2062 | seed = api
2063 | else:
2064 | seed = api.replace('index.html', f'index{page}.html')
2065 | r = await req.get(
2066 | seed,
2067 | ssl=False,
2068 | # proxy=proxy,
2069 | retry=1,
2070 | headers={
2071 | 'Referer': api,
2072 | 'User-Agent': CHROME_PC_UA
2073 | },
2074 | )
2075 | if not r:
2076 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2077 | return articles
2078 | scode: str = r.content.decode('u8', 'ignore')
2079 | items: list = fromstring(scode).cssselect('#content>article')
2080 | if not items:
2081 | break
2082 | if max_page > 1:
2083 | logger.info(
2084 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2085 | )
2086 | for item in items:
2087 | try:
2088 | article: dict = {'source': source}
2089 | title_href = null_tree.css(item, 'h1>a')
2090 | title: str = title_href.text
2091 | href: str = title_href.get('href', '')
2092 | url: str = add_host(href, host)
2093 | raw_time: str = null_tree.css(
2094 | item, '.article-header-date').text.strip()
2095 | # Wed 05 June 2019
2096 | ts_publish = ttime(ptime(raw_time, fmt='%a %d %B %Y'))
2097 | article['ts_publish'] = ts_publish
2098 | article['title'] = title
2099 | article['url'] = url
2100 | article['url_key'] = get_url_key(article['url'])
2101 | articles.append(article)
2102 | except Exception:
2103 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2104 | break
2105 | logger.info(
2106 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2107 | )
2108 | return articles
2109 |
2110 |
2111 | @register_online
2112 | # @register_history
2113 | # @register_test
2114 | async def lucumr() -> list:
2115 | """Armin Ronacher's Thoughts and Writings"""
2116 | source: str = "Armin Ronacher's Thoughts and Writings"
2117 | articles: list = []
2118 | start_page: int = 1
2119 | max_page: int = 1
2120 | api: str = 'http://lucumr.pocoo.org/'
2121 | host: str = 'http://lucumr.pocoo.org/'
2122 |
2123 | for page in range(start_page, max_page + 1):
2124 | if page == 1:
2125 | seed = api
2126 | else:
2127 | seed = add_host(f'/page/{page}/', host)
2128 | r = await req.get(
2129 | seed,
2130 | ssl=False,
2131 | # proxy=proxy,
2132 | retry=1,
2133 | headers={
2134 | 'Referer': api,
2135 | 'User-Agent': CHROME_PC_UA
2136 | },
2137 | )
2138 | if not r:
2139 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2140 | return articles
2141 | scode: str = r.content.decode('u8', 'ignore')
2142 | items: list = fromstring(scode).cssselect(
2143 | '.entry-wrapper>.entry-overview')
2144 | if not items:
2145 | break
2146 | if max_page > 1:
2147 | logger.info(
2148 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2149 | )
2150 | for item in items:
2151 | try:
2152 | article: dict = {'source': source}
2153 | title_href = null_tree.css(item, 'h1>a')
2154 | title: str = title_href.text
2155 | href: str = title_href.get('href', '')
2156 | url: str = add_host(href, host)
2157 | desc: str = null_tree.css(item, '.summary>p').text
2158 | raw_time: str = null_tree.css(item, '.date').text.strip()
2159 | # Jun 5, 2017
2160 | ts_publish = ttime(ptime(raw_time, fmt='%b %d, %Y'))
2161 | article['ts_publish'] = ts_publish
2162 | article['title'] = title
2163 | article['desc'] = desc
2164 | article['url'] = url
2165 | article['url_key'] = get_url_key(article['url'])
2166 | articles.append(article)
2167 | except Exception:
2168 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2169 | break
2170 | logger.info(
2171 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2172 | )
2173 | return articles
2174 |
2175 |
2176 | @register_online
2177 | # @register_history
2178 | # @register_test
2179 | async def treyhunner() -> list:
2180 | """Trey Hunner"""
2181 | source: str = "Trey Hunner"
2182 | articles: list = []
2183 | limit: int = 5
2184 | api: str = 'https://treyhunner.com/blog/categories/python/'
2185 | host: str = 'https://treyhunner.com/'
2186 |
2187 | r = await req.get(
2188 | api,
2189 | ssl=False,
2190 | # proxy=proxy,
2191 | retry=1,
2192 | headers={
2193 | 'Referer': api,
2194 | 'User-Agent': CHROME_PC_UA
2195 | },
2196 | )
2197 | if not r:
2198 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2199 | return articles
2200 | scode: str = r.content.decode('u8', 'ignore')
2201 | items: list = fromstring(scode).cssselect('#blog-archives>article')
2202 | for item in items[:limit]:
2203 | try:
2204 | article: dict = {'source': source}
2205 | title_href = null_tree.css(item, 'h1>a')
2206 | title: str = title_href.text
2207 | href: str = title_href.get('href', '')
2208 | url: str = add_host(href, host)
2209 | raw_time: str = null_tree.css(item, 'time').get('datetime')
2210 | # 2019-06-18T09:15:00-07:00
2211 | ts_publish = ttime(ptime(raw_time.replace('T', ' ')[:19], tzone=-7))
2212 | article['ts_publish'] = ts_publish
2213 | article['title'] = title
2214 | article['url'] = url
2215 | article['url_key'] = get_url_key(article['url'])
2216 | articles.append(article)
2217 | except Exception:
2218 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2219 | break
2220 | logger.info(
2221 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2222 | )
2223 | return articles
2224 |
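# A standalone sketch (not part of the original module): the datetime attribute
# above already carries its own UTC offset, so fromisoformat() can normalise it
# without hard-coding tzone=-7 the way the ptime() call does.
from datetime import datetime, timezone

raw = '2019-06-18T09:15:00-07:00'
dt_utc = datetime.fromisoformat(raw).astimezone(timezone.utc)
print(dt_utc.strftime('%Y-%m-%d %H:%M:%S'))  # 2019-06-18 16:15:00 (UTC)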
2225 |
2226 | @register_online
2227 | # @register_history
2228 | # @register_test
2229 | async def reddit() -> list:
2230 | """Reddit"""
2231 | source: str = "Reddit"
2232 | articles: list = []
2233 | limit: int = 22
2234 | # only include posts with 20+ upvotes
2235 | min_ups: int = 20
2236 | # ... or posts with 10+ comments
2237 | min_cmts: int = 10
2238 | # api doc: https://www.reddit.com/dev/api/#GET_top
2239 | api: str = f'https://api.reddit.com/r/Python/top/?t=day&limit={limit}'
2240 | host: str = 'https://www.reddit.com/'
2241 | for _ in range(2):
2242 | scode = await outlands_request(
2243 | {
2244 | 'method': 'get',
2245 | 'url': api,
2246 | 'headers': {
2247 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
2248 | }
2249 | }, 'u8')
2250 | # print(scode)
2251 | if scode:
2252 | break
2253 | else:
2254 | logger.error(f'{source} crawl failed')
2255 | return articles
2256 | rj: dict = json.loads(scode)
2257 | items: list = rj['data']['children']
2258 | for item in items:
2259 | try:
2260 | if item['kind'] != 't3':
2261 | continue
2262 | data = item['data']
2263 | if (data.get('ups') or data.get('score') or
2264 | 0) < min_ups and (data.get('num_comments') or 0) < min_cmts:
2265 | continue
2266 | article: dict = {'source': source}
2267 | title: str = data['title']
2268 | href: str = data['permalink']
2269 | url: str = add_host(href, host)
2270 | raw_time: str = data['created_utc']
2271 | # 1564420248
2272 | ts_publish = ttime(raw_time, tzone=0)
2273 | desc: str = data.get('author') or ''
2274 | article['ts_publish'] = ts_publish
2275 | article['title'] = title
2276 | article['url'] = url
2277 | article['desc'] = desc
2278 | article['url_key'] = get_url_key(article['url'])
2279 | articles.append(article)
2280 | except Exception:
2281 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2282 | break
2283 | logger.info(
2284 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2285 | )
2286 | return articles
2287 |
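# A standalone sketch (not part of the original module): the inclusion rule
# above as a predicate -- keep a post when it has enough upvotes OR enough
# comments.
def worth_keeping(data: dict, min_ups: int = 20, min_cmts: int = 10) -> bool:
    ups = data.get('ups') or data.get('score') or 0
    comments = data.get('num_comments') or 0
    return ups >= min_ups or comments >= min_cmts

# worth_keeping({'ups': 3, 'num_comments': 15})   -> True  (enough comments)
# worth_keeping({'score': 5, 'num_comments': 2})  -> False (filtered out)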
2288 |
2289 | # @register_online
2290 | # @register_history
2291 | # @register_test
2292 | async def codetengu() -> list:
2293 | """码天狗"""
2294 | source: str = "码天狗"
2295 | articles: list = []
2296 | start_page: int = 1
2297 | # max_page: int = 999
2298 | max_page: int = 1
2299 | api: str = 'https://weekly.codetengu.com/issues'
2300 | params: dict = {'page': 1}
2301 | host: str = 'https://weekly.codetengu.com/'
2302 |
2303 | for page in range(start_page, max_page + 1):
2304 | params['page'] = page
2305 | r = await req.get(
2306 | api,
2307 | params=params,
2308 | ssl=False,
2309 | # proxy=proxy,
2310 | retry=1,
2311 | timeout=10,
2312 | headers={
2313 | 'Referer': api,
2314 | 'User-Agent': CHROME_PC_UA
2315 | },
2316 | )
2317 | if not r:
2318 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2319 | return articles
2320 | scode: str = r.content.decode('u8', 'ignore')
2321 | items: list = fromstring(scode).cssselect('.item__list > li.item')
2322 | if not items:
2323 | break
2324 | if max_page > 1:
2325 | logger.info(
2326 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2327 | )
2328 | for item in items:
2329 | try:
2330 | article: dict = {'source': source}
2331 | title: str = item.cssselect('.item__title')[0].text
2332 | href: str = item.cssselect('a')[0].get('href', '')
2333 | cover: str = null_tree.css(item, 'img').get('src', '')
2334 | cover = add_host(cover, host)
2335 | url: str = add_host(href, host)
2336 | desc: str = null_tree.css(item, '.item__title').text_content()
2337 | raw_time: str = null_tree.css(item, 'time.published').get(
2338 | 'datetime', '1970-01-01')
2339 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
2340 | article['ts_publish'] = ts_publish
2341 | article['title'] = title
2342 | article['cover'] = cover
2343 | article['desc'] = shorten_desc(desc)
2344 | article['url'] = url
2345 | article['url_key'] = get_url_key(article['url'])
2346 | articles.append(article)
2347 | except Exception:
2348 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2349 | break
2350 | logger.info(
2351 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2352 | )
2353 | return articles
2354 |
2355 |
2356 | @register_online
2357 | # @register_history
2358 | # @register_test
2359 | async def pychina() -> list:
2360 | """蠎周刊"""
2361 | source: str = "蠎周刊"
2362 | articles: list = []
2363 | limit: int = 5
2364 | api: str = 'http://weekly.pychina.org/archives.html'
2365 | host: str = 'http://weekly.pychina.org/'
2366 |
2367 | r = await req.get(
2368 | api,
2369 | ssl=False,
2370 | # proxy=proxy,
2371 | retry=1,
2372 | timeout=10,
2373 | headers={
2374 | 'Referer': '',
2375 | 'User-Agent': CHROME_PC_UA
2376 | },
2377 | )
2378 | if not r:
2379 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2380 | return articles
2381 | scode: str = r.content.decode('u8', 'ignore')
2382 | items: list = fromstring(scode).cssselect('#content li')
2383 | for item in items[:limit]:
2384 | try:
2385 | article: dict = {'source': source}
2386 | title_href = item.cssselect('a[title]')
2387 | if not title_href:
2388 | continue
2389 | title: str = title_href[0].text.strip()
2390 | href: str = title_href[0].get('href', '')
2391 | url: str = add_host(href, host)
2392 | raw_time: str = null_tree.css(item, 'sup').text
2393 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d %H:%M'))
2394 | article['ts_publish'] = ts_publish
2395 | article['title'] = title
2396 | article['cover'] = ''
2397 | article['desc'] = ''
2398 | article['url'] = url
2399 | article['url_key'] = get_url_key(article['url'])
2400 | articles.append(article)
2401 | except Exception:
2402 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2403 | break
2404 | logger.info(
2405 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2406 | )
2407 | return articles
2408 |
2409 |
2410 | @register_online
2411 | # @register_history
2412 | # @register_test
2413 | async def xiaoruicc() -> list:
2414 | """峰云就她了"""
2415 | source: str = "峰云就她了"
2416 | articles: list = []
2417 | start_page: int = 1
2418 | max_page: int = 1
2419 | # max_page: int = 999
2420 | api: str = 'http://xiaorui.cc/archives/category/python'
2421 | host: str = 'http://xiaorui.cc/'
2422 |
2423 | for page in range(start_page, max_page + 1):
2424 | api_url = f'{api}{"/page/" + str(page) if page != 1 else ""}/'
2425 | r = await req.get(
2426 | api_url,
2427 | ssl=False,
2428 | # proxy=proxy,
2429 | retry=2,
2430 | timeout=8,
2431 | headers={
2432 | 'Referer': api_url,
2433 | 'User-Agent': CHROME_PC_UA
2434 | },
2435 | )
2436 | if not r:
2437 | if getattr(r, 'status_code', None) != 404:
2438 | logger.error(f'{source} crawl failed: {r}, {r.text}')
2439 | return articles
2440 | scode: str = r.content.decode('u8', 'ignore')
2441 | items: list = fromstring(scode).cssselect('.content-area>article')
2442 | if not items:
2443 | break
2444 | if max_page > 1:
2445 | logger.info(
2446 | f'{source} crawling page {page}, + {len(items)} items = {len(articles)} articles'
2447 | )
2448 | for item in items:
2449 | try:
2450 | article: dict = {'source': source}
2451 | title_href = item.cssselect('.entry-title>a')
2452 | if not title_href:
2453 | continue
2454 | title: str = title_href[0].text.strip()
2455 | href: str = title_href[0].get('href', '')
2456 | url: str = add_host(href, host)
2457 | desc: str = null_tree.css(
2458 | item, '.entry-summary>*:first-child').text_content()
2459 | raw_time: str = null_tree.css(item, 'time.published').get(
2460 | 'datetime', '1970-01-01')
2461 | ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%dT%H:%M:%S%z'))
2462 | article['ts_publish'] = ts_publish
2463 | article['title'] = title
2464 | article['cover'] = ''
2465 | article['desc'] = shorten_desc(desc)
2466 | article['url'] = url
2467 | article['url_key'] = get_url_key(article['url'])
2468 | articles.append(article)
2469 | except Exception:
2470 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2471 | break
2472 | logger.info(
2473 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2474 | )
2475 | return articles
2476 |
2477 |
2478 | @register_online
2479 | # @register_history
2480 | # @register_test
2481 | async def medium_python() -> list:
2482 | """Medium"""
2483 | source: str = 'Medium'
2484 | articles: list = []
2485 | limit = 10
2486 | seed = 'https://medium.com/feed/tag/python'
2487 | # fetch via the external relay to avoid timeouts
2488 | scode = await outlands_request({
2489 | 'method': 'get',
2490 | 'url': seed,
2491 | }, 'u8')
2492 | items = fromstring(scode.encode('utf-8'),
2493 | parser=XMLParser()).xpath('//channel/item')
2494 | now = ttime()
2495 | for item in items[:limit]:
2496 | try:
2497 | article: dict = {'source': source}
2498 | guid = item.xpath('./guid/text()')
2499 | title = item.xpath('./title/text()')
2500 | description = item.xpath('./description/text()')
2501 | author = item.xpath("./*[local-name()='creator']/text()")
2502 | pubDate = item.xpath("./*[local-name()='updated']/text()")
2503 | if not (guid and title):
2504 | continue
2505 | url = guid[0]
2506 | title = title[0]
2507 | if description:
2508 | desc = fromstring(description[0]).text_content()
2509 | # strip the <...> tags
2510 | desc = re.sub('<[^>]*>', ' ', desc)
2511 | # keep only the text before the first line break
2512 | desc = shorten_desc(desc)
2513 | else:
2514 | desc = ''
2515 | if 'Continue reading on' in desc:
2516 | continue
2517 | if author:
2518 | desc = f'[{author[0]}] {desc}'
2519 | if not ensure_cn_en(f'{title}{desc}'):
2520 | continue
2521 | if pubDate:
2522 | raw_pub_date = pubDate[0]
2523 | # Wed, 22 May 2019 01:47:44 +0000
2524 | raw_pub_date = re.sub(r'\..*', '', raw_pub_date).strip()
2525 | ts_publish = ttime(
2526 | ptime(raw_pub_date, fmt='%Y-%m-%dT%H:%M:%S') + 3600 * 8)
2527 | else:
2528 | ts_publish = now
2529 | article['ts_publish'] = ts_publish
2530 | article['title'] = title
2531 | article['desc'] = desc
2532 | article['url'] = url
2533 | article['url_key'] = get_url_key(article['url'])
2534 | articles.append(article)
2535 | except Exception:
2536 | logger.error(f'{source} crawl failed: {traceback.format_exc()}')
2537 | break
2538 | logger.info(
2539 | f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
2540 | )
2541 | return articles
2542 |
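# A standalone sketch (not part of the original module): the description
# cleanup used above, with a hypothetical stand-in for the project's
# shorten_desc helper (which, per the comments, keeps only the text before
# the first line break).
import re

def clean_rss_desc(raw_html: str) -> str:
    text = re.sub('<[^>]*>', ' ', raw_html)                      # drop markup
    first_line = text.strip().splitlines()[0] if text.strip() else ''
    return ' '.join(first_line.split())                          # squeeze spaces

# clean_rss_desc('<p>Async tips<br>for Python</p>\nContinue reading on Medium')
# -> 'Async tips for Python'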
--------------------------------------------------------------------------------
/newspaper/loggers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pathlib
3 | from logging.handlers import RotatingFileHandler
4 |
5 | log_dir = pathlib.Path(__file__).absolute().parent.parent / 'logs'
6 |
7 |
8 | def init_logger(logger_name=None,
9 | file_name='server.log',
10 | max_mb=50,
11 | backupCount=1):
12 | if not log_dir.is_dir():
13 | log_dir.mkdir()
14 | formatter_str = (
15 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s"
16 | )
17 | datefmt = "%Y-%m-%d %H:%M:%S"
18 | formatter = logging.Formatter(formatter_str, datefmt=datefmt)
19 | logger = logging.getLogger(logger_name)
20 | logger.setLevel(logging.INFO)
21 | stream_hl = logging.StreamHandler()
22 | stream_hl.setFormatter(formatter)
23 | stream_hl.setLevel(logging.INFO)
24 | logger.addHandler(stream_hl)
25 |
26 | file_hl = RotatingFileHandler(filename=log_dir / file_name,
27 | maxBytes=1024 * 1024 * max_mb,
28 | backupCount=backupCount,
29 | encoding='utf-8')
30 | file_hl.setFormatter(formatter)
31 | file_hl.setLevel(logging.INFO)
32 | logger.addHandler(file_hl)
33 | return logger
34 |
35 |
36 | logger = init_logger('server', 'server.log')
37 | spider_logger = init_logger('spider_logger',
38 | 'spider.log',
39 | max_mb=5,
40 | backupCount=1)
41 |
--------------------------------------------------------------------------------
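A minimal usage sketch for init_logger above (assuming the `newspaper` package is importable from the project root; the `crawler.log` file name is made up for illustration):

    from newspaper.loggers import init_logger

    # writes to ./logs/crawler.log (rotating at 10 MB, two backups) and to stderr
    crawler_logger = init_logger('crawler', 'crawler.log', max_mb=10, backupCount=2)
    crawler_logger.info('spider run started')

Note that calling init_logger twice with the same logger_name would attach duplicate handlers, which is why the module builds its two loggers once at import time.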
/newspaper/models.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import re
3 | import sqlite3
4 | import typing
5 | import warnings
6 | from datetime import datetime
7 |
8 | import aiomysql
9 | from async_lru import alru_cache
10 | from torequests.utils import ptime, time, ttime
11 |
12 | from .loggers import logger
13 | from .crawler.sources import content_sources_dict
14 |
15 | # INSERT IGNORE still emits warnings, and wrapping every insert in try/except is worse, so just silence them all...
16 | warnings.filterwarnings('ignore', category=aiomysql.Warning)
17 |
18 |
19 | class Storage(object, metaclass=abc.ABCMeta):
20 | """存储器抽象. 统一参数对文章数据库进行增删改查."""
21 | max_limit = 100  # cap limit so a single query cannot fetch too many rows at once
22 | articles_table_columns = ('url_key', 'title', 'url', 'cover', 'desc',
23 | 'source', 'level', 'review', 'ts_publish',
24 | 'ts_create', 'ts_update')
25 |
26 | def format_output_articles(self, articles: typing.Sequence[dict]):
27 | for article in articles:
28 | for key, value in article.items():
29 | if isinstance(value, datetime):
30 | article[key] = str(value)
31 | return articles
32 |
33 | @staticmethod
34 | def ensure_articles(articles: typing.Sequence[dict]) -> list:
35 | valid_articles = []
36 | # ensure_keys = ("url_key", "title", "cover", "desc", "source",
37 | # "review", "ts_publish", "lang")
38 | keys_set = None
39 | now = ttime()
40 | before_3_day_0_0 = f'{ttime(time.time() - 86400*3)[:10]} 00:00:00'
41 | for article in articles:
42 | if not isinstance(article, dict):
43 | continue
44 | if not keys_set:
45 | keys_set = set(article.keys())
46 | else:
47 | # keys that differ from the first article's break executemany, so skip such articles
48 | if set(article.keys()) != keys_set:
49 | continue
50 | # the following fields must all be present before the article can be stored
51 | source = content_sources_dict.get(article['source'])
52 | if not source:
53 | continue
54 | # skip the whole article when a required field is missing or empty
55 | if not all(article.get(ensure_key) for ensure_key in ('url_key', 'title')):
56 | continue
57 | article.setdefault('cover', '')
58 | article.setdefault('desc', '')
59 | article.setdefault('source', 'unknown')
60 | article.setdefault('review', '')
61 | article.setdefault('level', source.get('level', 3))
62 | article.setdefault('lang', source.get('lang', 'CN'))
63 | article.setdefault('ts_publish', '1970-01-01 08:00:01')
64 | article['lang'] = article['lang'].upper()
65 | article['desc'] = re.sub(
66 | r'
--------------------------------------------------------------------------------
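A side note on the key-consistency check in ensure_articles above: executemany can only bind one parameter shape, so every article dict must expose the same keys. A trimmed-down sqlite3 sketch of that constraint (the real table has more columns and is also written through aiomysql):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute(
        'CREATE TABLE articles (url_key TEXT PRIMARY KEY, title TEXT, source TEXT)')
    rows = [
        {'url_key': 'k1', 'title': 'PEP 594', 'source': 'demo'},
        {'url_key': 'k2', 'title': 'asyncio tips', 'source': 'demo'},
    ]  # identical keys in every dict, so one statement covers them all
    conn.executemany(
        'INSERT OR IGNORE INTO articles (url_key, title, source) '
        'VALUES (:url_key, :title, :source)', rows)
    conn.commit()
    print(conn.execute('SELECT COUNT(*) FROM articles').fetchone()[0])  # 2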