├── .env_example ├── .gitignore ├── LICENSE ├── README.md ├── db ├── Dockerfile └── setup.sh ├── docker-compose.yml └── observer ├── Dockerfile ├── __init__.py ├── app ├── __init__.py ├── render.py ├── run.py └── static │ ├── cover.png │ ├── favicon.png │ └── index.html ├── config.py ├── fetcher ├── __init__.py ├── run.py ├── summary.py └── updater.py ├── models ├── __init__.py ├── article.py ├── feed.py └── summary.py ├── repository ├── __init__.py ├── interface.py └── mongo.py ├── requirements.txt ├── run_app.py └── run_fetcher.py /.env_example: -------------------------------------------------------------------------------- 1 | OBSERVER_MONGO_USER=default 2 | OBSERVER_MONGO_PASS=default 3 | OBSERVER_MONGO_DB=observer 4 | OBSERVER_MONGO_ARTICLES=articles 5 | OBSERVER_MONGO_FEEDS=feeds 6 | OBSERVER_AUTH_TOKEN=default 7 | OBSERVER_SL_PORT=8501 8 | OBSERVER_FEED_UPDATE_TIMEOUT=600 9 | OBSERVER_FEED_CACHE_TTL=60 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Kirill Plotnikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧐 Обозреватель Хабра 2 | ### Лента кратких пересказов лучших статей с Хабра от нейросети YandexGPT 3 | 4 | #### Приложение доступно по адресу https://habr.observer 5 | В приложении используются материалы сайта [habr.com](https://habr.com), краткие пересказы которых получены с помощью сервиса [300.ya.ru](https://300.ya.ru). 6 | 7 | #### Деплой 8 | - Установить [Docker](https://docs.docker.com/engine/install/) и [Docker Compose](https://docs.docker.com/compose/install/) 9 | - Склонировать репозиторий: `git clone https://github.com/pltnk/habr-observer.git` 10 | - Создать внутри `.env` файл: `cp .env_example .env` 11 | - В нём установить пользователя и пароль для базы данных, изменив значения переменных `OBSERVER_MONGO_USER` и `OBSERVER_MONGO_PASS` 12 | - Добавить API токен для сервиса [300.ya.ru](https://300.ya.ru), изменив значение переменной `OBSERVER_AUTH_TOKEN` \ 13 | Чтобы получить токен, нужно нажать на `API` в левом нижнем углу главной страницы сервиса, а затем нажать кнопку `Получить токен` в правом верхнем углу 14 | - Выполнить `docker compose up -d` из корня склонированного репозитория 15 | - Первоначальный сбор статей может занять несколько минут, так как соблюдается rate limit для API сервиса 300.ya.ru 16 | 17 | #### Сделано с помощью 18 | - [Streamlit](https://github.com/streamlit/streamlit) 19 | - [HTTPX](https://github.com/encode/httpx) 20 | - [Beautiful Soup 4](https://www.crummy.com/software/BeautifulSoup/) 21 | - [lxml](https://github.com/lxml/lxml) 22 | - [Motor](https://github.com/mongodb/motor) 23 | 24 | #### Лицензия 25 | Проект находится под лицензией [MIT](https://choosealicense.com/licenses/mit/) — подробности в файле [LICENSE](LICENSE). 26 | -------------------------------------------------------------------------------- /db/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mongo:6.0.7 2 | COPY setup.sh /docker-entrypoint-initdb.d/ 3 | ARG OBSERVER_MONGO_DB=observer 4 | HEALTHCHECK CMD mongosh ${OBSERVER_MONGO_DB} --eval 'db.runCommand("ping").ok' --quiet | grep 1 5 | -------------------------------------------------------------------------------- /db/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # create db and collections for feeds and articles 4 | DB="${OBSERVER_MONGO_DB:-observer}" 5 | CMD="db.createCollection( '${OBSERVER_MONGO_ARTICLES:-articles}' ); db.createCollection( '${OBSERVER_MONGO_FEEDS:-feeds}' );" 6 | mongosh "${DB}" --eval "${CMD}" -u "${OBSERVER_MONGO_USER:-default}" -p "${OBSERVER_MONGO_PASS:-default}" --authenticationDatabase admin 7 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | 5 | db: 6 | build: 7 | context: ./db 8 | dockerfile: Dockerfile 9 | container_name: observer-db 10 | command: --auth 11 | environment: 12 | MONGO_INITDB_ROOT_USERNAME: ${OBSERVER_MONGO_USER:-default} 13 | MONGO_INITDB_ROOT_PASSWORD: ${OBSERVER_MONGO_PASS:-default} 14 | env_file: .env 15 | volumes: 16 | - "~/.habr_observer:/data/db/:rw" 17 | restart: always 18 | healthcheck: 19 | interval: 10s 20 | timeout: 10s 21 | retries: 5 22 | start_period: 30s 23 | logging: 24 | driver: "json-file" 25 | options: 26 | max-size: "10m" 27 | max-file: "1" 28 | 29 | app: 30 | build: 31 | context: ./observer 32 | dockerfile: Dockerfile 33 | args: 34 | SL_PORT: ${OBSERVER_SL_PORT:-8501} 35 | container_name: observer-app 36 | env_file: .env 37 | ports: 38 | - ${OBSERVER_SL_PORT:-8501}:${OBSERVER_SL_PORT:-8501} 39 | depends_on: 40 | db: 41 | condition: service_healthy 42 | restart: always 43 | healthcheck: 44 | interval: 10s 45 | timeout: 10s 46 | retries: 5 47 | start_period: 10s 48 | logging: 49 | driver: "json-file" 50 | options: 51 | max-size: "10m" 52 | max-file: "1" 53 | -------------------------------------------------------------------------------- /observer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /opt/observer 3 | ENV PYTHONPATH "/opt/observer:$PYTHONPATH" 4 | ENV PYTHONUNBUFFERED true 5 | COPY requirements.txt /opt/observer/requirements.txt 6 | RUN pip install --no-cache-dir --ignore-installed -r requirements.txt 7 | COPY ./app/static/* /usr/local/lib/python3.9/site-packages/streamlit/static/ 8 | COPY . /opt/observer 9 | RUN apt update && apt install -y curl \ 10 | && rm -rf /var/lib/apt/lists/* 11 | ARG SL_PORT=8501 12 | EXPOSE ${SL_PORT} 13 | ENV OBSERVER_SL_PORT=${SL_PORT} 14 | HEALTHCHECK CMD curl --fail http://localhost:${OBSERVER_SL_PORT}/_stcore/health 15 | CMD python3 run_fetcher.py & streamlit run run_app.py --server.address=0.0.0.0 --server.port=${OBSERVER_SL_PORT} --browser.gatherUsageStats=false 16 | -------------------------------------------------------------------------------- /observer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/__init__.py -------------------------------------------------------------------------------- /observer/app/__init__.py: -------------------------------------------------------------------------------- 1 | from .run import run_app 2 | -------------------------------------------------------------------------------- /observer/app/render.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | import streamlit as st 4 | from streamlit.components.v1 import html 5 | 6 | from models import Article, Feed 7 | 8 | 9 | def render_header() -> None: 10 | st.markdown( 11 | """ 12 |
13 |

14 |
15 |

🧐
Обозреватель Хабра

16 |

Краткий пересказ лучших статей с Хабра от нейросети YandexGPT

17 |
18 | """, 19 | unsafe_allow_html=True, 20 | ) 21 | 22 | 23 | def render_toggle() -> bool: 24 | st.write( 25 | """ 26 | 32 | """, 33 | unsafe_allow_html=True, 34 | ) 35 | return st.toggle( 36 | label="Сворачивать пересказы", 37 | value=True, 38 | key="collapse_summaries", 39 | help="Отключите, чтобы показывать пересказы целиком, не сворачивая их", 40 | ) 41 | 42 | 43 | def render_theses(theses: Iterable[str]) -> None: 44 | st.markdown("\n".join("* " + i for i in theses)) 45 | 46 | 47 | def render_article( 48 | article: Article, collapse_summary: bool = True, visible_theses: int = 3 49 | ) -> None: 50 | with st.container(): 51 | st.subheader( 52 | article.title, 53 | help=f"Дата публикации: {article.pub_date}", 54 | anchor=False, 55 | ) 56 | if collapse_summary: 57 | render_theses(article.summary.content[:visible_theses]) 58 | if len(article.summary.content) > visible_theses: 59 | with st.expander(label="Продолжение пересказа"): 60 | render_theses(article.summary.content[visible_theses:]) 61 | else: 62 | render_theses(article.summary.content) 63 | st.caption( 64 | f""" 65 |
66 | 67 | 🤖 Ссылка на пересказ 68 | 69 |    70 | 71 | 📃 Открыть оригинал 72 | 73 |
74 | """, 75 | unsafe_allow_html=True, 76 | ) 77 | st.divider() 78 | 79 | 80 | def render_tab( 81 | tab: st.delta_generator.DeltaGenerator, 82 | articles: Iterable[Article], 83 | collapse_summaries: bool = True, 84 | ) -> None: 85 | with tab: 86 | for a in articles: 87 | render_article(article=a, collapse_summary=collapse_summaries) 88 | 89 | 90 | def render_tabs(feeds: Iterable[Feed], collapse_summaries: bool = True) -> None: 91 | st.write( 92 | """ 93 | 99 | """, 100 | unsafe_allow_html=True, 101 | ) 102 | tabs = st.tabs([feed.name for feed in feeds]) 103 | for tab, feed in zip(tabs, feeds): 104 | render_tab( 105 | tab=tab, articles=feed.articles, collapse_summaries=collapse_summaries 106 | ) 107 | 108 | # see for an explanation of the below code: 109 | # https://discuss.streamlit.io/t/bug-with-st-tabs-glitches-for-1-frame-while-rendering/33497/12 110 | html( 111 | """ 112 | 155 | """, 156 | height=0, 157 | ) 158 | 159 | 160 | def render_footer() -> None: 161 | st.caption( 162 | """ 163 |
164 | 165 | 😎 Автор pltnk.dev 166 | 167 |    168 | 169 | 🍝 Код на GitHub 170 | 171 |
172 | """, 173 | unsafe_allow_html=True, 174 | ) 175 | st.caption( 176 | """ 177 |
178 | В приложении используются материалы сайта 179 | 180 | habr.com, краткие пересказы которых получены с помощью сервиса 181 | 182 | 300.ya.ru. 183 |
184 | """, 185 | unsafe_allow_html=True, 186 | ) 187 | st.markdown( 188 | """ 189 |
190 | ⬆️ 191 |
192 | """, 193 | unsafe_allow_html=True, 194 | ) 195 | -------------------------------------------------------------------------------- /observer/app/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import streamlit as st 4 | 5 | from config import ( 6 | OBSERVER_FEEDS, 7 | OBSERVER_MONGO_URI, 8 | OBSERVER_MONGO_DB, 9 | OBSERVER_MONGO_ARTICLES, 10 | OBSERVER_MONGO_FEEDS, 11 | OBSERVER_FEED_CACHE_TTL, 12 | ) 13 | from repository import MongoAsyncRepository 14 | from .render import render_header, render_toggle, render_tabs, render_footer 15 | 16 | 17 | @st.cache_resource 18 | def get_event_loop() -> asyncio.AbstractEventLoop: 19 | return asyncio.new_event_loop() 20 | 21 | 22 | @st.cache_resource 23 | def create_repository(): 24 | return MongoAsyncRepository( 25 | mongo_uri=OBSERVER_MONGO_URI, 26 | db_name=OBSERVER_MONGO_DB, 27 | articles_col_name=OBSERVER_MONGO_ARTICLES, 28 | feeds_col_name=OBSERVER_MONGO_FEEDS, 29 | loop=get_event_loop(), 30 | ) 31 | 32 | 33 | @st.cache_data(ttl=OBSERVER_FEED_CACHE_TTL) 34 | def get_feeds_sync(): 35 | return get_event_loop().run_until_complete( 36 | create_repository().get_feeds(list(OBSERVER_FEEDS.values())), 37 | ) 38 | 39 | 40 | def run_app(): 41 | render_header() 42 | with st.spinner(text="Читаю статьи..."): 43 | feeds = get_feeds_sync() 44 | if feeds: 45 | collapse_summaries = render_toggle() 46 | render_tabs(feeds, collapse_summaries=collapse_summaries) 47 | else: 48 | st.info("Лента пересобирается, загляните позже 😉") 49 | render_footer() 50 | -------------------------------------------------------------------------------- /observer/app/static/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/cover.png -------------------------------------------------------------------------------- /observer/app/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/favicon.png -------------------------------------------------------------------------------- /observer/app/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Обозреватель Хабра 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | -------------------------------------------------------------------------------- /observer/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | OBSERVER_FEEDS = { 4 | "Сутки": "https://habr.com/ru/rss/articles/top/daily/?fl=ru", 5 | "Неделя": "https://habr.com/ru/rss/articles/top/weekly/?fl=ru", 6 | "Месяц": "https://habr.com/ru/rss/articles/top/monthly/?fl=ru", 7 | "Год": "https://habr.com/ru/rss/articles/top/yearly/?fl=ru", 8 | "Всё время": "https://habr.com/ru/rss/articles/top/alltime/?fl=ru", 9 | } 10 | 11 | OBSERVER_MONGO_USER = os.environ.get("OBSERVER_MONGO_USER", "default") 12 | OBSERVER_MONGO_PASS = os.environ.get("OBSERVER_MONGO_PASS", "default") 13 | OBSERVER_MONGO_DB = os.environ.get("OBSERVER_MONGO_DB", "observer") 14 | OBSERVER_MONGO_ARTICLES = os.environ.get("OBSERVER_MONGO_ARTICLES", "articles") 15 | OBSERVER_MONGO_FEEDS = os.environ.get("OBSERVER_MONGO_FEEDS", "feeds") 16 | OBSERVER_MONGO_URI = f"mongodb://{OBSERVER_MONGO_USER}:{OBSERVER_MONGO_PASS}@db" 17 | OBSERVER_AUTH_TOKEN = os.environ.get("OBSERVER_AUTH_TOKEN", "default") 18 | OBSERVER_FEED_UPDATE_TIMEOUT = int(os.environ.get("OBSERVER_FEED_UPDATE_TIMEOUT", 600)) 19 | OBSERVER_FEED_CACHE_TTL = int(os.environ.get("OBSERVER_FEED_CACHE_TTL", 60)) 20 | -------------------------------------------------------------------------------- /observer/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .run import update_feeds_task 2 | from .updater import FeedUpdater 3 | -------------------------------------------------------------------------------- /observer/fetcher/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from config import ( 4 | OBSERVER_FEEDS, 5 | OBSERVER_MONGO_URI, 6 | OBSERVER_MONGO_DB, 7 | OBSERVER_MONGO_ARTICLES, 8 | OBSERVER_MONGO_FEEDS, 9 | OBSERVER_AUTH_TOKEN, 10 | OBSERVER_FEED_UPDATE_TIMEOUT, 11 | ) 12 | from repository import MongoAsyncRepository 13 | from .updater import FeedUpdater 14 | 15 | 16 | async def update_feeds(repository: MongoAsyncRepository) -> None: 17 | lock = asyncio.Lock() 18 | tasks = ( 19 | FeedUpdater( 20 | name=name, 21 | url=url, 22 | summary_auth_token=OBSERVER_AUTH_TOKEN, 23 | repository=repository, 24 | throttle_lock=lock, 25 | ).update_feed() 26 | for name, url in OBSERVER_FEEDS.items() 27 | ) 28 | await asyncio.gather(*tasks, return_exceptions=True) 29 | 30 | 31 | async def update_feeds_task() -> None: 32 | repository = MongoAsyncRepository( 33 | mongo_uri=OBSERVER_MONGO_URI, 34 | db_name=OBSERVER_MONGO_DB, 35 | articles_col_name=OBSERVER_MONGO_ARTICLES, 36 | feeds_col_name=OBSERVER_MONGO_FEEDS, 37 | ) 38 | while True: 39 | await update_feeds(repository) 40 | await asyncio.sleep(OBSERVER_FEED_UPDATE_TIMEOUT) 41 | -------------------------------------------------------------------------------- /observer/fetcher/summary.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional 3 | 4 | import httpx 5 | from bs4 import BeautifulSoup 6 | 7 | from models import Summary 8 | 9 | 10 | async def get_summary_url( 11 | article_url: str, 12 | auth_token: str, 13 | lock: Optional[asyncio.Lock] = None, 14 | timeout: int = 4, 15 | is_retry: bool = False, 16 | ) -> Optional[str]: 17 | if lock: 18 | await lock.acquire() 19 | await asyncio.sleep(timeout) 20 | async with httpx.AsyncClient( 21 | headers={"Authorization": f"OAuth {auth_token}"}, timeout=20 22 | ) as client: 23 | res = await client.post( 24 | "https://300.ya.ru/api/sharing-url", json={"article_url": article_url} 25 | ) 26 | if lock: 27 | lock.release() 28 | if res.status_code == 404: 29 | return None 30 | if res.status_code == 429 and not is_retry: 31 | return await get_summary_url( 32 | article_url=article_url, 33 | auth_token=auth_token, 34 | lock=lock, 35 | timeout=timeout * 2, 36 | is_retry=True, 37 | ) 38 | res.raise_for_status() 39 | parsed = res.json() 40 | return parsed["sharing_url"] 41 | 42 | 43 | async def get_summary_content_api(summary_url: str) -> List[str]: 44 | token = summary_url.rsplit("/", 1)[-1] 45 | async with httpx.AsyncClient(timeout=10) as client: 46 | res = await client.post("https://300.ya.ru/api/sharing", json={"token": token}) 47 | res.raise_for_status() 48 | parsed = res.json() 49 | return [i["content"] for i in parsed["thesis"]] 50 | 51 | 52 | async def get_summary_content_noapi(summary_url: str) -> List[str]: 53 | async with httpx.AsyncClient(timeout=10) as client: 54 | res = await client.get(summary_url) 55 | res.raise_for_status() 56 | parsed = BeautifulSoup(res.content, features="lxml") 57 | tag = parsed.find( 58 | "ul", 59 | attrs={"class": lambda c: isinstance(c, str) and c.startswith("theses")}, 60 | ) 61 | return [i.get_text(strip=True).strip("• \n") for i in tag.find_all("li")] 62 | 63 | 64 | async def get_summary_content(summary_url: str) -> List[str]: 65 | try: 66 | content = await get_summary_content_api(summary_url=summary_url) 67 | except Exception: 68 | content = await get_summary_content_noapi(summary_url=summary_url) 69 | return content 70 | 71 | 72 | async def get_summary( 73 | article_url: str, auth_token: str, lock: Optional[asyncio.Lock] = None 74 | ) -> Summary: 75 | summary_url = await get_summary_url( 76 | article_url=article_url, auth_token=auth_token, lock=lock 77 | ) 78 | if summary_url is None: 79 | return Summary( 80 | url="https://300.ya.ru", 81 | content=[ 82 | "Статья слишком длинная, нейросети пока не умеют пересказывать такие статьи 😔" 83 | ], 84 | ) 85 | content = await get_summary_content(summary_url=summary_url) 86 | return Summary(url=summary_url, content=content) 87 | -------------------------------------------------------------------------------- /observer/fetcher/updater.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | from typing import Dict, Optional, Iterable 4 | 5 | import httpx 6 | from bs4 import BeautifulSoup 7 | from bs4.element import Tag 8 | 9 | from models import Article, Feed 10 | from repository import AsyncRepository 11 | from .summary import get_summary 12 | 13 | DT_FORMAT = "%a, %d %b %Y %H:%M:%S %Z" 14 | 15 | 16 | class FeedUpdater: 17 | def __init__( 18 | self, 19 | name: str, 20 | url: str, 21 | summary_auth_token: str, 22 | repository: AsyncRepository, 23 | throttle_lock: Optional[asyncio.Lock] = None, 24 | ): 25 | self._name = name 26 | self._url = url 27 | self._summary_auth_token = summary_auth_token 28 | self._repository = repository 29 | self._throttle_lock = throttle_lock 30 | self._parsed: Optional[BeautifulSoup] = None 31 | self._article_urls: Optional[Iterable[str]] = None 32 | self._articles_present: Dict[str, Article] = {} 33 | self._articles_to_scrape: Optional[Iterable[Tag]] = None 34 | 35 | async def _get_feed_content(self) -> None: 36 | async with httpx.AsyncClient(timeout=10) as client: 37 | res = await client.get(self._url) 38 | res.raise_for_status() 39 | self._parsed = BeautifulSoup(res.content, features="xml") 40 | 41 | async def _filter_articles(self) -> None: 42 | self._article_urls = [i.text for i in self._parsed.find_all("guid")] 43 | articles = await self._repository.get_articles(self._article_urls) 44 | for a in articles: 45 | self._articles_present[a.url] = a 46 | self._articles_to_scrape = [ 47 | tag 48 | for tag in self._parsed.find_all("item") 49 | if tag.find("guid").text not in self._articles_present 50 | ] 51 | 52 | async def _scrape_articles(self) -> None: 53 | tasks = (self._get_article(tag=tag) for tag in self._articles_to_scrape) 54 | result = await asyncio.gather(*tasks, return_exceptions=True) 55 | articles = [a for a in result if isinstance(a, Article)] 56 | await self._repository.insert_articles(articles) 57 | for a in articles: 58 | self._articles_present[a.url] = a 59 | 60 | async def _get_article(self, tag: Tag) -> Article: 61 | url = tag.find("guid").text 62 | summary = await get_summary( 63 | article_url=url, 64 | auth_token=self._summary_auth_token, 65 | lock=self._throttle_lock, 66 | ) 67 | title = tag.find("title").text or "Без названия" 68 | pub_date = datetime.datetime.strptime(tag.find("pubDate").text, DT_FORMAT) 69 | author = tag.find("dc:creator").text 70 | return Article( 71 | _id=url, title=title, pub_date=pub_date, author=author, summary=summary 72 | ) 73 | 74 | async def _insert_feed(self): 75 | feed = Feed( 76 | _id=self._url, 77 | name=self._name, 78 | articles=[self._articles_present[url] for url in self._article_urls], 79 | ) 80 | await self._repository.insert_feed(feed) 81 | 82 | async def update_feed(self): 83 | await self._get_feed_content() 84 | await self._filter_articles() 85 | if self._articles_to_scrape: 86 | await self._scrape_articles() 87 | await self._insert_feed() 88 | -------------------------------------------------------------------------------- /observer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .article import Article 2 | from .feed import Feed 3 | from .summary import Summary 4 | -------------------------------------------------------------------------------- /observer/models/article.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | from dataclasses import dataclass, asdict 5 | 6 | from .summary import Summary 7 | 8 | 9 | @dataclass 10 | class Article: 11 | _id: str 12 | title: str 13 | pub_date: datetime.datetime 14 | author: str 15 | summary: Summary 16 | 17 | @property 18 | def url(self) -> str: 19 | return self._id 20 | 21 | def as_dict(self) -> dict: 22 | return asdict(self) 23 | 24 | @classmethod 25 | def from_dict(cls, d: dict) -> Article: 26 | return Article( 27 | _id=d["_id"], 28 | title=d["title"], 29 | pub_date=d["pub_date"], 30 | author=d["author"], 31 | summary=Summary.from_dict(d["summary"]), 32 | ) 33 | -------------------------------------------------------------------------------- /observer/models/feed.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass, asdict 4 | from typing import List 5 | 6 | from .article import Article 7 | 8 | 9 | @dataclass 10 | class Feed: 11 | _id: str 12 | name: str 13 | articles: List[Article] 14 | 15 | @property 16 | def url(self): 17 | return self._id 18 | 19 | def as_dict(self) -> dict: 20 | return asdict(self) 21 | 22 | @classmethod 23 | def from_dict(cls, d: dict) -> Feed: 24 | return Feed( 25 | _id=d["_id"], 26 | name=d["name"], 27 | articles=[Article.from_dict(a) for a in d["articles"]], 28 | ) 29 | -------------------------------------------------------------------------------- /observer/models/summary.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass, asdict 4 | from typing import List 5 | 6 | 7 | @dataclass 8 | class Summary: 9 | url: str 10 | content: List[str] 11 | 12 | def as_dict(self) -> dict: 13 | return asdict(self) 14 | 15 | @classmethod 16 | def from_dict(cls, d: dict) -> Summary: 17 | return Summary(url=d["url"], content=d["content"]) 18 | -------------------------------------------------------------------------------- /observer/repository/__init__.py: -------------------------------------------------------------------------------- 1 | from .interface import AsyncRepository 2 | from .mongo import MongoAsyncRepository 3 | -------------------------------------------------------------------------------- /observer/repository/interface.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterable 3 | 4 | from models import Article, Feed 5 | 6 | 7 | class AsyncRepository(ABC): 8 | @abstractmethod 9 | async def get_articles(self, urls: Iterable[str]) -> Iterable[Article]: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | async def insert_articles(self, articles: Iterable[Article]) -> None: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | async def insert_feed(self, feed: Feed) -> None: 18 | raise NotImplementedError 19 | 20 | @abstractmethod 21 | async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]: 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /observer/repository/mongo.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Iterable, Optional 3 | 4 | from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection 5 | 6 | from models import Article, Feed 7 | from .interface import AsyncRepository 8 | 9 | 10 | class MongoAsyncRepository(AsyncRepository): 11 | def __init__( 12 | self, 13 | mongo_uri: str, 14 | db_name: str, 15 | articles_col_name: str, 16 | feeds_col_name: str, 17 | loop: Optional[asyncio.AbstractEventLoop] = None, 18 | ): 19 | if loop: 20 | self._client = AsyncIOMotorClient(host=mongo_uri, io_loop=loop) 21 | else: 22 | self._client = AsyncIOMotorClient(host=mongo_uri) 23 | self._db = self._client[db_name] 24 | self._articles: AsyncIOMotorCollection = self._db[articles_col_name] 25 | self._feeds: AsyncIOMotorCollection = self._db[feeds_col_name] 26 | 27 | async def get_articles(self, ids: Iterable[str]) -> Iterable[Article]: 28 | cursor = self._articles.find({"_id": {"$in": ids}}) 29 | articles = [Article.from_dict(d) async for d in cursor] 30 | return articles 31 | 32 | async def insert_articles(self, articles: Iterable[Article]) -> None: 33 | await self._articles.insert_many([a.as_dict() for a in articles]) 34 | 35 | async def insert_feed(self, feed: Feed) -> None: 36 | await self._feeds.update_one( 37 | {"_id": feed.url}, {"$set": feed.as_dict()}, upsert=True 38 | ) 39 | 40 | async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]: 41 | pipeline = [ 42 | {"$match": {"_id": {"$in": ids}}}, 43 | {"$addFields": {"__order": {"$indexOfArray": [ids, "$_id"]}}}, 44 | {"$sort": {"__order": 1}}, 45 | ] 46 | res = self._feeds.aggregate(pipeline) 47 | feeds = [Feed.from_dict(d) async for d in res] 48 | return feeds 49 | -------------------------------------------------------------------------------- /observer/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.12.2 2 | httpx==0.25.0 3 | lxml==4.9.3 4 | motor==3.3.1 5 | streamlit==1.26.0 6 | -------------------------------------------------------------------------------- /observer/run_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from app import run_app 4 | 5 | st.set_page_config( 6 | page_title="habr.observer", 7 | page_icon="🧐", 8 | menu_items={ 9 | "Get help": None, 10 | "Report a Bug": "https://github.com/pltnk/habr-observer/issues", 11 | "About": "Author: [pltnk.dev](https://pltnk.dev) ✧˚₊‧⋆‧ " 12 | "Source: [habr-observer](https://github.com/pltnk/habr-observer)", 13 | }, 14 | ) 15 | 16 | 17 | if __name__ == "__main__": 18 | run_app() 19 | -------------------------------------------------------------------------------- /observer/run_fetcher.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from fetcher import update_feeds_task 4 | 5 | if __name__ == "__main__": 6 | asyncio.run(update_feeds_task()) 7 | --------------------------------------------------------------------------------