├── .env_example
├── .gitignore
├── LICENSE
├── README.md
├── db
│   ├── Dockerfile
│   └── setup.sh
├── docker-compose.yml
└── observer
    ├── Dockerfile
    ├── __init__.py
    ├── app
    │   ├── __init__.py
    │   ├── render.py
    │   ├── run.py
    │   └── static
    │       ├── cover.png
    │       ├── favicon.png
    │       └── index.html
    ├── config.py
    ├── fetcher
    │   ├── __init__.py
    │   ├── run.py
    │   ├── summary.py
    │   └── updater.py
    ├── models
    │   ├── __init__.py
    │   ├── article.py
    │   ├── feed.py
    │   └── summary.py
    ├── repository
    │   ├── __init__.py
    │   ├── interface.py
    │   └── mongo.py
    ├── requirements.txt
    ├── run_app.py
    └── run_fetcher.py
/.env_example:
--------------------------------------------------------------------------------
1 | OBSERVER_MONGO_USER=default
2 | OBSERVER_MONGO_PASS=default
3 | OBSERVER_MONGO_DB=observer
4 | OBSERVER_MONGO_ARTICLES=articles
5 | OBSERVER_MONGO_FEEDS=feeds
6 | OBSERVER_AUTH_TOKEN=default
7 | OBSERVER_SL_PORT=8501
8 | OBSERVER_FEED_UPDATE_TIMEOUT=600
9 | OBSERVER_FEED_CACHE_TTL=60
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Kirill Plotnikov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🧐 Обозреватель Хабра
2 | ### Лента кратких пересказов лучших статей с Хабра от нейросети YandexGPT
3 |
4 | #### Приложение доступно по адресу https://habr.observer
5 | В приложении используются материалы сайта [habr.com](https://habr.com), краткие пересказы которых получены с помощью сервиса [300.ya.ru](https://300.ya.ru).
6 |
7 | #### Деплой
8 | - Установить [Docker](https://docs.docker.com/engine/install/) и [Docker Compose](https://docs.docker.com/compose/install/)
9 | - Склонировать репозиторий: `git clone https://github.com/pltnk/habr-observer.git`
10 | - Создать внутри `.env` файл: `cp .env_example .env`
11 | - В нём установить пользователя и пароль для базы данных, изменив значения переменных `OBSERVER_MONGO_USER` и `OBSERVER_MONGO_PASS`
12 | - Добавить API токен для сервиса [300.ya.ru](https://300.ya.ru), изменив значение переменной `OBSERVER_AUTH_TOKEN` \
13 | Чтобы получить токен, нужно нажать на `API` в левом нижнем углу главной страницы сервиса, а затем нажать кнопку `Получить токен` в правом верхнем углу
14 | - Выполнить `docker compose up -d` из корня склонированного репозитория
15 | - Первоначальный сбор статей может занять несколько минут, так как соблюдается rate limit для API сервиса 300.ya.ru
16 |
17 | #### Сделано с помощью
18 | - [Streamlit](https://github.com/streamlit/streamlit)
19 | - [HTTPX](https://github.com/encode/httpx)
20 | - [Beautiful Soup 4](https://www.crummy.com/software/BeautifulSoup/)
21 | - [lxml](https://github.com/lxml/lxml)
22 | - [Motor](https://github.com/mongodb/motor)
23 |
24 | #### Лицензия
25 | Проект находится под лицензией [MIT](https://choosealicense.com/licenses/mit/) — подробности в файле [LICENSE](LICENSE).
26 |
--------------------------------------------------------------------------------
/db/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mongo:6.0.7
2 | COPY setup.sh /docker-entrypoint-initdb.d/
3 | ARG OBSERVER_MONGO_DB=observer
4 | HEALTHCHECK CMD mongosh ${OBSERVER_MONGO_DB} --eval 'db.runCommand("ping").ok' --quiet | grep 1
5 |
--------------------------------------------------------------------------------
/db/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # create db and collections for feeds and articles
4 | DB="${OBSERVER_MONGO_DB:-observer}"
5 | CMD="db.createCollection( '${OBSERVER_MONGO_ARTICLES:-articles}' ); db.createCollection( '${OBSERVER_MONGO_FEEDS:-feeds}' );"
6 | mongosh "${DB}" --eval "${CMD}" -u "${OBSERVER_MONGO_USER:-default}" -p "${OBSERVER_MONGO_PASS:-default}" --authenticationDatabase admin
7 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.8"
2 |
3 | services:
4 |
5 | db:
6 | build:
7 | context: ./db
8 | dockerfile: Dockerfile
9 | container_name: observer-db
10 | command: --auth
11 | environment:
12 | MONGO_INITDB_ROOT_USERNAME: ${OBSERVER_MONGO_USER:-default}
13 | MONGO_INITDB_ROOT_PASSWORD: ${OBSERVER_MONGO_PASS:-default}
14 | env_file: .env
15 | volumes:
16 | - "~/.habr_observer:/data/db/:rw"
17 | restart: always
18 | healthcheck:
19 | interval: 10s
20 | timeout: 10s
21 | retries: 5
22 | start_period: 30s
23 | logging:
24 | driver: "json-file"
25 | options:
26 | max-size: "10m"
27 | max-file: "1"
28 |
29 | app:
30 | build:
31 | context: ./observer
32 | dockerfile: Dockerfile
33 | args:
34 | SL_PORT: ${OBSERVER_SL_PORT:-8501}
35 | container_name: observer-app
36 | env_file: .env
37 | ports:
38 | - ${OBSERVER_SL_PORT:-8501}:${OBSERVER_SL_PORT:-8501}
39 | depends_on:
40 | db:
41 | condition: service_healthy
42 | restart: always
43 | healthcheck:
44 | interval: 10s
45 | timeout: 10s
46 | retries: 5
47 | start_period: 10s
48 | logging:
49 | driver: "json-file"
50 | options:
51 | max-size: "10m"
52 | max-file: "1"
53 |
--------------------------------------------------------------------------------
/observer/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.9-slim
WORKDIR /opt/observer
ENV PYTHONPATH="/opt/observer:${PYTHONPATH}"
ENV PYTHONUNBUFFERED=true
# curl is only used by HEALTHCHECK below. Install it before any project files
# are copied so this layer stays cached when only the sources change.
RUN apt-get update && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
# Dependencies are installed from requirements.txt alone, so editing the code
# does not invalidate the pip layer.
COPY requirements.txt /opt/observer/requirements.txt
RUN pip install --no-cache-dir --ignore-installed -r requirements.txt
# Overwrite Streamlit's bundled static files with the project's own ones.
COPY ./app/static/* /usr/local/lib/python3.9/site-packages/streamlit/static/
COPY . /opt/observer
ARG SL_PORT=8501
EXPOSE ${SL_PORT}
ENV OBSERVER_SL_PORT=${SL_PORT}
HEALTHCHECK CMD curl --fail http://localhost:${OBSERVER_SL_PORT}/_stcore/health
# The fetcher is started in the background; Streamlit runs in the foreground.
CMD python3 run_fetcher.py & streamlit run run_app.py --server.address=0.0.0.0 --server.port=${OBSERVER_SL_PORT} --browser.gatherUsageStats=false
16 |
--------------------------------------------------------------------------------
/observer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/__init__.py
--------------------------------------------------------------------------------
/observer/app/__init__.py:
--------------------------------------------------------------------------------
1 | from .run import run_app
2 |
--------------------------------------------------------------------------------
/observer/app/render.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable
2 |
3 | import streamlit as st
4 | from streamlit.components.v1 import html
5 |
6 | from models import Article, Feed
7 |
8 |
9 | def render_header() -> None:
10 | st.markdown(
11 | """
12 |
13 |
14 |
15 |
🧐 Обозреватель Хабра
16 |
Краткий пересказ лучших статей с Хабра от нейросети YandexGPT
17 |
18 | """,
19 | unsafe_allow_html=True,
20 | )
21 |
22 |
23 | def render_toggle() -> bool:
24 | st.write(
25 | """
26 |
32 | """,
33 | unsafe_allow_html=True,
34 | )
35 | return st.toggle(
36 | label="Сворачивать пересказы",
37 | value=True,
38 | key="collapse_summaries",
39 | help="Отключите, чтобы показывать пересказы целиком, не сворачивая их",
40 | )
41 |
42 |
def render_theses(theses: Iterable[str]) -> None:
    """Render the given theses as one Markdown bullet list."""
    bullet_lines = ["* " + thesis for thesis in theses]
    st.markdown("\n".join(bullet_lines))
45 |
46 |
47 | def render_article(
48 | article: Article, collapse_summary: bool = True, visible_theses: int = 3
49 | ) -> None:
50 | with st.container():
51 | st.subheader(
52 | article.title,
53 | help=f"Дата публикации: {article.pub_date}",
54 | anchor=False,
55 | )
56 | if collapse_summary:
57 | render_theses(article.summary.content[:visible_theses])
58 | if len(article.summary.content) > visible_theses:
59 | with st.expander(label="Продолжение пересказа"):
60 | render_theses(article.summary.content[visible_theses:])
61 | else:
62 | render_theses(article.summary.content)
63 | st.caption(
64 | f"""
65 |
74 | """,
75 | unsafe_allow_html=True,
76 | )
77 | st.divider()
78 |
79 |
def render_tab(
    tab: st.delta_generator.DeltaGenerator,
    articles: Iterable[Article],
    collapse_summaries: bool = True,
) -> None:
    """Render every article inside the given tab.

    Args:
        tab: Tab container the articles are drawn into.
        articles: Articles to render, in iteration order.
        collapse_summaries: Forwarded to ``render_article`` as
            ``collapse_summary``.
    """
    with tab:
        for article in articles:
            render_article(article=article, collapse_summary=collapse_summaries)
88 |
89 |
90 | def render_tabs(feeds: Iterable[Feed], collapse_summaries: bool = True) -> None:
91 | st.write(
92 | """
93 |
99 | """,
100 | unsafe_allow_html=True,
101 | )
102 | tabs = st.tabs([feed.name for feed in feeds])
103 | for tab, feed in zip(tabs, feeds):
104 | render_tab(
105 | tab=tab, articles=feed.articles, collapse_summaries=collapse_summaries
106 | )
107 |
108 | # see for an explanation of the below code:
109 | # https://discuss.streamlit.io/t/bug-with-st-tabs-glitches-for-1-frame-while-rendering/33497/12
110 | html(
111 | """
112 |
155 | """,
156 | height=0,
157 | )
158 |
159 |
160 | def render_footer() -> None:
161 | st.caption(
162 | """
163 |
172 | """,
173 | unsafe_allow_html=True,
174 | )
175 | st.caption(
176 | """
177 |
178 | В приложении используются материалы сайта
179 |
180 | habr.com , краткие пересказы которых получены с помощью сервиса
181 |
182 | 300.ya.ru .
183 |
184 | """,
185 | unsafe_allow_html=True,
186 | )
187 | st.markdown(
188 | """
189 |
192 | """,
193 | unsafe_allow_html=True,
194 | )
195 |
--------------------------------------------------------------------------------
/observer/app/run.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import streamlit as st
4 |
5 | from config import (
6 | OBSERVER_FEEDS,
7 | OBSERVER_MONGO_URI,
8 | OBSERVER_MONGO_DB,
9 | OBSERVER_MONGO_ARTICLES,
10 | OBSERVER_MONGO_FEEDS,
11 | OBSERVER_FEED_CACHE_TTL,
12 | )
13 | from repository import MongoAsyncRepository
14 | from .render import render_header, render_toggle, render_tabs, render_footer
15 |
16 |
@st.cache_resource
def get_event_loop() -> asyncio.AbstractEventLoop:
    """Create a new asyncio event loop for the app's async calls.

    The function is wrapped in ``st.cache_resource``.
    """
    loop = asyncio.new_event_loop()
    return loop
20 |
21 |
@st.cache_resource
def create_repository():
    """Build the Mongo repository bound to the app's event loop.

    Connection settings come from ``config``; the loop is the one returned
    by ``get_event_loop``.
    """
    settings = {
        "mongo_uri": OBSERVER_MONGO_URI,
        "db_name": OBSERVER_MONGO_DB,
        "articles_col_name": OBSERVER_MONGO_ARTICLES,
        "feeds_col_name": OBSERVER_MONGO_FEEDS,
    }
    return MongoAsyncRepository(loop=get_event_loop(), **settings)
31 |
32 |
@st.cache_data(ttl=OBSERVER_FEED_CACHE_TTL)
def get_feeds_sync():
    """Load the configured feeds synchronously.

    Runs the repository's ``get_feeds`` coroutine to completion on the loop
    from ``get_event_loop``. The result is cached by ``st.cache_data`` with
    a TTL of ``OBSERVER_FEED_CACHE_TTL``.
    """
    feed_urls = list(OBSERVER_FEEDS.values())
    loop = get_event_loop()
    repository = create_repository()
    return loop.run_until_complete(repository.get_feeds(feed_urls))
38 |
39 |
def run_app():
    """Render the page: header, then the feed tabs or a notice, then footer."""
    render_header()
    with st.spinner(text="Читаю статьи..."):
        feeds = get_feeds_sync()
    if not feeds:
        # Nothing stored yet: show a notice instead of the tabs.
        st.info("Лента пересобирается, загляните позже 😉")
    else:
        collapse_summaries = render_toggle()
        render_tabs(feeds, collapse_summaries=collapse_summaries)
    render_footer()
50 |
--------------------------------------------------------------------------------
/observer/app/static/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/cover.png
--------------------------------------------------------------------------------
/observer/app/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/favicon.png
--------------------------------------------------------------------------------
/observer/app/static/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Обозреватель Хабра
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | You need to enable JavaScript to run this app.
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/observer/config.py:
--------------------------------------------------------------------------------
import os
from urllib.parse import quote_plus

# Feed display name -> RSS URL. The app requests the feeds in this order and
# the repository returns them sorted the same way.
OBSERVER_FEEDS = {
    "Сутки": "https://habr.com/ru/rss/articles/top/daily/?fl=ru",
    "Неделя": "https://habr.com/ru/rss/articles/top/weekly/?fl=ru",
    "Месяц": "https://habr.com/ru/rss/articles/top/monthly/?fl=ru",
    "Год": "https://habr.com/ru/rss/articles/top/yearly/?fl=ru",
    "Всё время": "https://habr.com/ru/rss/articles/top/alltime/?fl=ru",
}

# Every setting below is read from the environment, with a fallback default.
OBSERVER_MONGO_USER = os.environ.get("OBSERVER_MONGO_USER", "default")
OBSERVER_MONGO_PASS = os.environ.get("OBSERVER_MONGO_PASS", "default")
OBSERVER_MONGO_DB = os.environ.get("OBSERVER_MONGO_DB", "observer")
OBSERVER_MONGO_ARTICLES = os.environ.get("OBSERVER_MONGO_ARTICLES", "articles")
OBSERVER_MONGO_FEEDS = os.environ.get("OBSERVER_MONGO_FEEDS", "feeds")
# Credentials are percent-encoded: characters such as "@", ":" or "/" in the
# user name or password would otherwise be parsed as URI delimiters.
OBSERVER_MONGO_URI = (
    f"mongodb://{quote_plus(OBSERVER_MONGO_USER)}"
    f":{quote_plus(OBSERVER_MONGO_PASS)}@db"
)
OBSERVER_AUTH_TOKEN = os.environ.get("OBSERVER_AUTH_TOKEN", "default")
# Passed to asyncio.sleep between feed update rounds.
OBSERVER_FEED_UPDATE_TIMEOUT = int(os.environ.get("OBSERVER_FEED_UPDATE_TIMEOUT", 600))
# Passed to st.cache_data as the TTL of the cached feeds.
OBSERVER_FEED_CACHE_TTL = int(os.environ.get("OBSERVER_FEED_CACHE_TTL", 60))
20 |
--------------------------------------------------------------------------------
/observer/fetcher/__init__.py:
--------------------------------------------------------------------------------
1 | from .run import update_feeds_task
2 | from .updater import FeedUpdater
3 |
--------------------------------------------------------------------------------
/observer/fetcher/run.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from config import (
4 | OBSERVER_FEEDS,
5 | OBSERVER_MONGO_URI,
6 | OBSERVER_MONGO_DB,
7 | OBSERVER_MONGO_ARTICLES,
8 | OBSERVER_MONGO_FEEDS,
9 | OBSERVER_AUTH_TOKEN,
10 | OBSERVER_FEED_UPDATE_TIMEOUT,
11 | )
12 | from repository import MongoAsyncRepository
13 | from .updater import FeedUpdater
14 |
15 |
async def update_feeds(repository: MongoAsyncRepository) -> None:
    """Update every feed from ``OBSERVER_FEEDS`` concurrently.

    One ``asyncio.Lock`` is created per call and handed to every
    ``FeedUpdater`` as its ``throttle_lock``. A failure in one feed does not
    cancel the others; each failure is logged rather than silently dropped.

    Args:
        repository: Storage shared by all feed updaters.
    """
    # Function-scope import keeps this change local to the block.
    import logging

    logger = logging.getLogger(__name__)
    lock = asyncio.Lock()
    feed_names = list(OBSERVER_FEEDS)
    tasks = (
        FeedUpdater(
            name=name,
            url=OBSERVER_FEEDS[name],
            summary_auth_token=OBSERVER_AUTH_TOKEN,
            repository=repository,
            throttle_lock=lock,
        ).update_feed()
        for name in feed_names
    )
    # gather() keeps the order of its arguments, so results line up with names.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for name, result in zip(feed_names, results):
        if isinstance(result, BaseException):
            logger.error("Failed to update feed %r", name, exc_info=result)
29 |
30 |
async def update_feeds_task() -> None:
    """Update all feeds in an endless loop.

    A single repository is created up front and reused for every round.
    After each round the task sleeps for ``OBSERVER_FEED_UPDATE_TIMEOUT``.
    """
    connection = {
        "mongo_uri": OBSERVER_MONGO_URI,
        "db_name": OBSERVER_MONGO_DB,
        "articles_col_name": OBSERVER_MONGO_ARTICLES,
        "feeds_col_name": OBSERVER_MONGO_FEEDS,
    }
    repository = MongoAsyncRepository(**connection)
    while True:
        await update_feeds(repository)
        await asyncio.sleep(OBSERVER_FEED_UPDATE_TIMEOUT)
41 |
--------------------------------------------------------------------------------
/observer/fetcher/summary.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import List, Optional
3 |
4 | import httpx
5 | from bs4 import BeautifulSoup
6 |
7 | from models import Summary
8 |
9 |
async def get_summary_url(
    article_url: str,
    auth_token: str,
    lock: Optional[asyncio.Lock] = None,
    timeout: int = 4,
    is_retry: bool = False,
) -> Optional[str]:
    """Request a summary sharing URL for an article from 300.ya.ru.

    The call sleeps for ``timeout`` seconds and then posts the request. When
    ``lock`` is given it is held for the sleep and the request, so callers
    sharing the lock are served one at a time.

    Args:
        article_url: URL of the article to summarise.
        auth_token: Token sent in the ``Authorization: OAuth`` header.
        lock: Optional lock shared between callers for throttling.
        timeout: Delay in seconds before the request is sent.
        is_retry: True when this call is already the retry after a 429.

    Returns:
        The ``sharing_url`` from the response, or ``None`` on HTTP 404.

    Raises:
        httpx.HTTPStatusError: For any other error status, including a 429
            received on the retry.
    """
    if lock:
        await lock.acquire()
    try:
        await asyncio.sleep(timeout)
        async with httpx.AsyncClient(
            headers={"Authorization": f"OAuth {auth_token}"}, timeout=20
        ) as client:
            res = await client.post(
                "https://300.ya.ru/api/sharing-url", json={"article_url": article_url}
            )
    finally:
        # Always release: a request that raised (timeout, connection error)
        # would otherwise leave the shared lock held and block every caller.
        if lock:
            lock.release()
    if res.status_code == 404:
        return None
    if res.status_code == 429 and not is_retry:
        # Rate limited: retry exactly once, with the delay doubled.
        return await get_summary_url(
            article_url=article_url,
            auth_token=auth_token,
            lock=lock,
            timeout=timeout * 2,
            is_retry=True,
        )
    res.raise_for_status()
    parsed = res.json()
    return parsed["sharing_url"]
41 |
42 |
async def get_summary_content_api(summary_url: str) -> List[str]:
    """Fetch the summary theses through the 300.ya.ru sharing API.

    The text after the last ``/`` of ``summary_url`` is sent as the token.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    *_, token = summary_url.rsplit("/", 1)
    payload = {"token": token}
    async with httpx.AsyncClient(timeout=10) as client:
        res = await client.post("https://300.ya.ru/api/sharing", json=payload)
    res.raise_for_status()
    theses = []
    for thesis in res.json()["thesis"]:
        theses.append(thesis["content"])
    return theses
50 |
51 |
async def get_summary_content_noapi(summary_url: str) -> List[str]:
    """Scrape the summary theses from the HTML page at ``summary_url``.

    The theses are read from the ``li`` items of the first ``ul`` whose
    class attribute starts with ``theses``.

    Raises:
        httpx.HTTPStatusError: If the page answers with an error status.
    """

    def _is_theses_class(css_class) -> bool:
        # Matches class names that start with "theses".
        return isinstance(css_class, str) and css_class.startswith("theses")

    async with httpx.AsyncClient(timeout=10) as client:
        res = await client.get(summary_url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, features="lxml")
    theses_list = soup.find("ul", attrs={"class": _is_theses_class})
    theses = []
    for item in theses_list.find_all("li"):
        # Drop the bullet character and surrounding whitespace.
        theses.append(item.get_text(strip=True).strip("• \n"))
    return theses
62 |
63 |
async def get_summary_content(summary_url: str) -> List[str]:
    """Return the summary theses, preferring the API over page scraping."""
    try:
        return await get_summary_content_api(summary_url=summary_url)
    except Exception:
        # Best-effort fallback: any API failure switches to scraping the page.
        return await get_summary_content_noapi(summary_url=summary_url)
70 |
71 |
async def get_summary(
    article_url: str, auth_token: str, lock: Optional[asyncio.Lock] = None
) -> Summary:
    """Build the ``Summary`` for an article.

    When no sharing URL is available (``get_summary_url`` returned ``None``),
    a placeholder summary pointing at https://300.ya.ru is returned instead.

    Args:
        article_url: URL of the article to summarise.
        auth_token: Token forwarded to ``get_summary_url``.
        lock: Optional lock forwarded to ``get_summary_url``.
    """
    summary_url = await get_summary_url(
        article_url=article_url, auth_token=auth_token, lock=lock
    )
    if summary_url is not None:
        theses = await get_summary_content(summary_url=summary_url)
        return Summary(url=summary_url, content=theses)
    placeholder = [
        "Статья слишком длинная, нейросети пока не умеют пересказывать такие статьи 😔"
    ]
    return Summary(url="https://300.ya.ru", content=placeholder)
87 |
--------------------------------------------------------------------------------
/observer/fetcher/updater.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import datetime
3 | from typing import Dict, Optional, Iterable
4 |
5 | import httpx
6 | from bs4 import BeautifulSoup
7 | from bs4.element import Tag
8 |
9 | from models import Article, Feed
10 | from repository import AsyncRepository
11 | from .summary import get_summary
12 |
13 | DT_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
14 |
15 |
class FeedUpdater:
    """Bring one RSS feed in the repository up to date.

    ``update_feed`` downloads the feed, finds which of its articles are
    already stored, builds the missing ones (including their summaries),
    stores them, and finally stores the feed itself.
    """

    def __init__(
        self,
        name: str,
        url: str,
        summary_auth_token: str,
        repository: AsyncRepository,
        throttle_lock: Optional[asyncio.Lock] = None,
    ):
        """Store the settings and reset the per-update state.

        Args:
            name: Feed name stored on the resulting ``Feed``.
            url: RSS URL to download; also used as the feed ``_id``.
            summary_auth_token: Token forwarded to ``get_summary``.
            repository: Storage used to read and write articles and feeds.
            throttle_lock: Optional lock forwarded to ``get_summary``.
        """
        self._name = name
        self._url = url
        self._summary_auth_token = summary_auth_token
        self._repository = repository
        self._throttle_lock = throttle_lock
        self._parsed: Optional[BeautifulSoup] = None
        self._article_urls: Optional[Iterable[str]] = None
        # Article URL -> Article, for every article known so far.
        self._articles_present: Dict[str, Article] = {}
        self._articles_to_scrape: Optional[Iterable[Tag]] = None

    async def _get_feed_content(self) -> None:
        """Download the feed and parse it as XML into ``self._parsed``."""
        async with httpx.AsyncClient(timeout=10) as client:
            res = await client.get(self._url)
        res.raise_for_status()
        self._parsed = BeautifulSoup(res.content, features="xml")

    async def _filter_articles(self) -> None:
        """Split the feed items into already stored and still to scrape."""
        self._article_urls = [i.text for i in self._parsed.find_all("guid")]
        articles = await self._repository.get_articles(self._article_urls)
        for a in articles:
            self._articles_present[a.url] = a
        self._articles_to_scrape = [
            tag
            for tag in self._parsed.find_all("item")
            if tag.find("guid").text not in self._articles_present
        ]

    async def _scrape_articles(self) -> None:
        """Build the missing articles concurrently and store the successes."""
        tasks = (self._get_article(tag=tag) for tag in self._articles_to_scrape)
        result = await asyncio.gather(*tasks, return_exceptions=True)
        # Failed items come back as exception objects and are left out.
        articles = [a for a in result if isinstance(a, Article)]
        if not articles:
            # Every scrape failed; the Mongo insert_many rejects an empty list.
            return
        await self._repository.insert_articles(articles)
        for a in articles:
            self._articles_present[a.url] = a

    async def _get_article(self, tag: Tag) -> Article:
        """Build an ``Article`` from one feed ``item`` tag.

        The ``guid`` text is used as the article URL and ``_id``.
        """
        url = tag.find("guid").text
        summary = await get_summary(
            article_url=url,
            auth_token=self._summary_auth_token,
            lock=self._throttle_lock,
        )
        title = tag.find("title").text or "Без названия"
        pub_date = datetime.datetime.strptime(tag.find("pubDate").text, DT_FORMAT)
        author = tag.find("dc:creator").text
        return Article(
            _id=url, title=title, pub_date=pub_date, author=author, summary=summary
        )

    async def _insert_feed(self) -> None:
        """Store the feed with its articles in feed order.

        Articles that could not be scraped are skipped, so a single failing
        article no longer prevents the whole feed from being stored.
        """
        feed = Feed(
            _id=self._url,
            name=self._name,
            articles=[
                self._articles_present[url]
                for url in self._article_urls
                if url in self._articles_present
            ],
        )
        await self._repository.insert_feed(feed)

    async def update_feed(self) -> None:
        """Run one full update: download, filter, scrape, store."""
        await self._get_feed_content()
        await self._filter_articles()
        if self._articles_to_scrape:
            await self._scrape_articles()
        await self._insert_feed()
88 |
--------------------------------------------------------------------------------
/observer/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .article import Article
2 | from .feed import Feed
3 | from .summary import Summary
4 |
--------------------------------------------------------------------------------
/observer/models/article.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | from dataclasses import dataclass, asdict
5 |
6 | from .summary import Summary
7 |
8 |
@dataclass
class Article:
    """An article together with its summary.

    ``_id`` holds the article URL and is exposed through ``url``.
    """

    _id: str
    title: str
    pub_date: datetime.datetime
    author: str
    summary: Summary

    @property
    def url(self) -> str:
        """Return the article URL, which is stored as ``_id``."""
        return self._id

    def as_dict(self) -> dict:
        """Return the article as a plain dict, nested summary included."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Article:
        """Build an article from a dict shaped like ``as_dict`` output.

        The instance is created through ``cls`` so subclasses get their own
        type back.
        """
        return cls(
            _id=d["_id"],
            title=d["title"],
            pub_date=d["pub_date"],
            author=d["author"],
            summary=Summary.from_dict(d["summary"]),
        )
33 |
--------------------------------------------------------------------------------
/observer/models/feed.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dataclasses import dataclass, asdict
4 | from typing import List
5 |
6 | from .article import Article
7 |
8 |
@dataclass
class Feed:
    """A named feed and its articles.

    ``_id`` holds the feed URL and is exposed through ``url``.
    """

    _id: str
    name: str
    articles: List[Article]

    @property
    def url(self) -> str:
        """Return the feed URL, which is stored as ``_id``."""
        return self._id

    def as_dict(self) -> dict:
        """Return the feed as a plain dict, nested articles included."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Feed:
        """Build a feed from a dict shaped like ``as_dict`` output.

        The instance is created through ``cls`` so subclasses get their own
        type back.
        """
        return cls(
            _id=d["_id"],
            name=d["name"],
            articles=[Article.from_dict(a) for a in d["articles"]],
        )
29 |
--------------------------------------------------------------------------------
/observer/models/summary.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dataclasses import dataclass, asdict
4 | from typing import List
5 |
6 |
@dataclass
class Summary:
    """A summary: the URL it is available at and its list of theses."""

    url: str
    content: List[str]

    def as_dict(self) -> dict:
        """Return the summary as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Summary:
        """Build a summary from a dict with ``url`` and ``content`` keys.

        The instance is created through ``cls`` so subclasses get their own
        type back.
        """
        return cls(url=d["url"], content=d["content"])
18 |
--------------------------------------------------------------------------------
/observer/repository/__init__.py:
--------------------------------------------------------------------------------
1 | from .interface import AsyncRepository
2 | from .mongo import MongoAsyncRepository
3 |
--------------------------------------------------------------------------------
/observer/repository/interface.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Iterable
3 |
4 | from models import Article, Feed
5 |
6 |
class AsyncRepository(ABC):
    """Abstract asynchronous storage for articles and feeds.

    Concrete repositories must implement all four coroutines below.
    """

    @abstractmethod
    async def get_articles(self, urls: Iterable[str]) -> Iterable[Article]:
        """Return the stored articles matching ``urls``."""
        raise NotImplementedError

    @abstractmethod
    async def insert_articles(self, articles: Iterable[Article]) -> None:
        """Store the given articles."""
        raise NotImplementedError

    @abstractmethod
    async def insert_feed(self, feed: Feed) -> None:
        """Store the given feed."""
        raise NotImplementedError

    @abstractmethod
    async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]:
        """Return the stored feeds matching ``ids``."""
        raise NotImplementedError
23 |
--------------------------------------------------------------------------------
/observer/repository/mongo.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import Iterable, Optional
3 |
4 | from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection
5 |
6 | from models import Article, Feed
7 | from .interface import AsyncRepository
8 |
9 |
class MongoAsyncRepository(AsyncRepository):
    """``AsyncRepository`` backed by MongoDB through Motor."""

    def __init__(
        self,
        mongo_uri: str,
        db_name: str,
        articles_col_name: str,
        feeds_col_name: str,
        loop: Optional[asyncio.AbstractEventLoop] = None,
    ):
        """Create the client and bind the two collections.

        Args:
            mongo_uri: MongoDB connection URI.
            db_name: Database holding both collections.
            articles_col_name: Name of the articles collection.
            feeds_col_name: Name of the feeds collection.
            loop: Event loop passed to Motor as ``io_loop``; when omitted
                the client is created without one.
        """
        if loop:
            self._client = AsyncIOMotorClient(host=mongo_uri, io_loop=loop)
        else:
            self._client = AsyncIOMotorClient(host=mongo_uri)
        self._db = self._client[db_name]
        self._articles: AsyncIOMotorCollection = self._db[articles_col_name]
        self._feeds: AsyncIOMotorCollection = self._db[feeds_col_name]

    async def get_articles(self, ids: Iterable[str]) -> Iterable[Article]:
        """Return the stored articles whose ``_id`` is in ``ids``."""
        # Materialise: the query document needs a real list, not any iterable.
        id_list = list(ids)
        cursor = self._articles.find({"_id": {"$in": id_list}})
        articles = [Article.from_dict(d) async for d in cursor]
        return articles

    async def insert_articles(self, articles: Iterable[Article]) -> None:
        """Insert the given articles; a no-op when there are none."""
        documents = [a.as_dict() for a in articles]
        if not documents:
            # insert_many rejects an empty list.
            return
        await self._articles.insert_many(documents)

    async def insert_feed(self, feed: Feed) -> None:
        """Insert the feed, or overwrite its fields if it already exists."""
        await self._feeds.update_one(
            {"_id": feed.url}, {"$set": feed.as_dict()}, upsert=True
        )

    async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]:
        """Return the stored feeds whose ``_id`` is in ``ids``.

        The result is sorted by each feed's position in ``ids``.
        """
        # ids is used twice in the pipeline, so a one-shot iterable must be
        # turned into a list first.
        id_list = list(ids)
        pipeline = [
            {"$match": {"_id": {"$in": id_list}}},
            {"$addFields": {"__order": {"$indexOfArray": [id_list, "$_id"]}}},
            {"$sort": {"__order": 1}},
        ]
        res = self._feeds.aggregate(pipeline)
        feeds = [Feed.from_dict(d) async for d in res]
        return feeds
49 |
--------------------------------------------------------------------------------
/observer/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.12.2
2 | httpx==0.25.0
3 | lxml==4.9.3
4 | motor==3.3.1
5 | streamlit==1.26.0
6 |
--------------------------------------------------------------------------------
/observer/run_app.py:
--------------------------------------------------------------------------------
import streamlit as st

from app import run_app

# Markdown shown in the "About" entry of the page menu.
ABOUT_TEXT = (
    "Author: [pltnk.dev](https://pltnk.dev) ✧˚₊‧⋆‧ "
    "Source: [habr-observer](https://github.com/pltnk/habr-observer)"
)

# Page menu entries passed to st.set_page_config.
MENU_ITEMS = {
    "Get help": None,
    "Report a Bug": "https://github.com/pltnk/habr-observer/issues",
    "About": ABOUT_TEXT,
}

st.set_page_config(page_title="habr.observer", page_icon="🧐", menu_items=MENU_ITEMS)


if __name__ == "__main__":
    run_app()
19 |
--------------------------------------------------------------------------------
/observer/run_fetcher.py:
--------------------------------------------------------------------------------
import asyncio

from fetcher import update_feeds_task


def main() -> None:
    """Run the feed update task on a fresh event loop."""
    asyncio.run(update_feeds_task())


if __name__ == "__main__":
    main()
7 |
--------------------------------------------------------------------------------