├── .env_example
├── .gitignore
├── LICENSE
├── README.md
├── db
    ├── Dockerfile
    └── setup.sh
├── docker-compose.yml
└── observer
    ├── Dockerfile
    ├── __init__.py
    ├── app
        ├── __init__.py
        ├── render.py
        ├── run.py
        └── static
        │   ├── cover.png
        │   ├── favicon.png
        │   └── index.html
    ├── config.py
    ├── fetcher
        ├── __init__.py
        ├── run.py
        ├── summary.py
        └── updater.py
    ├── models
        ├── __init__.py
        ├── article.py
        ├── feed.py
        └── summary.py
    ├── repository
        ├── __init__.py
        ├── interface.py
        └── mongo.py
    ├── requirements.txt
    ├── run_app.py
    └── run_fetcher.py


/.env_example:
--------------------------------------------------------------------------------
 1 | OBSERVER_MONGO_USER=default
 2 | OBSERVER_MONGO_PASS=default
 3 | OBSERVER_MONGO_DB=observer
 4 | OBSERVER_MONGO_ARTICLES=articles
 5 | OBSERVER_MONGO_FEEDS=feeds
 6 | OBSERVER_AUTH_TOKEN=default
 7 | OBSERVER_SL_PORT=8501
 8 | OBSERVER_FEED_UPDATE_TIMEOUT=600
 9 | OBSERVER_FEED_CACHE_TTL=60
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Kirill Plotnikov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 🧐 Обозреватель Хабра
 2 | ### Лента кратких пересказов лучших статей с Хабра от нейросети YandexGPT
 3 | 
 4 | #### Приложение доступно по адресу https://habr.observer
 5 | В приложении используются материалы сайта [habr.com](https://habr.com), краткие пересказы которых получены с помощью сервиса [300.ya.ru](https://300.ya.ru).
 6 | 
 7 | #### Деплой
 8 | - Установить [Docker](https://docs.docker.com/engine/install/) и [Docker Compose](https://docs.docker.com/compose/install/)
 9 | - Склонировать репозиторий: `git clone https://github.com/pltnk/habr-observer.git`
10 | - Создать внутри `.env` файл: `cp .env_example .env`
11 | - В нём установить пользователя и пароль для базы данных, изменив значения переменных `OBSERVER_MONGO_USER` и `OBSERVER_MONGO_PASS`
12 | - Добавить API токен для сервиса [300.ya.ru](https://300.ya.ru), изменив значение переменной `OBSERVER_AUTH_TOKEN` \
13 |   Чтобы получить токен, нужно нажать на `API` в левом нижнем углу главной страницы сервиса, а затем нажать кнопку `Получить токен` в правом верхнем углу
14 | - Выполнить `docker compose up -d` из корня склонированного репозитория
15 | - Первоначальный сбор статей может занять несколько минут, так как соблюдается rate limit для API сервиса 300.ya.ru
16 | 
17 | #### Сделано с помощью
18 | - [Streamlit](https://github.com/streamlit/streamlit)
19 | - [HTTPX](https://github.com/encode/httpx)
20 | - [Beautiful Soup 4](https://www.crummy.com/software/BeautifulSoup/)
21 | - [lxml](https://github.com/lxml/lxml)
22 | - [Motor](https://github.com/mongodb/motor)
23 | 
24 | #### Лицензия
25 | Проект находится под лицензией [MIT](https://choosealicense.com/licenses/mit/) — подробности в файле [LICENSE](LICENSE).
26 | 


--------------------------------------------------------------------------------
/db/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mongo:6.0.7
2 | COPY setup.sh /docker-entrypoint-initdb.d/
3 | ARG OBSERVER_MONGO_DB=observer
4 | HEALTHCHECK CMD mongosh ${OBSERVER_MONGO_DB} --eval 'db.runCommand("ping").ok' --quiet | grep 1
5 | 


--------------------------------------------------------------------------------
/db/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # create db and collections for feeds and articles
4 | DB="${OBSERVER_MONGO_DB:-observer}"
5 | CMD="db.createCollection( '${OBSERVER_MONGO_ARTICLES:-articles}' ); db.createCollection( '${OBSERVER_MONGO_FEEDS:-feeds}' );"
6 | mongosh "${DB}" --eval "${CMD}" -u "${OBSERVER_MONGO_USER:-default}" -p "${OBSERVER_MONGO_PASS:-default}" --authenticationDatabase admin
7 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | 
 3 | services:
 4 | 
 5 |   db:
 6 |     build:
 7 |       context: ./db
 8 |       dockerfile: Dockerfile
 9 |     container_name: observer-db
10 |     command: --auth
11 |     environment:
12 |       MONGO_INITDB_ROOT_USERNAME: ${OBSERVER_MONGO_USER:-default}
13 |       MONGO_INITDB_ROOT_PASSWORD: ${OBSERVER_MONGO_PASS:-default}
14 |     env_file: .env
15 |     volumes:
16 |       - "~/.habr_observer:/data/db/:rw"
17 |     restart: always
18 |     healthcheck:
19 |       interval: 10s
20 |       timeout: 10s
21 |       retries: 5
22 |       start_period: 30s
23 |     logging:
24 |       driver: "json-file"
25 |       options:
26 |         max-size: "10m"
27 |         max-file: "1"
28 | 
29 |   app:
30 |     build:
31 |       context: ./observer
32 |       dockerfile: Dockerfile
33 |       args:
34 |         SL_PORT: ${OBSERVER_SL_PORT:-8501}
35 |     container_name: observer-app
36 |     env_file: .env
37 |     ports:
38 |       - ${OBSERVER_SL_PORT:-8501}:${OBSERVER_SL_PORT:-8501}
39 |     depends_on:
40 |       db:
41 |         condition: service_healthy
42 |     restart: always
43 |     healthcheck:
44 |       interval: 10s
45 |       timeout: 10s
46 |       retries: 5
47 |       start_period: 10s
48 |     logging:
49 |       driver: "json-file"
50 |       options:
51 |         max-size: "10m"
52 |         max-file: "1"
53 | 


--------------------------------------------------------------------------------
/observer/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9-slim
 2 | WORKDIR /opt/observer
 3 | ENV PYTHONPATH "/opt/observer:$PYTHONPATH"
 4 | ENV PYTHONUNBUFFERED true
 5 | COPY requirements.txt /opt/observer/requirements.txt
 6 | RUN pip install --no-cache-dir --ignore-installed -r requirements.txt
 7 | COPY ./app/static/* /usr/local/lib/python3.9/site-packages/streamlit/static/
 8 | COPY . /opt/observer
 9 | RUN apt update && apt install -y curl \
10 |     && rm -rf /var/lib/apt/lists/*
11 | ARG SL_PORT=8501
12 | EXPOSE ${SL_PORT}
13 | ENV OBSERVER_SL_PORT=${SL_PORT}
14 | HEALTHCHECK CMD curl --fail http://localhost:${OBSERVER_SL_PORT}/_stcore/health
15 | CMD python3 run_fetcher.py & streamlit run run_app.py --server.address=0.0.0.0 --server.port=${OBSERVER_SL_PORT} --browser.gatherUsageStats=false
16 | 


--------------------------------------------------------------------------------
/observer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/__init__.py


--------------------------------------------------------------------------------
/observer/app/__init__.py:
--------------------------------------------------------------------------------
1 | from .run import run_app
2 | 


--------------------------------------------------------------------------------
/observer/app/render.py:
--------------------------------------------------------------------------------
  1 | from typing import Iterable
  2 | 
  3 | import streamlit as st
  4 | from streamlit.components.v1 import html
  5 | 
  6 | from models import Article, Feed
  7 | 
  8 | 
  9 | def render_header() -> None:
 10 |     st.markdown(
 11 |         """
 12 |         <div id='top' style='text-align: center'>
 13 |         <p id='#top'></p>
 14 |         <br>
 15 |         <h1>🧐<br>Обозреватель Хабра</h1>
 16 |         <h2>Краткий пересказ лучших статей с Хабра от нейросети YandexGPT</h2>
 17 |         </div>
 18 |         """,
 19 |         unsafe_allow_html=True,
 20 |     )
 21 | 
 22 | 
 23 | def render_toggle() -> bool:
 24 |     st.write(
 25 |         """
 26 |         <style>
 27 |         label[data-baseweb="checkbox"] {
 28 |         justify-content: center;
 29 |         align-items: center;
 30 |         }
 31 |         </style>
 32 |         """,
 33 |         unsafe_allow_html=True,
 34 |     )
 35 |     return st.toggle(
 36 |         label="Сворачивать пересказы",
 37 |         value=True,
 38 |         key="collapse_summaries",
 39 |         help="Отключите, чтобы показывать пересказы целиком, не сворачивая их",
 40 |     )
 41 | 
 42 | 
 43 | def render_theses(theses: Iterable[str]) -> None:
 44 |     st.markdown("\n".join("* " + i for i in theses))
 45 | 
 46 | 
 47 | def render_article(
 48 |     article: Article, collapse_summary: bool = True, visible_theses: int = 3
 49 | ) -> None:
 50 |     with st.container():
 51 |         st.subheader(
 52 |             article.title,
 53 |             help=f"Дата публикации: {article.pub_date}",
 54 |             anchor=False,
 55 |         )
 56 |         if collapse_summary:
 57 |             render_theses(article.summary.content[:visible_theses])
 58 |             if len(article.summary.content) > visible_theses:
 59 |                 with st.expander(label="Продолжение пересказа"):
 60 |                     render_theses(article.summary.content[visible_theses:])
 61 |         else:
 62 |             render_theses(article.summary.content)
 63 |         st.caption(
 64 |             f"""
 65 |             <div style='text-align: center'>
 66 |             <a href='{article.summary.url}' target='_blank' style='text-decoration: none; color: inherit;'>
 67 |             🤖 Ссылка на пересказ
 68 |             </a>
 69 |             &emsp;&emsp;
 70 |             <a href='{article.url}' target='_blank' style='text-decoration: none; color: inherit;'>
 71 |             📃 Открыть оригинал
 72 |             </a>
 73 |             </div>
 74 |             """,
 75 |             unsafe_allow_html=True,
 76 |         )
 77 |         st.divider()
 78 | 
 79 | 
 80 | def render_tab(
 81 |     tab: st.delta_generator.DeltaGenerator,
 82 |     articles: Iterable[Article],
 83 |     collapse_summaries: bool = True,
 84 | ) -> None:
 85 |     with tab:
 86 |         for a in articles:
 87 |             render_article(article=a, collapse_summary=collapse_summaries)
 88 | 
 89 | 
 90 | def render_tabs(feeds: Iterable[Feed], collapse_summaries: bool = True) -> None:
 91 |     st.write(
 92 |         """
 93 |         <style>
 94 |         div[data-baseweb="tab-list"] {
 95 |         justify-content: center;
 96 |         align-items: center;
 97 |         }
 98 |         </style>
 99 |         """,
100 |         unsafe_allow_html=True,
101 |     )
102 |     tabs = st.tabs([feed.name for feed in feeds])
103 |     for tab, feed in zip(tabs, feeds):
104 |         render_tab(
105 |             tab=tab, articles=feed.articles, collapse_summaries=collapse_summaries
106 |         )
107 | 
108 |     # see for an explanation of the below code:
109 |     # https://discuss.streamlit.io/t/bug-with-st-tabs-glitches-for-1-frame-while-rendering/33497/12
110 |     html(
111 |         """
112 |         <script>
113 |         function checkElements() {
114 |         
115 |             const tabs = window.parent.document.querySelectorAll('button[data-baseweb="tab"] p');
116 |             const tab_panels = window.parent.document.querySelectorAll('div[data-baseweb="tab-panel"]');
117 |         
118 |             if (tabs && tab_panels) {
119 |         
120 |                 tabs.forEach(function(tab, index) {
121 |                     const tab_panel_child = tab_panels[index].querySelectorAll("*");
122 |         
123 |                     function set_visibility(state) {
124 |                         tab_panels[index].style.visibility = state;
125 |                         tab_panel_child.forEach(function(child) {
126 |                             child.style.visibility = state;
127 |                         });
128 |                     }
129 |         
130 |                     tab.addEventListener("click", function(event) {
131 |                         set_visibility('hidden')
132 |         
133 |                         let element = tab_panels[index].querySelector('div[data-testid="stVerticalBlock"]');
134 |                         let main_block = window.parent.document.querySelector('section.main div[data-testid="stVerticalBlock"]');
135 |                         const waitMs = 1;
136 |         
137 |                         function waitForLayout() {
138 |                             if (element.offsetWidth === main_block.offsetWidth) {
139 |                                 set_visibility("visible");
140 |                             } else {
141 |                                 setTimeout(waitForLayout, waitMs);
142 |                             }
143 |                         }
144 |         
145 |                         waitForLayout();
146 |                     });
147 |                 });
148 |             } else {
149 |                 setTimeout(checkElements, 50);
150 |             }
151 |         }
152 |         
153 |         checkElements();
154 |         </script>
155 |         """,
156 |         height=0,
157 |     )
158 | 
159 | 
def render_footer() -> None:
    """Render the footer: author/GitHub links, attribution text and a back-to-top link."""
    links_html = """
        <div style='text-align: center'>
        <a href='https://pltnk.dev' target='_blank' style='text-decoration: none; color: inherit;'>
        😎 Автор pltnk.dev
        </a>
        &emsp;&emsp;
        <a href='https://github.com/pltnk/habr-observer' target='_blank' style='text-decoration: none; color: inherit;'>
        🍝 Код на GitHub
        </a>
        </div>
        """
    attribution_html = """
        <div style='text-align: center'>
        В приложении используются материалы сайта 
        <a href='https://habr.com' target='_blank' style='text-decoration: none; color: inherit;'>
        habr.com</a>, краткие пересказы которых получены с помощью сервиса
        <a href='https://300.ya.ru' target='_blank' style='text-decoration: none; color: inherit;'>
        300.ya.ru</a>.
        </div>
        """
    # Fixed-position arrow linking to the `#top` anchor.
    back_to_top_html = """
        <div style='position: fixed; bottom: 0px; right: 5px; font-size: xx-large;'>
        <a href='#top' style='text-decoration: none;'>⬆️</a>
        </div>
        """
    st.caption(links_html, unsafe_allow_html=True)
    st.caption(attribution_html, unsafe_allow_html=True)
    st.markdown(back_to_top_html, unsafe_allow_html=True)
195 | 


--------------------------------------------------------------------------------
/observer/app/run.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | import streamlit as st
 4 | 
 5 | from config import (
 6 |     OBSERVER_FEEDS,
 7 |     OBSERVER_MONGO_URI,
 8 |     OBSERVER_MONGO_DB,
 9 |     OBSERVER_MONGO_ARTICLES,
10 |     OBSERVER_MONGO_FEEDS,
11 |     OBSERVER_FEED_CACHE_TTL,
12 | )
13 | from repository import MongoAsyncRepository
14 | from .render import render_header, render_toggle, render_tabs, render_footer
15 | 
16 | 
@st.cache_resource
def get_event_loop() -> asyncio.AbstractEventLoop:
    """Create a new asyncio event loop; the result is cached by `st.cache_resource`."""
    loop = asyncio.new_event_loop()
    return loop
20 | 
21 | 
@st.cache_resource
def create_repository():
    """Build the Mongo repository from the config values, bound to the cached event loop."""
    settings = dict(
        mongo_uri=OBSERVER_MONGO_URI,
        db_name=OBSERVER_MONGO_DB,
        articles_col_name=OBSERVER_MONGO_ARTICLES,
        feeds_col_name=OBSERVER_MONGO_FEEDS,
        loop=get_event_loop(),
    )
    return MongoAsyncRepository(**settings)
31 | 
32 | 
@st.cache_data(ttl=OBSERVER_FEED_CACHE_TTL)
def get_feeds_sync():
    """Fetch the configured feeds synchronously by driving the async repository call.

    The result is cached by `st.cache_data` with a TTL of OBSERVER_FEED_CACHE_TTL.
    """
    loop = get_event_loop()
    repository = create_repository()
    feed_urls = list(OBSERVER_FEEDS.values())
    return loop.run_until_complete(repository.get_feeds(feed_urls))
38 | 
39 | 
def run_app():
    """Render the whole page: header, feed tabs (or a placeholder notice) and footer."""
    render_header()
    with st.spinner(text="Читаю статьи..."):
        feeds = get_feeds_sync()
    if not feeds:
        # Nothing to show; tell the visitor to come back later.
        st.info("Лента пересобирается, загляните позже 😉")
    else:
        collapse_summaries = render_toggle()
        render_tabs(feeds, collapse_summaries=collapse_summaries)
    render_footer()
50 | 


--------------------------------------------------------------------------------
/observer/app/static/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/cover.png


--------------------------------------------------------------------------------
/observer/app/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pltnk/habr-observer/f9c9c817c36681e6fd2f2a62e95a33d2be1e3b8a/observer/app/static/favicon.png


--------------------------------------------------------------------------------
/observer/app/static/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html lang="ru">
 3 |     <head>
 4 |         <meta charset="UTF-8"/>
 5 |         <meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no"/>
 6 |         <link rel="shortcut icon" href="./favicon.png"/>
 7 | 
 8 |         <!-- Primary Meta Tags -->
 9 |         <title>Обозреватель Хабра</title>
10 |         <meta name="title" content="Обозреватель Хабра" />
11 |         <meta name="description" content="Лента кратких пересказов лучших статей с Хабра за сутки, неделю, месяц, год и всё время от нейросети YandexGPT" />
12 | 
13 |         <!-- Open Graph / Facebook -->
14 |         <meta property="og:type" content="website" />
15 |         <meta property="og:url" content="https://habr.observer/" />
16 |         <meta property="og:title" content="Обозреватель Хабра" />
17 |         <meta property="og:description" content="Лента кратких пересказов лучших статей с Хабра за сутки, неделю, месяц, год и всё время от нейросети YandexGPT" />
18 |         <meta property="og:image" content="https://habr.observer/cover.png" />
19 | 
20 |         <!-- Twitter -->
21 |         <meta property="twitter:card" content="summary_large_image" />
22 |         <meta property="twitter:url" content="https://habr.observer/" />
23 |         <meta property="twitter:title" content="Обозреватель Хабра" />
24 |         <meta property="twitter:description" content="Лента кратких пересказов лучших статей с Хабра за сутки, неделю, месяц, год и всё время от нейросети YandexGPT" />
25 |         <meta property="twitter:image" content="https://habr.observer/cover.png" />
26 | 
27 |         <!-- Meta Tags Generated with https://metatags.io -->
28 | 
29 |         <script>window.prerenderReady=!1</script><script defer="defer" src="./static/js/main.a097c1ce.js"></script>
30 |         <link href="./static/css/main.f4a8738f.css" rel="stylesheet">
31 |     </head>
32 |     <body>
33 |         <noscript>You need to enable JavaScript to run this app.</noscript>
34 |         <div id="root"></div>
35 |     </body>
36 | </html>
37 | 


--------------------------------------------------------------------------------
/observer/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | OBSERVER_FEEDS = {
 4 |     "Сутки": "https://habr.com/ru/rss/articles/top/daily/?fl=ru",
 5 |     "Неделя": "https://habr.com/ru/rss/articles/top/weekly/?fl=ru",
 6 |     "Месяц": "https://habr.com/ru/rss/articles/top/monthly/?fl=ru",
 7 |     "Год": "https://habr.com/ru/rss/articles/top/yearly/?fl=ru",
 8 |     "Всё время": "https://habr.com/ru/rss/articles/top/alltime/?fl=ru",
 9 | }
10 | 
11 | OBSERVER_MONGO_USER = os.environ.get("OBSERVER_MONGO_USER", "default")
12 | OBSERVER_MONGO_PASS = os.environ.get("OBSERVER_MONGO_PASS", "default")
13 | OBSERVER_MONGO_DB = os.environ.get("OBSERVER_MONGO_DB", "observer")
14 | OBSERVER_MONGO_ARTICLES = os.environ.get("OBSERVER_MONGO_ARTICLES", "articles")
15 | OBSERVER_MONGO_FEEDS = os.environ.get("OBSERVER_MONGO_FEEDS", "feeds")
16 | OBSERVER_MONGO_URI = f"mongodb://{OBSERVER_MONGO_USER}:{OBSERVER_MONGO_PASS}@db"
17 | OBSERVER_AUTH_TOKEN = os.environ.get("OBSERVER_AUTH_TOKEN", "default")
18 | OBSERVER_FEED_UPDATE_TIMEOUT = int(os.environ.get("OBSERVER_FEED_UPDATE_TIMEOUT", 600))
19 | OBSERVER_FEED_CACHE_TTL = int(os.environ.get("OBSERVER_FEED_CACHE_TTL", 60))
20 | 


--------------------------------------------------------------------------------
/observer/fetcher/__init__.py:
--------------------------------------------------------------------------------
1 | from .run import update_feeds_task
2 | from .updater import FeedUpdater
3 | 


--------------------------------------------------------------------------------
/observer/fetcher/run.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from config import (
 4 |     OBSERVER_FEEDS,
 5 |     OBSERVER_MONGO_URI,
 6 |     OBSERVER_MONGO_DB,
 7 |     OBSERVER_MONGO_ARTICLES,
 8 |     OBSERVER_MONGO_FEEDS,
 9 |     OBSERVER_AUTH_TOKEN,
10 |     OBSERVER_FEED_UPDATE_TIMEOUT,
11 | )
12 | from repository import MongoAsyncRepository
13 | from .updater import FeedUpdater
14 | 
15 | 
async def update_feeds(repository: MongoAsyncRepository) -> None:
    """Update every feed from OBSERVER_FEEDS concurrently.

    All updaters receive the same lock as ``throttle_lock``. A failure of one
    feed does not abort the others; each failure is logged instead of being
    silently dropped.

    Args:
        repository: Repository handed to every ``FeedUpdater``.
    """
    import logging

    lock = asyncio.Lock()
    feed_names = list(OBSERVER_FEEDS)
    tasks = (
        FeedUpdater(
            name=name,
            url=OBSERVER_FEEDS[name],
            summary_auth_token=OBSERVER_AUTH_TOKEN,
            repository=repository,
            throttle_lock=lock,
        ).update_feed()
        for name in feed_names
    )
    # gather() returns results in the order the awaitables were passed, so
    # they can be matched back to feed names.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    logger = logging.getLogger(__name__)
    for name, result in zip(feed_names, results):
        if isinstance(result, BaseException):
            logger.error("Failed to update feed %r", name, exc_info=result)
29 | 
30 | 
async def update_feeds_task() -> None:
    """Run feed updates forever, sleeping OBSERVER_FEED_UPDATE_TIMEOUT seconds between rounds."""
    repository_settings = {
        "mongo_uri": OBSERVER_MONGO_URI,
        "db_name": OBSERVER_MONGO_DB,
        "articles_col_name": OBSERVER_MONGO_ARTICLES,
        "feeds_col_name": OBSERVER_MONGO_FEEDS,
    }
    # One repository instance is reused across all update rounds.
    repository = MongoAsyncRepository(**repository_settings)
    while True:
        await update_feeds(repository)
        await asyncio.sleep(OBSERVER_FEED_UPDATE_TIMEOUT)
41 | 


--------------------------------------------------------------------------------
/observer/fetcher/summary.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import List, Optional
 3 | 
 4 | import httpx
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | from models import Summary
 8 | 
 9 | 
async def get_summary_url(
    article_url: str,
    auth_token: str,
    lock: Optional[asyncio.Lock] = None,
    timeout: int = 4,
    is_retry: bool = False,
) -> Optional[str]:
    """Request the sharing URL of a summary for ``article_url`` from 300.ya.ru.

    Args:
        article_url: URL of the article to summarize.
        auth_token: Token sent in the ``Authorization: OAuth ...`` header.
        lock: Optional lock used for throttling; while it is held the function
            sleeps ``timeout`` seconds and then performs the request.
        timeout: Seconds to sleep before the request when ``lock`` is given.
        is_retry: True for the single retry issued after a 429 response.

    Returns:
        The ``sharing_url`` from the response, or ``None`` on a 404 response.

    Raises:
        httpx.HTTPStatusError: For any other error status (including a second 429).
    """
    if lock:
        await lock.acquire()
    try:
        if lock:
            await asyncio.sleep(timeout)
        async with httpx.AsyncClient(
            headers={"Authorization": f"OAuth {auth_token}"}, timeout=20
        ) as client:
            res = await client.post(
                "https://300.ya.ru/api/sharing-url", json={"article_url": article_url}
            )
    finally:
        # Release even when the sleep or the request raises (timeout, network
        # error, cancellation); otherwise every other task sharing the lock
        # would wait on it forever.
        if lock:
            lock.release()
    if res.status_code == 404:
        return None
    if res.status_code == 429 and not is_retry:
        # Rate limited: retry once with a doubled delay.
        return await get_summary_url(
            article_url=article_url,
            auth_token=auth_token,
            lock=lock,
            timeout=timeout * 2,
            is_retry=True,
        )
    res.raise_for_status()
    parsed = res.json()
    return parsed["sharing_url"]
41 | 
42 | 
async def get_summary_content_api(summary_url: str) -> List[str]:
    """Fetch the theses of a summary through the 300.ya.ru sharing API.

    The token sent to the API is the part of ``summary_url`` after its last "/".
    """
    _, _, token = summary_url.rpartition("/")
    async with httpx.AsyncClient(timeout=10) as client:
        response = await client.post(
            "https://300.ya.ru/api/sharing", json={"token": token}
        )
    response.raise_for_status()
    payload = response.json()
    return [thesis["content"] for thesis in payload["thesis"]]
50 | 
51 | 
async def get_summary_content_noapi(summary_url: str) -> List[str]:
    """Scrape the theses of a summary from its HTML page.

    Args:
        summary_url: URL of the summary page to download.

    Returns:
        The text of every ``<li>`` inside the first ``<ul>`` whose class starts
        with "theses", stripped of bullets and surrounding whitespace.

    Raises:
        httpx.HTTPStatusError: If the page request returns an error status.
        ValueError: If the page contains no such ``<ul>`` element.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        res = await client.get(summary_url)
    res.raise_for_status()
    parsed = BeautifulSoup(res.content, features="lxml")
    theses_list = parsed.find(
        "ul",
        attrs={"class": lambda c: isinstance(c, str) and c.startswith("theses")},
    )
    if theses_list is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError(f"No theses list found on summary page: {summary_url}")
    return [i.get_text(strip=True).strip("• \n") for i in theses_list.find_all("li")]
62 | 
63 | 
async def get_summary_content(summary_url: str) -> List[str]:
    """Return the summary theses, trying the API first and page scraping as a fallback."""
    try:
        return await get_summary_content_api(summary_url=summary_url)
    except Exception:
        # Any API failure falls back to scraping the summary page.
        return await get_summary_content_noapi(summary_url=summary_url)
70 | 
71 | 
async def get_summary(
    article_url: str, auth_token: str, lock: Optional[asyncio.Lock] = None
) -> Summary:
    """Build a Summary for the article at ``article_url``.

    When no summary URL is obtained, a placeholder Summary pointing at
    https://300.ya.ru with a single explanatory thesis is returned.
    """
    summary_url = await get_summary_url(
        article_url=article_url, auth_token=auth_token, lock=lock
    )
    if summary_url is not None:
        theses = await get_summary_content(summary_url=summary_url)
        return Summary(url=summary_url, content=theses)
    placeholder = [
        "Статья слишком длинная, нейросети пока не умеют пересказывать такие статьи 😔"
    ]
    return Summary(url="https://300.ya.ru", content=placeholder)
87 | 


--------------------------------------------------------------------------------
/observer/fetcher/updater.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import datetime
 3 | from typing import Dict, Optional, Iterable
 4 | 
 5 | import httpx
 6 | from bs4 import BeautifulSoup
 7 | from bs4.element import Tag
 8 | 
 9 | from models import Article, Feed
10 | from repository import AsyncRepository
11 | from .summary import get_summary
12 | 
# strptime pattern applied to the <pubDate> text of every feed item.
DT_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"


class FeedUpdater:
    """Refresh a single feed: download it, summarise new articles, store it.

    The public entry point is :meth:`update_feed`; the remaining methods are
    the individual steps of that pipeline and share state through instance
    attributes.
    """

    def __init__(
        self,
        name: str,
        url: str,
        summary_auth_token: str,
        repository: AsyncRepository,
        throttle_lock: Optional[asyncio.Lock] = None,
    ):
        """Store the feed settings and reset the per-update state.

        :param name: Name stored on the resulting ``Feed``.
        :param url: URL the feed is downloaded from; also the feed's id.
        :param summary_auth_token: Token passed to ``get_summary``.
        :param repository: Storage used to read and write articles and feeds.
        :param throttle_lock: Optional lock passed to ``get_summary``.
        """
        self._name = name
        self._url = url
        self._summary_auth_token = summary_auth_token
        self._repository = repository
        self._throttle_lock = throttle_lock
        # Parsed feed document, set by _get_feed_content().
        self._parsed: Optional[BeautifulSoup] = None
        # Text of every <guid> in the feed, in document order.
        self._article_urls: Optional[Iterable[str]] = None
        # Articles known so far (already stored or freshly scraped), by URL.
        self._articles_present: Dict[str, Article] = {}
        # <item> tags whose guid is not stored yet.
        self._articles_to_scrape: Optional[Iterable[Tag]] = None

    async def _get_feed_content(self) -> None:
        """Download the feed and parse it as XML.

        :raises httpx.HTTPStatusError: If the response status is an error.
        """
        async with httpx.AsyncClient(timeout=10) as client:
            res = await client.get(self._url)
        res.raise_for_status()
        self._parsed = BeautifulSoup(res.content, features="xml")

    async def _filter_articles(self) -> None:
        """Split the feed items into already stored ones and ones to scrape."""
        self._article_urls = [i.text for i in self._parsed.find_all("guid")]
        articles = await self._repository.get_articles(self._article_urls)
        for a in articles:
            self._articles_present[a.url] = a
        self._articles_to_scrape = [
            tag
            for tag in self._parsed.find_all("item")
            if tag.find("guid").text not in self._articles_present
        ]

    async def _scrape_articles(self) -> None:
        """Build an ``Article`` for every new item and store the successes.

        Items are processed concurrently. An item whose processing raised is
        dropped from this update; because it is not stored, a later update
        will see it as new again.
        """
        tasks = (self._get_article(tag=tag) for tag in self._articles_to_scrape)
        result = await asyncio.gather(*tasks, return_exceptions=True)
        # With return_exceptions=True failures come back as exception objects.
        articles = [a for a in result if isinstance(a, Article)]
        if not articles:
            # Every item failed: nothing to store, and an empty bulk insert
            # is rejected by the Mongo driver.
            return
        await self._repository.insert_articles(articles)
        for a in articles:
            self._articles_present[a.url] = a

    async def _get_article(self, tag: Tag) -> Article:
        """Turn one feed ``<item>`` into an ``Article`` with its summary.

        :param tag: The ``<item>`` element to convert.
        :return: The article, keyed by the item's guid text.
        """
        url = tag.find("guid").text
        summary = await get_summary(
            article_url=url,
            auth_token=self._summary_auth_token,
            lock=self._throttle_lock,
        )
        # An empty title is replaced by a fixed placeholder.
        title = tag.find("title").text or "Без названия"
        pub_date = datetime.datetime.strptime(tag.find("pubDate").text, DT_FORMAT)
        author = tag.find("dc:creator").text
        return Article(
            _id=url, title=title, pub_date=pub_date, author=author, summary=summary
        )

    async def _insert_feed(self) -> None:
        """Store the feed with its known articles in feed order.

        URLs with no available article (their scrape failed) are left out
        rather than raising ``KeyError`` and aborting the whole update.
        """
        feed = Feed(
            _id=self._url,
            name=self._name,
            articles=[
                self._articles_present[url]
                for url in self._article_urls
                if url in self._articles_present
            ],
        )
        await self._repository.insert_feed(feed)

    async def update_feed(self) -> None:
        """Run the full update: fetch, filter, scrape new items, store."""
        await self._get_feed_content()
        await self._filter_articles()
        if self._articles_to_scrape:
            await self._scrape_articles()
        await self._insert_feed()
88 | 


--------------------------------------------------------------------------------
/observer/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .article import Article
2 | from .feed import Feed
3 | from .summary import Summary
4 | 


--------------------------------------------------------------------------------
/observer/models/article.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import datetime
 4 | from dataclasses import dataclass, asdict
 5 | 
 6 | from .summary import Summary
 7 | 
 8 | 
 9 | @dataclass
10 | class Article:
11 |     _id: str
12 |     title: str
13 |     pub_date: datetime.datetime
14 |     author: str
15 |     summary: Summary
16 | 
17 |     @property
18 |     def url(self) -> str:
19 |         return self._id
20 | 
21 |     def as_dict(self) -> dict:
22 |         return asdict(self)
23 | 
24 |     @classmethod
25 |     def from_dict(cls, d: dict) -> Article:
26 |         return Article(
27 |             _id=d["_id"],
28 |             title=d["title"],
29 |             pub_date=d["pub_date"],
30 |             author=d["author"],
31 |             summary=Summary.from_dict(d["summary"]),
32 |         )
33 | 


--------------------------------------------------------------------------------
/observer/models/feed.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from dataclasses import dataclass, asdict
 4 | from typing import List
 5 | 
 6 | from .article import Article
 7 | 
 8 | 
 9 | @dataclass
10 | class Feed:
11 |     _id: str
12 |     name: str
13 |     articles: List[Article]
14 | 
15 |     @property
16 |     def url(self):
17 |         return self._id
18 | 
19 |     def as_dict(self) -> dict:
20 |         return asdict(self)
21 | 
22 |     @classmethod
23 |     def from_dict(cls, d: dict) -> Feed:
24 |         return Feed(
25 |             _id=d["_id"],
26 |             name=d["name"],
27 |             articles=[Article.from_dict(a) for a in d["articles"]],
28 |         )
29 | 


--------------------------------------------------------------------------------
/observer/models/summary.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from dataclasses import dataclass, asdict
 4 | from typing import List
 5 | 
 6 | 
 7 | @dataclass
 8 | class Summary:
 9 |     url: str
10 |     content: List[str]
11 | 
12 |     def as_dict(self) -> dict:
13 |         return asdict(self)
14 | 
15 |     @classmethod
16 |     def from_dict(cls, d: dict) -> Summary:
17 |         return Summary(url=d["url"], content=d["content"])
18 | 


--------------------------------------------------------------------------------
/observer/repository/__init__.py:
--------------------------------------------------------------------------------
1 | from .interface import AsyncRepository
2 | from .mongo import MongoAsyncRepository
3 | 


--------------------------------------------------------------------------------
/observer/repository/interface.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Iterable
 3 | 
 4 | from models import Article, Feed
 5 | 
 6 | 
 7 | class AsyncRepository(ABC):
 8 |     @abstractmethod
 9 |     async def get_articles(self, urls: Iterable[str]) -> Iterable[Article]:
10 |         raise NotImplementedError
11 | 
12 |     @abstractmethod
13 |     async def insert_articles(self, articles: Iterable[Article]) -> None:
14 |         raise NotImplementedError
15 | 
16 |     @abstractmethod
17 |     async def insert_feed(self, feed: Feed) -> None:
18 |         raise NotImplementedError
19 | 
20 |     @abstractmethod
21 |     async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]:
22 |         raise NotImplementedError
23 | 


--------------------------------------------------------------------------------
/observer/repository/mongo.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import Iterable, Optional
 3 | 
 4 | from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection
 5 | 
 6 | from models import Article, Feed
 7 | from .interface import AsyncRepository
 8 | 
 9 | 
class MongoAsyncRepository(AsyncRepository):
    """``AsyncRepository`` backed by MongoDB through the Motor driver."""

    def __init__(
        self,
        mongo_uri: str,
        db_name: str,
        articles_col_name: str,
        feeds_col_name: str,
        loop: Optional[asyncio.AbstractEventLoop] = None,
    ):
        """Create the client and resolve the two collections.

        :param mongo_uri: Connection string passed to the Motor client.
        :param db_name: Name of the database holding both collections.
        :param articles_col_name: Name of the articles collection.
        :param feeds_col_name: Name of the feeds collection.
        :param loop: Optional event loop to bind the Motor client to.
        """
        if loop:
            self._client = AsyncIOMotorClient(host=mongo_uri, io_loop=loop)
        else:
            self._client = AsyncIOMotorClient(host=mongo_uri)
        self._db = self._client[db_name]
        self._articles: AsyncIOMotorCollection = self._db[articles_col_name]
        self._feeds: AsyncIOMotorCollection = self._db[feeds_col_name]

    async def get_articles(self, ids: Iterable[str]) -> Iterable[Article]:
        """Return the stored articles whose ``_id`` is in ``ids``.

        :param ids: Article ids; any iterable is accepted.
        :return: A list of matching articles.
        """
        # The query must embed a real list: generators and sets cannot be
        # BSON-encoded.
        id_list = list(ids)
        cursor = self._articles.find({"_id": {"$in": id_list}})
        return [Article.from_dict(d) async for d in cursor]

    async def insert_articles(self, articles: Iterable[Article]) -> None:
        """Insert ``articles`` in one bulk write; a no-op when empty.

        :param articles: Articles to store; any iterable is accepted.
        """
        docs = [a.as_dict() for a in articles]
        if not docs:
            # insert_many rejects an empty document list.
            return
        await self._articles.insert_many(docs)

    async def insert_feed(self, feed: Feed) -> None:
        """Create the feed document, or overwrite its fields if it exists."""
        await self._feeds.update_one(
            {"_id": feed.url}, {"$set": feed.as_dict()}, upsert=True
        )

    async def get_feeds(self, ids: Iterable[str]) -> Iterable[Feed]:
        """Return the stored feeds whose ``_id`` is in ``ids``.

        The result is ordered by the position of each id within ``ids``.

        :param ids: Feed ids; any iterable is accepted.
        :return: A list of matching feeds.
        """
        # Materialise once: the same list feeds both $in and $indexOfArray.
        id_list = list(ids)
        pipeline = [
            {"$match": {"_id": {"$in": id_list}}},
            {"$addFields": {"__order": {"$indexOfArray": [id_list, "$_id"]}}},
            {"$sort": {"__order": 1}},
        ]
        res = self._feeds.aggregate(pipeline)
        return [Feed.from_dict(d) async for d in res]
49 | 


--------------------------------------------------------------------------------
/observer/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.12.2
2 | httpx==0.25.0
3 | lxml==4.9.3
4 | motor==3.3.1
5 | streamlit==1.26.0
6 | 


--------------------------------------------------------------------------------
/observer/run_app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | 
 3 | from app import run_app
 4 | 
 5 | st.set_page_config(
 6 |     page_title="habr.observer",
 7 |     page_icon="🧐",
 8 |     menu_items={
 9 |         "Get help": None,
10 |         "Report a Bug": "https://github.com/pltnk/habr-observer/issues",
11 |         "About": "Author: [pltnk.dev](https://pltnk.dev) ✧˚₊‧⋆‧ "
12 |         "Source: [habr-observer](https://github.com/pltnk/habr-observer)",
13 |     },
14 | )
15 | 
16 | 
17 | if __name__ == "__main__":
18 |     run_app()
19 | 


--------------------------------------------------------------------------------
/observer/run_fetcher.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | from fetcher import update_feeds_task
4 | 
def main() -> None:
    """Run ``update_feeds_task()`` to completion via ``asyncio.run``."""
    asyncio.run(update_feeds_task())


if __name__ == "__main__":
    main()
7 | 


--------------------------------------------------------------------------------