├── Procfile
├── .gitignore
├── .env
├── app
│   ├── blueprints
│   │   ├── __ init__.py
│   │   ├── main.py
│   │   └── spider.py
│   ├── __init__.py
│   ├── templates
│   │   └── rss.xml
│   └── config.py
├── requirements.txt
├── wsgi.py
├── log.conf
└── README.md
/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn wsgi:app
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__/
3 |
4 | log/log\.txt
5 |
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | FLASK_ENV=production
2 | EXPIRE=10
3 | TITLE=NUAARSS
4 | ADMIN_NAME=NUAA
5 | PER=1
6 | URL=https://blogroll.a2os.club
7 | PROCESSES=5
--------------------------------------------------------------------------------
/app/blueprints/__ init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 19:39:29
4 | # @Last Modified time: 2019-02-15 20:04:29
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==1.0.2
2 | feedparser==5.2.1
3 | requests==2.21.0
4 | Werkzeug==0.14.1
5 | python-dotenv==0.10.1
6 | func_timeout==4.3.0
7 | gunicorn==19.9.0
--------------------------------------------------------------------------------
/wsgi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 19:53:12
4 | # @Last Modified time: 2019-02-15 23:15:25
5 | import os
6 | from app import create_app
7 | from dotenv import load_dotenv
8 |
# Load settings from the .env file that sits next to this module (when it
# exists) before the application is built, so os.getenv() can see them.
dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)

# Fall back to 'production' when FLASK_ENV is unset or empty: create_app()
# uses the name as a key into the config mapping, so None would raise KeyError.
app = create_app(os.getenv("FLASK_ENV") or "production")
14 |
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 19:33:23
4 | # @Last Modified time: 2019-04-02 15:25:00
5 | from flask import Flask
6 | from app.config import config
7 | from app.blueprints.main import bp as main_bp
8 |
9 |
def after_request(resp):
    """Add a permissive CORS header to the outgoing response and return it."""
    cors_header, any_origin = 'Access-Control-Allow-Origin', '*'
    resp.headers[cors_header] = any_origin
    return resp
13 |
14 |
def create_app(config_name):
    """Build and return the Flask application.

    config_name: key into the ``config`` mapping that selects the settings
    class to load.
    """
    flask_app = Flask(__name__)
    settings = config[config_name]
    flask_app.config.from_object(settings)

    # Routes come from the main blueprint; every response then goes through
    # after_request().
    flask_app.register_blueprint(main_bp)
    flask_app.after_request(after_request)
    return flask_app
22 |
--------------------------------------------------------------------------------
/log.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root
3 |
4 | [logger_root]
5 | level=INFO
6 | handlers=filert
7 |
8 | ###############################################
9 |
10 | [handlers]
11 | keys=filert,stream
12 |
13 | [handler_stream]
14 | class=StreamHandler
15 | level=INFO
16 | formatter=form
17 | args=(sys.stdout,)
18 |
19 |
20 | [handler_filert]
21 | class=handlers.RotatingFileHandler
22 | level=INFO
23 | formatter=form
24 | args=('log/log.txt', 'a', 10*1024*1024, 5)
25 |
26 | ###############################################
27 |
28 | [formatters]
29 | keys=form
30 |
31 | [formatter_form]
32 | format= %(asctime)s [%(filename)s:%(lineno)d][%(levelname)s]: %(message)s
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 预览地址: https://nuaa-rss.herokuapp.com/
2 | ## 流程:
3 | - 从 https://blogroll.a2os.club 获取所有rss地址
4 | - 遍历解析每一条rss,取每个rss的前 PER 个item(PER 在 .env 中配置,当前为 1)
5 | - 对所有item根据pubDate排序,扔进rss模块
6 |
7 | ## 应用配置
8 | 修改 .env 文件
9 |
10 | * FLASK_ENV:flask环境,决定项目启动时的模式
11 | * EXPIRE:缓存过期时间
12 | * TITLE:rss模板的title字段
13 | * ADMIN_NAME:rss模板的generator字段
14 | * PER:每个人的rss取多少条item
15 | * URL:博客地址
16 | * PROCESSES:进程数
17 |
18 | ## 安装运行
19 | ```
20 | pip install -r requirements.txt
21 | flask run -h 0.0.0.0
22 | ```
23 | 访问 `127.0.0.1:5000` 即可
24 |
25 | ## 其他
26 | 采用多进程解析rss,但由于feedparser的原因,解析rss的时候可能会卡住,所以设置了每条rss解析的超时时间(30s),防止整个rss的生成卡住。
27 | 采用了简单的内存缓存,缓存过期时间为10min,缓存过期后会重新抓取生成,需一定的时间。
28 |
--------------------------------------------------------------------------------
/app/templates/rss.xml:
--------------------------------------------------------------------------------
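1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <rss version="2.0">
3 | <channel>
4 | <title>{{ title }}</title>
5 | <link>{{link}}</link>
6 | <generator>{{generator}}</generator>
7 | <lastBuildDate>{{ lastBuildDate }}</lastBuildDate>
8 | <ttl>{{ ttl }}</ttl>
9 | {% for item in items %}<item>
10 | <title>{{ item.title }}</title>
11 | <description>{{ item.description }}</description>
12 | {% if item.pubDate %}<pubDate>{{ item.pubDate }}</pubDate>{% endif %}
13 | {% if item.author %}<author>{{ item.author }}</author>{% endif %}
14 | <guid>{{ item.link }}</guid>
15 | <link>{{ item.link }}</link>
16 | </item>
{% endfor %}
17 | </channel>
18 | </rss>
19 |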
--------------------------------------------------------------------------------
/app/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 19:35:17
4 | # @Last Modified time: 2019-02-15 23:07:34
5 | import os
6 | import sys
7 | import logging
8 | import logging.config
9 | import json
10 | import os
11 |
12 |
# Import-time logging setup. The order matters: the 'log' directory has to
# exist before fileConfig() runs. Both paths are relative, so they resolve
# against the current working directory of the process.
os.makedirs('log', exist_ok=True)
logging.config.fileConfig('log.conf')
# Root logger; other modules import it from this module.
logger = logging.getLogger()
logger.info('Finish loading config')


# Absolute path of the directory one level above this file's directory.
basedir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
20 |
21 |
class BaseConfig:
    """Settings shared by every environment."""

    # Read once, when the class body is executed; None if the variable is unset.
    SITE_NAME = os.environ.get("SITE_NAME")
24 |
25 |
class DevelopmentConfig(BaseConfig):
    """Settings for the 'development' environment: BaseConfig plus DEBUG."""
    DEBUG = True
28 |
29 |
class TestingConfig(BaseConfig):
    """Settings for the 'testing' environment: BaseConfig plus TESTING."""
    TESTING = True
32 |
33 |
class ProductionConfig(BaseConfig):
    """Settings for the 'production' environment; adds nothing to BaseConfig."""
    pass
36 |
37 |
# Maps an environment name to the settings class loaded for it.
config = dict(
    development=DevelopmentConfig,
    testing=TestingConfig,
    production=ProductionConfig,
)
43 |
--------------------------------------------------------------------------------
/app/blueprints/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 20:04:12
4 | # @Last Modified time: 2019-04-02 15:37:29
5 | from flask import Blueprint, render_template, request, Response
6 | import json
7 | bp = Blueprint('main', __name__)
8 |
9 |
@bp.app_template_global()
def filter_content(ctx):
    """Return a copy of ``ctx`` whose items are filtered by the query string.

    Query parameters read from the current request:

    * ``include_title`` / ``include_description`` -- keep only items whose
      title / description contains the given text.
    * ``exclude_title`` / ``exclude_description`` -- drop items whose
      title / description contains the given text.
    * ``limit`` -- keep at most this many items, applied after filtering.
      Missing, non-integer, zero or negative values mean "no limit".

    ``ctx`` and its ``items`` list are not modified; a shallow copy of the
    dict with a new ``items`` list is returned.
    """
    args = request.args
    include_title = args.get('include_title')
    include_description = args.get('include_description')
    exclude_title = args.get('exclude_title')
    exclude_description = args.get('exclude_description')
    limit = args.get('limit', type=int)

    def keep(item):
        # A field is only looked up when a filter for it was supplied.
        if include_title and include_title not in item['title']:
            return False
        if include_description and include_description not in item['description']:
            return False
        if exclude_title and exclude_title in item['title']:
            return False
        if exclude_description and exclude_description in item['description']:
            return False
        return True

    items = [item for item in ctx['items'] if keep(item)]
    # Only a positive limit truncates. A negative value used to be applied as
    # a slice bound and silently dropped items from the end of the list.
    if limit is not None and limit > 0:
        items = items[:limit]

    filtered = ctx.copy()
    filtered['items'] = items
    return filtered
30 |
31 |
@bp.route('/json')
def data():
    """Serve the aggregated feed context as a JSON document."""
    # Imported at request time rather than at module import.
    from .spider import ctx
    payload = json.dumps(ctx())
    return Response(payload, mimetype='application/json')
36 |
37 |
@bp.route('/')
def index():
    """Render the aggregated feed, filtered by the query string, as RSS XML."""
    # Imported at request time rather than at module import.
    from .spider import ctx
    context = filter_content(ctx())
    body = render_template('rss.xml', **context)
    headers = {'Content-Type': 'text/xml; charset=utf-8'}
    return body, 200, headers
42 |
--------------------------------------------------------------------------------
/app/blueprints/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: LogicJake
3 | # @Date: 2019-02-15 20:12:43
4 | # @Last Modified time: 2019-02-16 12:43:53
5 | import requests
6 | import re
7 | import feedparser
8 | import time
9 | from flask import Flask
10 | from werkzeug.contrib.cache import SimpleCache
11 | import os
12 | from app.config import logger
13 | from func_timeout import func_set_timeout
14 | import func_timeout
15 | import multiprocessing
16 |
# In-memory cache holding the generated feed context.
cache = SimpleCache()
# Settings are read from the environment when this module is first imported.
# int() raises TypeError if PER or EXPIRE is not set.
PER = int(os.getenv("PER"))        # max number of entries taken from each feed
EXPIRE = int(os.getenv("EXPIRE"))  # multiplied by 60 for the cache timeout
URL = os.getenv("URL")             # page the feed list is scraped from
TITLE = os.getenv("TITLE")         # 'title' value of the feed context
ADMIN_NAME = os.getenv("ADMIN_NAME")  # 'generator' value of the feed context
23 |
24 |
def get_rss_list():
    """Fetch the page at ``URL`` and return a list of ``[author, rss_url]``.

    The matched cells are consumed in groups of three: the first cell of a
    group carries the author (the text between the first ``>`` and the next
    ``<``) and the third carries the feed URL. Groups whose feed cell is
    ``'-'`` are skipped.
    """
    # Without a timeout an unresponsive server would block the request forever.
    html = requests.get(URL, timeout=30)

    # NOTE(review): the cell pattern was garbled in the listing this review
    # was made from (its markup was stripped); '<td>(.*?)</td>' is a
    # reconstruction -- confirm against the original file / page markup.
    cells = re.findall('<td>(.*?)</td>', html.text)

    rss_list = []
    # Visit complete groups only, so a trailing partial group cannot raise
    # IndexError.
    for start in range(0, len(cells) - 2, 3):
        author_cell = cells[start]
        rss_url = cells[start + 2]
        if rss_url == '-':
            continue
        match = re.search('>(.*?)<', author_cell)
        # A cell that is not wrapped in a tag is used verbatim instead of
        # failing on ``None.group``.
        author = match.group(1) if match else author_cell
        rss_list.append([author, rss_url])
    return rss_list
36 |
37 |
@func_set_timeout(30)
def parse_rss(author, rss_url):
    """Parse one feed and return its first ``PER`` entries as dicts.

    Each dict has the keys ``author``, ``title``, ``description``, ``link``
    and ``pubDate`` (formatted ``%Y-%m-%d %H:%M:%S``). The description is the
    entry's content if present, else its summary, else its title. Fields a
    feed does not provide become empty strings instead of raising, so one
    incomplete entry does not discard the whole feed.

    The decorator aborts the call after 30 seconds.
    """
    feeds = feedparser.parse(rss_url)
    items = []

    for single_post in feeds.entries[:PER]:
        title = single_post.get('title', '')

        if 'content' in single_post:
            description = single_post.content[0].value
        elif 'summary' in single_post:
            description = single_post.summary
        else:
            description = title

        # Not every entry carries an update time; fall back to the publish
        # time, and to '' (still a string, so sorting keeps working) when
        # neither is available.
        parsed_date = (single_post.get('updated_parsed')
                       or single_post.get('published_parsed'))
        if parsed_date:
            pub_date = time.strftime("%Y-%m-%d %H:%M:%S", parsed_date)
        else:
            pub_date = ''

        items.append({
            'author': author,
            'title': title,
            'description': description,
            'link': single_post.get('link', ''),
            'pubDate': pub_date,
        })

    return items
60 |
61 |
def time_limit_parse(author, rss_url):
    """Parse one feed on a best-effort basis.

    Returns the list produced by ``parse_rss``, or ``None`` when parsing
    timed out or failed for any other reason. Failures are logged rather
    than raised so that a single broken feed cannot abort the aggregation
    of all the others.
    """
    try:
        items = parse_rss(author, rss_url)
    # Kept as its own clause so the timeout is caught whether or not the
    # class derives from Exception.
    except func_timeout.exceptions.FunctionTimedOut as e:
        logger.error(rss_url)
        logger.error(e)
        return None
    except Exception:
        # Logs the URL together with the traceback.
        logger.exception(rss_url)
        return None

    logger.info(rss_url + ' over')
    return items
71 |
72 |
def generate_all():
    """Parse every feed from ``get_rss_list()`` and return all items.

    Feeds are parsed in a pool of ``PROCESSES`` worker processes. Items from
    all feeds are merged and sorted by ``pubDate``, newest first. A feed whose
    worker returned ``None`` or raised is skipped.
    """
    rss_list = get_rss_list()

    pool = multiprocessing.Pool(int(os.getenv('PROCESSES')))
    try:
        results = [
            pool.apply_async(time_limit_parse, (author, rss_url))
            for author, rss_url in rss_list
        ]
        pool.close()
        pool.join()
    finally:
        # No-op after a normal close()/join(); releases the workers if
        # scheduling failed part-way through.
        pool.terminate()

    items = []
    for res in results:
        try:
            feed_items = res.get()
        except Exception:
            # get() re-raises whatever the worker raised; skip that feed
            # instead of failing the whole aggregation.
            logger.exception('feed worker failed')
            continue
        if feed_items is not None:
            items += feed_items

    items.sort(key=lambda item: item['pubDate'], reverse=True)
    return items
92 |
93 |
def ctx():
    """Return the feed context dict, rebuilding it when the cache is empty.

    The dict has the keys ``items``, ``link``, ``title``, ``generator``,
    ``lastBuildDate`` and ``ttl``. A freshly built dict is cached for
    ``EXPIRE * 60`` seconds; callers receive the cached object itself and
    must copy it before modifying it.
    """
    content = cache.get('content')
    if content is not None:
        return content

    logger.info('not hit cache')
    content = {
        'items': generate_all(),
        'link': URL,
        'title': TITLE,
        'generator': ADMIN_NAME,
        'lastBuildDate': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        # RSS <ttl> is expressed in minutes. EXPIRE is already in minutes
        # (it is multiplied by 60 for the cache timeout below), so it is
        # emitted as-is instead of EXPIRE * 60.
        'ttl': EXPIRE,
    }
    cache.set('content', content, timeout=EXPIRE * 60)
    return content
109 |
--------------------------------------------------------------------------------