├── Procfile ├── .gitignore ├── .env ├── app ├── blueprints │ ├── __ init__.py │ ├── main.py │ └── spider.py ├── __init__.py ├── templates │ └── rss.xml └── config.py ├── requirements.txt ├── wsgi.py ├── log.conf └── README.md /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn wsgi:app -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | __pycache__/ 3 | 4 | log/log\.txt 5 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | FLASK_ENV=production 2 | EXPIRE=10 3 | TITLE=NUAARSS 4 | ADMIN_NAME=NUAA 5 | PER=1 6 | URL=https://blogroll.a2os.club 7 | PROCESSES=5 -------------------------------------------------------------------------------- /app/blueprints/__ init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: LogicJake 3 | # @Date: 2019-02-15 19:39:29 4 | # @Last Modified time: 2019-02-15 20:04:29 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==1.0.2 2 | feedparser==5.2.1 3 | requests==2.21.0 4 | Werkzeug==0.14.1 5 | python-dotenv==0.10.1 6 | func_timeout==4.3.0 7 | gunicorn==19.9.0 -------------------------------------------------------------------------------- /wsgi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: LogicJake 3 | # @Date: 2019-02-15 19:53:12 4 | # @Last Modified time: 2019-02-15 23:15:25 5 | import os 6 | from app import create_app 7 | from dotenv import load_dotenv 8 | 9 | dotenv_path = os.path.join(os.path.dirname(__file__), '.env') 10 | if 
os.path.exists(dotenv_path): 11 | load_dotenv(dotenv_path) 12 | 13 | app = create_app(os.getenv("FLASK_ENV")) 14 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: LogicJake 3 | # @Date: 2019-02-15 19:33:23 4 | # @Last Modified time: 2019-04-02 15:25:00 5 | from flask import Flask 6 | from app.config import config 7 | from app.blueprints.main import bp as main_bp 8 | 9 | 10 | def after_request(resp): 11 | resp.headers['Access-Control-Allow-Origin'] = '*' 12 | return resp 13 | 14 | 15 | def create_app(config_name): 16 | app = Flask(__name__) 17 | app.config.from_object(config[config_name]) 18 | 19 | app.register_blueprint(main_bp) 20 | app.after_request(after_request) 21 | return app 22 | -------------------------------------------------------------------------------- /log.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=filert 7 | 8 | ############################################### 9 | 10 | [handlers] 11 | keys=filert,stream 12 | 13 | [handler_stream] 14 | class=StreamHandler 15 | level=INFO 16 | formatter=form 17 | args=(sys.stdout,) 18 | 19 | 20 | [handler_filert] 21 | class=handlers.RotatingFileHandler 22 | level=INFO 23 | formatter=form 24 | args=('log/log.txt', 'a', 10*1024*1024, 5) 25 | 26 | ############################################### 27 | 28 | [formatters] 29 | keys=form 30 | 31 | [formatter_form] 32 | format= %(asctime)s [%(filename)s:%(lineno)d][%(levelname)s]: %(message)s -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 预览地址: https://nuaa-rss.herokuapp.com/ 2 | ## 流程: 3 | - 从 https://blogroll.a2os.club 获取所有rss地址 4 | - 
遍历解析每一条rss,取每个rss的第一个item 5 | - 对所有item根据pubDate排序,扔进rss模块 6 | 7 | ## 应用配置 8 | 修改 .env 文件 9 | 10 | * FLASK_ENV:flask环境,决定项目启动时的模式 11 | * EXPIRE:缓存过期时间 12 | * TITLE:rss模板的title字段 13 | * ADMIN_NAME:rss模板的generator字段 14 | * PER:每个人的rss取多少条item 15 | * URL:博客地址 16 | * PROCESSES:进程数 17 | 18 | ## 安装运行 19 | ``` 20 | pip install -r requirements.txt 21 | flask run -h 0.0.0.0 22 | ``` 23 | 访问 `127.0.0.1:5000` 即可 24 | 25 | ## 其他 26 | 采用多进程解析rss,但由于feedparser的原因,解析rss的时候可能会卡住,所以设置了每条rss解析的超时时间(30s),防止整个rss的生成卡住。 27 | 采用了简单的内存缓存,缓存过期时间为10min,缓存过期后会重新抓取生成,需一定的时间。 28 | -------------------------------------------------------------------------------- /app/templates/rss.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <![CDATA[{{title|safe}}]]> 5 | {{link}} 6 | {{generator}} 7 | {{ lastBuildDate }} 8 | {{ ttl }} 9 | {% for item in items %} 10 | <![CDATA[{{item.title|safe}}]]> 11 | 12 | {% if item.pubDate %}{{ item.pubDate }}{% endif %} 13 | {% if item.author %}{% endif %} 14 | {{ item.link }} 15 | {{ item.link }} 16 | {% endfor %} 17 | 18 | 19 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: LogicJake 3 | # @Date: 2019-02-15 19:35:17 4 | # @Last Modified time: 2019-02-15 23:07:34 5 | import os 6 | import sys 7 | import logging 8 | import logging.config 9 | import json 10 | import os 11 | 12 | 13 | os.makedirs('log', exist_ok=True) 14 | logging.config.fileConfig('log.conf') 15 | logger = logging.getLogger() 16 | logger.info('Finish loading config') 17 | 18 | 19 | basedir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) 20 | 21 | 22 | class BaseConfig: 23 | SITE_NAME = os.getenv("SITE_NAME") 24 | 25 | 26 | class DevelopmentConfig(BaseConfig): 27 | DEBUG = True 28 | 29 | 30 | class TestingConfig(BaseConfig): 31 | TESTING = True 32 | 33 | 34 | class 
ProductionConfig(BaseConfig): 35 | pass 36 | 37 | 38 | config = { 39 | 'development': DevelopmentConfig, 40 | 'testing': TestingConfig, 41 | 'production': ProductionConfig 42 | } 43 | -------------------------------------------------------------------------------- /app/blueprints/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: LogicJake 3 | # @Date: 2019-02-15 20:04:12 4 | # @Last Modified time: 2019-04-02 15:37:29 5 | from flask import Blueprint, render_template, request, Response 6 | import json 7 | bp = Blueprint('main', __name__) 8 | 9 | 10 | @bp.app_template_global() 11 | def filter_content(ctx): 12 | include_title = request.args.get('include_title') 13 | include_description = request.args.get('include_description') 14 | exclude_title = request.args.get('exclude_title') 15 | exclude_description = request.args.get('exclude_description') 16 | limit = request.args.get('limit', type=int) 17 | items = ctx['items'].copy() 18 | items = [item for item in items if include_title in item[ 19 | 'title']] if include_title else items 20 | items = [item for item in items if include_description in item[ 21 | 'description']] if include_description else items 22 | items = [item for item in items if exclude_title not in item[ 23 | 'title']] if exclude_title else items 24 | items = [item for item in items if exclude_description not in item[ 25 | 'description']] if exclude_description else items 26 | items = items[:limit] if limit else items 27 | ctx = ctx.copy() 28 | ctx['items'] = items 29 | return ctx 30 | 31 | 32 | @bp.route('/json') 33 | def data(): 34 | from .spider import ctx 35 | return Response(json.dumps(ctx()), mimetype='application/json') 36 | 37 | 38 | @bp.route('/') 39 | def index(): 40 | from .spider import ctx 41 | return render_template('rss.xml', **filter_content(ctx())), 200, {'Content-Type': 'text/xml; charset=utf-8'} 42 | 
# /app/blueprints/spider.py
# -*- coding: utf-8 -*-
# @Author: LogicJake
# @Date: 2019-02-15 20:12:43
# @Last Modified time: 2019-02-16 12:43:53
"""Build the aggregated feed context.

The page at ``URL`` is scraped for (author, feed URL) pairs, every feed is
parsed in a worker process, the newest ``PER`` entries of each feed are
collected, and the merged result is kept in an in-memory cache for
``EXPIRE`` minutes.
"""
import multiprocessing
import os
import re
import time

import feedparser
import func_timeout
import requests
from flask import Flask
from func_timeout import func_set_timeout
from werkzeug.contrib.cache import SimpleCache

from app.config import logger

cache = SimpleCache()
# Settings are read once at import time. ``int(None)`` raises TypeError, so a
# missing PER or EXPIRE prevents this module from being imported at all.
PER = int(os.getenv("PER"))
EXPIRE = int(os.getenv("EXPIRE"))
URL = os.getenv("URL")
TITLE = os.getenv("TITLE")
ADMIN_NAME = os.getenv("ADMIN_NAME")

# Upper bound (seconds) for fetching the blogroll page; without it a stalled
# connection would block the request that triggered the refresh forever.
_HTTP_TIMEOUT = 30
_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def get_rss_list():
    """Return ``[author, rss_url]`` pairs scraped from the page at ``URL``.

    The regex matches are consumed in groups of three: the first match of a
    group carries the author name between ``>`` and ``<``, the third one is
    the feed URL. Groups whose third match is ``'-'`` are skipped.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    html = requests.get(URL, timeout=_HTTP_TIMEOUT)

    # NOTE(review): this pattern looks like it lost its surrounding HTML
    # delimiters somewhere upstream -- as written it only yields empty
    # matches. Confirm against the page markup before relying on it.
    cells = re.findall('(.*?)', html.text)
    rss_list = []
    skipped = 0
    # Stepping by three and stopping at ``len - 2`` means a trailing partial
    # group can no longer raise IndexError on ``cells[start + 2]``.
    for start in range(0, len(cells) - 2, 3):
        rss_url = cells[start + 2]
        if rss_url == '-':
            continue
        name = re.search('>(.*?)<', cells[start])
        if name is None:
            # No author markup in this group: skip it instead of crashing on
            # ``None.group``.
            skipped += 1
            continue
        rss_list.append([name.group(1), rss_url])
    if skipped:
        logger.warning('skipped %d blogroll rows without author markup',
                       skipped)
    return rss_list


@func_set_timeout(30)
def parse_rss(author, rss_url):
    """Parse one feed and return up to ``PER`` item dicts.

    Each item has the keys ``author``, ``title``, ``description``, ``link``
    and ``pubDate``. ``description`` prefers the entry content, then its
    summary, then its title. ``pubDate`` is a ``%Y-%m-%d %H:%M:%S`` string,
    or ``None`` when the entry carries no parsed date.

    Raises:
        func_timeout.exceptions.FunctionTimedOut: after 30 seconds.
    """
    feeds = feedparser.parse(rss_url)
    items = []

    for post in feeds.entries[:PER]:
        title = post.get('title', '')
        if post.get('content'):
            description = post.content[0].value
        elif 'summary' in post:
            description = post.summary
        else:
            description = title
        # Entries without an update date would otherwise hand ``None`` to
        # time.strftime; fall back to the publication date when there is one.
        stamp = post.get('updated_parsed') or post.get('published_parsed')
        items.append({
            'author': author,
            'title': title,
            'description': description,
            'link': post.get('link', ''),
            'pubDate': time.strftime(_DATE_FORMAT, stamp) if stamp else None,
        })

    return items


def time_limit_parse(author, rss_url):
    """Run ``parse_rss`` and return its items, or ``None`` on any failure.

    This is the function executed in the worker processes, so it must not
    let an exception escape: an error raised here would resurface from
    ``AsyncResult.get()`` and abort the aggregation of every other feed.
    """
    try:
        items = parse_rss(author, rss_url)
    except func_timeout.exceptions.FunctionTimedOut as e:
        logger.error(rss_url)
        logger.error(e)
        return None
    except Exception:
        # One malformed feed should only cost its own items.
        logger.exception('failed to parse %s', rss_url)
        return None
    logger.info(rss_url + ' over')
    return items


def generate_all():
    """Parse every listed feed in parallel and return all items, newest first.

    The pool size comes from the ``PROCESSES`` environment variable. Items
    without a ``pubDate`` sort after all dated items.
    """
    rss_list = get_rss_list()

    items = []
    # The context manager terminates the pool on exit, so worker processes
    # are reclaimed even if scheduling or collecting results raises.
    with multiprocessing.Pool(int(os.getenv('PROCESSES'))) as pool:
        pending = [
            pool.apply_async(time_limit_parse, (author, rss_url))
            for author, rss_url in rss_list
        ]
        pool.close()
        pool.join()

        for result in pending:
            parsed = result.get()
            if parsed is not None:
                items += parsed

    # ``None`` cannot be compared with ``str``; treat a missing date as the
    # empty string so undated items land last under ``reverse=True``.
    items.sort(key=lambda item: item['pubDate'] or '', reverse=True)
    return items


def ctx():
    """Return the template context, rebuilding it when the cache is empty.

    The context is stored under the key ``'content'`` for ``EXPIRE * 60``
    seconds; on a miss all feeds are fetched again before returning.
    """
    content = cache.get('content')
    if content is None:
        logger.info('not hit cache')
        items = generate_all()
        content = {
            'items': items,
            'link': URL,
            'title': TITLE,
            'generator': ADMIN_NAME,
            'lastBuildDate': time.strftime(_DATE_FORMAT, time.localtime()),
            # NOTE(review): RSS 2.0 defines <ttl> in minutes, while this
            # value is the cache lifetime in seconds -- confirm what feed
            # consumers expect before changing it.
            'ttl': EXPIRE * 60
        }
        cache.set('content', content, timeout=EXPIRE * 60)
    return content