├── tests ├── __init__.py └── test_sinanews.py ├── rlnews ├── utils │ ├── __init__.py │ ├── disk_cache.py │ └── downloader.py ├── __version__.py ├── __init__.py ├── sina_constants.py └── sinanews.py ├── LICENSE ├── README.md ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /rlnews/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /rlnews/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __version__ = "0.0.1" 4 | -------------------------------------------------------------------------------- /rlnews/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from rlnews.utils import downloader 4 | from rlnews.utils import disk_cache 5 | from rlnews import sinanews 6 | -------------------------------------------------------------------------------- /rlnews/sina_constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | template_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid={}&k=&num={}&page={}&r={}' 4 | 5 | lid2classification = { 6 | "2509": "全部", 7 | "2510": "国内", 8 | "2511": "国际", 9 | "2669": "社会", 10 | "2512": "体育", 11 | "2513": "娱乐", 12 | "2514": "军事", 13 | "2515": "科技", 14 | "2516": "财经", 15 | "2517": "股市", 16 | "2518": "美股", 17 | "2968": "国内_国际", 18 | "2970": "国内_社会", 19 | "2972": "国际_社会", 20 | "2974": "国内国际社会" 21 | } 22 | classification2lid = dict((v, k) for k, v in lid2classification.items()) 23 | classifications = list(lid2classification.values()) # 新闻类别 24 | max_num_per_page = 50 25 | 26 | columns = ['classify', 'title', 'time', 'url', 'wapurl', 'media_name', 'keywords', 'content'] 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jacen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rolling-news 2 | 获取滚动新闻 3 | 4 | #### 安装和卸载 5 | * 安装 6 | ```bash 7 | python setup.py build 8 | python setup.py install --record log 9 | ``` 10 | * 卸载 11 | ```bash 12 | cat log | xargs rm -rf 13 | ``` 14 | 15 | #### 新闻 16 | 1. 新浪财经 17 | ``` 18 | 全部 19 | https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1 20 | 21 | 国内 22 | https://news.sina.com.cn/roll/#pageid=153&lid=2510&k=&num=50&page=1 23 | 24 | 国际 25 | https://news.sina.com.cn/roll/#pageid=153&lid=2511&k=&num=50&page=1 26 | 27 | 社会 28 | https://news.sina.com.cn/roll/#pageid=153&lid=2669&k=&num=50&page=1 29 | 30 | 体育 31 | https://news.sina.com.cn/roll/#pageid=153&lid=2512&k=&num=50&page=1 32 | 33 | 娱乐 34 | https://news.sina.com.cn/roll/#pageid=153&lid=2513&k=&num=50&page=1 35 | 36 | 军事 37 | https://news.sina.com.cn/roll/#pageid=153&lid=2514&k=&num=50&page=1 38 | 39 | 科技 40 | https://news.sina.com.cn/roll/#pageid=153&lid=2515&k=&num=50&page=1 41 | 42 | 财经 43 | https://news.sina.com.cn/roll/#pageid=153&lid=2516&k=&num=50&page=1 44 | 45 | 股市 46 | https://news.sina.com.cn/roll/#pageid=153&lid=2517&k=&num=50&page=1 47 | 48 | 美股 49 | https://news.sina.com.cn/roll/#pageid=153&lid=2518&k=&num=50&page=1 50 | ``` 51 | -------------------------------------------------------------------------------- /tests/test_sinanews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from rlnews import sinanews 4 | 5 | 6 | def test_sinanews(): 7 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='全部', path='全部.csv') 8 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内', path='国内.csv') 9 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国际', path='国际.csv') 10 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='社会', path='社会.csv') 11 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='体育', path='体育.csv') 12 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='娱乐', path='娱乐.csv') 13 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='军事', path='军事.csv') 14 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='科技', path='科技.csv') 15 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='财经', path='财经.csv') 16 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='股市', path='股市.csv') 17 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='美股', path='美股.csv') 18 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内_国际', path='国内_国际.csv') 19 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内_社会', path='国内_社会.csv') 20 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国际_社会', path='国际_社会.csv') 21 | sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内国际社会', path='国内国际社会.csv') 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | from setuptools import find_packages, setup 6 | 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: 10 | long_description = "\n" + f.read() 11 | 12 | about = {} 13 | 14 | with open(os.path.join(here, "rlnews", "__version__.py")) as f: 15 | exec(f.read(), about) 16 | 17 | required = [ 18 | "lxml", 19 | "pandas" 20 | ] 21 | 22 | setup( 23 | name="rlnews", 24 | version=about["__version__"], 25 | description="get rolling news", 26 | long_description=long_description, 27 | # long_description_content_type="text/markdown", 28 | author="Jacen Ye", 29 | author_email="jacen789@gmail.com", 30 | url="https://github.com/jacen789/rolling-news", 31 | packages=find_packages(exclude=["tests", "tests.*"]), 32 | python_requires=">=3.4", 33 | setup_requires=None, 34 | install_requires=required, 35 | extras_require={}, 36 | include_package_data=True, 37 | license="MIT", 38 | classifiers=[ 39 | "License :: OSI Approved :: MIT License", 40 | "Programming Language :: Python", 41 | "Programming Language :: Python :: 3", 42 | "Programming Language :: Python :: 3.4", 43 | "Programming Language :: Python :: 3.5", 44 | "Programming Language :: Python :: 3.6", 45 | "Programming Language :: Python :: 3.7", 46 | "Programming Language :: Python :: Implementation :: CPython", 47 | "Programming Language :: Python :: Implementation :: PyPy", 48 | ], 49 | ) 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /rlnews/utils/disk_cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import re 5 | import shutil 6 | import zlib 7 | import pickle 8 | from urllib.parse import urlsplit 9 | from datetime import datetime, timedelta 10 | 11 | 12 | class DiskCache: 13 | """磁盘缓存""" 14 | 15 | def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True): 16 | """ 17 | cache_dir: 缓存的根级别文件夹 18 | expires: 缓存项被视为过期之前的时间增量 19 | compress: 是否压缩缓存中的数据 20 | """ 21 | self.cache_dir = cache_dir 22 | self.expires = expires 23 | self.compress = compress 24 | 25 | def __getitem__(self, url): 26 | """从磁盘加载此 url 的数据 27 | """ 28 | path = self.url_to_path(url) 29 | if os.path.exists(path): 30 | with open(path, 'rb') as fp: 31 | data = fp.read() 32 | if self.compress: 33 | data = zlib.decompress(data) 34 | result, timestamp = pickle.loads(data) 35 | if self.has_expired(timestamp): 36 | raise KeyError(url + ' has expired') 37 | return result 38 | else: 39 | # url 的数据尚未缓存 40 | raise KeyError(url + ' does not exist') 41 | 42 | def __setitem__(self, url, result): 43 | """将数据保存到此 url 的磁盘 44 | """ 45 | path = self.url_to_path(url) 46 | folder = os.path.dirname(path) 47 | if not os.path.exists(folder): 48 | os.makedirs(folder) 49 | 50 | data = pickle.dumps((result, datetime.utcnow())) 51 | if self.compress: 52 | data = zlib.compress(data) 53 | with open(path, 'wb') as fp: 54 | fp.write(data) 55 | 56 | def __delitem__(self, url): 57 | """删除此 url 的数据和所有空的子目录 58 | """ 59 | path = self.url_to_path(url) 60 | try: 61 | os.remove(path) 62 | os.removedirs(os.path.dirname(path)) 63 | except OSError: 64 | pass 65 | 66 | def url_to_path(self, url): 67 | """为此 url 创建文件系统路径 68 | """ 69 | components = urlsplit(url) 70 | # 当路径为空,设置为 /index.html 71 | path = components.path 72 | if not path: 73 | path = '/index.html' 74 | elif path.endswith('/'): 75 | path += 'index.html' 76 | filename = components.netloc + path + components.query 77 | # 替换无效字符 78 | filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename) 79 | # 限制最大字符数 80 | filename = '/'.join(segment[:255] for segment in filename.split('/')) 81 | return os.path.join(self.cache_dir, filename) 82 | 83 | def has_expired(self, timestamp): 84 | """返回此时间戳是否已过期 85 | """ 86 | return datetime.utcnow() > timestamp + self.expires 87 | 88 | def clear(self): 89 | """删除所有缓存的数据 90 | """ 91 | if os.path.exists(self.cache_dir): 92 | shutil.rmtree(self.cache_dir) 93 | -------------------------------------------------------------------------------- /rlnews/utils/downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | import random 5 | import socket 6 | import urllib.parse 7 | import urllib.request 8 | from datetime import datetime 9 | 10 | DEFAULT_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 11 | DEFAULT_DELAY = 5 12 | DEFAULT_RETRIES = 1 13 | DEFAULT_TIMEOUT = 60 14 | 15 | 16 | class Downloader: 17 | def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES, 18 | timeout=DEFAULT_TIMEOUT, opener=None, cache=None): 19 | socket.setdefaulttimeout(timeout) 20 | self.throttle = Throttle(delay) 21 | self.user_agent = user_agent 22 | self.proxies = proxies 23 | self.num_retries = num_retries 24 | self.opener = opener 25 | self.cache = cache 26 | 27 | def __call__(self, url): 28 | result = None 29 | if self.cache: 30 | try: 31 | result = self.cache[url] 32 | except KeyError: 33 | # url 在缓存中不可用 34 | pass 35 | else: 36 | if (not result['code']) or (self.num_retries > 0 and 500 <= result['code'] < 600): 37 | # 服务器错误, 因此忽略 result 中的缓存,重新下载 38 | result = None 39 | if result is None: 40 | # result 没有从缓存加载, 所以仍然需要下载 41 | self.throttle.wait(url) 42 | proxy = random.choice(self.proxies) if self.proxies else None 43 | headers = {'User-agent': self.user_agent} 44 | result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries) 45 | if self.cache: 46 | # 将 result 保存到缓存 47 | self.cache[url] = result 48 | return result['html'] 49 | 50 | def download(self, url, headers, proxy, num_retries, data=None): 51 | print('Downloading:', url) 52 | request = urllib.request.Request(url, data, headers or {}) 53 | opener = self.opener or urllib.request.build_opener() 54 | if proxy: 55 | proxy_params = {urllib.parse.urlparse(url).scheme: proxy} 56 | opener.add_handler(urllib.request.ProxyHandler(proxy_params)) 57 | try: 58 | response = opener.open(request) 59 | html = response.read() 60 | code = response.code 61 | except Exception as e: 62 | print('Download error:', str(e)) 63 | html = '' 64 | if hasattr(e, 'code'): 65 | code = e.code 66 | if num_retries > 0 and 500 <= code < 600: 67 | # 5XX HTTP 错误,重新下载 68 | return self.download(url, headers, proxy, num_retries - 1, data) 69 | else: 70 | code = None 71 | return {'html': html, 'code': code} 72 | 73 | 74 | class Throttle: 75 | """通过在请求同一域之间休眠一段时间来限制下载 76 | """ 77 | 78 | def __init__(self, delay): 79 | # 每个域的下载之间的延迟量 80 | self.delay = delay 81 | # 上次访问域的时间戳 82 | self.domains = {} 83 | 84 | def wait(self, url): 85 | """如果最近访问过此域则进行延迟 86 | """ 87 | domain = urllib.parse.urlsplit(url).netloc 88 | last_accessed = self.domains.get(domain) 89 | if self.delay > 0 and last_accessed is not None: 90 | sleep_secs = self.delay - (datetime.now() - last_accessed).seconds 91 | if sleep_secs > 0: 92 | time.sleep(sleep_secs) 93 | self.domains[domain] = datetime.now() 94 | -------------------------------------------------------------------------------- /rlnews/sinanews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 新浪新闻数据接口 5 | """ 6 | 7 | import re 8 | import json 9 | import random 10 | import lxml.html 11 | import lxml.etree 12 | import pandas as pd 13 | from datetime import datetime 14 | 15 | from rlnews import sina_constants as cts 16 | from rlnews.utils.downloader import Downloader 17 | from rlnews.utils.disk_cache import DiskCache 18 | 19 | no_cache_downloader = Downloader(cache=None) 20 | disk_cache_downloader = Downloader(cache=DiskCache()) 21 | 22 | 23 | def get_rolling_news_csv(top=50, get_content=True, classify=None, path=None): 24 | """ 25 | 获取新浪滚动新闻并保存成csv文件 26 | :param top: int, 获取的滚动新闻条数,默认为50 27 | :param get_content: bool, 是否获取新闻内容,默认为True 28 | :param classify: str, 获取的滚动新闻的类别,默认为None,即"2509:全部" 29 | :param path: str, 文件保存路径 30 | """ 31 | df = get_rolling_news(top=top, get_content=get_content, classify=classify) 32 | if not path: 33 | path = 'news.csv' 34 | df.to_csv(path, index=False, encoding='utf-8') 35 | 36 | 37 | def get_rolling_news(top=50, get_content=True, classify=None): 38 | """ 39 | 获取新浪滚动新闻 40 | :param top: int, 获取的滚动新闻条数,默认为50 41 | :param get_content: bool, 是否获取新闻内容,默认为True 42 | :param classify: str, 获取的滚动新闻的类别,默认为None,即"2509:全部" 43 | :return: pd.DataFrame, 新闻信息数据框 44 | """ 45 | if classify: 46 | assert classify in cts.classifications, ( 47 | '请设置 classify 为 {}中的一个'.format(cts.classifications) 48 | ) 49 | 50 | lid = cts.classification2lid.get(classify, '2509') 51 | classify = cts.lid2classification[lid] 52 | num_list = [cts.max_num_per_page] * (top // cts.max_num_per_page) 53 | last_page_num = top % cts.max_num_per_page 54 | if last_page_num: 55 | num_list += [last_page_num] 56 | 57 | df_data = [] 58 | for page, num in enumerate(num_list, start=1): 59 | r = random.random() 60 | url = cts.template_url.format(lid, num, page, r) 61 | response = no_cache_downloader(url) 62 | response_dict = json.loads(response) 63 | data_list = response_dict['result']['data'] 64 | 65 | for data in data_list: 66 | ctime = datetime.fromtimestamp(int(data['ctime'])) 67 | ctime = datetime.strftime(ctime, '%Y-%m-%d %H:%M') 68 | url = data['url'] 69 | row = [classify, data['title'], ctime, 70 | url, data['wapurl'], data['media_name'], data['keywords']] 71 | if get_content: 72 | row.append(get_news_content(url)) 73 | df_data.append(row) 74 | df = pd.DataFrame(df_data, columns=cts.columns if get_content else cts.columns[:-1]) 75 | return df 76 | 77 | 78 | def get_rolling_news_url(top=50, classify=None): 79 | """ 80 | 获取新浪滚动新闻url 81 | :param top: int, 获取的滚动新闻条数,默认为50 82 | :param classify: str, 获取的滚动新闻的类别,默认为None,即"2509:全部" 83 | :return: pd.DataFrame, 新闻信息数据框 84 | """ 85 | if classify: 86 | assert classify in cts.classifications, ( 87 | '请设置 classify 为 {}中的一个'.format(cts.classifications) 88 | ) 89 | 90 | lid = cts.classification2lid.get(classify, '2509') 91 | num_list = [cts.max_num_per_page] * (top // cts.max_num_per_page) 92 | last_page_num = top % cts.max_num_per_page 93 | if last_page_num: 94 | num_list += [last_page_num] 95 | 96 | urls = [] 97 | for page, num in enumerate(num_list, start=1): 98 | r = random.random() 99 | url = cts.template_url.format(lid, num, page, r) 100 | response = no_cache_downloader(url) 101 | response_dict = json.loads(response) 102 | data_list = response_dict['result']['data'] 103 | for data in data_list: 104 | url = data['url'] 105 | urls.append(url) 106 | return urls 107 | 108 | 109 | def get_news_content(url): 110 | """ 111 | 获取新闻内容 112 | :param url: str, 新闻链接 113 | :return: str, 新闻内容 114 | """ 115 | content = '' 116 | try: 117 | text = disk_cache_downloader(url) 118 | html = lxml.etree.HTML(text) 119 | res = html.xpath('//*[@id="artibody" or @id="article"]//p') 120 | p_str_list = [lxml.etree.tostring(node).decode('utf-8') for node in res] 121 | p_str = ''.join(p_str_list) 122 | html_content = lxml.html.fromstring(p_str) 123 | content = html_content.text_content() 124 | # 清理未知字符和空白字符 125 | content = re.sub(r'\u3000', '', content) 126 | content = re.sub(r'[ \xa0?]+', ' ', content) 127 | content = re.sub(r'\s*\n\s*', '\n', content) 128 | content = re.sub(r'\s*(\s)', r'\1', content) 129 | content = content.strip() 130 | except Exception as e: 131 | print('get_news_content(%s) error:' % url, e) 132 | return content 133 | 134 | 135 | if __name__ == '__main__': 136 | get_rolling_news_csv(top=5, get_content=True, classify='全部') 137 | --------------------------------------------------------------------------------