├── tests
    ├── __init__.py
    └── test_sinanews.py
├── rlnews
    ├── utils
    │   ├── __init__.py
    │   ├── disk_cache.py
    │   └── downloader.py
    ├── __version__.py
    ├── __init__.py
    ├── sina_constants.py
    └── sinanews.py
├── LICENSE
├── README.md
├── setup.py
└── .gitignore


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 


--------------------------------------------------------------------------------
/rlnews/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 


--------------------------------------------------------------------------------
/rlnews/__version__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | __version__ = "0.0.1"
4 | 


--------------------------------------------------------------------------------
/rlnews/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from rlnews.utils import downloader
4 | from rlnews.utils import disk_cache
5 | from rlnews import sinanews
6 | 


--------------------------------------------------------------------------------
/rlnews/sina_constants.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | template_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid={}&k=&num={}&page={}&r={}'
 4 | 
 5 | lid2classification = {
 6 |     "2509": "全部",
 7 |     "2510": "国内",
 8 |     "2511": "国际",
 9 |     "2669": "社会",
10 |     "2512": "体育",
11 |     "2513": "娱乐",
12 |     "2514": "军事",
13 |     "2515": "科技",
14 |     "2516": "财经",
15 |     "2517": "股市",
16 |     "2518": "美股",
17 |     "2968": "国内_国际",
18 |     "2970": "国内_社会",
19 |     "2972": "国际_社会",
20 |     "2974": "国内国际社会"
21 | }
22 | classification2lid = dict((v, k) for k, v in lid2classification.items())
23 | classifications = list(lid2classification.values())  # 新闻类别
24 | max_num_per_page = 50
25 | 
26 | columns = ['classify', 'title', 'time', 'url', 'wapurl', 'media_name', 'keywords', 'content']
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Jacen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # rolling-news
 2 | 获取滚动新闻
 3 | 
 4 | #### 安装和卸载
 5 | * 安装
 6 | ```bash
 7 | python setup.py build
 8 | python setup.py install --record log
 9 | ```
10 | * 卸载
11 | ```bash
12 | cat log | xargs rm -rf
13 | ```
14 | 
15 | #### 新闻
16 | 1. 新浪财经
17 | ```
18 | 全部
19 | https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1
20 | 
21 | 国内
22 | https://news.sina.com.cn/roll/#pageid=153&lid=2510&k=&num=50&page=1
23 | 
24 | 国际
25 | https://news.sina.com.cn/roll/#pageid=153&lid=2511&k=&num=50&page=1
26 | 
27 | 社会
28 | https://news.sina.com.cn/roll/#pageid=153&lid=2669&k=&num=50&page=1
29 | 
30 | 体育
31 | https://news.sina.com.cn/roll/#pageid=153&lid=2512&k=&num=50&page=1
32 | 
33 | 娱乐
34 | https://news.sina.com.cn/roll/#pageid=153&lid=2513&k=&num=50&page=1
35 | 
36 | 军事
37 | https://news.sina.com.cn/roll/#pageid=153&lid=2514&k=&num=50&page=1
38 | 
39 | 科技
40 | https://news.sina.com.cn/roll/#pageid=153&lid=2515&k=&num=50&page=1
41 | 
42 | 财经
43 | https://news.sina.com.cn/roll/#pageid=153&lid=2516&k=&num=50&page=1
44 | 
45 | 股市
46 | https://news.sina.com.cn/roll/#pageid=153&lid=2517&k=&num=50&page=1
47 | 
48 | 美股
49 | https://news.sina.com.cn/roll/#pageid=153&lid=2518&k=&num=50&page=1
50 | ```
51 | 


--------------------------------------------------------------------------------
/tests/test_sinanews.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from rlnews import sinanews
 4 | 
 5 | 
 6 | def test_sinanews():
 7 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='全部', path='全部.csv')
 8 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内', path='国内.csv')
 9 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国际', path='国际.csv')
10 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='社会', path='社会.csv')
11 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='体育', path='体育.csv')
12 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='娱乐', path='娱乐.csv')
13 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='军事', path='军事.csv')
14 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='科技', path='科技.csv')
15 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='财经', path='财经.csv')
16 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='股市', path='股市.csv')
17 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='美股', path='美股.csv')
18 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内_国际', path='国内_国际.csv')
19 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内_社会', path='国内_社会.csv')
20 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国际_社会', path='国际_社会.csv')
21 |     sinanews.get_rolling_news_csv(top=5, get_content=True, classify='国内国际社会', path='国内国际社会.csv')
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os
 4 | 
 5 | from setuptools import find_packages, setup
 6 | 
 7 | here = os.path.abspath(os.path.dirname(__file__))
 8 | 
 9 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
10 |     long_description = "\n" + f.read()
11 | 
12 | about = {}
13 | 
14 | with open(os.path.join(here, "rlnews", "__version__.py")) as f:
15 |     exec(f.read(), about)
16 | 
17 | required = [
18 |     "lxml",
19 |     "pandas"
20 | ]
21 | 
22 | setup(
23 |     name="rlnews",
24 |     version=about["__version__"],
25 |     description="get rolling news",
26 |     long_description=long_description,
27 |     # long_description_content_type="text/markdown",
28 |     author="Jacen Ye",
29 |     author_email="jacen789@gmail.com",
30 |     url="https://github.com/jacen789/rolling-news",
31 |     packages=find_packages(exclude=["tests", "tests.*"]),
32 |     python_requires=">=3.4",
33 |     setup_requires=None,
34 |     install_requires=required,
35 |     extras_require={},
36 |     include_package_data=True,
37 |     license="MIT",
38 |     classifiers=[
39 |         "License :: OSI Approved :: MIT License",
40 |         "Programming Language :: Python",
41 |         "Programming Language :: Python :: 3",
42 |         "Programming Language :: Python :: 3.4",
43 |         "Programming Language :: Python :: 3.5",
44 |         "Programming Language :: Python :: 3.6",
45 |         "Programming Language :: Python :: 3.7",
46 |         "Programming Language :: Python :: Implementation :: CPython",
47 |         "Programming Language :: Python :: Implementation :: PyPy",
48 |     ],
49 | )
50 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/rlnews/utils/disk_cache.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os
 4 | import re
 5 | import shutil
 6 | import zlib
 7 | import pickle
 8 | from urllib.parse import urlsplit
 9 | from datetime import datetime, timedelta
10 | 
11 | 
12 | class DiskCache:
13 |     """磁盘缓存"""
14 | 
15 |     def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
16 |         """
17 |         cache_dir: 缓存的根级别文件夹
18 |         expires: 缓存项被视为过期之前的时间增量
19 |         compress: 是否压缩缓存中的数据
20 |         """
21 |         self.cache_dir = cache_dir
22 |         self.expires = expires
23 |         self.compress = compress
24 | 
25 |     def __getitem__(self, url):
26 |         """从磁盘加载此 url 的数据
27 |         """
28 |         path = self.url_to_path(url)
29 |         if os.path.exists(path):
30 |             with open(path, 'rb') as fp:
31 |                 data = fp.read()
32 |                 if self.compress:
33 |                     data = zlib.decompress(data)
34 |                 result, timestamp = pickle.loads(data)
35 |                 if self.has_expired(timestamp):
36 |                     raise KeyError(url + ' has expired')
37 |                 return result
38 |         else:
39 |             # url 的数据尚未缓存
40 |             raise KeyError(url + ' does not exist')
41 | 
42 |     def __setitem__(self, url, result):
43 |         """将数据保存到此 url 的磁盘
44 |         """
45 |         path = self.url_to_path(url)
46 |         folder = os.path.dirname(path)
47 |         if not os.path.exists(folder):
48 |             os.makedirs(folder)
49 | 
50 |         data = pickle.dumps((result, datetime.utcnow()))
51 |         if self.compress:
52 |             data = zlib.compress(data)
53 |         with open(path, 'wb') as fp:
54 |             fp.write(data)
55 | 
56 |     def __delitem__(self, url):
57 |         """删除此 url 的数据和所有空的子目录
58 |         """
59 |         path = self.url_to_path(url)
60 |         try:
61 |             os.remove(path)
62 |             os.removedirs(os.path.dirname(path))
63 |         except OSError:
64 |             pass
65 | 
66 |     def url_to_path(self, url):
67 |         """为此 url 创建文件系统路径
68 |         """
69 |         components = urlsplit(url)
70 |         # 当路径为空，设置为 /index.html
71 |         path = components.path
72 |         if not path:
73 |             path = '/index.html'
74 |         elif path.endswith('/'):
75 |             path += 'index.html'
76 |         filename = components.netloc + path + components.query
77 |         # 替换无效字符
78 |         filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
79 |         # 限制最大字符数
80 |         filename = '/'.join(segment[:255] for segment in filename.split('/'))
81 |         return os.path.join(self.cache_dir, filename)
82 | 
83 |     def has_expired(self, timestamp):
84 |         """返回此时间戳是否已过期
85 |         """
86 |         return datetime.utcnow() > timestamp + self.expires
87 | 
88 |     def clear(self):
89 |         """删除所有缓存的数据
90 |         """
91 |         if os.path.exists(self.cache_dir):
92 |             shutil.rmtree(self.cache_dir)
93 | 


--------------------------------------------------------------------------------
/rlnews/utils/downloader.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import time
 4 | import random
 5 | import socket
 6 | import urllib.parse
 7 | import urllib.request
 8 | from datetime import datetime
 9 | 
10 | DEFAULT_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
11 | DEFAULT_DELAY = 5
12 | DEFAULT_RETRIES = 1
13 | DEFAULT_TIMEOUT = 60
14 | 
15 | 
16 | class Downloader:
17 |     def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES,
18 |                  timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
19 |         socket.setdefaulttimeout(timeout)
20 |         self.throttle = Throttle(delay)
21 |         self.user_agent = user_agent
22 |         self.proxies = proxies
23 |         self.num_retries = num_retries
24 |         self.opener = opener
25 |         self.cache = cache
26 | 
27 |     def __call__(self, url):
28 |         result = None
29 |         if self.cache:
30 |             try:
31 |                 result = self.cache[url]
32 |             except KeyError:
33 |                 # url 在缓存中不可用
34 |                 pass
35 |             else:
36 |                 if (not result['code']) or (self.num_retries > 0 and 500 <= result['code'] < 600):
37 |                     # 服务器错误, 因此忽略 result 中的缓存，重新下载
38 |                     result = None
39 |         if result is None:
40 |             # result 没有从缓存加载, 所以仍然需要下载
41 |             self.throttle.wait(url)
42 |             proxy = random.choice(self.proxies) if self.proxies else None
43 |             headers = {'User-agent': self.user_agent}
44 |             result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
45 |             if self.cache:
46 |                 # 将 result 保存到缓存
47 |                 self.cache[url] = result
48 |         return result['html']
49 | 
50 |     def download(self, url, headers, proxy, num_retries, data=None):
51 |         print('Downloading:', url)
52 |         request = urllib.request.Request(url, data, headers or {})
53 |         opener = self.opener or urllib.request.build_opener()
54 |         if proxy:
55 |             proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
56 |             opener.add_handler(urllib.request.ProxyHandler(proxy_params))
57 |         try:
58 |             response = opener.open(request)
59 |             html = response.read()
60 |             code = response.code
61 |         except Exception as e:
62 |             print('Download error:', str(e))
63 |             html = ''
64 |             if hasattr(e, 'code'):
65 |                 code = e.code
66 |                 if num_retries > 0 and 500 <= code < 600:
67 |                     # 5XX HTTP 错误，重新下载
68 |                     return self.download(url, headers, proxy, num_retries - 1, data)
69 |             else:
70 |                 code = None
71 |         return {'html': html, 'code': code}
72 | 
73 | 
74 | class Throttle:
75 |     """通过在请求同一域之间休眠一段时间来限制下载
76 |     """
77 | 
78 |     def __init__(self, delay):
79 |         # 每个域的下载之间的延迟量
80 |         self.delay = delay
81 |         # 上次访问域的时间戳
82 |         self.domains = {}
83 | 
84 |     def wait(self, url):
85 |         """如果最近访问过此域则进行延迟
86 |         """
87 |         domain = urllib.parse.urlsplit(url).netloc
88 |         last_accessed = self.domains.get(domain)
89 |         if self.delay > 0 and last_accessed is not None:
90 |             sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
91 |             if sleep_secs > 0:
92 |                 time.sleep(sleep_secs)
93 |         self.domains[domain] = datetime.now()
94 | 


--------------------------------------------------------------------------------
/rlnews/sinanews.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | 新浪新闻数据接口
  5 | """
  6 | 
  7 | import re
  8 | import json
  9 | import random
 10 | import lxml.html
 11 | import lxml.etree
 12 | import pandas as pd
 13 | from datetime import datetime
 14 | 
 15 | from rlnews import sina_constants as cts
 16 | from rlnews.utils.downloader import Downloader
 17 | from rlnews.utils.disk_cache import DiskCache
 18 | 
 19 | no_cache_downloader = Downloader(cache=None)
 20 | disk_cache_downloader = Downloader(cache=DiskCache())
 21 | 
 22 | 
 23 | def get_rolling_news_csv(top=50, get_content=True, classify=None, path=None):
 24 |     """
 25 |     获取新浪滚动新闻并保存成csv文件
 26 |     :param top: int, 获取的滚动新闻条数，默认为50
 27 |     :param get_content: bool, 是否获取新闻内容，默认为True
 28 |     :param classify: str, 获取的滚动新闻的类别，默认为None，即"2509:全部"
 29 |     :param path: str, 文件保存路径
 30 |     """
 31 |     df = get_rolling_news(top=top, get_content=get_content, classify=classify)
 32 |     if not path:
 33 |         path = 'news.csv'
 34 |     df.to_csv(path, index=False, encoding='utf-8')
 35 | 
 36 | 
 37 | def get_rolling_news(top=50, get_content=True, classify=None):
 38 |     """
 39 |     获取新浪滚动新闻
 40 |     :param top: int, 获取的滚动新闻条数，默认为50
 41 |     :param get_content: bool, 是否获取新闻内容，默认为True
 42 |     :param classify: str, 获取的滚动新闻的类别，默认为None，即"2509:全部"
 43 |     :return: pd.DataFrame, 新闻信息数据框
 44 |     """
 45 |     if classify:
 46 |         assert classify in cts.classifications, (
 47 |             '请设置 classify 为 {}中的一个'.format(cts.classifications)
 48 |         )
 49 | 
 50 |     lid = cts.classification2lid.get(classify, '2509')
 51 |     classify = cts.lid2classification[lid]
 52 |     num_list = [cts.max_num_per_page] * (top // cts.max_num_per_page)
 53 |     last_page_num = top % cts.max_num_per_page
 54 |     if last_page_num:
 55 |         num_list += [last_page_num]
 56 | 
 57 |     df_data = []
 58 |     for page, num in enumerate(num_list, start=1):
 59 |         r = random.random()
 60 |         url = cts.template_url.format(lid, num, page, r)
 61 |         response = no_cache_downloader(url)
 62 |         response_dict = json.loads(response)
 63 |         data_list = response_dict['result']['data']
 64 | 
 65 |         for data in data_list:
 66 |             ctime = datetime.fromtimestamp(int(data['ctime']))
 67 |             ctime = datetime.strftime(ctime, '%Y-%m-%d %H:%M')
 68 |             url = data['url']
 69 |             row = [classify, data['title'], ctime,
 70 |                    url, data['wapurl'], data['media_name'], data['keywords']]
 71 |             if get_content:
 72 |                 row.append(get_news_content(url))
 73 |             df_data.append(row)
 74 |     df = pd.DataFrame(df_data, columns=cts.columns if get_content else cts.columns[:-1])
 75 |     return df
 76 | 
 77 | 
 78 | def get_rolling_news_url(top=50, classify=None):
 79 |     """
 80 |     获取新浪滚动新闻url
 81 |     :param top: int, 获取的滚动新闻条数，默认为50
 82 |     :param classify: str, 获取的滚动新闻的类别，默认为None，即"2509:全部"
 83 |     :return: pd.DataFrame, 新闻信息数据框
 84 |     """
 85 |     if classify:
 86 |         assert classify in cts.classifications, (
 87 |             '请设置 classify 为 {}中的一个'.format(cts.classifications)
 88 |         )
 89 | 
 90 |     lid = cts.classification2lid.get(classify, '2509')
 91 |     num_list = [cts.max_num_per_page] * (top // cts.max_num_per_page)
 92 |     last_page_num = top % cts.max_num_per_page
 93 |     if last_page_num:
 94 |         num_list += [last_page_num]
 95 | 
 96 |     urls = []
 97 |     for page, num in enumerate(num_list, start=1):
 98 |         r = random.random()
 99 |         url = cts.template_url.format(lid, num, page, r)
100 |         response = no_cache_downloader(url)
101 |         response_dict = json.loads(response)
102 |         data_list = response_dict['result']['data']
103 |         for data in data_list:
104 |             url = data['url']
105 |             urls.append(url)
106 |     return urls
107 | 
108 | 
109 | def get_news_content(url):
110 |     """
111 |     获取新闻内容
112 |     :param url: str, 新闻链接
113 |     :return: str, 新闻内容
114 |     """
115 |     content = ''
116 |     try:
117 |         text = disk_cache_downloader(url)
118 |         html = lxml.etree.HTML(text)
119 |         res = html.xpath('//*[@id="artibody" or @id="article"]//p')
120 |         p_str_list = [lxml.etree.tostring(node).decode('utf-8') for node in res]
121 |         p_str = ''.join(p_str_list)
122 |         html_content = lxml.html.fromstring(p_str)
123 |         content = html_content.text_content()
124 |         # 清理未知字符和空白字符
125 |         content = re.sub(r'\u3000', '', content)
126 |         content = re.sub(r'[ \xa0?]+', ' ', content)
127 |         content = re.sub(r'\s*\n\s*', '\n', content)
128 |         content = re.sub(r'\s*(\s)', r'\1', content)
129 |         content = content.strip()
130 |     except Exception as e:
131 |         print('get_news_content(%s) error:' % url, e)
132 |     return content
133 | 
134 | 
135 | if __name__ == '__main__':
136 |     get_rolling_news_csv(top=5, get_content=True, classify='全部')
137 | 


--------------------------------------------------------------------------------