├── README.md ├── news-crawler │   ├── config.py │   ├── simple-news-crawler.py │   ├── bee_server.py │   ├── lxml_demo.py │   ├── news-crawler-sync.py │   ├── news-crawler-async.py │   ├── functions.py │   ├── bee_client.py │   ├── ezpymysql.py │   ├── urlpool.py │   └── maincontent.py ├── LICENSE ├── .gitignore ├── selenium-login.py └── weibologin.py /README.md: -------------------------------------------------------------------------------- 1 | # python-crawler 2 | An asynchronous, high-concurrency, distributed crawler framework. 3 | -------------------------------------------------------------------------------- /news-crawler/config.py: -------------------------------------------------------------------------------- 1 | # Author: veelion 2 | 3 | 4 | db_host = 'localhost' 5 | db_db = 'crawler' 6 | db_user = 'your-user' 7 | db_password = 'your-password' 8 | 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 veelion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /news-crawler/simple-news-crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import requests 7 | import tldextract 8 | 9 | 10 | def save_to_db(url, html): 11 | print('%s : %s' % (url, len(html))) 12 | 13 | 14 | def crawl(): 15 | # 1. download baidu news 16 | hub_url = 'http://news.baidu.com/' 17 | html = requests.get(hub_url).text 18 | 19 | # 2. extract news links 20 | ## 2.1 extract all links with 'href' 21 | links = re.findall(r'href=[\'"]?(.*?)[\'"\s]', html) 22 | print('find links:', len(links)) 23 | news_links = [] 24 | ## 2.2 filter out non-news links 25 | for link in links: 26 | if not link.startswith('http'): 27 | continue 28 | tld = tldextract.extract(link) 29 | if tld.domain == 'baidu': 30 | continue 31 | news_links.append(link) 32 | 33 | print('find news links:', len(news_links)) 34 | # 3. 
download news and save to database 35 | for link in news_links: 36 | html = requests.get(link).text 37 | save_to_db(link, html) 38 | print('works done!') 39 | 40 | 41 | if __name__ == '__main__': 42 | crawl() 43 | -------------------------------------------------------------------------------- /news-crawler/bee_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf8 3 | # author: veelion 4 | # file: bee_server.py 5 | 6 | from sanic import Sanic 7 | from sanic import response 8 | 9 | from urlpool import UrlPool 10 | 11 | urlpool = UrlPool(__file__) 12 | 13 | # 初始化urlpool,根据你的需要进行修改 14 | hub_urls = [] 15 | urlpool.set_hubs(hub_urls, 300) 16 | urlpool.add('https://news.sina.com.cn/') 17 | 18 | # init 19 | app = Sanic(__name__) 20 | 21 | 22 | @app.listener('after_server_stop') 23 | async def cache_urlpool(app, loop): 24 | global urlpool 25 | print('caching urlpool after_server_stop') 26 | del urlpool 27 | print('bye!') 28 | 29 | 30 | @app.route('/task') 31 | async def task_get(request): 32 | count = request.args.get('count', 10) 33 | try: 34 | count = int(count) 35 | except: 36 | count = 10 37 | urls = urlpool.pop(count) 38 | return response.json(urls) 39 | 40 | 41 | @app.route('/task', methods=['POST', ]) 42 | async def task_post(request): 43 | result = request.json 44 | urlpool.set_status(result['url'], result['status']) 45 | if result['url_real'] != result['url']: 46 | urlpool.set_status(result['url_real'], result['status']) 47 | if result['newurls']: 48 | print('receive URLs:', len(result['newurls'])) 49 | for url in result['newurls']: 50 | urlpool.add(url) 51 | return response.text('ok') 52 | 53 | 54 | if __name__ == '__main__': 55 | app.run( 56 | host='0.0.0.0', 57 | port=8080, 58 | debug=False, 59 | access_log=False, 60 | workers=1) 61 | 62 | -------------------------------------------------------------------------------- /news-crawler/lxml_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import requests 7 | import lxml.html 8 | from pprint import pprint 9 | 10 | 11 | def parse(li): 12 | item = {} 13 | # class="thumb"的div有两个,第一个是类别链接,第二个是文章链接 14 | thumb = li.xpath('./div[@class="thumb"]/a') 15 | item['cat'] = thumb[0].text 16 | item['link'] = thumb[1].get('href') 17 | 18 | # 获取title 19 | el_title = li.xpath('.//h2[@class="info-tit"]/a')[0] 20 | item['title'] = el_title.text 21 | 22 | el_info = li.xpath('.//div[@class="info-item"]/span') 23 | for span in el_info: 24 | attr = span.get('class') 25 | if attr == 'author': 26 | item['author'] = span.text_content() 27 | elif attr == 'time': 28 | item['time'] = span.text_content() 29 | elif attr == 'view': 30 | digit = re.findall(r'\d+', span.text_content())[0] 31 | item['view_count'] = int(digit) 32 | elif attr == 'cmt': 33 | digit = re.findall(r'\d+', span.text_content())[0] 34 | item['cmt_count'] = int(digit) 35 | return item 36 | 37 | 38 | def main(): 39 | url = 'https://www.yuanrenxue.com/' 40 | headers = {'User-Agent': 'Firefox'} 41 | resp = requests.get(url, headers=headers) 42 | html = resp.content.decode('utf8') 43 | doc = lxml.html.fromstring(html) 44 | xp = '//ul[@id="postlist"]/li' 45 | lis = doc.xpath(xp) 46 | print('lis:', len(lis)) 47 | 48 | articles = [parse(li) for li in lis] 49 | print('articles:', len(articles)) 50 | pprint(articles[0]) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | 
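The parse() function above hard-codes a very specific page structure (ul#postlist > li, two links inside div.thumb, spans inside div.info-item). A quick way to sanity-check it without requesting yuanrenxue.com is to run it against a small hand-written fixture that mirrors those class names; the HTML below is invented purely for illustration and is not real site markup:

# Offline sanity check for parse(); the fixture is made up and only mirrors
# the class/id names that the XPath expressions in this file expect.
import lxml.html
from pprint import pprint

FIXTURE_HTML = '''
<ul id="postlist">
  <li>
    <div class="thumb">
      <a href="/category/spider">spider</a>
      <a href="/article/1.html">thumbnail</a>
    </div>
    <h2 class="info-tit"><a href="/article/1.html">A sample article title</a></h2>
    <div class="info-item">
      <span class="author">veelion</span>
      <span class="time">2019-01-01</span>
      <span class="view">views 1024</span>
      <span class="cmt">comments 8</span>
    </div>
  </li>
</ul>
'''

def test_parse_offline():
    doc = lxml.html.fromstring(FIXTURE_HTML)
    li = doc.xpath('//ul[@id="postlist"]/li')[0]
    item = parse(li)  # parse() is defined above in this file
    pprint(item)
    assert item['link'] == '/article/1.html'
    assert item['view_count'] == 1024
    assert item['cmt_count'] == 8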
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /selenium-login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: veelion 3 | 4 | import time 5 | import pickle 6 | import requests 7 | from selenium import webdriver 8 | from selenium.webdriver.common.keys import Keys 9 | 10 | 11 | def save_cookies(cookies, file_to_save): 12 | with open(file_to_save, 'wb') as f: 13 | pickle.dump(cookies, f) 14 | 15 | 16 | def login_auto(login_url, username, password, 17 | username_xpath, password_xpath, 18 | submit_xpath, cookies_file, browser=None): 19 | if browser is None: 20 | options = webdriver.ChromeOptions() 21 | options.add_argument('headless') 22 | options.add_argument('window-size=1200x600') 23 | browser = webdriver.Chrome(chrome_options=options) 24 | browser.maximize_window() 25 | browser.get(login_url) 26 | time.sleep(9) # 等登录加载完成 27 | browser.find_element_by_xpath(username_xpath).send_keys(username) 28 | browser.find_element_by_xpath(password_xpath).send_keys(password) 29 | browser.find_element_by_xpath(submit_xpath).send_keys(Keys.ENTER) 30 | time.sleep(9) # 等登录加载完成 31 | cookies = browser.get_cookies() 32 | print(cookies) 33 | save_cookies(cookies, cookies_file) 34 | 35 | 36 | def login_manually(login_url, cookies_file, browser=None): 37 | # 既然是手动,这里就不自动填写用户名和密码了 38 | if browser is None: 39 | browser = webdriver.Chrome() 40 | browser.get(login_url) 41 | time.sleep(30) # 给自己多了点时间输入用户名、密码、验证码 42 | cookies = browser.get_cookies() 43 | print(cookies) 44 | save_cookies(cookies, cookies_file) 45 | 46 | 47 | def load_to_browser(cookies_file, 
browser=None): 48 | with open(cookies_file, 'rb') as f: 49 | cookies = pickle.load(f) 50 | if browser is None: 51 | browser = webdriver.Chrome() 52 | for cookie in cookies: 53 | browser.add_cookie(cookie) 54 | return browser 55 | 56 | 57 | def load_to_requests(cookies_file, session=None): 58 | with open(cookies_file, 'rb') as f: 59 | cookies = pickle.load(f) 60 | if session is None: 61 | session = requests.Session() 62 | for cookie in cookies: 63 | session.cookies.set(cookie['name'], cookie['value']) 64 | 65 | 66 | if __name__ == '__main__': 67 | from sys import argv 68 | if argv[1] == 'manually': 69 | # login_url = 'https://passport.bilibili.com/login' 70 | login_url = 'https://www.zhihu.com/signin' 71 | login_manually(login_url, 'z-.cookies') 72 | elif argv[1] == 'auto': 73 | login_url = 'https://weibo.com/' 74 | username_xpath = '//input[@id="loginname"]' 75 | password_xpath = '//input[@name="password"]' 76 | submit_xpath = '//a[@action-type="btn_submit"]' 77 | username = 'your-username' 78 | password = 'your-password' 79 | login_auto(login_url, username, password, username_xpath, password_xpath, submit_xpath, 'z-weibo.cookies') 80 | else: 81 | print('invalid option') 82 | 83 | -------------------------------------------------------------------------------- /news-crawler/news-crawler-sync.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | import urllib.parse as urlparse 5 | import lzma 6 | import farmhash 7 | import traceback 8 | 9 | 10 | from ezpymysql import Connection 11 | from urlpool import UrlPool 12 | import functions as fn 13 | import config 14 | 15 | class NewsCrawlerSync: 16 | def __init__(self, name): 17 | self.db = Connection( 18 | config.db_host, 19 | config.db_db, 20 | config.db_user, 21 | config.db_password 22 | ) 23 | self.logger = fn.init_file_logger(name + '.log') 24 | self.urlpool = UrlPool(name) 25 | self.hub_hosts = None 26 | self.load_hubs() 27 | 28 | def load_hubs(self,): 29 | sql = 'select url from crawler_hub' 30 | data = self.db.query(sql) 31 | self.hub_hosts = set() 32 | hubs = [] 33 | for d in data: 34 | host = urlparse.urlparse(d['url']).netloc 35 | self.hub_hosts.add(host) 36 | hubs.append(d['url']) 37 | self.urlpool.set_hubs(hubs, 300) 38 | 39 | def save_to_db(self, url, html): 40 | urlhash = farmhash.hash64(url) 41 | sql = 'select url from crawler_html where urlhash=%s' 42 | d = self.db.get(sql, urlhash) 43 | if d: 44 | if d['url'] != url: 45 | msg = 'farmhash collision: %s <=> %s' % (url, d['url']) 46 | self.logger.error(msg) 47 | return True 48 | if isinstance(html, str): 49 | html = html.encode('utf8') 50 | html_lzma = lzma.compress(html) 51 | sql = ('insert into crawler_html(urlhash, url, html_lzma) ' 52 | 'values(%s, %s, %s)') 53 | good = False 54 | try: 55 | self.db.execute(sql, urlhash, url, html_lzma) 56 | good = True 57 | except Exception as e: 58 | if e.args[0] == 1062: 59 | # Duplicate entry 60 | good = True 61 | pass 62 | else: 63 | traceback.print_exc() 64 | raise e 65 | return good 66 | 67 | def filter_good(self, urls): 68 | goodlinks = [] 69 | for url in urls: 70 | host = urlparse.urlparse(url).netloc 71 | if host in self.hub_hosts: 72 | goodlinks.append(url) 73 | return goodlinks 74 | 75 | def process(self, url, ishub): 76 | status, html, redirected_url = fn.downloader(url) 77 | self.urlpool.set_status(url, status) 78 | if redirected_url != url: 79 | self.urlpool.set_status(redirected_url, status) 80 | # 提取hub网页中的链接, 新闻网页中也有“相关新闻”的链接,按需提取 81 | if 
status != 200: 82 | return 83 | if ishub: 84 | newlinks = fn.extract_links_re(redirected_url, html) 85 | goodlinks = self.filter_good(newlinks) 86 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks))) 87 | self.urlpool.addmany(goodlinks) 88 | else: 89 | self.save_to_db(redirected_url, html) 90 | 91 | def run(self,): 92 | while 1: 93 | urls = self.urlpool.pop(5) 94 | for url, ishub in urls.items(): 95 | self.process(url, ishub) 96 | 97 | 98 | if __name__ == '__main__': 99 | crawler = NewsCrawlerSync('yuanrenxyue') 100 | crawler.run() 101 | -------------------------------------------------------------------------------- /news-crawler/news-crawler-async.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # File: news-crawler-async.py 3 | # Author: veelion 4 | 5 | import traceback 6 | import time 7 | import asyncio 8 | import aiohttp 9 | import urllib.parse as urlparse 10 | import farmhash 11 | import lzma 12 | 13 | import uvloop 14 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 15 | 16 | import sanicdb 17 | 18 | from urlpool import UrlPool 19 | import functions as fn 20 | import config 21 | 22 | 23 | class NewsCrawlerAsync: 24 | def __init__(self, name): 25 | self._workers = 0 26 | self._workers_max = 30 27 | self.logger = fn.init_file_logger(name+ '.log') 28 | 29 | self.urlpool = UrlPool(name) 30 | 31 | self.loop = asyncio.get_event_loop() 32 | self.session = aiohttp.ClientSession(loop=self.loop) 33 | self.db = sanicdb.SanicDB( 34 | config.db_host, 35 | config.db_db, 36 | config.db_user, 37 | config.db_password, 38 | loop=self.loop 39 | ) 40 | 41 | async def load_hubs(self,): 42 | sql = 'select url from crawler_hub' 43 | data = await self.db.query(sql) 44 | self.hub_hosts = set() 45 | hubs = [] 46 | for d in data: 47 | host = urlparse.urlparse(d['url']).netloc 48 | self.hub_hosts.add(host) 49 | hubs.append(d['url']) 50 | self.urlpool.set_hubs(hubs, 300) 51 | 52 | async def save_to_db(self, url, html): 53 | urlhash = farmhash.hash64(url) 54 | sql = 'select url from crawler_html where urlhash=%s' 55 | d = await self.db.get(sql, urlhash) 56 | if d: 57 | if d['url'] != url: 58 | msg = 'farmhash collision: %s <=> %s' % (url, d['url']) 59 | self.logger.error(msg) 60 | return True 61 | if isinstance(html, str): 62 | html = html.encode('utf8') 63 | html_lzma = lzma.compress(html) 64 | sql = ('insert into crawler_html(urlhash, url, html_lzma) ' 65 | 'values(%s, %s, %s)') 66 | good = False 67 | try: 68 | await self.db.execute(sql, urlhash, url, html_lzma) 69 | good = True 70 | except Exception as e: 71 | if e.args[0] == 1062: 72 | # Duplicate entry 73 | good = True 74 | pass 75 | else: 76 | traceback.print_exc() 77 | raise e 78 | return good 79 | 80 | def filter_good(self, urls): 81 | goodlinks = [] 82 | for url in urls: 83 | host = urlparse.urlparse(url).netloc 84 | if host in self.hub_hosts: 85 | goodlinks.append(url) 86 | return goodlinks 87 | 88 | async def process(self, url, ishub): 89 | status, html, redirected_url = await fn.fetch(self.session, url) 90 | self.urlpool.set_status(url, status) 91 | if redirected_url != url: 92 | self.urlpool.set_status(redirected_url, status) 93 | # 提取hub网页中的链接, 新闻网页中也有“相关新闻”的链接,按需提取 94 | if status != 200: 95 | self._workers -= 1 96 | return 97 | if ishub: 98 | newlinks = fn.extract_links_re(redirected_url, html) 99 | goodlinks = self.filter_good(newlinks) 100 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks))) 101 | self.urlpool.addmany(goodlinks) 102 | 
else: 103 | await self.save_to_db(redirected_url, html) 104 | self._workers -= 1 105 | 106 | async def loop_crawl(self,): 107 | await self.load_hubs() 108 | last_rating_time = time.time() 109 | counter = 0 110 | while 1: 111 | tasks = self.urlpool.pop(self._workers_max) 112 | if not tasks: 113 | print('no url to crawl, sleep') 114 | await asyncio.sleep(3) 115 | continue 116 | for url, ishub in tasks.items(): 117 | self._workers += 1 118 | counter += 1 119 | print('crawl:', url) 120 | asyncio.ensure_future(self.process(url, ishub)) 121 | 122 | gap = time.time() - last_rating_time 123 | if gap > 5: 124 | rate = counter / gap 125 | print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (round(rate, 2), counter, self._workers)) 126 | last_rating_time = time.time() 127 | counter = 0 128 | if self._workers > self._workers_max: 129 | print('====== got workers_max, sleep 3 sec to next worker =====') 130 | await asyncio.sleep(3) 131 | 132 | def run(self): 133 | try: 134 | self.loop.run_until_complete(self.loop_crawl()) 135 | except KeyboardInterrupt: 136 | print('stopped by yourself!') 137 | del self.urlpool 138 | pass 139 | 140 | 141 | 142 | if __name__ == '__main__': 143 | nc = NewsCrawlerAsync('yrx-async') 144 | nc.run() 145 | 146 | 147 | -------------------------------------------------------------------------------- /news-crawler/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import urllib.parse as urlparse 7 | import requests 8 | import cchardet 9 | import traceback 10 | 11 | 12 | async def fetch(session, url, headers=None, timeout=9, binary=False): 13 | _headers = { 14 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 15 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'), 16 | } 17 | if headers: 18 | _headers = headers 19 | try: 20 | async with session.get(url, headers=_headers, timeout=timeout) as response: 21 | status = response.status 22 | html = await response.read() 23 | if not binary: 24 | encoding = cchardet.detect(html)['encoding'] 25 | html = html.decode(encoding, errors='ignore') 26 | redirected_url = str(response.url) 27 | except Exception as e: 28 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e)) 29 | print(msg) 30 | html = '' 31 | status = 0 32 | redirected_url = url 33 | return status, html, redirected_url 34 | 35 | 36 | def downloader(url, timeout=10, headers=None, debug=False, binary=False): 37 | _headers = { 38 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 39 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'), 40 | } 41 | redirected_url = url 42 | if headers: 43 | _headers = headers 44 | try: 45 | r = requests.get(url, headers=_headers, timeout=timeout) 46 | if binary: 47 | html = r.content 48 | else: 49 | encoding = cchardet.detect(r.content)['encoding'] 50 | html = r.content.decode(encoding, errors='ignore') 51 | status = r.status_code 52 | redirected_url = r.url 53 | except: 54 | if debug: 55 | traceback.print_exc() 56 | msg = 'failed download: {}'.format(url) 57 | print(msg) 58 | if binary: 59 | html = b'' 60 | else: 61 | html = '' 62 | status = 0 63 | return status, html, redirected_url 64 | 65 | 66 | g_bin_postfix = set([ 67 | 'exe', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 68 | 'pdf', 69 | 'jpg', 'png', 'bmp', 'jpeg', 'gif', 70 | 'zip', 'rar', 'tar', 'bz2', '7z', 'gz', 71 | 'flv', 'mp4', 'avi', 'wmv', 'mkv', 72 | 'apk', 73 | ]) 74 | 75 | g_news_postfix = [ 76 | '.html?', '.htm?', '.shtml?', 77 | 
'.shtm?', 78 | ] 79 | 80 | 81 | def clean_url(url): 82 | # 1. check whether it is a valid http(s) URL 83 | if not url.startswith('http'): 84 | return '' 85 | # 2. strip the query string from static-looking (.html/.htm/.shtml) URLs 86 | for np in g_news_postfix: 87 | p = url.find(np) 88 | if p > -1: 89 | p = url.find('?') 90 | url = url[:p] 91 | return url 92 | # 3. skip links that point to binary content 93 | up = urlparse.urlparse(url) 94 | path = up.path 95 | if not path: 96 | path = '/' 97 | postfix = path.split('.')[-1].lower() 98 | if postfix in g_bin_postfix: 99 | return '' 100 | 101 | # 4. drop traffic-source tracking parameters (spm, utm_*) 102 | # badquery = ['spm', 'utm_source', 'utm_source', 'utm_medium', 'utm_campaign'] 103 | good_queries = [] 104 | for query in up.query.split('&'): 105 | qv = query.split('=') 106 | if qv[0].startswith('spm') or qv[0].startswith('utm_'): 107 | continue 108 | if len(qv) == 1: 109 | continue 110 | good_queries.append(query) 111 | query = '&'.join(good_queries) 112 | url = urlparse.urlunparse(( 113 | up.scheme, 114 | up.netloc, 115 | path, 116 | up.params, 117 | query, 118 | '' # the crawler does not care about the fragment 119 | )) 120 | return url 121 | 122 | 123 | g_pattern_tag_a = re.compile(r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>', re.I|re.S|re.M) 124 | 125 | 126 | def extract_links_re(url, html): 127 | '''use re module to extract links from html''' 128 | newlinks = set() 129 | aa = g_pattern_tag_a.findall(html) 130 | for a in aa: 131 | link = a[0].strip() 132 | if not link: 133 | continue 134 | link = urlparse.urljoin(url, link) 135 | link = clean_url(link) 136 | if not link: 137 | continue 138 | newlinks.add(link) 139 | return newlinks 140 | 141 | 142 | def init_file_logger(fname): 143 | # config logging 144 | import logging 145 | from logging.handlers import TimedRotatingFileHandler 146 | ch = TimedRotatingFileHandler(fname, when="midnight") 147 | ch.setLevel(logging.INFO) 148 | # create formatter 149 | fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 150 | formatter = logging.Formatter(fmt) 151 | # add formatter to ch 152 | ch.setFormatter(formatter) 153 | logger = logging.getLogger(fname) 154 | # add ch to logger 155 | logger.addHandler(ch) 156 | return logger 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | url = 'http://news.baidu.com/' 162 | s, html, redirected_url = downloader(url) 163 | print(s, len(html)) 164 | -------------------------------------------------------------------------------- /weibologin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import pickle 7 | import json 8 | import base64 9 | import binascii 10 | import rsa 11 | import requests 12 | import urllib 13 | import time 14 | import traceback 15 | 16 | 17 | 18 | class WeiboLogin: 19 | user_agent = ( 20 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) ' 21 | 'Chrome/20.0.1132.57 Safari/536.11' 22 | ) 23 | 24 | def __init__(self, username, password, cookies_tosave='weibo.cookies'): 25 | self.weibo_user = username 26 | self.weibo_password = password 27 | self.cookies_tosave = cookies_tosave 28 | self.session = requests.session() 29 | self.session.headers['User-Agent'] = self.user_agent 30 | 31 | def encrypt_user(self, username): 32 | user = urllib.parse.quote(username) 33 | su = base64.b64encode(user.encode()) 34 | return su 35 | 36 | def encrypt_passwd(self, passwd, pubkey, servertime, nonce): 37 | key = rsa.PublicKey(int(pubkey, 16), int('10001', 16)) 38 | message = str(servertime) + '\t' + str(nonce) + '\n' + str(passwd) 39 | passwd = rsa.encrypt(message.encode('utf-8'), key) 40 | return 
binascii.b2a_hex(passwd) 41 | 42 | def prelogin(self): 43 | preloginTimeStart = int(time.time()*1000) 44 | url = ('https://login.sina.com.cn/sso/prelogin.php?' 45 | 'entry=weibo&callback=sinaSSOController.preloginCallBack&' 46 | 'su=&rsakt=mod&client=ssologin.js(v1.4.19)&' 47 | '_=%s') % preloginTimeStart 48 | resp = self.session.get(url) 49 | pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1) 50 | pre_login = json.loads(pre_login_str) 51 | pre_login['preloginTimeStart'] = preloginTimeStart 52 | print ('pre_login 1:', pre_login) 53 | return pre_login 54 | 55 | def get_prelt(self, pre_login): 56 | prelt = int(time.time() * 1000) - pre_login['preloginTimeStart'] - pre_login['exectime'] 57 | return prelt 58 | 59 | def login(self): 60 | # step-1. prelogin 61 | pre_login = self.prelogin() 62 | su = self.encrypt_user(self.weibo_user) 63 | sp = self.encrypt_passwd( 64 | self.weibo_password, 65 | pre_login['pubkey'], 66 | pre_login['servertime'], 67 | pre_login['nonce'] 68 | ) 69 | prelt = self.get_prelt(pre_login) 70 | 71 | data = { 72 | 'entry': 'weibo', 73 | 'gateway': 1, 74 | 'from': '', 75 | 'savestate': 7, 76 | 'qrcode_flag': 'false', 77 | 'userticket': 1, 78 | 'pagerefer': '', 79 | 'vsnf': 1, 80 | 'su': su, 81 | 'service': 'miniblog', 82 | 'servertime': pre_login['servertime'], 83 | 'nonce': pre_login['nonce'], 84 | 'vsnf': 1, 85 | 'pwencode': 'rsa2', 86 | 'sp': sp, 87 | 'rsakv' : pre_login['rsakv'], 88 | 'encoding': 'UTF-8', 89 | 'prelt': prelt, 90 | 'sr': "1280*800", 91 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.' 92 | 'sinaSSOController.feedBackUrlCallBack', 93 | 'returntype': 'META' 94 | } 95 | 96 | # step-2 login POST 97 | login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)' 98 | resp = self.session.post(login_url, data=data) 99 | print(resp.headers) 100 | print(resp.content) 101 | print('Step-2 response:', resp.text) 102 | 103 | # step-3 follow redirect 104 | redirect_url = re.findall(r'location\.replace\("(.*?)"', resp.text)[0] 105 | print('Step-3 to redirect:', redirect_url) 106 | resp = self.session.get(redirect_url) 107 | print('Step-3 response:', resp.text) 108 | 109 | # step-4 process step-3's response 110 | arrURL = re.findall(r'"arrURL":(.*?)\}', resp.text)[0] 111 | arrURL = json.loads(arrURL) 112 | print('CrossDomainUrl:', arrURL) 113 | for url in arrURL: 114 | print('set CrossDomainUrl:', url) 115 | resp_cross = self.session.get(url) 116 | print(resp_cross.text) 117 | redirect_url = re.findall(r'location\.replace\(\'(.*?)\'', resp.text)[0] 118 | print('Step-4 redirect_url:', redirect_url) 119 | resp = self.session.get(redirect_url) 120 | print(resp.text) 121 | with open(self.cookies_tosave, 'wb') as f: 122 | pickle.dump(self.session.cookies, f) 123 | return True 124 | 125 | def fetch(self, url): 126 | try: 127 | resp = self.session.get(url, timeout=10) 128 | return resp 129 | except: 130 | traceback.print_exc() 131 | return None 132 | 133 | if __name__ == '__main__': 134 | weibo_user = 'your-weibo-username' 135 | weibo_password = 'your-weibo-password' 136 | wb = WeiboLogin(weibo_user, weibo_password) 137 | wb.login() 138 | r = wb.fetch('https://weibo.com/') 139 | print(r.encoding) 140 | print(len(r.text)) 141 | -------------------------------------------------------------------------------- /news-crawler/bee_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: UTF-8 3 | # author: veelion 4 | # file: bee_client.py 5 | 6 | import 
re 7 | import cchardet 8 | import traceback 9 | import time 10 | import json 11 | import asyncio 12 | import urllib.parse as urlparse 13 | import aiohttp 14 | import uvloop 15 | 16 | 17 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 18 | 19 | 20 | 21 | p_tag_a = re.compile( 22 | r']*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)', 23 | re.I|re.S|re.M) 24 | 25 | 26 | def extract_links_re(url, html): 27 | newlinks = set() 28 | aa = p_tag_a.findall(html) 29 | for a in aa: 30 | link = a[0].strip() 31 | if not link: 32 | continue 33 | link = urlparse.urljoin(url, link) 34 | if not link.startswith('http'): 35 | continue 36 | newlinks.add(link) 37 | return newlinks 38 | 39 | 40 | 41 | class CrawlerClient: 42 | def __init__(self, ): 43 | self._workers = 0 44 | self.workers_max = 10 45 | self.server_host = 'localhost' 46 | self.server_port = 8080 47 | self.headers = {'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 48 | 'Windows NT 6.1; Win64; x64; Trident/5.0)')} 49 | 50 | self.loop = asyncio.get_event_loop() 51 | self.queue = asyncio.Queue(loop=self.loop) 52 | self.session = aiohttp.ClientSession(loop=self.loop) 53 | 54 | async def download(self, url, timeout=25): 55 | status_code = 900 56 | html = '' 57 | url_now = url 58 | try: 59 | async with self.session.get(url_now, headers=self.headers, timeout=timeout) as response: 60 | status_code = response.status 61 | html = await response.read() 62 | encoding = cchardet.detect(html)['encoding'] 63 | html = html.decode(encoding, errors='ignore') 64 | url_now = str(response.url) 65 | except Exception as e: 66 | # traceback.print_exc() 67 | print('=== exception: ', e, type(e), str(e)) 68 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e)) 69 | print(msg) 70 | return status_code, html, url_now 71 | 72 | async def get_urls(self,): 73 | count = self.workers_max - self.queue.qsize() 74 | if count <= 0: 75 | print('no need to get urls this time') 76 | return None 77 | url = 'http://%s:%s/task?count=%s' % ( 78 | self.server_host, 79 | self.server_port, 80 | count 81 | ) 82 | try: 83 | async with self.session.get(url, timeout=3) as response: 84 | if response.status not in [200, 201]: 85 | return 86 | jsn = await response.text() 87 | urls = json.loads(jsn) 88 | msg = ('get_urls() to get [%s] but got[%s], @%s') % ( 89 | count, len(urls), 90 | time.strftime('%Y-%m-%d %H:%M:%S')) 91 | print(msg) 92 | for kv in urls.items(): 93 | await self.queue.put(kv) 94 | print('queue size:', self.queue.qsize(), ', _workers:', self._workers) 95 | except: 96 | traceback.print_exc() 97 | return 98 | 99 | async def send_result(self, result): 100 | url = 'http://%s:%s/task' % ( 101 | self.server_host, 102 | self.server_port 103 | ) 104 | try: 105 | async with self.session.post(url, json=result, timeout=3) as response: 106 | return response.status 107 | except: 108 | traceback.print_exc() 109 | pass 110 | 111 | def save_html(self, url, html): 112 | print('saved:', url, len(html)) 113 | 114 | def filter_good(self, urls): 115 | '''根据抓取目的过滤提取的URLs,只要你想要的''' 116 | good = [] 117 | for url in urls: 118 | if url.startswith('http'): 119 | good.append(url) 120 | return good 121 | 122 | async def process(self, url, ishub): 123 | status, html, url_now = await self.download(url) 124 | self._workers -= 1 125 | print('downloaded:', url, ', html:', len(html)) 126 | if html: 127 | newurls = extract_links_re(url, html) 128 | newurls = self.filter_good(newurls) 129 | self.save_html(url, html) 130 | else: 131 | newurls = [] 132 | result = { 133 | 'url': url, 134 | 
'url_real': url_now, 135 | 'status': status, 136 | 'newurls': newurls, 137 | } 138 | await self.send_result(result) 139 | 140 | async def loop_get_urls(self,): 141 | print('loop_get_urls() start') 142 | while 1: 143 | await self.get_urls() 144 | await asyncio.sleep(1) 145 | 146 | async def loop_crawl(self,): 147 | print('loop_crawl() start') 148 | asyncio.ensure_future(self.loop_get_urls()) 149 | counter = 0 150 | while 1: 151 | item = await self.queue.get() 152 | url, url_level = item 153 | self._workers += 1 154 | counter += 1 155 | asyncio.ensure_future(self.process(url, url_level)) 156 | 157 | if self._workers > self.workers_max: 158 | print('====== got workers_max, sleep 3 sec to next worker =====') 159 | await asyncio.sleep(3) 160 | 161 | def start(self): 162 | try: 163 | self.loop.run_until_complete(self.loop_crawl()) 164 | except KeyboardInterrupt: 165 | print('stopped by yourself!') 166 | pass 167 | 168 | 169 | def run(): 170 | ant = CrawlerClient() 171 | ant.start() 172 | 173 | 174 | if __name__ == '__main__': 175 | run() 176 | 177 | -------------------------------------------------------------------------------- /news-crawler/ezpymysql.py: -------------------------------------------------------------------------------- 1 | #file: ezpymysql.py 2 | #Author: veelion 3 | 4 | """A lightweight wrapper around PyMySQL. 5 | only for python3 6 | 7 | """ 8 | 9 | import time 10 | import logging 11 | import traceback 12 | import pymysql 13 | import pymysql.cursors 14 | 15 | version = "0.7" 16 | version_info = (0, 7, 0, 0) 17 | 18 | 19 | class Connection(object): 20 | """A lightweight wrapper around PyMySQL. 21 | """ 22 | def __init__(self, host, database, user=None, password=None, 23 | port=0, 24 | max_idle_time=7 * 3600, connect_timeout=10, 25 | time_zone="+0:00", charset = "utf8mb4", sql_mode="TRADITIONAL"): 26 | self.host = host 27 | self.database = database 28 | self.max_idle_time = float(max_idle_time) 29 | 30 | args = dict(use_unicode=True, charset=charset, 31 | database=database, 32 | init_command=('SET time_zone = "%s"' % time_zone), 33 | cursorclass=pymysql.cursors.DictCursor, 34 | connect_timeout=connect_timeout, sql_mode=sql_mode) 35 | if user is not None: 36 | args["user"] = user 37 | if password is not None: 38 | args["passwd"] = password 39 | 40 | # We accept a path to a MySQL socket file or a host(:port) string 41 | if "/" in host: 42 | args["unix_socket"] = host 43 | else: 44 | self.socket = None 45 | pair = host.split(":") 46 | if len(pair) == 2: 47 | args["host"] = pair[0] 48 | args["port"] = int(pair[1]) 49 | else: 50 | args["host"] = host 51 | args["port"] = 3306 52 | if port: 53 | args['port'] = port 54 | 55 | self._db = None 56 | self._db_args = args 57 | self._last_use_time = time.time() 58 | try: 59 | self.reconnect() 60 | except Exception: 61 | logging.error("Cannot connect to MySQL on %s", self.host, 62 | exc_info=True) 63 | 64 | def _ensure_connected(self): 65 | # Mysql by default closes client connections that are idle for 66 | # 8 hours, but the client library does not report this fact until 67 | # you try to perform a query and it fails. Protect against this 68 | # case by preemptively closing and reopening the connection 69 | # if it has been idle for too long (7 hours by default). 
70 | if (self._db is None or 71 | (time.time() - self._last_use_time > self.max_idle_time)): 72 | self.reconnect() 73 | self._last_use_time = time.time() 74 | 75 | def _cursor(self): 76 | self._ensure_connected() 77 | return self._db.cursor() 78 | 79 | def __del__(self): 80 | self.close() 81 | 82 | def close(self): 83 | """Closes this database connection.""" 84 | if getattr(self, "_db", None) is not None: 85 | self._db.close() 86 | self._db = None 87 | 88 | def reconnect(self): 89 | """Closes the existing database connection and re-opens it.""" 90 | self.close() 91 | self._db = pymysql.connect(**self._db_args) 92 | self._db.autocommit(True) 93 | 94 | def query(self, query, *parameters, **kwparameters): 95 | """Returns a row list for the given query and parameters.""" 96 | cursor = self._cursor() 97 | try: 98 | cursor.execute(query, kwparameters or parameters) 99 | result = cursor.fetchall() 100 | return result 101 | finally: 102 | cursor.close() 103 | 104 | def get(self, query, *parameters, **kwparameters): 105 | """Returns the (singular) row returned by the given query. 106 | """ 107 | cursor = self._cursor() 108 | try: 109 | cursor.execute(query, kwparameters or parameters) 110 | return cursor.fetchone() 111 | finally: 112 | cursor.close() 113 | 114 | def execute(self, query, *parameters, **kwparameters): 115 | """Executes the given query, returning the lastrowid from the query.""" 116 | cursor = self._cursor() 117 | try: 118 | cursor.execute(query, kwparameters or parameters) 119 | return cursor.lastrowid 120 | except Exception as e: 121 | if e.args[0] == 1062: 122 | pass 123 | else: 124 | traceback.print_exc() 125 | raise e 126 | finally: 127 | cursor.close() 128 | 129 | insert = execute 130 | 131 | ## =============== high level method for table =================== 132 | 133 | def table_has(self, table_name, field, value): 134 | if isinstance(value, str): 135 | value = value.encode('utf8') 136 | sql = 'SELECT %s FROM %s WHERE %s="%s"' % ( 137 | field, 138 | table_name, 139 | field, 140 | value) 141 | d = self.get(sql) 142 | return d 143 | 144 | def table_insert(self, table_name, item): 145 | '''item is a dict : key is mysql table field''' 146 | fields = list(item.keys()) 147 | values = list(item.values()) 148 | fieldstr = ','.join(fields) 149 | valstr = ','.join(['%s'] * len(item)) 150 | for i in range(len(values)): 151 | if isinstance(values[i], str): 152 | values[i] = values[i].encode('utf8') 153 | sql = 'INSERT INTO %s (%s) VALUES(%s)' % (table_name, fieldstr, valstr) 154 | try: 155 | last_id = self.execute(sql, *values) 156 | return last_id 157 | except Exception as e: 158 | if e.args[0] == 1062: 159 | # just skip duplicated item 160 | pass 161 | else: 162 | traceback.print_exc() 163 | print('sql:', sql) 164 | print('item:') 165 | for i in range(len(fields)): 166 | vs = str(values[i]) 167 | if len(vs) > 300: 168 | print(fields[i], ' : ', len(vs), type(values[i])) 169 | else: 170 | print(fields[i], ' : ', vs, type(values[i])) 171 | raise e 172 | 173 | def table_update(self, table_name, updates, 174 | field_where, value_where): 175 | '''updates is a dict of {field_update:value_update}''' 176 | upsets = [] 177 | values = [] 178 | for k, v in updates.items(): 179 | s = '%s=%%s' % k 180 | upsets.append(s) 181 | values.append(v) 182 | upsets = ','.join(upsets) 183 | sql = 'UPDATE %s SET %s WHERE %s="%s"' % ( 184 | table_name, 185 | upsets, 186 | field_where, value_where, 187 | ) 188 | self.execute(sql, *(values)) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | db = Connection( 
194 | 'localhost', 195 | 'db_name', 196 | 'user', 197 | 'password' 198 | ) 199 | # 获取一条记录 200 | sql = 'select * from test_table where id=%s' 201 | data = db.get(sql, 2) 202 | 203 | # 获取多天记录 204 | sql = 'select * from test_table where id>%s' 205 | data = db.query(sql, 2) 206 | 207 | # 插入一条数据 208 | sql = 'insert into test_table(title, url) values(%s, %s)' 209 | last_id = db.execute(sql, 'test', 'http://a.com/') 210 | # 或者 211 | last_id = db.insert(sql, 'test', 'http://a.com/') 212 | 213 | 214 | # 使用更高级的方法插入一条数据 215 | item = { 216 | 'title': 'test', 217 | 'url': 'http://a.com/', 218 | } 219 | last_id = db.table_insert('test_table', item) 220 | 221 | -------------------------------------------------------------------------------- /news-crawler/urlpool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: veelion 3 | 4 | """ 5 | URL Pool for crawler to manage URLs 6 | """ 7 | 8 | import pickle 9 | import leveldb 10 | import time 11 | import urllib.parse as urlparse 12 | 13 | 14 | RED = '\x1b[31m' 15 | GRE = '\x1b[32m' 16 | BRO = '\x1b[33m' 17 | BLU = '\x1b[34m' 18 | PUR = '\x1b[35m' 19 | CYA = '\x1b[36m' 20 | WHI = '\x1b[37m' 21 | NOR = '\x1b[0m' 22 | 23 | 24 | class UrlDB: 25 | '''Use LevelDB to store URLs what have been done(succeed or faile) 26 | ''' 27 | status_failure = b'0' 28 | status_success = b'1' 29 | 30 | def __init__(self, db_name): 31 | self.name = db_name + '.urldb' 32 | self.db = leveldb.LevelDB(self.name) 33 | 34 | def set_success(self, url): 35 | if isinstance(url, str): 36 | url = url.encode('utf8') 37 | try: 38 | self.db.Put(url, self.status_success) 39 | s = True 40 | except: 41 | s = False 42 | return s 43 | 44 | def set_failure(self, url): 45 | if isinstance(url, str): 46 | url = url.encode('utf8') 47 | try: 48 | self.db.Put(url, self.status_failure) 49 | s = True 50 | except: 51 | s = False 52 | return s 53 | 54 | def has(self, url): 55 | if isinstance(url, str): 56 | url = url.encode('utf8') 57 | try: 58 | attr = self.db.Get(url) 59 | return attr 60 | except: 61 | pass 62 | return False 63 | 64 | 65 | class UrlPool: 66 | '''URL Pool for crawler to manage URLs 67 | ''' 68 | 69 | def __init__(self, pool_name): 70 | self.name = pool_name 71 | self.db = UrlDB(pool_name) 72 | 73 | self.waiting = {} # {host: set([urls]), } 按host分组,记录等待下载的URL 74 | self.pending = {} # {url: pended_time, } 记录已被取出(self.pop())但还未被更新状态(正在下载)的URL 75 | self.failure = {} # {url: times,} 记录失败的URL的次数 76 | self.failure_threshold = 3 77 | self.pending_threshold = 10 # pending的最大时间,过期要重新下载 78 | self.waiting_count = 0 # self.waiting 字典里面的url的个数 79 | self.max_hosts = ['', 0] # [host: url_count] 目前pool中url最多的host及其url数量 80 | self.hub_pool = {} # {url: last_query_time, } 存放hub url 81 | self.hub_refresh_span = 0 82 | self.load_cache() 83 | 84 | def __del__(self): 85 | self.dump_cache() 86 | 87 | def load_cache(self,): 88 | path = self.name + '.pkl' 89 | try: 90 | with open(path, 'rb') as f: 91 | self.waiting = pickle.load(f) 92 | cc = [len(v) for k, v in self.waiting.items()] 93 | print('saved pool loaded! 
urls:', sum(cc)) 94 | except: 95 | pass 96 | 97 | def dump_cache(self): 98 | path = self.name + '.pkl' 99 | try: 100 | with open(path, 'wb') as f: 101 | pickle.dump(self.waiting, f) 102 | print('self.waiting saved!') 103 | except: 104 | pass 105 | 106 | def set_hubs(self, urls, hub_refresh_span): 107 | self.hub_refresh_span = hub_refresh_span 108 | self.hub_pool = {} 109 | for url in urls: 110 | self.hub_pool[url] = 0 111 | 112 | def set_status(self, url, status_code): 113 | if url in self.pending: 114 | self.pending.pop(url) 115 | 116 | if status_code == 200: 117 | self.db.set_success(url) 118 | return 119 | if status_code == 404: 120 | self.db.set_failure(url) 121 | return 122 | if url in self.failure: 123 | self.failure[url] += 1 124 | if self.failure[url] > self.failure_threshold: 125 | self.db.set_failure(url) 126 | self.failure.pop(url) 127 | else: 128 | self.add(url) 129 | else: 130 | self.failure[url] = 1 131 | self.add(url) 132 | 133 | def push_to_pool(self, url): 134 | host = urlparse.urlparse(url).netloc 135 | if not host or '.' not in host: 136 | print('try to push_to_pool with bad url:', url, ', len of ur:', len(url)) 137 | return False 138 | if host in self.waiting: 139 | if url in self.waiting[host]: 140 | return True 141 | self.waiting[host].add(url) 142 | if len(self.waiting[host]) > self.max_hosts[1]: 143 | self.max_hosts[1] = len(self.waiting[host]) 144 | self.max_hosts[0] = host 145 | else: 146 | self.waiting[host] = set([url]) 147 | self.waiting_count += 1 148 | return True 149 | 150 | def add(self, url, always=False): 151 | if always: 152 | return self.push_to_pool(url) 153 | pended_time = self.pending.get(url, 0) 154 | if time.time() - pended_time < self.pending_threshold: 155 | print('being downloading:', url) 156 | return 157 | if self.db.has(url): 158 | return 159 | if pended_time: 160 | self.pending.pop(url) 161 | return self.push_to_pool(url) 162 | 163 | def addmany(self, urls, always=False): 164 | if isinstance(urls, str): 165 | print('urls is a str !!!!', urls) 166 | self.add(urls, always) 167 | else: 168 | for url in urls: 169 | self.add(url, always) 170 | 171 | def pop(self, count, hub_percent=50): 172 | print('\n\tmax of host:', self.max_hosts) 173 | 174 | # 取出的url有两种类型:hub=1, 普通=0 175 | url_attr_url = 0 176 | url_attr_hub = 1 177 | # 1. 首先取出hub,保证获取hub里面的最新url. 178 | hubs = {} 179 | hub_count = count * hub_percent // 100 180 | for hub in self.hub_pool: 181 | span = time.time() - self.hub_pool[hub] 182 | if span < self.hub_refresh_span: 183 | continue 184 | hubs[hub] = url_attr_hub # 1 means hub-url 185 | self.hub_pool[hub] = time.time() 186 | if len(hubs) >= hub_count: 187 | break 188 | 189 | # 2. 
再取出普通url 190 | left_count = count - len(hubs) 191 | urls = {} 192 | for host in self.waiting: 193 | if not self.waiting[host]: 194 | continue 195 | url = self.waiting[host].pop() 196 | urls[url] = url_attr_url 197 | self.pending[url] = time.time() 198 | if self.max_hosts[0] == host: 199 | self.max_hosts[1] -= 1 200 | if len(urls) >= left_count: 201 | break 202 | self.waiting_count -= len(urls) 203 | print('To pop:%s, hubs: %s, urls: %s, hosts:%s' % (count, len(hubs), len(urls), len(self.waiting))) 204 | urls.update(hubs) 205 | return urls 206 | 207 | def size(self,): 208 | return self.waiting_count 209 | 210 | def empty(self,): 211 | return self.waiting_count == 0 212 | 213 | 214 | def test(): 215 | pool = UrlPool('crawl_urlpool') 216 | urls = [ 217 | 'http://1.a.cn/xyz', 218 | 'http://2.a.cn/xyz', 219 | 'http://3.a.cn/xyz', 220 | 'http://1.b.cn/xyz-1', 221 | 'http://1.b.cn/xyz-2', 222 | 'http://1.b.cn/xyz-3', 223 | 'http://1.b.cn/xyz-4', 224 | ] 225 | pool.addmany(urls) 226 | del pool 227 | 228 | pool = UrlPool('crawl_urlpool') 229 | urls = pool.pop(5) 230 | urls = list(urls.keys()) 231 | print('pop:', urls) 232 | print('pending:', pool.pending) 233 | 234 | pool.set_status(urls[0], 200) 235 | print('pending:', pool.pending) 236 | pool.set_status(urls[1], 404) 237 | print('pending:', pool.pending) 238 | 239 | 240 | if __name__ == '__main__': 241 | test() 242 | -------------------------------------------------------------------------------- /news-crawler/maincontent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #File: maincontent.py 3 | #Author: veelion 4 | 5 | import re 6 | import time 7 | import traceback 8 | 9 | import cchardet 10 | import lxml 11 | import lxml.html 12 | from lxml.html import HtmlComment 13 | 14 | REGEXES = { 15 | 'positiveRe': re.compile( 16 | ('article|arti|body|content|entry|hentry|main|page|' 17 | 'artical|zoom|arti|context|message|editor|' 18 | 'pagination|post|txt|text|blog|story'), re.I), 19 | 'negativeRe': re.compile( 20 | ('copyright|combx|comment|com-|contact|foot|footer|footnote|decl|copy|' 21 | 'notice|' 22 | 'masthead|media|meta|outbrain|promo|related|scroll|link|pagebottom|bottom|' 23 | 'other|shoutbox|sidebar|sponsor|shopping|tags|tool|widget'), re.I), 24 | } 25 | 26 | 27 | class MainContent: 28 | def __init__(self,): 29 | self.non_content_tag = set([ 30 | 'head', 31 | 'meta', 32 | 'script', 33 | 'style', 34 | 'object', 'embed', 35 | 'iframe', 36 | 'marquee', 37 | 'select', 38 | ]) 39 | self.title = '' 40 | self.p_space = re.compile(r'\s') 41 | self.p_content_stop = re.compile(r'正文.*结束|正文下|相关阅读|声明') 42 | self.p_clean_tree = re.compile(r'author|post-add|copyright') 43 | 44 | def get_title(self, doc): 45 | title = '' 46 | title_el = doc.xpath('//title') 47 | if title_el: 48 | title = title_el[0].text_content().strip() 49 | if len(title) < 7: 50 | tt = doc.xpath('//meta[@name="title"]') 51 | if tt: 52 | title = tt[0].get('content', '') 53 | if len(title) < 7: 54 | tt = doc.xpath('//*[contains(@id, "title") or contains(@class, "title")]') 55 | if not tt: 56 | tt = doc.xpath('//*[contains(@id, "font01") or contains(@class, "font01")]') 57 | for t in tt: 58 | ti = t.text_content().strip() 59 | if ti in title and len(ti)*2 > len(title): 60 | title = ti 61 | break 62 | if len(ti) > 20: continue 63 | if len(ti) > len(title) or len(ti) > 7: 64 | title = ti 65 | return title 66 | 67 | def clean_title(self, title): 68 | spliters = [' - ', '–', '—', '-', '|', '::'] 69 | for s in spliters: 70 | if s 
not in title: 71 | continue 72 | tts = title.split(s) 73 | if len(tts) < 2: 74 | continue 75 | title = tts[0] 76 | break 77 | return title 78 | 79 | def calc_node_weight(self, node): 80 | weight = 1 81 | attr = '%s %s %s' % ( 82 | node.get('class', ''), 83 | node.get('id', ''), 84 | node.get('style', '') 85 | ) 86 | if attr: 87 | mm = REGEXES['negativeRe'].findall(attr) 88 | weight -= 2 * len(mm) 89 | mm = REGEXES['positiveRe'].findall(attr) 90 | weight += 4 * len(mm) 91 | if node.tag in ['div', 'p', 'table']: 92 | weight += 2 93 | return weight 94 | 95 | def get_main_block(self, url, html, clean_title=True): 96 | ''' return (title, etree_of_main_content_block) 97 | ''' 98 | if isinstance(html, bytes): 99 | encoding = cchardet.detect(html)['encoding'] 100 | if encoding is None: 101 | return None, None 102 | html = html.decode(encoding, 'ignore') 103 | try: 104 | doc = lxml.html.fromstring(html) 105 | doc.make_links_absolute(base_url=url) 106 | except : 107 | traceback.print_exc() 108 | return None, None 109 | self.title = self.get_title(doc) 110 | if clean_title: 111 | self.title = self.clean_title(self.title) 112 | body = doc.xpath('//body') 113 | if not body: 114 | return self.title, None 115 | candidates = [] 116 | nodes = body[0].getchildren() 117 | while nodes: 118 | node = nodes.pop(0) 119 | children = node.getchildren() 120 | tlen = 0 121 | for child in children: 122 | if isinstance(child, HtmlComment): 123 | continue 124 | if child.tag in self.non_content_tag: 125 | continue 126 | if child.tag == 'a': 127 | continue 128 | if child.tag == 'textarea': 129 | # FIXME: this tag is only part of content? 130 | continue 131 | attr = '%s%s%s' % (child.get('class', ''), 132 | child.get('id', ''), 133 | child.get('style')) 134 | if 'display' in attr and 'none' in attr: 135 | continue 136 | nodes.append(child) 137 | if child.tag == 'p': 138 | weight = 3 139 | else: 140 | weight = 1 141 | text = '' if not child.text else child.text.strip() 142 | tail = '' if not child.tail else child.tail.strip() 143 | tlen += (len(text) + len(tail)) * weight 144 | if tlen < 10: 145 | continue 146 | weight = self.calc_node_weight(node) 147 | candidates.append((node, tlen*weight)) 148 | if not candidates: 149 | return self.title, None 150 | candidates.sort(key=lambda a: a[1], reverse=True) 151 | good = candidates[0][0] 152 | if good.tag in ['p', 'pre', 'code', 'blockquote']: 153 | for i in range(5): 154 | good = good.getparent() 155 | if good.tag == 'div': 156 | break 157 | good = self.clean_node(good, url) 158 | return self.title, good 159 | 160 | def clean_node(self, tree, url=''): 161 | to_drop = [] 162 | drop_left = False 163 | for node in tree.iterdescendants(): 164 | if drop_left: 165 | to_drop.append(node) 166 | continue 167 | if isinstance(node, HtmlComment): 168 | to_drop.append(node) 169 | if self.p_content_stop.search(node.text): 170 | drop_left = True 171 | continue 172 | if node.tag in self.non_content_tag: 173 | to_drop.append(node) 174 | continue 175 | attr = '%s %s' % ( 176 | node.get('class', ''), 177 | node.get('id', '') 178 | ) 179 | if self.p_clean_tree.search(attr): 180 | to_drop.append(node) 181 | continue 182 | aa = node.xpath('.//a') 183 | if aa: 184 | text_node = len(self.p_space.sub('', node.text_content())) 185 | text_aa = 0 186 | for a in aa: 187 | alen = len(self.p_space.sub('', a.text_content())) 188 | if alen > 5: 189 | text_aa += alen 190 | if text_aa > text_node * 0.4: 191 | to_drop.append(node) 192 | for node in to_drop: 193 | try: 194 | node.drop_tree() 195 | except: 196 | pass 
197 | return tree 198 | 199 | def get_text(self, doc): 200 | lxml.etree.strip_elements(doc, 'script') 201 | lxml.etree.strip_elements(doc, 'style') 202 | for ch in doc.iterdescendants(): 203 | if not isinstance(ch.tag, str): 204 | continue 205 | if ch.tag in ['div', 'h1', 'h2', 'h3', 'p', 'br', 'table', 'tr', 'dl']: 206 | if not ch.tail: 207 | ch.tail = '\n' 208 | else: 209 | ch.tail = '\n' + ch.tail.strip() + '\n' 210 | if ch.tag in ['th', 'td']: 211 | if not ch.text: 212 | ch.text = ' ' 213 | else: 214 | ch.text += ' ' 215 | # if ch.tail: 216 | # ch.tail = ch.tail.strip() 217 | lines = doc.text_content().split('\n') 218 | content = [] 219 | for l in lines: 220 | l = l.strip() 221 | if not l: 222 | continue 223 | content.append(l) 224 | return '\n'.join(content) 225 | 226 | def extract(self, url, html): 227 | '''return (title, content) 228 | ''' 229 | title, node = self.get_main_block(url, html) 230 | if node is None: 231 | print('\tno main block got !!!!!', url) 232 | return title, '' 233 | content = self.get_text(node) 234 | return title, content 235 | 236 | 237 | if __name__ == '__main__': 238 | from sys import argv 239 | f = argv[1] 240 | html = open(f, 'rb').read() 241 | encoding = cchardet.detect(html) 242 | print('encoding:', encoding) 243 | encoding = encoding['encoding'] 244 | html = html.decode(encoding, 'ignore') 245 | mc = MainContent() 246 | b = time.time() 247 | t, c = mc.extract('', html) 248 | e = time.time() 249 | print('title:', t) 250 | print('content:', len(c)) 251 | print('time cost: ', e-b) 252 | title, content = t, c 253 | txt = 'title:%s\ncontent:\n%s\n\n' % ( 254 | title, 255 | content, 256 | ) 257 | open(f+'-content2.txt','w').write(txt) 258 | --------------------------------------------------------------------------------
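To tie the pieces above together, here is a minimal fetch-and-extract sketch that combines downloader() from functions.py with MainContent from maincontent.py. It assumes it is run from inside the news-crawler/ directory (so both modules are importable) and that the news page URL is supplied on the command line; it is an illustrative sketch, not part of the original project.

#!/usr/bin/env python3
# Minimal fetch-and-extract sketch (assumption: executed inside news-crawler/
# so that functions.py and maincontent.py can be imported as modules).
from functions import downloader
from maincontent import MainContent


def fetch_and_extract(url):
    status, html, redirected_url = downloader(url)
    if status != 200 or not html:
        print('download failed:', status, url)
        return None
    title, content = MainContent().extract(redirected_url, html)
    return {'url': redirected_url, 'title': title, 'content': content}


if __name__ == '__main__':
    from sys import argv
    result = fetch_and_extract(argv[1])
    if result:
        print('title:', result['title'])
        print('content length:', len(result['content']))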