├── README.md ├── news-crawler │   ├── config.py │   ├── simple-news-crawler.py │   ├── bee_server.py │   ├── lxml_demo.py │   ├── news-crawler-sync.py │   ├── news-crawler-async.py │   ├── functions.py │   ├── bee_client.py │   ├── ezpymysql.py │   ├── urlpool.py │   └── maincontent.py ├── LICENSE ├── .gitignore ├── selenium-login.py └── weibologin.py /README.md: -------------------------------------------------------------------------------- 1 | # python-crawler 2 | An asynchronous, high-concurrency, distributed crawler framework. 3 | -------------------------------------------------------------------------------- /news-crawler/config.py: -------------------------------------------------------------------------------- 1 | # Author: veelion 2 | 3 | 4 | db_host = 'localhost' 5 | db_db = 'crawler' 6 | db_user = 'your-user' 7 | db_password = 'your-password' 8 | 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 veelion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /news-crawler/simple-news-crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import requests 7 | import tldextract 8 | 9 | 10 | def save_to_db(url, html): 11 | print('%s : %s' % (url, len(html))) 12 | 13 | 14 | def crawl(): 15 | # 1. download baidu news 16 | hub_url = 'http://news.baidu.com/' 17 | html = requests.get(hub_url).text 18 | 19 | # 2. extract news links 20 | ## 2.1 extract all links with 'href' 21 | links = re.findall(r'href=[\'"]?(.*?)[\'"\s]', html) 22 | print('find links:', len(links)) 23 | news_links = [] 24 | ## 2.2 filter out non-news links 25 | for link in links: 26 | if not link.startswith('http'): 27 | continue 28 | tld = tldextract.extract(link) 29 | if tld.domain == 'baidu': 30 | continue 31 | news_links.append(link) 32 | 33 | print('find news links:', len(news_links)) 34 | # 3. 
download news and save to database 35 | for link in news_links: 36 | html = requests.get(link).text 37 | save_to_db(link, html) 38 | print('works done!') 39 | 40 | 41 | if __name__ == '__main__': 42 | crawl() 43 | -------------------------------------------------------------------------------- /news-crawler/bee_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf8 3 | # author: veelion 4 | # file: bee_server.py 5 | 6 | from sanic import Sanic 7 | from sanic import response 8 | 9 | from urlpool import UrlPool 10 | 11 | urlpool = UrlPool(__file__) 12 | 13 | # 初始化urlpool,根据你的需要进行修改 14 | hub_urls = [] 15 | urlpool.set_hubs(hub_urls, 300) 16 | urlpool.add('https://news.sina.com.cn/') 17 | 18 | # init 19 | app = Sanic(__name__) 20 | 21 | 22 | @app.listener('after_server_stop') 23 | async def cache_urlpool(app, loop): 24 | global urlpool 25 | print('caching urlpool after_server_stop') 26 | del urlpool 27 | print('bye!') 28 | 29 | 30 | @app.route('/task') 31 | async def task_get(request): 32 | count = request.args.get('count', 10) 33 | try: 34 | count = int(count) 35 | except: 36 | count = 10 37 | urls = urlpool.pop(count) 38 | return response.json(urls) 39 | 40 | 41 | @app.route('/task', methods=['POST', ]) 42 | async def task_post(request): 43 | result = request.json 44 | urlpool.set_status(result['url'], result['status']) 45 | if result['url_real'] != result['url']: 46 | urlpool.set_status(result['url_real'], result['status']) 47 | if result['newurls']: 48 | print('receive URLs:', len(result['newurls'])) 49 | for url in result['newurls']: 50 | urlpool.add(url) 51 | return response.text('ok') 52 | 53 | 54 | if __name__ == '__main__': 55 | app.run( 56 | host='0.0.0.0', 57 | port=8080, 58 | debug=False, 59 | access_log=False, 60 | workers=1) 61 | 62 | -------------------------------------------------------------------------------- /news-crawler/lxml_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import requests 7 | import lxml.html 8 | from pprint import pprint 9 | 10 | 11 | def parse(li): 12 | item = {} 13 | # class="thumb"的div有两个,第一个是类别链接,第二个是文章链接 14 | thumb = li.xpath('./div[@class="thumb"]/a') 15 | item['cat'] = thumb[0].text 16 | item['link'] = thumb[1].get('href') 17 | 18 | # 获取title 19 | el_title = li.xpath('.//h2[@class="info-tit"]/a')[0] 20 | item['title'] = el_title.text 21 | 22 | el_info = li.xpath('.//div[@class="info-item"]/span') 23 | for span in el_info: 24 | attr = span.get('class') 25 | if attr == 'author': 26 | item['author'] = span.text_content() 27 | elif attr == 'time': 28 | item['time'] = span.text_content() 29 | elif attr == 'view': 30 | digit = re.findall(r'\d+', span.text_content())[0] 31 | item['view_count'] = int(digit) 32 | elif attr == 'cmt': 33 | digit = re.findall(r'\d+', span.text_content())[0] 34 | item['cmt_count'] = int(digit) 35 | return item 36 | 37 | 38 | def main(): 39 | url = 'https://www.yuanrenxue.com/' 40 | headers = {'User-Agent': 'Firefox'} 41 | resp = requests.get(url, headers=headers) 42 | html = resp.content.decode('utf8') 43 | doc = lxml.html.fromstring(html) 44 | xp = '//ul[@id="postlist"]/li' 45 | lis = doc.xpath(xp) 46 | print('lis:', len(lis)) 47 | 48 | articles = [parse(li) for li in lis] 49 | print('articles:', len(articles)) 50 | pprint(articles[0]) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | 
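The parse() function above hard-codes a very specific page structure (ul#postlist > li, two links inside div.thumb, spans inside div.info-item). A quick way to sanity-check it without requesting yuanrenxue.com is to run it against a small hand-written fixture that mirrors those class names; the HTML below is invented purely for illustration and is not real site markup:

# Offline sanity check for parse(); the fixture is made up and only mirrors
# the class/id names that the XPath expressions in this file expect.
import lxml.html
from pprint import pprint

FIXTURE_HTML = '''
<ul id="postlist">
  <li>
    <div class="thumb">
      <a href="/category/spider">spider</a>
      <a href="/article/1.html">thumbnail</a>
    </div>
    <h2 class="info-tit"><a href="/article/1.html">A sample article title</a></h2>
    <div class="info-item">
      <span class="author">veelion</span>
      <span class="time">2019-01-01</span>
      <span class="view">views 1024</span>
      <span class="cmt">comments 8</span>
    </div>
  </li>
</ul>
'''

def test_parse_offline():
    doc = lxml.html.fromstring(FIXTURE_HTML)
    li = doc.xpath('//ul[@id="postlist"]/li')[0]
    item = parse(li)  # parse() is defined above in this file
    pprint(item)
    assert item['link'] == '/article/1.html'
    assert item['view_count'] == 1024
    assert item['cmt_count'] == 8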
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /selenium-login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: veelion 3 | 4 | import time 5 | import pickle 6 | import requests 7 | from selenium import webdriver 8 | from selenium.webdriver.common.keys import Keys 9 | 10 | 11 | def save_cookies(cookies, file_to_save): 12 | with open(file_to_save, 'wb') as f: 13 | pickle.dump(cookies, f) 14 | 15 | 16 | def login_auto(login_url, username, password, 17 | username_xpath, password_xpath, 18 | submit_xpath, cookies_file, browser=None): 19 | if browser is None: 20 | options = webdriver.ChromeOptions() 21 | options.add_argument('headless') 22 | options.add_argument('window-size=1200x600') 23 | browser = webdriver.Chrome(chrome_options=options) 24 | browser.maximize_window() 25 | browser.get(login_url) 26 | time.sleep(9) # 等登录加载完成 27 | browser.find_element_by_xpath(username_xpath).send_keys(username) 28 | browser.find_element_by_xpath(password_xpath).send_keys(password) 29 | browser.find_element_by_xpath(submit_xpath).send_keys(Keys.ENTER) 30 | time.sleep(9) # 等登录加载完成 31 | cookies = browser.get_cookies() 32 | print(cookies) 33 | save_cookies(cookies, cookies_file) 34 | 35 | 36 | def login_manually(login_url, cookies_file, browser=None): 37 | # 既然是手动,这里就不自动填写用户名和密码了 38 | if browser is None: 39 | browser = webdriver.Chrome() 40 | browser.get(login_url) 41 | time.sleep(30) # 给自己多了点时间输入用户名、密码、验证码 42 | cookies = browser.get_cookies() 43 | print(cookies) 44 | save_cookies(cookies, cookies_file) 45 | 46 | 47 | def load_to_browser(cookies_file, 
browser=None): 48 | with open(cookies_file, 'rb') as f: 49 | cookies = pickle.load(f) 50 | if browser is None: 51 | browser = webdriver.Chrome() 52 | for cookie in cookies: 53 | browser.add_cookie(cookie) 54 | return browser 55 | 56 | 57 | def load_to_requests(cookies_file, session=None): 58 | with open(cookies_file, 'rb') as f: 59 | cookies = pickle.load(f) 60 | if session is None: 61 | session = requests.Session() 62 | for cookie in cookies: 63 | session.cookies.set(cookie['name'], cookie['value']) 64 | 65 | 66 | if __name__ == '__main__': 67 | from sys import argv 68 | if argv[1] == 'manually': 69 | # login_url = 'https://passport.bilibili.com/login' 70 | login_url = 'https://www.zhihu.com/signin' 71 | login_manually(login_url, 'z-.cookies') 72 | elif argv[1] == 'auto': 73 | login_url = 'https://weibo.com/' 74 | username_xpath = '//input[@id="loginname"]' 75 | password_xpath = '//input[@name="password"]' 76 | submit_xpath = '//a[@action-type="btn_submit"]' 77 | username = 'your-username' 78 | password = 'your-password' 79 | login_auto(login_url, username, password, username_xpath, password_xpath, submit_xpath, 'z-weibo.cookies') 80 | else: 81 | print('invalid option') 82 | 83 | -------------------------------------------------------------------------------- /news-crawler/news-crawler-sync.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | import urllib.parse as urlparse 5 | import lzma 6 | import farmhash 7 | import traceback 8 | 9 | 10 | from ezpymysql import Connection 11 | from urlpool import UrlPool 12 | import functions as fn 13 | import config 14 | 15 | class NewsCrawlerSync: 16 | def __init__(self, name): 17 | self.db = Connection( 18 | config.db_host, 19 | config.db_db, 20 | config.db_user, 21 | config.db_password 22 | ) 23 | self.logger = fn.init_file_logger(name + '.log') 24 | self.urlpool = UrlPool(name) 25 | self.hub_hosts = None 26 | self.load_hubs() 27 | 28 | def load_hubs(self,): 29 | sql = 'select url from crawler_hub' 30 | data = self.db.query(sql) 31 | self.hub_hosts = set() 32 | hubs = [] 33 | for d in data: 34 | host = urlparse.urlparse(d['url']).netloc 35 | self.hub_hosts.add(host) 36 | hubs.append(d['url']) 37 | self.urlpool.set_hubs(hubs, 300) 38 | 39 | def save_to_db(self, url, html): 40 | urlhash = farmhash.hash64(url) 41 | sql = 'select url from crawler_html where urlhash=%s' 42 | d = self.db.get(sql, urlhash) 43 | if d: 44 | if d['url'] != url: 45 | msg = 'farmhash collision: %s <=> %s' % (url, d['url']) 46 | self.logger.error(msg) 47 | return True 48 | if isinstance(html, str): 49 | html = html.encode('utf8') 50 | html_lzma = lzma.compress(html) 51 | sql = ('insert into crawler_html(urlhash, url, html_lzma) ' 52 | 'values(%s, %s, %s)') 53 | good = False 54 | try: 55 | self.db.execute(sql, urlhash, url, html_lzma) 56 | good = True 57 | except Exception as e: 58 | if e.args[0] == 1062: 59 | # Duplicate entry 60 | good = True 61 | pass 62 | else: 63 | traceback.print_exc() 64 | raise e 65 | return good 66 | 67 | def filter_good(self, urls): 68 | goodlinks = [] 69 | for url in urls: 70 | host = urlparse.urlparse(url).netloc 71 | if host in self.hub_hosts: 72 | goodlinks.append(url) 73 | return goodlinks 74 | 75 | def process(self, url, ishub): 76 | status, html, redirected_url = fn.downloader(url) 77 | self.urlpool.set_status(url, status) 78 | if redirected_url != url: 79 | self.urlpool.set_status(redirected_url, status) 80 | # 提取hub网页中的链接, 新闻网页中也有“相关新闻”的链接,按需提取 81 | if 
status != 200: 82 | return 83 | if ishub: 84 | newlinks = fn.extract_links_re(redirected_url, html) 85 | goodlinks = self.filter_good(newlinks) 86 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks))) 87 | self.urlpool.addmany(goodlinks) 88 | else: 89 | self.save_to_db(redirected_url, html) 90 | 91 | def run(self,): 92 | while 1: 93 | urls = self.urlpool.pop(5) 94 | for url, ishub in urls.items(): 95 | self.process(url, ishub) 96 | 97 | 98 | if __name__ == '__main__': 99 | crawler = NewsCrawlerSync('yuanrenxyue') 100 | crawler.run() 101 | -------------------------------------------------------------------------------- /news-crawler/news-crawler-async.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # File: news-crawler-async.py 3 | # Author: veelion 4 | 5 | import traceback 6 | import time 7 | import asyncio 8 | import aiohttp 9 | import urllib.parse as urlparse 10 | import farmhash 11 | import lzma 12 | 13 | import uvloop 14 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 15 | 16 | import sanicdb 17 | 18 | from urlpool import UrlPool 19 | import functions as fn 20 | import config 21 | 22 | 23 | class NewsCrawlerAsync: 24 | def __init__(self, name): 25 | self._workers = 0 26 | self._workers_max = 30 27 | self.logger = fn.init_file_logger(name+ '.log') 28 | 29 | self.urlpool = UrlPool(name) 30 | 31 | self.loop = asyncio.get_event_loop() 32 | self.session = aiohttp.ClientSession(loop=self.loop) 33 | self.db = sanicdb.SanicDB( 34 | config.db_host, 35 | config.db_db, 36 | config.db_user, 37 | config.db_password, 38 | loop=self.loop 39 | ) 40 | 41 | async def load_hubs(self,): 42 | sql = 'select url from crawler_hub' 43 | data = await self.db.query(sql) 44 | self.hub_hosts = set() 45 | hubs = [] 46 | for d in data: 47 | host = urlparse.urlparse(d['url']).netloc 48 | self.hub_hosts.add(host) 49 | hubs.append(d['url']) 50 | self.urlpool.set_hubs(hubs, 300) 51 | 52 | async def save_to_db(self, url, html): 53 | urlhash = farmhash.hash64(url) 54 | sql = 'select url from crawler_html where urlhash=%s' 55 | d = await self.db.get(sql, urlhash) 56 | if d: 57 | if d['url'] != url: 58 | msg = 'farmhash collision: %s <=> %s' % (url, d['url']) 59 | self.logger.error(msg) 60 | return True 61 | if isinstance(html, str): 62 | html = html.encode('utf8') 63 | html_lzma = lzma.compress(html) 64 | sql = ('insert into crawler_html(urlhash, url, html_lzma) ' 65 | 'values(%s, %s, %s)') 66 | good = False 67 | try: 68 | await self.db.execute(sql, urlhash, url, html_lzma) 69 | good = True 70 | except Exception as e: 71 | if e.args[0] == 1062: 72 | # Duplicate entry 73 | good = True 74 | pass 75 | else: 76 | traceback.print_exc() 77 | raise e 78 | return good 79 | 80 | def filter_good(self, urls): 81 | goodlinks = [] 82 | for url in urls: 83 | host = urlparse.urlparse(url).netloc 84 | if host in self.hub_hosts: 85 | goodlinks.append(url) 86 | return goodlinks 87 | 88 | async def process(self, url, ishub): 89 | status, html, redirected_url = await fn.fetch(self.session, url) 90 | self.urlpool.set_status(url, status) 91 | if redirected_url != url: 92 | self.urlpool.set_status(redirected_url, status) 93 | # 提取hub网页中的链接, 新闻网页中也有“相关新闻”的链接,按需提取 94 | if status != 200: 95 | self._workers -= 1 96 | return 97 | if ishub: 98 | newlinks = fn.extract_links_re(redirected_url, html) 99 | goodlinks = self.filter_good(newlinks) 100 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks))) 101 | self.urlpool.addmany(goodlinks) 102 | 
else: 103 | await self.save_to_db(redirected_url, html) 104 | self._workers -= 1 105 | 106 | async def loop_crawl(self,): 107 | await self.load_hubs() 108 | last_rating_time = time.time() 109 | counter = 0 110 | while 1: 111 | tasks = self.urlpool.pop(self._workers_max) 112 | if not tasks: 113 | print('no url to crawl, sleep') 114 | await asyncio.sleep(3) 115 | continue 116 | for url, ishub in tasks.items(): 117 | self._workers += 1 118 | counter += 1 119 | print('crawl:', url) 120 | asyncio.ensure_future(self.process(url, ishub)) 121 | 122 | gap = time.time() - last_rating_time 123 | if gap > 5: 124 | rate = counter / gap 125 | print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (round(rate, 2), counter, self._workers)) 126 | last_rating_time = time.time() 127 | counter = 0 128 | if self._workers > self._workers_max: 129 | print('====== got workers_max, sleep 3 sec to next worker =====') 130 | await asyncio.sleep(3) 131 | 132 | def run(self): 133 | try: 134 | self.loop.run_until_complete(self.loop_crawl()) 135 | except KeyboardInterrupt: 136 | print('stopped by yourself!') 137 | del self.urlpool 138 | pass 139 | 140 | 141 | 142 | if __name__ == '__main__': 143 | nc = NewsCrawlerAsync('yrx-async') 144 | nc.run() 145 | 146 | 147 | -------------------------------------------------------------------------------- /news-crawler/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import urllib.parse as urlparse 7 | import requests 8 | import cchardet 9 | import traceback 10 | 11 | 12 | async def fetch(session, url, headers=None, timeout=9, binary=False): 13 | _headers = { 14 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 15 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'), 16 | } 17 | if headers: 18 | _headers = headers 19 | try: 20 | async with session.get(url, headers=_headers, timeout=timeout) as response: 21 | status = response.status 22 | html = await response.read() 23 | if not binary: 24 | encoding = cchardet.detect(html)['encoding'] 25 | html = html.decode(encoding, errors='ignore') 26 | redirected_url = str(response.url) 27 | except Exception as e: 28 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e)) 29 | print(msg) 30 | html = '' 31 | status = 0 32 | redirected_url = url 33 | return status, html, redirected_url 34 | 35 | 36 | def downloader(url, timeout=10, headers=None, debug=False, binary=False): 37 | _headers = { 38 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 39 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'), 40 | } 41 | redirected_url = url 42 | if headers: 43 | _headers = headers 44 | try: 45 | r = requests.get(url, headers=_headers, timeout=timeout) 46 | if binary: 47 | html = r.content 48 | else: 49 | encoding = cchardet.detect(r.content)['encoding'] 50 | html = r.content.decode(encoding, errors='ignore') 51 | status = r.status_code 52 | redirected_url = r.url 53 | except: 54 | if debug: 55 | traceback.print_exc() 56 | msg = 'failed download: {}'.format(url) 57 | print(msg) 58 | if binary: 59 | html = b'' 60 | else: 61 | html = '' 62 | status = 0 63 | return status, html, redirected_url 64 | 65 | 66 | g_bin_postfix = set([ 67 | 'exe', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 68 | 'pdf', 69 | 'jpg', 'png', 'bmp', 'jpeg', 'gif', 70 | 'zip', 'rar', 'tar', 'bz2', '7z', 'gz', 71 | 'flv', 'mp4', 'avi', 'wmv', 'mkv', 72 | 'apk', 73 | ]) 74 | 75 | g_news_postfix = [ 76 | '.html?', '.htm?', '.shtml?', 77 | 
'.shtm?', 78 | ] 79 | 80 | 81 | def clean_url(url): 82 | # 1. check whether it is a valid http(s) URL 83 | if not url.startswith('http'): 84 | return '' 85 | # 2. strip the query string from static-looking (.html/.htm/.shtml) URLs 86 | for np in g_news_postfix: 87 | p = url.find(np) 88 | if p > -1: 89 | p = url.find('?') 90 | url = url[:p] 91 | return url 92 | # 3. skip links that point to binary content 93 | up = urlparse.urlparse(url) 94 | path = up.path 95 | if not path: 96 | path = '/' 97 | postfix = path.split('.')[-1].lower() 98 | if postfix in g_bin_postfix: 99 | return '' 100 | 101 | # 4. drop traffic-source tracking parameters (spm, utm_*) 102 | # badquery = ['spm', 'utm_source', 'utm_source', 'utm_medium', 'utm_campaign'] 103 | good_queries = [] 104 | for query in up.query.split('&'): 105 | qv = query.split('=') 106 | if qv[0].startswith('spm') or qv[0].startswith('utm_'): 107 | continue 108 | if len(qv) == 1: 109 | continue 110 | good_queries.append(query) 111 | query = '&'.join(good_queries) 112 | url = urlparse.urlunparse(( 113 | up.scheme, 114 | up.netloc, 115 | path, 116 | up.params, 117 | query, 118 | '' # the crawler does not care about the fragment 119 | )) 120 | return url 121 | 122 | 123 | g_pattern_tag_a = re.compile(r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>', re.I|re.S|re.M) 124 | 125 | 126 | def extract_links_re(url, html): 127 | '''use re module to extract links from html''' 128 | newlinks = set() 129 | aa = g_pattern_tag_a.findall(html) 130 | for a in aa: 131 | link = a[0].strip() 132 | if not link: 133 | continue 134 | link = urlparse.urljoin(url, link) 135 | link = clean_url(link) 136 | if not link: 137 | continue 138 | newlinks.add(link) 139 | return newlinks 140 | 141 | 142 | def init_file_logger(fname): 143 | # config logging 144 | import logging 145 | from logging.handlers import TimedRotatingFileHandler 146 | ch = TimedRotatingFileHandler(fname, when="midnight") 147 | ch.setLevel(logging.INFO) 148 | # create formatter 149 | fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 150 | formatter = logging.Formatter(fmt) 151 | # add formatter to ch 152 | ch.setFormatter(formatter) 153 | logger = logging.getLogger(fname) 154 | # add ch to logger 155 | logger.addHandler(ch) 156 | return logger 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | url = 'http://news.baidu.com/' 162 | s, html, redirected_url = downloader(url) 163 | print(s, len(html)) 164 | -------------------------------------------------------------------------------- /weibologin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: veelion 3 | 4 | 5 | import re 6 | import pickle 7 | import json 8 | import base64 9 | import binascii 10 | import rsa 11 | import requests 12 | import urllib 13 | import time 14 | import traceback 15 | 16 | 17 | 18 | class WeiboLogin: 19 | user_agent = ( 20 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) ' 21 | 'Chrome/20.0.1132.57 Safari/536.11' 22 | ) 23 | 24 | def __init__(self, username, password, cookies_tosave='weibo.cookies'): 25 | self.weibo_user = username 26 | self.weibo_password = password 27 | self.cookies_tosave = cookies_tosave 28 | self.session = requests.session() 29 | self.session.headers['User-Agent'] = self.user_agent 30 | 31 | def encrypt_user(self, username): 32 | user = urllib.parse.quote(username) 33 | su = base64.b64encode(user.encode()) 34 | return su 35 | 36 | def encrypt_passwd(self, passwd, pubkey, servertime, nonce): 37 | key = rsa.PublicKey(int(pubkey, 16), int('10001', 16)) 38 | message = str(servertime) + '\t' + str(nonce) + '\n' + str(passwd) 39 | passwd = rsa.encrypt(message.encode('utf-8'), key) 40 | return 
binascii.b2a_hex(passwd) 41 | 42 | def prelogin(self): 43 | preloginTimeStart = int(time.time()*1000) 44 | url = ('https://login.sina.com.cn/sso/prelogin.php?' 45 | 'entry=weibo&callback=sinaSSOController.preloginCallBack&' 46 | 'su=&rsakt=mod&client=ssologin.js(v1.4.19)&' 47 | '_=%s') % preloginTimeStart 48 | resp = self.session.get(url) 49 | pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1) 50 | pre_login = json.loads(pre_login_str) 51 | pre_login['preloginTimeStart'] = preloginTimeStart 52 | print ('pre_login 1:', pre_login) 53 | return pre_login 54 | 55 | def get_prelt(self, pre_login): 56 | prelt = int(time.time() * 1000) - pre_login['preloginTimeStart'] - pre_login['exectime'] 57 | return prelt 58 | 59 | def login(self): 60 | # step-1. prelogin 61 | pre_login = self.prelogin() 62 | su = self.encrypt_user(self.weibo_user) 63 | sp = self.encrypt_passwd( 64 | self.weibo_password, 65 | pre_login['pubkey'], 66 | pre_login['servertime'], 67 | pre_login['nonce'] 68 | ) 69 | prelt = self.get_prelt(pre_login) 70 | 71 | data = { 72 | 'entry': 'weibo', 73 | 'gateway': 1, 74 | 'from': '', 75 | 'savestate': 7, 76 | 'qrcode_flag': 'false', 77 | 'userticket': 1, 78 | 'pagerefer': '', 79 | 'vsnf': 1, 80 | 'su': su, 81 | 'service': 'miniblog', 82 | 'servertime': pre_login['servertime'], 83 | 'nonce': pre_login['nonce'], 84 | 'vsnf': 1, 85 | 'pwencode': 'rsa2', 86 | 'sp': sp, 87 | 'rsakv' : pre_login['rsakv'], 88 | 'encoding': 'UTF-8', 89 | 'prelt': prelt, 90 | 'sr': "1280*800", 91 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.' 92 | 'sinaSSOController.feedBackUrlCallBack', 93 | 'returntype': 'META' 94 | } 95 | 96 | # step-2 login POST 97 | login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)' 98 | resp = self.session.post(login_url, data=data) 99 | print(resp.headers) 100 | print(resp.content) 101 | print('Step-2 response:', resp.text) 102 | 103 | # step-3 follow redirect 104 | redirect_url = re.findall(r'location\.replace\("(.*?)"', resp.text)[0] 105 | print('Step-3 to redirect:', redirect_url) 106 | resp = self.session.get(redirect_url) 107 | print('Step-3 response:', resp.text) 108 | 109 | # step-4 process step-3's response 110 | arrURL = re.findall(r'"arrURL":(.*?)\}', resp.text)[0] 111 | arrURL = json.loads(arrURL) 112 | print('CrossDomainUrl:', arrURL) 113 | for url in arrURL: 114 | print('set CrossDomainUrl:', url) 115 | resp_cross = self.session.get(url) 116 | print(resp_cross.text) 117 | redirect_url = re.findall(r'location\.replace\(\'(.*?)\'', resp.text)[0] 118 | print('Step-4 redirect_url:', redirect_url) 119 | resp = self.session.get(redirect_url) 120 | print(resp.text) 121 | with open(self.cookies_tosave, 'wb') as f: 122 | pickle.dump(self.session.cookies, f) 123 | return True 124 | 125 | def fetch(self, url): 126 | try: 127 | resp = self.session.get(url, timeout=10) 128 | return resp 129 | except: 130 | traceback.print_exc() 131 | return None 132 | 133 | if __name__ == '__main__': 134 | weibo_user = 'your-weibo-username' 135 | weibo_password = 'your-weibo-password' 136 | wb = WeiboLogin(weibo_user, weibo_password) 137 | wb.login() 138 | r = wb.fetch('https://weibo.com/') 139 | print(r.encoding) 140 | print(len(r.text)) 141 | -------------------------------------------------------------------------------- /news-crawler/bee_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: UTF-8 3 | # author: veelion 4 | # file: bee_client.py 5 | 6 | import 
re 7 | import cchardet 8 | import traceback 9 | import time 10 | import json 11 | import asyncio 12 | import urllib.parse as urlparse 13 | import aiohttp 14 | import uvloop 15 | 16 | 17 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 18 | 19 | 20 | 21 | p_tag_a = re.compile( 22 | r']*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)', 23 | re.I|re.S|re.M) 24 | 25 | 26 | def extract_links_re(url, html): 27 | newlinks = set() 28 | aa = p_tag_a.findall(html) 29 | for a in aa: 30 | link = a[0].strip() 31 | if not link: 32 | continue 33 | link = urlparse.urljoin(url, link) 34 | if not link.startswith('http'): 35 | continue 36 | newlinks.add(link) 37 | return newlinks 38 | 39 | 40 | 41 | class CrawlerClient: 42 | def __init__(self, ): 43 | self._workers = 0 44 | self.workers_max = 10 45 | self.server_host = 'localhost' 46 | self.server_port = 8080 47 | self.headers = {'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ' 48 | 'Windows NT 6.1; Win64; x64; Trident/5.0)')} 49 | 50 | self.loop = asyncio.get_event_loop() 51 | self.queue = asyncio.Queue(loop=self.loop) 52 | self.session = aiohttp.ClientSession(loop=self.loop) 53 | 54 | async def download(self, url, timeout=25): 55 | status_code = 900 56 | html = '' 57 | url_now = url 58 | try: 59 | async with self.session.get(url_now, headers=self.headers, timeout=timeout) as response: 60 | status_code = response.status 61 | html = await response.read() 62 | encoding = cchardet.detect(html)['encoding'] 63 | html = html.decode(encoding, errors='ignore') 64 | url_now = str(response.url) 65 | except Exception as e: 66 | # traceback.print_exc() 67 | print('=== exception: ', e, type(e), str(e)) 68 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e)) 69 | print(msg) 70 | return status_code, html, url_now 71 | 72 | async def get_urls(self,): 73 | count = self.workers_max - self.queue.qsize() 74 | if count <= 0: 75 | print('no need to get urls this time') 76 | return None 77 | url = 'http://%s:%s/task?count=%s' % ( 78 | self.server_host, 79 | self.server_port, 80 | count 81 | ) 82 | try: 83 | async with self.session.get(url, timeout=3) as response: 84 | if response.status not in [200, 201]: 85 | return 86 | jsn = await response.text() 87 | urls = json.loads(jsn) 88 | msg = ('get_urls() to get [%s] but got[%s], @%s') % ( 89 | count, len(urls), 90 | time.strftime('%Y-%m-%d %H:%M:%S')) 91 | print(msg) 92 | for kv in urls.items(): 93 | await self.queue.put(kv) 94 | print('queue size:', self.queue.qsize(), ', _workers:', self._workers) 95 | except: 96 | traceback.print_exc() 97 | return 98 | 99 | async def send_result(self, result): 100 | url = 'http://%s:%s/task' % ( 101 | self.server_host, 102 | self.server_port 103 | ) 104 | try: 105 | async with self.session.post(url, json=result, timeout=3) as response: 106 | return response.status 107 | except: 108 | traceback.print_exc() 109 | pass 110 | 111 | def save_html(self, url, html): 112 | print('saved:', url, len(html)) 113 | 114 | def filter_good(self, urls): 115 | '''根据抓取目的过滤提取的URLs,只要你想要的''' 116 | good = [] 117 | for url in urls: 118 | if url.startswith('http'): 119 | good.append(url) 120 | return good 121 | 122 | async def process(self, url, ishub): 123 | status, html, url_now = await self.download(url) 124 | self._workers -= 1 125 | print('downloaded:', url, ', html:', len(html)) 126 | if html: 127 | newurls = extract_links_re(url, html) 128 | newurls = self.filter_good(newurls) 129 | self.save_html(url, html) 130 | else: 131 | newurls = [] 132 | result = { 133 | 'url': url, 134 | 
'url_real': url_now, 135 | 'status': status, 136 | 'newurls': newurls, 137 | } 138 | await self.send_result(result) 139 | 140 | async def loop_get_urls(self,): 141 | print('loop_get_urls() start') 142 | while 1: 143 | await self.get_urls() 144 | await asyncio.sleep(1) 145 | 146 | async def loop_crawl(self,): 147 | print('loop_crawl() start') 148 | asyncio.ensure_future(self.loop_get_urls()) 149 | counter = 0 150 | while 1: 151 | item = await self.queue.get() 152 | url, url_level = item 153 | self._workers += 1 154 | counter += 1 155 | asyncio.ensure_future(self.process(url, url_level)) 156 | 157 | if self._workers > self.workers_max: 158 | print('====== got workers_max, sleep 3 sec to next worker =====') 159 | await asyncio.sleep(3) 160 | 161 | def start(self): 162 | try: 163 | self.loop.run_until_complete(self.loop_crawl()) 164 | except KeyboardInterrupt: 165 | print('stopped by yourself!') 166 | pass 167 | 168 | 169 | def run(): 170 | ant = CrawlerClient() 171 | ant.start() 172 | 173 | 174 | if __name__ == '__main__': 175 | run() 176 | 177 | -------------------------------------------------------------------------------- /news-crawler/ezpymysql.py: -------------------------------------------------------------------------------- 1 | #file: ezpymysql.py 2 | #Author: veelion 3 | 4 | """A lightweight wrapper around PyMySQL. 5 | only for python3 6 | 7 | """ 8 | 9 | import time 10 | import logging 11 | import traceback 12 | import pymysql 13 | import pymysql.cursors 14 | 15 | version = "0.7" 16 | version_info = (0, 7, 0, 0) 17 | 18 | 19 | class Connection(object): 20 | """A lightweight wrapper around PyMySQL. 21 | """ 22 | def __init__(self, host, database, user=None, password=None, 23 | port=0, 24 | max_idle_time=7 * 3600, connect_timeout=10, 25 | time_zone="+0:00", charset = "utf8mb4", sql_mode="TRADITIONAL"): 26 | self.host = host 27 | self.database = database 28 | self.max_idle_time = float(max_idle_time) 29 | 30 | args = dict(use_unicode=True, charset=charset, 31 | database=database, 32 | init_command=('SET time_zone = "%s"' % time_zone), 33 | cursorclass=pymysql.cursors.DictCursor, 34 | connect_timeout=connect_timeout, sql_mode=sql_mode) 35 | if user is not None: 36 | args["user"] = user 37 | if password is not None: 38 | args["passwd"] = password 39 | 40 | # We accept a path to a MySQL socket file or a host(:port) string 41 | if "/" in host: 42 | args["unix_socket"] = host 43 | else: 44 | self.socket = None 45 | pair = host.split(":") 46 | if len(pair) == 2: 47 | args["host"] = pair[0] 48 | args["port"] = int(pair[1]) 49 | else: 50 | args["host"] = host 51 | args["port"] = 3306 52 | if port: 53 | args['port'] = port 54 | 55 | self._db = None 56 | self._db_args = args 57 | self._last_use_time = time.time() 58 | try: 59 | self.reconnect() 60 | except Exception: 61 | logging.error("Cannot connect to MySQL on %s", self.host, 62 | exc_info=True) 63 | 64 | def _ensure_connected(self): 65 | # Mysql by default closes client connections that are idle for 66 | # 8 hours, but the client library does not report this fact until 67 | # you try to perform a query and it fails. Protect against this 68 | # case by preemptively closing and reopening the connection 69 | # if it has been idle for too long (7 hours by default). 
70 | if (self._db is None or 71 | (time.time() - self._last_use_time > self.max_idle_time)): 72 | self.reconnect() 73 | self._last_use_time = time.time() 74 | 75 | def _cursor(self): 76 | self._ensure_connected() 77 | return self._db.cursor() 78 | 79 | def __del__(self): 80 | self.close() 81 | 82 | def close(self): 83 | """Closes this database connection.""" 84 | if getattr(self, "_db", None) is not None: 85 | self._db.close() 86 | self._db = None 87 | 88 | def reconnect(self): 89 | """Closes the existing database connection and re-opens it.""" 90 | self.close() 91 | self._db = pymysql.connect(**self._db_args) 92 | self._db.autocommit(True) 93 | 94 | def query(self, query, *parameters, **kwparameters): 95 | """Returns a row list for the given query and parameters.""" 96 | cursor = self._cursor() 97 | try: 98 | cursor.execute(query, kwparameters or parameters) 99 | result = cursor.fetchall() 100 | return result 101 | finally: 102 | cursor.close() 103 | 104 | def get(self, query, *parameters, **kwparameters): 105 | """Returns the (singular) row returned by the given query. 106 | """ 107 | cursor = self._cursor() 108 | try: 109 | cursor.execute(query, kwparameters or parameters) 110 | return cursor.fetchone() 111 | finally: 112 | cursor.close() 113 | 114 | def execute(self, query, *parameters, **kwparameters): 115 | """Executes the given query, returning the lastrowid from the query.""" 116 | cursor = self._cursor() 117 | try: 118 | cursor.execute(query, kwparameters or parameters) 119 | return cursor.lastrowid 120 | except Exception as e: 121 | if e.args[0] == 1062: 122 | pass 123 | else: 124 | traceback.print_exc() 125 | raise e 126 | finally: 127 | cursor.close() 128 | 129 | insert = execute 130 | 131 | ## =============== high level method for table =================== 132 | 133 | def table_has(self, table_name, field, value): 134 | if isinstance(value, str): 135 | value = value.encode('utf8') 136 | sql = 'SELECT %s FROM %s WHERE %s="%s"' % ( 137 | field, 138 | table_name, 139 | field, 140 | value) 141 | d = self.get(sql) 142 | return d 143 | 144 | def table_insert(self, table_name, item): 145 | '''item is a dict : key is mysql table field''' 146 | fields = list(item.keys()) 147 | values = list(item.values()) 148 | fieldstr = ','.join(fields) 149 | valstr = ','.join(['%s'] * len(item)) 150 | for i in range(len(values)): 151 | if isinstance(values[i], str): 152 | values[i] = values[i].encode('utf8') 153 | sql = 'INSERT INTO %s (%s) VALUES(%s)' % (table_name, fieldstr, valstr) 154 | try: 155 | last_id = self.execute(sql, *values) 156 | return last_id 157 | except Exception as e: 158 | if e.args[0] == 1062: 159 | # just skip duplicated item 160 | pass 161 | else: 162 | traceback.print_exc() 163 | print('sql:', sql) 164 | print('item:') 165 | for i in range(len(fields)): 166 | vs = str(values[i]) 167 | if len(vs) > 300: 168 | print(fields[i], ' : ', len(vs), type(values[i])) 169 | else: 170 | print(fields[i], ' : ', vs, type(values[i])) 171 | raise e 172 | 173 | def table_update(self, table_name, updates, 174 | field_where, value_where): 175 | '''updates is a dict of {field_update:value_update}''' 176 | upsets = [] 177 | values = [] 178 | for k, v in updates.items(): 179 | s = '%s=%%s' % k 180 | upsets.append(s) 181 | values.append(v) 182 | upsets = ','.join(upsets) 183 | sql = 'UPDATE %s SET %s WHERE %s="%s"' % ( 184 | table_name, 185 | upsets, 186 | field_where, value_where, 187 | ) 188 | self.execute(sql, *(values)) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | db = Connection( 
194 | 'localhost', 195 | 'db_name', 196 | 'user', 197 | 'password' 198 | ) 199 | # 获取一条记录 200 | sql = 'select * from test_table where id=%s' 201 | data = db.get(sql, 2) 202 | 203 | # 获取多天记录 204 | sql = 'select * from test_table where id>%s' 205 | data = db.query(sql, 2) 206 | 207 | # 插入一条数据 208 | sql = 'insert into test_table(title, url) values(%s, %s)' 209 | last_id = db.execute(sql, 'test', 'http://a.com/') 210 | # 或者 211 | last_id = db.insert(sql, 'test', 'http://a.com/') 212 | 213 | 214 | # 使用更高级的方法插入一条数据 215 | item = { 216 | 'title': 'test', 217 | 'url': 'http://a.com/', 218 | } 219 | last_id = db.table_insert('test_table', item) 220 | 221 | -------------------------------------------------------------------------------- /news-crawler/urlpool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: veelion 3 | 4 | """ 5 | URL Pool for crawler to manage URLs 6 | """ 7 | 8 | import pickle 9 | import leveldb 10 | import time 11 | import urllib.parse as urlparse 12 | 13 | 14 | RED = '\x1b[31m' 15 | GRE = '\x1b[32m' 16 | BRO = '\x1b[33m' 17 | BLU = '\x1b[34m' 18 | PUR = '\x1b[35m' 19 | CYA = '\x1b[36m' 20 | WHI = '\x1b[37m' 21 | NOR = '\x1b[0m' 22 | 23 | 24 | class UrlDB: 25 | '''Use LevelDB to store URLs what have been done(succeed or faile) 26 | ''' 27 | status_failure = b'0' 28 | status_success = b'1' 29 | 30 | def __init__(self, db_name): 31 | self.name = db_name + '.urldb' 32 | self.db = leveldb.LevelDB(self.name) 33 | 34 | def set_success(self, url): 35 | if isinstance(url, str): 36 | url = url.encode('utf8') 37 | try: 38 | self.db.Put(url, self.status_success) 39 | s = True 40 | except: 41 | s = False 42 | return s 43 | 44 | def set_failure(self, url): 45 | if isinstance(url, str): 46 | url = url.encode('utf8') 47 | try: 48 | self.db.Put(url, self.status_failure) 49 | s = True 50 | except: 51 | s = False 52 | return s 53 | 54 | def has(self, url): 55 | if isinstance(url, str): 56 | url = url.encode('utf8') 57 | try: 58 | attr = self.db.Get(url) 59 | return attr 60 | except: 61 | pass 62 | return False 63 | 64 | 65 | class UrlPool: 66 | '''URL Pool for crawler to manage URLs 67 | ''' 68 | 69 | def __init__(self, pool_name): 70 | self.name = pool_name 71 | self.db = UrlDB(pool_name) 72 | 73 | self.waiting = {} # {host: set([urls]), } 按host分组,记录等待下载的URL 74 | self.pending = {} # {url: pended_time, } 记录已被取出(self.pop())但还未被更新状态(正在下载)的URL 75 | self.failure = {} # {url: times,} 记录失败的URL的次数 76 | self.failure_threshold = 3 77 | self.pending_threshold = 10 # pending的最大时间,过期要重新下载 78 | self.waiting_count = 0 # self.waiting 字典里面的url的个数 79 | self.max_hosts = ['', 0] # [host: url_count] 目前pool中url最多的host及其url数量 80 | self.hub_pool = {} # {url: last_query_time, } 存放hub url 81 | self.hub_refresh_span = 0 82 | self.load_cache() 83 | 84 | def __del__(self): 85 | self.dump_cache() 86 | 87 | def load_cache(self,): 88 | path = self.name + '.pkl' 89 | try: 90 | with open(path, 'rb') as f: 91 | self.waiting = pickle.load(f) 92 | cc = [len(v) for k, v in self.waiting.items()] 93 | print('saved pool loaded! 
urls:', sum(cc)) 94 | except: 95 | pass 96 | 97 | def dump_cache(self): 98 | path = self.name + '.pkl' 99 | try: 100 | with open(path, 'wb') as f: 101 | pickle.dump(self.waiting, f) 102 | print('self.waiting saved!') 103 | except: 104 | pass 105 | 106 | def set_hubs(self, urls, hub_refresh_span): 107 | self.hub_refresh_span = hub_refresh_span 108 | self.hub_pool = {} 109 | for url in urls: 110 | self.hub_pool[url] = 0 111 | 112 | def set_status(self, url, status_code): 113 | if url in self.pending: 114 | self.pending.pop(url) 115 | 116 | if status_code == 200: 117 | self.db.set_success(url) 118 | return 119 | if status_code == 404: 120 | self.db.set_failure(url) 121 | return 122 | if url in self.failure: 123 | self.failure[url] += 1 124 | if self.failure[url] > self.failure_threshold: 125 | self.db.set_failure(url) 126 | self.failure.pop(url) 127 | else: 128 | self.add(url) 129 | else: 130 | self.failure[url] = 1 131 | self.add(url) 132 | 133 | def push_to_pool(self, url): 134 | host = urlparse.urlparse(url).netloc 135 | if not host or '.' not in host: 136 | print('try to push_to_pool with bad url:', url, ', len of ur:', len(url)) 137 | return False 138 | if host in self.waiting: 139 | if url in self.waiting[host]: 140 | return True 141 | self.waiting[host].add(url) 142 | if len(self.waiting[host]) > self.max_hosts[1]: 143 | self.max_hosts[1] = len(self.waiting[host]) 144 | self.max_hosts[0] = host 145 | else: 146 | self.waiting[host] = set([url]) 147 | self.waiting_count += 1 148 | return True 149 | 150 | def add(self, url, always=False): 151 | if always: 152 | return self.push_to_pool(url) 153 | pended_time = self.pending.get(url, 0) 154 | if time.time() - pended_time < self.pending_threshold: 155 | print('being downloading:', url) 156 | return 157 | if self.db.has(url): 158 | return 159 | if pended_time: 160 | self.pending.pop(url) 161 | return self.push_to_pool(url) 162 | 163 | def addmany(self, urls, always=False): 164 | if isinstance(urls, str): 165 | print('urls is a str !!!!', urls) 166 | self.add(urls, always) 167 | else: 168 | for url in urls: 169 | self.add(url, always) 170 | 171 | def pop(self, count, hub_percent=50): 172 | print('\n\tmax of host:', self.max_hosts) 173 | 174 | # 取出的url有两种类型:hub=1, 普通=0 175 | url_attr_url = 0 176 | url_attr_hub = 1 177 | # 1. 首先取出hub,保证获取hub里面的最新url. 178 | hubs = {} 179 | hub_count = count * hub_percent // 100 180 | for hub in self.hub_pool: 181 | span = time.time() - self.hub_pool[hub] 182 | if span < self.hub_refresh_span: 183 | continue 184 | hubs[hub] = url_attr_hub # 1 means hub-url 185 | self.hub_pool[hub] = time.time() 186 | if len(hubs) >= hub_count: 187 | break 188 | 189 | # 2. 
再取出普通url 190 | left_count = count - len(hubs) 191 | urls = {} 192 | for host in self.waiting: 193 | if not self.waiting[host]: 194 | continue 195 | url = self.waiting[host].pop() 196 | urls[url] = url_attr_url 197 | self.pending[url] = time.time() 198 | if self.max_hosts[0] == host: 199 | self.max_hosts[1] -= 1 200 | if len(urls) >= left_count: 201 | break 202 | self.waiting_count -= len(urls) 203 | print('To pop:%s, hubs: %s, urls: %s, hosts:%s' % (count, len(hubs), len(urls), len(self.waiting))) 204 | urls.update(hubs) 205 | return urls 206 | 207 | def size(self,): 208 | return self.waiting_count 209 | 210 | def empty(self,): 211 | return self.waiting_count == 0 212 | 213 | 214 | def test(): 215 | pool = UrlPool('crawl_urlpool') 216 | urls = [ 217 | 'http://1.a.cn/xyz', 218 | 'http://2.a.cn/xyz', 219 | 'http://3.a.cn/xyz', 220 | 'http://1.b.cn/xyz-1', 221 | 'http://1.b.cn/xyz-2', 222 | 'http://1.b.cn/xyz-3', 223 | 'http://1.b.cn/xyz-4', 224 | ] 225 | pool.addmany(urls) 226 | del pool 227 | 228 | pool = UrlPool('crawl_urlpool') 229 | urls = pool.pop(5) 230 | urls = list(urls.keys()) 231 | print('pop:', urls) 232 | print('pending:', pool.pending) 233 | 234 | pool.set_status(urls[0], 200) 235 | print('pending:', pool.pending) 236 | pool.set_status(urls[1], 404) 237 | print('pending:', pool.pending) 238 | 239 | 240 | if __name__ == '__main__': 241 | test() 242 | -------------------------------------------------------------------------------- /news-crawler/maincontent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #File: maincontent.py 3 | #Author: veelion 4 | 5 | import re 6 | import time 7 | import traceback 8 | 9 | import cchardet 10 | import lxml 11 | import lxml.html 12 | from lxml.html import HtmlComment 13 | 14 | REGEXES = { 15 | 'positiveRe': re.compile( 16 | ('article|arti|body|content|entry|hentry|main|page|' 17 | 'artical|zoom|arti|context|message|editor|' 18 | 'pagination|post|txt|text|blog|story'), re.I), 19 | 'negativeRe': re.compile( 20 | ('copyright|combx|comment|com-|contact|foot|footer|footnote|decl|copy|' 21 | 'notice|' 22 | 'masthead|media|meta|outbrain|promo|related|scroll|link|pagebottom|bottom|' 23 | 'other|shoutbox|sidebar|sponsor|shopping|tags|tool|widget'), re.I), 24 | } 25 | 26 | 27 | class MainContent: 28 | def __init__(self,): 29 | self.non_content_tag = set([ 30 | 'head', 31 | 'meta', 32 | 'script', 33 | 'style', 34 | 'object', 'embed', 35 | 'iframe', 36 | 'marquee', 37 | 'select', 38 | ]) 39 | self.title = '' 40 | self.p_space = re.compile(r'\s') 41 | self.p_content_stop = re.compile(r'正文.*结束|正文下|相关阅读|声明') 42 | self.p_clean_tree = re.compile(r'author|post-add|copyright') 43 | 44 | def get_title(self, doc): 45 | title = '' 46 | title_el = doc.xpath('//title') 47 | if title_el: 48 | title = title_el[0].text_content().strip() 49 | if len(title) < 7: 50 | tt = doc.xpath('//meta[@name="title"]') 51 | if tt: 52 | title = tt[0].get('content', '') 53 | if len(title) < 7: 54 | tt = doc.xpath('//*[contains(@id, "title") or contains(@class, "title")]') 55 | if not tt: 56 | tt = doc.xpath('//*[contains(@id, "font01") or contains(@class, "font01")]') 57 | for t in tt: 58 | ti = t.text_content().strip() 59 | if ti in title and len(ti)*2 > len(title): 60 | title = ti 61 | break 62 | if len(ti) > 20: continue 63 | if len(ti) > len(title) or len(ti) > 7: 64 | title = ti 65 | return title 66 | 67 | def clean_title(self, title): 68 | spliters = [' - ', '–', '—', '-', '|', '::'] 69 | for s in spliters: 70 | if s 
not in title: 71 | continue 72 | tts = title.split(s) 73 | if len(tts) < 2: 74 | continue 75 | title = tts[0] 76 | break 77 | return title 78 | 79 | def calc_node_weight(self, node): 80 | weight = 1 81 | attr = '%s %s %s' % ( 82 | node.get('class', ''), 83 | node.get('id', ''), 84 | node.get('style', '') 85 | ) 86 | if attr: 87 | mm = REGEXES['negativeRe'].findall(attr) 88 | weight -= 2 * len(mm) 89 | mm = REGEXES['positiveRe'].findall(attr) 90 | weight += 4 * len(mm) 91 | if node.tag in ['div', 'p', 'table']: 92 | weight += 2 93 | return weight 94 | 95 | def get_main_block(self, url, html, clean_title=True): 96 | ''' return (title, etree_of_main_content_block) 97 | ''' 98 | if isinstance(html, bytes): 99 | encoding = cchardet.detect(html)['encoding'] 100 | if encoding is None: 101 | return None, None 102 | html = html.decode(encoding, 'ignore') 103 | try: 104 | doc = lxml.html.fromstring(html) 105 | doc.make_links_absolute(base_url=url) 106 | except : 107 | traceback.print_exc() 108 | return None, None 109 | self.title = self.get_title(doc) 110 | if clean_title: 111 | self.title = self.clean_title(self.title) 112 | body = doc.xpath('//body') 113 | if not body: 114 | return self.title, None 115 | candidates = [] 116 | nodes = body[0].getchildren() 117 | while nodes: 118 | node = nodes.pop(0) 119 | children = node.getchildren() 120 | tlen = 0 121 | for child in children: 122 | if isinstance(child, HtmlComment): 123 | continue 124 | if child.tag in self.non_content_tag: 125 | continue 126 | if child.tag == 'a': 127 | continue 128 | if child.tag == 'textarea': 129 | # FIXME: this tag is only part of content? 130 | continue 131 | attr = '%s%s%s' % (child.get('class', ''), 132 | child.get('id', ''), 133 | child.get('style')) 134 | if 'display' in attr and 'none' in attr: 135 | continue 136 | nodes.append(child) 137 | if child.tag == 'p': 138 | weight = 3 139 | else: 140 | weight = 1 141 | text = '' if not child.text else child.text.strip() 142 | tail = '' if not child.tail else child.tail.strip() 143 | tlen += (len(text) + len(tail)) * weight 144 | if tlen < 10: 145 | continue 146 | weight = self.calc_node_weight(node) 147 | candidates.append((node, tlen*weight)) 148 | if not candidates: 149 | return self.title, None 150 | candidates.sort(key=lambda a: a[1], reverse=True) 151 | good = candidates[0][0] 152 | if good.tag in ['p', 'pre', 'code', 'blockquote']: 153 | for i in range(5): 154 | good = good.getparent() 155 | if good.tag == 'div': 156 | break 157 | good = self.clean_node(good, url) 158 | return self.title, good 159 | 160 | def clean_node(self, tree, url=''): 161 | to_drop = [] 162 | drop_left = False 163 | for node in tree.iterdescendants(): 164 | if drop_left: 165 | to_drop.append(node) 166 | continue 167 | if isinstance(node, HtmlComment): 168 | to_drop.append(node) 169 | if self.p_content_stop.search(node.text): 170 | drop_left = True 171 | continue 172 | if node.tag in self.non_content_tag: 173 | to_drop.append(node) 174 | continue 175 | attr = '%s %s' % ( 176 | node.get('class', ''), 177 | node.get('id', '') 178 | ) 179 | if self.p_clean_tree.search(attr): 180 | to_drop.append(node) 181 | continue 182 | aa = node.xpath('.//a') 183 | if aa: 184 | text_node = len(self.p_space.sub('', node.text_content())) 185 | text_aa = 0 186 | for a in aa: 187 | alen = len(self.p_space.sub('', a.text_content())) 188 | if alen > 5: 189 | text_aa += alen 190 | if text_aa > text_node * 0.4: 191 | to_drop.append(node) 192 | for node in to_drop: 193 | try: 194 | node.drop_tree() 195 | except: 196 | pass 
197 | return tree 198 | 199 | def get_text(self, doc): 200 | lxml.etree.strip_elements(doc, 'script') 201 | lxml.etree.strip_elements(doc, 'style') 202 | for ch in doc.iterdescendants(): 203 | if not isinstance(ch.tag, str): 204 | continue 205 | if ch.tag in ['div', 'h1', 'h2', 'h3', 'p', 'br', 'table', 'tr', 'dl']: 206 | if not ch.tail: 207 | ch.tail = '\n' 208 | else: 209 | ch.tail = '\n' + ch.tail.strip() + '\n' 210 | if ch.tag in ['th', 'td']: 211 | if not ch.text: 212 | ch.text = ' ' 213 | else: 214 | ch.text += ' ' 215 | # if ch.tail: 216 | # ch.tail = ch.tail.strip() 217 | lines = doc.text_content().split('\n') 218 | content = [] 219 | for l in lines: 220 | l = l.strip() 221 | if not l: 222 | continue 223 | content.append(l) 224 | return '\n'.join(content) 225 | 226 | def extract(self, url, html): 227 | '''return (title, content) 228 | ''' 229 | title, node = self.get_main_block(url, html) 230 | if node is None: 231 | print('\tno main block got !!!!!', url) 232 | return title, '' 233 | content = self.get_text(node) 234 | return title, content 235 | 236 | 237 | if __name__ == '__main__': 238 | from sys import argv 239 | f = argv[1] 240 | html = open(f, 'rb').read() 241 | encoding = cchardet.detect(html) 242 | print('encoding:', encoding) 243 | encoding = encoding['encoding'] 244 | html = html.decode(encoding, 'ignore') 245 | mc = MainContent() 246 | b = time.time() 247 | t, c = mc.extract('', html) 248 | e = time.time() 249 | print('title:', t) 250 | print('content:', len(c)) 251 | print('time cost: ', e-b) 252 | title, content = t, c 253 | txt = 'title:%s\ncontent:\n%s\n\n' % ( 254 | title, 255 | content, 256 | ) 257 | open(f+'-content2.txt','w').write(txt) 258 | --------------------------------------------------------------------------------
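To tie the pieces above together, here is a minimal fetch-and-extract sketch that combines downloader() from functions.py with MainContent from maincontent.py. It assumes it is run from inside the news-crawler/ directory (so both modules are importable) and that the news page URL is supplied on the command line; it is an illustrative sketch, not part of the original project.

#!/usr/bin/env python3
# Minimal fetch-and-extract sketch (assumption: executed inside news-crawler/
# so that functions.py and maincontent.py can be imported as modules).
from functions import downloader
from maincontent import MainContent


def fetch_and_extract(url):
    status, html, redirected_url = downloader(url)
    if status != 200 or not html:
        print('download failed:', status, url)
        return None
    title, content = MainContent().extract(redirected_url, html)
    return {'url': redirected_url, 'title': title, 'content': content}


if __name__ == '__main__':
    from sys import argv
    result = fetch_and_extract(argv[1])
    if result:
        print('title:', result['title'])
        print('content length:', len(result['content']))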