├── README.md
├── news-crawler/
│   ├── config.py
│   ├── simple-news-crawler.py
│   ├── bee_server.py
│   ├── lxml_demo.py
│   ├── news-crawler-sync.py
│   ├── news-crawler-async.py
│   ├── functions.py
│   ├── bee_client.py
│   ├── ezpymysql.py
│   ├── urlpool.py
│   └── maincontent.py
├── LICENSE
├── .gitignore
├── selenium-login.py
└── weibologin.py
/README.md:
--------------------------------------------------------------------------------
1 | # python-crawler
2 | An asynchronous, high-concurrency, distributed crawler framework.
3 |
--------------------------------------------------------------------------------
/news-crawler/config.py:
--------------------------------------------------------------------------------
1 | # Author: veelion
2 |
3 |
4 | db_host = 'localhost'
5 | db_db = 'crawler'
6 | db_user = 'your-user'
7 | db_password = 'your-password'
8 |
9 |
--------------------------------------------------------------------------------
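
A note on config.py: these settings are read by news-crawler-sync.py and
news-crawler-async.py. Below is a minimal sketch (not part of the repo) of
wiring them into the blocking MySQL wrapper; it assumes a reachable MySQL
server that accepts the credentials above:

    import config
    from ezpymysql import Connection

    db = Connection(
        config.db_host,
        config.db_db,
        config.db_user,
        config.db_password,
    )
    print(db.query('select 1'))
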
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 veelion
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/news-crawler/simple-news-crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Author: veelion
3 |
4 |
5 | import re
6 | import requests
7 | import tldextract
8 |
9 |
10 | def save_to_db(url, html):
11 | print('%s : %s' % (url, len(html)))
12 |
13 |
14 | def crawl():
15 | # 1. download baidu news
16 | hub_url = 'http://news.baidu.com/'
17 | html = requests.get(hub_url).text
18 |
19 | # 2. extract news links
20 | ## 2.1 extract all links with 'href'
21 | links = re.findall(r'href=[\'"]?(.*?)[\'"\s]', html)
22 | print('find links:', len(links))
23 | news_links = []
24 | ## 2.2 filter non-news link
25 | for link in links:
26 | if not link.startswith('http'):
27 | continue
28 | tld = tldextract.extract(link)
29 | if tld.domain == 'baidu':
30 | continue
31 | news_links.append(link)
32 |
33 | print('find news links:', len(news_links))
34 | # 3. download news and save to database
35 | for link in news_links:
36 | html = requests.get(link).text
37 | save_to_db(link, html)
38 |     print('work done!')
39 |
40 |
41 | if __name__ == '__main__':
42 | crawl()
43 |
--------------------------------------------------------------------------------
/news-crawler/bee_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf8
3 | # author: veelion
4 | # file: bee_server.py
5 |
6 | from sanic import Sanic
7 | from sanic import response
8 |
9 | from urlpool import UrlPool
10 |
11 | urlpool = UrlPool(__file__)
12 |
13 | # initialize the urlpool; adjust this to your needs
14 | hub_urls = []
15 | urlpool.set_hubs(hub_urls, 300)
16 | urlpool.add('https://news.sina.com.cn/')
17 |
18 | # init
19 | app = Sanic(__name__)
20 |
21 |
22 | @app.listener('after_server_stop')
23 | async def cache_urlpool(app, loop):
24 | global urlpool
25 | print('caching urlpool after_server_stop')
26 | del urlpool
27 | print('bye!')
28 |
29 |
30 | @app.route('/task')
31 | async def task_get(request):
32 | count = request.args.get('count', 10)
33 | try:
34 | count = int(count)
35 | except:
36 | count = 10
37 | urls = urlpool.pop(count)
38 | return response.json(urls)
39 |
40 |
41 | @app.route('/task', methods=['POST', ])
42 | async def task_post(request):
43 | result = request.json
44 | urlpool.set_status(result['url'], result['status'])
45 | if result['url_real'] != result['url']:
46 | urlpool.set_status(result['url_real'], result['status'])
47 | if result['newurls']:
48 | print('receive URLs:', len(result['newurls']))
49 | for url in result['newurls']:
50 | urlpool.add(url)
51 | return response.text('ok')
52 |
53 |
54 | if __name__ == '__main__':
55 | app.run(
56 | host='0.0.0.0',
57 | port=8080,
58 | debug=False,
59 | access_log=False,
60 | workers=1)
61 |
62 |
--------------------------------------------------------------------------------
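
The two /task routes above are the whole scheduler/worker protocol: GET hands
out URLs as a JSON object of {url: is_hub_flag}, and POST reports a result and
feeds new links back into the pool. bee_client.py is the real asyncio worker;
the following is only a minimal synchronous sketch of the same protocol, and it
assumes the server is running with the defaults above (localhost:8080):

    import requests

    server = 'http://localhost:8080'

    # ask the scheduler for up to 10 URLs; the response maps url -> is_hub flag
    tasks = requests.get(server + '/task', params={'count': 10}).json()

    for url, ishub in tasks.items():     # ishub == 1 marks hub pages
        resp = requests.get(url, timeout=10)
        result = {
            'url': url,                  # the URL that was handed out
            'url_real': resp.url,        # final URL after redirects
            'status': resp.status_code,
            'newurls': [],               # links extracted from the page, if any
        }
        requests.post(server + '/task', json=result)
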
/news-crawler/lxml_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Author: veelion
3 |
4 |
5 | import re
6 | import requests
7 | import lxml.html
8 | from pprint import pprint
9 |
10 |
11 | def parse(li):
12 | item = {}
13 |     # there are two divs with class="thumb": the first holds the category link, the second the article link
14 | thumb = li.xpath('./div[@class="thumb"]/a')
15 | item['cat'] = thumb[0].text
16 | item['link'] = thumb[1].get('href')
17 |
18 |     # get the title
19 | el_title = li.xpath('.//h2[@class="info-tit"]/a')[0]
20 | item['title'] = el_title.text
21 |
22 | el_info = li.xpath('.//div[@class="info-item"]/span')
23 | for span in el_info:
24 | attr = span.get('class')
25 | if attr == 'author':
26 | item['author'] = span.text_content()
27 | elif attr == 'time':
28 | item['time'] = span.text_content()
29 | elif attr == 'view':
30 | digit = re.findall(r'\d+', span.text_content())[0]
31 | item['view_count'] = int(digit)
32 | elif attr == 'cmt':
33 | digit = re.findall(r'\d+', span.text_content())[0]
34 | item['cmt_count'] = int(digit)
35 | return item
36 |
37 |
38 | def main():
39 | url = 'https://www.yuanrenxue.com/'
40 | headers = {'User-Agent': 'Firefox'}
41 | resp = requests.get(url, headers=headers)
42 | html = resp.content.decode('utf8')
43 | doc = lxml.html.fromstring(html)
44 | xp = '//ul[@id="postlist"]/li'
45 | lis = doc.xpath(xp)
46 | print('lis:', len(lis))
47 |
48 | articles = [parse(li) for li in lis]
49 | print('articles:', len(articles))
50 | pprint(articles[0])
51 |
52 |
53 | if __name__ == '__main__':
54 | main()
55 |
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/selenium-login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Author: veelion
3 |
4 | import time
5 | import pickle
6 | import requests
7 | from selenium import webdriver
8 | from selenium.webdriver.common.keys import Keys
9 |
10 |
11 | def save_cookies(cookies, file_to_save):
12 | with open(file_to_save, 'wb') as f:
13 | pickle.dump(cookies, f)
14 |
15 |
16 | def login_auto(login_url, username, password,
17 | username_xpath, password_xpath,
18 | submit_xpath, cookies_file, browser=None):
19 | if browser is None:
20 | options = webdriver.ChromeOptions()
21 | options.add_argument('headless')
22 | options.add_argument('window-size=1200x600')
23 | browser = webdriver.Chrome(chrome_options=options)
24 | browser.maximize_window()
25 | browser.get(login_url)
26 |     time.sleep(9)  # wait for the login page to finish loading
27 | browser.find_element_by_xpath(username_xpath).send_keys(username)
28 | browser.find_element_by_xpath(password_xpath).send_keys(password)
29 | browser.find_element_by_xpath(submit_xpath).send_keys(Keys.ENTER)
30 |     time.sleep(9)  # wait for the login to complete
31 | cookies = browser.get_cookies()
32 | print(cookies)
33 | save_cookies(cookies, cookies_file)
34 |
35 |
36 | def login_manually(login_url, cookies_file, browser=None):
37 |     # since this is a manual login, the username and password are not auto-filled here
38 | if browser is None:
39 | browser = webdriver.Chrome()
40 | browser.get(login_url)
41 |     time.sleep(30)  # give yourself enough time to type the username, password and captcha
42 | cookies = browser.get_cookies()
43 | print(cookies)
44 | save_cookies(cookies, cookies_file)
45 |
46 |
47 | def load_to_browser(cookies_file, browser=None):
48 | with open(cookies_file, 'rb') as f:
49 | cookies = pickle.load(f)
50 | if browser is None:
51 | browser = webdriver.Chrome()
52 | for cookie in cookies:
53 | browser.add_cookie(cookie)
54 | return browser
55 |
56 |
57 | def load_to_requests(cookies_file, session=None):
58 | with open(cookies_file, 'rb') as f:
59 | cookies = pickle.load(f)
60 | if session is None:
61 | session = requests.Session()
62 | for cookie in cookies:
63 |         session.cookies.set(cookie['name'], cookie['value'])
64 |     return session
65 |
66 | if __name__ == '__main__':
67 | from sys import argv
68 | if argv[1] == 'manually':
69 | # login_url = 'https://passport.bilibili.com/login'
70 | login_url = 'https://www.zhihu.com/signin'
71 | login_manually(login_url, 'z-.cookies')
72 | elif argv[1] == 'auto':
73 | login_url = 'https://weibo.com/'
74 | username_xpath = '//input[@id="loginname"]'
75 | password_xpath = '//input[@name="password"]'
76 | submit_xpath = '//a[@action-type="btn_submit"]'
77 | username = 'your-username'
78 | password = 'your-password'
79 | login_auto(login_url, username, password, username_xpath, password_xpath, submit_xpath, 'z-weibo.cookies')
80 | else:
81 | print('invalid option')
82 |
83 |
--------------------------------------------------------------------------------
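
A short usage sketch for the helpers above: log in by hand once, then reuse the
saved cookies in a plain requests session. The import assumes the file has been
renamed to selenium_login.py so it is importable (the hyphenated name cannot be
imported directly), and the site URL and cookie file name are only examples:

    from selenium_login import login_manually, load_to_requests

    cookies_file = 'zhihu.cookies'  # example file name
    login_manually('https://www.zhihu.com/signin', cookies_file)

    session = load_to_requests(cookies_file)  # a requests.Session carrying the cookies
    resp = session.get('https://www.zhihu.com/', timeout=10)
    print(resp.status_code, len(resp.text))
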
/news-crawler/news-crawler-sync.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Author: veelion
3 |
4 | import urllib.parse as urlparse
5 | import lzma
6 | import farmhash
7 | import traceback
8 |
9 |
10 | from ezpymysql import Connection
11 | from urlpool import UrlPool
12 | import functions as fn
13 | import config
14 |
15 | class NewsCrawlerSync:
16 | def __init__(self, name):
17 | self.db = Connection(
18 | config.db_host,
19 | config.db_db,
20 | config.db_user,
21 | config.db_password
22 | )
23 | self.logger = fn.init_file_logger(name + '.log')
24 | self.urlpool = UrlPool(name)
25 | self.hub_hosts = None
26 | self.load_hubs()
27 |
28 | def load_hubs(self,):
29 | sql = 'select url from crawler_hub'
30 | data = self.db.query(sql)
31 | self.hub_hosts = set()
32 | hubs = []
33 | for d in data:
34 | host = urlparse.urlparse(d['url']).netloc
35 | self.hub_hosts.add(host)
36 | hubs.append(d['url'])
37 | self.urlpool.set_hubs(hubs, 300)
38 |
39 | def save_to_db(self, url, html):
40 | urlhash = farmhash.hash64(url)
41 | sql = 'select url from crawler_html where urlhash=%s'
42 | d = self.db.get(sql, urlhash)
43 | if d:
44 | if d['url'] != url:
45 | msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
46 | self.logger.error(msg)
47 | return True
48 | if isinstance(html, str):
49 | html = html.encode('utf8')
50 | html_lzma = lzma.compress(html)
51 | sql = ('insert into crawler_html(urlhash, url, html_lzma) '
52 | 'values(%s, %s, %s)')
53 | good = False
54 | try:
55 | self.db.execute(sql, urlhash, url, html_lzma)
56 | good = True
57 | except Exception as e:
58 | if e.args[0] == 1062:
59 | # Duplicate entry
60 | good = True
61 | pass
62 | else:
63 | traceback.print_exc()
64 | raise e
65 | return good
66 |
67 | def filter_good(self, urls):
68 | goodlinks = []
69 | for url in urls:
70 | host = urlparse.urlparse(url).netloc
71 | if host in self.hub_hosts:
72 | goodlinks.append(url)
73 | return goodlinks
74 |
75 | def process(self, url, ishub):
76 | status, html, redirected_url = fn.downloader(url)
77 | self.urlpool.set_status(url, status)
78 | if redirected_url != url:
79 | self.urlpool.set_status(redirected_url, status)
80 |         # extract links from the hub page; news pages also carry "related news" links, extract them as needed
81 | if status != 200:
82 | return
83 | if ishub:
84 | newlinks = fn.extract_links_re(redirected_url, html)
85 | goodlinks = self.filter_good(newlinks)
86 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks)))
87 | self.urlpool.addmany(goodlinks)
88 | else:
89 | self.save_to_db(redirected_url, html)
90 |
91 | def run(self,):
92 | while 1:
93 | urls = self.urlpool.pop(5)
94 | for url, ishub in urls.items():
95 | self.process(url, ishub)
96 |
97 |
98 | if __name__ == '__main__':
99 |     crawler = NewsCrawlerSync('yuanrenxue')
100 | crawler.run()
101 |
--------------------------------------------------------------------------------
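
Both news-crawler-sync.py and news-crawler-async.py read hub URLs from a
crawler_hub table and store lzma-compressed pages in crawler_html, but no
schema ships with the repo. Below is a plausible definition inferred from the
queries above; the column types and sizes are assumptions, not taken from the
source:

    import config
    from ezpymysql import Connection

    db = Connection(config.db_host, config.db_db, config.db_user, config.db_password)
    db.execute(
        'CREATE TABLE IF NOT EXISTS crawler_hub ('
        ' id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,'
        ' url VARCHAR(1024) NOT NULL)'
    )
    db.execute(
        'CREATE TABLE IF NOT EXISTS crawler_html ('
        ' urlhash BIGINT UNSIGNED PRIMARY KEY,'  # farmhash.hash64() of the URL
        ' url VARCHAR(1024) NOT NULL,'
        ' html_lzma LONGBLOB NOT NULL)'
    )
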
/news-crawler/news-crawler-async.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # File: news-crawler-async.py
3 | # Author: veelion
4 |
5 | import traceback
6 | import time
7 | import asyncio
8 | import aiohttp
9 | import urllib.parse as urlparse
10 | import farmhash
11 | import lzma
12 |
13 | import uvloop
14 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
15 |
16 | import sanicdb
17 |
18 | from urlpool import UrlPool
19 | import functions as fn
20 | import config
21 |
22 |
23 | class NewsCrawlerAsync:
24 | def __init__(self, name):
25 | self._workers = 0
26 | self._workers_max = 30
27 | self.logger = fn.init_file_logger(name+ '.log')
28 |
29 | self.urlpool = UrlPool(name)
30 |
31 | self.loop = asyncio.get_event_loop()
32 | self.session = aiohttp.ClientSession(loop=self.loop)
33 | self.db = sanicdb.SanicDB(
34 | config.db_host,
35 | config.db_db,
36 | config.db_user,
37 | config.db_password,
38 | loop=self.loop
39 | )
40 |
41 | async def load_hubs(self,):
42 | sql = 'select url from crawler_hub'
43 | data = await self.db.query(sql)
44 | self.hub_hosts = set()
45 | hubs = []
46 | for d in data:
47 | host = urlparse.urlparse(d['url']).netloc
48 | self.hub_hosts.add(host)
49 | hubs.append(d['url'])
50 | self.urlpool.set_hubs(hubs, 300)
51 |
52 | async def save_to_db(self, url, html):
53 | urlhash = farmhash.hash64(url)
54 | sql = 'select url from crawler_html where urlhash=%s'
55 | d = await self.db.get(sql, urlhash)
56 | if d:
57 | if d['url'] != url:
58 | msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
59 | self.logger.error(msg)
60 | return True
61 | if isinstance(html, str):
62 | html = html.encode('utf8')
63 | html_lzma = lzma.compress(html)
64 | sql = ('insert into crawler_html(urlhash, url, html_lzma) '
65 | 'values(%s, %s, %s)')
66 | good = False
67 | try:
68 | await self.db.execute(sql, urlhash, url, html_lzma)
69 | good = True
70 | except Exception as e:
71 | if e.args[0] == 1062:
72 | # Duplicate entry
73 | good = True
74 | pass
75 | else:
76 | traceback.print_exc()
77 | raise e
78 | return good
79 |
80 | def filter_good(self, urls):
81 | goodlinks = []
82 | for url in urls:
83 | host = urlparse.urlparse(url).netloc
84 | if host in self.hub_hosts:
85 | goodlinks.append(url)
86 | return goodlinks
87 |
88 | async def process(self, url, ishub):
89 | status, html, redirected_url = await fn.fetch(self.session, url)
90 | self.urlpool.set_status(url, status)
91 | if redirected_url != url:
92 | self.urlpool.set_status(redirected_url, status)
93 |         # extract links from the hub page; news pages also carry "related news" links, extract them as needed
94 | if status != 200:
95 | self._workers -= 1
96 | return
97 | if ishub:
98 | newlinks = fn.extract_links_re(redirected_url, html)
99 | goodlinks = self.filter_good(newlinks)
100 | print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks)))
101 | self.urlpool.addmany(goodlinks)
102 | else:
103 | await self.save_to_db(redirected_url, html)
104 | self._workers -= 1
105 |
106 | async def loop_crawl(self,):
107 | await self.load_hubs()
108 | last_rating_time = time.time()
109 | counter = 0
110 | while 1:
111 | tasks = self.urlpool.pop(self._workers_max)
112 | if not tasks:
113 | print('no url to crawl, sleep')
114 | await asyncio.sleep(3)
115 | continue
116 | for url, ishub in tasks.items():
117 | self._workers += 1
118 | counter += 1
119 | print('crawl:', url)
120 | asyncio.ensure_future(self.process(url, ishub))
121 |
122 | gap = time.time() - last_rating_time
123 | if gap > 5:
124 | rate = counter / gap
125 | print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (round(rate, 2), counter, self._workers))
126 | last_rating_time = time.time()
127 | counter = 0
128 | if self._workers > self._workers_max:
129 | print('====== got workers_max, sleep 3 sec to next worker =====')
130 | await asyncio.sleep(3)
131 |
132 | def run(self):
133 | try:
134 | self.loop.run_until_complete(self.loop_crawl())
135 | except KeyboardInterrupt:
136 |             print('stopped by user!')
137 | del self.urlpool
138 | pass
139 |
140 |
141 |
142 | if __name__ == '__main__':
143 | nc = NewsCrawlerAsync('yrx-async')
144 | nc.run()
145 |
146 |
147 |
--------------------------------------------------------------------------------
/news-crawler/functions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Author: veelion
3 |
4 |
5 | import re
6 | import urllib.parse as urlparse
7 | import requests
8 | import cchardet
9 | import traceback
10 |
11 |
12 | async def fetch(session, url, headers=None, timeout=9, binary=False):
13 | _headers = {
14 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
15 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'),
16 | }
17 | if headers:
18 | _headers = headers
19 | try:
20 | async with session.get(url, headers=_headers, timeout=timeout) as response:
21 | status = response.status
22 | html = await response.read()
23 | if not binary:
24 | encoding = cchardet.detect(html)['encoding']
25 | html = html.decode(encoding, errors='ignore')
26 | redirected_url = str(response.url)
27 | except Exception as e:
28 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e))
29 | print(msg)
30 | html = ''
31 | status = 0
32 | redirected_url = url
33 | return status, html, redirected_url
34 |
35 |
36 | def downloader(url, timeout=10, headers=None, debug=False, binary=False):
37 | _headers = {
38 | 'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
39 | 'Windows NT 6.1; Win64; x64; Trident/5.0)'),
40 | }
41 | redirected_url = url
42 | if headers:
43 | _headers = headers
44 | try:
45 | r = requests.get(url, headers=_headers, timeout=timeout)
46 | if binary:
47 | html = r.content
48 | else:
49 | encoding = cchardet.detect(r.content)['encoding']
50 | html = r.content.decode(encoding, errors='ignore')
51 | status = r.status_code
52 | redirected_url = r.url
53 | except:
54 | if debug:
55 | traceback.print_exc()
56 | msg = 'failed download: {}'.format(url)
57 | print(msg)
58 | if binary:
59 | html = b''
60 | else:
61 | html = ''
62 | status = 0
63 | return status, html, redirected_url
64 |
65 |
66 | g_bin_postfix = set([
67 | 'exe', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
68 | 'pdf',
69 | 'jpg', 'png', 'bmp', 'jpeg', 'gif',
70 | 'zip', 'rar', 'tar', 'bz2', '7z', 'gz',
71 | 'flv', 'mp4', 'avi', 'wmv', 'mkv',
72 | 'apk',
73 | ])
74 |
75 | g_news_postfix = [
76 | '.html?', '.htm?', '.shtml?',
77 | '.shtm?',
78 | ]
79 |
80 |
81 | def clean_url(url):
82 | # 1. 是否为合法的http url
83 | if not url.startswith('http'):
84 | return ''
85 | # 2. 去掉静态化url后面的参数
86 | for np in g_news_postfix:
87 | p = url.find(np)
88 | if p > -1:
89 | p = url.find('?')
90 | url = url[:p]
91 | return url
92 | # 3. 不下载二进制类内容的链接
93 | up = urlparse.urlparse(url)
94 | path = up.path
95 | if not path:
96 | path = '/'
97 | postfix = path.split('.')[-1].lower()
98 | if postfix in g_bin_postfix:
99 | return ''
100 |
101 | # 4. 去掉标识流量来源的参数
102 | # badquery = ['spm', 'utm_source', 'utm_source', 'utm_medium', 'utm_campaign']
103 | good_queries = []
104 | for query in up.query.split('&'):
105 | qv = query.split('=')
106 | if qv[0].startswith('spm') or qv[0].startswith('utm_'):
107 | continue
108 | if len(qv) == 1:
109 | continue
110 | good_queries.append(query)
111 | query = '&'.join(good_queries)
112 | url = urlparse.urlunparse((
113 | up.scheme,
114 | up.netloc,
115 | path,
116 | up.params,
117 | query,
118 |         ''  # the crawler does not care about the fragment
119 | ))
120 | return url
121 |
122 |
123 | g_pattern_tag_a = re.compile(r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>', re.I|re.S|re.M)
124 |
125 |
126 | def extract_links_re(url, html):
127 | '''use re module to extract links from html'''
128 | newlinks = set()
129 | aa = g_pattern_tag_a.findall(html)
130 | for a in aa:
131 | link = a[0].strip()
132 | if not link:
133 | continue
134 | link = urlparse.urljoin(url, link)
135 | link = clean_url(link)
136 | if not link:
137 | continue
138 | newlinks.add(link)
139 | return newlinks
140 |
141 |
142 | def init_file_logger(fname):
143 | # config logging
144 | import logging
145 | from logging.handlers import TimedRotatingFileHandler
146 | ch = TimedRotatingFileHandler(fname, when="midnight")
147 | ch.setLevel(logging.INFO)
148 | # create formatter
149 | fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
150 | formatter = logging.Formatter(fmt)
151 | # add formatter to ch
152 | ch.setFormatter(formatter)
153 | logger = logging.getLogger(fname)
154 | # add ch to logger
155 | logger.addHandler(ch)
156 | return logger
157 |
158 |
159 |
160 | if __name__ == '__main__':
161 | url = 'http://news.baidu.com/'
162 |     s, html, redirected_url = downloader(url)
163 | print(s, len(html))
164 |
--------------------------------------------------------------------------------
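
Since clean_url() encodes most of the link-filtering policy, here is a quick
illustration of what it keeps and drops (the URLs are made-up examples):

    import functions as fn

    # parameters after a static-page suffix are stripped entirely
    print(fn.clean_url('https://news.example.com/a/b.html?from=timeline'))
    # -> https://news.example.com/a/b.html

    # tracking parameters (spm*, utm_*) are removed, other key=value pairs kept
    print(fn.clean_url('https://example.com/news?id=3&utm_source=weibo'))
    # -> https://example.com/news?id=3

    # links to binary content are rejected
    print(fn.clean_url('https://example.com/report.pdf'))
    # -> (empty string)
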
/weibologin.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Author: veelion
3 |
4 |
5 | import re
6 | import pickle
7 | import json
8 | import base64
9 | import binascii
10 | import rsa
11 | import requests
12 | import urllib
13 | import time
14 | import traceback
15 |
16 |
17 |
18 | class WeiboLogin:
19 | user_agent = (
20 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) '
21 | 'Chrome/20.0.1132.57 Safari/536.11'
22 | )
23 |
24 | def __init__(self, username, password, cookies_tosave='weibo.cookies'):
25 | self.weibo_user = username
26 | self.weibo_password = password
27 | self.cookies_tosave = cookies_tosave
28 | self.session = requests.session()
29 | self.session.headers['User-Agent'] = self.user_agent
30 |
31 | def encrypt_user(self, username):
32 | user = urllib.parse.quote(username)
33 | su = base64.b64encode(user.encode())
34 | return su
35 |
36 | def encrypt_passwd(self, passwd, pubkey, servertime, nonce):
37 | key = rsa.PublicKey(int(pubkey, 16), int('10001', 16))
38 | message = str(servertime) + '\t' + str(nonce) + '\n' + str(passwd)
39 | passwd = rsa.encrypt(message.encode('utf-8'), key)
40 | return binascii.b2a_hex(passwd)
41 |
42 | def prelogin(self):
43 | preloginTimeStart = int(time.time()*1000)
44 | url = ('https://login.sina.com.cn/sso/prelogin.php?'
45 | 'entry=weibo&callback=sinaSSOController.preloginCallBack&'
46 | 'su=&rsakt=mod&client=ssologin.js(v1.4.19)&'
47 | '_=%s') % preloginTimeStart
48 | resp = self.session.get(url)
49 | pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1)
50 | pre_login = json.loads(pre_login_str)
51 | pre_login['preloginTimeStart'] = preloginTimeStart
52 |         print('pre_login 1:', pre_login)
53 | return pre_login
54 |
55 | def get_prelt(self, pre_login):
56 | prelt = int(time.time() * 1000) - pre_login['preloginTimeStart'] - pre_login['exectime']
57 | return prelt
58 |
59 | def login(self):
60 | # step-1. prelogin
61 | pre_login = self.prelogin()
62 | su = self.encrypt_user(self.weibo_user)
63 | sp = self.encrypt_passwd(
64 | self.weibo_password,
65 | pre_login['pubkey'],
66 | pre_login['servertime'],
67 | pre_login['nonce']
68 | )
69 | prelt = self.get_prelt(pre_login)
70 |
71 | data = {
72 | 'entry': 'weibo',
73 | 'gateway': 1,
74 | 'from': '',
75 | 'savestate': 7,
76 | 'qrcode_flag': 'false',
77 | 'userticket': 1,
78 | 'pagerefer': '',
79 | 'vsnf': 1,
80 | 'su': su,
81 | 'service': 'miniblog',
82 | 'servertime': pre_login['servertime'],
83 | 'nonce': pre_login['nonce'],
84 |             # ('vsnf' is already set above)
85 | 'pwencode': 'rsa2',
86 | 'sp': sp,
87 | 'rsakv' : pre_login['rsakv'],
88 | 'encoding': 'UTF-8',
89 | 'prelt': prelt,
90 | 'sr': "1280*800",
91 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.'
92 | 'sinaSSOController.feedBackUrlCallBack',
93 | 'returntype': 'META'
94 | }
95 |
96 | # step-2 login POST
97 | login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
98 | resp = self.session.post(login_url, data=data)
99 | print(resp.headers)
100 | print(resp.content)
101 | print('Step-2 response:', resp.text)
102 |
103 | # step-3 follow redirect
104 | redirect_url = re.findall(r'location\.replace\("(.*?)"', resp.text)[0]
105 | print('Step-3 to redirect:', redirect_url)
106 | resp = self.session.get(redirect_url)
107 | print('Step-3 response:', resp.text)
108 |
109 | # step-4 process step-3's response
110 | arrURL = re.findall(r'"arrURL":(.*?)\}', resp.text)[0]
111 | arrURL = json.loads(arrURL)
112 | print('CrossDomainUrl:', arrURL)
113 | for url in arrURL:
114 | print('set CrossDomainUrl:', url)
115 | resp_cross = self.session.get(url)
116 | print(resp_cross.text)
117 | redirect_url = re.findall(r'location\.replace\(\'(.*?)\'', resp.text)[0]
118 | print('Step-4 redirect_url:', redirect_url)
119 | resp = self.session.get(redirect_url)
120 | print(resp.text)
121 | with open(self.cookies_tosave, 'wb') as f:
122 | pickle.dump(self.session.cookies, f)
123 | return True
124 |
125 | def fetch(self, url):
126 | try:
127 | resp = self.session.get(url, timeout=10)
128 | return resp
129 | except:
130 | traceback.print_exc()
131 | return None
132 |
133 | if __name__ == '__main__':
134 | weibo_user = 'your-weibo-username'
135 | weibo_password = 'your-weibo-password'
136 | wb = WeiboLogin(weibo_user, weibo_password)
137 | wb.login()
138 | r = wb.fetch('https://weibo.com/')
139 | print(r.encoding)
140 | print(len(r.text))
141 |
--------------------------------------------------------------------------------
/news-crawler/bee_client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: UTF-8
3 | # author: veelion
4 | # file: bee_client.py
5 |
6 | import re
7 | import cchardet
8 | import traceback
9 | import time
10 | import json
11 | import asyncio
12 | import urllib.parse as urlparse
13 | import aiohttp
14 | import uvloop
15 |
16 |
17 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
18 |
19 |
20 |
21 | p_tag_a = re.compile(
22 |     r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>',
23 | re.I|re.S|re.M)
24 |
25 |
26 | def extract_links_re(url, html):
27 | newlinks = set()
28 | aa = p_tag_a.findall(html)
29 | for a in aa:
30 | link = a[0].strip()
31 | if not link:
32 | continue
33 | link = urlparse.urljoin(url, link)
34 | if not link.startswith('http'):
35 | continue
36 | newlinks.add(link)
37 | return newlinks
38 |
39 |
40 |
41 | class CrawlerClient:
42 | def __init__(self, ):
43 | self._workers = 0
44 | self.workers_max = 10
45 | self.server_host = 'localhost'
46 | self.server_port = 8080
47 | self.headers = {'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
48 | 'Windows NT 6.1; Win64; x64; Trident/5.0)')}
49 |
50 | self.loop = asyncio.get_event_loop()
51 | self.queue = asyncio.Queue(loop=self.loop)
52 | self.session = aiohttp.ClientSession(loop=self.loop)
53 |
54 | async def download(self, url, timeout=25):
55 | status_code = 900
56 | html = ''
57 | url_now = url
58 | try:
59 | async with self.session.get(url_now, headers=self.headers, timeout=timeout) as response:
60 | status_code = response.status
61 | html = await response.read()
62 | encoding = cchardet.detect(html)['encoding']
63 | html = html.decode(encoding, errors='ignore')
64 | url_now = str(response.url)
65 | except Exception as e:
66 | # traceback.print_exc()
67 | print('=== exception: ', e, type(e), str(e))
68 | msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e))
69 | print(msg)
70 | return status_code, html, url_now
71 |
72 | async def get_urls(self,):
73 | count = self.workers_max - self.queue.qsize()
74 | if count <= 0:
75 | print('no need to get urls this time')
76 | return None
77 | url = 'http://%s:%s/task?count=%s' % (
78 | self.server_host,
79 | self.server_port,
80 | count
81 | )
82 | try:
83 | async with self.session.get(url, timeout=3) as response:
84 | if response.status not in [200, 201]:
85 | return
86 | jsn = await response.text()
87 | urls = json.loads(jsn)
88 | msg = ('get_urls() to get [%s] but got[%s], @%s') % (
89 | count, len(urls),
90 | time.strftime('%Y-%m-%d %H:%M:%S'))
91 | print(msg)
92 | for kv in urls.items():
93 | await self.queue.put(kv)
94 | print('queue size:', self.queue.qsize(), ', _workers:', self._workers)
95 | except:
96 | traceback.print_exc()
97 | return
98 |
99 | async def send_result(self, result):
100 | url = 'http://%s:%s/task' % (
101 | self.server_host,
102 | self.server_port
103 | )
104 | try:
105 | async with self.session.post(url, json=result, timeout=3) as response:
106 | return response.status
107 | except:
108 | traceback.print_exc()
109 | pass
110 |
111 | def save_html(self, url, html):
112 | print('saved:', url, len(html))
113 |
114 | def filter_good(self, urls):
115 |         '''filter the extracted URLs according to the crawl's purpose; keep only the ones you want'''
116 | good = []
117 | for url in urls:
118 | if url.startswith('http'):
119 | good.append(url)
120 | return good
121 |
122 | async def process(self, url, ishub):
123 | status, html, url_now = await self.download(url)
124 | self._workers -= 1
125 | print('downloaded:', url, ', html:', len(html))
126 | if html:
127 | newurls = extract_links_re(url, html)
128 | newurls = self.filter_good(newurls)
129 | self.save_html(url, html)
130 | else:
131 | newurls = []
132 | result = {
133 | 'url': url,
134 | 'url_real': url_now,
135 | 'status': status,
136 | 'newurls': newurls,
137 | }
138 | await self.send_result(result)
139 |
140 | async def loop_get_urls(self,):
141 | print('loop_get_urls() start')
142 | while 1:
143 | await self.get_urls()
144 | await asyncio.sleep(1)
145 |
146 | async def loop_crawl(self,):
147 | print('loop_crawl() start')
148 | asyncio.ensure_future(self.loop_get_urls())
149 | counter = 0
150 | while 1:
151 | item = await self.queue.get()
152 | url, url_level = item
153 | self._workers += 1
154 | counter += 1
155 | asyncio.ensure_future(self.process(url, url_level))
156 |
157 | if self._workers > self.workers_max:
158 | print('====== got workers_max, sleep 3 sec to next worker =====')
159 | await asyncio.sleep(3)
160 |
161 | def start(self):
162 | try:
163 | self.loop.run_until_complete(self.loop_crawl())
164 | except KeyboardInterrupt:
165 |             print('stopped by user!')
166 | pass
167 |
168 |
169 | def run():
170 | ant = CrawlerClient()
171 | ant.start()
172 |
173 |
174 | if __name__ == '__main__':
175 | run()
176 |
177 |
--------------------------------------------------------------------------------
/news-crawler/ezpymysql.py:
--------------------------------------------------------------------------------
1 | #file: ezpymysql.py
2 | #Author: veelion
3 |
4 | """A lightweight wrapper around PyMySQL.
5 | only for python3
6 |
7 | """
8 |
9 | import time
10 | import logging
11 | import traceback
12 | import pymysql
13 | import pymysql.cursors
14 |
15 | version = "0.7"
16 | version_info = (0, 7, 0, 0)
17 |
18 |
19 | class Connection(object):
20 | """A lightweight wrapper around PyMySQL.
21 | """
22 | def __init__(self, host, database, user=None, password=None,
23 | port=0,
24 | max_idle_time=7 * 3600, connect_timeout=10,
25 | time_zone="+0:00", charset = "utf8mb4", sql_mode="TRADITIONAL"):
26 | self.host = host
27 | self.database = database
28 | self.max_idle_time = float(max_idle_time)
29 |
30 | args = dict(use_unicode=True, charset=charset,
31 | database=database,
32 | init_command=('SET time_zone = "%s"' % time_zone),
33 | cursorclass=pymysql.cursors.DictCursor,
34 | connect_timeout=connect_timeout, sql_mode=sql_mode)
35 | if user is not None:
36 | args["user"] = user
37 | if password is not None:
38 | args["passwd"] = password
39 |
40 | # We accept a path to a MySQL socket file or a host(:port) string
41 | if "/" in host:
42 | args["unix_socket"] = host
43 | else:
44 | self.socket = None
45 | pair = host.split(":")
46 | if len(pair) == 2:
47 | args["host"] = pair[0]
48 | args["port"] = int(pair[1])
49 | else:
50 | args["host"] = host
51 | args["port"] = 3306
52 | if port:
53 | args['port'] = port
54 |
55 | self._db = None
56 | self._db_args = args
57 | self._last_use_time = time.time()
58 | try:
59 | self.reconnect()
60 | except Exception:
61 | logging.error("Cannot connect to MySQL on %s", self.host,
62 | exc_info=True)
63 |
64 | def _ensure_connected(self):
65 | # Mysql by default closes client connections that are idle for
66 | # 8 hours, but the client library does not report this fact until
67 | # you try to perform a query and it fails. Protect against this
68 | # case by preemptively closing and reopening the connection
69 | # if it has been idle for too long (7 hours by default).
70 | if (self._db is None or
71 | (time.time() - self._last_use_time > self.max_idle_time)):
72 | self.reconnect()
73 | self._last_use_time = time.time()
74 |
75 | def _cursor(self):
76 | self._ensure_connected()
77 | return self._db.cursor()
78 |
79 | def __del__(self):
80 | self.close()
81 |
82 | def close(self):
83 | """Closes this database connection."""
84 | if getattr(self, "_db", None) is not None:
85 | self._db.close()
86 | self._db = None
87 |
88 | def reconnect(self):
89 | """Closes the existing database connection and re-opens it."""
90 | self.close()
91 | self._db = pymysql.connect(**self._db_args)
92 | self._db.autocommit(True)
93 |
94 | def query(self, query, *parameters, **kwparameters):
95 | """Returns a row list for the given query and parameters."""
96 | cursor = self._cursor()
97 | try:
98 | cursor.execute(query, kwparameters or parameters)
99 | result = cursor.fetchall()
100 | return result
101 | finally:
102 | cursor.close()
103 |
104 | def get(self, query, *parameters, **kwparameters):
105 | """Returns the (singular) row returned by the given query.
106 | """
107 | cursor = self._cursor()
108 | try:
109 | cursor.execute(query, kwparameters or parameters)
110 | return cursor.fetchone()
111 | finally:
112 | cursor.close()
113 |
114 | def execute(self, query, *parameters, **kwparameters):
115 | """Executes the given query, returning the lastrowid from the query."""
116 | cursor = self._cursor()
117 | try:
118 | cursor.execute(query, kwparameters or parameters)
119 | return cursor.lastrowid
120 | except Exception as e:
121 | if e.args[0] == 1062:
122 | pass
123 | else:
124 | traceback.print_exc()
125 | raise e
126 | finally:
127 | cursor.close()
128 |
129 | insert = execute
130 |
131 | ## =============== high level method for table ===================
132 |
133 | def table_has(self, table_name, field, value):
134 |         # NOTE: value is interpolated directly into the SQL string below,
135 |         # so it must remain a str (bytes would render as "b'...'" under Python 3)
136 | sql = 'SELECT %s FROM %s WHERE %s="%s"' % (
137 | field,
138 | table_name,
139 | field,
140 | value)
141 | d = self.get(sql)
142 | return d
143 |
144 | def table_insert(self, table_name, item):
145 | '''item is a dict : key is mysql table field'''
146 | fields = list(item.keys())
147 | values = list(item.values())
148 | fieldstr = ','.join(fields)
149 | valstr = ','.join(['%s'] * len(item))
150 | for i in range(len(values)):
151 | if isinstance(values[i], str):
152 | values[i] = values[i].encode('utf8')
153 | sql = 'INSERT INTO %s (%s) VALUES(%s)' % (table_name, fieldstr, valstr)
154 | try:
155 | last_id = self.execute(sql, *values)
156 | return last_id
157 | except Exception as e:
158 | if e.args[0] == 1062:
159 | # just skip duplicated item
160 | pass
161 | else:
162 | traceback.print_exc()
163 | print('sql:', sql)
164 | print('item:')
165 | for i in range(len(fields)):
166 | vs = str(values[i])
167 | if len(vs) > 300:
168 | print(fields[i], ' : ', len(vs), type(values[i]))
169 | else:
170 | print(fields[i], ' : ', vs, type(values[i]))
171 | raise e
172 |
173 | def table_update(self, table_name, updates,
174 | field_where, value_where):
175 | '''updates is a dict of {field_update:value_update}'''
176 | upsets = []
177 | values = []
178 | for k, v in updates.items():
179 | s = '%s=%%s' % k
180 | upsets.append(s)
181 | values.append(v)
182 | upsets = ','.join(upsets)
183 | sql = 'UPDATE %s SET %s WHERE %s="%s"' % (
184 | table_name,
185 | upsets,
186 | field_where, value_where,
187 | )
188 | self.execute(sql, *(values))
189 |
190 |
191 |
192 | if __name__ == '__main__':
193 | db = Connection(
194 | 'localhost',
195 | 'db_name',
196 | 'user',
197 | 'password'
198 | )
199 |     # fetch a single row
200 | sql = 'select * from test_table where id=%s'
201 | data = db.get(sql, 2)
202 |
203 |     # fetch multiple rows
204 | sql = 'select * from test_table where id>%s'
205 | data = db.query(sql, 2)
206 |
207 |     # insert one row
208 | sql = 'insert into test_table(title, url) values(%s, %s)'
209 | last_id = db.execute(sql, 'test', 'http://a.com/')
210 |     # or, equivalently
211 | last_id = db.insert(sql, 'test', 'http://a.com/')
212 |
213 |
214 |     # insert one row using the higher-level helper
215 | item = {
216 | 'title': 'test',
217 | 'url': 'http://a.com/',
218 | }
219 | last_id = db.table_insert('test_table', item)
220 |
221 |
--------------------------------------------------------------------------------
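
The __main__ block above does not exercise table_has() or table_update(); here
is a short sketch of those two helpers, reusing the same placeholder database,
table and columns (adjust them to your own schema):

    from ezpymysql import Connection

    db = Connection('localhost', 'db_name', 'user', 'password')

    # is there a row whose url equals 'http://a.com/'?
    row = db.table_has('test_table', 'url', 'http://a.com/')
    print(bool(row))

    # update fields of the matching row(s)
    db.table_update('test_table', {'title': 'new title'}, 'url', 'http://a.com/')
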
/news-crawler/urlpool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Author: veelion
3 |
4 | """
5 | URL Pool for crawler to manage URLs
6 | """
7 |
8 | import pickle
9 | import leveldb
10 | import time
11 | import urllib.parse as urlparse
12 |
13 |
14 | RED = '\x1b[31m'
15 | GRE = '\x1b[32m'
16 | BRO = '\x1b[33m'
17 | BLU = '\x1b[34m'
18 | PUR = '\x1b[35m'
19 | CYA = '\x1b[36m'
20 | WHI = '\x1b[37m'
21 | NOR = '\x1b[0m'
22 |
23 |
24 | class UrlDB:
25 |     '''Use LevelDB to store URLs that have already been processed (succeeded or failed)
26 | '''
27 | status_failure = b'0'
28 | status_success = b'1'
29 |
30 | def __init__(self, db_name):
31 | self.name = db_name + '.urldb'
32 | self.db = leveldb.LevelDB(self.name)
33 |
34 | def set_success(self, url):
35 | if isinstance(url, str):
36 | url = url.encode('utf8')
37 | try:
38 | self.db.Put(url, self.status_success)
39 | s = True
40 | except:
41 | s = False
42 | return s
43 |
44 | def set_failure(self, url):
45 | if isinstance(url, str):
46 | url = url.encode('utf8')
47 | try:
48 | self.db.Put(url, self.status_failure)
49 | s = True
50 | except:
51 | s = False
52 | return s
53 |
54 | def has(self, url):
55 | if isinstance(url, str):
56 | url = url.encode('utf8')
57 | try:
58 | attr = self.db.Get(url)
59 | return attr
60 | except:
61 | pass
62 | return False
63 |
64 |
65 | class UrlPool:
66 | '''URL Pool for crawler to manage URLs
67 | '''
68 |
69 | def __init__(self, pool_name):
70 | self.name = pool_name
71 | self.db = UrlDB(pool_name)
72 |
73 |         self.waiting = {}  # {host: set([urls]), } URLs waiting to be downloaded, grouped by host
74 |         self.pending = {}  # {url: pended_time, } URLs already popped (via self.pop()) but whose status has not been updated yet (i.e. being downloaded)
75 |         self.failure = {}  # {url: times, } failure count per URL
76 |         self.failure_threshold = 3
77 |         self.pending_threshold = 10  # max seconds a URL may stay pending before it is re-downloaded
78 |         self.waiting_count = 0  # number of URLs in the self.waiting dict
79 |         self.max_hosts = ['', 0]  # [host, url_count]: the host currently holding the most URLs in the pool, and its count
80 |         self.hub_pool = {}  # {url: last_query_time, } hub URLs
81 | self.hub_refresh_span = 0
82 | self.load_cache()
83 |
84 | def __del__(self):
85 | self.dump_cache()
86 |
87 | def load_cache(self,):
88 | path = self.name + '.pkl'
89 | try:
90 | with open(path, 'rb') as f:
91 | self.waiting = pickle.load(f)
92 | cc = [len(v) for k, v in self.waiting.items()]
93 | print('saved pool loaded! urls:', sum(cc))
94 | except:
95 | pass
96 |
97 | def dump_cache(self):
98 | path = self.name + '.pkl'
99 | try:
100 | with open(path, 'wb') as f:
101 | pickle.dump(self.waiting, f)
102 | print('self.waiting saved!')
103 | except:
104 | pass
105 |
106 | def set_hubs(self, urls, hub_refresh_span):
107 | self.hub_refresh_span = hub_refresh_span
108 | self.hub_pool = {}
109 | for url in urls:
110 | self.hub_pool[url] = 0
111 |
112 | def set_status(self, url, status_code):
113 | if url in self.pending:
114 | self.pending.pop(url)
115 |
116 | if status_code == 200:
117 | self.db.set_success(url)
118 | return
119 | if status_code == 404:
120 | self.db.set_failure(url)
121 | return
122 | if url in self.failure:
123 | self.failure[url] += 1
124 | if self.failure[url] > self.failure_threshold:
125 | self.db.set_failure(url)
126 | self.failure.pop(url)
127 | else:
128 | self.add(url)
129 | else:
130 | self.failure[url] = 1
131 | self.add(url)
132 |
133 | def push_to_pool(self, url):
134 | host = urlparse.urlparse(url).netloc
135 | if not host or '.' not in host:
136 |             print('try to push_to_pool with bad url:', url, ', len of url:', len(url))
137 | return False
138 | if host in self.waiting:
139 | if url in self.waiting[host]:
140 | return True
141 | self.waiting[host].add(url)
142 | if len(self.waiting[host]) > self.max_hosts[1]:
143 | self.max_hosts[1] = len(self.waiting[host])
144 | self.max_hosts[0] = host
145 | else:
146 | self.waiting[host] = set([url])
147 | self.waiting_count += 1
148 | return True
149 |
150 | def add(self, url, always=False):
151 | if always:
152 | return self.push_to_pool(url)
153 | pended_time = self.pending.get(url, 0)
154 | if time.time() - pended_time < self.pending_threshold:
155 | print('being downloading:', url)
156 | return
157 | if self.db.has(url):
158 | return
159 | if pended_time:
160 | self.pending.pop(url)
161 | return self.push_to_pool(url)
162 |
163 | def addmany(self, urls, always=False):
164 | if isinstance(urls, str):
165 | print('urls is a str !!!!', urls)
166 | self.add(urls, always)
167 | else:
168 | for url in urls:
169 | self.add(url, always)
170 |
171 | def pop(self, count, hub_percent=50):
172 | print('\n\tmax of host:', self.max_hosts)
173 |
174 |         # popped URLs come in two types: hub = 1, normal = 0
175 |         url_attr_url = 0
176 |         url_attr_hub = 1
177 |         # 1. pop hub URLs first, so the newest links on hub pages are fetched
178 | hubs = {}
179 | hub_count = count * hub_percent // 100
180 | for hub in self.hub_pool:
181 | span = time.time() - self.hub_pool[hub]
182 | if span < self.hub_refresh_span:
183 | continue
184 | hubs[hub] = url_attr_hub # 1 means hub-url
185 | self.hub_pool[hub] = time.time()
186 | if len(hubs) >= hub_count:
187 | break
188 |
189 |         # 2. then pop normal URLs
190 | left_count = count - len(hubs)
191 | urls = {}
192 | for host in self.waiting:
193 | if not self.waiting[host]:
194 | continue
195 | url = self.waiting[host].pop()
196 | urls[url] = url_attr_url
197 | self.pending[url] = time.time()
198 | if self.max_hosts[0] == host:
199 | self.max_hosts[1] -= 1
200 | if len(urls) >= left_count:
201 | break
202 | self.waiting_count -= len(urls)
203 | print('To pop:%s, hubs: %s, urls: %s, hosts:%s' % (count, len(hubs), len(urls), len(self.waiting)))
204 | urls.update(hubs)
205 | return urls
206 |
207 | def size(self,):
208 | return self.waiting_count
209 |
210 | def empty(self,):
211 | return self.waiting_count == 0
212 |
213 |
214 | def test():
215 | pool = UrlPool('crawl_urlpool')
216 | urls = [
217 | 'http://1.a.cn/xyz',
218 | 'http://2.a.cn/xyz',
219 | 'http://3.a.cn/xyz',
220 | 'http://1.b.cn/xyz-1',
221 | 'http://1.b.cn/xyz-2',
222 | 'http://1.b.cn/xyz-3',
223 | 'http://1.b.cn/xyz-4',
224 | ]
225 | pool.addmany(urls)
226 | del pool
227 |
228 | pool = UrlPool('crawl_urlpool')
229 | urls = pool.pop(5)
230 | urls = list(urls.keys())
231 | print('pop:', urls)
232 | print('pending:', pool.pending)
233 |
234 | pool.set_status(urls[0], 200)
235 | print('pending:', pool.pending)
236 | pool.set_status(urls[1], 404)
237 | print('pending:', pool.pending)
238 |
239 |
240 | if __name__ == '__main__':
241 | test()
242 |
--------------------------------------------------------------------------------
/news-crawler/maincontent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #File: maincontent.py
3 | #Author: veelion
4 |
5 | import re
6 | import time
7 | import traceback
8 |
9 | import cchardet
10 | import lxml
11 | import lxml.html
12 | from lxml.html import HtmlComment
13 |
14 | REGEXES = {
15 | 'positiveRe': re.compile(
16 | ('article|arti|body|content|entry|hentry|main|page|'
17 | 'artical|zoom|arti|context|message|editor|'
18 | 'pagination|post|txt|text|blog|story'), re.I),
19 | 'negativeRe': re.compile(
20 | ('copyright|combx|comment|com-|contact|foot|footer|footnote|decl|copy|'
21 | 'notice|'
22 | 'masthead|media|meta|outbrain|promo|related|scroll|link|pagebottom|bottom|'
23 | 'other|shoutbox|sidebar|sponsor|shopping|tags|tool|widget'), re.I),
24 | }
25 |
26 |
27 | class MainContent:
28 | def __init__(self,):
29 | self.non_content_tag = set([
30 | 'head',
31 | 'meta',
32 | 'script',
33 | 'style',
34 | 'object', 'embed',
35 | 'iframe',
36 | 'marquee',
37 | 'select',
38 | ])
39 | self.title = ''
40 | self.p_space = re.compile(r'\s')
41 | self.p_content_stop = re.compile(r'正文.*结束|正文下|相关阅读|声明')
42 | self.p_clean_tree = re.compile(r'author|post-add|copyright')
43 |
44 | def get_title(self, doc):
45 | title = ''
46 | title_el = doc.xpath('//title')
47 | if title_el:
48 | title = title_el[0].text_content().strip()
49 | if len(title) < 7:
50 | tt = doc.xpath('//meta[@name="title"]')
51 | if tt:
52 | title = tt[0].get('content', '')
53 | if len(title) < 7:
54 | tt = doc.xpath('//*[contains(@id, "title") or contains(@class, "title")]')
55 | if not tt:
56 | tt = doc.xpath('//*[contains(@id, "font01") or contains(@class, "font01")]')
57 | for t in tt:
58 | ti = t.text_content().strip()
59 | if ti in title and len(ti)*2 > len(title):
60 | title = ti
61 | break
62 | if len(ti) > 20: continue
63 | if len(ti) > len(title) or len(ti) > 7:
64 | title = ti
65 | return title
66 |
67 | def clean_title(self, title):
68 |         splitters = [' - ', '–', '—', '-', '|', '::']
69 |         for s in splitters:
70 | if s not in title:
71 | continue
72 | tts = title.split(s)
73 | if len(tts) < 2:
74 | continue
75 | title = tts[0]
76 | break
77 | return title
78 |
79 | def calc_node_weight(self, node):
80 | weight = 1
81 | attr = '%s %s %s' % (
82 | node.get('class', ''),
83 | node.get('id', ''),
84 | node.get('style', '')
85 | )
86 | if attr:
87 | mm = REGEXES['negativeRe'].findall(attr)
88 | weight -= 2 * len(mm)
89 | mm = REGEXES['positiveRe'].findall(attr)
90 | weight += 4 * len(mm)
91 | if node.tag in ['div', 'p', 'table']:
92 | weight += 2
93 | return weight
94 |
95 | def get_main_block(self, url, html, clean_title=True):
96 | ''' return (title, etree_of_main_content_block)
97 | '''
98 | if isinstance(html, bytes):
99 | encoding = cchardet.detect(html)['encoding']
100 | if encoding is None:
101 | return None, None
102 | html = html.decode(encoding, 'ignore')
103 | try:
104 | doc = lxml.html.fromstring(html)
105 | doc.make_links_absolute(base_url=url)
106 |         except:
107 | traceback.print_exc()
108 | return None, None
109 | self.title = self.get_title(doc)
110 | if clean_title:
111 | self.title = self.clean_title(self.title)
112 | body = doc.xpath('//body')
113 | if not body:
114 | return self.title, None
115 | candidates = []
116 | nodes = body[0].getchildren()
117 | while nodes:
118 | node = nodes.pop(0)
119 | children = node.getchildren()
120 | tlen = 0
121 | for child in children:
122 | if isinstance(child, HtmlComment):
123 | continue
124 | if child.tag in self.non_content_tag:
125 | continue
126 | if child.tag == 'a':
127 | continue
128 | if child.tag == 'textarea':
129 | # FIXME: this tag is only part of content?
130 | continue
131 | attr = '%s%s%s' % (child.get('class', ''),
132 | child.get('id', ''),
133 |                                    child.get('style', ''))
134 | if 'display' in attr and 'none' in attr:
135 | continue
136 | nodes.append(child)
137 | if child.tag == 'p':
138 | weight = 3
139 | else:
140 | weight = 1
141 | text = '' if not child.text else child.text.strip()
142 | tail = '' if not child.tail else child.tail.strip()
143 | tlen += (len(text) + len(tail)) * weight
144 | if tlen < 10:
145 | continue
146 | weight = self.calc_node_weight(node)
147 | candidates.append((node, tlen*weight))
148 | if not candidates:
149 | return self.title, None
150 | candidates.sort(key=lambda a: a[1], reverse=True)
151 | good = candidates[0][0]
152 | if good.tag in ['p', 'pre', 'code', 'blockquote']:
153 | for i in range(5):
154 | good = good.getparent()
155 | if good.tag == 'div':
156 | break
157 | good = self.clean_node(good, url)
158 | return self.title, good
159 |
160 | def clean_node(self, tree, url=''):
161 | to_drop = []
162 | drop_left = False
163 | for node in tree.iterdescendants():
164 | if drop_left:
165 | to_drop.append(node)
166 | continue
167 | if isinstance(node, HtmlComment):
168 | to_drop.append(node)
169 | if self.p_content_stop.search(node.text):
170 | drop_left = True
171 | continue
172 | if node.tag in self.non_content_tag:
173 | to_drop.append(node)
174 | continue
175 | attr = '%s %s' % (
176 | node.get('class', ''),
177 | node.get('id', '')
178 | )
179 | if self.p_clean_tree.search(attr):
180 | to_drop.append(node)
181 | continue
182 | aa = node.xpath('.//a')
183 | if aa:
184 | text_node = len(self.p_space.sub('', node.text_content()))
185 | text_aa = 0
186 | for a in aa:
187 | alen = len(self.p_space.sub('', a.text_content()))
188 | if alen > 5:
189 | text_aa += alen
190 | if text_aa > text_node * 0.4:
191 | to_drop.append(node)
192 | for node in to_drop:
193 | try:
194 | node.drop_tree()
195 | except:
196 | pass
197 | return tree
198 |
199 | def get_text(self, doc):
200 | lxml.etree.strip_elements(doc, 'script')
201 | lxml.etree.strip_elements(doc, 'style')
202 | for ch in doc.iterdescendants():
203 | if not isinstance(ch.tag, str):
204 | continue
205 | if ch.tag in ['div', 'h1', 'h2', 'h3', 'p', 'br', 'table', 'tr', 'dl']:
206 | if not ch.tail:
207 | ch.tail = '\n'
208 | else:
209 | ch.tail = '\n' + ch.tail.strip() + '\n'
210 | if ch.tag in ['th', 'td']:
211 | if not ch.text:
212 | ch.text = ' '
213 | else:
214 | ch.text += ' '
215 | # if ch.tail:
216 | # ch.tail = ch.tail.strip()
217 | lines = doc.text_content().split('\n')
218 | content = []
219 | for l in lines:
220 | l = l.strip()
221 | if not l:
222 | continue
223 | content.append(l)
224 | return '\n'.join(content)
225 |
226 | def extract(self, url, html):
227 | '''return (title, content)
228 | '''
229 | title, node = self.get_main_block(url, html)
230 | if node is None:
231 | print('\tno main block got !!!!!', url)
232 |             return title, ''
233 | content = self.get_text(node)
234 | return title, content
235 |
236 |
237 | if __name__ == '__main__':
238 | from sys import argv
239 | f = argv[1]
240 | html = open(f, 'rb').read()
241 | encoding = cchardet.detect(html)
242 | print('encoding:', encoding)
243 | encoding = encoding['encoding']
244 | html = html.decode(encoding, 'ignore')
245 | mc = MainContent()
246 | b = time.time()
247 | t, c = mc.extract('', html)
248 | e = time.time()
249 | print('title:', t)
250 | print('content:', len(c))
251 | print('time cost: ', e-b)
252 | title, content = t, c
253 | txt = 'title:%s\ncontent:\n%s\n\n' % (
254 | title,
255 | content,
256 | )
257 | open(f+'-content2.txt','w').write(txt)
258 |
--------------------------------------------------------------------------------