├── .github └── workflows │ └── pythonapp.yml ├── .gitignore ├── README.md ├── requirements.txt └── src ├── httphand.py ├── log.txt ├── main.py ├── sqlhand.py ├── todayb.db └── toolhand.py /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.8 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: 3.8 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt 27 | - name: Lint with flake8 28 | run: | 29 | pip install flake8 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Test with pytest 35 | run: | 36 | pip install pytest 37 | pytest 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | *.db 7 | # Packages 8 | *.egg 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | *.pyc 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | -------------------------------------------------------------------------------- /src/httphand.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import http 3 | from urllib.parse import urlparse 4 | import urllib 5 | class httphand(): 6 | def geturl(self,url): 7 | resp = self.run(url) 8 | jumpnumber = 0 9 | jumpinfo = [] 10 | if not hasattr(resp,'status'): 11 | result = self.runurl(url) 12 | pass 13 | else: 14 | try: 15 | tt = [] 16 | while resp.status != 200 : 17 | tt.append(resp.getheader('location')) 18 | if resp.getheader('location') in tt:#防止进入陷阱 19 | break 20 | tmp = {} 21 | tmp['status'] = resp.status 22 | tmp['jumpurl'] = resp.getheader('location') 23 | jumpinfo.append(tmp) 24 | tmp = {} 25 | jumpnumber = jumpnumber + 1 26 | resp = self.run(resp.getheader('location')) 27 | data = resp.read() 28 | response = resp.status 29 | result = {'data':data,'jumpinfo':jumpinfo,'jumpnumber':jumpnumber,'response':response} 30 | except: 31 | result = self.runurl(url) 32 | return result 33 | def runurl(self,url): 34 | try: 35 | aaa = urllib2.urlopen(url,timeout=7) 36 | data = aaa.read() 37 | jumpnumber = 1 38 | jumpinfo = [{'status':'','jumpurl':aaa.geturl()}] 39 | response = aaa.getcode() 40 | return {'data':data,'jumpinfo':jumpinfo,'jumpnumber':jumpnumber,'response':response} 41 | except: 42 | return None 43 | pass 44 | def run(self,url): 45 | resp = None 46 | try: 47 | host = urlparse(url)[1] 48 | req = '/'#.join(urlparse(url)[2:5]) 49 | conn = http.client.HTTPConnection(host,timeout = 10) 50 | #headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'} 51 | headers = {} 52 | conn.request('GET', req,headers = headers) 53 | resp = conn.getresponse() 54 | except: 55 | pass 56 | return resp 57 | -------------------------------------------------------------------------------- /src/log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simapple/spider/9e3a2e8d25d9c10fd7a4d6d592f8a31e18eb7e09/src/log.txt -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import threading 3 | from queue import Queue 4 | from collections import deque 5 | from toolhand import * 6 | class multigeturl(threading.Thread): 7 | '''线程''' 8 | global temp 9 | def __init__(self,queue,threadname,logger,lock): 10 | threading.Thread.__init__(self,name=threadname) 11 | self.queue = queue 12 | self.lock = lock 13 | self.logger = logger 14 | self.tmp = [] 15 | def run(self): 16 | while True: 17 | info = self.queue.get() 18 | if self.lock.locked(): 19 | self.tmp.append(info) 20 | pass 21 | else: 22 | self.lock.acquire() 23 | db = dbhand(self.logger) 24 | dbfile = "todayb.db" 25 | db.dbconnect(dbfile) 26 | db.initdatabase()#数据表初始化 27 | for tt in self.tmp: 28 | allstar(tt,db) 29 | self.tmp = [] 30 | self.lock.release() 31 | self.queue.task_done() 32 | self.queue.task_done() 33 | def main(): 34 | start = {} 35 | start['url'] = "https://www.hao123.com" 36 | logop = {} 37 | logop['logfile'] = "log.txt"#uop['logfile'] 38 | logop['loglevel'] = "INFO"#uop['level'] 39 | global logger 40 | logger = getlog(logop)#构造log对象 41 | global db 42 | db = dbhand(logger) 43 | dbfile = "todayb.db" 44 | db.dbconnect(dbfile) 45 | db.initdatabase()#数据表初始化 46 | db.selecturls2() 47 | db.insertone(start,'urls') 48 | queue = Queue() 49 | threadnumber = 3 50 | lock = threading.Lock() 51 | for i in range(threadnumber):#初始化线程池 52 | t1 = multigeturl(queue,'urlt_'+str(i),logger,lock) 53 | t1.setDaemon(True) 54 | t1.start() 55 | allurl = db.selecturls2() 56 | if len(allurl) < 2: 57 | allstar(start,db) 58 | queue.put(start['url']) 59 | 60 | while len(allurl) > 0: 61 | 62 | t = allurl.pop() 63 | queue.put(t) 64 | allurl = db.selecturls2() 65 | queue.join() 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /src/sqlhand.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import sqlite3 3 | import re 4 | class dbhand: 5 | def __init__(self,logger): 6 | self.logger = logger 7 | def dbconnect(self,info): 8 | self.dbcon = None 9 | try: 10 | self.dbcon = sqlite3.connect(info,isolation_level="DEFERRED",check_same_thread=False) 11 | self.con = self.dbcon.cursor() 12 | self.logger.info("数据库已连接") 13 | except: 14 | self.logger.info("数据库连接失败") 15 | print("数据库连接失败") 16 | quit() 17 | def initdatabase(self): 18 | """ 19 | 初始化数据库 20 | table1: 21 | url 网址 title 标题内容 inurl 内链数量 outurl 外链数量 jumpnumber 跳转次数 deep 层级定义第几次任务爬取到的链接 response 返回状态码 22 | jumptype 跳转类型 23 | """ 24 | try: 25 | sql1 = """create table if not exists urls 26 | (url text primary key,title text,inurl integer default 0,outurl integer default 0,jumpnumber integer default 0,jumptype text default null ,deep integer default 0,response integer default 0,jumpinfo text default null) 27 | 28 | """ 29 | self.con.execute(sql1) 30 | self.logger.info("数据表初始化成功") 31 | except: 32 | self.logger.error("数据表创建失败") 33 | pass 34 | 35 | def insertone(self,info,table): 36 | sql1 = "insert or ignore into `%s` "%table 37 | name = [] 38 | value = [] 39 | try: 40 | for k,v in info.items(): 41 | name.append("`%s`"%(str(k))) 42 | value.append("'%s'"%(str(v))) 43 | except UnicodeEncodeError: 44 | pass 45 | namestr = ','.join(name) 46 | valuestr = ','.join(value) 47 | sql2 = "(%s)values(%s)"%(namestr, valuestr) 48 | sql = sql1+sql2 49 | print("#######"+sql) 50 | try: 51 | self.con.execute(sql) 52 | self.dbcon.commit() 53 | except sqlite3.IntegrityError: 54 | pass 55 | except: 56 | pass 57 | 58 | pass 59 | def commitall(self): 60 | try: 61 | self.dbcon.commit() 62 | except sqlite3.IntegrityError: 63 | pass 64 | except: 65 | pass 66 | 67 | def updateone(self,info,table,where = ''): 68 | #sql1 = "update `%s` "%table 69 | # part = [] 70 | # for k,v in info.iteritems(): 71 | # part.append("`%s` = '%s'"%(k,str(v))) 72 | # partstr = ",".join(part) 73 | # sql = sql1+" set "+partstr + " where url = '%s' "%((str(info['url']))) 74 | print("*******************************************") 75 | try: 76 | self.con.execute("""update `urls` set title=?,inurl=?,jumpnumber=?, 77 | jumpinfo=?,outurl=?,response=? where url = ? 78 | """,(info['title'],info['inurl'],info['jumpnumber'],info['jumpinfo'],info['outurl'],info['response'],info['url'])) 79 | self.dbcon.commit() 80 | except TypeError:#sqlite3.IntegrityError: 81 | pass 82 | 83 | 84 | pass 85 | def geturls(self,info,workid): 86 | """ 87 | info['did'] 深度 88 | info['url'] 连接 89 | 90 | """ 91 | sql = "insert into urls (url,did,wid) values ('%s','%s','%s')"%(info['url'],info['did'],workid) 92 | try: 93 | self.con.execute(sql) 94 | self.dbcon.commit() 95 | except sqlite3.IntegrityError: 96 | pass 97 | except: 98 | self.logger.error(str(info['did'])+"深度"+str(info['url'])+" 无法正常入库") 99 | 100 | 101 | def selecturls2(self): 102 | info = {} 103 | allurl = [] 104 | sql = "select url from urls where response = '0'" 105 | self.con.execute(sql) 106 | for i in self.con.fetchall(): 107 | info['url'] = i[0] 108 | allurl.append(info) 109 | info = {} 110 | return allurl 111 | 112 | def getone(self,table,var = '*', where = ''): 113 | sql = "select "+var+" from "+table+where 114 | self.con.execute(sql) 115 | result = self.con.fetchone() 116 | return result[1] 117 | -------------------------------------------------------------------------------- /src/todayb.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simapple/spider/9e3a2e8d25d9c10fd7a4d6d592f8a31e18eb7e09/src/todayb.db -------------------------------------------------------------------------------- /src/toolhand.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | #使用协程作业 3 | import urllib 4 | from bs4 import BeautifulSoup 5 | import re 6 | from sqlhand import dbhand 7 | import logging 8 | from collections import deque 9 | import time 10 | from urllib.parse import urlparse 11 | from httphand import httphand 12 | from queue import Queue 13 | import pickle 14 | 15 | def getlog(info): 16 | logger = logging.getLogger() 17 | hdlr = logging.FileHandler(info['logfile']) 18 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 19 | hdlr.setFormatter(formatter) 20 | logger.addHandler(hdlr) 21 | try: 22 | logger.setLevel(info['loglevel']) 23 | except: 24 | print("你输入的日志等级不正确") 25 | return logger 26 | 27 | def getaurl(info):#采集页面上所有的url,以及此次访问的状态码 28 | tmp = {} 29 | urls = {} 30 | try: 31 | xsdanx = httphand() 32 | result = xsdanx.geturl(info) 33 | if result is None: 34 | tmp['url'] = info 35 | tmp['response'] = 404#状态码 36 | tmp['title'] = ''#标题 37 | tmp['inurl'] = 0 38 | tmp['outurl'] = 0 39 | tmp['jumpnumber'] = 0 40 | tmp['jumpinfo'] = '' 41 | pass 42 | else: 43 | sop = BeautifulSoup(result['data']) 44 | urls = sop.findAll("a",{'href':True}) 45 | try: 46 | title = sop.head.title.string#只要一个title 47 | except AttributeError: 48 | title = "null" 49 | tmp['response'] = result['response']#状态码 50 | tmp['title'] = title#标题 51 | tmp['url'] = info 52 | urltype = checkurl2(info,urls) 53 | tmp['inurl'] = urltype['inurl'] 54 | tmp['outurl'] = urltype['outurl'] 55 | tmp['jumpnumber'] = result['jumpnumber'] 56 | tmp['jumpinfo'] = mkjumpinfo(result['jumpinfo']) 57 | return (tmp,urls) 58 | except (urllib.URLError,TypeError,UnicodeEncodeError): 59 | return (tmp,urls) 60 | def mkjumpinfo(info): 61 | tmp = [] 62 | for i in info: 63 | tmp.append(re.escape(pickle.dumps(i))) 64 | return "|".join(tmp) 65 | def checkurl2(url,text):#检查url 区分内链 外链 66 | inurl = 0 67 | outurl = 0 68 | valid = re.compile("^"+url+".*$") 69 | tmp = {} 70 | tmp['inurl'] = 0 71 | tmp['outurl'] = 0 72 | for i in text: 73 | try: 74 | if valid.match(i['href']) is None: 75 | tmp['outurl'] += 1 76 | else: 77 | tmp['inurl'] += 1 78 | except: 79 | pass 80 | return tmp 81 | def allstar(info,db): 82 | (thetest,urls) = getaurl(info['url']) 83 | if len(thetest) > 0 : 84 | db.updateone(thetest,'urls') 85 | if len(urls) > 0 : 86 | urlsget(urls,db) 87 | 88 | def checkurl(text):#检查url 89 | valid = re.compile("^[http|https].*$") 90 | 91 | if valid.match(text) is None: 92 | return False 93 | else: 94 | return True 95 | 96 | def urlsget(urls,db): 97 | tmp = {} 98 | tmp2 = '' 99 | ttmp = set([]) 100 | for i in urls:#将爬去到的url入库 101 | try: 102 | tmp2 = urlparse(i['href']) 103 | tmp['url'] = tmp2.scheme+"://"+tmp2.netloc 104 | if checkurl(tmp['url']) : 105 | if tmp['url'] not in ttmp: 106 | db.insertone(tmp,'urls') 107 | ttmp.add(tmp['url']) 108 | except (TypeError,KeyError,ValueError): 109 | pass 110 | 111 | --------------------------------------------------------------------------------