├── .github
│   └── workflows
│       └── pythonapp.yml
├── .gitignore
├── README.md
├── requirements.txt
└── src
    ├── httphand.py
    ├── log.txt
    ├── main.py
    ├── sqlhand.py
    ├── todayb.db
    └── toolhand.py

/.github/workflows/pythonapp.yml:
--------------------------------------------------------------------------------

# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v1
      with:
        python-version: 3.8
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    - name: Lint with flake8
      run: |
        pip install flake8
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pip install pytest
        pytest

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

*.py[cod]

# C extensions
*.so

*.db
# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
*.pyc

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

test

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------

beautifulsoup4

--------------------------------------------------------------------------------
/src/httphand.py:
--------------------------------------------------------------------------------

# coding: utf8
import http.client
import urllib.request
from urllib.parse import urlparse


class httphand:
    def geturl(self, url):
        """Fetch url, following redirects manually and recording each hop."""
        resp = self.run(url)
        jumpnumber = 0
        jumpinfo = []
        if not hasattr(resp, 'status'):
            # the low-level request failed; fall back to urllib
            result = self.runurl(url)
        else:
            try:
                seen = []
                while 300 <= resp.status < 400:
                    loc = resp.getheader('location')
                    if loc in seen:  # avoid redirect loops
                        break
                    seen.append(loc)
                    jumpinfo.append({'status': resp.status, 'jumpurl': loc})
                    jumpnumber += 1
                    resp = self.run(loc)
                data = resp.read()
                response = resp.status
                result = {'data': data, 'jumpinfo': jumpinfo,
                          'jumpnumber': jumpnumber, 'response': response}
            except Exception:
                result = self.runurl(url)
        return result

    def runurl(self, url):
        """Fallback fetch that lets urllib follow redirects on its own."""
        try:
            aaa = urllib.request.urlopen(url, timeout=7)
            data = aaa.read()
            jumpnumber = 1
            jumpinfo = [{'status': '', 'jumpurl': aaa.geturl()}]
            response = aaa.getcode()
            return {'data': data, 'jumpinfo': jumpinfo,
                    'jumpnumber': jumpnumber, 'response': response}
        except Exception:
            return None

    def run(self, url):
        """Issue a single GET without following redirects; return the raw response."""
        resp = None
        try:
            parts = urlparse(url)
            host = parts.netloc
            req = parts.path or '/'
            if parts.query:
                req += '?' + parts.query
            if parts.scheme == 'https':
                conn = http.client.HTTPSConnection(host, timeout=10)
            else:
                conn = http.client.HTTPConnection(host, timeout=10)
            # headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'}
            headers = {}
            conn.request('GET', req, headers=headers)
            resp = conn.getresponse()
        except Exception:
            pass
        return resp

--------------------------------------------------------------------------------
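A minimal usage sketch for the handler above (not part of the repository; the target URL is an assumption and the call requires network access):

    # sketch: fetch a page and inspect the recorded redirect chain
    from httphand import httphand

    h = httphand()
    result = h.geturl('http://www.hao123.com')  # hypothetical target URL
    if result is not None:
        print(result['response'], result['jumpnumber'])
        for hop in result['jumpinfo']:
            print(hop['status'], '->', hop['jumpurl'])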
/src/log.txt:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/simapple/spider/9e3a2e8d25d9c10fd7a4d6d592f8a31e18eb7e09/src/log.txt

--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------

# coding: utf8
import threading
from queue import Queue
from toolhand import *


class multigeturl(threading.Thread):
    """Worker thread: drains the URL queue and crawls in batches."""

    def __init__(self, queue, threadname, logger, lock):
        threading.Thread.__init__(self, name=threadname)
        self.queue = queue
        self.lock = lock
        self.logger = logger
        self.tmp = []

    def run(self):
        while True:
            info = self.queue.get()
            self.tmp.append(info)
            # only one worker at a time opens the database; the others
            # keep buffering queue items until the lock is free
            if not self.lock.locked():
                self.lock.acquire()
                db = dbhand(self.logger)
                db.dbconnect("todayb.db")
                db.initdatabase()  # make sure the urls table exists
                for tt in self.tmp:
                    allstar(tt, db)
                self.tmp = []
                self.lock.release()
            self.queue.task_done()


def main():
    start = {}
    start['url'] = "https://www.hao123.com"
    logop = {}
    logop['logfile'] = "log.txt"
    logop['loglevel'] = "INFO"
    global logger
    logger = getlog(logop)  # build the logger
    global db
    db = dbhand(logger)
    db.dbconnect("todayb.db")
    db.initdatabase()  # initialize the urls table
    db.insertone(start, 'urls')
    queue = Queue()
    threadnumber = 3
    lock = threading.Lock()
    for i in range(threadnumber):  # spin up the worker pool
        t1 = multigeturl(queue, 'urlt_' + str(i), logger, lock)
        t1.daemon = True
        t1.start()
    allurl = db.selecturls2()
    if len(allurl) < 2:
        allstar(start, db)
        queue.put(start)

    # keep feeding the workers until no uncrawled URL is left
    while len(allurl) > 0:
        t = allurl.pop()
        queue.put(t)
        allurl = db.selecturls2()
    queue.join()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
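The wiring in main.py is the standard queue.Queue producer-consumer pattern with daemon workers; a stripped-down sketch (names assumed, not from the repository):

    # sketch: the daemon-worker pattern main.py relies on
    import threading
    from queue import Queue

    def worker(q):
        while True:
            item = q.get()
            print("processing", item)  # stand-in for allstar(item, db)
            q.task_done()              # exactly one task_done per get()

    q = Queue()
    for i in range(3):
        t = threading.Thread(target=worker, args=(q,), daemon=True)
        t.start()

    q.put({'url': 'https://example.com'})
    q.join()  # returns once every queued item is marked done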
/src/sqlhand.py:
--------------------------------------------------------------------------------

# coding: utf8
import sqlite3


class dbhand:
    def __init__(self, logger):
        self.logger = logger

    def dbconnect(self, info):
        self.dbcon = None
        try:
            self.dbcon = sqlite3.connect(info, isolation_level="DEFERRED",
                                         check_same_thread=False)
            self.con = self.dbcon.cursor()
            self.logger.info("database connected")
        except Exception:
            self.logger.info("database connection failed")
            print("database connection failed")
            quit()

    def initdatabase(self):
        """
        Initialize the database.
        table urls:
          url        page address           title    page title
          inurl      internal-link count    outurl   external-link count
          jumpnumber number of redirects    deep     crawl pass that found the link
          response   HTTP status code       jumptype redirect type
        """
        try:
            sql1 = """create table if not exists urls
                (url text primary key, title text,
                 inurl integer default 0, outurl integer default 0,
                 jumpnumber integer default 0, jumptype text default null,
                 deep integer default 0, response integer default 0,
                 jumpinfo text default null)"""
            self.con.execute(sql1)
            self.logger.info("urls table initialized")
        except Exception:
            self.logger.error("failed to create the urls table")

    def insertone(self, info, table):
        # build a parameterized insert so values never need hand-quoting
        names = ','.join('`%s`' % k for k in info)
        marks = ','.join('?' for _ in info)
        sql = "insert or ignore into `%s` (%s) values (%s)" % (table, names, marks)
        try:
            self.con.execute(sql, tuple(str(v) for v in info.values()))
            self.dbcon.commit()
        except sqlite3.IntegrityError:
            pass
        except Exception:
            pass

    def commitall(self):
        try:
            self.dbcon.commit()
        except sqlite3.IntegrityError:
            pass
        except Exception:
            pass

    def updateone(self, info, table, where=''):
        try:
            self.con.execute("""update `urls` set title=?, inurl=?, jumpnumber=?,
                jumpinfo=?, outurl=?, response=? where url = ?""",
                (info['title'], info['inurl'], info['jumpnumber'],
                 info['jumpinfo'], info['outurl'], info['response'], info['url']))
            self.dbcon.commit()
        except TypeError:
            pass

    def geturls(self, info, workid):
        """
        info['did']  crawl depth
        info['url']  link address
        """
        # 'deep' is the depth column defined in initdatabase(); the table
        # has no worker-id column, so workid is not stored
        sql = "insert into urls (url, deep) values (?, ?)"
        try:
            self.con.execute(sql, (info['url'], info['did']))
            self.dbcon.commit()
        except sqlite3.IntegrityError:
            pass
        except Exception:
            self.logger.error("depth %s url %s could not be stored"
                              % (info['did'], info['url']))

    def selecturls2(self):
        # rows whose response is still 0 have not been crawled yet
        allurl = []
        sql = "select url from urls where response = 0"
        self.con.execute(sql)
        for i in self.con.fetchall():
            allurl.append({'url': i[0]})
        return allurl

    def getone(self, table, var='*', where=''):
        sql = "select " + var + " from " + table + " " + where
        self.con.execute(sql)
        return self.con.fetchone()

--------------------------------------------------------------------------------
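A short usage sketch for dbhand (the database file name here is an assumption, not part of the repository):

    # sketch: exercising dbhand against a throwaway database
    import logging
    from sqlhand import dbhand

    logging.basicConfig(level=logging.INFO)
    db = dbhand(logging.getLogger())
    db.dbconnect("test.db")  # hypothetical database file
    db.initdatabase()
    db.insertone({'url': 'https://example.com'}, 'urls')
    print(db.selecturls2())  # -> [{'url': 'https://example.com'}]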
78 | """,(info['title'],info['inurl'],info['jumpnumber'],info['jumpinfo'],info['outurl'],info['response'],info['url'])) 79 | self.dbcon.commit() 80 | except TypeError:#sqlite3.IntegrityError: 81 | pass 82 | 83 | 84 | pass 85 | def geturls(self,info,workid): 86 | """ 87 | info['did'] 深度 88 | info['url'] 连接 89 | 90 | """ 91 | sql = "insert into urls (url,did,wid) values ('%s','%s','%s')"%(info['url'],info['did'],workid) 92 | try: 93 | self.con.execute(sql) 94 | self.dbcon.commit() 95 | except sqlite3.IntegrityError: 96 | pass 97 | except: 98 | self.logger.error(str(info['did'])+"深度"+str(info['url'])+" 无法正常入库") 99 | 100 | 101 | def selecturls2(self): 102 | info = {} 103 | allurl = [] 104 | sql = "select url from urls where response = '0'" 105 | self.con.execute(sql) 106 | for i in self.con.fetchall(): 107 | info['url'] = i[0] 108 | allurl.append(info) 109 | info = {} 110 | return allurl 111 | 112 | def getone(self,table,var = '*', where = ''): 113 | sql = "select "+var+" from "+table+where 114 | self.con.execute(sql) 115 | result = self.con.fetchone() 116 | return result[1] 117 | -------------------------------------------------------------------------------- /src/todayb.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simapple/spider/9e3a2e8d25d9c10fd7a4d6d592f8a31e18eb7e09/src/todayb.db -------------------------------------------------------------------------------- /src/toolhand.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | #使用协程作业 3 | import urllib 4 | from bs4 import BeautifulSoup 5 | import re 6 | from sqlhand import dbhand 7 | import logging 8 | from collections import deque 9 | import time 10 | from urllib.parse import urlparse 11 | from httphand import httphand 12 | from queue import Queue 13 | import pickle 14 | 15 | def getlog(info): 16 | logger = logging.getLogger() 17 | hdlr = logging.FileHandler(info['logfile']) 18 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 19 | hdlr.setFormatter(formatter) 20 | logger.addHandler(hdlr) 21 | try: 22 | logger.setLevel(info['loglevel']) 23 | except: 24 | print("你输入的日志等级不正确") 25 | return logger 26 | 27 | def getaurl(info):#采集页面上所有的url,以及此次访问的状态码 28 | tmp = {} 29 | urls = {} 30 | try: 31 | xsdanx = httphand() 32 | result = xsdanx.geturl(info) 33 | if result is None: 34 | tmp['url'] = info 35 | tmp['response'] = 404#状态码 36 | tmp['title'] = ''#标题 37 | tmp['inurl'] = 0 38 | tmp['outurl'] = 0 39 | tmp['jumpnumber'] = 0 40 | tmp['jumpinfo'] = '' 41 | pass 42 | else: 43 | sop = BeautifulSoup(result['data']) 44 | urls = sop.findAll("a",{'href':True}) 45 | try: 46 | title = sop.head.title.string#只要一个title 47 | except AttributeError: 48 | title = "null" 49 | tmp['response'] = result['response']#状态码 50 | tmp['title'] = title#标题 51 | tmp['url'] = info 52 | urltype = checkurl2(info,urls) 53 | tmp['inurl'] = urltype['inurl'] 54 | tmp['outurl'] = urltype['outurl'] 55 | tmp['jumpnumber'] = result['jumpnumber'] 56 | tmp['jumpinfo'] = mkjumpinfo(result['jumpinfo']) 57 | return (tmp,urls) 58 | except (urllib.URLError,TypeError,UnicodeEncodeError): 59 | return (tmp,urls) 60 | def mkjumpinfo(info): 61 | tmp = [] 62 | for i in info: 63 | tmp.append(re.escape(pickle.dumps(i))) 64 | return "|".join(tmp) 65 | def checkurl2(url,text):#检查url 区分内链 外链 66 | inurl = 0 67 | outurl = 0 68 | valid = re.compile("^"+url+".*$") 69 | tmp = {} 70 | tmp['inurl'] = 0 71 | tmp['outurl'] = 0 72 | for i in text: 73 | try: 74 | if 
--------------------------------------------------------------------------------
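A quick end-to-end sketch tying the pieces together without the thread pool (assumptions: run from src/, network access available; not part of the repository):

    # sketch: one crawl pass over the seed URL
    import logging
    from sqlhand import dbhand
    from toolhand import allstar

    logging.basicConfig(level=logging.INFO)
    db = dbhand(logging.getLogger())
    db.dbconnect("todayb.db")
    db.initdatabase()
    allstar({'url': 'https://www.hao123.com'}, db)  # crawl the seed page once
    for row in db.selecturls2():                    # URLs still awaiting a crawl
        print(row['url'])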