├── title.gif
├── README.md
├── spiderframe.py
├── spider.py
└── Queue.py

/title.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pangge/python-crawler-ccw/HEAD/title.gif
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
The program is written in Python 3 and uses multi-threaded crawling to search a given URL and locate the DOC and PDF documents linked from it.

reference: [program write-up (Chinese)](http://www.goldencui.org/2014/10/15/%E7%BD%91%E7%BB%9C%E8%B5%84%E6%BA%90%E6%90%9C%E7%B4%A2%E7%88%AC%E8%99%AB(python%203.4.1%E5%AE%9E%E7%8E%B0)/)
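A minimal headless sketch of driving the crawler core directly (assuming the `Crawl` class defined in `spider.py` below; note that importing `spider` also builds the Tk GUI widgets, so `spiderframe.py` remains the intended entry point). This mirrors the commented-out example near the bottom of `spider.py`:

```python
# Hypothetical direct use of the crawler core, bypassing the GUI.
from spider import Crawl

crawler = Crawl('http://papers.nips.cc/paper/5138-the-randomized-dependence-coefficient')
crawler.craw()   # BFS over links; any PDF/DOC hit is saved under FILEDIR
```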
--------------------------------------------------------------------------------

/spiderframe.py:
--------------------------------------------------------------------------------
from spider import *
from spider import _ok

content.grid(column=0, row=0, sticky=(N, S, E, W))
Labelurl.grid(column=0, row=0, columnspan=4, rowspan=1, pady=5, padx=3)

lf.grid(column=0, row=1, columnspan=4, sticky=(N, S, E, W), rowspan=1, pady=10, padx=3)

namelbl.grid(column=0, row=0)
name.grid(column=1, row=0, columnspan=2, rowspan=1)
example.grid(column=3, row=0)


ok.grid(column=0, row=3)
output.grid(column=1, row=3)
Set.grid(column=2, row=3)
Exit.grid(column=3, row=3)

root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
content.columnconfigure(0, weight=1)
content.columnconfigure(1, weight=1)
content.columnconfigure(2, weight=1)
content.columnconfigure(3, weight=1)
content.rowconfigure(1, weight=1)

# The original bound the empty string (the '<Return>' sequence was most
# likely swallowed as an HTML tag when the source was rendered); binding
# Enter so it triggers the search is the apparent intent.
root.bind('<Return>', _ok)

root.mainloop()
--------------------------------------------------------------------------------

/spider.py:
--------------------------------------------------------------------------------
import sys
import re
import time
import math
import threading, zipfile
import urllib
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urlsplit
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from Queue import Queue   # the bundled Queue.py (a copy of the stdlib queue module)
from tkinter import filedialog


FILEDIR = 'H:/python程序/file/'   # default download directory

Urls_Content = {}


class FetchPage:
    """Fetch a URL and, if it points at a PDF/DOC file, download it."""

    def __init__(self, url):
        self.url = url

    def fetch(self):
        req = urllib.request.Request(self.url)
        #req.add_header('Referer', 'self.url')
        req.add_header('User-agent', 'Mozilla/5.0')
        response = urllib.request.urlopen(req)
        return response

    def getcategory(self):
        filepdf = re.search(r'\.pdf\Z', self.url, re.IGNORECASE)
        if filepdf:
            try:
                self.getpdf()
            except Exception:
                print('some error ignored!')
                return None
            return 'pdf'
        # The original pattern r'\.doc|\.docx\Z' anchored only the .docx
        # alternative, so ".doc" matched anywhere in the URL; anchor both.
        filedoc = re.search(r'\.docx?\Z', self.url, re.IGNORECASE)
        if filedoc:
            try:
                self.getdoc()
            except Exception:
                print('some error ignored!')
                return None
            return 'doc'
        return None

    def getpdf(self):
        response = self.fetch()
        purename = re.split('/', urlsplit(self.url).geturl())
        with open(FILEDIR + purename[-1], 'wb') as f:
            f.write(response.read())
        textin('PDF : ' + purename[-1] + ' 已下载......')

    def getdoc(self):
        response = self.fetch()
        purename = re.split('/', urlsplit(self.url).geturl())
        with open(FILEDIR + purename[-1], 'wb') as f:
            f.write(response.read())
        textin('DOC : ' + purename[-1] + ' 已下载......')


class Crawl:
    def __init__(self, root_url):
        self.root = root_url
        self.urls = set()
        self.host = urlparse(root_url)[1]   # netloc of the seed URL

    def craw(self):
        html_depth = 0
        Que = Queue()
        Que.put(self.root)
        Que.put('#level#')   # sentinel: helps count the html tree depth

        while not Que.empty():
            url = Que.get()
            if html_depth == 10:   # stop after ten levels of links
                break
            if url == '#level#':
                html_depth += 1
                Que.put('#level#')
                print('html_deep ->' + html_depth.__str__())
                continue

            fetchpage = FetchPage(url)
            # download the file if the URL points at a PDF/DOC
            filetype = fetchpage.getcategory()
            if filetype is not None:
                continue

            self.urls.add(url)
            try:
                self.page = fetchpage.fetch()
            except Exception:
                print('some error ignored!')
                continue
            soup = BeautifulSoup(self.page, 'html.parser')   # explicit parser
            # find_all('a') is the bs4 spelling; the original passed a dict
            # ({'a': True}), a BeautifulSoup 3 idiom
            urllist = soup.find_all('a')
            for item in urllist:
                url_temp = item.get('href')
                if url_temp is None:   # <a> tags without href would crash urlparse
                    continue
                url_content = item.get_text()
                m = urlparse(url_temp)
                url_temp = urljoin(url, m.geturl())
                if url_temp not in self.urls:
                    Que.put(url_temp)
                    self.urls.add(url_temp)
                    Urls_Content[url_temp] = url_content

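# The '#level#' sentinel above tracks BFS depth with a plain FIFO queue:
# each time the marker is dequeued, one full level has been consumed, so the
# depth counter advances and the marker is re-enqueued behind the next level.
# A minimal, self-contained sketch of the same pattern (hypothetical toy
# graph; defined for illustration only, never called by the crawler):
def _demo_depth_sentinel():
    graph = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
    que, seen, depth = Queue(), {'a'}, 0
    que.put('a')
    que.put('#level#')          # marker sits behind the whole current level
    while not que.empty():
        node = que.get()
        if node == '#level#':
            if que.empty():     # marker was the last item: traversal done
                break
            depth += 1
            que.put('#level#')  # re-arm behind the freshly enqueued level
            continue
        print(depth, node)      # -> 0 a, 1 b, 1 c, 2 d
        for nxt in graph[node]:
            if nxt not in seen:
                seen.add(nxt)
                que.put(nxt)
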
class multiSuperSpider(threading.Thread):
    def __init__(self, host, num):
        threading.Thread.__init__(self)
        self.host = host
        self.num = num

    def run(self):
        Crawler = Crawl(self.host)
        Crawler.craw()
        #textin('little spider: ' + self.num.__str__() + '--->' + self.host)

#Crawler = Crawl('http://papers.nips.cc/paper/5138-the-randomized-dependence-coefficient')
#Crawler.craw()

from tkinter import *
from tkinter import ttk
from tkinter import messagebox

hosts = set()

def textin(msg):
    # append a line to the (normally read-only) log widget
    t['state'] = 'normal'
    t.insert('end', '\n' + msg)
    t['state'] = 'disabled'

def examineinput(url):
    # accept "http://<word>."-style URLs and reject a doubled scheme
    match = re.search(r'http://\w+\.', url, re.IGNORECASE)
    match1 = re.search(r'http://http://', url, re.IGNORECASE)
    if match1:
        return False
    if match:
        return True
    return False

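# Quick sanity checks for the validator above (hypothetical inputs; defined
# for illustration only, never called by the GUI):
def _demo_examineinput():
    assert examineinput('http://www.baidu.com')
    assert not examineinput('http://http://www.baidu.com')   # doubled scheme
    assert not examineinput('notaurl')
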
def _ok(*args):
    urlname = name.get()
    url = 'http://' + urlname
    if not examineinput(url):
        messagebox.showinfo(message='输入网址(URL)格式错误')
        return
    name.state(['readonly'])
    ok.state(['disabled'])
    p.grid(column=0, row=4, columnspan=4, sticky=(N, S, E, W), rowspan=1, pady=5)
    p.start()
    textin('开始检索,主要搜索PDF和DOC/DOCX文档')
    # fetch the seed page once, then hand each outgoing link to its own
    # spider thread
    fetchpage = FetchPage(url)
    urls = fetchpage.fetch()
    soup = BeautifulSoup(urls, 'html.parser')
    urllist = soup.find_all('a')
    for item in urllist:
        url_temp = item.get('href')
        if url_temp is None:
            continue
        m = urlparse(url_temp)
        url_temp = urljoin(url, m.geturl())
        hosts.add(url_temp)
    # run one spider thread per harvested host
    i = 1
    for host in hosts:
        if examineinput(host):
            backspider = multiSuperSpider(host, i)
            backspider.start()
            #backspider.setDaemon(True)
            i += 1
    return

def _output():
    t.grid(column=0, row=5, columnspan=4, pady=10)
    About.grid(column=1, row=6, columnspan=2)
    output.state(['disabled'])

def _Set(*args):
    def _savedir():
        # the original assigned FILEDIR locally, so the chosen directory never
        # took effect; declare it global so the setting sticks
        global FILEDIR
        dirname = filedialog.askdirectory()
        FILEDIR = dirname
        textin('存储路径修改为:' + FILEDIR)
        direction.set(FILEDIR)

    setwin = Toplevel(root)
    setwin.resizable(FALSE, FALSE)
    setwin.title('参数设置')
    setwin.geometry('500x178+1200+400')
    content = ttk.Frame(setwin, padding=(3, 3, 3, 3))
    LabelDir = ttk.Label(content, text='文件存储路径')
    direction = StringVar()
    DIR = ttk.Entry(content, state='readonly', textvariable=direction, width=32)
    ok = ttk.Button(content, text="修改", command=_savedir)

    content.grid(column=0, row=0, sticky=(N, S, E, W))
    LabelDir.grid(column=0, row=0, pady=5)
    DIR.grid(column=1, row=0)
    ok.grid(column=2, row=0)
    direction.set(FILEDIR)

    setwin.mainloop()

def _about():
    aboutwin = Toplevel(root)
    aboutwin.resizable(FALSE, FALSE)
    aboutwin.title('关于我')
    aboutwin.geometry('+1200+500')
    ll = ttk.Frame(aboutwin, padding=(23, 13, 23, 13))
    Labelme = ttk.Label(ll, text='个人主页: www.goldencui.org')
    ll.grid(column=0, row=0, sticky=(N, S, E, W))
    Labelme.grid(column=0, row=0)

def _Exit(*args):
    sys.exit()


root = Tk()
root.resizable(FALSE, FALSE)
root.title('资源搜索爬虫 Beta1')
root.geometry('+800+400')
content = ttk.Frame(root, padding=(20, 3, 20, 3))
image = PhotoImage(file='title.gif')
Labelurl = ttk.Label(content)
Labelurl['image'] = image


lf = ttk.Labelframe(content, text='输入要抓取网址')

namelbl = ttk.Label(lf, text="http://")
url = StringVar()
name = ttk.Entry(lf, textvariable=url)
example = ttk.Label(lf, text="例如:www.baidu.com")

p = ttk.Progressbar(content, orient=HORIZONTAL, length=380, mode='indeterminate')

ok = ttk.Button(content, text="搜索", command=_ok, width=8)
output = ttk.Button(content, text="输出", command=_output, width=8)
Set = ttk.Button(content, text="设置", command=_Set, width=8)
Exit = ttk.Button(content, text="退出", command=_Exit, width=8)
# (the original also passed yscrollcommand='yview', which is not a usable
# callback; a Scrollbar would have to be wired up instead)
t = Text(content, width=53, height=20, state='disabled')
About = ttk.Button(content, text="关于", command=_about, width=10)
--------------------------------------------------------------------------------

/Queue.py:
--------------------------------------------------------------------------------
'''A multi-producer, multi-consumer queue.'''

try:
    import threading
except ImportError:
    import dummy_threading as threading
from collections import deque
from heapq import heappush, heappop
try:
    from time import monotonic as time
except ImportError:
    from time import time

__all__ = ['Empty', 'Full', 'Queue', 'PriorityQueue', 'LifoQueue']

class Empty(Exception):
    'Exception raised by Queue.get(block=0)/get_nowait().'
    pass

class Full(Exception):
    'Exception raised by Queue.put(block=0)/put_nowait().'
    pass

class Queue:
    '''Create a queue object with a given maximum size.

    If maxsize is <= 0, the queue size is infinite.
    '''

    def __init__(self, maxsize=0):
        self.maxsize = maxsize
        self._init(maxsize)

        # mutex must be held whenever the queue is mutating. All methods
        # that acquire mutex must release it before returning. mutex
        # is shared between the three conditions, so acquiring and
        # releasing the conditions also acquires and releases mutex.
        self.mutex = threading.Lock()

        # Notify not_empty whenever an item is added to the queue; a
        # thread waiting to get is notified then.
        self.not_empty = threading.Condition(self.mutex)

        # Notify not_full whenever an item is removed from the queue;
        # a thread waiting to put is notified then.
        self.not_full = threading.Condition(self.mutex)

        # Notify all_tasks_done whenever the number of unfinished tasks
        # drops to zero; thread waiting to join() is notified to resume
        self.all_tasks_done = threading.Condition(self.mutex)
        self.unfinished_tasks = 0

    def task_done(self):
        '''Indicate that a formerly enqueued task is complete.

        Used by Queue consumer threads. For each get() used to fetch a task,
        a subsequent call to task_done() tells the queue that the processing
        on the task is complete.

        If a join() is currently blocking, it will resume when all items
        have been processed (meaning that a task_done() call was received
        for every item that had been put() into the queue).

        Raises a ValueError if called more times than there were items
        placed in the queue.
        '''
        with self.all_tasks_done:
            unfinished = self.unfinished_tasks - 1
            if unfinished <= 0:
                if unfinished < 0:
                    raise ValueError('task_done() called too many times')
                self.all_tasks_done.notify_all()
            self.unfinished_tasks = unfinished

    def join(self):
        '''Blocks until all items in the Queue have been gotten and processed.

        The count of unfinished tasks goes up whenever an item is added to the
        queue. The count goes down whenever a consumer thread calls task_done()
        to indicate the item was retrieved and all work on it is complete.

        When the count of unfinished tasks drops to zero, join() unblocks.
        '''
        with self.all_tasks_done:
            while self.unfinished_tasks:
                self.all_tasks_done.wait()

    def qsize(self):
        '''Return the approximate size of the queue (not reliable!).'''
        with self.mutex:
            return self._qsize()

    def empty(self):
        '''Return True if the queue is empty, False otherwise (not reliable!).

        This method is likely to be removed at some point. Use qsize() == 0
        as a direct substitute, but be aware that either approach risks a race
        condition where a queue can grow before the result of empty() or
        qsize() can be used.

        To create code that needs to wait for all queued tasks to be
        completed, the preferred technique is to use the join() method.
        '''
        with self.mutex:
            return not self._qsize()

    def full(self):
        '''Return True if the queue is full, False otherwise (not reliable!).

        This method is likely to be removed at some point. Use qsize() >= n
        as a direct substitute, but be aware that either approach risks a race
        condition where a queue can shrink before the result of full() or
        qsize() can be used.
        '''
        with self.mutex:
            return 0 < self.maxsize <= self._qsize()

    def put(self, item, block=True, timeout=None):
        '''Put an item into the queue.

        If optional args 'block' is true and 'timeout' is None (the default),
        block if necessary until a free slot is available. If 'timeout' is
        a non-negative number, it blocks at most 'timeout' seconds and raises
        the Full exception if no free slot was available within that time.
        Otherwise ('block' is false), put an item on the queue if a free slot
        is immediately available, else raise the Full exception ('timeout'
        is ignored in that case).
        '''
        with self.not_full:
            if self.maxsize > 0:
                if not block:
                    if self._qsize() >= self.maxsize:
                        raise Full
                elif timeout is None:
                    while self._qsize() >= self.maxsize:
                        self.not_full.wait()
                elif timeout < 0:
                    raise ValueError("'timeout' must be a non-negative number")
                else:
                    endtime = time() + timeout
                    while self._qsize() >= self.maxsize:
                        remaining = endtime - time()
                        if remaining <= 0.0:
                            raise Full
                        self.not_full.wait(remaining)
            self._put(item)
            self.unfinished_tasks += 1
            self.not_empty.notify()

    def get(self, block=True, timeout=None):
        '''Remove and return an item from the queue.

        If optional args 'block' is true and 'timeout' is None (the default),
        block if necessary until an item is available. If 'timeout' is
        a non-negative number, it blocks at most 'timeout' seconds and raises
        the Empty exception if no item was available within that time.
        Otherwise ('block' is false), return an item if one is immediately
        available, else raise the Empty exception ('timeout' is ignored
        in that case).
        '''
        with self.not_empty:
            if not block:
                if not self._qsize():
                    raise Empty
            elif timeout is None:
                while not self._qsize():
                    self.not_empty.wait()
            elif timeout < 0:
                raise ValueError("'timeout' must be a non-negative number")
            else:
                endtime = time() + timeout
                while not self._qsize():
                    remaining = endtime - time()
                    if remaining <= 0.0:
                        raise Empty
                    self.not_empty.wait(remaining)
            item = self._get()
            self.not_full.notify()
            return item

    def put_nowait(self, item):
        '''Put an item into the queue without blocking.

        Only enqueue the item if a free slot is immediately available.
        Otherwise raise the Full exception.
        '''
        return self.put(item, block=False)

    def get_nowait(self):
        '''Remove and return an item from the queue without blocking.

        Only get an item if one is immediately available. Otherwise
        raise the Empty exception.
        '''
        return self.get(block=False)

    # Override these methods to implement other queue organizations
    # (e.g. stack or priority queue).
    # These will only be called with appropriate locks held

    # Initialize the queue representation
    def _init(self, maxsize):
        self.queue = deque()

    def _qsize(self):
        return len(self.queue)

    # Put a new item in the queue
    def _put(self, item):
        self.queue.append(item)

    # Get an item from the queue
    def _get(self):
        return self.queue.popleft()

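# A small producer/consumer sketch of the task_done()/join() protocol
# documented above (hypothetical worker, following the standard queue
# recipe; defined for illustration only):
def _demo_worker_pool():
    q = Queue()

    def worker():
        while True:
            item = q.get()
            if item is None:        # poison pill: shut the worker down
                break
            print('processed', item)
            q.task_done()           # exactly one task_done() per get()

    w = threading.Thread(target=worker)
    w.start()
    for item in range(5):
        q.put(item)
    q.join()                        # blocks until every item is task_done()'d
    q.put(None)
    w.join()
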
class PriorityQueue(Queue):
    '''Variant of Queue that retrieves open entries in priority order (lowest first).

    Entries are typically tuples of the form: (priority number, data).
    '''

    def _init(self, maxsize):
        self.queue = []

    def _qsize(self):
        return len(self.queue)

    def _put(self, item):
        heappush(self.queue, item)

    def _get(self):
        return heappop(self.queue)

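# Illustrative only: (priority, data) tuples come back lowest-priority-first,
# regardless of insertion order (hypothetical entries; never called here):
def _demo_priority_queue():
    pq = PriorityQueue()
    for entry in [(3, 'low'), (1, 'urgent'), (2, 'normal')]:
        pq.put(entry)
    while not pq.empty():
        print(pq.get())   # (1, 'urgent'), (2, 'normal'), (3, 'low')
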
221 | ''' 222 | 223 | def _init(self, maxsize): 224 | self.queue = [] 225 | 226 | def _qsize(self): 227 | return len(self.queue) 228 | 229 | def _put(self, item): 230 | heappush(self.queue, item) 231 | 232 | def _get(self): 233 | return heappop(self.queue) 234 | 235 | 236 | class LifoQueue(Queue): 237 | '''Variant of Queue that retrieves most recently added entries first.''' 238 | 239 | def _init(self, maxsize): 240 | self.queue = [] 241 | 242 | def _qsize(self): 243 | return len(self.queue) 244 | 245 | def _put(self, item): 246 | self.queue.append(item) 247 | 248 | def _get(self): 249 | return self.queue.pop() --------------------------------------------------------------------------------