├── README.md ├── dagaier.py ├── dagaier.zip └── snapshot └── snap1.png /README.md: -------------------------------------------------------------------------------- 1 | # caoliu_1024_dagaier_spider 2 | 爬取草榴论坛"达盖尔的旗帜"分类下的主题图片
3 | 4 | https://raw.githubusercontent.com/cary-zhou/caoliu_1024_dagaier_spider/master/dagaier.zip 5 | 6 | 运行: 7 | ========= 8 | linux:
9 | python ./dagaier.py
10 | or
11 | windows:
12 | python .\dagaier.py
13 | 14 | 15 | 环境准备: 16 | ========= 17 | windows or Linux
18 | --------- 19 | pip install pyquery
20 | pip install requests
21 | pip install -U requests[socks]
22 | 23 | 24 | 修改参数:
25 | ========= 26 | 修改代理地址为自己SS或SSR监听的地址端口
27 | proxy={"http":"socks5h://127.0.0.1:1088","https":"socks5h://127.0.0.1:1088"}
28 |
29 | 请合理设置线程数
30 | work_manager=ThreadManager(8)
31 |
32 | 请修改需要爬取的主题分页数
33 | while offset<10: #主题列表分页数
34 |
35 | 36 | 预编译二进制: 37 | ========= 38 | 压缩包:dagaier.zip,是windows下直接可双击执行的exe文件,
39 | 使用时需要解压exe可执行文件出来,不要在zip压缩管理器内直接双击执行,免得爬虫运行完了找不到肉。
40 | 然后启动你的SSR代理->选项设置->本地端口,填1088,因为程序内手写了通过本地socks5h://127.0.0.1:1088爬梯。
41 | 如图:
42 | ![image](https://github.com/cary-zhou/caoliu_1024_dagaier_spider/raw/master/snapshot/snap1.png) 43 |
爬取到的资源放在exe同级目录的images文件夹下,每个帖子每个文件夹分开存放,文件夹名就是帖子标题名。
44 | -------------------------------------------------------------------------------- /dagaier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.7 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import re 5 | import sys 6 | import time 7 | import inspect 8 | import logging 9 | try: 10 | import httplib 11 | httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' 12 | except: 13 | pass 14 | import requests 15 | import threading 16 | from pyquery import PyQuery as pq 17 | try: 18 | from Queue import Queue as queue 19 | except: 20 | from queue import Queue as queue 21 | 22 | logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)-6s %(message)s',datefmt='%Y-%m-%d %H:%M:%S') 23 | logging.addLevelName(50,'CRIT') 24 | logging.addLevelName(30,'WARN') 25 | 26 | header={ 27 | "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 28 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 29 | "Accept-Language":"zh-CN,zh;q=0.9", 30 | "Accept-Encoding":"gzip, deflate", 31 | "Connection":"close" 32 | } 33 | 34 | '''国内抓取需配置代理''' 35 | proxy={"http":"socks5h://127.0.0.1:1088","https":"socks5h://127.0.0.1:1088"} 36 | 37 | class ThreadManager(object): 38 | '''线程池管理''' 39 | def __init__(self,num): 40 | self.thread_num=num #线程数量 41 | self.queue=queue() #任务队列 42 | self.threadlist=list() #线程池列表 43 | self.shutdown=threading.Event() #线程退出标志 44 | 45 | def add_task(self,topic_url,topic_title): 46 | '''添加任务''' 47 | self.queue.put((topic_url,topic_title)) 48 | 49 | def __start__(self): 50 | '''线程初始化''' 51 | for i in range(self.thread_num): 52 | i=ThreadWork(self.queue,self.shutdown,i) 53 | i.start() 54 | self.threadlist.append(i) 55 | 56 | def loop(self): 57 | for i in self.threadlist: 58 | if not i.isAlive(): 59 | i=ThreadWork(self.queue,self.shutdown,i) 60 | i.start() 61 | 62 | def waitcomplete(self): 63 | 
'''等待线程退出''' 64 | for i in self.threadlist: 65 | if i.isAlive(): 66 | i.join() 67 | 68 | def isEmpty(self): 69 | '''判断任务队列为空''' 70 | return self.queue.empty() 71 | 72 | def __close__(self): 73 | '''设置线程退出标志''' 74 | self.shutdown.set() 75 | 76 | class ThreadWork(threading.Thread): 77 | '''工作线程入口''' 78 | def __init__(self,work_queue,shutdown,num): 79 | threading.Thread.__init__(self) 80 | self.setName(str(num)) 81 | self.tasklist=work_queue 82 | self.shutdown=shutdown 83 | self.setDaemon(True) 84 | 85 | def run(self): 86 | while True: 87 | if self.shutdown.isSet(): 88 | logging.info(u"线程ID:%s,检测到线程退出标志!"%(self.getName())) 89 | break 90 | try: 91 | url,title=self.tasklist.get(timeout=3) 92 | except: 93 | continue 94 | else: 95 | dagaier(url,title) 96 | 97 | def dagaier(topicurl,title): 98 | '''下载帖子内容''' 99 | topic_req=None 100 | error_count=0 101 | while True: 102 | if error_count>2: #异常或错误超过三次 103 | logging.warning(u"线程ID:%s,下载帖子内容失败, URL:%s"%(threading.currentThread().getName(),topicurl)) 104 | return 105 | try: 106 | topic_req=requests.get(topicurl,headers=header,proxies=proxy,timeout=10) 107 | topic_req.encoding='gbk' 108 | if topic_req.status_code!=200: 109 | error_count+=1 110 | continue 111 | except: 112 | error_count+=1 113 | continue 114 | else: 115 | break 116 | topic_pq=pq(topic_req.text) 117 | imglist=topic_pq("div[class='tpc_content do_not_catch']").find("img").items() 118 | for item in imglist: 119 | if item.attr('ess-data') is not None: 120 | logging.warning(u"提取图片URL:%s"%(item.attr('ess-data'))) 121 | downimg(item.attr('ess-data'),title) 122 | else: 123 | logging.warning(u"线程ID:%s,读取帖子图片URL失败, URL:%s"%(threading.currentThread().getName(),topicurl)) 124 | return False 125 | 126 | def downimg(url,title): 127 | '''下载帖子图片''' 128 | rstr = r"[\/\\\:\*\?\"\<\>\|]" 129 | imgname=re.sub(rstr, "_", url.split('/')[-1]) 130 | error_count=0 131 | while True: 132 | if error_count>2: 133 | logging.warning(u"线程ID:%s,下载帖子图片失败, 
URL:%s"%(threading.currentThread().getName(),url)) 134 | return 135 | try: 136 | img_req=requests.get(url,headers=header,proxies=proxy,timeout=10) 137 | if img_req.status_code!=200: 138 | error_count+=1 139 | continue 140 | except: 141 | error_count+=1 142 | continue 143 | else: 144 | break 145 | dirname=re.sub(rstr, "_", title) 146 | if not os.path.exists("./images/"+dirname): 147 | try: 148 | os.makedirs("./images/"+dirname) 149 | except: 150 | logging.error(u"创建目录失败:\"%s\""%("./images/"+dirname)) 151 | return False 152 | with open("./images/"+dirname+'/'+imgname,'wb+') as fd: 153 | fd.write(img_req.content) 154 | return True 155 | 156 | if __name__=='__main__': 157 | os.chdir(os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe())))) 158 | if not os.path.exists("./images"): 159 | try: 160 | os.makedirs("./images") 161 | except: 162 | logging.critical(u"创建images目录失败,请检查当前用户是否有权限新建目录!") 163 | sys.exit(-1) 164 | work_manager=ThreadManager(8) #线程数 165 | work_manager.__start__() 166 | BasicURL='http://t66y.com/' 167 | offset=0 168 | error_count=0 169 | while offset<10: #主题列表分页数 170 | offset+=1 171 | if error_count>=3: 172 | logging.error(u"遍历主题列表页失败!页码:%i"%(offset)) 173 | error_count=0 174 | continue 175 | PageList='http://t66y.com/thread0806.php?fid=16&search=&page='+str(offset) 176 | Page_Obj=requests.get(PageList,headers=header,proxies=proxy,timeout=10) 177 | Page_Obj.encoding='utf-8' 178 | if Page_Obj.status_code!=200: 179 | error_count+=1 180 | logging.warn(u"下载主题列表分页失败:%i,重试:%i"%(offset,error_count)) 181 | offset-=1 182 | continue 183 | error_count=0 184 | PagePQ=pq(Page_Obj.text) 185 | TopicList=PagePQ("tbody>tr[class='tr3 t_one tac']>.tal>h3>a").items() 186 | for i in TopicList: 187 | if i.attr('href')[0:8]=='htm_data': 188 | work_manager.add_task(BasicURL+i.attr('href'),i.text()) 189 | while not work_manager.isEmpty(): 190 | work_manager.loop() 191 | time.sleep(1) 192 | logging.info(u"设置程序关闭标志") 193 | work_manager.__close__() 194 | 
logging.info(u"等待所有线程退出") 195 | work_manager.waitcomplete() 196 | sys.exit(0) 197 | 198 | -------------------------------------------------------------------------------- /dagaier.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/dagaier.zip -------------------------------------------------------------------------------- /snapshot/snap1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/snapshot/snap1.png --------------------------------------------------------------------------------