├── README.md
├── dagaier.py
├── dagaier.zip
└── snapshot
└── snap1.png
/README.md:
--------------------------------------------------------------------------------
1 | # caoliu_1024_dagaier_spider
2 | 爬取草榴论坛"达盖尔的旗帜"分类下的主题图片
3 |
4 | https://raw.githubusercontent.com/cary-zhou/caoliu_1024_dagaier_spider/master/dagaier.zip
5 |
6 | 运行:
7 | =========
8 | linux:
9 | python ./dagaier.py
10 | or
11 | windows:
12 | python .\dagaier.py
13 |
14 |
15 | 环境准备:
16 | =========
17 | windows or Linux
18 | ---------
19 | pip install pyquery
20 | pip install requests
21 | pip install -U requests[socks]
22 |
23 |
24 | 修改参数:
25 | =========
26 | 修改代理地址为自己SS或SSR监听的地址端口
27 | proxy={"http":"socks5h://127.0.0.1:1088","https":"socks5h://127.0.0.1:1088"}
28 |
29 | 请合理设置线程数
30 | work_manager=ThreadManager(8)
31 |
32 | 请修改需要爬取的主题分页数
33 | while offset<10: #主题列表分页数
34 |
35 |
36 | 预编译二进制:
37 | =========
38 | 压缩包:dagaier.zip,是windows下直接可双击执行的exe文件,
39 | 使用时需要解压exe可执行文件出来,不要在zip压缩管理器内直接双击执行,免得爬虫运行完了找不到肉。
40 | 然后启动你的SSR代理->选项设置->本地端口,填1088,因为程序内手写了通过本地socks5h://127.0.0.1:1088爬梯。
41 | 如图:
42 | ![snap1](snapshot/snap1.png)
43 |
爬取到的资源放在exe同级目录的images文件夹下,每个帖子每个文件夹分开存放,文件夹名就是帖子标题名。
44 |
--------------------------------------------------------------------------------
/dagaier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python2.7
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import re
5 | import sys
6 | import time
7 | import inspect
8 | import logging
9 | try:
10 | import httplib
11 | httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
12 | except:
13 | pass
14 | import requests
15 | import threading
16 | from pyquery import PyQuery as pq
17 | try:
18 | from Queue import Queue as queue
19 | except:
20 | from queue import Queue as queue
21 |
# Root logger: timestamped, single-line records at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-6s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# Shorten the built-in level names so log columns line up.
logging.addLevelName(50, 'CRIT')
logging.addLevelName(30, 'WARN')

# HTTP headers sent with every request (desktop Chrome user agent).
header = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
}

# SOCKS5 proxy required when crawling from mainland China (SS/SSR local port).
proxy = {"http": "socks5h://127.0.0.1:1088", "https": "socks5h://127.0.0.1:1088"}
36 |
class ThreadManager(object):
    '''Thread-pool manager: owns the task queue, the worker-thread list and
    the shared shutdown flag.'''
    def __init__(self, num):
        self.thread_num = num               # number of worker threads
        self.queue = queue()                # task queue of (url, title) tuples
        self.threadlist = list()            # pool of ThreadWork instances
        self.shutdown = threading.Event()   # set() signals workers to exit

    def add_task(self, topic_url, topic_title):
        '''Queue one topic for download.'''
        self.queue.put((topic_url, topic_title))

    def __start__(self):
        '''Create and start all worker threads.'''
        for i in range(self.thread_num):
            worker = ThreadWork(self.queue, self.shutdown, i)
            worker.start()
            self.threadlist.append(worker)

    def loop(self):
        '''Replace any dead worker so the pool keeps its size.

        BUG FIX: the original passed the dead Thread object itself as the
        numeric id and never stored the replacement back into threadlist,
        so the pool permanently shrank; it also used Thread.isAlive(),
        which was removed in Python 3.9 (is_alive() works on 2.6+ and 3.x).'''
        for idx, worker in enumerate(self.threadlist):
            if not worker.is_alive():
                replacement = ThreadWork(self.queue, self.shutdown, worker.name)
                replacement.start()
                self.threadlist[idx] = replacement

    def waitcomplete(self):
        '''Block until every worker thread has exited.'''
        for worker in self.threadlist:
            if worker.is_alive():
                worker.join()

    def isEmpty(self):
        '''Return True when the task queue is empty.'''
        return self.queue.empty()

    def __close__(self):
        '''Raise the shutdown flag so workers leave their run loop.'''
        self.shutdown.set()
75 |
class ThreadWork(threading.Thread):
    '''Worker thread: pulls (url, title) tasks off the shared queue and
    downloads them until the shutdown event is set.'''
    def __init__(self, work_queue, shutdown, num):
        threading.Thread.__init__(self)
        # Modern attribute API instead of the deprecated camelCase aliases
        # setName()/setDaemon() (deprecated since Python 3.10).
        self.name = str(num)
        self.tasklist = work_queue
        self.shutdown = shutdown
        # Daemon so a blocked worker cannot keep the process alive at exit.
        self.daemon = True

    def run(self):
        while True:
            # is_set() instead of the deprecated isSet() alias.
            if self.shutdown.is_set():
                logging.info(u"线程ID:%s,检测到线程退出标志!" % (self.name))
                break
            try:
                # Short timeout so the shutdown flag is re-checked regularly.
                url, title = self.tasklist.get(timeout=3)
            except Exception:  # queue.Empty on timeout — just poll again
                continue
            else:
                dagaier(url, title)
96 |
def dagaier(topicurl, title):
    '''Download one forum topic: fetch its page (up to 3 attempts), extract
    every image URL from the content div and download each image.

    BUG FIX: the original returned False at the FIRST <img> without an
    'ess-data' attribute, abandoning all remaining images of the post;
    such images are now logged and skipped instead.'''
    topic_req = None
    error_count = 0
    while True:
        if error_count > 2:  # give up after three failed attempts
            logging.warning(u"线程ID:%s,下载帖子内容失败, URL:%s" % (threading.current_thread().name, topicurl))
            return
        try:
            topic_req = requests.get(topicurl, headers=header, proxies=proxy, timeout=10)
            topic_req.encoding = 'gbk'  # the forum serves GBK-encoded pages
            if topic_req.status_code != 200:
                error_count += 1
                continue
        except Exception:  # network/proxy error — retry
            error_count += 1
            continue
        else:
            break
    topic_pq = pq(topic_req.text)
    imglist = topic_pq("div[class='tpc_content do_not_catch']").find("img").items()
    for item in imglist:
        # The real image URL lives in the lazy-load attribute 'ess-data'.
        if item.attr('ess-data') is not None:
            logging.warning(u"提取图片URL:%s" % (item.attr('ess-data')))
            downimg(item.attr('ess-data'), title)
        else:
            logging.warning(u"线程ID:%s,读取帖子图片URL失败, URL:%s" % (threading.current_thread().name, topicurl))
125 |
def downimg(url, title):
    '''Download one image into ./images/<sanitized title>/<sanitized name>.

    Retries the HTTP GET up to 3 times. Returns True on success, False when
    the target directory cannot be created, None after exhausted retries.'''
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # characters illegal in Windows filenames
    imgname = re.sub(rstr, "_", url.split('/')[-1])
    error_count = 0
    while True:
        if error_count > 2:  # give up after three failed attempts
            logging.warning(u"线程ID:%s,下载帖子图片失败, URL:%s" % (threading.current_thread().name, url))
            return
        try:
            img_req = requests.get(url, headers=header, proxies=proxy, timeout=10)
            if img_req.status_code != 200:
                error_count += 1
                continue
        except Exception:  # network/proxy error — retry
            error_count += 1
            continue
        else:
            break
    dirname = os.path.join("./images", re.sub(rstr, "_", title))
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            # BUG FIX: several workers download images of the same topic, so
            # two threads can race between exists() and makedirs(); only fail
            # if the directory really does not exist afterwards.
            if not os.path.isdir(dirname):
                logging.error(u"创建目录失败:\"%s\"" % (dirname))
                return False
    # 'wb' is sufficient — the file is only written, never read back.
    with open(os.path.join(dirname, imgname), 'wb') as fd:
        fd.write(img_req.content)
    return True
155 |
if __name__=='__main__':
    # Run relative to the script's own directory so ./images lands beside it.
    os.chdir(os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe()))))
    if not os.path.exists("./images"):
        try:
            os.makedirs("./images")
        except OSError:
            logging.critical(u"创建images目录失败,请检查当前用户是否有权限新建目录!")
            sys.exit(-1)
    work_manager = ThreadManager(8)  # worker thread count
    work_manager.__start__()
    BasicURL = 'http://t66y.com/'
    offset = 0
    error_count = 0
    while offset < 10:  # number of topic-list pages to crawl
        offset += 1
        if error_count >= 3:  # three consecutive failures: skip this page
            logging.error(u"遍历主题列表页失败!页码:%i" % (offset))
            error_count = 0
            continue
        PageList = 'http://t66y.com/thread0806.php?fid=16&search=&page=' + str(offset)
        try:
            # BUG FIX: the original issued this request unguarded, so one
            # network timeout crashed the whole crawler; now it retries
            # through the same error_count mechanism as a bad status code.
            Page_Obj = requests.get(PageList, headers=header, proxies=proxy, timeout=10)
        except Exception:
            error_count += 1
            logging.warning(u"下载主题列表分页失败:%i,重试:%i" % (offset, error_count))
            offset -= 1  # retry the same page
            continue
        Page_Obj.encoding = 'utf-8'
        if Page_Obj.status_code != 200:
            error_count += 1
            # logging.warning(): logging.warn() is a deprecated alias.
            logging.warning(u"下载主题列表分页失败:%i,重试:%i" % (offset, error_count))
            offset -= 1  # retry the same page
            continue
        error_count = 0
        PagePQ = pq(Page_Obj.text)
        TopicList = PagePQ("tbody>tr[class='tr3 t_one tac']>.tal>h3>a").items()
        for i in TopicList:
            # Only internal topic links start with 'htm_data'; skip ads etc.
            if i.attr('href')[0:8] == 'htm_data':
                work_manager.add_task(BasicURL + i.attr('href'), i.text())
    while not work_manager.isEmpty():
        work_manager.loop()  # respawn any dead workers while tasks remain
        time.sleep(1)
    logging.info(u"设置程序关闭标志")
    work_manager.__close__()
    logging.info(u"等待所有线程退出")
    work_manager.waitcomplete()
    sys.exit(0)
197 |
198 |
--------------------------------------------------------------------------------
/dagaier.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/dagaier.zip
--------------------------------------------------------------------------------
/snapshot/snap1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/snapshot/snap1.png
--------------------------------------------------------------------------------