├── README.md
├── dagaier.py
├── dagaier.zip
└── snapshot
└── snap1.png
/README.md:
--------------------------------------------------------------------------------
1 | # caoliu_1024_dagaier_spider
2 | 爬取草榴论坛"达盖尔的旗帜"分类下的主题图片
3 |
4 | https://raw.githubusercontent.com/cary-zhou/caoliu_1024_dagaier_spider/master/dagaier.zip
5 |
6 | 运行:
7 | =========
8 | linux:
9 | python ./dagaier.py
10 | or
11 | windows:
12 | python .\dagaier.py
13 |
14 |
15 | 环境准备:
16 | =========
17 | windows or Linux
18 | ---------
19 | pip install pyquery
20 | pip install requests
21 | pip install -U requests[socks]
22 |
23 |
24 | 修改参数:
25 | =========
26 | 修改代理地址为自己SS或SSR监听的地址端口
27 | proxy={"http":"socks5h://127.0.0.1:1088","https":"socks5h://127.0.0.1:1088"}
28 |
29 | 请合理设置线程数
30 | work_manager=ThreadManager(8)
31 |
32 | 请修改需要爬取的主题分页数
33 | while offset<10: #主题列表分页数
34 |
35 |
36 | 预编译二进制:
37 | =========
38 | 压缩包:dagaier.zip,是windows下直接可双击执行的exe文件,
39 | 使用时需要解压exe可执行文件出来,不要在zip压缩管理器内直接双击执行,免得爬虫运行完了找不到肉。
40 | 然后启动你的SSR代理->选项设置->本地端口,填1088,因为程序内手写了通过本地socks5h://127.0.0.1:1088爬梯。
41 | 如图:
42 | ![snap1](snapshot/snap1.png)
43 |
爬取到的资源放在exe同级目录的images文件夹下,每个帖子每个文件夹分开存放,文件夹名就是帖子标题名。
44 |
--------------------------------------------------------------------------------
/dagaier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python2.7
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import re
5 | import sys
6 | import time
7 | import inspect
8 | import logging
9 | try:
10 | import httplib
11 | httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
12 | except:
13 | pass
14 | import requests
15 | import threading
16 | from pyquery import PyQuery as pq
17 | try:
18 | from Queue import Queue as queue
19 | except:
20 | from queue import Queue as queue
21 |
# Root logger: timestamped, single-line records at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-6s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# Shorten the built-in level names so log columns line up.
logging.addLevelName(50, 'CRIT')
logging.addLevelName(30, 'WARN')

# HTTP headers sent with every request (desktop Chrome user agent).
header = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
}

# SOCKS5 proxy required when crawling from mainland China (SS/SSR local port).
proxy = {"http": "socks5h://127.0.0.1:1088", "https": "socks5h://127.0.0.1:1088"}
36 |
class ThreadManager(object):
    '''Thread-pool manager: owns the task queue, the worker-thread list and
    the shared shutdown flag.'''
    def __init__(self, num):
        self.thread_num = num               # number of worker threads
        self.queue = queue()                # task queue of (url, title) tuples
        self.threadlist = list()            # pool of ThreadWork instances
        self.shutdown = threading.Event()   # set() signals workers to exit

    def add_task(self, topic_url, topic_title):
        '''Queue one topic for download.'''
        self.queue.put((topic_url, topic_title))

    def __start__(self):
        '''Create and start all worker threads.'''
        for i in range(self.thread_num):
            worker = ThreadWork(self.queue, self.shutdown, i)
            worker.start()
            self.threadlist.append(worker)

    def loop(self):
        '''Replace any dead worker so the pool keeps its size.

        BUG FIX: the original passed the dead Thread object itself as the
        numeric id and never stored the replacement back into threadlist,
        so the pool permanently shrank; it also used Thread.isAlive(),
        which was removed in Python 3.9 (is_alive() works on 2.6+ and 3.x).'''
        for idx, worker in enumerate(self.threadlist):
            if not worker.is_alive():
                replacement = ThreadWork(self.queue, self.shutdown, worker.name)
                replacement.start()
                self.threadlist[idx] = replacement

    def waitcomplete(self):
        '''Block until every worker thread has exited.'''
        for worker in self.threadlist:
            if worker.is_alive():
                worker.join()

    def isEmpty(self):
        '''Return True when the task queue is empty.'''
        return self.queue.empty()

    def __close__(self):
        '''Raise the shutdown flag so workers leave their run loop.'''
        self.shutdown.set()
75 |
class ThreadWork(threading.Thread):
    '''Worker thread: pulls (url, title) tasks off the shared queue and
    downloads them until the shutdown event is set.'''
    def __init__(self, work_queue, shutdown, num):
        threading.Thread.__init__(self)
        # Modern attribute API instead of the deprecated camelCase aliases
        # setName()/setDaemon() (deprecated since Python 3.10).
        self.name = str(num)
        self.tasklist = work_queue
        self.shutdown = shutdown
        # Daemon so a blocked worker cannot keep the process alive at exit.
        self.daemon = True

    def run(self):
        while True:
            # is_set() instead of the deprecated isSet() alias.
            if self.shutdown.is_set():
                logging.info(u"线程ID:%s,检测到线程退出标志!" % (self.name))
                break
            try:
                # Short timeout so the shutdown flag is re-checked regularly.
                url, title = self.tasklist.get(timeout=3)
            except Exception:  # queue.Empty on timeout — just poll again
                continue
            else:
                dagaier(url, title)
96 |
def dagaier(topicurl, title):
    '''Download one forum topic: fetch its page (up to 3 attempts), extract
    every image URL from the content div and download each image.

    BUG FIX: the original returned False at the FIRST <img> without an
    'ess-data' attribute, abandoning all remaining images of the post;
    such images are now logged and skipped instead.'''
    topic_req = None
    error_count = 0
    while True:
        if error_count > 2:  # give up after three failed attempts
            logging.warning(u"线程ID:%s,下载帖子内容失败, URL:%s" % (threading.current_thread().name, topicurl))
            return
        try:
            topic_req = requests.get(topicurl, headers=header, proxies=proxy, timeout=10)
            topic_req.encoding = 'gbk'  # the forum serves GBK-encoded pages
            if topic_req.status_code != 200:
                error_count += 1
                continue
        except Exception:  # network/proxy error — retry
            error_count += 1
            continue
        else:
            break
    topic_pq = pq(topic_req.text)
    imglist = topic_pq("div[class='tpc_content do_not_catch']").find("img").items()
    for item in imglist:
        # The real image URL lives in the lazy-load attribute 'ess-data'.
        if item.attr('ess-data') is not None:
            logging.warning(u"提取图片URL:%s" % (item.attr('ess-data')))
            downimg(item.attr('ess-data'), title)
        else:
            logging.warning(u"线程ID:%s,读取帖子图片URL失败, URL:%s" % (threading.current_thread().name, topicurl))
125 |
def downimg(url, title):
    '''Download one image into ./images/<sanitized title>/<sanitized name>.

    Retries the HTTP GET up to 3 times. Returns True on success, False when
    the target directory cannot be created, None after exhausted retries.'''
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # characters illegal in Windows filenames
    imgname = re.sub(rstr, "_", url.split('/')[-1])
    error_count = 0
    while True:
        if error_count > 2:  # give up after three failed attempts
            logging.warning(u"线程ID:%s,下载帖子图片失败, URL:%s" % (threading.current_thread().name, url))
            return
        try:
            img_req = requests.get(url, headers=header, proxies=proxy, timeout=10)
            if img_req.status_code != 200:
                error_count += 1
                continue
        except Exception:  # network/proxy error — retry
            error_count += 1
            continue
        else:
            break
    dirname = os.path.join("./images", re.sub(rstr, "_", title))
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            # BUG FIX: several workers download images of the same topic, so
            # two threads can race between exists() and makedirs(); only fail
            # if the directory really does not exist afterwards.
            if not os.path.isdir(dirname):
                logging.error(u"创建目录失败:\"%s\"" % (dirname))
                return False
    # 'wb' is sufficient — the file is only written, never read back.
    with open(os.path.join(dirname, imgname), 'wb') as fd:
        fd.write(img_req.content)
    return True
155 |
if __name__=='__main__':
    # Run relative to the script's own directory so ./images lands beside it.
    os.chdir(os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe()))))
    if not os.path.exists("./images"):
        try:
            os.makedirs("./images")
        except OSError:
            logging.critical(u"创建images目录失败,请检查当前用户是否有权限新建目录!")
            sys.exit(-1)
    work_manager = ThreadManager(8)  # worker thread count
    work_manager.__start__()
    BasicURL = 'http://t66y.com/'
    offset = 0
    error_count = 0
    while offset < 10:  # number of topic-list pages to crawl
        offset += 1
        if error_count >= 3:  # three consecutive failures: skip this page
            logging.error(u"遍历主题列表页失败!页码:%i" % (offset))
            error_count = 0
            continue
        PageList = 'http://t66y.com/thread0806.php?fid=16&search=&page=' + str(offset)
        try:
            # BUG FIX: the original issued this request unguarded, so one
            # network timeout crashed the whole crawler; now it retries
            # through the same error_count mechanism as a bad status code.
            Page_Obj = requests.get(PageList, headers=header, proxies=proxy, timeout=10)
        except Exception:
            error_count += 1
            logging.warning(u"下载主题列表分页失败:%i,重试:%i" % (offset, error_count))
            offset -= 1  # retry the same page
            continue
        Page_Obj.encoding = 'utf-8'
        if Page_Obj.status_code != 200:
            error_count += 1
            # logging.warning(): logging.warn() is a deprecated alias.
            logging.warning(u"下载主题列表分页失败:%i,重试:%i" % (offset, error_count))
            offset -= 1  # retry the same page
            continue
        error_count = 0
        PagePQ = pq(Page_Obj.text)
        TopicList = PagePQ("tbody>tr[class='tr3 t_one tac']>.tal>h3>a").items()
        for i in TopicList:
            # Only internal topic links start with 'htm_data'; skip ads etc.
            if i.attr('href')[0:8] == 'htm_data':
                work_manager.add_task(BasicURL + i.attr('href'), i.text())
    while not work_manager.isEmpty():
        work_manager.loop()  # respawn any dead workers while tasks remain
        time.sleep(1)
    logging.info(u"设置程序关闭标志")
    work_manager.__close__()
    logging.info(u"等待所有线程退出")
    work_manager.waitcomplete()
    sys.exit(0)
197 |
198 |
--------------------------------------------------------------------------------
/dagaier.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/dagaier.zip
--------------------------------------------------------------------------------
/snapshot/snap1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusLkc/1024_dagaier_spider/72b4a4f471573161feff00281900296a25c07319/snapshot/snap1.png
--------------------------------------------------------------------------------