├── .idea
├── MoivesSpider.iml
├── dataSources.xml
├── encodings.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
└── vcs.xml
├── README.md
├── dytt8
├── __init__.py
└── dytt8Moive.py
├── image
├── design.png
└── result.png
├── main.py
├── model
├── RequestModel.py
└── TaskQueue.py
├── test.py
├── thread
├── FloorWorkThread.py
└── TopWorkThread.py
└── utils
└── Utils.py
/.idea/MoivesSpider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/dataSources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | sqlite.xerial
6 | true
7 | org.sqlite.JDBC
8 | jdbc:sqlite:$PROJECT_DIR$/dytt.db
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 1 爬取入口
2 | 电影天堂有 5 个电影栏目,分别为**最新电影**、**日韩电影**、**欧美电影**、**国内电影**、**综合电影**。每个栏目又有一定数量的分页,每个分页有 25 条电影信息。那么程序的入口可以有 5 个 url 地址。这 5 个地址分别对应每个栏目的首页链接。
3 |
4 | # 2 爬取思路
5 | >知道爬取入口,后面的工作就容易多了。我通过测试发现这几个栏目除了页面的 url 地址不一样之外,其他例如提取信息的 xpath 路径是一样的。因此,我把 5 个栏目当做 1 个类,再对该类进行遍历爬取。
6 |
7 | 我这里以“最新电影”为例说明爬取思路。
8 | 1)请求栏目的首页来获取到分页的总数,以及推测出每个分页的 url 地址;
9 | 2)将获取到的分页 url 存放到名为 floorQueue 队列中;
10 | 3)从 floorQueue 中依次取出分页 url,然后利用多线程发起请求;
11 | 4)将获取到的电影页面 url 存入到名为 middleQueue 的队列;
12 | 5)从 middleQueue 中依次取出电影页面 url,再利用多线程发起请求;
13 | 6)将请求结果使用 Xpath 解析并提取所需的电影信息;
14 | 7)将爬取到的电影信息存到名为 contentQueue 队列中;
15 | 8)从 contentQueue 队列中依次取出电影信息,然后存到数据库中。
16 |
17 | # 3 设计爬虫架构
18 | 根据爬取思路,我设计出爬虫架构。如下图所示:
19 | ![爬虫架构设计图](image/design.png)
20 |
21 | # 4 代码分析
22 |
23 | - main 类
24 | 主要工作有两个:第一,实例化出一个`dytt8Moive`对象,然后开始爬取信息。第二,等爬取结束,将数据插入到数据库中。
25 |
26 | - TaskQueue 类
27 | 维护 floorQueue、middleQueue、contentQueue 三个队列的管理类。之所以选择队列的数据结构,是因为爬虫程序需要用到多线程,队列能够保证线程安全。
28 |
29 | - dytt8Moive 类
30 | dytt8Moive 类是本程序的主心骨。程序最初的爬取目标是 5 个电影栏目,但是目前只实现了爬取最新栏目。如果你想爬取全部栏目电影,只需对 dytt8Moive 稍微改造下即可。
31 |
32 | # 5 爬取结果
33 | 我这里展示自己爬取最新栏目中 4000 多条数据中前面部分数据。
34 | ![爬取结果](image/result.png)
--------------------------------------------------------------------------------
/dytt8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/monkey-soft/MoivesSpider/2acb470043d8e8e31ebebdbdfa5dc46277fe4e71/dytt8/__init__.py
--------------------------------------------------------------------------------
/dytt8/dytt8Moive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | '''
5 | @Desc
6 | 主要用来抓取电影天堂(www.dytt8.net)的电影信息(包括电影名、导演、主角、下载地址)
7 | 爬取入口【最新电影】(http://www.dytt8.net/html/gndy/dyzz/index.html)
8 |
9 | @Author monkey
10 | @Date 2017-08-08
11 | '''
12 | import requests
13 | from lxml import etree
14 |
15 | from model.RequestModel import RequestModel
16 |
17 |
class dytt_Lastest(object):
    """Helpers for crawling the "latest movies" column of dytt8.net.

    The class builds the list of listing-page URLs, extracts the detail-page
    links from a listing page, and parses a detail page into a flat dict of
    movie attributes.
    """

    # Entry point of the crawl: the first listing page of the column.
    breakoutUrl = 'http://www.dytt8.net/html/gndy/dyzz/index.html'

    # XPath prefix shared by every query made against a movie detail page.
    _DETAIL_ROOT = "//div[@class='co_content8']/ul/tr/td"

    # Detail pages come in several layouts. Each tuple lists the XPath
    # suffixes (relative to _DETAIL_ROOT) to try, in order, until one matches.
    _CONTENT_PATHS_FIRST = (
        "/div/td/p/text()",
        "/div/div/td/span/text()",      # layout of pages published before 2012
        "/div/td/div/text()",
    )
    _CONTENT_PATHS_LAST = (
        "/div/div/td/p/span/text()",
        "/div/div/td/div/span/text()",
        "/div/div/td/font/text()",
        "/div/div/td/p/text()",
    )
    # The first two matched images are used as poster and screenshot.
    _IMAGE_PATHS = (
        "/div/td/p/img/@src",
        "/div/div/td/img/@src",
        "/div/div/td/p/img/@src",
        "/div/div/td/div/img/@src",
        "/div/td/div/img/@src",
    )
    _FTP_PATHS = (
        "/div/td/table/tbody/tr/td/a/text()",
        "/div/div/td/table/tbody/tr/td/font/a/text()",  # pre-2012 layout
        "/div/div/td/table/tbody/tr/td/a/text()",
        "/div/div/td/div/table/tbody/tr/td/font/a/text()",
        "/div/td/div/table/tbody/tr/td/a/text()",
        "/div/div/td/p/span/a/text()",
        "/div/div/td/div/div/table/tbody/tr/td/font/a/text()",
        "/div/div/td/span/table/tbody/tr/td/font/a/text()",
        "/div/div/td/div/span/div/table/tbody/tr/td/font/a/text()",
    )

    # (line label on the page, key in the result dict). The misspelled keys
    # ('conutry', 'IMDB_socre') are kept because they are part of the
    # returned dict's interface.
    _FIELD_LABELS = (
        ('◎译\u3000\u3000名', 'trans_name'),
        ('◎片\u3000\u3000名', 'name'),
        ('◎年\u3000\u3000代', 'decade'),
        ('◎产\u3000\u3000地', 'conutry'),
        ('◎类\u3000\u3000别', 'level'),
        ('◎语\u3000\u3000言', 'language'),
        ('◎字\u3000\u3000幕', 'subtitles'),
        ('◎上映日期', 'publish'),
        ('◎IMDb评分', 'IMDB_socre'),
        ('◎豆瓣评分', 'douban_score'),
        ('◎文件格式', 'format'),
        ('◎视频尺寸', 'resolution'),
        ('◎文件大小', 'size'),
        ('◎片\u3000\u3000长', 'duration'),
        ('◎导\u3000\u3000演', 'director'),
    )
    _ACTOR_LABEL = '◎主\u3000\u3000演'
    # Lines starting with this indent are treated as additional cast members.
    _ACTOR_CONTINUATION = '\u3000\u3000\u3000\u3000'

    def __init__(self, sum):
        """Store the number of listing pages to crawl.

        :param sum: total number of listing pages (the name shadows the
            builtin but is kept for caller compatibility).
        """
        self.sum = sum

    @classmethod
    def getMaxsize(cls):
        """Fetch the entry page and return the number of listing pages.

        The count is the number of text nodes directly under the
        ``<select name='sldd'>`` element, minus one (per the original
        author's note, the first page is counted twice).
        """
        response = requests.get(cls.breakoutUrl,
                                headers=RequestModel.getHeaders(),
                                proxies=RequestModel.getProxies(),
                                timeout=3)
        # The site serves GBK; without this the text is decoded as mojibake.
        response.encoding = 'GBK'

        selector = etree.HTML(response.text)
        optionList = selector.xpath("//select[@name='sldd']/text()")
        return len(optionList) - 1

    def getPageUrlList(self):
        """Return the URLs of all listing pages, first page included.

        Page 1 is ``index.html``; pages 2..``self.sum`` follow the
        ``list_23_<n>.html`` pattern. Every URL is also printed.
        """
        request_url_prefix = 'http://www.dytt8.net/html/gndy/dyzz/'
        page_urls = [request_url_prefix + 'index.html']
        page_urls.extend(
            request_url_prefix + 'list_23_' + str(i) + '.html'
            for i in range(2, self.sum + 1)
        )

        for page_url in page_urls:
            print('request url is ### ' + page_url + ' ###')
        return page_urls

    @classmethod
    def getMoivePageUrlList(cls, html):
        """Return the detail-page links (``href`` values) found on a listing page.

        :param html: markup of one listing page.
        """
        selector = etree.HTML(html)
        return selector.xpath("//div[@class='co_content8']/ul/td/table/tr/td/b/a/@href")

    @classmethod
    def _firstMatch(cls, selector, paths):
        """Return the result of the first XPath suffix in *paths* that matches, else []."""
        for path in paths:
            found = selector.xpath(cls._DETAIL_ROOT + path)
            if found:
                return found
        return []

    @classmethod
    def _extractContent(cls, selector):
        """Return the text lines of the info block, trying every known layout."""
        content = cls._firstMatch(selector, cls._CONTENT_PATHS_FIRST)
        if not content:
            content = selector.xpath(cls._DETAIL_ROOT + "/div/div/td/p/font/text()")
            if len(content) < 5:
                # Fewer than five lines: fall back to the shallower <font>
                # layout instead.
                content = selector.xpath(cls._DETAIL_ROOT + "/p/font/text()")
        if not content:
            content = cls._firstMatch(selector, cls._CONTENT_PATHS_LAST)
        return content

    @staticmethod
    def _valueAfter(line, label):
        """Return the text following *label* in *line*, without the separator.

        The separator between label and value varies in width (one
        ideographic space, or a no-break space plus a space), so leading
        whitespace is stripped rather than skipped with a fixed offset.
        """
        return line[len(label):].lstrip()

    @classmethod
    def _parseFields(cls, content, contentDir):
        """Fill *contentDir* in place from the info-block text lines."""
        # A first line that is not a "◎" field is the page's title line.
        if content and not content[0].startswith('◎'):
            contentDir['type'] = '[' + content[0]

        actor = ''
        for line in content:
            if line.startswith(cls._ACTOR_LABEL):
                actor = cls._valueAfter(line, cls._ACTOR_LABEL)
                continue
            for label, key in cls._FIELD_LABELS:
                if line.startswith(label):
                    contentDir[key] = cls._valueAfter(line, label)
                    break

        # Additional cast members are listed one per line, indented with
        # ideographic spaces instead of repeating the label.
        more_actors = [line.lstrip() for line in content
                       if line.startswith(cls._ACTOR_CONTINUATION)]
        contentDir['actors'] = '\n'.join([actor] + more_actors)

    @classmethod
    def getMoiveInforms(cls, url, html):
        """Parse one movie detail page into a dict of string attributes.

        The info block of a page consists of lines such as
        ``◎片\\u3000\\u3000名\\u3000John Wick: Chapter Two`` (label, separator,
        value). Recognised labels are mapped to the keys of the returned
        dict; the synopsis and awards sections are not extracted.

        Anything that cannot be found on the page (field, poster,
        screenshot, download link) is left as an empty string instead of
        raising ``IndexError``.

        :param url: address of the detail page, stored under ``dytt8_url``.
        :param html: markup of the detail page.
        :return: dict with the keys initialised below; it is also printed.
        """
        contentDir = {
            'type': '',
            'trans_name': '',
            'name': '',
            'decade': '',
            'conutry': '',
            'level': '',
            'language': '',
            'subtitles': '',
            'publish': '',
            'IMDB_socre': '',
            'douban_score': '',
            'format': '',
            'resolution': '',
            'size': '',
            'duration': '',
            'director': '',
            'actors': '',
            'placard': '',
            'screenshot': '',
            'ftpurl': '',
            'dytt8_url': ''
        }

        selector = etree.HTML(html)

        cls._parseFields(cls._extractContent(selector), contentDir)

        # The first image is the poster, the second a screenshot of the film.
        imgs = cls._firstMatch(selector, cls._IMAGE_PATHS)
        if len(imgs) > 0 and imgs[0] is not None:
            contentDir['placard'] = imgs[0]
        if len(imgs) > 1 and imgs[1] is not None:
            contentDir['screenshot'] = imgs[1]

        # Download address: only the first link is kept.
        ftp = cls._firstMatch(selector, cls._FTP_PATHS)
        if ftp:
            contentDir['ftpurl'] = ftp[0]

        contentDir['dytt8_url'] = url
        print(contentDir)
        return contentDir
--------------------------------------------------------------------------------
/image/design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/monkey-soft/MoivesSpider/2acb470043d8e8e31ebebdbdfa5dc46277fe4e71/image/design.png
--------------------------------------------------------------------------------
/image/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/monkey-soft/MoivesSpider/2acb470043d8e8e31ebebdbdfa5dc46277fe4e71/image/result.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 |
5 | import sqlite3
6 |
7 | from dytt8.dytt8Moive import dytt_Lastest
8 | from model.TaskQueue import TaskQueue
9 | from thread.FloorWorkThread import FloorWorkThread
10 | from thread.TopWorkThread import TopWorkThread
11 | from utils.Utils import Utils
12 |
13 | '''
14 | 程序主入口
15 | @Author monkey
16 | @Date 2017-08-08
17 | '''
18 |
# As of 2017-08-08 the "latest movies" column had only 164 listing pages.
# startSpider() obtains the page count itself via dytt_Lastest.getMaxsize()
# and does not read this constant.
LASTEST_MOIVE_TOTAL_SUM = 6 #164

# Number of network worker threads started per crawl stage. Do not set this
# too high, otherwise many requests come back with HTTP 400.
THREAD_SUM = 5
24 |
25 |
def startSpider():
    """Run the whole crawl, then persist the results.

    Steps:
      1. Ask the site how many listing pages exist and queue their URLs.
      2. Start THREAD_SUM FloorWorkThread workers on the floor queue and
         wait until that queue is drained.
      3. Start THREAD_SUM TopWorkThread workers on the middle queue and
         wait until that queue is drained.
      4. Call insertData() to write the collected records to SQLite.
    """
    # Local import so this function does not depend on the file's import block.
    import time

    # Seconds to sleep between queue checks; avoids spinning a CPU core
    # while the worker threads are busy.
    poll_interval = 0.1

    # Number of listing pages in the "latest movies" column.
    page_total = dytt_Lastest.getMaxsize()
    print('【最新电影】一共 ' + str(page_total) + ' 有个页面')
    dyttlastest = dytt_Lastest(page_total)
    floorlist = dyttlastest.getPageUrlList()

    floorQueue = TaskQueue.getFloorQueue()
    for item in floorlist:
        # Queue.put's second positional argument is `block`, not `timeout`,
        # so the timeout has to be passed by keyword.
        floorQueue.put(item, timeout=3)

    for i in range(THREAD_SUM):
        workthread = FloorWorkThread(floorQueue, i)
        workthread.start()

    # NOTE(review): an empty queue only shows that every URL has been taken
    # by a worker, not that the workers have finished processing it. Confirm
    # whether the worker threads should be joined here instead.
    while not TaskQueue.isFloorQueueEmpty():
        time.sleep(poll_interval)

    for i in range(THREAD_SUM):
        workthread = TopWorkThread(TaskQueue.getMiddleQueue(), i)
        workthread.start()

    while not TaskQueue.isMiddleQueueEmpty():
        time.sleep(poll_interval)

    insertData()
63 |
64 |
def insertData():
    """Drain the content queue into the ``lastest_moive`` table of ./dytt.db.

    The table is created on first use. Every record taken from
    TaskQueue's content queue is converted with Utils.dirToList() and
    inserted with a parameterized statement; each insert is committed
    immediately. The connection is closed even if a statement fails.
    """
    DBName = 'dytt.db'

    # String literals use single quotes: double-quoted literals are only
    # accepted by SQLite as a compatibility quirk.
    SelectSql = "Select * from sqlite_master where type = 'table' and name = 'lastest_moive';"
    CreateTableSql = '''
        Create Table lastest_moive (
            'm_id' INTEGER PRIMARY KEY,
            'm_type' varchar(100),
            'm_trans_name' varchar(200),
            'm_name' varchar(100),
            'm_decade' varchar(30),
            'm_conutry' varchar(30),
            'm_level' varchar(100),
            'm_language' varchar(30),
            'm_subtitles' varchar(100),
            'm_publish' varchar(30),
            'm_IMDB_socre' varchar(50),
            'm_douban_score' varchar(50),
            'm_format' varchar(20),
            'm_resolution' varchar(20),
            'm_size' varchar(10),
            'm_duration' varchar(10),
            'm_director' varchar(50),
            'm_actors' varchar(1000),
            'm_placard' varchar(200),
            'm_screenshot' varchar(200),
            'm_ftpurl' varchar(200),
            'm_dytt8_url' varchar(200)
        );
    '''

    InsertSql = '''
        Insert into lastest_moive(m_type, m_trans_name, m_name, m_decade, m_conutry, m_level, m_language, m_subtitles, m_publish, m_IMDB_socre,
        m_douban_score, m_format, m_resolution, m_size, m_duration, m_director, m_actors, m_placard, m_screenshot, m_ftpurl,
        m_dytt8_url)
        values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
    '''

    # Wait up to 10 seconds for a lock held by another connection.
    db = sqlite3.connect('./' + DBName, timeout=10)
    try:
        cursor = db.cursor()

        if not cursor.execute(SelectSql).fetchone():
            cursor.execute(CreateTableSql)
            db.commit()
            print('==== 创建表成功 ====')
        else:
            print('==== 创建表失败, 表已经存在 ====')

        count = 1
        while not TaskQueue.isContentQueueEmpty():
            item = TaskQueue.getContentQueue().get()
            cursor.execute(InsertSql, Utils.dirToList(item))
            # Commit per row so rows already written survive a later failure.
            db.commit()
            print('插入第 ' + str(count) + ' 条数据成功')
            count = count + 1

        db.commit()
    finally:
        db.close()
123 |
124 |
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    startSpider()
--------------------------------------------------------------------------------
/model/RequestModel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 | import random
4 |
5 |
class RequestModel(object):
    """Factory for the HTTP request settings (headers, proxies) used by the crawler."""

    # Pool of browser User-Agent strings; one is picked at random per request.
    UserAgent_List = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00"
    ]

    # Known proxy endpoints. getProxies() does not use this list.
    Proxy_Pool = [
        'web-proxy.oa.com:8080',
    ]

    @classmethod
    def getHeaders(cls, host='www.dytt8.net'):
        """Return a fresh header dict with a randomly chosen User-Agent.

        :param host: value of the ``Host`` header. Defaults to the site the
            crawler targets; pass another host name when requesting a page
            served from a different domain.
        """
        headers = {
            'User-Agent': random.choice(cls.UserAgent_List),
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': host,
        }
        return headers

    @classmethod
    def getProxies(cls):
        """Return the proxy mapping to pass to ``requests``.

        Always an empty dict, i.e. requests are sent without a proxy.
        """
        proxies = {}
        return proxies
71 |
--------------------------------------------------------------------------------
/model/TaskQueue.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | '''
5 | @Desc
6 | 维护三个队列:
7 | FloorQueue 存放一级目录 url 链接的队列
8 | MiddleQueue 存放二级目录 url 链接的队列
9 | ContentQueue 存放获取电影信息(名称、导演、主角、下载地址等)的队列, 方便后续持久化
10 |
11 | 存放未爬取 url 的队列
12 | 存放
13 | @Author monkey
14 | @Date 2017-08-11
15 | '''
16 | from queue import Queue
17 |
18 |
class TaskQueue(object):
    """Process-wide holder of the crawler's three work queues.

    The queues are class attributes, so every caller shares the same three
    ``queue.Queue`` objects; all access goes through classmethods.
    """

    floorQueue = Queue()     # first-level (listing page) URLs
    middleQueue = Queue()    # second-level (movie detail page) URLs
    contentQueue = Queue()   # parsed movie records waiting to be persisted

    def __init__(self):
        """Nothing to initialise: all state lives on the class."""

    # ---- floor queue -----------------------------------------------------
    @classmethod
    def getFloorQueue(cls):
        """Return the shared floor queue."""
        return cls.floorQueue

    @classmethod
    def putToFloorQueue(cls, item):
        """Append *item* to the floor queue."""
        cls.getFloorQueue().put(item)

    @classmethod
    def isFloorQueueEmpty(cls):
        """Return True if the floor queue is empty (not reliable across threads)."""
        return cls.getFloorQueue().empty()

    # ---- middle queue ----------------------------------------------------
    @classmethod
    def getMiddleQueue(cls):
        """Return the shared middle queue."""
        return cls.middleQueue

    @classmethod
    def putToMiddleQueue(cls, item):
        """Append *item* to the middle queue."""
        cls.getMiddleQueue().put(item)

    @classmethod
    def isMiddleQueueEmpty(cls):
        """Return True if the middle queue is empty (not reliable across threads)."""
        return cls.getMiddleQueue().empty()

    # ---- content queue ---------------------------------------------------
    @classmethod
    def getContentQueue(cls):
        """Return the shared content queue."""
        return cls.contentQueue

    @classmethod
    def putToContentQueue(cls, item):
        """Append *item* to the content queue."""
        cls.getContentQueue().put(item)

    @classmethod
    def isContentQueueEmpty(cls):
        """Return True if the content queue is empty (not reliable across threads)."""
        return cls.getContentQueue().empty()
66 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 |
4 | from model.RequestModel import RequestModel
5 |
6 | '''
7 | @Desc
8 | 测试各个功能的脚本
9 |
10 | @Author monkey
11 | @Date 2017-08-08
12 | '''
13 |
14 | # 测试案例1
15 | # url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_2.html'
16 | # html = requests.get(url, headers=RequestModel.getHeaders(), proxies=RequestModel.getProxies())
17 | # print(html.status_code)
18 |
19 | #测试案例2
20 | # temp1 = {'director': 'S·S·拉贾穆里 S.S. Rajamouli', 'language': '泰卢固语/泰米尔语/印地语/马拉雅拉姆语', 'resolution': '1280 x 720', 'type': '[巴霍巴利王(下):终结][BD-mkv.720p.中英双字][2017年动作战争]', 'trans_name': '巴霍巴利王(下):终结/巴霍巴利王:磅礴终章(台)/巴霍巴利王(下)/巴霍巴利王2/巴霍巴利王:结局', 'publish': '2017-04-28(印度)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '167分钟', 'level': '剧情/动作/战争/奇幻/冒险', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/8274_59872EB90.jpg', 'format': 'x264 + AAC', 'douban_score': '7.1/10 from 2,496 users', 'name': 'Baahubali: The Conclusion', 'actors': '\u3000帕拉巴斯 Prabhas\n拉纳·达格巴提 Rana Daggubati\n安努舒卡·谢蒂 Anushka Shetty\n特曼娜·芭蒂亚 Tamannaah Bhatia\n萨伯拉杰 Subbaraju\n拉姆亚·克里希南 Ramya Krishnan\n纳赛尔 Nasser\n挲塞亚拉杰 Satyaraj', 'subtitles': '中英双字幕', 'IMDB_socre': '8.7/10 from 45,729 users', 'conutry': '印度', 'placard': 'https://public.lightpic.info/image/AEC1_597FDBD70.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
21 | # temp2 = {'director': '迈克尔·法斯宾德 Michael Fassbender', 'language': '英语', 'resolution': '1CD', 'type': '[异形:契约][BD-mkv.720p.中英双字][2017年科幻惊悚恐怖]', 'trans_name': '异形:契约/异形:圣约(港/台)/神奇异形在哪里(豆友译名)/已开:大勺(豆友译名)/异形:失乐园/普罗米修斯2', 'publish': '2017-05-10(法国)/2017-05-19(美国)/2017-06-16(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '雷德利·斯科特 Ridley Scott', 'level': '科幻/惊悚/恐怖', 'size': '122分钟', 'screenshot': 'https://public.lightpic.info/image/0E0B_5981E9240.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': 'Alien: Covenant', 'actors': '\u3000凯瑟琳·沃特斯顿 Katherine Waterston\n凯瑟琳·沃特斯顿 Katherine Waterston\n比利·克鲁德普 Billy Crudup\n丹尼·麦克布耐德 Danny McBride\n德米安·比齐尔 Demián Bichir\n卡门·艾乔戈 Carmen Ejogo\n朱西·斯莫利特 Jussie Smollett\n考莉·赫尔南德斯 Callie Hernandez\n艾米·西米茨 Amy Seimetz\n纳撒尼尔·迪安 Nathaniel Dean\n亚历山大·英格兰 Alexander England\n本杰明·里格比 Benjamin Rigby\n乌利·拉图基孚 Uli Latukefu\n泰丝·哈乌布里奇 Tess Haubrich\n罗蕾莱·金 Lorelei King\n哈维尔·博特 Javier Botet\n詹姆斯·弗兰科 James Franco\n盖·皮尔斯 Guy Pearce\n劳米·拉佩斯 Noomi Rapace', 'subtitles': '中英双字幕', 'IMDB_socre': '6.7/10 from 91,839 users', 'conutry': '美国', 'placard': 'https://public.lightpic.info/image/D48F_595B69A80.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
22 | # temp3 = {'director': '尼古拉·科斯特-瓦尔道 Nikolaj Coster-Waldau', 'language': '英语', 'resolution': '1CD', 'type': '[一锤定音][BD-mkv.720p.中英双字][2017年惊悚动作]', 'trans_name': '一锤定音', 'publish': '2017-06-17(洛杉矶电影节)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2016', 'duration': '里克·罗曼·沃夫 Ric Roman Waugh', 'level': '剧情/动作/犯罪/惊悚', 'size': '121分钟', 'screenshot': 'https://public.lightpic.info/image/E9DA_597FDBD70.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': 'Shot Caller', 'actors': '\u3000乔·博恩瑟 Jon Bernthal\n乔·博恩瑟 Jon Bernthal\n蕾克·贝尔 Lake Bell\n欧玛瑞·哈德威克 Omari Hardwick\n杰弗里·多诺万 Jeffrey Donovan\n洁西·斯克拉姆 Jessy Schram\n本杰明·布拉特 Benjamin Bratt\n伊万·琼斯 Evan Jones\n马特·杰拉德 Matt Gerald\n迈克尔·兰德斯 Michael Landes\n艾莫里·科恩 Emory Cohen\nChris Browning\n基思·雅各 Keith Jardine', 'subtitles': '中英双字幕', 'IMDB_socre': '7.5/10 from 4,418 users', 'conutry': '美国', 'placard': 'https://public.lightpic.info/image/5D0C_597FDBD70.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
23 | # temp4 = {'director': 'Raul Merida ....Teniente Conte', 'language': '西班牙语', 'resolution': '93分钟', 'type': '[敌对区域][BD-mkv.720p.中英双字][2017年剧情战争] ', 'trans_name': '敌对区域', 'publish': '分\xa0 6.8/10 from 289 users', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': 'Ariadna Gil ....Capitan Varela', 'level': '剧情/战争', 'size': 'Adolfo Martinez Perez', 'screenshot': 'https://public.lightpic.info/image/65CC_598093670.jpg', 'format': '1CD', 'douban_score': '1280 x 720', 'name': 'Zona hostil / rescue under fire', 'actors': '\u3000Roberto alamo ....Capitan Torres\nRaul Merida ....Teniente Conte\nRoberto alamo ....Capitan Torres\nAntonio Garrido ....Comandante Ledesma\nIngrid Garcia Jonsson ....Cabo Sanchez\nJacobo Dicenta ....Sargento 1o Aguilar\nIsmael Martinez ....Cabo 1o Carranza\nNasser Saleh ....Alferez Abda\nMariam Hernandez ....Sargento Castro\nRuth Gabriel ....Brigada Alvite\nYounes Bachir ....Soldado Rashid\nDavid de la Torre ....Soldado Vazquez\nJavier Bodalo ....Cabo Angulo\nBerta Hernandez ....Cabo Sobrino\nSergio Momo ....Soldado Hunt\nLeander Vyvey ....Soldado Norris\nAdolfo Martin Vela ....Teniente Vilches\nRichard Calderon ....Soldado Pe?a\nJose Luis Casado ....Brigada Rodriguez\nAntonio Cifo ....General Zarate\nVicente Ayala ....Coronel Bermudez\nAngelo Olivier ....Teniente Coronel Bravo\nJorge Fuentes ....Capitan Machado\nMaykol Hernandez ....Soldado Operador Control\nPedro Vassallo ....Capitan Lazaro\nLeo Rivera ....Capitan Legion', 'subtitles': '中英双字幕', 'IMDB_socre': '4 + AAC', 'conutry': '西班牙', 'placard': 'https://public.lightpic.info/image/7B2E_598093660.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
24 | # temp5 = {'director': '庄鹃瑛 Ball Chuang', 'language': '普通话', 'resolution': '1CD', 'type': '[52赫兹,我爱你][BD-mkv.720p.国语中字][2017年爱情音乐]', 'trans_name': '52 Hz, I Love You', 'publish': '2017-01-26(台湾)/2017-06-16(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '魏德圣 Te-Sheng Wei', 'level': '爱情/音乐', 'size': '109分钟', 'screenshot': 'https://public.lightpic.info/image/1D43_5981E9240.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': '52赫兹,我爱你/52Hz我爱你/52赫茲,我愛你', 'actors': '\u3000林忠谕 Lin Chungyu\n林忠谕 Lin Chungyu\n姜圣民 Shengmin Jiang\n陈玫希 Mify Chen\n赵咏华 Cyndi Chaw\n林庆台 Ching-tai Lin\n张榕容 Sandrine Pinna\n李千娜 Gina Li\n马如龙 Ju-Lung Ma\n沛小岚 Hsiao-Lan Pei\n范逸臣 Van Fan\n田中千绘 Chie Tanaka\n马念先 Nien-Hsien Ma\n应蔚民 Wei-min Ying\n民雄 Min-Hsiung\n林晓培 Shino Lin\n安乙荞 Joanne Yang', 'subtitles': '中文', 'IMDB_socre': '6.8/10 from 190 users', 'conutry': '中国台湾', 'placard': 'https://public.lightpic.info/image/4353_5981E9240.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
25 | # temp6 = {'director': '杨幂 Mi Yang', 'language': '普通话', 'resolution': '1CD', 'type': '[逆时营救][HD-mkv.720p.国语中字][2017年杨幂霍建华动作科幻]', 'trans_name': 'Fatal Countdown: Reset / Reset', 'publish': '2017-06-19(上海电影节)/2017-06-29(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '尹鸿承 Chang', 'level': '动作/科幻', 'size': '106分钟', 'screenshot': 'https://public.lightpic.info/image/A8FF_597F29320.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': '逆时营救/致命倒数', 'actors': '\u3000霍建华 Wallace Huo\n霍建华 Wallace Huo\n金士杰 King Shih-Chieh\n刘畅 Chang Liu\n张艺瀚\xa0 Hummer\n王俐丹 Lidan Wang', 'subtitles': '中文', 'IMDB_socre': '6.1/10 from 76 users', 'conutry': '中国', 'placard': 'https://public.lightpic.info/image/B3CC_597F29310.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
26 | # temp7 = {'director': '程伟豪 Wei-hao Cheng', 'language': '普通话', 'resolution': '1280 x 720', 'type': '[目击者之追凶][HD-mkv.720p.国语中字][2017年悬疑惊悚]', 'trans_name': '目击者之追凶/目擊者', 'publish': '2017-03-31(中国台湾)/2017-06-21(上海电影节)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '113分钟', 'level': '犯罪/悬疑/惊悚', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/E256_597B4D060.jpg', 'format': 'x264 + AAC', 'douban_score': '7.4/10 from 228 users', 'name': 'Who Killed Cock Robin', 'actors': '\u3000许玮甯 Tiffany Hsu\n柯佳嬿 Alice Ko\n庄凯勋 Cash Chuang\n李铭顺 Christopher Lee Ming Shun\n李淳 Mason Lee\nIan Chen\n郑志伟 Chih-Wei Cheng\n汤志伟 Chih Wei Tang\nMario Pu', 'subtitles': '中文', 'IMDB_socre': '7.5/10 from 217 users', 'conutry': '中国台湾', 'placard': 'https://public.lightpic.info/image/82EB_597B4D060.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
27 | # temp8 = {'director': '徐浩峰 Haofeng Xu', 'language': '普通话', 'resolution': '1280 x 720', 'type': '[师父/师傅][BD-mkv.720p.国语中字][2015年高分获奖动作]', 'trans_name': '师父/师傅', 'publish': '2015-11-11(台北金马影展)/2015-12-10(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2015', 'duration': '109分钟', 'level': '剧情/动作/武侠', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/4AA4_597D9E790.jpg', 'format': 'x264 + AAC', 'douban_score': '8.1/10 from 138,157 users', 'name': 'The Final Master / The Master', 'actors': '\u3000廖凡 Fan Liao\n宋佳 Jia Song\n蒋雯丽 Wenli Jiang\n金士杰 Shi-Jye Jin\n宋洋 Yang Song\n黄觉 Jue Huang\n麦迪娜 Vicky\n张傲月 Aoyue Zhang\n马君 Jun Ma\n陈观泰 Kuan Tai Chen\n熊欣欣 Xinxin Xiong\n戴立忍 Leon Dai\n裘继戎 Jirong Qiu\n李博 Bo Li', 'subtitles': '中文', 'IMDB_socre': '7.2/10 from 1,035 users', 'conutry': '中国', 'placard': 'https://public.lightpic.info/image/6C79_597D9FFC0.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
28 | # temp9 = {'director': '刘紫微 Ziwei Liu', 'language': '剧情', 'resolution': '1280 x 720', 'type': '[我心雀跃][HD-mkv.720p.国语中字][2017年获奖剧情]', 'trans_name': '', 'publish': '中文', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '我心雀跃', 'duration': '95分钟', 'level': '中国', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/15AB_597B4D060.jpg', 'format': 'x264 + AAC', 'douban_score': '6.7/10 from 1,811 users', 'name': 'My Heart Leaps Up', 'actors': '\u3000孙伊涵 Yihan Sun\n宋宁 Ning Song\n周楚楚 Chu-chu Zhou\n杜双宇 Shuangyu Du\n刘锐 Kobe Liu\n池韵 Yun Chi\n刘北妍 Beiyan Liu\n任运杰 Yunjie Ren\n修健 Jian Xiu', 'subtitles': '普通话', 'IMDB_socre': '6-06-14(上海国际电影节)/2017-06-09(中国)', 'conutry': '2016', 'placard': 'https://public.lightpic.info/image/C24D_597B4D060.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
29 | # temp10 = {'director': '查理·汉纳姆 Charlie Hunnam ', 'language': '动作/奇幻/冒险 ', 'resolution': '1CD', 'type': '[亚瑟王:斗兽争霸][BD-mkv.720p.中英双字][2017年奇幻动作] ', 'trans_name': '', 'publish': '中英双字幕 ', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': 'King Arthur: Legend of the Sword ', 'duration': '盖·里奇 Guy Ritchie ', 'level': '美国 ', 'size': '126分钟 ', 'screenshot': 'https://public.lightpic.info/image/68F0_5979DCC50.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': '亚瑟王:斗兽争霸/亚瑟王:圣剑传奇/亚瑟:王者之剑(台)/神剑亚瑟王(港)/亚瑟王:圆桌骑士/亚瑟王:剑之传奇/亚瑟王:石中剑传说/新亚瑟王/圆桌骑士 ', 'actors': '\u3000裘德·洛 Jude Law \n裘德·洛 Jude Law \n阿斯特丽德·伯格斯-弗瑞斯贝 àstrid Bergès-Frisbey \n米卡埃尔·佩斯布兰特 Mikael Persbrandt \n杰曼·翰苏 Djimon Hounsou \n安娜贝拉·沃丽丝 Annabelle Wallis \n艾瑞克·巴纳 Eric Bana \n艾丹·吉伦 Aidan Gillen \n尼尔·马斯克尔 Neil Maskell \n赫敏·科菲尔德 Hermione Corfield \n凯蒂·麦克格拉思 Katie McGrath \n杰奎·安斯蕾 Jacqui Ainsley \n弗莱迪·福克斯 Freddie Fox \n波比·迪瓦伊 Poppy Delevingne \n朱利安·西格尔 Julian Seager \n大卫·贝克汉姆 David Beckham \n杰夫·贝尔 Geoff Bell \n米莉·布拉迪 Millie Brady \n乔治娜·坎贝尔 Georgina Campbell \n丹尼尔·斯蒂森 Daniel Stisen \n伊琳·珀威尔 Eline Powell \n迈克尔·麦克埃尔哈顿 Michael McElhatton \n阿德里安·布薛特 Adrian Bouchet \n彼得·费迪南多 Peter Ferdinando \n汤姆·吴 Tom Wu \n金斯利·本-阿德 Kingsley Ben-Adir \n盖·里奇 Guy Ritchie ', 'subtitles': '英语 ', 'IMDB_socre': '.2/10 from 50,624 users ', 'conutry': '2017 ', 'placard': 'https://public.lightpic.info/image/FA4F_595920430.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
30 | # temp11 = {'director': '张孝全 Hsiao-chuan Chang', 'language': '普通话', 'resolution': '100分钟', 'type': '[指甲刀人魔][BD-mkv.720p.国语中字][2017年周冬雨张孝全爱情喜剧]', 'trans_name': '指甲刀人魔', 'publish': '2017-04-11(北京电影节)/2017-04-14(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '周冬雨 Dongyu Zhou', 'level': '喜剧/爱情', 'size': '关智耀 Jason Kwan Chi-Yiu', 'screenshot': 'https://public.lightpic.info/image/B137_596B4F690.jpg', 'format': '1CD', 'douban_score': '1280 x 720', 'name': 'A Nail Clipper Romance', 'actors': '\u3000纳豆 Na Dow\n张孝全 Hsiao-chuan Chang\n纳豆 Na Dow\n林辰唏 Zaizai Lin\n蔡洁 Jacky Cai\n盛朗熙 Joy Sheng\n郑伊健 Ekin Cheng\n谢依霖 Yilin Sie\n许玮甯 Tiffany Hsu', 'subtitles': '中文', 'IMDB_socre': '4 + aac', 'conutry': '中国/中国香港', 'placard': 'https://public.lightpic.info/image/8E19_590060000.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
31 | # temp12 = {'director': '新海诚 Makoto Shinkai', 'language': '日语', 'resolution': '1280 x 720', 'type': '[你的名字。][HD-mkv.720p.日语中字][2016年高分获奖动画]', 'trans_name': '你的名字。/你的名字/君之名', 'publish': '2016-08-26(日本)/2016-12-02(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2016', 'duration': '106分钟', 'level': '剧情/爱情/动画', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/9035_5977ECF20.jpg', 'format': 'x264 + AAC', 'douban_score': '8.5/10 from 367,630 users', 'name': '君の名は。/Your Name', 'actors': '\u3000神木隆之介 Ry?nosuke Kamiki\n上白石萌音 Mone Kamishiraishi\n长泽雅美 Masami Nagasawa\n市原悦子 Etsuko Ichihara\n成田凌 Ryo Narita\n悠木碧 Aoi Yuki\n岛崎信长 Nobunaga Shimazaki\n石川界人 Kaito Ishikawa\n谷花音 Tani Kanon', 'subtitles': '中文', 'IMDB_socre': '8.5/10 from 41,821 users', 'conutry': '日本', 'placard': 'https://public.lightpic.info/image/3277_5977ECF10.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
32 | # temp13 = {'director': '乔什·哈奈特 Josh Hartnett', 'language': '英语', 'resolution': '1CD', 'type': '[奥斯曼中尉][BD-mkv.720p.中英双字][2017年高分剧情战争]', 'trans_name': '奥斯曼中尉', 'publish': '2017-03-10(美国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '约瑟夫·鲁本 Joseph Ruben', 'level': '剧情/战争', 'size': '106分钟', 'screenshot': 'https://public.lightpic.info/image/0C30_5977ECF20.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': 'The Ottoman Lieutenant', 'actors': '\u3000米契尔·哈思曼 Michiel Huisman\n米契尔·哈思曼 Michiel Huisman\n哈鲁克·比尔根纳尔 Haluk Bilginer\n塞尔柱克.约奈坛 Selcuk Yontem\n海拉·西尔玛 Hera Hilmar', 'subtitles': '中英双字幕', 'IMDB_socre': '8.4/10 from 13,045 users', 'conutry': '土耳其/美国', 'placard': 'https://public.lightpic.info/image/B4D3_59769F3D0.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
33 | # temp14 = {'director': '王泊文 Bowen Wang', 'language': '普通话', 'resolution': '1CD', 'type': '[无罪之城][HD-mkv.720p.国语中字][2017年动作]', 'trans_name': '无罪之城', 'publish': '2017-07-22(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '尹晨阳', 'level': '动作/犯罪', 'size': '70分钟', 'screenshot': 'https://public.lightpic.info/image/A739_59774A5B0.jpg', 'format': '1280 x 720', 'douban_score': 'x264 + AAC', 'name': 'The Innocent City', 'actors': '\u3000谭盐盐\n谭盐盐\n赵怀良\n郭震 Zhen Guo', 'subtitles': '中文', 'IMDB_socre': '/10 from 178 users', 'conutry': '中国', 'placard': 'https://public.lightpic.info/image/77F9_59774B090.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
34 | # temp15 = {'director': '陈正道 Leste Chen', 'language': '普通话', 'resolution': '1280 x 720', 'type': '[记忆大师][HD-mkv.720p.国语中字][2017年黄渤徐静蕾悬疑惊悚]', 'trans_name': '记忆大师/记忆战/催眠大师2', 'publish': '2017-04-23(北京电影节)/2017-04-28(中国)', 'ftpurl': 'ftp://ygdy8:ygdy8@yg72.dydytt.net:8035/[阳光电影www.ygdy8.com].英雄联盟.HD.720p.英语中字.mkv', 'decade': '2017', 'duration': '119分钟', 'level': '剧情/犯罪/悬疑/惊悚', 'size': '1CD', 'screenshot': 'https://public.lightpic.info/image/9FC6_59774A5B0.jpg', 'format': 'x264 + AAC', 'douban_score': '7.3/10 from 89,671 users', 'name': 'Battle of Memories', 'actors': '\u3000黄渤 Bo Huang\n徐静蕾 Jinglei Xu\n段奕宏 Yihong Duan\n杨子姗 Zishan Yang\n许玮宁 Tiffany Hsu\n梁杰理\n王真儿 Zhen Wang\n杜函梦 Hanmeng Du', 'subtitles': '中文', 'IMDB_socre': '6.9/10 from 206 users', 'conutry': '中国', 'placard': 'https://public.lightpic.info/image/C8C0_59774B090.jpg', 'dytt8_url':'http://www.dytt8.net/html/gndy/dyzz/20170223/53313.html'}
35 | #
36 | #
37 | #
38 | # TaskQueue.getContentQueue().put(temp1)
39 | # TaskQueue.getContentQueue().put(temp2)
40 | # TaskQueue.getContentQueue().put(temp3)
41 | # TaskQueue.getContentQueue().put(temp4)
42 | # TaskQueue.getContentQueue().put(temp5)
43 | # TaskQueue.getContentQueue().put(temp6)
44 | # TaskQueue.getContentQueue().put(temp7)
45 | # TaskQueue.getContentQueue().put(temp8)
46 | # TaskQueue.getContentQueue().put(temp9)
47 | # TaskQueue.getContentQueue().put(temp10)
48 | # TaskQueue.getContentQueue().put(temp11)
49 | # TaskQueue.getContentQueue().put(temp12)
50 | # TaskQueue.getContentQueue().put(temp13)
51 | # TaskQueue.getContentQueue().put(temp14)
52 | # TaskQueue.getContentQueue().put(temp15)
53 | #
54 | # DBName = 'dytt.db'
55 | # db = sqlite3.connect('./'+DBName, 10)
56 | # conn = db.cursor()
57 | #
58 | # SelectSql = 'Select * from sqlite_master where type = "table" and name="lastest_moive";'
59 | # CreateTableSql = '''
60 | # Create Table lastest_moive (
61 | # 'm_id' INTEGER PRIMARY KEY,
62 | # 'm_type' varchar(100),
63 | # 'm_trans_name' varchar(200),
64 | # 'm_name' varchar(100),
65 | # 'm_decade' varchar(30),
66 | # 'm_conutry' varchar(30),
67 | # 'm_level' varchar(100),
68 | # 'm_language' varchar(30),
69 | # 'm_subtitles' varchar(100),
70 | # 'm_publish' varchar(30),
71 | # 'm_IMDB_socre' varchar(50),
72 | # 'm_douban_score' varchar(50),
73 | # 'm_format' varchar(20),
74 | # 'm_resolution' varchar(20),
75 | # 'm_size' varchar(10),
76 | # 'm_duration' varchar(10),
77 | # 'm_director' varchar(50),
78 | # 'm_actors' varchar(1000),
79 | # 'm_placard' varchar(200),
80 | # 'm_screenshot' varchar(200),
81 | # 'm_ftpurl' varchar(200),
82 | # 'm_dytt8_url' varchar(200)
83 | # );
84 | # '''
85 | #
86 | # InsertSql = '''
87 | # Insert into lastest_moive(m_type, m_trans_name, m_name, m_decade, m_conutry, m_level, m_language, m_subtitles, m_publish, m_IMDB_socre,
88 | # m_douban_score, m_format, m_resolution, m_size, m_duration, m_director, m_actors, m_placard, m_screenshot, m_ftpurl,
89 | # m_dytt8_url)
90 | # values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
91 | # '''
92 | # # 'm_id' int auto_increment,
93 | # if not conn.execute(SelectSql).fetchone():
94 | # conn.execute(CreateTableSql)
95 | # db.commit()
96 | # print('==== 创建表成功 ====')
97 | # else :
98 | # print('==== 创建表失败, 表已经存在 ====')
99 | #
100 | # count = 1
101 | #
102 | # while not TaskQueue.isContentQueueEmpty():
103 | # item = TaskQueue.getContentQueue().get()
104 | # conn.execute(InsertSql, Utils.dirToList(item))
105 | # db.commit()
106 | # print('插入第 ' + str(count) + ' 条数据成功')
107 | # count = count + 1
108 | #
109 | # db.commit()
110 | # db.close()
111 |
112 |
113 | # #测试案例3
114 | # DBName = 'dytt.db'
115 | # db = sqlite3.connect('./'+DBName, 10)
116 | # conn = db.cursor()
117 | #
118 | # curser = conn.execute("SELECT m_actors FROM lastest_moive WHERE m_name = 'Baahubali: The Conclusion';")
119 | # print(curser.fetchone())
120 |
121 |
122 | # 测试案例4
123 | # '''
124 | # 有点坑~ 估计是网站改版过。导致线上存在好几个不同的类型页面, 进而影响抓取结果为空
125 | # 需要做下兼容
126 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170713/54501.html 】的结果: 200
127 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170514/53986.html 】的结果: 200
128 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170413/53726.html 】的结果: 200
129 | # Top 子线程 4 请求【 http://www.dytt8.net/html/gndy/dyzz/20170327/53562.html 】的结果: 200
130 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170310/53447.html 】的结果: 200
131 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170310/53446.html 】的结果: 200
132 | # ========================================================================================
133 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20170414/53727.html 】的结果: 200
134 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20170318/53507.html 】的结果: 200
135 | # ========================================================================================
136 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091012/22194.html 】的结果: 200
137 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091012/22192.html 】的结果: 200
138 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091012/22189.html 】的结果: 200
139 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091012/22184.html 】的结果: 200
140 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091011/22172.html 】的结果: 200
141 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091012/22193.html 】的结果: 200
142 | # ========================================================================================
143 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091023/22428.html 】的结果: 200
144 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091022/22418.html 】的结果: 200
145 | # ========================================================================================
146 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091027/22516.html 】的结果: 200
147 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091027/22515.html 】的结果: 200
148 | # ========================================================================================
149 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091104/22688.html 】的结果: 200
150 | # 没有下载地址
151 | # ========================================================================================
152 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091014/22223.html 】的结果: 200
153 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091007/22074.html 】的结果: 200
154 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091018/22327.html 】的结果: 200
155 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091017/22304.html 】的结果: 200
156 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091014/22239.html 】的结果: 200
157 | # ========================================================================================
158 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091023/22441.html 】的结果: 200
159 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091023/22439.html 】的结果: 200
160 | # ========================================================================================
161 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091022/22414.html 】的结果: 200
162 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091018/22326.html 】的结果: 200
163 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091016/22282.html 】的结果: 200
164 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091014/22239.html 】的结果: 200
165 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091007/22074.html 】的结果: 200
166 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20091018/22327.html 】的结果: 200
167 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091021/22381.html 】的结果: 200
168 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20091028/22540.html 】的结果: 200
169 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20091016/22282.html 】的结果: 200
170 | # ========================================================================================
171 | # Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20141029/46502.html 】的结果: 200
172 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20141026/46484.html 】的结果: 200
173 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20141022/46458.html 】的结果: 200
174 | # ========================================================================================
175 | # Top 子线程 4 请求【 http://www.dytt8.net/html/gndy/dyzz/20120815/38998.html 】的结果: 200
176 | # Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20120811/38936.html 】的结果: 200
177 | # Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20120825/39129.html 】的结果: 200
178 | # Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20120809/38919.html 】的结果: 200
179 | # Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html 】的结果: 200
180 | # Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20120904/39251.html 】的结果: 200
181 |
182 | # '''
# Test case 4 (live request): download one detail page and probe it with every
# XPath layout the site has used over the years, then dump what was found.
url = 'http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html'
response = requests.get(
    url,
    headers=RequestModel.getHeaders(),
    proxies=RequestModel.getProxies(),
    timeout=3,
)
print(' 请求【 ' + url + ' 】的结果: ' + str(response.status_code))
response.encoding = 'GBK'
selector = etree.HTML(response.text)
# print(response.text)  # raw page content


def _first_match(tree, expressions):
    """Evaluate each XPath in order and return the first non-empty result.

    When nothing matches, the (empty) result of the last expression is returned.
    """
    hits = []
    for expression in expressions:
        hits = tree.xpath(expression)
        if len(hits):
            break
    return hits


# ---- descriptive text ------------------------------------------------------
content = selector.xpath("//div[@class='co_content8']/ul/tr/td/div/td/p/text()")
print('第 1 次: content')
print(content)

# Two images are expected: the first is the poster, the second a screenshot.
imgs = selector.xpath("//div[@class='co_content8']/ul/tr/td/div/td/p/img/@src")

# (attempt number, XPath, optional backup XPath used when fewer than 5 nodes match)
# Attempt 2 exists for pages published before 2012; the rest cover one-off layouts.
content_fallbacks = (
    (2, "//div[@class='co_content8']/ul/tr/td/div/div/td/span/text()", None),
    (3, "//div[@class='co_content8']/ul/tr/td/div/td/div/text()", None),
    (4, "//div[@class='co_content8']/ul/tr/td/div/div/td/p/font/text()",
     "//div[@class='co_content8']/ul/tr/td/p/font/text()"),
    (5, "//div[@class='co_content8']/ul/tr/td/div/div/td/p/span/text()", None),
    (6, "//div[@class='co_content8']/ul/tr/td/div/div/td/div/span/text()", None),
    (7, "//div[@class='co_content8']/ul/tr/td/div/div/td/font/text()", None),
    (8, "//div[@class='co_content8']/ul/tr/td/div/div/td/p/text()", None),
)
for attempt, primary, backup in content_fallbacks:
    if len(content):
        break
    print('第 %d 次: content' % attempt)
    content = selector.xpath(primary)
    if backup is not None and len(content) < 5:
        content = selector.xpath(backup)

# ---- images ----------------------------------------------------------------
if not len(imgs):
    imgs = _first_match(selector, (
        "//div[@class='co_content8']/ul/tr/td/div/div/td/img/@src",
        "//div[@class='co_content8']/ul/tr/td/div/div/td/p/img/@src",
        "//div[@class='co_content8']/ul/tr/td/div/div/td/div/img/@src",
        "//div[@class='co_content8']/ul/tr/td/div/td/div/img/@src",
    ))

# ---- download links --------------------------------------------------------
# The second expression exists for pages published before 2012.
ftp = _first_match(selector, (
    "//div[@class='co_content8']/ul/tr/td/div/div/td/table/tbody/tr/td/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/table/tbody/tr/td/font/a/text()",
    # Same expression as the first entry; kept so the probing order is unchanged.
    "//div[@class='co_content8']/ul/tr/td/div/div/td/table/tbody/tr/td/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/div/table/tbody/tr/td/font/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/td/div/table/tbody/tr/td/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/td/table/tbody/tr/td/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/p/span/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/div/div/table/tbody/tr/td/font/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/span/table/tbody/tr/td/font/a/text()",
    "//div[@class='co_content8']/ul/tr/td/div/div/td/div/span/div/table/tbody/tr/td/font/a/text()",
))

print(content)
print(ftp)
print(imgs)
--------------------------------------------------------------------------------
/thread/FloorWorkThread.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | import threading
5 | import time
6 |
7 | import requests
8 |
9 | from dytt8.dytt8Moive import dytt_Lastest
10 | from model.RequestModel import RequestModel
11 | from model.TaskQueue import TaskQueue
12 |
13 | '''
14 | 1)自己封装抓取二级网页多线程
15 | 2)由一级链接 抓取 电影目录
16 | 例如:由 http://www.dytt8.net/html/gndy/dyzz/list_23_2.html 获取
17 | "2017年动画喜剧《宝贝老板》英国粤三语.BD中英双字幕" 和 "页面 url 地址"等若干条电影的信息
18 | @Author monkey
19 | @Date 2017-08-08
20 | '''
21 |
class FloorWorkThread(threading.Thread):
    """Worker thread that expands listing-page URLs into movie-page URLs.

    Each worker repeatedly takes a URL from ``queue``, downloads it, hands the
    decoded HTML to ``dytt_Lastest.getMoivePageUrlList`` and pushes every
    returned link, prefixed with ``host``, to ``TaskQueue.putToMiddleQueue``.
    The thread stops as soon as it finds the queue drained.
    """

    # Exit flag; set to 1 on the instance once the queue has been drained.
    NOT_EXIST = 0

    # Prefix prepended to every link extracted from a listing page.
    host = 'http://www.dytt8.net'

    def __init__(self, queue, id):
        """Store the work queue and a numeric label used only in log output.

        :param queue: Queue-like object (``get``/``put``/``task_done``)
            holding listing-page URLs.
        :param id: identifier of this worker, printed with each request.
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.id = id

    def run(self):
        """Process URLs until the queue is empty.

        A non-200 response re-queues the URL and backs off for 20 seconds.
        Any exception raised while handling one URL is printed and the loop
        moves on (best effort; the failed URL is not retried).
        """
        # Imported here because the constructor argument is also called
        # ``queue`` and Python 2 names the module ``Queue``.
        try:
            from queue import Empty
        except ImportError:
            from Queue import Empty

        while not self.NOT_EXIST:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() lets another worker steal the last item in between and
            # leaves this thread waiting forever.
            try:
                url = self.queue.get(block=False)
            except Empty:
                # Queue drained: finish.
                self.NOT_EXIST = 1
                break

            try:
                response = requests.get(url, headers=RequestModel.getHeaders(), proxies=RequestModel.getProxies(), timeout=3)
                print('Floor 子线程 ' + str(self.id) + ' 请求【 ' + url + ' 】的结果: ' + str(response.status_code))

                # The site's pages must be decoded as GBK, otherwise the text
                # comes out garbled.
                response.encoding = 'GBK'

                if response.status_code != 200:
                    # Retry later: put the URL back before marking this
                    # attempt done so the queue never looks finished early.
                    self.queue.put(url)
                    time.sleep(20)
                else:
                    moivePageUrlList = dytt_Lastest.getMoivePageUrlList(response.text)
                    for item in moivePageUrlList:
                        each = self.host + item
                        TaskQueue.putToMiddleQueue(each)
                    time.sleep(3)

            except Exception as e:
                # Deliberately broad: one bad page must not kill the worker.
                print(e)
            finally:
                # Exactly one task_done() per successful get(), so that
                # Queue.join() on this queue can complete.
                self.queue.task_done()
--------------------------------------------------------------------------------
/thread/TopWorkThread.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | import threading
5 | import time
6 |
7 | import requests
8 |
9 | from dytt8.dytt8Moive import dytt_Lastest
10 | from model.RequestModel import RequestModel
11 | from model.TaskQueue import TaskQueue
12 |
13 | '''
14 | 1)从电影详细信息页面【http://www.dytt8.net/html/gndy/dyzz/20170806/54695.html】中抓取目标内容
15 | 2)将数据存储到数据库中
16 | @Author monkey
17 | @Date 2017-08-14
18 | '''
class TopWorkThread(threading.Thread):
    """Worker thread that scrapes movie detail pages.

    Each worker repeatedly takes a detail-page URL from ``queue``, downloads
    it, passes the URL and decoded HTML to ``dytt_Lastest.getMoiveInforms``
    and puts the returned value on ``TaskQueue.getContentQueue()``.
    The thread stops as soon as it finds the queue drained.
    """

    # Exit flag; set to 1 on the instance once the queue has been drained.
    NOT_EXIST = 0

    def __init__(self, queue, id):
        """Store the work queue and a numeric label used only in log output.

        :param queue: Queue-like object (``get``/``put``/``task_done``)
            holding detail-page URLs.
        :param id: identifier of this worker, printed with each request.
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.id = id

    def run(self):
        """Process URLs until the queue is empty.

        A non-200 response re-queues the URL and backs off for 20 seconds.
        Any exception raised while handling one URL is printed and the loop
        moves on (best effort; the failed URL is not retried).
        """
        # Imported here because the constructor argument is also called
        # ``queue`` and Python 2 names the module ``Queue``.
        try:
            from queue import Empty
        except ImportError:
            from Queue import Empty

        while not self.NOT_EXIST:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() lets another worker steal the last item in between and
            # leaves this thread waiting forever.
            try:
                url = self.queue.get(block=False)
            except Empty:
                # Queue drained: finish.
                self.NOT_EXIST = 1
                break

            try:
                response = requests.get(url, headers=RequestModel.getHeaders(), proxies=RequestModel.getProxies(), timeout=3)
                print('Top 子线程 ' + str(self.id) + ' 请求【 ' + url + ' 】的结果: ' + str(response.status_code))

                # The site's pages must be decoded as GBK, otherwise the text
                # comes out garbled.
                response.encoding = 'GBK'

                if response.status_code != 200:
                    # Retry later: put the URL back before marking this
                    # attempt done so the queue never looks finished early.
                    self.queue.put(url)
                    time.sleep(20)
                else:
                    temp = dytt_Lastest.getMoiveInforms(url, response.text)
                    TaskQueue.getContentQueue().put(temp)
                    time.sleep(5)

            except Exception as e:
                # Deliberately broad: one bad page must not kill the worker.
                print(e)
            finally:
                # Exactly one task_done() per successful get(), so that
                # Queue.join() on this queue can complete.
                self.queue.task_done()
57 |
--------------------------------------------------------------------------------
/utils/Utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | '''
5 | 工具类
6 | @Author monkey
7 | @Date 2017-08-21
8 | '''
9 |
class Utils(object):
    """Namespace for small stateless helpers."""

    @staticmethod
    def dirToList(item):
        """Flatten a movie dictionary into a list with a fixed field order.

        The order below is the order in which the values are emitted.
        Keys not listed are ignored; a missing listed key raises ``KeyError``.

        :param item: mapping that contains every key named in the order tuple.
        :return: list of the 21 values, in that order.
        """
        ordered_keys = (
            'type',
            'trans_name',
            'name',
            'decade',
            'conutry',
            'level',
            'language',
            'subtitles',
            'publish',
            'IMDB_socre',
            'douban_score',
            'format',
            'resolution',
            'size',
            'duration',
            'director',
            'actors',
            'placard',
            'screenshot',
            'ftpurl',
            'dytt8_url',
        )
        return [item[key] for key in ordered_keys]
--------------------------------------------------------------------------------