├── README.md
└── Crawl.py

/README.md:
--------------------------------------------------------------------------------
# Python-Crawler
A multi-threaded Python crawler that scrapes movie download links from 电影天堂 (ygdy8.com).
--------------------------------------------------------------------------------
/Crawl.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import threading
import os
from urllib2 import Request, urlopen, URLError, HTTPError
from lxml import etree


class myThread(threading.Thread):  # Subclass of threading.Thread: one worker thread per category
    def __init__(self, url, newdir, CrawledURLs):
        threading.Thread.__init__(self)
        self.url = url
        self.newdir = newdir
        self.CrawledURLs = CrawledURLs

    def run(self):  # run() is executed when the thread is started
        CrawListPage(self.url, self.newdir, self.CrawledURLs)


starturl = "http://www.ygdy8.com/index.html"
host = "http://www.ygdy8.com"


# Check whether a URL has already been crawled
def __isexit(newurl, CrawledURLs):
    for url in CrawledURLs:
        if url == newurl:
            return True
    return False


# Fetch a page; returns the raw body, or "error" on failure
def __getpage(url):
    req = Request(url)
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 ' \
                 '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    req.add_header('User-Agent', user_agent)
    try:
        response = urlopen(req, timeout=60)
    except Exception:
        # Network error or timeout: signal failure to the caller
        return "error"
    else:
        page = response.read()
        return page


# Parse a movie detail page and save the download links it contains
def CrawlSourcePage(url, filedir, filename, CrawledURLs):
    print url
    page = __getpage(url)
    if page == "error":
        return
    CrawledURLs.append(url)
    page = page.decode('gbk', 'ignore')
    tree = etree.HTML(page)
    Nodes = tree.xpath("//div[@align='left']//table//a")
    try:
        source = filedir + "/" + filename + ".txt"
        f = open(source.decode("utf-8"), 'w')
        for node in Nodes:
            sourceurl = node.xpath("text()")[0]
            f.write(sourceurl.encode("utf-8") + "\n")
        f.close()
    except Exception:
        print "Failed to write download links for", url

# Parse a category list page
def CrawListPage(indexurl, filedir, CrawledURLs):
    print "Parsing category page"
    print indexurl
    page = __getpage(indexurl)
    if page == "error":
        return
    CrawledURLs.append(indexurl)
    page = page.decode('gbk', 'ignore')
    tree = etree.HTML(page)
    Nodes = tree.xpath("//div[@class='co_content8']//a")
    for node in Nodes:
        url = node.xpath("@href")[0]
        if re.match(r'/', url):
            # Absolute path: a movie detail page whose download links can be extracted
            if not __isexit(host + url, CrawledURLs):
                # Replace characters that are not allowed in file names
                filename = node.xpath("text()")[0].encode("utf-8").replace("/", " ") \
                    .replace("\\", " ") \
                    .replace(":", " ") \
                    .replace("*", " ") \
                    .replace("?", " ") \
                    .replace("\"", " ") \
                    .replace("<", " ") \
                    .replace(">", " ") \
                    .replace("|", " ")
                CrawlSourcePage(host + url, filedir, filename, CrawledURLs)
        else:
            # Relative path: another page of the same list, recurse into it
            index = indexurl.rfind("/")
            baseurl = indexurl[0:index + 1]
            pageurl = baseurl + url
            if not __isexit(pageurl, CrawledURLs):
                print "Recursing into list page", pageurl
                CrawListPage(pageurl, filedir, CrawledURLs)


# Parse the home page and start one crawler thread per category
def CrawIndexPage(starturl):
    print "Crawling the home page"
    page = __getpage(starturl)
    if page == "error":
        return
    page = page.decode('gbk', 'ignore')
    tree = etree.HTML(page)
    Nodes = tree.xpath("//div[@id='menu']//a")
    print "Found", len(Nodes), "links on the home page"
    for node in Nodes:
        CrawledURLs = []
        CrawledURLs.append(starturl)
        url = node.xpath("@href")[0]
        if re.match(r'/html/[A-Za-z0-9_/]+/index.html', url):
            if not __isexit(host + url, CrawledURLs):
                try:
                    catalog = node.xpath("text()")[0].encode("utf-8")
                    newdir = "E:/电影资源/" + catalog
                    os.makedirs(newdir.decode("utf-8"))
                    print "Created category directory: " + newdir
                    thread = myThread(host + url, newdir, CrawledURLs)
                    thread.start()
                except Exception:
                    pass


CrawIndexPage(starturl)
--------------------------------------------------------------------------------
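
The script targets Python 2 (`urllib2`, `print` statements) and needs `lxml` installed. Running `python Crawl.py` creates one directory per category under `E:/电影资源/` and writes one `.txt` file of download links per movie. To experiment with the parsing logic in isolation, the fetch-and-parse step can be exercised on its own; below is a minimal sketch that reuses the User-Agent header, the GBK decoding, and the `co_content8` XPath from `Crawl.py`. The category URL is a placeholder assumption and may need to be swapped for a live list page.

```python
# Minimal standalone sketch of the fetch-and-parse step (Python 2, lxml required).
# The URL below is a placeholder for a category list page on ygdy8.com.
from urllib2 import Request, urlopen
from lxml import etree

url = "http://www.ygdy8.com/html/gndy/index.html"
req = Request(url)
req.add_header('User-Agent', 'Mozilla/5.0')
page = urlopen(req, timeout=60).read().decode('gbk', 'ignore')
tree = etree.HTML(page)

# Same XPath as CrawListPage: every link inside the category content block
for node in tree.xpath("//div[@class='co_content8']//a"):
    href = node.xpath("@href")
    text = node.xpath("text()")
    if href:
        print href[0], (text[0] if text else u"").encode("utf-8")
```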