├── spiderinfo └── localhost ├── source.pyc ├── source ├── dytt8.py ├── dytt8.pyc ├── youku.pyc └── youku.py ├── Conf └── setting.ini ├── source.py └── spider.py /spiderinfo/localhost: -------------------------------------------------------------------------------- 1 | 数据库名称,用户名,密码,数据库 2 | 关键词1 3 | 关键词2 -------------------------------------------------------------------------------- /source.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source.pyc -------------------------------------------------------------------------------- /source/dytt8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/dytt8.py -------------------------------------------------------------------------------- /source/dytt8.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/dytt8.pyc -------------------------------------------------------------------------------- /source/youku.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/youku.pyc -------------------------------------------------------------------------------- /Conf/setting.ini: -------------------------------------------------------------------------------- 1 | #格式 2 | #服务器名 获取范围(ALL|RECENT) 获取来源(youku,dytt8) 3 | #localhost ALL dytt8 4 | #52kc ALL dytt8 5 | localhost ALL youku -------------------------------------------------------------------------------- /source.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | path = os.getcwd()+"/source" 4 | if not path in sys.path: 5 | sys.path.append(path) 6 | from dytt8 import * 7 | from youku import * 8 | 9 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #coding=gb2312 2 | import pickle 3 | import urllib2 4 | from sgmllib import SGMLParser 5 | import urllib 6 | import MySQLdb 7 | import time 8 | import threading 9 | import os 10 | import Queue 11 | import random 12 | import re 13 | import urlparse 14 | from source import * 15 | class webNode(object): 16 | def __init__(self,host,user,passwd,db='gxcms',prefix='gx_'): 17 | self.host = host 18 | self.user = user 19 | self.passwd = passwd 20 | self.keyword = [] 21 | self.db = db 22 | self.prefix = prefix 23 | self.scope = '' 24 | self.source = '' 25 | class mainSpider(object): 26 | def __init__(self,numberOfThreads = 1): 27 | print "Trying to load all the web server...\n" 28 | try: 29 | fp = open("Conf/setting.ini","r+b") 30 | except IOError: 31 | print "Can't find the \"webinfo\" file ...\n" 32 | return 33 | self.webinfo = [] 34 | line = fp.readline().strip() 35 | while line: 36 | if line.startswith("#"): 37 | line = fp.readline().strip() 38 | continue 39 | 40 | info = line.split() 41 | print info 42 | if len(info) != 3: 43 | print "[Error] There is an error in setting.ini\n" 44 | return 45 | SERVER = 0 46 | SCOPE = 1 47 | SOURCE = 2 48 | #if not self.webinfo.has_key(info[SOURCE]): 49 | # self.webinfo[info[SOURCE]] = [] 50 | try: 51 | fpp = open("spiderinfo/"+info[SERVER],"r+") 52 | except IOError: 53 | print "Can't find the file \"%s\"..\n" % info[SERVER] 54 | line = fp.readline().strip() 55 | continue 56 | 57 | webinfo = fpp.readline().strip().split(',') 58 | #address user passwd database prefix 59 | if len(webinfo) == 4: 60 | newWebNode = webNode(webinfo[0],webinfo[1],webinfo[2],webinfo[3]) 61 | elif len(webinfo) == 5: 62 | newWebNode = webNode(webinfo[0],webinfo[1],webinfo[2],webinfo[3],webinfo[4]) 63 | else: 64 | print "[Error] There is an error in file \"%s\" at Line 1" % info[SERVER] 65 | newWebNode.scope = info[SCOPE] 66 | newWebNode.source = info[SOURCE] 67 | keyword = fpp.readline().strip() 68 | #loading keywords if exist 69 | while keyword: 70 | keyword = keyword.decode("GB2312").encode("UTF-8") 71 | newWebNode.keyword.append(urllib.quote(keyword)) 72 | keyword = fpp.readline().strip() 73 | self.webinfo.append(newWebNode) 74 | fpp.close() 75 | line = fp.readline().strip() 76 | fp.close() 77 | self.numberOfThreads = numberOfThreads 78 | self.Qout = 0 79 | 80 | def run(self): 81 | self.spiders = Queue.Queue() 82 | for web in self.webinfo: 83 | s = "self.spiders.put(%s(web))" % web.source 84 | exec(s) 85 | self.Pool = [] 86 | for i in range(self.numberOfThreads): 87 | new_thread = threading.Thread(target = self.Mession) 88 | new_thread.setDaemon(True) 89 | self.Pool.append(new_thread) 90 | new_thread.start() 91 | while True: 92 | if self.Qout == self.numberOfThreads and self.Pool: 93 | for i in self.Pool: 94 | i.join() 95 | del self.Pool[:] 96 | print "we have done all the work ,byby!\n" 97 | return 98 | def Mession(self): 99 | while True: 100 | if self.spiders.empty(): 101 | print "One Thread got Free \n" 102 | self.Qout += 1 103 | return 104 | spider = self.spiders.get() 105 | spider.run() 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | if __name__ == "__main__": 114 | s = mainSpider() 115 | s.run() 116 | 117 | -------------------------------------------------------------------------------- /source/youku.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import urllib2 3 | from sgmllib import SGMLParser 4 | import urllib 5 | import MySQLdb 6 | import time 7 | import os 8 | import random 9 | import re 10 | import urlparse 11 | 12 | url = "http://www.soku.com/search_video/q_%s_orderby_2_page_%s" 13 | class youku(object): 14 | def __init__(self,web): 15 | self.web = web 16 | self.result = 0 17 | self.id = -1 18 | self.rfinally = 0 19 | def run(self): 20 | try: 21 | self.conn = MySQLdb.connect(host=self.web.host,user=self.web.user,passwd = self.web.passwd,db=self.web.db,charset="utf8",connect_timeout=5) 22 | print "connet to %s ,%s\n" % (self.web.host,self.web.db) 23 | except: 24 | print "can't connet to %s ,%s\n" % (self.web.host,self.web.db) 25 | return 26 | i = 0 27 | try: 28 | # python UCS-4 build的处理方式 29 | highpoints = re.compile('[\\x00-\\xFF]{2,4}') 30 | except re.error: 31 | # python UCS-2 build的处理方式 32 | highpoints = re.compile('[\uD800-\uDBFF][\uDC00-\uDFFF]') 33 | for index,keyword in enumerate(self.web.keyword): 34 | index = index + 1 35 | #组合成所需要的url 36 | for i in range(1,2):#默认扫描十页 37 | myurl = url % (keyword , i) 38 | #获取所有最新的视频 39 | food = youkuSGML(self.web.scope) 40 | #获取当前页的所有结果 41 | try: 42 | context = urllib2.urlopen(myurl,timeout=5) 43 | content = context.read() 44 | except: 45 | print "can't read from %s " % myurl 46 | continue 47 | 48 | food.feed(content) 49 | self.result += len(food.result) 50 | #将结果输入进数据库 51 | for clist in food.result: 52 | score = round(random.random(),2)*10 53 | scoreer = random.randint(10,100) 54 | atime = int(time.time()) 55 | e = False 56 | if clist["title"] == '': 57 | continue 58 | clist["title"] = MySQLdb.escape_string(clist["title"]) 59 | sql = "insert into gx_video(`cid`,`intro`,`title`,`picurl`,`playurl`,`score`,`scoreer`,`keywords`,`color`,`actor`,`director`,`content`,`area`,`language`,`year`,`serial`,`addtime`,`hits`,`monthhits`,`weekhits`,`dayhits`,`hitstime`,`stars`,`status`,`up`,`down`,`downurl`,`inputer`,`reurl`,`letter`,`genuine`) values (%d,'',\'%s\',\'%s\',\'%s\',%d,%d,'','','','','','','',0,0,%d,0,0,0,0,0,0,1,0,0,'','','','',0)" % (index,clist["title"],clist["pic"],clist["link"],score,scoreer,atime) 60 | print sql 61 | try: 62 | try: 63 | self.conn.ping() 64 | except Exception,e: 65 | try: 66 | self.conn = MySQLdb.connect(host=self.web.host,user=self.web.user,passwd = self.web.passwd,db=self.web.db,charset="utf8",connect_timeout=5) 67 | print "Reconnet to %s ,%s\n" % (self.web.host,self.web.db) 68 | except: 69 | print "can't Reconnet to %s ,%s\n" % (self.web.host,self.web.db) 70 | e = True 71 | if not e: 72 | self.conn.query(sql) 73 | self.rfinally += 1 74 | except:print sql+"/n" 75 | print "%s get %d results\n and %s insert successfully" % (self.web.db,self.result,self.rfinally) 76 | self.conn.close() 77 | class youkuSGML(SGMLParser): 78 | def __init__(self,scope): 79 | SGMLParser.__init__(self) 80 | self.startflag = 0 81 | self.result =[] 82 | self.scope = scope 83 | print self.scope 84 | self.newr = {} 85 | self.getpic = False 86 | self.getlink = False 87 | self.gettime = False 88 | def start_div(self,attrs): 89 | if self.startflag > 0: 90 | self.startflag += 1 91 | for k,v in attrs: 92 | if k == "class" and v.strip() == "sk-vlist clearfix": 93 | self.startflag = 1 94 | return 95 | if self.startflag > 0 : 96 | if k == "class" and v == "v-thumb": 97 | self.getpic = True 98 | if k == "class" and v == "v-link": 99 | self.getlink = True 100 | def start_img(self,attrs): 101 | if self.getpic and self.startflag: 102 | for k,v in attrs: 103 | if k == "src": 104 | self.newr["pic"] = v 105 | if k == "alt": 106 | self.newr["title"] = v 107 | self.gitpic = False 108 | def start_a(self,attrs): 109 | if self.getlink and self.startflag: 110 | for k,v in attrs: 111 | if k == "href": 112 | self.newr["link"] = v 113 | self.result.append(self.newr) 114 | self.newr = dict() 115 | break 116 | self.getlink = False 117 | def start_span(self,attrs): 118 | if self.startflag: 119 | for k,v in attrs: 120 | if k == "class" and v == "pub": 121 | self.gettime = True 122 | def handle_data(self,text): 123 | if self.gettime and self.scope != 'ALL': 124 | #check the uplaod time 125 | if text.strip("0123456789") != '\xe5\xb0\x8f\xe6\x97\xb6\xe5\x89\x8d': 126 | self.startflag = 0 127 | self.gettime = False 128 | else: 129 | self.gettime = False 130 | def end_div(self): 131 | if self.startflag> 0: 132 | self.startflag -= 1 133 | --------------------------------------------------------------------------------