├── spiderinfo
    └── localhost
├── source.pyc
├── source
    ├── dytt8.py
    ├── dytt8.pyc
    ├── youku.pyc
    └── youku.py
├── Conf
    └── setting.ini
├── source.py
└── spider.py


/spiderinfo/localhost:
--------------------------------------------------------------------------------
1 | 数据库名称,用户名,密码,数据库
2 | 关键词1
3 | 关键词2


--------------------------------------------------------------------------------
/source.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source.pyc


--------------------------------------------------------------------------------
/source/dytt8.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/dytt8.py


--------------------------------------------------------------------------------
/source/dytt8.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/dytt8.pyc


--------------------------------------------------------------------------------
/source/youku.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markstock7/Simple-Computer-Robot/HEAD/source/youku.pyc


--------------------------------------------------------------------------------
/Conf/setting.ini:
--------------------------------------------------------------------------------
1 | #格式
2 | #服务器名 获取范围(ALL|RECENT) 获取来源(youku,dytt8)
3 | #localhost ALL dytt8
4 | #52kc ALL dytt8
5 | localhost ALL youku


--------------------------------------------------------------------------------
/source.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
# Make the spider modules that live in ./source (relative to the current
# working directory) importable by their bare module names.
path = os.getcwd()+"/source"
if path not in sys.path:
    sys.path.append(path)
6 | from dytt8 import *
7 | from youku import *
8 | 
9 | 


--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
  1 | #coding=gb2312
  2 | import pickle
  3 | import urllib2
  4 | from sgmllib import SGMLParser
  5 | import urllib
  6 | import MySQLdb
  7 | import time
  8 | import threading
  9 | import os
 10 | import Queue
 11 | import random
 12 | import re
 13 | import urlparse
 14 | from source import *
 15 | class webNode(object):
 16 |     def __init__(self,host,user,passwd,db='gxcms',prefix='gx_'):
 17 |         self.host = host
 18 |         self.user = user
 19 |         self.passwd = passwd
 20 |         self.keyword = []
 21 |         self.db = db
 22 |         self.prefix = prefix
 23 |         self.scope = ''
 24 |         self.source = ''
 25 | class mainSpider(object):
 26 |     def __init__(self,numberOfThreads = 1):
 27 |         print "Trying to load all the web server...\n"
 28 |         try:
 29 |             fp = open("Conf/setting.ini","r+b")
 30 |         except IOError:
 31 |             print "Can't find the \"webinfo\" file ...\n"
 32 |             return
 33 |         self.webinfo = []
 34 |         line = fp.readline().strip()
 35 |         while line:
 36 |             if line.startswith("#"):
 37 |                 line = fp.readline().strip()
 38 |                 continue
 39 |                 
 40 |             info = line.split()
 41 |             print info
 42 |             if len(info) != 3:
 43 |                 print "[Error] There is an error in setting.ini\n"
 44 |                 return
 45 |             SERVER = 0
 46 |             SCOPE = 1
 47 |             SOURCE = 2
 48 |             #if not self.webinfo.has_key(info[SOURCE]):
 49 |             #    self.webinfo[info[SOURCE]] = []
 50 |             try:
 51 |                 fpp = open("spiderinfo/"+info[SERVER],"r+")
 52 |             except IOError:
 53 |                 print "Can't find the file \"%s\"..\n" % info[SERVER]
 54 |                 line = fp.readline().strip()
 55 |                 continue
 56 |             
 57 |             webinfo = fpp.readline().strip().split(',')
 58 |             #address user passwd database prefix
 59 |             if len(webinfo) == 4:
 60 |                 newWebNode = webNode(webinfo[0],webinfo[1],webinfo[2],webinfo[3])
 61 |             elif len(webinfo) == 5:
 62 |                 newWebNode = webNode(webinfo[0],webinfo[1],webinfo[2],webinfo[3],webinfo[4])
 63 |             else:
 64 |                 print "[Error] There is an error in file \"%s\" at Line 1" % info[SERVER]
 65 |             newWebNode.scope = info[SCOPE]
 66 |             newWebNode.source = info[SOURCE]
 67 |             keyword = fpp.readline().strip()
 68 |             #loading keywords if exist
 69 |             while keyword:
 70 |                 keyword = keyword.decode("GB2312").encode("UTF-8")
 71 |                 newWebNode.keyword.append(urllib.quote(keyword))
 72 |                 keyword = fpp.readline().strip()
 73 |             self.webinfo.append(newWebNode)
 74 |             fpp.close()
 75 |             line = fp.readline().strip()
 76 |         fp.close()
 77 |         self.numberOfThreads = numberOfThreads
 78 |         self.Qout = 0
 79 |     
 80 |     def run(self):
 81 |         self.spiders = Queue.Queue()
 82 |         for web in self.webinfo:
 83 |             s = "self.spiders.put(%s(web))" % web.source
 84 |             exec(s)
 85 |         self.Pool = []
 86 |         for i in range(self.numberOfThreads):
 87 |             new_thread = threading.Thread(target = self.Mession)
 88 |             new_thread.setDaemon(True)
 89 |             self.Pool.append(new_thread)
 90 |             new_thread.start()
 91 |         while True:
 92 |             if self.Qout == self.numberOfThreads and self.Pool:
 93 |                 for i in self.Pool:
 94 |                     i.join()
 95 |                 del self.Pool[:]
 96 |                 print "we have done all the work ,byby!\n"
 97 |                 return
 98 |     def Mession(self):
 99 |         while True:
100 |             if self.spiders.empty():
101 |                 print "One Thread got Free \n"
102 |                 self.Qout += 1
103 |                 return
104 |             spider = self.spiders.get()
105 |             spider.run()
106 |             
107 |         
108 | 
109 |             
110 |                 
111 |                 
112 | 
if __name__ == "__main__":
    # Script entry point: load the configuration, then crawl every target.
    crawler = mainSpider()
    crawler.run()
116 |     
117 | 


--------------------------------------------------------------------------------
/source/youku.py:
--------------------------------------------------------------------------------
  1 | #coding=utf-8
  2 | import urllib2
  3 | from sgmllib import SGMLParser
  4 | import urllib
  5 | import MySQLdb
  6 | import time
  7 | import os
  8 | import random
  9 | import re
 10 | import urlparse
 11 | 
# Search URL template: the first %s is the search keyword, the second %s is
# the result page number.
url = "http://www.soku.com/search_video/q_%s_orderby_2_page_%s"
 13 | class youku(object):
 14 |     def __init__(self,web):
 15 |         self.web = web
 16 |         self.result = 0
 17 |         self.id = -1
 18 |         self.rfinally = 0
 19 |     def run(self):
 20 |         try:
 21 |             self.conn = MySQLdb.connect(host=self.web.host,user=self.web.user,passwd = self.web.passwd,db=self.web.db,charset="utf8",connect_timeout=5)
 22 |             print "connet to %s ,%s\n" % (self.web.host,self.web.db)
 23 |         except:
 24 |             print "can't connet to %s ,%s\n" % (self.web.host,self.web.db)
 25 |             return
 26 |         i = 0
 27 |         try:  
 28 |             # python UCS-4 build的处理方式  
 29 |             highpoints = re.compile('[\\x00-\\xFF]{2,4}')  
 30 |         except re.error:  
 31 |             # python UCS-2 build的处理方式  
 32 |             highpoints = re.compile('[\uD800-\uDBFF][\uDC00-\uDFFF]')  
 33 |         for index,keyword in enumerate(self.web.keyword):
 34 |             index = index + 1
 35 |             #组合成所需要的url
 36 |             for i in range(1,2):#默认扫描十页
 37 |                 myurl = url % (keyword , i)
 38 |                 #获取所有最新的视频
 39 |                 food = youkuSGML(self.web.scope)
 40 |                 #获取当前页的所有结果
 41 |                 try:
 42 |                     context = urllib2.urlopen(myurl,timeout=5)
 43 |                     content = context.read()
 44 |                 except:
 45 |                     print "can't read from %s " % myurl
 46 |                     continue
 47 | 
 48 |                 food.feed(content)
 49 |                 self.result += len(food.result)
 50 |         #将结果输入进数据库
 51 |                 for clist in food.result:
 52 |                     score = round(random.random(),2)*10
 53 |                     scoreer = random.randint(10,100)
 54 |                     atime = int(time.time())
 55 |                     e = False
 56 |                     if clist["title"] == '':
 57 |                         continue
 58 |                     clist["title"] = MySQLdb.escape_string(clist["title"])
 59 |                     sql = "insert into gx_video(`cid`,`intro`,`title`,`picurl`,`playurl`,`score`,`scoreer`,`keywords`,`color`,`actor`,`director`,`content`,`area`,`language`,`year`,`serial`,`addtime`,`hits`,`monthhits`,`weekhits`,`dayhits`,`hitstime`,`stars`,`status`,`up`,`down`,`downurl`,`inputer`,`reurl`,`letter`,`genuine`) values (%d,'',\'%s\',\'%s\',\'%s\',%d,%d,'','','','','','','',0,0,%d,0,0,0,0,0,0,1,0,0,'','','','',0)" % (index,clist["title"],clist["pic"],clist["link"],score,scoreer,atime)
 60 |                     print sql
 61 |                     try:
 62 |                         try:
 63 |                             self.conn.ping()
 64 |                         except Exception,e:
 65 |                             try:
 66 |                                self.conn = MySQLdb.connect(host=self.web.host,user=self.web.user,passwd = self.web.passwd,db=self.web.db,charset="utf8",connect_timeout=5)
 67 |                                print "Reconnet to %s ,%s\n" % (self.web.host,self.web.db)
 68 |                             except:
 69 |                                print "can't Reconnet to %s ,%s\n" % (self.web.host,self.web.db)
 70 |                                e = True
 71 |                         if not e:
 72 |                             self.conn.query(sql)
 73 |                             self.rfinally += 1
 74 |                     except:print sql+"/n"
 75 |         print "%s get %d results\n and %s insert successfully" % (self.web.db,self.result,self.rfinally)
 76 |         self.conn.close()
 77 | class youkuSGML(SGMLParser):
 78 |     def __init__(self,scope):
 79 |         SGMLParser.__init__(self)
 80 |         self.startflag = 0
 81 |         self.result =[]
 82 |         self.scope = scope
 83 |         print self.scope
 84 |         self.newr = {}
 85 |         self.getpic = False
 86 |         self.getlink = False
 87 |         self.gettime = False
 88 |     def start_div(self,attrs):
 89 |         if self.startflag > 0:
 90 |            self.startflag += 1
 91 |         for k,v in attrs:
 92 |            if k == "class" and v.strip() == "sk-vlist clearfix":
 93 |                self.startflag = 1
 94 |                return 
 95 |            if self.startflag > 0 :
 96 |                if k == "class" and v == "v-thumb":
 97 |                    self.getpic = True
 98 |                if k == "class" and v == "v-link":
 99 |                    self.getlink = True
100 |     def start_img(self,attrs):
101 |         if self.getpic and self.startflag:
102 |             for k,v in attrs:
103 |                 if k == "src":
104 |                     self.newr["pic"] = v
105 |                 if k == "alt":
106 |                     self.newr["title"] = v
107 |             self.gitpic = False
108 |     def start_a(self,attrs):
109 |         if self.getlink and self.startflag:
110 |             for k,v in attrs:
111 |                 if k == "href":
112 |                     self.newr["link"] = v
113 |                     self.result.append(self.newr)
114 |                     self.newr = dict()
115 |                     break
116 |             self.getlink = False
117 |     def start_span(self,attrs):
118 |         if self.startflag:
119 |             for k,v in attrs:
120 |                 if k == "class" and v == "pub":
121 |                     self.gettime = True
122 |     def handle_data(self,text):
123 |         if self.gettime and self.scope != 'ALL':
124 |             #check the uplaod time
125 |             if text.strip("0123456789") != '\xe5\xb0\x8f\xe6\x97\xb6\xe5\x89\x8d':
126 |                 self.startflag = 0
127 |                 self.gettime = False
128 |             else:
129 |                self.gettime = False
130 |     def end_div(self):
131 |         if self.startflag> 0:
132 |             self.startflag -= 1
133 | 


--------------------------------------------------------------------------------