├── 1688
│   └── get_tel.py
├── 11选5
│   └── chart.py
├── README.md
├── amazon
│   ├── get_items.py
│   └── items_usa.py
├── anjuke
│   └── get_house.py
├── dianping
│   └── get_info.py
├── douban
│   ├── dou_movie.py
│   └── dou_tv.py
├── guimi
│   └── guimi.py
├── ingredient
│   ├── get_infor.py
│   └── get_ingre.py
├── itslaw
│   └── get_anli.py
├── job
│   ├── Job_get.py
│   └── REANME.md
├── www.aihuishou.com
│   └── get_price.py
├── www.hexun.com
│   └── hexun.py
├── www.liepin.com
│   └── liepin.py
├── www.renrendai.com
│   └── renrendai.py
├── www.yanglao.com.cn
│   └── get_infor.py
└── www.zimuzu.tv
    ├── movie_get.py
    └── tv_get.py
/11选5/chart.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import xlwt3
6 | import os
7 | import time
8 |
9 | class Get_infor():
10 | def __init__(self):
11 | self.headers = {
12 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
13 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
14 | 'Accept-Language': 'en-US,en;q=0.5',
15 | 'Accept-Encoding': 'gzip, deflate',
16 | 'Connection': 'keep-alive'}
17 | self.urls={'北京11选5': 'http://pub.icaile.com/bj11x5kjjg.php', '新疆11选5': 'http://pub.icaile.com/xj11x5kjjg.php', '湖北11选5': 'http://pub.icaile.com/hb11x5kjjg.php', '江西11选5': 'http://pub.icaile.com/jx11x5kjjg.php', '山西11选5': 'http://pub.icaile.com/sx11x5kjjg.php', '宁夏11选5': 'http://pub.icaile.com/nx11x5kjjg.php', '辽宁11选5': 'http://pub.icaile.com/ln11x5kjjg.php', '贵州11选5': 'http://pub.icaile.com/gz11x5kjjg.php', '云南11选5': 'http://pub.icaile.com/yn11x5kjjg.php', '西藏11选5': 'http://pub.icaile.com/xz11x5kjjg.php', '重庆11选5': 'http://pub.icaile.com/cq11x5kjjg.php', '吉林11选5': 'http://pub.icaile.com/jl11x5kjjg.php', '黑龙江11选5': 'http://pub.icaile.com/hlj11x5kjjg.php', '河南11选5': 'http://pub.icaile.com/hn11x5kjjg.php', '上海11选5': 'http://pub.icaile.com/sh11x5kjjg.php', '广东11选5': 'http://pub.icaile.com/gd11x5kjjg.php', '四川11选5': 'http://pub.icaile.com/sc11x5kjjg.php', '山东11选5': 'http://pub.icaile.com/sd11x5kjjg.php', '安徽11选5': 'http://pub.icaile.com/ah11x5kjjg.php', '浙江11选5': 'http://pub.icaile.com/zj11x5kjjg.php', '江苏11选5': 'http://pub.icaile.com/js11x5kjjg.php', '内蒙古11选5': 'http://pub.icaile.com/nmg11x5kjjg.php', '甘肃11选5': 'http://pub.icaile.com/gs11x5kjjg.php', '福建11选5': 'http://pub.icaile.com/fj11x5kjjg.php', '河北11选5': 'http://pub.icaile.com/heb11x5kjjg.php', '广西11选5': 'http://pub.icaile.com/gx11x5kjjg.php', '天津11选5': 'http://pub.icaile.com/tj11x5kjjg.php', '陕西11选5': 'http://pub.icaile.com/shx11x5kjjg.php'}
18 | def run(self):
19 | try:
20 | os.mkdir('data')
21 | except:
22 | print('..')
23 | for key in self.urls:
24 | try:
25 | html=requests.get(self.urls[key],headers=self.headers).text
26 | except:
27 | continue
28 | table=BeautifulSoup(html,'html.parser').find('table',attrs={'class':'today'}).find_all('tr')
29 | self.f=xlwt3.Workbook()
30 | self.sheet=self.f.add_sheet('sheet')
31 | self.count=0
32 | for item in table:
33 | try:
34 | infor=item.find_all('td')
35 | self.sheet.write(self.count,0,infor[0].get_text())
36 | num=1
37 | for i in infor[2].find_all('em'):
38 | self.sheet.write(self.count,num,i.get_text())
39 | num+=1
40 | self.count+=1
41 | except:
42 | continue
43 | self.f.save('data/'+key+'.xls')
44 | def test():
45 | html=requests.get('http://pub.icaile.com/sd11x5kjjg.php').text
46 | table=BeautifulSoup(html).find('div',attrs={'class':'left-nav'}).find('ul').find_all('li')
47 | urls={}
48 | for i in table:
49 | urls[i.get_text()]=i.find('a').get('href')
50 | print(urls)
51 |
52 | if __name__=='__main__':
53 | print('1.直接抓取')
54 | print('2.定时抓取')
55 | num=input('输入序号:')
56 | if(num=='1'):
57 | work=Get_infor()
58 | work.run()
59 | print('OK')
60 | elif(num=='2'):
61 | times=input('输入间隔时间(小时):')
62 | while True:
63 | work=Get_infor()
64 | work.run()
65 | print('OK')
66 | time.sleep(float(times)*3600)
67 |
--------------------------------------------------------------------------------
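
chart.py drives its Excel output through xlwt3, a Python 3 port of xlwt that has been unmaintained for years. The maintained xlwt package (1.0 and later) runs on Python 3 and exposes the same Workbook/add_sheet/write/save calls used in run(); a minimal sketch of writing one draw row under that assumed substitution (the values are placeholders):

    import os
    import xlwt  # assumed stand-in for xlwt3; the calls below are identical in both packages

    os.makedirs('data', exist_ok=True)          # same output folder that run() creates with os.mkdir
    f = xlwt.Workbook()
    sheet = f.add_sheet('sheet')
    sheet.write(0, 0, '20150812-01')            # draw number goes in column 0, as in run()
    for col, ball in enumerate(['01', '04', '07', '09', '11'], start=1):
        sheet.write(0, col, ball)               # the five drawn numbers fill columns 1-5
    f.save('data/北京11选5.xls')
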
/1688/get_tel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import re
7 | import random
8 | import xlwt3
9 |
10 | class Get_ip(object):
11 | """docstring for Get_ip"""
12 | def __init__(self):
13 | super(Get_ip, self).__init__()
14 | self.url='http://www.xicidaili.com/nn/'
15 | self.headers = {
16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Accept-Encoding': 'gzip, deflate',
20 | 'Connection': 'keep-alive'}
21 | self.session=requests.session()
22 | def run(self):
23 | html=self.session.get(self.url,headers=self.headers).text
24 | table=BeautifulSoup(html).find('table',attrs={'id':'ip_list'}).find_all('tr')
25 | http_ips=[]
26 | for item in table[1:]:
27 | lists=item.find_all('td')
28 | ip={'ip':'','port':''}
29 | if lists[6].get_text()=='HTTP':
30 | ip['ip']=lists[2].get_text()
31 | ip['port']=lists[3].get_text()
32 | http_ips.append(ip)
33 | return http_ips
34 |
35 | class get_urls():
36 | def __init__(self,url,page,ip):
37 | self.page=page
38 | self.url=url
39 | self.proxies={
40 | 'http':'http://'+ip['ip']+':'+ip['port']
41 | }
42 | def get_url(self):
43 | headers = {
44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | 'Accept-Language': 'en-US,en;q=0.5',
47 | 'Accept-Encoding': 'gzip, deflate',
48 |
49 | 'Connection': 'keep-alive'}
50 | html=requests.get(self.url+'&beginPage='+str(self.page),headers=headers,proxies=self.proxies,timeout=10).text
51 | soup=BeautifulSoup(html)
52 | table=soup.find('div',attrs={'id':'sw_mod_mainblock'}).find('ul').find_all('div',attrs={'class':'list-item-left'})
53 | urls=[]
54 | for item in table:
55 | urls.append(item.find('a').get('href'))
56 | return urls
57 |
58 | class get_contact():
59 | def __init__(self,url,ip):
60 | #super(get_contact, self).__init__()
61 | self.proxies={
62 | 'http':'http://'+ip['ip']+':'+ip['port']
63 | }
64 | self.url=url
65 | self.headers = {
66 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
67 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
68 | 'Accept-Language': 'en-US,en;q=0.5',
69 |
70 | 'Accept-Encoding': 'gzip, deflate'
71 | }
72 | self.session=requests.session()
73 | def run(self):
74 | try:
75 | html=self.session.get(self.url,headers=self.headers,proxies=self.proxies,timeout=10).text
76 | contact_url=BeautifulSoup(html).find('div',attrs={'class':'top-nav-bar-box'}).find('li',attrs={'data-page-name':'contactinfo'}).find('a').get('href')
77 | except:
78 | self.statue=0
79 | print('~~~')
80 | return
81 | self.statue=1
82 | try:
83 | #time.sleep(random.randint(4, 6))
84 | html=self.session.get(contact_url,headers=self.headers,proxies=self.proxies,timeout=10).text
85 | table=BeautifulSoup(html).find('div',attrs={'class':'fd-line'}).find_all('dl')
86 | self.title=BeautifulSoup(html).find('div',attrs={'class':'contact-info'}).find('h4').get_text()
87 | self.infor=[]
88 | for item in table[:-1]:
89 | self.infor.append(item.get_text().replace('\n','').replace(' ',''))
90 | except:
91 | self.statue=0
92 |
93 | class Main():
94 | def __init__(self):
95 | self.f=xlwt3.Workbook()
96 | self.sheet=self.f.add_sheet('sheet')
97 | self.count=0
98 | work=Get_ip()
99 | self.ips=work.run()
100 | def work(self):
101 | search_url=input('输入链接:')
102 | for i in range(100):
103 | url_get=get_urls(search_url,i+1,self.ips[random.randint(0, len(self.ips)-1)])
104 | try:
105 | urls=url_get.get_url()
106 | except:
107 | continue
108 | for url in urls:
109 | #time.sleep(random.randint(6, 9))
110 | spider=get_contact(url,self.ips[random.randint(0, len(self.ips)-1)])
111 | spider.run()
112 | if spider.statue==0:
113 | continue
114 | self.sheet.write(self.count,0,spider.title)
115 | num=1
116 | for infor in spider.infor:
117 | self.sheet.write(self.count,num,infor)
118 | num+=1
119 | self.count+=1
120 | print(self.count)
121 | self.f.save('data.xls')
122 | #time.sleep(random.randint(5, 8))
123 | def test():
124 |     test=get_urls('http://s.1688.com/company/company_search.htm?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30',1,Get_ip().run()[0])  # get_urls() also needs a proxy dict as its third argument
125 | print(test.get_url())
126 |
127 | if __name__=='__main__':
128 | work=Main()
129 | work.work()
130 |
--------------------------------------------------------------------------------
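
Main.work() wires the three classes above together: Get_ip scrapes an HTTP proxy list from xicidaili.com, get_urls walks the 1688 company-search result pages through one of those proxies, and get_contact pulls each company's contact page. A condensed usage sketch of that flow, run inside this module and assuming both sites still answer the way they did when the script was written:

    import random

    ips = Get_ip().run()                                   # [{'ip': '1.2.3.4', 'port': '8080'}, ...]
    search_url = ('http://s.1688.com/company/company_search.htm'
                  '?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30')
    page_urls = get_urls(search_url, 1, random.choice(ips)).get_url()   # company links on page 1
    for url in page_urls:
        spider = get_contact(url, random.choice(ips))
        spider.run()
        if spider.statue:                                  # 'statue' is the code's status flag
            print(spider.title, spider.infor)
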
/README.md:
--------------------------------------------------------------------------------
1 | # Nyspider
2 | A collection of assorted web crawlers.
3 | 
4 | 11选5---latest 11选5 (pick-5-of-11 lottery) draw results for each province
5 | 
6 | amazon---Amazon product information
7 | 
8 | anjuke---residential community listings from Anjuke
9 | 
10 | dianping---merchant information from Dianping
11 | 
12 | douban---Douban movie information
13 | 
14 | guimi---product reviews from Kimiss (闺蜜网)
15 | 
16 | ingredient---cosmetics ingredient information
17 | 
18 | itslaw---court case records from Itslaw (无讼网)
19 | 
20 | job---job postings from various recruitment sites
21 | 
22 | www.aihuishou.com ---phone trade-in prices from Aihuishou
23 | 
24 | www.yanglao.com.cn ---nursing-home listings from Yanglao (养老网)
25 | 
26 | www.zimuzu.tv ---ed2k links for movies and TV series
27 | 
28 | www.hexun.com ---Hexun, stock trade detail data
29 | 
30 | www.renrendai.com ---Renrendai, loan listing information
31 | 
--------------------------------------------------------------------------------
/amazon/get_items.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt
5 | from bs4 import BeautifulSoup
6 | import time
7 | import re
8 | import random
9 |
10 | class get_urls():
11 | def __init__(self,page,keyword):
12 | self.session=requests.session()
13 | self.page=page
14 | self.keyword=keyword
15 | def get_url(self):
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'DNT': 1,
22 | 'Connection': 'keep-alive'}
23 | html=self.session.get('http://www.amazon.cn/s/ref=sr_pg_3?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8&qid=1442030727',headers=headers).text
24 | table=BeautifulSoup(html).find('div',attrs={'id':'rightContainerATF'})
25 |         rel=''  # the original pattern was lost here; it should match one search-result item block per hit so an href can be pulled from each below
26 | table=re.findall(re.compile(rel),str(table))
27 | urls=[]
28 | rel='href="(.*?)"'
29 | rel=re.compile(rel)
30 | for item in table:
31 | url=re.findall(rel,str(item))
32 | try:
33 | urls.append(url[0])
34 | except:
35 | continue
36 | return urls
37 |
38 | class get_infor():
39 | def __init__(self,url):
40 | self.url=url
41 | self.session=requests.session()
42 | self.headers = {
43 | "X-Forwarded-For":str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255)),
44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | 'Accept-Language': 'en-US,en;q=0.5',
47 | 'Accept-Encoding': 'gzip, deflate',
48 | 'DNT': 1,
49 | 'Connection': 'keep-alive'}
50 | self.get_info()
51 | def get_info(self):
52 | self.statue=0
53 | while True:
54 | try:
55 | html=self.session.get(self.url,headers=self.headers,timeout=5).text
56 | soup=BeautifulSoup(html)
57 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split())
58 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','')
59 | try:
60 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'cmrs-atf'}).get_text())))
61 | except:
62 | self.previews=0
63 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires')
64 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content
65 | self.statue=1
66 | break
67 | except:
68 | break
69 |
70 | class Main():
71 | def __init__(self):
72 | self.f=xlwt.Workbook()
73 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True)
74 | self.count=0
75 | def work(self):
76 | keyword=input("输入关键字(英文):")
77 | page=input("输入页数:")
78 | for i in range(int(page)):
79 | try:
80 | work=get_urls(i+1,keyword)
81 | urls=work.get_url()
82 | except:
83 | continue
84 | for url in urls:
85 | item=get_infor(url)
86 | if item.statue==0:
87 | continue
88 | with open(str(self.count)+item.picture_url[-4:],'wb') as img:
89 | img.write(item.picture)
90 | img.close()
91 | self.sheet.write(self.count,0,str(self.count))
92 | self.sheet.write(self.count,1,item.title)
93 | self.sheet.write(self.count,2,item.price)
94 | self.sheet.write(self.count,3,item.previews)
95 | self.count+=1
96 | self.f.save('data.xls')
97 | print(self.count)
98 |
99 | if __name__=='__main__':
100 | work=Main()
101 | work.work()
102 |
--------------------------------------------------------------------------------
/amazon/items_usa.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt
5 | from bs4 import BeautifulSoup
6 | import time
7 | import re
8 | import random
9 |
10 | class get_urls():
11 | def __init__(self,page,keyword):
12 | self.session=requests.session()
13 | self.page=page
14 | self.keyword=keyword
15 | def get_url(self):
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'DNT': 1,
22 | 'Connection': 'keep-alive'}
23 | html=self.session.get('http://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8',headers=headers).text
24 | #table=BeautifulSoup(html).find('ul',attrs={'id':'s-results-list-atf'})
25 | rel='a class="a-link-normal a-text-normal" href="(http.*?)"'
26 | table=re.findall(re.compile(rel),str(html))
27 | urls=list(set(table))
28 | return urls
29 |
30 | class get_infor():
31 | def __init__(self,url):
32 | self.url=url
33 | self.session=requests.session()
34 | self.headers = {
35 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
36 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
37 | 'Accept-Language': 'en-US,en;q=0.5',
38 | 'Accept-Encoding': 'gzip, deflate',
39 | 'DNT': 1,
40 | 'Connection': 'keep-alive'}
41 | self.get_info()
42 | def get_info(self):
43 | self.statue=0
44 | while True:
45 | try:
46 | html=self.session.get(self.url,headers=self.headers,timeout=5).text
47 | soup=BeautifulSoup(html)
48 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split())
49 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','')
50 | try:
51 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'acrCustomerReviewLink'}).get_text())))
52 | except:
53 | self.previews=0
54 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires')
55 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content
56 | self.statue=1
57 | break
58 | except:
59 | break
60 |
61 | class Main():
62 | def __init__(self):
63 | self.f=xlwt.Workbook()
64 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True)
65 | self.count=0
66 | def work(self):
67 | keyword=input("输入关键字(英文):")
68 | page=input("输入页数:")
69 | for i in range(int(page)):
70 | try:
71 | work=get_urls(i+1,keyword)
72 | urls=work.get_url()
73 | except:
74 | continue
75 | for url in urls:
76 | item=get_infor(url)
77 | if item.statue==0:
78 | continue
79 | with open(str(self.count)+item.picture_url[-4:],'wb') as img:
80 | img.write(item.picture)
81 | img.close()
82 | self.sheet.write(self.count,0,str(self.count))
83 | self.sheet.write(self.count,1,item.title)
84 | self.sheet.write(self.count,2,item.price)
85 | self.sheet.write(self.count,3,item.previews)
86 | self.count+=1
87 | self.f.save('data.xls')
88 | print(self.count)
89 |
90 | if __name__=='__main__':
91 | work=Main()
92 | work.work()
93 |
--------------------------------------------------------------------------------
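
get_url() in both amazon scripts digs product links out of the raw HTML with regular expressions; items_usa.py keys its pattern to the a-link-normal a-text-normal link classes. The same extraction can be written with BeautifulSoup, which is already imported; the sketch below assumes html is the fetched result-page text and that Amazon's result markup still carries those class names:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select('a.a-link-normal.a-text-normal')     # the classes the regex targets
    urls = list({a['href'] for a in links
                 if a.get('href', '').startswith('http')})   # dedupe, absolute links only
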
/anjuke/get_house.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 | class get_infor():
9 | def __init__(self,url):
10 | self.url=url
11 | self.session=requests.session()
12 | self.headers = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Language': 'en-US,en;q=0.5',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'DNT': 1,
18 | 'Connection': 'keep-alive'}
19 | def work(self):
20 | html=self.session.get(self.url,headers=self.headers).text
21 | self.statue=0
22 | soup=BeautifulSoup(html)
23 | self.price=soup.find('div',attrs={'class':'comm-cont'}).find('p',attrs={'class':'mag-b2'}).get_text().replace('\n','').replace(' ','')
24 | table=soup.find('div',attrs={'class':'comm-list clearfix'}).find_all('dl')
25 | self.infortable=[]
26 | for i in table:
27 | lists=i.find_all('dd')
28 | for item in lists:
29 | self.infortable.append(item.get_text().replace('\n','').replace(' ',''))
30 | self.statue=1
31 |
32 |
33 | class get_urls():
34 | def __init__(self,url):
35 | self.url=url
36 | self.session=requests.session()
37 | self.headers = {
38 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 | 'Accept-Language': 'en-US,en;q=0.5',
41 | 'Accept-Encoding': 'gzip, deflate',
42 | 'DNT': 1,
43 | 'Connection': 'keep-alive'}
44 | def run(self):
45 | html=self.session.get(self.url,headers=self.headers).text
46 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'class':'pL'}).find('ul').find_all('li')
47 | urls=[]
48 | for item in lists:
49 | urls.append(item.find('a').get('href'))
50 | return urls
51 |
52 | class Main():
53 | def work(self):
54 | self.f=xlwt3.Workbook()
55 | self.sheet=self.f.add_sheet('sheet')
56 | self.count=0
57 | for page in range(338):
58 | get_url=get_urls('http://shanghai.anjuke.com/community/W0QQp1Z7QQp'+'Z'+str(page+1))
59 | print(page)
60 | urls=get_url.run()
61 | for url in urls:
62 | item=get_infor(url)
63 | item.work()
64 | if item.statue==0:
65 | continue
66 | self.sheet.write(self.count,0,'浦东')
67 | num=1
68 | for infor in item.infortable:
69 | self.sheet.write(self.count,num,infor)
70 | num+=1
71 | self.sheet.write(self.count,num,item.price)
72 | num+=1
73 | self.sheet.write(self.count,num,url)
74 | self.count+=1
75 | self.f.save('data.xls')
76 | def test():
77 | test=get_infor('http://shanghai.anjuke.com/community/view/106')
78 | test.work()
79 |
80 |
81 | if __name__=='__main__':
82 | work=Main()
83 | work.work()
84 |
--------------------------------------------------------------------------------
/dianping/get_info.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 | class get_infor():
9 | def __init__(self,url):
10 | self.url=url
11 | self.session=requests.session()
12 | self.headers = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Language': 'en-US,en;q=0.5',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'DNT': 1,
18 | 'Connection': 'keep-alive'}
19 | def work(self):
20 | html=self.session.get(self.url,headers=self.headers).text
21 | self.statue=0
22 | infor=BeautifulSoup(html,'lxml').find('div',attrs={'class':'main'}).find('div',attrs={'id':'basic-info'})
23 | try:
24 | self.title=infor.find('h1').get_text().replace('\n','').replace(' ','')
25 | except:
26 | return
27 | try:
28 | self.area=BeautifulSoup(html,'lxml').find('div',attrs={'class':'breadcrumb'}).find_all('a')[2].get_text().replace('\n','').replace(' ','')
29 | except:
30 | self.area=''
31 | try:
32 | self.address=infor.find('div',attrs={'class':'expand-info address'}).get_text().replace('\n','').replace(' ','')
33 | except:
34 | self.address=' '
35 | try:
36 | self.tel=infor.find('span',attrs={'itemprop':'tel'}).get_text()
37 | except:
38 | self.tel=' '
39 | table=infor.find('div',attrs={'class':'other J-other Hide'}).find_all('p')
40 | self.price=''
41 | self.times=''
42 | for item in table:
43 | try:
44 | if(item.find('span').get_text()=='营业时间:'):
45 | self.times=item.get_text().replace('\n','').replace(' ','').replace('修改','')
46 | except:
47 | continue
48 | table=infor.find('div',attrs={'class':'brief-info'}).find_all('span')
49 | for item in table:
50 | try:
51 | if(item.get_text()[:2]=='人均' or item.get_text()[:2]=='费用' or item.get_text()[:2]=='均价'):
52 | self.price=item.get_text().replace('\n','').replace(' ','')
53 | except:
54 | continue
55 | if self.price=='':
56 | self.price='--'
57 | if self.times=='':
58 | self.times='--'
59 | self.statue=1
60 |
61 |
62 | class get_urls():
63 | def __init__(self,url):
64 | self.url=url
65 | self.session=requests.session()
66 | self.headers = {
67 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
68 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 | 'Accept-Language': 'en-US,en;q=0.5',
70 | 'Accept-Encoding': 'gzip, deflate',
71 | 'DNT': 1,
72 | 'Cookie':'showNav=#nav-tab|0|1; navCtgScroll=0; _hc.v="\"23f85427-5787-47bd-9df4-4e831c7a4cae.1442049973\""; __utma=1.649416466.1442049979.1442049979.1442049979.1; __utmz=1.1442049979.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; cy=1; cye=shanghai; s_ViewType=10; aburl=1; JSESSIONID=95881D627CA4C940D686AD118D776232; PHOENIX_ID=0a0308bc-14fda0ead2e-4e713c',
73 | 'Connection': 'keep-alive'}
74 | def run(self):
75 | html=self.session.get(self.url,headers=self.headers).text
76 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'id':'shop-all-list'}).find_all('li')
77 | urls=[]
78 | for item in lists:
79 | urls.append('http://www.dianping.com'+item.find('a').get('href'))
80 | return urls
81 |
82 | class Main():
83 | def work(self):
84 | self.f=xlwt3.Workbook()
85 | self.sheet=self.f.add_sheet('sheet')
86 | self.count=0
87 | for page in range(50):
88 | get_url=get_urls('http://www.dianping.com/search/category/1/20/g187r12'+'p'+str(page+1))
89 | print(page)
90 | urls=get_url.run()
91 | for url in urls:
92 | try:
93 | item=get_infor(url)
94 | item.work()
95 | except:
96 | continue
97 | if item.statue==0:
98 | continue
99 | self.sheet.write(self.count,0,'购物')
100 | self.sheet.write(self.count,1,'超市便利店')
101 | self.sheet.write(self.count,2,'闵行')
102 | self.sheet.write(self.count,3,item.area)
103 | self.sheet.write(self.count,4,item.title)
104 | self.sheet.write(self.count,5,item.address)
105 | self.sheet.write(self.count,6,item.tel)
106 | self.sheet.write(self.count,7,item.price)
107 | self.sheet.write(self.count,8,item.times)
108 | self.sheet.write(self.count,9,url)
109 | self.count+=1
110 | self.f.save('data.xls')
111 |
112 | def test():
113 | test=get_infor('http://www.dianping.com/shop/1909912')
114 | test.work()
115 | print(test.times)
116 | print(test.price)
117 | if __name__=='__main__':
118 | work=Main()
119 | work.work()
120 |
--------------------------------------------------------------------------------
/douban/dou_movie.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import time
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import os
7 | import sqlite3
8 |
9 | class Douban():
10 | def __init__(self):
11 | self.session=requests.session()
12 | self.headers = {
13 | 'Host': 'movie.douban.com',
14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
16 | 'Accept-Language': 'en-US,en;q=0.5',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'DNT': 1,
19 | 'Connection': 'keep-alive'}
20 | self.session.get('http://www.douban.com',headers=self.headers)
21 | self.count=0
22 |
23 | def work(self):
24 | self.get_urls('http://www.douban.com/tag/%E5%89%A7%E6%83%85/movie',0)
25 | self.get_urls('http://www.douban.com/tag/%E5%8A%A8%E7%94%BB/movie',1)
26 | self.get_urls('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie',2)
27 | self.get_urls('http://www.douban.com/tag/%E6%83%8A%E6%82%9A/movie',3)
28 | self.get_urls('http://www.douban.com/tag/%E6%82%AC%E7%96%91/movie',4)
29 | self.get_urls('http://www.douban.com/tag/cult/movie',5)
30 | self.get_urls('http://www.douban.com/tag/%E6%81%90%E6%80%96/movie',6)
31 | self.get_urls('http://www.douban.com/tag/%E6%9A%B4%E5%8A%9B/movie',7)
32 | self.get_urls('http://www.douban.com/tag/%E9%BB%91%E5%B8%AE/movie',8)
33 |
34 | def get_urls(self,url,types):
35 |         dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db','kongbu_urls.db','baoli_urls.db','heibang_urls.db']
36 | db=dbs[types]
37 | if os.path.isfile(db):
38 | conn = sqlite3.connect(db)
39 | cursor=conn.cursor()
40 | else:
41 | conn=sqlite3.connect(db)
42 | cursor=conn.cursor()
43 | cursor.execute("create table urls(url varchar(40) primary key)")
44 | urls=self.get_url(url)
45 | for i in urls:
46 | try:
47 | cursor.execute("insert into urls(url) values (?)",(i,))
48 | except:
49 | continue
50 | cursor.close()
51 | conn.commit()
52 | conn.close()
53 | print(db+' OK')
54 |
55 | def get_url(self,url):
56 | num=0
57 | urls=[]
58 | while True:
59 | time.sleep(2)
60 | try:
61 | html=self.session.get(url+'?start='+str(num)).text
62 | except:
63 | break
64 | try:
65 | table=BeautifulSoup(html).find('div',attrs={'class':'mod movie-list'}).find_all('dl')
66 | except:
67 | break
68 | if table==[]:
69 | break
70 | for i in table:
71 | urls.append(i.find('a').get('href'))
72 | num+=15
73 | return urls
74 |
75 | def run(self):
76 | for i in range(9):
77 | self.get_text(i)
78 |
79 | def get_text(self,num):
80 |         dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db','kongbu_urls.db','baoli_urls.db','heibang_urls.db']
81 | conn = sqlite3.connect(dbs[num])
82 | cursor = conn.execute("SELECT url from urls")
83 | file_text=open(dbs[num].replace('_urls.db','.txt'),'w',encoding='utf-8')
84 | for row in cursor:
85 | time.sleep(2)
86 | try:
87 | text=self.spider(row[0])
88 | except:
89 | continue
90 | file_text.write(text+'\n\n')
91 | print(self.count)
92 | self.count+=1
93 | cursor.close()
94 | conn.commit()
95 | conn.close()
96 | file_text.close()
97 |
98 | def spider(self, url):
99 | html = requests.get(url, headers=self.headers).text
100 | soup = BeautifulSoup(html)
101 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text()
102 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src')
103 | picture='[img]'+picture+'[/img]'
104 | text=name+'\n'
105 | text+=picture+'\n'
106 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text()
107 | text+=info
108 | intro=soup.find('div',attrs={'class':'related-info'}).get_text()
109 | text+=intro
110 | return text
111 |
112 | def test():
113 | work = Douban()
114 | #work.get_url('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie')
115 | print(work.spider('http://movie.douban.com/subject/3592854/?from=tag_all'))
116 |
117 | if __name__ == '__main__':
118 | work = Douban()
119 | work.work()
120 | work.run()
121 |
--------------------------------------------------------------------------------
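
get_urls() deduplicates movie links by declaring url a PRIMARY KEY and swallowing the resulting IntegrityError on a repeat insert. SQLite can do that filtering itself with INSERT OR IGNORE; an equivalent sketch for one tag database, where the example URL stands in for the list returned by get_url():

    import sqlite3

    urls = ['http://movie.douban.com/subject/3592854/']     # e.g. what get_url() returns
    conn = sqlite3.connect('juqing_urls.db')
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS urls(url varchar(40) PRIMARY KEY)")
    cursor.executemany("INSERT OR IGNORE INTO urls(url) VALUES (?)",
                       [(u,) for u in urls])                 # duplicates are skipped, not raised
    conn.commit()
    conn.close()
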
/douban/dou_tv.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import time
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import os
7 | import sqlite3
8 |
9 | class Douban():
10 | def __init__(self):
11 | self.session=requests.session()
12 | self.headers = {
13 | 'Host': 'movie.douban.com',
14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
16 | 'Accept-Language': 'en-US,en;q=0.5',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'DNT': 1,
19 | 'Connection': 'keep-alive'}
20 | self.count=0
21 | self.session.get('http://www.douban.com',headers=self.headers)
22 |
23 | def work(self):
24 | if os.path.isfile('tv_urls.db'):
25 | conn = sqlite3.connect('tv_urls.db')
26 | cursor=conn.cursor()
27 | else:
28 | conn=sqlite3.connect('tv_urls.db')
29 | cursor=conn.cursor()
30 | cursor.execute("create table urls(url varchar(40) primary key)")
31 | urls=self.get_url('http://movie.douban.com/tag/%E7%94%B5%E8%A7%86%E5%89%A7')
32 | for i in urls:
33 | try:
34 | cursor.execute("insert into urls(url) values (?)",(i,))
35 | except:
36 | continue
37 | cursor.close()
38 | conn.commit()
39 | conn.close()
40 | print('OK')
41 |
42 | def get_url(self,url):
43 | num=0
44 | urls=[]
45 | while True:
46 | time.sleep(2)
47 | try:
48 | html=self.session.get(url+'?start='+str(num)+'&type=T').text
49 | except:
50 | break
51 | try:
52 | table=BeautifulSoup(html).find('div',attrs={'class':'article'}).find('div',attrs={'class':''}).find_all('table')
53 | except:
54 | break
55 | if table==[]:
56 | break
57 | for i in table:
58 | urls.append(i.find('a',attrs={'class':'nbg'}).get('href'))
59 | num+=20
60 | return urls
61 |
62 | def get_text(self):
63 | conn = sqlite3.connect('tv_urls.db')
64 | cursor = conn.execute("SELECT url from urls")
65 | file_text=open('tv.txt','w',encoding='utf-8')
66 | for row in cursor:
67 | time.sleep(2)
68 | try:
69 | text=self.spider(row[0])
70 | except:
71 | continue
72 | file_text.write(text+'\n\n')
73 | print(self.count)
74 | self.count+=1
75 | cursor.close()
76 | conn.commit()
77 | conn.close()
78 | file_text.close()
79 |
80 | def spider(self, url):
81 | html = requests.get(url, headers=self.headers).text
82 | soup = BeautifulSoup(html)
83 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text()
84 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src')
85 | picture='[img]'+picture+'[/img]'
86 | text=name+'\n'
87 | text+=picture+'\n'
88 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text()
89 | text+=info
90 | intro=soup.find('div',attrs={'class':'related-info'}).get_text()
91 | text+=intro
92 | return text
93 |
94 | if __name__=='__main__':
95 | work=Douban()
96 | work.work()
97 | work.get_text()
98 |
--------------------------------------------------------------------------------
/guimi/guimi.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | from bs4 import BeautifulSoup
4 | import requests
5 | import time
6 | import re
7 | import jieba
8 | import sys
9 | import jieba.analyse
10 | import xlwt
11 |
12 | class Urls_get():
13 | def __init__(self,url):
14 | self.url=url
15 | self.headers = {
16 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
17 | "Accept-Encoding": "gzip, deflate",
18 | "Accept-Language": "en-US,en;q=0.5",
19 | "Connection": "keep-alive",
20 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
21 | self.session=requests.session()
22 | print('Get items')
23 | self.item_urls=[]
24 | self.get_urls()
25 |
26 | def get_urls(self):
27 | html=self.session.get(self.url,headers=self.headers).text
28 | rel=r'class="txtBg" href="(.*?)" target='
29 | rel=re.compile(rel)
30 | lists=re.findall(rel,html)
31 | for i in lists:
32 | if(i.endswith('review')):
33 | continue
34 | if(i.endswith('product')):
35 | continue
36 | self.item_urls.append(i)
37 | count=2
38 | while(True):
39 | time.sleep(1)
40 | html=self.session.get(self.url+'&page='+str(count),headers=self.headers).text
41 | lists=re.findall(rel,html)
42 | if lists:
43 | for i in lists:
44 | if(i.endswith('review')):
45 | continue
46 | if(i.endswith('product')):
47 | continue
48 | self.item_urls.append(i)
49 | count+=1
50 | else:
51 | break
52 |
53 | class Word_frequency():
54 | def __init__(self,file_name):
55 | self.file_name=file_name
56 | self.analyse()
57 |
58 | def analyse(self):
59 | content=open(self.file_name,'r').read()
60 | '''
61 | text=jieba.analyse.extract_tags(content, topK=50, withWeight=True, allowPOS=('adj'))
62 | '''
63 | text=jieba.analyse.textrank(content, topK=50, withWeight=True, allowPOS=('adj'))
64 | f=xlwt.Workbook()
65 | sheet=f.add_sheet('sheet')
66 | count=0
67 | for i in text:
68 | sheet.write(count,0,i[0])
69 | sheet.write(count,1,i[1])
70 | count+=1
71 | f.save('fenghua.xls')
72 |
73 | class Review_get():
74 | def __init__(self,urls):
75 | self.urls=urls
76 | self.headers = {
77 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
78 | "Accept-Encoding": "gzip, deflate",
79 | "Accept-Language": "en-US,en;q=0.5",
80 | "Connection": "keep-alive",
81 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
82 | self.session=requests.session()
83 | print('Get reviews')
84 | self.reviews=[]
85 | self.write_review()
86 |
87 | def write_review(self):
88 | for i in self.urls:
89 | self.reviews+=self.spider(i)
90 | file=open('fenghua.txt','w')
91 | for i in self.reviews:
92 | file.write(i+'\n')
93 |         file.close()
94 |
95 | def spider(self,url):
96 | count=1
97 | review=[]
98 | while(True):
99 | time.sleep(0.1)
100 | html=self.session.get(url+ str(count),headers=self.headers).content
101 | soup=BeautifulSoup(html).find_all('p',attrs={'class':'com_p'})
102 | if soup:
103 | for i in soup:
104 | review.append(i.get_text())
105 | count+=1
106 | else:
107 | return review
108 | def Main():
109 | item_get=Urls_get('http://so.kimiss.com/?keyword=%B7%E4%BB%A8%BB%A4%B7%A2%CB%D8&idx=10')
110 | urls=item_get.item_urls
111 | review_get=Review_get(urls)
112 | Analyse=Word_frequency('fenghua.txt')
113 |
114 | if __name__=='__main__':
115 | Main()
116 |
--------------------------------------------------------------------------------
/ingredient/get_infor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 | import xlrd
8 | import threading
9 | import random
10 | import time
11 |
12 | class Get_ip():
13 | def __init__(self,num):
14 | self.headers = {
15 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Language': 'en-US,en;q=0.5',
18 | 'Accept-Encoding': 'gzip, deflate',
19 | 'Connection': 'keep-alive'}
20 | self.url='http://vxer.daili666api.com/ip/?tid=559950660678689&num='+str(num)+'&delay=3&category=2'
21 | self.session=requests.session()
22 | def get(self):
23 | ip=self.session.get(self.url,headers=self.headers).text.replace('\n','')
24 | return ip
25 |
26 | class Get_infor(threading.Thread):
27 | def __init__(self,score,english_name):
28 | super(Get_infor, self).__init__()
29 | self.session=requests.session()
30 | self.headers = {
31 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
32 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
33 | 'Accept-Language': 'en-US,en;q=0.5',
34 | 'Accept-Encoding': 'gzip, deflate',
35 | 'Connection': 'keep-alive'}
36 | self.f=xlwt3.Workbook()
37 | self.sheet=self.f.add_sheet('sheet')
38 | self.count=0
39 | self.score=score
40 | self.english_name=english_name
41 |
42 | def run(self):
43 | self.statue=1
44 | try:
45 | html=self.session.get('http://www.cosdna.com/chs/stuff.php?q='+self.english_name,headers=self.headers).text
46 | except:
47 | self.statue=0
48 | return
49 | try:
50 | url=BeautifulSoup(html).find('div',attrs={'class':'StuffResult'}).find('tr').find('a').get('href')
51 | self.infor('http://www.cosdna.com/chs/'+url)
52 | except:
53 | try:
54 | self.infor('http://www.cosdna.com/chs/stuff.php?q='+self.english_name)
55 | except:
56 | self.statue=0
57 |
58 | def infor(self,url):
59 | html=self.session.get(url,headers=self.headers).text
60 | infor_table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'StuffDetail'})
61 | try:
62 | self.chinese_name=infor_table.find('div',attrs={'class':'Stuff_DetailC'}).get_text()
63 | except:
64 | self.statue=0
65 | return
66 | rel='r/>(.*?)