├── 1688
│   └── get_tel.py
├── 11选5
│   └── chart.py
├── README.md
├── amazon
│   ├── get_items.py
│   └── items_usa.py
├── anjuke
│   └── get_house.py
├── dianping
│   └── get_info.py
├── douban
│   ├── dou_movie.py
│   └── dou_tv.py
├── guimi
│   └── guimi.py
├── ingredient
│   ├── get_infor.py
│   └── get_ingre.py
├── itslaw
│   └── get_anli.py
├── job
│   ├── Job_get.py
│   └── REANME.md
├── www.aihuishou.com
│   └── get_price.py
├── www.hexun.com
│   └── hexun.py
├── www.liepin.com
│   └── liepin.py
├── www.renrendai.com
│   └── renrendai.py
├── www.yanglao.com.cn
│   └── get_infor.py
└── www.zimuzu.tv
    ├── movie_get.py
    └── tv_get.py
/11选5/chart.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import xlwt3
6 | import os
7 | import time
8 |
9 | class Get_infor():
10 | def __init__(self):
11 | self.headers = {
12 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
13 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
14 | 'Accept-Language': 'en-US,en;q=0.5',
15 | 'Accept-Encoding': 'gzip, deflate',
16 | 'Connection': 'keep-alive'}
17 | self.urls={'北京11选5': 'http://pub.icaile.com/bj11x5kjjg.php', '新疆11选5': 'http://pub.icaile.com/xj11x5kjjg.php', '湖北11选5': 'http://pub.icaile.com/hb11x5kjjg.php', '江西11选5': 'http://pub.icaile.com/jx11x5kjjg.php', '山西11选5': 'http://pub.icaile.com/sx11x5kjjg.php', '宁夏11选5': 'http://pub.icaile.com/nx11x5kjjg.php', '辽宁11选5': 'http://pub.icaile.com/ln11x5kjjg.php', '贵州11选5': 'http://pub.icaile.com/gz11x5kjjg.php', '云南11选5': 'http://pub.icaile.com/yn11x5kjjg.php', '西藏11选5': 'http://pub.icaile.com/xz11x5kjjg.php', '重庆11选5': 'http://pub.icaile.com/cq11x5kjjg.php', '吉林11选5': 'http://pub.icaile.com/jl11x5kjjg.php', '黑龙江11选5': 'http://pub.icaile.com/hlj11x5kjjg.php', '河南11选5': 'http://pub.icaile.com/hn11x5kjjg.php', '上海11选5': 'http://pub.icaile.com/sh11x5kjjg.php', '广东11选5': 'http://pub.icaile.com/gd11x5kjjg.php', '四川11选5': 'http://pub.icaile.com/sc11x5kjjg.php', '山东11选5': 'http://pub.icaile.com/sd11x5kjjg.php', '安徽11选5': 'http://pub.icaile.com/ah11x5kjjg.php', '浙江11选5': 'http://pub.icaile.com/zj11x5kjjg.php', '江苏11选5': 'http://pub.icaile.com/js11x5kjjg.php', '内蒙古11选5': 'http://pub.icaile.com/nmg11x5kjjg.php', '甘肃11选5': 'http://pub.icaile.com/gs11x5kjjg.php', '福建11选5': 'http://pub.icaile.com/fj11x5kjjg.php', '河北11选5': 'http://pub.icaile.com/heb11x5kjjg.php', '广西11选5': 'http://pub.icaile.com/gx11x5kjjg.php', '天津11选5': 'http://pub.icaile.com/tj11x5kjjg.php', '陕西11选5': 'http://pub.icaile.com/shx11x5kjjg.php'}
18 | def run(self):
19 | try:
20 | os.mkdir('data')
21 | except:
22 | print('..')
23 | for key in self.urls:
24 | try:
25 | html=requests.get(self.urls[key],headers=self.headers).text
26 | except:
27 | continue
28 | table=BeautifulSoup(html,'html.parser').find('table',attrs={'class':'today'}).find_all('tr')
29 | self.f=xlwt3.Workbook()
30 | self.sheet=self.f.add_sheet('sheet')
31 | self.count=0
32 | for item in table:
33 | try:
34 | infor=item.find_all('td')
35 | self.sheet.write(self.count,0,infor[0].get_text())
36 | num=1
37 | for i in infor[2].find_all('em'):
38 | self.sheet.write(self.count,num,i.get_text())
39 | num+=1
40 | self.count+=1
41 | except:
42 | continue
43 | self.f.save('data/'+key+'.xls')
44 | def test():
45 | html=requests.get('http://pub.icaile.com/sd11x5kjjg.php').text
46 | table=BeautifulSoup(html).find('div',attrs={'class':'left-nav'}).find('ul').find_all('li')
47 | urls={}
48 | for i in table:
49 | urls[i.get_text()]=i.find('a').get('href')
50 | print(urls)
51 |
52 | if __name__=='__main__':
53 | print('1.直接抓取')
54 | print('2.定时抓取')
55 | num=input('输入序号:')
56 | if(num=='1'):
57 | work=Get_infor()
58 | work.run()
59 | print('OK')
60 | elif(num=='2'):
61 | times=input('输入间隔时间(小时):')
62 | while True:
63 | work=Get_infor()
64 | work.run()
65 | print('OK')
66 | time.sleep(float(times)*3600)
67 |
--------------------------------------------------------------------------------
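
chart.py drives its Excel output through xlwt3, a Python 3 port of xlwt that has been unmaintained for years. The maintained xlwt package (1.0 and later) runs on Python 3 and exposes the same Workbook/add_sheet/write/save calls used in run(); a minimal sketch of writing one draw row under that assumed substitution (the values are placeholders):

    import os
    import xlwt  # assumed stand-in for xlwt3; the calls below are identical in both packages

    os.makedirs('data', exist_ok=True)          # same output folder that run() creates with os.mkdir
    f = xlwt.Workbook()
    sheet = f.add_sheet('sheet')
    sheet.write(0, 0, '20150812-01')            # draw number goes in column 0, as in run()
    for col, ball in enumerate(['01', '04', '07', '09', '11'], start=1):
        sheet.write(0, col, ball)               # the five drawn numbers fill columns 1-5
    f.save('data/北京11选5.xls')
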
/1688/get_tel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import re
7 | import random
8 | import xlwt3
9 |
10 | class Get_ip(object):
11 | """docstring for Get_ip"""
12 | def __init__(self):
13 | super(Get_ip, self).__init__()
14 | self.url='http://www.xicidaili.com/nn/'
15 | self.headers = {
16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Accept-Encoding': 'gzip, deflate',
20 | 'Connection': 'keep-alive'}
21 | self.session=requests.session()
22 | def run(self):
23 | html=self.session.get(self.url,headers=self.headers).text
24 | table=BeautifulSoup(html).find('table',attrs={'id':'ip_list'}).find_all('tr')
25 | http_ips=[]
26 | for item in table[1:]:
27 | lists=item.find_all('td')
28 | ip={'ip':'','port':''}
29 | if lists[6].get_text()=='HTTP':
30 | ip['ip']=lists[2].get_text()
31 | ip['port']=lists[3].get_text()
32 | http_ips.append(ip)
33 | return http_ips
34 |
35 | class get_urls():
36 | def __init__(self,url,page,ip):
37 | self.page=page
38 | self.url=url
39 | self.proxies={
40 | 'http':'http://'+ip['ip']+':'+ip['port']
41 | }
42 | def get_url(self):
43 | headers = {
44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | 'Accept-Language': 'en-US,en;q=0.5',
47 | 'Accept-Encoding': 'gzip, deflate',
48 |
49 | 'Connection': 'keep-alive'}
50 | html=requests.get(self.url+'&beginPage='+str(self.page),headers=headers,proxies=self.proxies,timeout=10).text
51 | soup=BeautifulSoup(html)
52 | table=soup.find('div',attrs={'id':'sw_mod_mainblock'}).find('ul').find_all('div',attrs={'class':'list-item-left'})
53 | urls=[]
54 | for item in table:
55 | urls.append(item.find('a').get('href'))
56 | return urls
57 |
58 | class get_contact():
59 | def __init__(self,url,ip):
60 | #super(get_contact, self).__init__()
61 | self.proxies={
62 | 'http':'http://'+ip['ip']+':'+ip['port']
63 | }
64 | self.url=url
65 | self.headers = {
66 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
67 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
68 | 'Accept-Language': 'en-US,en;q=0.5',
69 |
70 | 'Accept-Encoding': 'gzip, deflate'
71 | }
72 | self.session=requests.session()
73 | def run(self):
74 | try:
75 | html=self.session.get(self.url,headers=self.headers,proxies=self.proxies,timeout=10).text
76 | contact_url=BeautifulSoup(html).find('div',attrs={'class':'top-nav-bar-box'}).find('li',attrs={'data-page-name':'contactinfo'}).find('a').get('href')
77 | except:
78 | self.statue=0
79 | print('~~~')
80 | return
81 | self.statue=1
82 | try:
83 | #time.sleep(random.randint(4, 6))
84 | html=self.session.get(contact_url,headers=self.headers,proxies=self.proxies,timeout=10).text
85 | table=BeautifulSoup(html).find('div',attrs={'class':'fd-line'}).find_all('dl')
86 | self.title=BeautifulSoup(html).find('div',attrs={'class':'contact-info'}).find('h4').get_text()
87 | self.infor=[]
88 | for item in table[:-1]:
89 | self.infor.append(item.get_text().replace('\n','').replace(' ',''))
90 | except:
91 | self.statue=0
92 |
93 | class Main():
94 | def __init__(self):
95 | self.f=xlwt3.Workbook()
96 | self.sheet=self.f.add_sheet('sheet')
97 | self.count=0
98 | work=Get_ip()
99 | self.ips=work.run()
100 | def work(self):
101 | search_url=input('输入链接:')
102 | for i in range(100):
103 | url_get=get_urls(search_url,i+1,self.ips[random.randint(0, len(self.ips)-1)])
104 | try:
105 | urls=url_get.get_url()
106 | except:
107 | continue
108 | for url in urls:
109 | #time.sleep(random.randint(6, 9))
110 | spider=get_contact(url,self.ips[random.randint(0, len(self.ips)-1)])
111 | spider.run()
112 | if spider.statue==0:
113 | continue
114 | self.sheet.write(self.count,0,spider.title)
115 | num=1
116 | for infor in spider.infor:
117 | self.sheet.write(self.count,num,infor)
118 | num+=1
119 | self.count+=1
120 | print(self.count)
121 | self.f.save('data.xls')
122 | #time.sleep(random.randint(5, 8))
123 | def test():
124 |     test=get_urls('http://s.1688.com/company/company_search.htm?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30',1,Get_ip().run()[0])  # get_urls() also needs a proxy dict as its third argument
125 | print(test.get_url())
126 |
127 | if __name__=='__main__':
128 | work=Main()
129 | work.work()
130 |
--------------------------------------------------------------------------------
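
Main.work() wires the three classes above together: Get_ip scrapes an HTTP proxy list from xicidaili.com, get_urls walks the 1688 company-search result pages through one of those proxies, and get_contact pulls each company's contact page. A condensed usage sketch of that flow, run inside this module and assuming both sites still answer the way they did when the script was written:

    import random

    ips = Get_ip().run()                                   # [{'ip': '1.2.3.4', 'port': '8080'}, ...]
    search_url = ('http://s.1688.com/company/company_search.htm'
                  '?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30')
    page_urls = get_urls(search_url, 1, random.choice(ips)).get_url()   # company links on page 1
    for url in page_urls:
        spider = get_contact(url, random.choice(ips))
        spider.run()
        if spider.statue:                                  # 'statue' is the code's status flag
            print(spider.title, spider.infor)
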
/README.md:
--------------------------------------------------------------------------------
1 | # Nyspider
2 | A collection of assorted web crawlers.
3 | 
4 | 11选5---latest 11选5 (pick-5-of-11 lottery) draw results for each province
5 | 
6 | amazon---Amazon product information
7 | 
8 | anjuke---residential community listings from Anjuke
9 | 
10 | dianping---merchant information from Dianping
11 | 
12 | douban---Douban movie information
13 | 
14 | guimi---product reviews from Kimiss (闺蜜网)
15 | 
16 | ingredient---cosmetics ingredient information
17 | 
18 | itslaw---court case records from Itslaw (无讼网)
19 | 
20 | job---job postings from various recruitment sites
21 | 
22 | www.aihuishou.com ---phone trade-in prices from Aihuishou
23 | 
24 | www.yanglao.com.cn ---nursing-home listings from Yanglao (养老网)
25 | 
26 | www.zimuzu.tv ---ed2k links for movies and TV series
27 | 
28 | www.hexun.com ---Hexun, stock trade detail data
29 | 
30 | www.renrendai.com ---Renrendai, loan listing information
31 | 
--------------------------------------------------------------------------------
/amazon/get_items.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt
5 | from bs4 import BeautifulSoup
6 | import time
7 | import re
8 | import random
9 |
10 | class get_urls():
11 | def __init__(self,page,keyword):
12 | self.session=requests.session()
13 | self.page=page
14 | self.keyword=keyword
15 | def get_url(self):
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'DNT': 1,
22 | 'Connection': 'keep-alive'}
23 | html=self.session.get('http://www.amazon.cn/s/ref=sr_pg_3?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8&qid=1442030727',headers=headers).text
24 | table=BeautifulSoup(html).find('div',attrs={'id':'rightContainerATF'})
25 |         rel=''  # the original pattern was lost here; it should match one search-result item block per hit so an href can be pulled from each below
26 | table=re.findall(re.compile(rel),str(table))
27 | urls=[]
28 | rel='href="(.*?)"'
29 | rel=re.compile(rel)
30 | for item in table:
31 | url=re.findall(rel,str(item))
32 | try:
33 | urls.append(url[0])
34 | except:
35 | continue
36 | return urls
37 |
38 | class get_infor():
39 | def __init__(self,url):
40 | self.url=url
41 | self.session=requests.session()
42 | self.headers = {
43 | "X-Forwarded-For":str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255)),
44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | 'Accept-Language': 'en-US,en;q=0.5',
47 | 'Accept-Encoding': 'gzip, deflate',
48 | 'DNT': 1,
49 | 'Connection': 'keep-alive'}
50 | self.get_info()
51 | def get_info(self):
52 | self.statue=0
53 | while True:
54 | try:
55 | html=self.session.get(self.url,headers=self.headers,timeout=5).text
56 | soup=BeautifulSoup(html)
57 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split())
58 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','')
59 | try:
60 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'cmrs-atf'}).get_text())))
61 | except:
62 | self.previews=0
63 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires')
64 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content
65 | self.statue=1
66 | break
67 | except:
68 | break
69 |
70 | class Main():
71 | def __init__(self):
72 | self.f=xlwt.Workbook()
73 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True)
74 | self.count=0
75 | def work(self):
76 | keyword=input("输入关键字(英文):")
77 | page=input("输入页数:")
78 | for i in range(int(page)):
79 | try:
80 | work=get_urls(i+1,keyword)
81 | urls=work.get_url()
82 | except:
83 | continue
84 | for url in urls:
85 | item=get_infor(url)
86 | if item.statue==0:
87 | continue
88 | with open(str(self.count)+item.picture_url[-4:],'wb') as img:
89 | img.write(item.picture)
90 | img.close()
91 | self.sheet.write(self.count,0,str(self.count))
92 | self.sheet.write(self.count,1,item.title)
93 | self.sheet.write(self.count,2,item.price)
94 | self.sheet.write(self.count,3,item.previews)
95 | self.count+=1
96 | self.f.save('data.xls')
97 | print(self.count)
98 |
99 | if __name__=='__main__':
100 | work=Main()
101 | work.work()
102 |
--------------------------------------------------------------------------------
/amazon/items_usa.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt
5 | from bs4 import BeautifulSoup
6 | import time
7 | import re
8 | import random
9 |
10 | class get_urls():
11 | def __init__(self,page,keyword):
12 | self.session=requests.session()
13 | self.page=page
14 | self.keyword=keyword
15 | def get_url(self):
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'DNT': 1,
22 | 'Connection': 'keep-alive'}
23 | html=self.session.get('http://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8',headers=headers).text
24 | #table=BeautifulSoup(html).find('ul',attrs={'id':'s-results-list-atf'})
25 | rel='a class="a-link-normal a-text-normal" href="(http.*?)"'
26 | table=re.findall(re.compile(rel),str(html))
27 | urls=list(set(table))
28 | return urls
29 |
30 | class get_infor():
31 | def __init__(self,url):
32 | self.url=url
33 | self.session=requests.session()
34 | self.headers = {
35 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
36 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
37 | 'Accept-Language': 'en-US,en;q=0.5',
38 | 'Accept-Encoding': 'gzip, deflate',
39 | 'DNT': 1,
40 | 'Connection': 'keep-alive'}
41 | self.get_info()
42 | def get_info(self):
43 | self.statue=0
44 | while True:
45 | try:
46 | html=self.session.get(self.url,headers=self.headers,timeout=5).text
47 | soup=BeautifulSoup(html)
48 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split())
49 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','')
50 | try:
51 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'acrCustomerReviewLink'}).get_text())))
52 | except:
53 | self.previews=0
54 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires')
55 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content
56 | self.statue=1
57 | break
58 | except:
59 | break
60 |
61 | class Main():
62 | def __init__(self):
63 | self.f=xlwt.Workbook()
64 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True)
65 | self.count=0
66 | def work(self):
67 | keyword=input("输入关键字(英文):")
68 | page=input("输入页数:")
69 | for i in range(int(page)):
70 | try:
71 | work=get_urls(i+1,keyword)
72 | urls=work.get_url()
73 | except:
74 | continue
75 | for url in urls:
76 | item=get_infor(url)
77 | if item.statue==0:
78 | continue
79 | with open(str(self.count)+item.picture_url[-4:],'wb') as img:
80 | img.write(item.picture)
81 | img.close()
82 | self.sheet.write(self.count,0,str(self.count))
83 | self.sheet.write(self.count,1,item.title)
84 | self.sheet.write(self.count,2,item.price)
85 | self.sheet.write(self.count,3,item.previews)
86 | self.count+=1
87 | self.f.save('data.xls')
88 | print(self.count)
89 |
90 | if __name__=='__main__':
91 | work=Main()
92 | work.work()
93 |
--------------------------------------------------------------------------------
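
get_url() in both amazon scripts digs product links out of the raw HTML with regular expressions; items_usa.py keys its pattern to the a-link-normal a-text-normal link classes. The same extraction can be written with BeautifulSoup, which is already imported; the sketch below assumes html is the fetched result-page text and that Amazon's result markup still carries those class names:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select('a.a-link-normal.a-text-normal')     # the classes the regex targets
    urls = list({a['href'] for a in links
                 if a.get('href', '').startswith('http')})   # dedupe, absolute links only
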
/anjuke/get_house.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 | class get_infor():
9 | def __init__(self,url):
10 | self.url=url
11 | self.session=requests.session()
12 | self.headers = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Language': 'en-US,en;q=0.5',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'DNT': 1,
18 | 'Connection': 'keep-alive'}
19 | def work(self):
20 | html=self.session.get(self.url,headers=self.headers).text
21 | self.statue=0
22 | soup=BeautifulSoup(html)
23 | self.price=soup.find('div',attrs={'class':'comm-cont'}).find('p',attrs={'class':'mag-b2'}).get_text().replace('\n','').replace(' ','')
24 | table=soup.find('div',attrs={'class':'comm-list clearfix'}).find_all('dl')
25 | self.infortable=[]
26 | for i in table:
27 | lists=i.find_all('dd')
28 | for item in lists:
29 | self.infortable.append(item.get_text().replace('\n','').replace(' ',''))
30 | self.statue=1
31 |
32 |
33 | class get_urls():
34 | def __init__(self,url):
35 | self.url=url
36 | self.session=requests.session()
37 | self.headers = {
38 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 | 'Accept-Language': 'en-US,en;q=0.5',
41 | 'Accept-Encoding': 'gzip, deflate',
42 | 'DNT': 1,
43 | 'Connection': 'keep-alive'}
44 | def run(self):
45 | html=self.session.get(self.url,headers=self.headers).text
46 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'class':'pL'}).find('ul').find_all('li')
47 | urls=[]
48 | for item in lists:
49 | urls.append(item.find('a').get('href'))
50 | return urls
51 |
52 | class Main():
53 | def work(self):
54 | self.f=xlwt3.Workbook()
55 | self.sheet=self.f.add_sheet('sheet')
56 | self.count=0
57 | for page in range(338):
58 | get_url=get_urls('http://shanghai.anjuke.com/community/W0QQp1Z7QQp'+'Z'+str(page+1))
59 | print(page)
60 | urls=get_url.run()
61 | for url in urls:
62 | item=get_infor(url)
63 | item.work()
64 | if item.statue==0:
65 | continue
66 | self.sheet.write(self.count,0,'浦东')
67 | num=1
68 | for infor in item.infortable:
69 | self.sheet.write(self.count,num,infor)
70 | num+=1
71 | self.sheet.write(self.count,num,item.price)
72 | num+=1
73 | self.sheet.write(self.count,num,url)
74 | self.count+=1
75 | self.f.save('data.xls')
76 | def test():
77 | test=get_infor('http://shanghai.anjuke.com/community/view/106')
78 | test.work()
79 |
80 |
81 | if __name__=='__main__':
82 | work=Main()
83 | work.work()
84 |
--------------------------------------------------------------------------------
/dianping/get_info.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 | class get_infor():
9 | def __init__(self,url):
10 | self.url=url
11 | self.session=requests.session()
12 | self.headers = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Language': 'en-US,en;q=0.5',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'DNT': 1,
18 | 'Connection': 'keep-alive'}
19 | def work(self):
20 | html=self.session.get(self.url,headers=self.headers).text
21 | self.statue=0
22 | infor=BeautifulSoup(html,'lxml').find('div',attrs={'class':'main'}).find('div',attrs={'id':'basic-info'})
23 | try:
24 | self.title=infor.find('h1').get_text().replace('\n','').replace(' ','')
25 | except:
26 | return
27 | try:
28 | self.area=BeautifulSoup(html,'lxml').find('div',attrs={'class':'breadcrumb'}).find_all('a')[2].get_text().replace('\n','').replace(' ','')
29 | except:
30 | self.area=''
31 | try:
32 | self.address=infor.find('div',attrs={'class':'expand-info address'}).get_text().replace('\n','').replace(' ','')
33 | except:
34 | self.address=' '
35 | try:
36 | self.tel=infor.find('span',attrs={'itemprop':'tel'}).get_text()
37 | except:
38 | self.tel=' '
39 | table=infor.find('div',attrs={'class':'other J-other Hide'}).find_all('p')
40 | self.price=''
41 | self.times=''
42 | for item in table:
43 | try:
44 | if(item.find('span').get_text()=='营业时间:'):
45 | self.times=item.get_text().replace('\n','').replace(' ','').replace('修改','')
46 | except:
47 | continue
48 | table=infor.find('div',attrs={'class':'brief-info'}).find_all('span')
49 | for item in table:
50 | try:
51 | if(item.get_text()[:2]=='人均' or item.get_text()[:2]=='费用' or item.get_text()[:2]=='均价'):
52 | self.price=item.get_text().replace('\n','').replace(' ','')
53 | except:
54 | continue
55 | if self.price=='':
56 | self.price='--'
57 | if self.times=='':
58 | self.times='--'
59 | self.statue=1
60 |
61 |
62 | class get_urls():
63 | def __init__(self,url):
64 | self.url=url
65 | self.session=requests.session()
66 | self.headers = {
67 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
68 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 | 'Accept-Language': 'en-US,en;q=0.5',
70 | 'Accept-Encoding': 'gzip, deflate',
71 | 'DNT': 1,
72 | 'Cookie':'showNav=#nav-tab|0|1; navCtgScroll=0; _hc.v="\"23f85427-5787-47bd-9df4-4e831c7a4cae.1442049973\""; __utma=1.649416466.1442049979.1442049979.1442049979.1; __utmz=1.1442049979.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; cy=1; cye=shanghai; s_ViewType=10; aburl=1; JSESSIONID=95881D627CA4C940D686AD118D776232; PHOENIX_ID=0a0308bc-14fda0ead2e-4e713c',
73 | 'Connection': 'keep-alive'}
74 | def run(self):
75 | html=self.session.get(self.url,headers=self.headers).text
76 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'id':'shop-all-list'}).find_all('li')
77 | urls=[]
78 | for item in lists:
79 | urls.append('http://www.dianping.com'+item.find('a').get('href'))
80 | return urls
81 |
82 | class Main():
83 | def work(self):
84 | self.f=xlwt3.Workbook()
85 | self.sheet=self.f.add_sheet('sheet')
86 | self.count=0
87 | for page in range(50):
88 | get_url=get_urls('http://www.dianping.com/search/category/1/20/g187r12'+'p'+str(page+1))
89 | print(page)
90 | urls=get_url.run()
91 | for url in urls:
92 | try:
93 | item=get_infor(url)
94 | item.work()
95 | except:
96 | continue
97 | if item.statue==0:
98 | continue
99 | self.sheet.write(self.count,0,'购物')
100 | self.sheet.write(self.count,1,'超市便利店')
101 | self.sheet.write(self.count,2,'闵行')
102 | self.sheet.write(self.count,3,item.area)
103 | self.sheet.write(self.count,4,item.title)
104 | self.sheet.write(self.count,5,item.address)
105 | self.sheet.write(self.count,6,item.tel)
106 | self.sheet.write(self.count,7,item.price)
107 | self.sheet.write(self.count,8,item.times)
108 | self.sheet.write(self.count,9,url)
109 | self.count+=1
110 | self.f.save('data.xls')
111 |
112 | def test():
113 | test=get_infor('http://www.dianping.com/shop/1909912')
114 | test.work()
115 | print(test.times)
116 | print(test.price)
117 | if __name__=='__main__':
118 | work=Main()
119 | work.work()
120 |
--------------------------------------------------------------------------------
/douban/dou_movie.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import time
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import os
7 | import sqlite3
8 |
9 | class Douban():
10 | def __init__(self):
11 | self.session=requests.session()
12 | self.headers = {
13 | 'Host': 'movie.douban.com',
14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
16 | 'Accept-Language': 'en-US,en;q=0.5',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'DNT': 1,
19 | 'Connection': 'keep-alive'}
20 | self.session.get('http://www.douban.com',headers=self.headers)
21 | self.count=0
22 |
23 | def work(self):
24 | self.get_urls('http://www.douban.com/tag/%E5%89%A7%E6%83%85/movie',0)
25 | self.get_urls('http://www.douban.com/tag/%E5%8A%A8%E7%94%BB/movie',1)
26 | self.get_urls('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie',2)
27 | self.get_urls('http://www.douban.com/tag/%E6%83%8A%E6%82%9A/movie',3)
28 | self.get_urls('http://www.douban.com/tag/%E6%82%AC%E7%96%91/movie',4)
29 | self.get_urls('http://www.douban.com/tag/cult/movie',5)
30 | self.get_urls('http://www.douban.com/tag/%E6%81%90%E6%80%96/movie',6)
31 | self.get_urls('http://www.douban.com/tag/%E6%9A%B4%E5%8A%9B/movie',7)
32 | self.get_urls('http://www.douban.com/tag/%E9%BB%91%E5%B8%AE/movie',8)
33 |
34 | def get_urls(self,url,types):
35 |         dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db','kongbu_urls.db','baoli_urls.db','heibang_urls.db']
36 | db=dbs[types]
37 | if os.path.isfile(db):
38 | conn = sqlite3.connect(db)
39 | cursor=conn.cursor()
40 | else:
41 | conn=sqlite3.connect(db)
42 | cursor=conn.cursor()
43 | cursor.execute("create table urls(url varchar(40) primary key)")
44 | urls=self.get_url(url)
45 | for i in urls:
46 | try:
47 | cursor.execute("insert into urls(url) values (?)",(i,))
48 | except:
49 | continue
50 | cursor.close()
51 | conn.commit()
52 | conn.close()
53 | print(db+' OK')
54 |
55 | def get_url(self,url):
56 | num=0
57 | urls=[]
58 | while True:
59 | time.sleep(2)
60 | try:
61 | html=self.session.get(url+'?start='+str(num)).text
62 | except:
63 | break
64 | try:
65 | table=BeautifulSoup(html).find('div',attrs={'class':'mod movie-list'}).find_all('dl')
66 | except:
67 | break
68 | if table==[]:
69 | break
70 | for i in table:
71 | urls.append(i.find('a').get('href'))
72 | num+=15
73 | return urls
74 |
75 | def run(self):
76 | for i in range(9):
77 | self.get_text(i)
78 |
79 | def get_text(self,num):
80 |         dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db','kongbu_urls.db','baoli_urls.db','heibang_urls.db']
81 | conn = sqlite3.connect(dbs[num])
82 | cursor = conn.execute("SELECT url from urls")
83 | file_text=open(dbs[num].replace('_urls.db','.txt'),'w',encoding='utf-8')
84 | for row in cursor:
85 | time.sleep(2)
86 | try:
87 | text=self.spider(row[0])
88 | except:
89 | continue
90 | file_text.write(text+'\n\n')
91 | print(self.count)
92 | self.count+=1
93 | cursor.close()
94 | conn.commit()
95 | conn.close()
96 | file_text.close()
97 |
98 | def spider(self, url):
99 | html = requests.get(url, headers=self.headers).text
100 | soup = BeautifulSoup(html)
101 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text()
102 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src')
103 | picture='[img]'+picture+'[/img]'
104 | text=name+'\n'
105 | text+=picture+'\n'
106 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text()
107 | text+=info
108 | intro=soup.find('div',attrs={'class':'related-info'}).get_text()
109 | text+=intro
110 | return text
111 |
112 | def test():
113 | work = Douban()
114 | #work.get_url('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie')
115 | print(work.spider('http://movie.douban.com/subject/3592854/?from=tag_all'))
116 |
117 | if __name__ == '__main__':
118 | work = Douban()
119 | work.work()
120 | work.run()
121 |
--------------------------------------------------------------------------------
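
get_urls() deduplicates movie links by declaring url a PRIMARY KEY and swallowing the resulting IntegrityError on a repeat insert. SQLite can do that filtering itself with INSERT OR IGNORE; an equivalent sketch for one tag database, where the example URL stands in for the list returned by get_url():

    import sqlite3

    urls = ['http://movie.douban.com/subject/3592854/']     # e.g. what get_url() returns
    conn = sqlite3.connect('juqing_urls.db')
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS urls(url varchar(40) PRIMARY KEY)")
    cursor.executemany("INSERT OR IGNORE INTO urls(url) VALUES (?)",
                       [(u,) for u in urls])                 # duplicates are skipped, not raised
    conn.commit()
    conn.close()
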
/douban/dou_tv.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import time
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import os
7 | import sqlite3
8 |
9 | class Douban():
10 | def __init__(self):
11 | self.session=requests.session()
12 | self.headers = {
13 | 'Host': 'movie.douban.com',
14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
16 | 'Accept-Language': 'en-US,en;q=0.5',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'DNT': 1,
19 | 'Connection': 'keep-alive'}
20 | self.count=0
21 | self.session.get('http://www.douban.com',headers=self.headers)
22 |
23 | def work(self):
24 | if os.path.isfile('tv_urls.db'):
25 | conn = sqlite3.connect('tv_urls.db')
26 | cursor=conn.cursor()
27 | else:
28 | conn=sqlite3.connect('tv_urls.db')
29 | cursor=conn.cursor()
30 | cursor.execute("create table urls(url varchar(40) primary key)")
31 | urls=self.get_url('http://movie.douban.com/tag/%E7%94%B5%E8%A7%86%E5%89%A7')
32 | for i in urls:
33 | try:
34 | cursor.execute("insert into urls(url) values (?)",(i,))
35 | except:
36 | continue
37 | cursor.close()
38 | conn.commit()
39 | conn.close()
40 | print('OK')
41 |
42 | def get_url(self,url):
43 | num=0
44 | urls=[]
45 | while True:
46 | time.sleep(2)
47 | try:
48 | html=self.session.get(url+'?start='+str(num)+'&type=T').text
49 | except:
50 | break
51 | try:
52 | table=BeautifulSoup(html).find('div',attrs={'class':'article'}).find('div',attrs={'class':''}).find_all('table')
53 | except:
54 | break
55 | if table==[]:
56 | break
57 | for i in table:
58 | urls.append(i.find('a',attrs={'class':'nbg'}).get('href'))
59 | num+=20
60 | return urls
61 |
62 | def get_text(self):
63 | conn = sqlite3.connect('tv_urls.db')
64 | cursor = conn.execute("SELECT url from urls")
65 | file_text=open('tv.txt','w',encoding='utf-8')
66 | for row in cursor:
67 | time.sleep(2)
68 | try:
69 | text=self.spider(row[0])
70 | except:
71 | continue
72 | file_text.write(text+'\n\n')
73 | print(self.count)
74 | self.count+=1
75 | cursor.close()
76 | conn.commit()
77 | conn.close()
78 | file_text.close()
79 |
80 | def spider(self, url):
81 | html = requests.get(url, headers=self.headers).text
82 | soup = BeautifulSoup(html)
83 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text()
84 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src')
85 | picture='[img]'+picture+'[/img]'
86 | text=name+'\n'
87 | text+=picture+'\n'
88 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text()
89 | text+=info
90 | intro=soup.find('div',attrs={'class':'related-info'}).get_text()
91 | text+=intro
92 | return text
93 |
94 | if __name__=='__main__':
95 | work=Douban()
96 | work.work()
97 | work.get_text()
98 |
--------------------------------------------------------------------------------
/guimi/guimi.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | from bs4 import BeautifulSoup
4 | import requests
5 | import time
6 | import re
7 | import jieba
8 | import sys
9 | import jieba.analyse
10 | import xlwt
11 |
12 | class Urls_get():
13 | def __init__(self,url):
14 | self.url=url
15 | self.headers = {
16 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
17 | "Accept-Encoding": "gzip, deflate",
18 | "Accept-Language": "en-US,en;q=0.5",
19 | "Connection": "keep-alive",
20 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
21 | self.session=requests.session()
22 | print('Get items')
23 | self.item_urls=[]
24 | self.get_urls()
25 |
26 | def get_urls(self):
27 | html=self.session.get(self.url,headers=self.headers).text
28 | rel=r'class="txtBg" href="(.*?)" target='
29 | rel=re.compile(rel)
30 | lists=re.findall(rel,html)
31 | for i in lists:
32 | if(i.endswith('review')):
33 | continue
34 | if(i.endswith('product')):
35 | continue
36 | self.item_urls.append(i)
37 | count=2
38 | while(True):
39 | time.sleep(1)
40 | html=self.session.get(self.url+'&page='+str(count),headers=self.headers).text
41 | lists=re.findall(rel,html)
42 | if lists:
43 | for i in lists:
44 | if(i.endswith('review')):
45 | continue
46 | if(i.endswith('product')):
47 | continue
48 | self.item_urls.append(i)
49 | count+=1
50 | else:
51 | break
52 |
53 | class Word_frequency():
54 | def __init__(self,file_name):
55 | self.file_name=file_name
56 | self.analyse()
57 |
58 | def analyse(self):
59 | content=open(self.file_name,'r').read()
60 | '''
61 | text=jieba.analyse.extract_tags(content, topK=50, withWeight=True, allowPOS=('adj'))
62 | '''
63 | text=jieba.analyse.textrank(content, topK=50, withWeight=True, allowPOS=('adj'))
64 | f=xlwt.Workbook()
65 | sheet=f.add_sheet('sheet')
66 | count=0
67 | for i in text:
68 | sheet.write(count,0,i[0])
69 | sheet.write(count,1,i[1])
70 | count+=1
71 | f.save('fenghua.xls')
72 |
73 | class Review_get():
74 | def __init__(self,urls):
75 | self.urls=urls
76 | self.headers = {
77 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
78 | "Accept-Encoding": "gzip, deflate",
79 | "Accept-Language": "en-US,en;q=0.5",
80 | "Connection": "keep-alive",
81 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
82 | self.session=requests.session()
83 | print('Get reviews')
84 | self.reviews=[]
85 | self.write_review()
86 |
87 | def write_review(self):
88 | for i in self.urls:
89 | self.reviews+=self.spider(i)
90 | file=open('fenghua.txt','w')
91 | for i in self.reviews:
92 | file.write(i+'\n')
93 |         file.close()
94 |
95 | def spider(self,url):
96 | count=1
97 | review=[]
98 | while(True):
99 | time.sleep(0.1)
100 | html=self.session.get(url+ str(count),headers=self.headers).content
101 | soup=BeautifulSoup(html).find_all('p',attrs={'class':'com_p'})
102 | if soup:
103 | for i in soup:
104 | review.append(i.get_text())
105 | count+=1
106 | else:
107 | return review
108 | def Main():
109 | item_get=Urls_get('http://so.kimiss.com/?keyword=%B7%E4%BB%A8%BB%A4%B7%A2%CB%D8&idx=10')
110 | urls=item_get.item_urls
111 | review_get=Review_get(urls)
112 | Analyse=Word_frequency('fenghua.txt')
113 |
114 | if __name__=='__main__':
115 | Main()
116 |
--------------------------------------------------------------------------------
/ingredient/get_infor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | from bs4 import BeautifulSoup
6 | import re
7 | import xlrd
8 | import threading
9 | import random
10 | import time
11 |
12 | class Get_ip():
13 | def __init__(self,num):
14 | self.headers = {
15 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Language': 'en-US,en;q=0.5',
18 | 'Accept-Encoding': 'gzip, deflate',
19 | 'Connection': 'keep-alive'}
20 | self.url='http://vxer.daili666api.com/ip/?tid=559950660678689&num='+str(num)+'&delay=3&category=2'
21 | self.session=requests.session()
22 | def get(self):
23 | ip=self.session.get(self.url,headers=self.headers).text.replace('\n','')
24 | return ip
25 |
26 | class Get_infor(threading.Thread):
27 | def __init__(self,score,english_name):
28 | super(Get_infor, self).__init__()
29 | self.session=requests.session()
30 | self.headers = {
31 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
32 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
33 | 'Accept-Language': 'en-US,en;q=0.5',
34 | 'Accept-Encoding': 'gzip, deflate',
35 | 'Connection': 'keep-alive'}
36 | self.f=xlwt3.Workbook()
37 | self.sheet=self.f.add_sheet('sheet')
38 | self.count=0
39 | self.score=score
40 | self.english_name=english_name
41 |
42 | def run(self):
43 | self.statue=1
44 | try:
45 | html=self.session.get('http://www.cosdna.com/chs/stuff.php?q='+self.english_name,headers=self.headers).text
46 | except:
47 | self.statue=0
48 | return
49 | try:
50 | url=BeautifulSoup(html).find('div',attrs={'class':'StuffResult'}).find('tr').find('a').get('href')
51 | self.infor('http://www.cosdna.com/chs/'+url)
52 | except:
53 | try:
54 | self.infor('http://www.cosdna.com/chs/stuff.php?q='+self.english_name)
55 | except:
56 | self.statue=0
57 |
58 | def infor(self,url):
59 | html=self.session.get(url,headers=self.headers).text
60 | infor_table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'StuffDetail'})
61 | try:
62 | self.chinese_name=infor_table.find('div',attrs={'class':'Stuff_DetailC'}).get_text()
63 | except:
64 | self.statue=0
65 | return
66 | rel='r/>(.*?)