├── 1688 └── get_tel.py ├── 11选5 └── chart.py ├── README.md ├── amazon ├── get_items.py └── items_usa.py ├── anjuke └── get_house.py ├── dianping └── get_info.py ├── douban ├── dou_movie.py └── dou_tv.py ├── guimi └── guimi.py ├── ingredient ├── get_infor.py └── get_ingre.py ├── itslaw └── get_anli.py ├── job ├── Job_get.py └── REANME.md ├── www.aihuishou.com └── get_price.py ├── www.hexun.com └── hexun.py ├── www.liepin.com └── liepin.py ├── www.renrendai.com └── renrendai.py ├── www.yanglao.com.cn └── get_infor.py └── www.zimuzu.tv ├── movie_get.py └── tv_get.py /11选5/chart.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | import os 7 | import time 8 | 9 | class Get_infor(): 10 | def __init__(self): 11 | self.headers = { 12 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 13 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 14 | 'Accept-Language': 'en-US,en;q=0.5', 15 | 'Accept-Encoding': 'gzip, deflate', 16 | 'Connection': 'keep-alive'} 17 | self.urls={'北京11选5': 'http://pub.icaile.com/bj11x5kjjg.php', '新疆11选5': 'http://pub.icaile.com/xj11x5kjjg.php', '湖北11选5': 'http://pub.icaile.com/hb11x5kjjg.php', '江西11选5': 'http://pub.icaile.com/jx11x5kjjg.php', '山西11选5': 'http://pub.icaile.com/sx11x5kjjg.php', '宁夏11选5': 'http://pub.icaile.com/nx11x5kjjg.php', '辽宁11选5': 'http://pub.icaile.com/ln11x5kjjg.php', '贵州11选5': 'http://pub.icaile.com/gz11x5kjjg.php', '云南11选5': 'http://pub.icaile.com/yn11x5kjjg.php', '西藏11选5': 'http://pub.icaile.com/xz11x5kjjg.php', '重庆11选5': 'http://pub.icaile.com/cq11x5kjjg.php', '吉林11选5': 'http://pub.icaile.com/jl11x5kjjg.php', '黑龙江11选5': 'http://pub.icaile.com/hlj11x5kjjg.php', '河南11选5': 'http://pub.icaile.com/hn11x5kjjg.php', '上海11选5': 'http://pub.icaile.com/sh11x5kjjg.php', '广东11选5': 'http://pub.icaile.com/gd11x5kjjg.php', '四川11选5': 'http://pub.icaile.com/sc11x5kjjg.php', '山东11选5': 'http://pub.icaile.com/sd11x5kjjg.php', '安徽11选5': 'http://pub.icaile.com/ah11x5kjjg.php', '浙江11选5': 'http://pub.icaile.com/zj11x5kjjg.php', '江苏11选5': 'http://pub.icaile.com/js11x5kjjg.php', '内蒙古11选5': 'http://pub.icaile.com/nmg11x5kjjg.php', '甘肃11选5': 'http://pub.icaile.com/gs11x5kjjg.php', '福建11选5': 'http://pub.icaile.com/fj11x5kjjg.php', '河北11选5': 'http://pub.icaile.com/heb11x5kjjg.php', '广西11选5': 'http://pub.icaile.com/gx11x5kjjg.php', '天津11选5': 'http://pub.icaile.com/tj11x5kjjg.php', '陕西11选5': 'http://pub.icaile.com/shx11x5kjjg.php'} 18 | def run(self): 19 | try: 20 | os.mkdir('data') 21 | except: 22 | print('..') 23 | for key in self.urls: 24 | try: 25 | html=requests.get(self.urls[key],headers=self.headers).text 26 | except: 27 | continue 28 | table=BeautifulSoup(html,'html.parser').find('table',attrs={'class':'today'}).find_all('tr') 29 | self.f=xlwt3.Workbook() 30 | self.sheet=self.f.add_sheet('sheet') 31 | self.count=0 32 | for item in table: 33 | try: 34 | infor=item.find_all('td') 35 | self.sheet.write(self.count,0,infor[0].get_text()) 36 | num=1 37 | for i in infor[2].find_all('em'): 38 | self.sheet.write(self.count,num,i.get_text()) 39 | num+=1 40 | self.count+=1 41 | except: 42 | continue 43 | self.f.save('data/'+key+'.xls') 44 | def test(): 45 | html=requests.get('http://pub.icaile.com/sd11x5kjjg.php').text 46 | table=BeautifulSoup(html).find('div',attrs={'class':'left-nav'}).find('ul').find_all('li') 47 | urls={} 48 | for i in table: 49 | 
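        # NOTE: test() is a one-off helper that scrapes the left-nav menu of one results
        # page to rebuild the province-name -> URL mapping hard-coded in Get_infor.urls
        # above. BeautifulSoup(html) is called here without an explicit parser; passing
        # 'html.parser', as run() does, avoids bs4's "no parser was explicitly specified"
        # warning and keeps the behaviour consistent across machines.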
urls[i.get_text()]=i.find('a').get('href') 50 | print(urls) 51 | 52 | if __name__=='__main__': 53 | print('1.直接抓取') 54 | print('2.定时抓取') 55 | num=input('输入序号:') 56 | if(num=='1'): 57 | work=Get_infor() 58 | work.run() 59 | print('OK') 60 | elif(num=='2'): 61 | times=input('输入间隔时间(小时):') 62 | while True: 63 | work=Get_infor() 64 | work.run() 65 | print('OK') 66 | time.sleep(float(times)*3600) 67 | -------------------------------------------------------------------------------- /1688/get_tel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import re 7 | import random 8 | import xlwt3 9 | 10 | class Get_ip(object): 11 | """docstring for Get_ip""" 12 | def __init__(self): 13 | super(Get_ip, self).__init__() 14 | self.url='http://www.xicidaili.com/nn/' 15 | self.headers = { 16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Accept-Encoding': 'gzip, deflate', 20 | 'Connection': 'keep-alive'} 21 | self.session=requests.session() 22 | def run(self): 23 | html=self.session.get(self.url,headers=self.headers).text 24 | table=BeautifulSoup(html).find('table',attrs={'id':'ip_list'}).find_all('tr') 25 | http_ips=[] 26 | for item in table[1:]: 27 | lists=item.find_all('td') 28 | ip={'ip':'','port':''} 29 | if lists[6].get_text()=='HTTP': 30 | ip['ip']=lists[2].get_text() 31 | ip['port']=lists[3].get_text() 32 | http_ips.append(ip) 33 | return http_ips 34 | 35 | class get_urls(): 36 | def __init__(self,url,page,ip): 37 | self.page=page 38 | self.url=url 39 | self.proxies={ 40 | 'http':'http://'+ip['ip']+':'+ip['port'] 41 | } 42 | def get_url(self): 43 | headers = { 44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | 'Accept-Language': 'en-US,en;q=0.5', 47 | 'Accept-Encoding': 'gzip, deflate', 48 | 49 | 'Connection': 'keep-alive'} 50 | html=requests.get(self.url+'&beginPage='+str(self.page),headers=headers,proxies=self.proxies,timeout=10).text 51 | soup=BeautifulSoup(html) 52 | table=soup.find('div',attrs={'id':'sw_mod_mainblock'}).find('ul').find_all('div',attrs={'class':'list-item-left'}) 53 | urls=[] 54 | for item in table: 55 | urls.append(item.find('a').get('href')) 56 | return urls 57 | 58 | class get_contact(): 59 | def __init__(self,url,ip): 60 | #super(get_contact, self).__init__() 61 | self.proxies={ 62 | 'http':'http://'+ip['ip']+':'+ip['port'] 63 | } 64 | self.url=url 65 | self.headers = { 66 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 67 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 68 | 'Accept-Language': 'en-US,en;q=0.5', 69 | 70 | 'Accept-Encoding': 'gzip, deflate' 71 | } 72 | self.session=requests.session() 73 | def run(self): 74 | try: 75 | html=self.session.get(self.url,headers=self.headers,proxies=self.proxies,timeout=10).text 76 | contact_url=BeautifulSoup(html).find('div',attrs={'class':'top-nav-bar-box'}).find('li',attrs={'data-page-name':'contactinfo'}).find('a').get('href') 77 | except: 78 | self.statue=0 79 | print('~~~') 80 | return 81 | self.statue=1 82 | try: 83 | #time.sleep(random.randint(4, 6)) 84 | 
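            # NOTE: the second request below fetches the "contactinfo" page discovered above
            # and scrapes the supplier name plus every <dl> row of contact details (the last
            # row is dropped). self.statue (sic) doubles as a success flag: 1 = parsed,
            # 0 = the request or the parse failed. The commented-out sleeps were a crude
            # rate limit; re-enabling them helps when 1688 starts rejecting the proxy IPs
            # handed out by Get_ip.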
html=self.session.get(contact_url,headers=self.headers,proxies=self.proxies,timeout=10).text 85 | table=BeautifulSoup(html).find('div',attrs={'class':'fd-line'}).find_all('dl') 86 | self.title=BeautifulSoup(html).find('div',attrs={'class':'contact-info'}).find('h4').get_text() 87 | self.infor=[] 88 | for item in table[:-1]: 89 | self.infor.append(item.get_text().replace('\n','').replace(' ','')) 90 | except: 91 | self.statue=0 92 | 93 | class Main(): 94 | def __init__(self): 95 | self.f=xlwt3.Workbook() 96 | self.sheet=self.f.add_sheet('sheet') 97 | self.count=0 98 | work=Get_ip() 99 | self.ips=work.run() 100 | def work(self): 101 | search_url=input('输入链接:') 102 | for i in range(100): 103 | url_get=get_urls(search_url,i+1,self.ips[random.randint(0, len(self.ips)-1)]) 104 | try: 105 | urls=url_get.get_url() 106 | except: 107 | continue 108 | for url in urls: 109 | #time.sleep(random.randint(6, 9)) 110 | spider=get_contact(url,self.ips[random.randint(0, len(self.ips)-1)]) 111 | spider.run() 112 | if spider.statue==0: 113 | continue 114 | self.sheet.write(self.count,0,spider.title) 115 | num=1 116 | for infor in spider.infor: 117 | self.sheet.write(self.count,num,infor) 118 | num+=1 119 | self.count+=1 120 | print(self.count) 121 | self.f.save('data.xls') 122 | #time.sleep(random.randint(5, 8)) 123 | def test(): 124 | test=get_urls('http://s.1688.com/company/company_search.htm?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30',1) 125 | print(test.get_url()) 126 | 127 | if __name__=='__main__': 128 | work=Main() 129 | work.work() 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nyspider 2 | 写的各种各样的爬虫 3 | 4 | 11选5---各省最新11选5开奖信息 5 | 6 | amazon---获取amazon商品信息 7 | 8 | anjuke---获取安居客上面小区信息 9 | 10 | dianping---大众点评上商家信息 11 | 12 | douban---豆瓣电影信息 13 | 14 | guimi---闺蜜网商品评论 15 | 16 | ingredient---化妆品成分信息 17 | 18 | itslaw---无讼网案例信息 19 | 20 | job---各招聘网站职位信息 21 | 22 | www.aihuishou.com ---爱回收手机价格 23 | 24 | www.yanglaowang.com.cn ---养老网上养老院信息 25 | 26 | www.zimuzu.tv ---电影,电视剧ed2k链接 27 | 28 | www.hexun.com ---和讯网,获取股票交易明细 29 | 30 | www.renrendai.com ---人人贷,获取贷款信息 31 | -------------------------------------------------------------------------------- /amazon/get_items.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt 5 | from bs4 import BeautifulSoup 6 | import time 7 | import re 8 | import random 9 | 10 | class get_urls(): 11 | def __init__(self,page,keyword): 12 | self.session=requests.session() 13 | self.page=page 14 | self.keyword=keyword 15 | def get_url(self): 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 19 | 'Accept-Language': 'en-US,en;q=0.5', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'DNT': 1, 22 | 'Connection': 'keep-alive'} 23 | html=self.session.get('http://www.amazon.cn/s/ref=sr_pg_3?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8&qid=1442030727',headers=headers).text 24 | table=BeautifulSoup(html).find('div',attrs={'id':'rightContainerATF'}) 25 | rel='' 26 | table=re.findall(re.compile(rel),str(table)) 27 | urls=[] 28 | rel='href="(.*?)"' 29 | rel=re.compile(rel) 30 | for item in table: 31 | url=re.findall(rel,str(item)) 32 | try: 33 | 
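                # NOTE: rel='' a few lines up looks like a lost pattern -- items_usa.py below
                # uses 'a class="a-link-normal a-text-normal" href="(http.*?)"' for the same
                # kind of search page, which is probably what was intended here. With an empty
                # pattern, findall yields only empty strings, the href lookup then matches
                # nothing, url[0] raises IndexError each time, and get_url() quietly returns
                # an empty list.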
urls.append(url[0]) 34 | except: 35 | continue 36 | return urls 37 | 38 | class get_infor(): 39 | def __init__(self,url): 40 | self.url=url 41 | self.session=requests.session() 42 | self.headers = { 43 | "X-Forwarded-For":str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255))+'.'+str(random.randint(0, 255)), 44 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | 'Accept-Language': 'en-US,en;q=0.5', 47 | 'Accept-Encoding': 'gzip, deflate', 48 | 'DNT': 1, 49 | 'Connection': 'keep-alive'} 50 | self.get_info() 51 | def get_info(self): 52 | self.statue=0 53 | while True: 54 | try: 55 | html=self.session.get(self.url,headers=self.headers,timeout=5).text 56 | soup=BeautifulSoup(html) 57 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split()) 58 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','') 59 | try: 60 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'cmrs-atf'}).get_text()))) 61 | except: 62 | self.previews=0 63 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires') 64 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content 65 | self.statue=1 66 | break 67 | except: 68 | break 69 | 70 | class Main(): 71 | def __init__(self): 72 | self.f=xlwt.Workbook() 73 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True) 74 | self.count=0 75 | def work(self): 76 | keyword=input("输入关键字(英文):") 77 | page=input("输入页数:") 78 | for i in range(int(page)): 79 | try: 80 | work=get_urls(i+1,keyword) 81 | urls=work.get_url() 82 | except: 83 | continue 84 | for url in urls: 85 | item=get_infor(url) 86 | if item.statue==0: 87 | continue 88 | with open(str(self.count)+item.picture_url[-4:],'wb') as img: 89 | img.write(item.picture) 90 | img.close() 91 | self.sheet.write(self.count,0,str(self.count)) 92 | self.sheet.write(self.count,1,item.title) 93 | self.sheet.write(self.count,2,item.price) 94 | self.sheet.write(self.count,3,item.previews) 95 | self.count+=1 96 | self.f.save('data.xls') 97 | print(self.count) 98 | 99 | if __name__=='__main__': 100 | work=Main() 101 | work.work() 102 | -------------------------------------------------------------------------------- /amazon/items_usa.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt 5 | from bs4 import BeautifulSoup 6 | import time 7 | import re 8 | import random 9 | 10 | class get_urls(): 11 | def __init__(self,page,keyword): 12 | self.session=requests.session() 13 | self.page=page 14 | self.keyword=keyword 15 | def get_url(self): 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 19 | 'Accept-Language': 'en-US,en;q=0.5', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'DNT': 1, 22 | 'Connection': 'keep-alive'} 23 | html=self.session.get('http://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3A'+self.keyword+'&page='+str(self.page)+'&keywords='+self.keyword+'&ie=UTF8',headers=headers).text 24 | #table=BeautifulSoup(html).find('ul',attrs={'id':'s-results-list-atf'}) 25 | rel='a class="a-link-normal a-text-normal" href="(http.*?)"' 26 | 
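        # NOTE: this regex pulls product links straight out of the raw search HTML (the
        # BeautifulSoup approach is left commented out above); list(set(...)) below
        # de-duplicates the links but also discards their on-page ranking order.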
table=re.findall(re.compile(rel),str(html)) 27 | urls=list(set(table)) 28 | return urls 29 | 30 | class get_infor(): 31 | def __init__(self,url): 32 | self.url=url 33 | self.session=requests.session() 34 | self.headers = { 35 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 36 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 37 | 'Accept-Language': 'en-US,en;q=0.5', 38 | 'Accept-Encoding': 'gzip, deflate', 39 | 'DNT': 1, 40 | 'Connection': 'keep-alive'} 41 | self.get_info() 42 | def get_info(self): 43 | self.statue=0 44 | while True: 45 | try: 46 | html=self.session.get(self.url,headers=self.headers,timeout=5).text 47 | soup=BeautifulSoup(html) 48 | self.price=''.join(soup.find('div',attrs={'id':'price'}).find('tr').get_text().replace('\n','').split()) 49 | self.title=soup.find('h1',attrs={'id':'title'}).get_text().replace('\n','') 50 | try: 51 | self.previews=''.join(list(filter(str.isdigit, soup.find('div',attrs={'id':'centerCol'}).find('a',attrs={'id':'acrCustomerReviewLink'}).get_text()))) 52 | except: 53 | self.previews=0 54 | self.picture_url=soup.find('div',attrs={'class':'imgTagWrapper'}).find('img').get('data-old-hires') 55 | self.picture=self.session.get(self.picture_url,headers=self.headers,timeout=5).content 56 | self.statue=1 57 | break 58 | except: 59 | break 60 | 61 | class Main(): 62 | def __init__(self): 63 | self.f=xlwt.Workbook() 64 | self.sheet=self.f.add_sheet('sheet',cell_overwrite_ok=True) 65 | self.count=0 66 | def work(self): 67 | keyword=input("输入关键字(英文):") 68 | page=input("输入页数:") 69 | for i in range(int(page)): 70 | try: 71 | work=get_urls(i+1,keyword) 72 | urls=work.get_url() 73 | except: 74 | continue 75 | for url in urls: 76 | item=get_infor(url) 77 | if item.statue==0: 78 | continue 79 | with open(str(self.count)+item.picture_url[-4:],'wb') as img: 80 | img.write(item.picture) 81 | img.close() 82 | self.sheet.write(self.count,0,str(self.count)) 83 | self.sheet.write(self.count,1,item.title) 84 | self.sheet.write(self.count,2,item.price) 85 | self.sheet.write(self.count,3,item.previews) 86 | self.count+=1 87 | self.f.save('data.xls') 88 | print(self.count) 89 | 90 | if __name__=='__main__': 91 | work=Main() 92 | work.work() 93 | -------------------------------------------------------------------------------- /anjuke/get_house.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt3 5 | from bs4 import BeautifulSoup 6 | import re 7 | 8 | class get_infor(): 9 | def __init__(self,url): 10 | self.url=url 11 | self.session=requests.session() 12 | self.headers = { 13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 15 | 'Accept-Language': 'en-US,en;q=0.5', 16 | 'Accept-Encoding': 'gzip, deflate', 17 | 'DNT': 1, 18 | 'Connection': 'keep-alive'} 19 | def work(self): 20 | html=self.session.get(self.url,headers=self.headers).text 21 | self.statue=0 22 | soup=BeautifulSoup(html) 23 | self.price=soup.find('div',attrs={'class':'comm-cont'}).find('p',attrs={'class':'mag-b2'}).get_text().replace('\n','').replace(' ','') 24 | table=soup.find('div',attrs={'class':'comm-list clearfix'}).find_all('dl') 25 | self.infortable=[] 26 | for i in table: 27 | lists=i.find_all('dd') 28 | for item in lists: 29 | self.infortable.append(item.get_text().replace('\n','').replace(' ','')) 30 | 
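        # NOTE: statue (sic) starts at 0 and is only flipped to 1 once every field has
        # parsed, so callers can skip listings whose page layout differs. Unlike the
        # dianping spider further down, Main.work() here does not wrap item.work() in
        # try/except, so a single malformed community page aborts the whole crawl.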
self.statue=1 31 | 32 | 33 | class get_urls(): 34 | def __init__(self,url): 35 | self.url=url 36 | self.session=requests.session() 37 | self.headers = { 38 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 40 | 'Accept-Language': 'en-US,en;q=0.5', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'DNT': 1, 43 | 'Connection': 'keep-alive'} 44 | def run(self): 45 | html=self.session.get(self.url,headers=self.headers).text 46 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'class':'pL'}).find('ul').find_all('li') 47 | urls=[] 48 | for item in lists: 49 | urls.append(item.find('a').get('href')) 50 | return urls 51 | 52 | class Main(): 53 | def work(self): 54 | self.f=xlwt3.Workbook() 55 | self.sheet=self.f.add_sheet('sheet') 56 | self.count=0 57 | for page in range(338): 58 | get_url=get_urls('http://shanghai.anjuke.com/community/W0QQp1Z7QQp'+'Z'+str(page+1)) 59 | print(page) 60 | urls=get_url.run() 61 | for url in urls: 62 | item=get_infor(url) 63 | item.work() 64 | if item.statue==0: 65 | continue 66 | self.sheet.write(self.count,0,'浦东') 67 | num=1 68 | for infor in item.infortable: 69 | self.sheet.write(self.count,num,infor) 70 | num+=1 71 | self.sheet.write(self.count,num,item.price) 72 | num+=1 73 | self.sheet.write(self.count,num,url) 74 | self.count+=1 75 | self.f.save('data.xls') 76 | def test(): 77 | test=get_infor('http://shanghai.anjuke.com/community/view/106') 78 | test.work() 79 | 80 | 81 | if __name__=='__main__': 82 | work=Main() 83 | work.work() 84 | -------------------------------------------------------------------------------- /dianping/get_info.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt3 5 | from bs4 import BeautifulSoup 6 | import re 7 | 8 | class get_infor(): 9 | def __init__(self,url): 10 | self.url=url 11 | self.session=requests.session() 12 | self.headers = { 13 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 15 | 'Accept-Language': 'en-US,en;q=0.5', 16 | 'Accept-Encoding': 'gzip, deflate', 17 | 'DNT': 1, 18 | 'Connection': 'keep-alive'} 19 | def work(self): 20 | html=self.session.get(self.url,headers=self.headers).text 21 | self.statue=0 22 | infor=BeautifulSoup(html,'lxml').find('div',attrs={'class':'main'}).find('div',attrs={'id':'basic-info'}) 23 | try: 24 | self.title=infor.find('h1').get_text().replace('\n','').replace(' ','') 25 | except: 26 | return 27 | try: 28 | self.area=BeautifulSoup(html,'lxml').find('div',attrs={'class':'breadcrumb'}).find_all('a')[2].get_text().replace('\n','').replace(' ','') 29 | except: 30 | self.area='' 31 | try: 32 | self.address=infor.find('div',attrs={'class':'expand-info address'}).get_text().replace('\n','').replace(' ','') 33 | except: 34 | self.address=' ' 35 | try: 36 | self.tel=infor.find('span',attrs={'itemprop':'tel'}).get_text() 37 | except: 38 | self.tel=' ' 39 | table=infor.find('div',attrs={'class':'other J-other Hide'}).find_all('p') 40 | self.price='' 41 | self.times='' 42 | for item in table: 43 | try: 44 | if(item.find('span').get_text()=='营业时间:'): 45 | self.times=item.get_text().replace('\n','').replace(' ','').replace('修改','') 46 | except: 47 | continue 48 | table=infor.find('div',attrs={'class':'brief-info'}).find_all('span') 49 | for item in table: 50 | try: 
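                    # NOTE: the brief-info spans are scanned for labels beginning with 人均
                    # (per-person spend), 费用 (fee) or 均价 (average price); whichever
                    # matches becomes self.price, and both the price and the opening hours
                    # fall back to '--' when nothing on the page matches.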
51 | if(item.get_text()[:2]=='人均' or item.get_text()[:2]=='费用' or item.get_text()[:2]=='均价'): 52 | self.price=item.get_text().replace('\n','').replace(' ','') 53 | except: 54 | continue 55 | if self.price=='': 56 | self.price='--' 57 | if self.times=='': 58 | self.times='--' 59 | self.statue=1 60 | 61 | 62 | class get_urls(): 63 | def __init__(self,url): 64 | self.url=url 65 | self.session=requests.session() 66 | self.headers = { 67 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 68 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 69 | 'Accept-Language': 'en-US,en;q=0.5', 70 | 'Accept-Encoding': 'gzip, deflate', 71 | 'DNT': 1, 72 | 'Cookie':'showNav=#nav-tab|0|1; navCtgScroll=0; _hc.v="\"23f85427-5787-47bd-9df4-4e831c7a4cae.1442049973\""; __utma=1.649416466.1442049979.1442049979.1442049979.1; __utmz=1.1442049979.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; cy=1; cye=shanghai; s_ViewType=10; aburl=1; JSESSIONID=95881D627CA4C940D686AD118D776232; PHOENIX_ID=0a0308bc-14fda0ead2e-4e713c', 73 | 'Connection': 'keep-alive'} 74 | def run(self): 75 | html=self.session.get(self.url,headers=self.headers).text 76 | lists=BeautifulSoup(html,'lxml').find('div',attrs={'id':'shop-all-list'}).find_all('li') 77 | urls=[] 78 | for item in lists: 79 | urls.append('http://www.dianping.com'+item.find('a').get('href')) 80 | return urls 81 | 82 | class Main(): 83 | def work(self): 84 | self.f=xlwt3.Workbook() 85 | self.sheet=self.f.add_sheet('sheet') 86 | self.count=0 87 | for page in range(50): 88 | get_url=get_urls('http://www.dianping.com/search/category/1/20/g187r12'+'p'+str(page+1)) 89 | print(page) 90 | urls=get_url.run() 91 | for url in urls: 92 | try: 93 | item=get_infor(url) 94 | item.work() 95 | except: 96 | continue 97 | if item.statue==0: 98 | continue 99 | self.sheet.write(self.count,0,'购物') 100 | self.sheet.write(self.count,1,'超市便利店') 101 | self.sheet.write(self.count,2,'闵行') 102 | self.sheet.write(self.count,3,item.area) 103 | self.sheet.write(self.count,4,item.title) 104 | self.sheet.write(self.count,5,item.address) 105 | self.sheet.write(self.count,6,item.tel) 106 | self.sheet.write(self.count,7,item.price) 107 | self.sheet.write(self.count,8,item.times) 108 | self.sheet.write(self.count,9,url) 109 | self.count+=1 110 | self.f.save('data.xls') 111 | 112 | def test(): 113 | test=get_infor('http://www.dianping.com/shop/1909912') 114 | test.work() 115 | print(test.times) 116 | print(test.price) 117 | if __name__=='__main__': 118 | work=Main() 119 | work.work() 120 | -------------------------------------------------------------------------------- /douban/dou_movie.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import time 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import os 7 | import sqlite3 8 | 9 | class Douban(): 10 | def __init__(self): 11 | self.session=requests.session() 12 | self.headers = { 13 | 'Host': 'movie.douban.com', 14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 16 | 'Accept-Language': 'en-US,en;q=0.5', 17 | 'Accept-Encoding': 'gzip, deflate', 18 | 'DNT': 1, 19 | 'Connection': 'keep-alive'} 20 | self.session.get('http://www.douban.com',headers=self.headers) 21 | self.count=0 22 | 23 | def work(self): 24 | self.get_urls('http://www.douban.com/tag/%E5%89%A7%E6%83%85/movie',0) 25 | 
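        # NOTE: work() walks a fixed set of Douban tag pages -- 剧情 (drama), 动画
        # (animation), 犯罪 (crime), 惊悚 (thriller), 悬疑 (suspense), cult, 恐怖 (horror),
        # 暴力 (violence), 黑帮 (gangster) -- and stores each tag's movie URLs in its own
        # sqlite file. Two quirks worth knowing: the last db name, 'heibang_urls_db', is
        # missing its '.db' suffix, and get_text() lists only the first six databases while
        # run() iterates over nine, so the last three categories raise IndexError there.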
self.get_urls('http://www.douban.com/tag/%E5%8A%A8%E7%94%BB/movie',1) 26 | self.get_urls('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie',2) 27 | self.get_urls('http://www.douban.com/tag/%E6%83%8A%E6%82%9A/movie',3) 28 | self.get_urls('http://www.douban.com/tag/%E6%82%AC%E7%96%91/movie',4) 29 | self.get_urls('http://www.douban.com/tag/cult/movie',5) 30 | self.get_urls('http://www.douban.com/tag/%E6%81%90%E6%80%96/movie',6) 31 | self.get_urls('http://www.douban.com/tag/%E6%9A%B4%E5%8A%9B/movie',7) 32 | self.get_urls('http://www.douban.com/tag/%E9%BB%91%E5%B8%AE/movie',8) 33 | 34 | def get_urls(self,url,types): 35 | dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db','kongbu_urls.db','baoli_urls.db','heibang_urls_db'] 36 | db=dbs[types] 37 | if os.path.isfile(db): 38 | conn = sqlite3.connect(db) 39 | cursor=conn.cursor() 40 | else: 41 | conn=sqlite3.connect(db) 42 | cursor=conn.cursor() 43 | cursor.execute("create table urls(url varchar(40) primary key)") 44 | urls=self.get_url(url) 45 | for i in urls: 46 | try: 47 | cursor.execute("insert into urls(url) values (?)",(i,)) 48 | except: 49 | continue 50 | cursor.close() 51 | conn.commit() 52 | conn.close() 53 | print(db+' OK') 54 | 55 | def get_url(self,url): 56 | num=0 57 | urls=[] 58 | while True: 59 | time.sleep(2) 60 | try: 61 | html=self.session.get(url+'?start='+str(num)).text 62 | except: 63 | break 64 | try: 65 | table=BeautifulSoup(html).find('div',attrs={'class':'mod movie-list'}).find_all('dl') 66 | except: 67 | break 68 | if table==[]: 69 | break 70 | for i in table: 71 | urls.append(i.find('a').get('href')) 72 | num+=15 73 | return urls 74 | 75 | def run(self): 76 | for i in range(9): 77 | self.get_text(i) 78 | 79 | def get_text(self,num): 80 | dbs=['juqing_urls.db','donghua_urls.db','fanzui_urls.db','jingsong_urls.db','xuanyi_urls.db','cult_urls.db'] 81 | conn = sqlite3.connect(dbs[num]) 82 | cursor = conn.execute("SELECT url from urls") 83 | file_text=open(dbs[num].replace('_urls.db','.txt'),'w',encoding='utf-8') 84 | for row in cursor: 85 | time.sleep(2) 86 | try: 87 | text=self.spider(row[0]) 88 | except: 89 | continue 90 | file_text.write(text+'\n\n') 91 | print(self.count) 92 | self.count+=1 93 | cursor.close() 94 | conn.commit() 95 | conn.close() 96 | file_text.close() 97 | 98 | def spider(self, url): 99 | html = requests.get(url, headers=self.headers).text 100 | soup = BeautifulSoup(html) 101 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text() 102 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src') 103 | picture='[img]'+picture+'[/img]' 104 | text=name+'\n' 105 | text+=picture+'\n' 106 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text() 107 | text+=info 108 | intro=soup.find('div',attrs={'class':'related-info'}).get_text() 109 | text+=intro 110 | return text 111 | 112 | def test(): 113 | work = Douban() 114 | #work.get_url('http://www.douban.com/tag/%E7%8A%AF%E7%BD%AA/movie') 115 | print(work.spider('http://movie.douban.com/subject/3592854/?from=tag_all')) 116 | 117 | if __name__ == '__main__': 118 | work = Douban() 119 | work.work() 120 | work.run() 121 | -------------------------------------------------------------------------------- /douban/dou_tv.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import time 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import os 7 | import sqlite3 8 | 9 | class Douban(): 10 | 
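    """Collect Douban TV-series pages (电视剧 tag) into tv_urls.db, then dump each
    title, poster link, info block and synopsis into tv.txt.

    Same structure as douban/dou_movie.py, but for a single tag on movie.douban.com
    and paging 20 results at a time instead of 15."""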
def __init__(self): 11 | self.session=requests.session() 12 | self.headers = { 13 | 'Host': 'movie.douban.com', 14 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 15 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 16 | 'Accept-Language': 'en-US,en;q=0.5', 17 | 'Accept-Encoding': 'gzip, deflate', 18 | 'DNT': 1, 19 | 'Connection': 'keep-alive'} 20 | self.count=0 21 | self.session.get('http://www.douban.com',headers=self.headers) 22 | 23 | def work(self): 24 | if os.path.isfile('tv_urls.db'): 25 | conn = sqlite3.connect('tv_urls.db') 26 | cursor=conn.cursor() 27 | else: 28 | conn=sqlite3.connect('tv_urls.db') 29 | cursor=conn.cursor() 30 | cursor.execute("create table urls(url varchar(40) primary key)") 31 | urls=self.get_url('http://movie.douban.com/tag/%E7%94%B5%E8%A7%86%E5%89%A7') 32 | for i in urls: 33 | try: 34 | cursor.execute("insert into urls(url) values (?)",(i,)) 35 | except: 36 | continue 37 | cursor.close() 38 | conn.commit() 39 | conn.close() 40 | print('OK') 41 | 42 | def get_url(self,url): 43 | num=0 44 | urls=[] 45 | while True: 46 | time.sleep(2) 47 | try: 48 | html=self.session.get(url+'?start='+str(num)+'&type=T').text 49 | except: 50 | break 51 | try: 52 | table=BeautifulSoup(html).find('div',attrs={'class':'article'}).find('div',attrs={'class':''}).find_all('table') 53 | except: 54 | break 55 | if table==[]: 56 | break 57 | for i in table: 58 | urls.append(i.find('a',attrs={'class':'nbg'}).get('href')) 59 | num+=20 60 | return urls 61 | 62 | def get_text(self): 63 | conn = sqlite3.connect('tv_urls.db') 64 | cursor = conn.execute("SELECT url from urls") 65 | file_text=open('tv.txt','w',encoding='utf-8') 66 | for row in cursor: 67 | time.sleep(2) 68 | try: 69 | text=self.spider(row[0]) 70 | except: 71 | continue 72 | file_text.write(text+'\n\n') 73 | print(self.count) 74 | self.count+=1 75 | cursor.close() 76 | conn.commit() 77 | conn.close() 78 | file_text.close() 79 | 80 | def spider(self, url): 81 | html = requests.get(url, headers=self.headers).text 82 | soup = BeautifulSoup(html) 83 | name=soup.find('span',attrs={'property':'v:itemreviewed'}).get_text() 84 | picture=soup.find('img',attrs={'rel':'v:image'}).get('src') 85 | picture='[img]'+picture+'[/img]' 86 | text=name+'\n' 87 | text+=picture+'\n' 88 | info=soup.find('div',attrs={'class':'indent clearfix'}).find('div',attrs={'id':'info'}).get_text() 89 | text+=info 90 | intro=soup.find('div',attrs={'class':'related-info'}).get_text() 91 | text+=intro 92 | return text 93 | 94 | if __name__=='__main__': 95 | work=Douban() 96 | work.work() 97 | work.get_text() 98 | -------------------------------------------------------------------------------- /guimi/guimi.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from bs4 import BeautifulSoup 4 | import requests 5 | import time 6 | import re 7 | import jieba 8 | import sys 9 | import jieba.analyse 10 | import xlwt 11 | 12 | class Urls_get(): 13 | def __init__(self,url): 14 | self.url=url 15 | self.headers = { 16 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 17 | "Accept-Encoding": "gzip, deflate", 18 | "Accept-Language": "en-US,en;q=0.5", 19 | "Connection": "keep-alive", 20 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 21 | self.session=requests.session() 22 | print('Get items') 23 | self.item_urls=[] 24 | self.get_urls() 25 | 26 | def get_urls(self): 27 | 
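        """Collect product-page links from the search results, one page at a time.

        Links ending in 'review' or 'product' are skipped, and paging stops at the
        first page whose HTML yields no matches for the txtBg href pattern."""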
html=self.session.get(self.url,headers=self.headers).text 28 | rel=r'class="txtBg" href="(.*?)" target=' 29 | rel=re.compile(rel) 30 | lists=re.findall(rel,html) 31 | for i in lists: 32 | if(i.endswith('review')): 33 | continue 34 | if(i.endswith('product')): 35 | continue 36 | self.item_urls.append(i) 37 | count=2 38 | while(True): 39 | time.sleep(1) 40 | html=self.session.get(self.url+'&page='+str(count),headers=self.headers).text 41 | lists=re.findall(rel,html) 42 | if lists: 43 | for i in lists: 44 | if(i.endswith('review')): 45 | continue 46 | if(i.endswith('product')): 47 | continue 48 | self.item_urls.append(i) 49 | count+=1 50 | else: 51 | break 52 | 53 | class Word_frequency(): 54 | def __init__(self,file_name): 55 | self.file_name=file_name 56 | self.analyse() 57 | 58 | def analyse(self): 59 | content=open(self.file_name,'r').read() 60 | ''' 61 | text=jieba.analyse.extract_tags(content, topK=50, withWeight=True, allowPOS=('adj')) 62 | ''' 63 | text=jieba.analyse.textrank(content, topK=50, withWeight=True, allowPOS=('adj')) 64 | f=xlwt.Workbook() 65 | sheet=f.add_sheet('sheet') 66 | count=0 67 | for i in text: 68 | sheet.write(count,0,i[0]) 69 | sheet.write(count,1,i[1]) 70 | count+=1 71 | f.save('fenghua.xls') 72 | 73 | class Review_get(): 74 | def __init__(self,urls): 75 | self.urls=urls 76 | self.headers = { 77 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 78 | "Accept-Encoding": "gzip, deflate", 79 | "Accept-Language": "en-US,en;q=0.5", 80 | "Connection": "keep-alive", 81 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 82 | self.session=requests.session() 83 | print('Get reviews') 84 | self.reviews=[] 85 | self.write_review() 86 | 87 | def write_review(self): 88 | for i in self.urls: 89 | self.reviews+=self.spider(i) 90 | file=open('fenghua.txt','w') 91 | for i in self.reviews: 92 | file.write(i+'\n') 93 | file.close 94 | 95 | def spider(self,url): 96 | count=1 97 | review=[] 98 | while(True): 99 | time.sleep(0.1) 100 | html=self.session.get(url+ str(count),headers=self.headers).content 101 | soup=BeautifulSoup(html).find_all('p',attrs={'class':'com_p'}) 102 | if soup: 103 | for i in soup: 104 | review.append(i.get_text()) 105 | count+=1 106 | else: 107 | return review 108 | def Main(): 109 | item_get=Urls_get('http://so.kimiss.com/?keyword=%B7%E4%BB%A8%BB%A4%B7%A2%CB%D8&idx=10') 110 | urls=item_get.item_urls 111 | review_get=Review_get(urls) 112 | Analyse=Word_frequency('fenghua.txt') 113 | 114 | if __name__=='__main__': 115 | Main() 116 | -------------------------------------------------------------------------------- /ingredient/get_infor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt3 5 | from bs4 import BeautifulSoup 6 | import re 7 | import xlrd 8 | import threading 9 | import random 10 | import time 11 | 12 | class Get_ip(): 13 | def __init__(self,num): 14 | self.headers = { 15 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Language': 'en-US,en;q=0.5', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Connection': 'keep-alive'} 20 | self.url='http://vxer.daili666api.com/ip/?tid=559950660678689&num='+str(num)+'&delay=3&category=2' 21 | self.session=requests.session() 22 | def get(self): 23 | 
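        # NOTE: unlike the 1688 spider, which scrapes xicidaili for free proxies, this
        # Get_ip pulls proxies from the daili666 API; judging by the replace('\n','')
        # below, the response is presumably a newline-separated list of ip:port entries,
        # so stripping the newlines only gives a usable value when num=1 -- with num>1
        # the addresses would run together into one string.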
ip=self.session.get(self.url,headers=self.headers).text.replace('\n','') 24 | return ip 25 | 26 | class Get_infor(threading.Thread): 27 | def __init__(self,score,english_name): 28 | super(Get_infor, self).__init__() 29 | self.session=requests.session() 30 | self.headers = { 31 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 32 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 33 | 'Accept-Language': 'en-US,en;q=0.5', 34 | 'Accept-Encoding': 'gzip, deflate', 35 | 'Connection': 'keep-alive'} 36 | self.f=xlwt3.Workbook() 37 | self.sheet=self.f.add_sheet('sheet') 38 | self.count=0 39 | self.score=score 40 | self.english_name=english_name 41 | 42 | def run(self): 43 | self.statue=1 44 | try: 45 | html=self.session.get('http://www.cosdna.com/chs/stuff.php?q='+self.english_name,headers=self.headers).text 46 | except: 47 | self.statue=0 48 | return 49 | try: 50 | url=BeautifulSoup(html).find('div',attrs={'class':'StuffResult'}).find('tr').find('a').get('href') 51 | self.infor('http://www.cosdna.com/chs/'+url) 52 | except: 53 | try: 54 | self.infor('http://www.cosdna.com/chs/stuff.php?q='+self.english_name) 55 | except: 56 | self.statue=0 57 | 58 | def infor(self,url): 59 | html=self.session.get(url,headers=self.headers).text 60 | infor_table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'StuffDetail'}) 61 | try: 62 | self.chinese_name=infor_table.find('div',attrs={'class':'Stuff_DetailC'}).get_text() 63 | except: 64 | self.statue=0 65 | return 66 | rel='r/>(.*?)
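# --------------------------------------------------------------------------------
# NOTE: every spider above repeats the same fetch boilerplate -- a hand-built header
# dict, a requests.session(), bare except clauses and no retries. A minimal shared
# helper is sketched below; the function name, retry count and delay are illustrative
# assumptions, not part of the original scripts.

import time
import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive'}

def fetch(url, session=None, retries=3, delay=2, **kwargs):
    """GET url with the shared headers, retrying on transient network errors."""
    session = session or requests.session()
    kwargs.setdefault('timeout', 10)   # callers may also pass proxies=, params=, etc.
    for attempt in range(retries):
        try:
            return session.get(url, headers=HEADERS, **kwargs).text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage (hypothetical): html = fetch('http://pub.icaile.com/sd11x5kjjg.php')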