├── 1688 └── get_tel.py ├── 58.213.159.173 └── jiangsu_Atmosphere.py ├── JobGet └── JobInforGet.py ├── Nyspider.py ├── README.md ├── ali_comments ├── fan_jian.py ├── langconv.py ├── taobao.py ├── tianmao.py └── zh_wiki.py ├── amap └── amap.py ├── amazon ├── get_items.py ├── items_usa.py └── shopProducts.py ├── anjuke ├── anjuke_hourse.py ├── community.py ├── get_house.py └── location.py ├── apk.91.com ├── Send_email.py ├── email_game.py └── email_soft.py ├── apps.fas.usda.gov └── psdQuery.py ├── aso100.com ├── aso100_ui.py └── aso100_ui_v2.py ├── baidu.lecai.com ├── lottery.py └── www.zy91.com │ └── zndz.py ├── baidumap ├── baidumap.py └── city_ids.txt ├── bbs.tianya.cn └── comments.py ├── bjguahao.gov.cn ├── bjguahao.py ├── bjguahao_v2.py └── bjguahao_v3.py ├── brokerbin.com ├── brokerbin.py ├── brokerbin_3.py ├── email_template ├── filter │ └── filter.txt └── send_email.py ├── buluo.qq.com └── images.py ├── chart.cp.360.cn └── charthistory.py ├── china.tandfonline.com └── search_article.py ├── club.qingdaonews.com └── article.py ├── cn.bing.com ├── bing_search.py └── urls.txt ├── data.cma.gov.cn ├── Duplicate.py └── get_data.py ├── datacenter.mep.gov.cn ├── air_dairy.py └── air_dairy_aqi.py ├── dianping ├── comments.txt ├── data │ ├── 上海.xls │ ├── 北京.xls │ ├── 南京.xls │ ├── 厦门.xls │ ├── 大连.xls │ ├── 天津.xls │ ├── 宁波.xls │ ├── 广州.xls │ ├── 成都.xls │ ├── 无锡.xls │ ├── 杭州.xls │ ├── 武汉.xls │ ├── 沈阳.xls │ ├── 济南.xls │ ├── 深圳.xls │ ├── 苏州.xls │ ├── 西安.xls │ ├── 郑州.xls │ ├── 重庆.xls │ ├── 长沙.xls │ └── 青岛.xls ├── get_info.py ├── memberlist.py ├── memberlist.txt ├── shopinfor.py └── shoplist.py ├── douban ├── dou_movie.py ├── dou_tv.py ├── movie_grade.py ├── movieinfor.py └── movies.txt ├── downloadbooks └── save_into_baiduyun.py ├── duapp2.drexel.edu ├── TMS.py ├── TMSCourse_Excel.py └── TMSCourse_Sqlite.py ├── finance.sina.com.cn ├── ManagerInfo.py └── codes.txt ├── finance.yahoo.com ├── finance.py └── new_finance.py ├── forecast.io ├── forecast.py └── getData.py ├── fsfc.fsjw.gov.cn └── house.py ├── gcjs.linfen.gov.cn └── company.py ├── hklock.com └── products.py ├── itslaw └── get_anli.py ├── jbk.39.net └── disease.py ├── job.qiaosiwang.com └── workinfor.py ├── job ├── Job_get.py └── REANME.md ├── landchina ├── infor.py └── landchina.py ├── lvyou.baidu.com └── guilin.py ├── mall.jd.com └── jd_shop.py ├── maoyan ├── Duplicate.py ├── get_infor.py └── maoyan.py ├── music.163.com └── music_lists.py ├── news.sohu.com └── news.py ├── news_get ├── cn.chinadaily.com.cn │ └── chinadaily.com.cn.py ├── people.com.cn │ └── people.com.cn.py ├── www.cankaoxiaoxi.com │ └── cankaoxiaoxi.com.py ├── www.eastday.com │ └── eastday.com.py ├── www.gmw.cn │ └── gmw.cn.py ├── www.haiwainet.cn │ └── haiwainet.cn.py ├── www.huanqiu.com │ └── huanqiu.com.py ├── www.youth.cn │ └── youth.cn.py └── www.zaobao.com │ └── zaobao.com.py ├── newseed.pedaily.cn └── invest.py ├── pan.baidu.com └── sharelink.py ├── qimingpian.com └── qimingpian.py ├── rank.kongzhong.com └── userInfor.py ├── stock.finance.qq.com ├── stk_holder.py ├── stkcode.py └── stkcode.txt ├── stock.jrj.com.cn └── flowhistory.py ├── taobao ├── suggest.py └── sycm.py ├── tur.bizdirlib.com └── bizdirlib.py ├── waimai.meituan.com └── orderlist.py ├── weibo ├── weibo.md └── weibo.py ├── weidian └── weidian.py ├── wenda.so.com ├── question.py └── search.py ├── wenshu.court.gov.cn └── download.py ├── worldfreightrates └── trates.py ├── www.18ladys.com └── 18ladys.py ├── www.360che.com └── products.py ├── www.3j1688.com └── 3j1688.py ├── www.58.com ├── JobInforGet.py ├── 
company.py ├── companyExcel.py └── sendemail.py ├── www.aihuishou.com └── get_price.py ├── www.airbnb.com ├── deal.py ├── roominfor.py ├── rooms.py └── userinfor.py ├── www.aqistudy.cn └── aqistudy.py ├── www.autozi.com ├── carBrandLetter.py ├── products.py └── products_infor.py ├── www.b8b8.tv ├── ballbar_mobile.py └── ballbar_pc.py ├── www.baikemy.com └── disease.py ├── www.cbooo.cn └── cbooo.py ├── www.chazidian.com └── yuwen.py ├── www.chealth.org.cn └── disease.py ├── www.china-10.com ├── china10.py └── excel.py ├── www.chuanlaoda.cn ├── CaptchaOCR.dll ├── chuanlaoda.py ├── py2exe_install.py ├── testdll.py └── x64 │ └── CaptchaOCR.dll ├── www.cjsyw.com └── ship.py ├── www.cofeed.com └── cofeed.py ├── www.cpbz.gov.cn ├── company.py └── write_to_excel.py ├── www.ctrip.com ├── comments.py ├── comments_bydate.py └── youtrip.py ├── www.dicos.com.cn ├── citys.txt └── storelist.py ├── www.eastmoney.com ├── company.py ├── guba.py ├── iguba.py ├── quote.py ├── transaction.py └── urls.txt ├── www.fang.com ├── get_hourse.py └── new_hourse.py ├── www.gamefaqs.com └── gameinfor.py ├── www.ganji.com └── ganji_tel.py ├── www.gewara.com └── reviews.py ├── www.guahao.com ├── doctor.py └── hospital.py ├── www.hexun.com └── hexun.py ├── www.ifeng.com └── fashionhealth.py ├── www.imdb.com ├── boxoffice.py ├── movies.py └── rottentomatoes.py ├── www.itjuzi.com ├── baseInvestevents.py ├── company.py ├── companylist.py ├── investevents.py ├── itjuzi.py └── tag_itjuzi.py ├── www.jfz.com └── products.py ├── www.jisilu.com ├── JiSiLu.py └── jisilu.py ├── www.kfc.com ├── citys.txt └── storelist.py ├── www.kimiss.com ├── Nyspider.py ├── baby.txt ├── baby_pro.txt ├── get_product.py └── man.txt ├── www.lagou.com └── lagou.py ├── www.lianjia.com └── lianjiahourse.py ├── www.liepin.com └── liepin.py ├── www.locoso.com └── locoso.py ├── www.mohurd.gov.cn ├── company.py ├── deal.py ├── registrar_thread.py └── registrarinfor.py ├── www.ncbi.nlm.nih.gov ├── gethtml.py ├── parser.py ├── pubmed.py └── write_to_excel.py ├── www.pizzahut.com.cn ├── citys.txt └── storelist.py ├── www.pm25.in └── pm25.py ├── www.ppdai.com ├── Tppdai.py ├── excel.py ├── get_data.py ├── invest.py ├── ppdai.py └── ppdaiInfor.py ├── www.renrendai.com └── renrendai.py ├── www.sxhouse.com.cn └── sxhouse.py ├── www.teld.cn ├── setting │ └── cities.txt └── teld.py ├── www.tichk.org └── travel_agent.py ├── www.tjcn.org └── patent.py ├── www.trademaps.cn └── trademaps.py ├── www.tripadvisor.com ├── deal.py ├── getpage.py ├── moredata.py └── userinfor.py ├── www.tyshbj.com.cn └── tyshbj.py ├── www.ukers.cn └── ukers.py ├── www.variflight.com ├── flights_num.txt ├── icon │ ├── 0 │ │ ├── 20.png │ │ └── 23.png │ ├── 1 │ │ ├── 1.png │ │ └── 4.png │ ├── 2 │ │ ├── 0.png │ │ └── 33.png │ ├── 3 │ │ ├── 43.png │ │ └── 64.png │ ├── 4 │ │ ├── 3.png │ │ └── 9.png │ ├── 5 │ │ ├── 71.png │ │ └── 8.png │ ├── 6 │ │ ├── 19.png │ │ ├── 51.png │ │ └── 6.png │ ├── 7 │ │ ├── 16.png │ │ └── 26.png │ ├── 8 │ │ ├── 93.png │ │ └── 98.png │ ├── 9 │ │ ├── 21.png │ │ └── 31.png │ ├── 24 │ │ ├── 117.png │ │ ├── 304.png │ │ └── 783.png │ ├── 44 │ │ ├── 141.png │ │ └── 88.png │ ├── b │ │ ├── 2202.png │ │ └── 2248.png │ ├── m │ │ ├── 2397.png │ │ ├── 2408.png │ │ └── 2419.png │ └── s │ │ ├── 2245.png │ │ ├── 2413.png │ │ └── 2424.png ├── ui_variflight.py └── variflight.py ├── www.vvic.com └── getitems.py ├── www.watchseries.li └── watchseries.py ├── www.we.com └── renrendai.py ├── www.yelp.com ├── restaurant_infor.py └── restaurants.py ├── www.yhd.com ├── data.xls ├── replace.py ├── 
shopinfor.py └── text.html ├── www.zdic.net ├── words.txt ├── write_to_excel.py └── zdic.py ├── www.zhongchou.com ├── Duplicate.py ├── excel.py ├── get_id.py ├── get_infor.py └── other.py ├── www.zimuzu.tv ├── movie_get.py └── tv_get.py ├── www.zy91.com └── zndz.py ├── wwwapps.ups.com ├── search.py └── write2excel.py ├── xxgk.jl.gov.cn └── infor.py ├── yangcong345.com └── yangcong345.py ├── zhidao.baidu.com ├── question.py └── search.py ├── zhihu ├── get_followee.py ├── top500.py ├── zhihu_search.py └── zhihuinfor.py └── zsb.suda.edu.cn ├── inquery.py ├── markhistory.py └── new_markhistory.py /Nyspider.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import os 5 | import sqlite3 6 | import xlwt3 7 | from email import encoders 8 | from email.header import Header 9 | from email.mime.text import MIMEText 10 | from email.utils import parseaddr,formataddr 11 | import smtplib 12 | import datetime 13 | 14 | headers = { 15 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 16 | "Accept-Encoding": "gzip, deflate", 17 | "Accept-Language": "en-US,en;q=0.5", 18 | "Connection": "keep-alive", 19 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 20 | 21 | 22 | def get_image(image_url,image_name): 23 | content=requests.get(image_url,headers=headers).content 24 | with open(image_name,'wb') as f: 25 | f.write(content) 26 | f.close 27 | 28 | def to_Excel(): 29 | for filename in os.listdir('.'): 30 | if(filename.endswith('txt')): 31 | f_d=open(filename,'r') 32 | f_ex=xlwt3.Workbook() 33 | sheet=f_ex.add_sheet('one') 34 | count=0 35 | for line in f_d.readlines(): 36 | lists=line.split('|') 37 | try: 38 | num=0 39 | for text in lists: 40 | sheet.write(count,num,text) 41 | num+=1 42 | count+=1 43 | except: 44 | sheet=f_ex.add_sheet('two') 45 | count=0 46 | num=0 47 | for text in lists: 48 | sheet.write(count,num,text) 49 | num+=1 50 | count+=1 51 | f_ex.save(filename.replace('txt','xls')) 52 | 53 | def send_email(email,subject,text,user,passwd): 54 | smtp_server='smtp.126.com' 55 | msg = MIMEText(text, 'plain', 'utf-8') 56 | msg['Subject']=subject 57 | msg['From'] = _format_addr(user) 58 | msg['To'] = _format_addr(email) 59 | server = smtplib.SMTP(smtp_server, 25) 60 | server.set_debuglevel(1) 61 | server.login(user, passwd) 62 | server.sendmail(user, [email], msg.as_string()) 63 | server.quit() 64 | 65 | def convert_html(html): 66 | return html.encode('ISO-8859-1').decode('utf-8','ignore') 67 | 68 | def Duplicate(): 69 | for filename in os.listdir('.'): 70 | if filename.endswith('txt'): 71 | lines=open(filename,'r').readlines() 72 | lines=list(set(lines)) 73 | lines.sort() 74 | f=open(filename,'w') 75 | for line in lines: 76 | f.write(line) 77 | f.close() 78 | 79 | def yesterday_get(today=datetime.datetime.now()): 80 | oneday = datetime.timedelta(days=1) 81 | yesterday = today- oneday 82 | return yesterday 83 | -------------------------------------------------------------------------------- /ali_comments/fan_jian.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from langconv import * 3 | import xlrd 4 | import xlwt3 5 | 6 | # 转换繁体到简体 7 | def run(): 8 | name='相机' 9 | f=xlwt3.Workbook(encoding='utf-8') 10 | sheet=f.add_sheet('sheet') 11 | data=xlrd.open_workbook(name+'.xls') 12 | table=data.sheets()[0] 13 | for i in range(table.nrows): 14 | line=table.cell(i,0).value 15 | line=fan_jian(line) 16 | 
sheet.write(i,0,line) 17 | f.save(name+'_.xls') 18 | 19 | 20 | def fan_jian(line): 21 | line = Converter('zh-hans').convert(line)#.decode('utf-8')) 22 | line = line#.encode('utf-8') 23 | return line 24 | 25 | def jian_fan(line): 26 | line = Converter('zh-hant').convert(line.decode('utf-8')) 27 | line = line.encode('utf-8') 28 | return line 29 | 30 | run() 31 | -------------------------------------------------------------------------------- /ali_comments/taobao.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt3 5 | import re 6 | import requests 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | class Get_comments(object): 10 | """docstring for Get_comments""" 11 | def __init__(self): 12 | super(Get_comments, self).__init__() 13 | self.f=xlwt3.Workbook() 14 | self.sheet=self.f.add_sheet('sheet') 15 | self.headers = { 16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Accept-Encoding': 'gzip, deflate', 20 | 'Cookie':"isg=1895AE3ACA648D8B28455A6D1992F41F; l=AvX1ovPGHd3jI30I58r3v3IcJXuvcqmE; t=1e6dd9b5d55aacb2ca5e07cb5be03a2b; thw=cn; cna=7Dd0DgMB+HcCAXrNCByTSHxR; uc3=nk2=1pCplIlkFn7n&id2=WvAz2mB1qeE%2F&vt3=F8dASMh%2Fnu8OGgfEtGM%3D&lg2=URm48syIIVrSKA%3D%3D; tracknick=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; _cc_=URm48syIZQ%3D%3D; tg=0; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=np=&ci=-1_0&cyk=0_0; ali_ab=211.69.194.131.1444291484725.8; lgc=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; lzstat_uv=13179738183169067975|3492151@3600092@3288243@3260534; v=0; cookie2=1cdef8cc85ef4b19772fd48de808f9c0; _tb_token_=0BF8LVbNvUzT; uc1=cookie14=UoWzXLHAxnd7aw%3D%3D&existShop=true&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=WqG3DMC9Edo1SB5NB6Qtng%3D%3D&tag=2&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; hng=CN%7Czh-cn%7CCHN; existShop=MTQ0NTE1NzMzOQ%3D%3D; sg=343; cookie1=BYTvDkInmXl2wO%2F6AW0tX%2Bpb6nHX4a5Olly%2Fg4DvWfE%3D; unb=907324234; skt=ae45361e45082d58; publishItemObj=Ng%3D%3D; _l_g_=Ug%3D%3D; _nk_=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; cookie17=WvAz2mB1qeE%2F", 21 | 'Connection': 'keep-alive'} 22 | self.count=0 23 | import ssl 24 | ssl._create_default_https_context = ssl._create_unverified_context 25 | self.url='https://rate.taobao.com/feedRateList.htm?callback=jsonp_reviews_list&userNumId=84131819&auctionNumId=6774286903&siteID=3&rateType=&orderType=sort_weight&showContent=1&attribute=¤tPageNum=' 26 | def run(self): 27 | cert='/home/nyloner/work/ali_comments/cert.pem' 28 | for page in range(80): 29 | html=requests.get(self.url+str(page+1),headers=self.headers,verify=False).text 30 | print(html) 31 | rel='content":"(.*?)"' 32 | comments=re.findall(rel,html) 33 | for item in comments: 34 | self.sheet.write(self.count,0,item) 35 | self.count+=1 36 | self.f.save('麻辣花生.xls') 37 | print(self.count) 38 | 39 | work=Get_comments() 40 | work.run() 41 | -------------------------------------------------------------------------------- /amap/amap.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | from bs4 import BeautifulSoup 5 | import random 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | def get_province(): 14 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&city=100000&geoobj=19.198221%7C11.793397%7C-172.051779%7C53.547635&keywords=%E5%B9%B2%E6%9E%9C',headers=headers).text 15 | data=json.loads(html) 16 | table=BeautifulSoup(data['html'],'lxml').find_all('div',{'class':'sug-province'}) 17 | f=open('citys.txt','a') 18 | for item in table: 19 | try: 20 | province=item.find('b').get_text() 21 | citys=item.find_all('a',{'class':'citycode'}) 22 | for city in citys: 23 | f.write(province+'|'+city.get_text()+'|'+city.get('adcode')+'\n') 24 | except: 25 | continue 26 | f.close() 27 | 28 | def search(key,citycode): 29 | page=1 30 | result=[] 31 | while True: 32 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=%s&qii=true&cluster_state=5&need_utd=true&div=PC1000&addr_poi_merge=true&is_classify=true&city=%s&keywords=%s'%(page,citycode,key),headers=headers).text 33 | data=json.loads(html)['data'][0]['list'] 34 | if data==[]: 35 | break 36 | for item in data: 37 | try: 38 | tel=item['templateData']['tel'] 39 | address=item['address'] 40 | name=item['name'] 41 | result.append(name+'| '+address+' |'+tel) 42 | except: 43 | continue 44 | page+=1 45 | print(citycode,page) 46 | time.sleep(random.randint(2,8)) 47 | return result 48 | 49 | def main(): 50 | for line in open('citys.txt','r'): 51 | line=line.replace('\n','') 52 | code=line.split('|')[-1] 53 | try: 54 | result=search('干果',code) 55 | except: 56 | failed=open('failed.txt','a') 57 | failed.write(line+'\n') 58 | failed.close() 59 | continue 60 | f=open('result.txt','a') 61 | for item in result: 62 | f.write(line+'|'+item+'\n') 63 | f.close() 64 | print(line) 65 | main() 66 | -------------------------------------------------------------------------------- /anjuke/location.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | def get_location(address,city): 6 | url='http://api.map.baidu.com/place/v2/search?query=%s®ion=%s&city_limit=true&output=json&ak=fh980b9Ga64S8bl8QblSC3kq'%(address,city) 7 | html=requests.get(url).text 8 | try: 9 | data=json.loads(html)['results'][0]['location'] 10 | except: 11 | return '' 12 | lng=data['lng'] 13 | lat=data['lat'] 14 | return str(lng)+'|'+str(lat) 15 | 16 | 17 | line=get_location('滨湖新区四川路与云谷路交口西北角','合肥') 18 | -------------------------------------------------------------------------------- /baidumap/baidumap.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import json 4 | import time 5 | import re 6 | 7 | 8 | headers = { 9 | 'Host':"map.baidu.com", 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def citys(): 17 | 
html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=%E6%B1%BD%E8%BD%A6%E7%BE%8E%E5%AE%B9%E5%BA%97&c=1&src=0&wd2=&sug=0&l=5&b=(7002451.220000001,1994587.88;19470675.22,7343963.88)&from=webmap&biz_forward={%22scaler%22:1,%22styles%22:%22pl%22}&sug_forward=&tn=B_NORMAL_MAP&nn=0&u_loc=12736591.152491,3547888.166124&ie=utf-8&t=1459951988807',headers=headers).text 18 | f=open('city_ids.txt','a') 19 | data=json.loads(html) 20 | for item in data['content']: 21 | #for city in item['city']: 22 | f.write(str(item)+'\n') 23 | f.close() 24 | 25 | def get_infor(keyword,code,page): 26 | html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=con&from=webmap&c='+str(code)+'&wd='+keyword+'&wd2=&pn='+str(page)+'&nn='+str(page*10)+'&db=0&sug=0&addr=0&&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&tn=B_NORMAL_MAP&u_loc=12736591.152491,3547888.166124&ie=utf-8',headers=headers).text 27 | data=json.loads(html)['content'] 28 | return data 29 | 30 | 31 | def main(): 32 | keys=['眼镜店','视光中心'] 33 | for keyword in keys: 34 | f=open(keyword+'_tels.txt','a') 35 | for line in open('city_ids.txt','r').readlines(): 36 | line=line.replace('\n','') 37 | code=eval(line)['code'] 38 | page=1 39 | while True: 40 | try: 41 | data=get_infor(keyword,code,page) 42 | except: 43 | break 44 | if data==[]: 45 | break 46 | for item in data: 47 | f.write(str(item)+'\n') 48 | page+=1 49 | print(code,page) 50 | time.sleep(1) 51 | f.close() 52 | main() 53 | -------------------------------------------------------------------------------- /brokerbin.com/email_template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

9 | Hi {name}, 10 |

11 |

12 | We have the following quote matching your search on Brokerbin: 13 |

14 |

15 | {product_name} new landed, {price} each, 3 days delivery 16 |

17 |

18 | 19 | Click here to buy on our website. 20 | 21 |

22 |

23 | Please note that the special price is only provided after you log in. 24 |

25 |

26 | Thanks! 27 |

28 |

29 |

30 |

31 | -- 32 |

33 |

34 | *Please register at our web store, www.sailnetwork.com, to check P&A 24-7. Regular coupons will be sent to registered customers, and product prices will be lower online. 35 |
36 | Best regards, 37 |
38 | Sales Department | Sail Network Co., Ltd. 39 |
40 | Office: +86(0)2154223056*8004 41 |
42 | E-mail: sales@sailnetwork.com 43 |
44 | E-Shop: www.sailnetwork.com 45 |
46 | No. 3-318, Lane 7058, Zhongchun Rd., Shanghai, China 47 |

48 | 49 | 50 | -------------------------------------------------------------------------------- /brokerbin.com/filter/filter.txt: -------------------------------------------------------------------------------- 1 | nocsupply.com 2 | nfsmith.nl 3 | florinconnect.com 4 | 3c-systerms.com 5 | arbitech.com 6 | fulinetwork.com 7 | beaoncn.com 8 | marketconnections.nl 9 | globalnetworkstech.com 10 | konnect8.co.uk 11 | inventusgroup.com 12 | square1product.com 13 | squarelnc.com 14 | apexitltd.com 15 | uniontechcoop.com 16 | -------------------------------------------------------------------------------- /brokerbin.com/send_email.py: -------------------------------------------------------------------------------- 1 | from email import encoders 2 | from email.header import Header 3 | from email.mime.text import MIMEText 4 | from email.utils import parseaddr,formataddr 5 | import smtplib 6 | import time 7 | import os 8 | import json 9 | 10 | 11 | def _format_addr(s): 12 | name, addr = parseaddr(s) 13 | return formataddr((Header(name, 'utf-8').encode(), addr)) 14 | 15 | def sendEmail(fromemail,passwd,toemail,subject,text): 16 | msg = MIMEText(text, 'html', 'utf-8') 17 | msg['Subject']=subject 18 | msg['From'] = _format_addr(fromemail.replace('foxmail','sailnetwork')) 19 | msg['To'] = _format_addr(toemail) 20 | server=smtplib.SMTP_SSL('smtp.qq.com') 21 | server.ehlo('smtp.qq.com') 22 | server.login(fromemail,passwd) 23 | server.sendmail(fromemail, [toemail], msg.as_string()) 24 | server.quit() 25 | 26 | def load_emails(filename): 27 | f=open('email/'+filename,'r',encoding='utf-8').read() 28 | emails=[] 29 | for item in f.split('---'*8): 30 | try: 31 | lines=item.split('***'*4) 32 | subject=lines[0].replace('\r\n','') 33 | email=lines[1].replace('\r\n','').replace(' ','') 34 | text=lines[2] 35 | emails.append([email,subject,text]) 36 | except: 37 | continue 38 | return emails 39 | 40 | def load_login(): 41 | f=open('./email.json','r',encoding='utf8') 42 | data=json.load(f) 43 | return data 44 | 45 | def main(): 46 | try: 47 | data=load_login() 48 | fromemail=data['fromemail'] 49 | passwd=data['passwd'] 50 | toemail=data['toemail'] 51 | except: 52 | print("帐号导入失败") 53 | return 54 | for filename in os.listdir('email'): 55 | try: 56 | emails=load_emails(filename) 57 | except: 58 | print(filename,'load failed') 59 | for i in range(len(emails)): 60 | try: 61 | email=emails[i] 62 | subject=email[1].replace('\r','').replace('\n','').replace('\t','').replace(' ','')+'\t'+email[0].replace('\r','').replace('\n','').replace('\t','').replace(' ','') 63 | except: 64 | continue 65 | try: 66 | sendEmail(fromemail,passwd,toemail,subject,email[2]) 67 | time.sleep(2) 68 | print(subject,'send ok') 69 | except: 70 | print(subject,'failed') 71 | print(filename,'完成') 72 | 73 | main() 74 | time.sleep(60) 75 | -------------------------------------------------------------------------------- /buluo.qq.com/images.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import json 5 | import time 6 | 7 | 8 | headers = { 9 | 'X-Requested-With': 'XMLHttpRequest', 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | 'Referer': 'http://buluo.qq.com/mobile/barindex.html?_wv=1027&_bid=128&from=recentvisited&bid=15226', 15 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 
Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 16 | 17 | def get_page(bid,page): 18 | data={ 19 | 'bid':bid, 20 | 'num':'10', 21 | 'start':page*10, 22 | 'bkn':'' 23 | } 24 | html=requests.post('http://buluo.qq.com/cgi-bin/bar/post/get_post_by_page',headers=headers,data=data).text 25 | data=json.loads(html)['result']['posts'] 26 | result=[] 27 | for item in data: 28 | try: 29 | title=item['title'] 30 | pic_list=item['post']['pic_list'] 31 | except: 32 | continue 33 | result.append([title,pic_list]) 34 | return result 35 | 36 | def save_image(filedir,filename,img_url): 37 | headers = { 38 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 39 | "Accept-Encoding": "gzip, deflate", 40 | "Accept-Language": "en-US,en;q=0.5", 41 | "Connection": "keep-alive", 42 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 43 | content=requests.get(img_url,headers=headers,timeout=30).content 44 | with open('images/%s/%s.jpg'%(filedir,filename),'wb') as img: 45 | img.write(content) 46 | 47 | def main(): 48 | bid=input("输入bid:") 49 | try: 50 | startpage=input("起始页码:") 51 | startpage=int(startpage)-1 52 | except: 53 | startpage=0 54 | try: 55 | endpage=input("结束页码:") 56 | endpage=int(endpage)-1 57 | except: 58 | endpage=10 59 | filedir=1 60 | try: 61 | os.mkdir('images/') 62 | except: 63 | pass 64 | while startpage<=endpage: 65 | images=get_page(bid,startpage) 66 | for image in images: 67 | try: 68 | os.mkdir('images/'+str(filedir)) 69 | except: 70 | pass 71 | f=open('images/%s/content.txt'%filedir,'a',encoding='utf-8') 72 | f.write(image[0]) 73 | f.close() 74 | imgnum=1 75 | for img in image[1]: 76 | try: 77 | save_image(filedir,imgnum,img['url']) 78 | except: 79 | continue 80 | imgnum+=1 81 | print('page',startpage,filedir,'ok') 82 | filedir+=1 83 | startpage+=1 84 | print(startpage,'ok') 85 | time.sleep(2) 86 | 87 | main() 88 | -------------------------------------------------------------------------------- /chart.cp.360.cn/charthistory.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import datetime 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_history(date): 14 | url='http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span=%s_%s'%(date,date) 15 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk') 16 | tables=BeautifulSoup(html,'lxml').find('div',id='his-tab').find('table',{'width':'100%'}).find_all('table') 17 | result=[] 18 | for table in tables: 19 | for tr in table.find_all('tr'): 20 | try: 21 | tds=tr.find_all('td') 22 | number=tds[0].get_text() 23 | if number=='': 24 | continue 25 | value=tds[1].get_text() 26 | if value=='': 27 | continue 28 | value1=value[:3] 29 | value2=value[1:4] 30 | value3=value[2:] 31 | result.append([date,number,value,value1,value2,value3]) 32 | except: 33 | continue 34 | return result 35 | 36 | def nextday(d): 37 | oneday = datetime.timedelta(days=1) 38 | day = d+oneday 39 | return day 40 | 41 | def main(): 42 | day=datetime.datetime.strptime('2010-01-01','%Y-%m-%d') 43 | 
while True: 44 | str_day=str(day).split(' ')[0] 45 | f=open('result.txt','a') 46 | try: 47 | result=get_history(str_day) 48 | except: 49 | print(str_day,'failed') 50 | time.sleep(1) 51 | continue 52 | for item in result: 53 | f.write(str(item)+'\n') 54 | f.close() 55 | day=nextday(day) 56 | print(str_day,'ok') 57 | time.sleep(1) 58 | if str_day=='2016-10-23': 59 | break 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /china.tandfonline.com/search_article.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | def get_articles(): 14 | page=0 15 | while True: 16 | html=requests.get('http://china.tandfonline.com/action/doSearch?AllField=urban+design&Ppub=%5B20151107+TO+20161107%5D&content=standard&countTerms=true&target=default&sortBy=&pageSize=50&subjectTitle=&startPage='+str(page),headers=headers).text 17 | table=BeautifulSoup(html,'lxml').find('ol',{'class':'search-results'}).find_all('li') 18 | f=open('titles.txt','a') 19 | for item in table: 20 | title=item.find('article').get('data-title') 21 | f.write(title+'\n') 22 | f.close() 23 | page+=1 24 | print('抓取第',page,'页') 25 | #time.sleep(1) 26 | if page==267: 27 | break 28 | 29 | def word_cut(): 30 | text=open('./titles.txt','r').read() 31 | text=text.replace(':',' ').replace("?",' ').replace('.','').replace(')',' ').replace('(','').replace('+','').replace('“','').replace('”','').replace('\n','') 32 | words=text.split(' ') 33 | result={} 34 | for word in words: 35 | word=word.lower() 36 | try: 37 | result[word]+=1 38 | except: 39 | result[word]=1 40 | 41 | excel=openpyxl.Workbook(write_only=True) 42 | sheet=excel.create_sheet() 43 | for key in result: 44 | sheet.append([key,result[key]]) 45 | excel.save('result.xlsx') 46 | 47 | get_articles() 48 | -------------------------------------------------------------------------------- /club.qingdaonews.com/article.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | 5 | headers = { 6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 7 | "Accept-Encoding": "gzip, deflate", 8 | "Accept-Language": "en-US,en;q=0.5", 9 | "Connection": "keep-alive", 10 | 'Cookie':'PHPSESSID=d2a521b9298f8691e4c37487b6657ac3; Hm_lvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199772; Hm_lpvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199852; CNZZDATA1000084976=1383072779-1482195841-null%7C1482195841; username=JarMrmn4olyPFzOAltjC0Q%3D%3D; password=jv2Y7Ga10EoO2Tn3W%2FY1plZvYz1QGqB2; NSC_dmvc=ffffffff09020e0445525d5f4f58455e445a4a423660', 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_article(endpage): 15 | page=1 16 | result=[] 17 | while True: 18 | url='http://club.qingdaonews.com/usercenter/mytopic.php?page=%s'%page 19 | try: 20 | html=requests.get(url,headers=headers,timeout=30).text 21 | except: 22 | continue 23 | table=BeautifulSoup(html,'lxml').find('div',{'class':'add_list'}).find_all('li') 24 | for li in table: 25 | try: 26 | 
url='http://club.qingdaonews.com'+li.find('a').get('href') 27 | title=li.find('a').get_text() 28 | result.append([title,url]) 29 | except: 30 | continue 31 | if page==endpage: 32 | break 33 | print(page,'ok') 34 | page+=1 35 | return result 36 | 37 | def main(): 38 | result=get_article(168) 39 | excel=openpyxl.Workbook(write_only=True) 40 | sheet=excel.create_sheet() 41 | for line in result: 42 | sheet.append(line) 43 | excel.save('urls.xlsx') 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /cn.bing.com/urls.txt: -------------------------------------------------------------------------------- 1 | www.azlyrics.com 2 | www.metrolyrics.com/ 3 | lyrics.wikia.com 4 | www.songlyrics.com 5 | www.musixmatch.com/ 6 | www.lyricsfreak.com/ 7 | www.lyricsmode.com/ 8 | www.directlyrics.com/ 9 | www.darklyrics.com/ 10 | www.allthelyrics.com 11 | www.sing365.com/ 12 | www.lyricsg.com 13 | www.parolesmania.com/ 14 | www.sweetslyrics.com 15 | azlyricdb.com 16 | www.musicsonglyrics.com/ 17 | www.honeyguide.co.uk 18 | songmeanings.com/ 19 | www.lyricsforsong.net/ 20 | www.elyrics.com 21 | www.lyricsreg.com 22 | batlyrics.net/ 23 | genius.com/ 24 | www.lyricspond.com/ 25 | artists.letssingit.com/ 26 | www.cduniverse.com/ 27 | www.leoslyrics.com/ 28 | www.lyrster.com/ 29 | www.smartlyrics.com/ 30 | www.lyrics007.com/ 31 | www.classic-country-song-lyrics.com/ 32 | -------------------------------------------------------------------------------- /data.cma.gov.cn/Duplicate.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import os 4 | 5 | def Duplicate(): 6 | for filename in os.listdir('.'): 7 | if filename.endswith('txt'): 8 | lines=open(filename,'r').readlines() 9 | lines=list(set(lines)) 10 | lines.sort() 11 | f=open(filename,'w') 12 | for line in lines: 13 | f.write(line) 14 | f.close() 15 | 16 | Duplicate() 17 | -------------------------------------------------------------------------------- /datacenter.mep.gov.cn/air_dairy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_table(url): 15 | html=requests.get(url,headers=headers).text 16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr') 17 | result=[] 18 | for tr in table[2:-3]: 19 | item='' 20 | for td in tr.find_all('td'): 21 | item+=td.get_text()+'|' 22 | result.append(item) 23 | return result 24 | 25 | def main(): 26 | text_f=open('2014_2016.txt','w',encoding='utf-8') 27 | startdate='2014-01-01'#起始日期 28 | enddate='2016-07-19'#结束日期 29 | startpage=1#起始页码 30 | endpage=10#结束页码 31 | while startpage<=endpage: 32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage) 33 | try: 34 | items=get_table(url) 35 | except: 36 | time.sleep(2) 37 | print(startpage,'-failed') 38 | continue 39 | for item in items: 40 | text_f.write(item+'\n') 41 | print(startpage,'-ok') 42 | startpage+=1 43 | text_f.close() 44 | write_to_excel() 45 | 46 | def write_to_excel(): 47 | 
excel=openpyxl.Workbook(write_only=True) 48 | sheet=excel.create_sheet() 49 | for line in open('2014_2016.txt','r',encoding='utf-8'): 50 | line=line.replace('\n','') 51 | sheet.append(line.split('|')) 52 | excel.save('2014_2016.xlsx') 53 | 54 | main() 55 | -------------------------------------------------------------------------------- /datacenter.mep.gov.cn/air_dairy_aqi.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_table(url): 15 | html=requests.get(url,headers=headers).text 16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr') 17 | result=[] 18 | for tr in table[2:-3]: 19 | item='' 20 | for td in tr.find_all('td'): 21 | item+=td.get_text()+'|' 22 | result.append(item) 23 | return result 24 | 25 | def main(): 26 | text_f=open('2000_2014.txt','w',encoding='utf-8') 27 | startdate='2000-01-01'#起始日期 28 | enddate='2015-12-31'#结束日期 29 | startpage=1#起始页码 30 | endpage=10#结束页码 31 | while startpage<=endpage: 32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy_aqi.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage) 33 | try: 34 | items=get_table(url) 35 | except: 36 | time.sleep(2) 37 | print(startpage,'-failed') 38 | continue 39 | for item in items: 40 | text_f.write(item+'\n') 41 | print(startpage,'-ok') 42 | startpage+=1 43 | text_f.close() 44 | write_to_excel() 45 | 46 | def write_to_excel(): 47 | excel=openpyxl.Workbook(write_only=True) 48 | sheet=excel.create_sheet() 49 | for line in open('2000_2014.txt','r',encoding='utf-8'): 50 | line=line.replace('\n','') 51 | sheet.append(line.split('|')) 52 | excel.save('2000_2014.xlsx') 53 | 54 | main() 55 | -------------------------------------------------------------------------------- /dianping/data/上海.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/上海.xls -------------------------------------------------------------------------------- /dianping/data/北京.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/北京.xls -------------------------------------------------------------------------------- /dianping/data/南京.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/南京.xls -------------------------------------------------------------------------------- /dianping/data/厦门.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/厦门.xls -------------------------------------------------------------------------------- /dianping/data/大连.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/大连.xls -------------------------------------------------------------------------------- /dianping/data/天津.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/天津.xls -------------------------------------------------------------------------------- /dianping/data/宁波.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/宁波.xls -------------------------------------------------------------------------------- /dianping/data/广州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/广州.xls -------------------------------------------------------------------------------- /dianping/data/成都.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/成都.xls -------------------------------------------------------------------------------- /dianping/data/无锡.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/无锡.xls -------------------------------------------------------------------------------- /dianping/data/杭州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/杭州.xls -------------------------------------------------------------------------------- /dianping/data/武汉.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/武汉.xls -------------------------------------------------------------------------------- /dianping/data/沈阳.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/沈阳.xls -------------------------------------------------------------------------------- /dianping/data/济南.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/济南.xls -------------------------------------------------------------------------------- /dianping/data/深圳.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/深圳.xls -------------------------------------------------------------------------------- /dianping/data/苏州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/苏州.xls -------------------------------------------------------------------------------- /dianping/data/西安.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/西安.xls -------------------------------------------------------------------------------- /dianping/data/郑州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/郑州.xls -------------------------------------------------------------------------------- /dianping/data/重庆.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/重庆.xls -------------------------------------------------------------------------------- /dianping/data/长沙.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/长沙.xls -------------------------------------------------------------------------------- /dianping/data/青岛.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/青岛.xls -------------------------------------------------------------------------------- /dianping/shopinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_infor(): 14 | urls=['https://www.dianping.com/search/category/2/10/r2588o2p','https://www.dianping.com/search/category/2/10/r1493o2p','https://www.dianping.com/search/category/2/10/r1490o2p'] 15 | f=open('haidian.txt','a',encoding='utf-8') 16 | for url in urls: 17 | page=1 18 | while page<=50: 19 | try: 20 | html=requests.get(url+str(page),headers=headers,timeout=30).text 21 | except: 22 | continue 23 | table=BeautifulSoup(html,'lxml').find('div',id='shop-all-list').find_all('li') 24 | for li in table: 25 | try: 26 | soup=li.find('div',attrs={'class':'txt'}) 27 | tit=soup.find('div',attrs={'class':'tit'}) 28 | comment=soup.find('div',attrs={'class':'comment'}) 29 | tag_addr=soup.find('div',attrs={'class':'tag-addr'}) 30 | text=tit.find('a').get_text().replace('\r','').replace('\n','')+'||'+comment.find('span').get('title')+'||'+comment.find('a',attrs={'class':'review-num'}).get_text().replace('\r','').replace('\n','')+'||'+comment.find('a',attrs={'class':'mean-price'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'tag'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'addr'}).get_text().replace('\r','').replace('\n','')+'||' 31 | comment_list=soup.find('span',attrs={'class':'comment-list'}).find_all('span') 32 | for i in comment_list: 33 | text+='||'+i.get_text().replace('\r','').replace('\n','') 34 | for i in tit.find('div',attrs={'class':'promo-icon'}).find_all('a'): 35 | try: 36 | text+='||'+i.get('class') 37 | except: 38 | 
text+='||'+i.get('class')[0] 39 | f.write(text.replace(' ','')+'\n') 40 | except: 41 | continue 42 | page+=1 43 | print(page) 44 | time.sleep(1) 45 | f.close() 46 | 47 | get_infor() 48 | -------------------------------------------------------------------------------- /dianping/shoplist.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import json 5 | import xlwt3 6 | import os 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_data(url): 16 | html=requests.get(url,headers=headers).text 17 | data=json.loads(html)['shopBeans'] 18 | return data 19 | 20 | def shoplist(): 21 | try: 22 | os.mkdir('data') 23 | except: 24 | print('--') 25 | items={'最佳餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score&categoryId=0','人气餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=popscore&categoryId=0','口味最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score1&categoryId=0','环境最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score2&categoryId=0','服务最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score3&categoryId=0'} 26 | citys={'北京':'2','上海':'1','广州':'4','深圳':'7','成都':'8','重庆':'9','杭州':'3','南京':'5','沈阳':'18','苏州':'6','天津':'10','武汉':'16','西安':'17','长沙':'344','大连':'19','济南':'22','宁波':'11','青岛':'21','无锡':'13','厦门':'15','郑州':'160'} 27 | excel=xlwt3.Workbook() 28 | sheet=excel.add_sheet('sheet') 29 | count=0 30 | for city in citys: 31 | for key in items: 32 | try: 33 | data=get_data(items[key]%(citys[city])) 34 | except: 35 | print('Error!') 36 | continue 37 | num=1 38 | for item in data: 39 | sheet.write(count,0,str(count+1)) 40 | sheet.write(count,1,key) 41 | sheet.write(count,2,city) 42 | sheet.write(count,3,num) 43 | sheet.write(count,4,item['filterFullName']) 44 | sheet.write(count,5,item['mainRegionName']) 45 | sheet.write(count,6,item['refinedScore1']) 46 | sheet.write(count,7,item['refinedScore2']) 47 | sheet.write(count,8,item['refinedScore3']) 48 | sheet.write(count,9,item['avgPrice']) 49 | if '(' in item['filterFullName'] or '(' in item['filterFullName']: 50 | sheet.write(count,10,'Y') 51 | else: 52 | sheet.write(count,10,'N') 53 | num+=1 54 | count+=1 55 | print(city+'--OK') 56 | excel.save('data/data.xls') 57 | 58 | shoplist() 59 | -------------------------------------------------------------------------------- /douban/movie_grade.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import json 4 | 5 | headers = { 6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 7 | "Accept-Encoding": "gzip, deflate", 8 | "Accept-Language": "en-US,en;q=0.5", 9 | "Connection": "keep-alive", 10 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 11 | 12 | def comments(movieid,fromdate,todate): 13 | start=0 14 | rating={} 15 | comments={} 16 | while True: 17 | 
url='https://m.douban.com/rexxar/api/v2/movie/{}/interests?count=20&order_by=latest&start={}&ck=&for_mobile=1'.format(movieid,start) 18 | html=requests.get(url,headers=headers).text 19 | print(movieid,start) 20 | start+=25 21 | data=json.loads(html)['interests'] 22 | if len(data)==0: 23 | break 24 | for item in data: 25 | date=item['create_time'].split(' ')[0] 26 | int_date=int(date.replace('-','')) 27 | if int_date>todate: 28 | continue 29 | if int_date500000): 55 | return data 56 | return [] 57 | 58 | if __name__=='__main__': 59 | threadings=[] 60 | f=open('华语.txt','r') 61 | file_d=open('data.txt','a') 62 | for line in f.readlines(): 63 | for id in eval(line.replace('\n','')): 64 | data=get_id(id) 65 | if data==[]: 66 | continue 67 | file_d.write(str(data)+'\n') 68 | print(id) 69 | -------------------------------------------------------------------------------- /newseed.pedaily.cn/invest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def invest(page): 14 | html=requests.get('http://newseed.pedaily.cn/invest/p'+str(page),headers=headers).text 15 | table=BeautifulSoup(html,'lxml').find('table',{'class':'record-table'}).find_all('tr') 16 | result=[] 17 | for tr in table: 18 | tds=tr.find_all('td') 19 | if len(tds)==0: 20 | continue 21 | line=[] 22 | for td in tds: 23 | try: 24 | line.append(td.get_text()) 25 | except: 26 | line.append('') 27 | result.append(line) 28 | return result 29 | 30 | def write_to_excel(result): 31 | excel=openpyxl.Workbook(write_only=True) 32 | sheet=excel.create_sheet() 33 | for line in result: 34 | try: 35 | sheet.append(line) 36 | except: 37 | continue 38 | excel.save('result.xlsx') 39 | 40 | def main(): 41 | pagefrom=input("起始页:") 42 | pageto=input("结束页:") 43 | pagefrom=int(pagefrom) 44 | pageto=int(pageto) 45 | result=[] 46 | while pagefrom<=pageto: 47 | try: 48 | result+=invest(pagefrom) 49 | except: 50 | print(pagefrom,'failed') 51 | continue 52 | print(pagefrom,'ok') 53 | pagefrom+=1 54 | time.sleep(1) 55 | write_to_excel(result) 56 | 57 | main() 58 | -------------------------------------------------------------------------------- /rank.kongzhong.com/userInfor.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | def loadNameAndArea():#加载需要抓取的名单 6 | lines=open('names.txt','r',encoding='utf-8').readlines()#读入文本 7 | userlist=[] 8 | for line in lines: 9 | userlist.append(line.replace('\r','').replace('\n','')) 10 | return userlist 11 | 12 | 13 | def writeToTxt(user):#将结果写入txt 14 | line='\t'.join(user) 15 | f=open('result.txt','a',encoding='utf-8') 16 | f.write(line+'\r\n') 17 | f.close() 18 | 19 | def parser(html):#解析网页,用的是BeautifulSoup库 20 | soup=BeautifulSoup(html,'html.parser').find('div',id='total') 21 | result=[] 22 | labels=['singlebattle','teambattle','totalbattle'] 23 | for label in labels: 24 | table=soup.find('div',id=label) 25 | result.append(table.find('span',{'class':'value separate'}).get_text()) 26 | result.append(table.find('span',{'class':'value2'}).get_text()) 27 | return result 
28 | 29 | def getUserInfor(): 30 | browser=webdriver.Firefox()#调用火狐浏览器 31 | browser.get('http://rank.kongzhong.com/wows/index.html?name=%E4%BD%BF%E5%BE%92-%E6%B8%94%E9%B6%B8&zone=north') 32 | browser.implicitly_wait(10)#设置页面加载等待时间 33 | userlist=loadNameAndArea()#获取名单 34 | for user in userlist: 35 | user=user.split('\t')#名单中 名字和区域是以\t分隔 36 | if '南区' in user[-1]:#判断是那一个区域 37 | area='south' 38 | else: 39 | area='north' 40 | url='http://rank.kongzhong.com/wows/index.html?name=%s&zone=%s'%(user[0],area)#构造链接 41 | browser.get(url)#打开链接 42 | time.sleep(2)#停2s等待页面加载完成 43 | html=browser.page_source#获取页面源码 44 | try: 45 | result=parser(html)#解析页面 46 | except: 47 | continue 48 | result=user+result 49 | writeToTxt(result)#写入txt 50 | browser.quit() 51 | 52 | getUserInfor() 53 | -------------------------------------------------------------------------------- /stock.finance.qq.com/stk_holder.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | def get_stkholder(name,stkcode): 14 | html=requests.get('http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=%s'%stkcode,headers=headers).text 15 | soup=BeautifulSoup(html,'lxml').find('table',{'class':'list list_d'}) 16 | date=soup.find('tr').find_all('span',{'class':'fntTahoma'})[-1].get_text() 17 | table=soup.find_all('tr') 18 | result=[] 19 | for tr in table[2:-1]: 20 | tds=tr.find_all('td') 21 | item=[name,stkcode,date] 22 | for td in tds: 23 | item.append(td.get_text()) 24 | result.append(item) 25 | return result 26 | 27 | def write_to_excel(): 28 | excel=openpyxl.Workbook(write_only=True) 29 | filename=time.strftime("%Y%m%d %H%M%S",time.localtime())+'.xlsx' 30 | sheet=excel.create_sheet() 31 | for line in result: 32 | sheet.append(line) 33 | excel.save(filename) 34 | 35 | def main(): 36 | result=[] 37 | for line in open('stkcode.txt','r',encoding='utf-8'): 38 | title=line.replace('\r','').replace('\n','').split('---') 39 | try: 40 | items=get_stkholder(title[0],title[1]) 41 | except: 42 | pass 43 | time.sleep(3) 44 | continue 45 | result+=items 46 | print(title[0],title[1],'ok') 47 | time.sleep(3) 48 | write_to_excel(result) 49 | 50 | main() 51 | -------------------------------------------------------------------------------- /stock.finance.qq.com/stkcode.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | def get_stkcode(): 6 | f=open('stkcode.txt','w') 7 | page=1 8 | while True: 9 | html=requests.get('http://hq.gucheng.com/List.asp?Type=A&Sort=&Page=%s'%page).text.encode('ISO-8859-1').decode('GBK','ignore') 10 | table=BeautifulSoup(html,'lxml').find('div',{'class':'hq_big_bk md_6'}).find_all('tr') 11 | for tr in table[1:-1]: 12 | tds=tr.find_all('td') 13 | line=tds[1].get_text()+'---'+tds[0].get_text() 14 | print(line) 15 | f.write(line+'\r\n') 16 | page+=1 17 | if page==139: 18 | break 19 | f.close() 20 | 21 | get_stkcode() 22 | -------------------------------------------------------------------------------- /stock.jrj.com.cn/flowhistory.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import json 4 | import time 5 | 6 | 7 | def get_flowhistory(stockid): 8 | html=requests.get('http://zj.flashdata2.jrj.com.cn/flowhistory/share/%s.js'%stockid).text 9 | data=json.loads(html.replace('var stock_flow=','')) 10 | result=[] 11 | header=['序号','日期','涨跌幅','收盘价','换手率','净流入金额','主力净流入净额','主力净流入净占比','中单净流入净额','中单净流入净占比','散户净流入净额','散户净流入净占比','第二天'] 12 | result.append(header) 13 | keys=['date','pl','cp','tr','tin','zin','zpit','min','mpit','sin','spit'] 14 | count=1 15 | pre_line='' 16 | for line in data: 17 | item=[count] 18 | count+=1 19 | for key in keys: 20 | item.append(line[key]) 21 | try: 22 | item.append(pre_line['pl']) 23 | except: 24 | pass 25 | result.append(item) 26 | pre_line=line 27 | return result 28 | 29 | def write_to_excel(result,stockid): 30 | excel=openpyxl.Workbook(write_only=True) 31 | sheet=excel.create_sheet() 32 | for item in result: 33 | sheet.append(item) 34 | excel.save('%s.xlsx'%stockid) 35 | 36 | def main(): 37 | stockid=input("输入股票代码:") 38 | try: 39 | result=get_flowhistory(stockid) 40 | except: 41 | print('Failed!') 42 | time.sleep(10) 43 | return 44 | write_to_excel(result,stockid) 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /taobao/suggest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | import os 5 | import chardet 6 | 7 | headers = { 8 | ':authority':'suggest.taobao.com', 9 | 'user-agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36', 10 | 'Accept':"*/*", 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | 16 | def suggest(keyword): 17 | html=requests.get('https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null'.format(keyword),headers=headers).text 18 | data=json.loads(html)['result'] 19 | result=[] 20 | for item in data: 21 | result.append(item[0].replace('','').replace('','')) 22 | return result 23 | 24 | def get_chardet(filename): 25 | data=open(filename,'rb').read() 26 | coding=chardet.detect(data) 27 | return coding['encoding'] 28 | 29 | def loadkeywords(): 30 | keywords={} 31 | for filename in os.listdir('keywords'): 32 | if '.txt' not in filename: 33 | continue 34 | encoding=get_chardet('keywords/'+filename) 35 | if encoding=='GB2312': 36 | encoding='GBK' 37 | keywords[filename]=[] 38 | for line in open('keywords/'+filename,'r',encoding=encoding): 39 | word=line.replace('\r','').replace('\n','') 40 | keywords[filename].append(word) 41 | return keywords 42 | 43 | def save_to_txt(filename,deep,words): 44 | f=open('result/'+filename.replace('.txt','_%s.txt'%deep),'w',encoding='utf-8') 45 | writed=[] 46 | for word in words: 47 | if word in writed: 48 | continue 49 | writed.append(word) 50 | f.write(word+'\r\n') 51 | f.close() 52 | 53 | def main(): 54 | keywords=loadkeywords() 55 | while True: 56 | try: 57 | deep=input("输入采集深度:") 58 | deep=int(deep) 59 | break 60 | except: 61 | pass 62 | for filename in keywords: 63 | result=[] 64 | for word in keywords[filename]: 65 | words=[word] 66 | count=0 67 | for num in range(deep): 68 | suggest_words=[] 69 | for need_word in words: 70 | try: 71 | suggest_words+=suggest(need_word) 72 | except: 73 | continue 74 | suggest_words=list(set(suggest_words)) 75 | 
words=suggest_words 76 | count+=len(suggest_words) 77 | result+=suggest_words 78 | print(word,'deep',num+1) 79 | print(word,'get',count,'ok') 80 | save_to_txt(filename,deep,result) 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /weibo/weibo.md: -------------------------------------------------------------------------------- 1 | ###Crawling Sina Weibo with Python 2 | ####1. Simulated login 3 | Here I log in with selenium and then read the cookies from the logged-in browser, which is quick and avoids having to script the login protocol by hand. requests can then reuse these cookies to crawl as a logged-in user. 4 | 5 | ```python 6 | from selenium import webdriver 7 | 8 | def login(username,password): 9 | browser=webdriver.PhantomJS('./phantomjs') 10 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')# open the login page 11 | browser.set_page_load_timeout(10) 12 | time.sleep(5)# wait for the page to finish loading 13 | browser.find_element_by_id('loginName').send_keys(username)# fill in the username 14 | browser.find_element_by_id('loginPassword').send_keys(password)# fill in the password 15 | browser.find_element_by_id('loginAction').click()# click the login button 16 | time.sleep(5) 17 | cookies=browser.get_cookies()# read the cookies of the logged-in session 18 | result={} 19 | for item in cookies: 20 | try: 21 | result[item['name']]=item['value'] 22 | except: 23 | continue 24 | return result# return the cookies as a dict 25 | 26 | ``` 27 | requests cannot use the manually built cookies directly, so the dict-style cookies have to be converted into a CookieJar 28 | 29 | ```python 30 | import requests 31 | import os 32 | 33 | def weibo(): 34 | if os.path.isfile('cookies'): 35 | cookies=eval(open('cookies','r').read()) 36 | else: 37 | cookies=login('username','password')# log in and fetch fresh cookies 38 | session=requests.session() 39 | session.cookies=requests.utils.cookiejar_from_dict(cookies)# convert the dict into a CookieJar and attach it to the session 40 | return session 41 | 42 | ``` 43 | 44 | ####2. Fetching the home-timeline posts 45 | ```python 46 | import json 47 | 48 | headers = { 49 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 50 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | 'Accept-Language': 'en-US,en;q=0.5', 52 | 'Accept-Encoding': 'gzip, deflate', 53 | 'Connection': 'keep-alive'} 54 | 55 | session=weibo() 56 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text 57 | data=json.loads(html)[0]['card_group'] 58 | result=[] 59 | for item in data: 60 | user=item['mblog']['user']['screen_name'] 61 | text=item['mblog']['text'] 62 | weiboid=item['mblog']['idstr'] 63 | result.append({'user':user,'text':text}) 64 | print(result) 65 | ``` 66 | 67 | ####3. Fetching the comments of a post 68 | 69 | ```python 70 | 71 | def get_comments(session,weiboid): 72 | page=1 73 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text 74 | data=json.loads(html)[1]['card_group'] 75 | comments=[] 76 | for item in data: 77 | comment={} 78 | comment['user']=item['user']['screen_name'] 79 | comment['date']=item['created_at'] 80 | comment['text']=item['text'] 81 | comments.append(comment) 82 | return comments 83 | ``` 84 | -------------------------------------------------------------------------------- /weibo/weibo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from selenium import webdriver 4 | import time 5 | import os 6 | import json 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def login(username,password): 16 | browser=webdriver.PhantomJS('/home/nyloner/phantomjs/phantomjs') 17 | #browser=webdriver.Firefox() 18 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F') 19 | browser.set_page_load_timeout(10) 20 | time.sleep(5) 21 | browser.find_element_by_id('loginName').send_keys(username) 22 | browser.find_element_by_id('loginPassword').send_keys(password) 23 | browser.find_element_by_id('loginAction').click() 24 | time.sleep(5) 25 | cookies=browser.get_cookies() 26 | result={} 27 | for item in cookies: 28 | try: 29 | result[item['name']]=item['value'] 30 | except: 31 | continue 32 | f=open('cookies','w') 33 | f.write(str(result)) 34 | f.close() 35 | return result 36 | 37 | def weibo(): 38 | if os.path.isfile('cookies'): 39 | cookies=eval(open('cookies','r').read()) 40 | else: 41 | cookies=login('username','password') 42 | session=requests.session() 43 | session.cookies=requests.utils.cookiejar_from_dict(cookies) 44 | html=session.get('http://m.weibo.cn',headers=headers).text 45 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text 46 | data=json.loads(html)[0]['card_group'] 47 | result=[] 48 | for item in data: 49 | user=item['mblog']['user']['screen_name'] 50 | text=item['mblog']['text'] 51 | result.append({'user':user,'text':text}) 52 | print(result) 53 | print(get_comments(session,'4013542757481643')) 54 | 55 | def get_comments(session,weiboid): 56 | page=1 57 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text 58 | data=json.loads(html)[1]['card_group'] 59 | comments=[] 60 | for item in data: 61 | comment={} 62 | comment['user']=item['user']['screen_name'] 63 | comment['date']=item['created_at'] 64 | comment['text']=item['text'] 65 | comments.append(comment) 66 | return comments 67 | 68 | weibo() 69 | -------------------------------------------------------------------------------- /weidian/weidian.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from selenium import webdriver 6 | import time 7 | import re 8 | 9 | headers = { 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"} 15 | 16 | def get_place(): 17 | f=open('place.txt','w') 18 | browser=webdriver.Firefox() 19 | #html=requests.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418',headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 20 | browser.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418') 21 | time.sleep(10) 22 | html=browser.page_source 23 | table=BeautifulSoup(html,'lxml').find('div',id='show-place').find_all('ul') 24 | places={} 25 | print(html) 26 | for item in table[1:]: 27 | for li in item.find_all('li'): 28 | 
places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href') 29 | for li in table[0].find_all('li'): 30 | places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href') 31 | for key in places: 32 | text=key+'||'+places[key]+'\n' 33 | f.write(text) 34 | f.close() 35 | 36 | def get_shop(): 37 | f=open('shops.txt','a',encoding='utf-8') 38 | for line in open('place.txt').readlines(): 39 | city=line.split('||')[0] 40 | place=re.findall('place=(.*?)&',line)[0] 41 | page=0 42 | while True: 43 | url='http://api.buyer.weidian.com/h5/appserver_nearbyShop.do?place='+place+'&seed=0&category=%E7%AE%B1%E5%8C%85&limit=50&page='+str(page)+'&callback=jsonp4&rnd=0.8898308666990978' 44 | html=requests.get(url,headers=headers).text 45 | rel='"shopid":"(.*?)","entranceName":"(.*?)","address":"(.*?)"' 46 | lists=re.findall(rel,html) 47 | if lists==[]: 48 | break 49 | for item in lists: 50 | text=item[0]+'||'+item[1]+'||'+item[2] 51 | f.write(text+'\n') 52 | print(city+place+'--'+str(page)) 53 | page+=1 54 | f.close() 55 | 56 | def get_weixin(): 57 | f=open('data.txt','a') 58 | for line in open('shops.txt'): 59 | line=line.replace('\n','') 60 | shopurl='http://weidian.com/?userid='+line.split('||')[0] 61 | html=requests.get(shopurl,headers=headers).text 62 | try: 63 | html=requests.get(shopurl,headers=headers).text 64 | rel='微信: (.*?)<' 65 | weixin=re.findall(rel,html)[0] 66 | except: 67 | continue 68 | print(line+'---OK') 69 | line=line+'||'+weixin+'\n' 70 | f.write(line) 71 | 72 | def main(): 73 | #get_shop() 74 | get_weixin() 75 | 76 | main() 77 | -------------------------------------------------------------------------------- /wenda.so.com/search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | from selenium import webdriver 5 | 6 | headers = { 7 | 'Host':"wenda.so.com", 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | browser=webdriver.Firefox() 15 | browser.get('http://wenda.so.com/') 16 | browser.implicitly_wait(10) 17 | def search(key): 18 | #html=requests.get('http://wenda.so.com/search/?q='+key,headers=headers,timeout=30).text 19 | browser.get('http://wenda.so.com/search/?q='+key) 20 | time.sleep(0.5) 21 | html=browser.page_source 22 | table=BeautifulSoup(html,'lxml').find_all('li',{'class':'item'}) 23 | for item in table: 24 | try: 25 | url=item.find('a').get('href') 26 | if 'q/' in url: 27 | return 'http://wenda.so.com/'+url 28 | except: 29 | continue 30 | 31 | def get_questions(): 32 | for word in open('failed_words','r'): 33 | word=word.replace('\r','').replace('\n','') 34 | try: 35 | url=search(word) 36 | except: 37 | failed=open('failed.txt','a') 38 | failed.write(word+'\n') 39 | failed.close() 40 | continue 41 | if url==None: 42 | failed=open('failed.txt','a') 43 | failed.write(word+'\n') 44 | failed.close() 45 | continue 46 | f=open('question_','a') 47 | f.write(word+'||'+url+'\n') 48 | print(word,'ok') 49 | f.close() 50 | 51 | get_questions() 52 | -------------------------------------------------------------------------------- /wenshu.court.gov.cn/download.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import 
BeautifulSoup 3 | import json 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def doclist(page,Param="",Order="裁判日期"): 13 | data={ 14 | 'Param':Param, 15 | 'Index':page, 16 | 'Page':"20", 17 | 'Order':Order, 18 | 'Direction':"desc" 19 | } 20 | html=requests.post('http://wenshu.court.gov.cn/List/ListContent',data=data,headers=headers).text 21 | data=json.loads(html) 22 | data=eval(data) 23 | result=[] 24 | for item in data: 25 | if 'Count' in item: 26 | continue 27 | result.append(item) 28 | return result 29 | 30 | def download(docid,title): 31 | data={ 32 | 'conditions':'', 33 | 'docIds':docid+'|'+title+'|', 34 | 'keyCode':"" 35 | } 36 | content=requests.post('http://wenshu.court.gov.cn/CreateContentJS/CreateListDocZip.aspx?action=1',data=data,headers=headers).content 37 | with open('result/%s.doc'%docid,'wb') as f: 38 | f.write(content) 39 | 40 | if __name__ == '__main__': 41 | docs=doclist(1) 42 | try: 43 | import os 44 | os.mkdir('result') 45 | except: 46 | pass 47 | for item in docs: 48 | download(item['文书ID'],item['案件名称']) 49 | print(item['案件名称']) 50 | -------------------------------------------------------------------------------- /worldfreightrates/trates.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import re 5 | import xlrd 6 | import xlwt3 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_port(name): 15 | name=name.replace(' ','+') 16 | count=0 17 | statue=True 18 | while statue: 19 | try: 20 | html=requests.get('http://worldfreightrates.com/calculator/ports?term=%s'%name,headers=headers,timeout=30).text 21 | statue=False 22 | except: 23 | count+=1 24 | if count==3: 25 | return False 26 | continue 27 | try: 28 | data=eval(html) 29 | Id=data[0]['id'] 30 | return Id 31 | except: 32 | return False 33 | 34 | def get_infor(fromid,toid,commodityName): 35 | url='http://worldfreightrates.com/en/calculator/ocean/rate?fromId='+fromid+'&toId='+toid+'&oceanType=FCL&commodityName='+commodityName+'&commodityValue=100&includeInsurance=false&includeReefer=false&includeHazardous=false&unit=lb&containerSize=40' 36 | html=requests.get(url,headers=headers,timeout=50).text.replace('\\','') 37 | rel='"result">(.*?)

' 38 | try: 39 | result=re.findall(rel,html)[0] 40 | except: 41 | result='' 42 | return result 43 | 44 | def main(): 45 | data = xlrd.open_workbook('data/data.xlsx') 46 | table = data.sheets()[0] 47 | excel=xlwt3.Workbook() 48 | sheet=excel.add_sheet('sheet') 49 | for row in range(table.nrows): 50 | print(row) 51 | fromport=table.cell(row,0).value 52 | toport=table.cell(row,1).value 53 | commodityName=table.cell(row,2).value 54 | Load_Type=table.cell(row,3).value 55 | fromid=get_port(fromport) 56 | toid=get_port(toport) 57 | if fromid==False or toid==False: 58 | sheet.write(row,0,fromport) 59 | sheet.write(row,1,toport) 60 | sheet.write(row,2,commodityName) 61 | sheet.write(row,3,Load_Type) 62 | sheet.write(row,4,'') 63 | excel.save('data/result.xls') 64 | continue 65 | try: 66 | result=get_infor(fromid,toid,commodityName.replace('&','%26').replace(' ','+').replace(',','%2C')) 67 | except: 68 | result='' 69 | sheet.write(row,0,fromport) 70 | sheet.write(row,1,toport) 71 | sheet.write(row,2,commodityName) 72 | sheet.write(row,3,Load_Type) 73 | sheet.write(row,4,result) 74 | excel.save('data/result.xls') 75 | main() 76 | -------------------------------------------------------------------------------- /www.18ladys.com/18ladys.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import re 5 | import openpyxl 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | def get_names(): 14 | page=1 15 | while page<21: 16 | html=requests.get('http://www.18ladys.com/cyzy/index.asp?page='+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk') 17 | table=BeautifulSoup(html,'lxml').find('div',{'class':'tb1'}).find_all('a') 18 | f=open('names.txt','a') 19 | for item in table: 20 | try: 21 | name=item.get_text() 22 | url='http://www.18ladys.com/cyzy/'+item.get('href') 23 | f.write(name+'|'+url+'\n') 24 | except: 25 | continue 26 | f.close() 27 | print(page) 28 | page+=1 29 | 30 | def get_infor(name,url): 31 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk','ignore') 32 | text=BeautifulSoup(html,'lxml').find('dd',{'class':'f14 jl4'}).find('p').get_text().replace('【','||【').replace('\r','').replace('\n','') 33 | text=text.split('||') 34 | result={'name':name} 35 | for item in text: 36 | try: 37 | name_value=item.split('】') 38 | name=name_value[0].replace('【','') 39 | value=name_value[1] 40 | result[name]=value 41 | except: 42 | continue 43 | return result 44 | 45 | def crawler(): 46 | for line in open('names.txt','r'): 47 | line=line.replace('\n','') 48 | name=line.split('|')[0] 49 | url=line.split('|')[1] 50 | try: 51 | item=get_infor(name,url) 52 | except: 53 | failed=open('failed','a') 54 | failed.write(line+'\n') 55 | failed.close() 56 | f=open('result.txt','a') 57 | f.write(str(item)+'\n') 58 | f.close() 59 | print(line,'ok') 60 | 61 | def write_to_excel(): 62 | excel=openpyxl.Workbook(write_only=True) 63 | sheet=excel.create_sheet() 64 | keys=['name','异名','别名','来源','植物形态','功用主治','用法与用量','炮制'] 65 | sheet.append(keys) 66 | for line in open('result.txt','r'): 67 | item=eval(line) 68 | infor=[] 69 | for key in keys: 70 | try: 71 | infor.append(item[key]) 72 | except: 73 | infor.append('') 74 | sheet.append(infor) 75 | 
excel.save('result.xlsx') 76 | 77 | crawler() -------------------------------------------------------------------------------- /www.58.com/sendemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | from email.mime.text import MIMEText 3 | from email.mime.multipart import MIMEMultipart 4 | from email.header import Header 5 | import time 6 | 7 | 8 | def sendmail(): 9 | sender = 'xxx@qq.com' 10 | receivers = ['xxx@qq.com'] # 接收邮件,可设置为你的QQ邮箱或者其他邮箱 11 | #创建一个带附件的实例 12 | message = MIMEMultipart() 13 | message['From'] = Header("xxxx", 'utf-8') 14 | message['To'] = Header("xxx@qq.com", 'utf-8') 15 | subject ='time.strftime("%Y-%m-%d %H:%M:%S")' 16 | message['Subject'] = Header(subject, 'utf-8') 17 | #邮件正文内容 18 | message.attach(MIMEText('time.strftime("%Y-%m-%d %H:%M:%S")', 'plain', 'utf-8')) 19 | att1 = MIMEText(open('result.xls', 'rb').read(), 'base64', 'utf-8') 20 | att1["Content-Type"] = 'application/octet-stream' 21 | # 这里的filename可以任意写,写什么名字,邮件中显示什么名字 22 | att1["Content-Disposition"] = 'attachment; filename="result.xls"' 23 | message.attach(att1) 24 | server=smtplib.SMTP_SSL('smtp.qq.com') 25 | server.ehlo('smtp.qq.com') 26 | server.login(sender,passwd) 27 | server.sendmail(sender, receivers, message.as_string()) 28 | 29 | sendmail() 30 | -------------------------------------------------------------------------------- /www.airbnb.com/deal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import xlwt3 4 | 5 | 6 | def deal_userdata(): 7 | userresult=open('userresult.txt','w') 8 | for line in open('userdata.txt','r'): 9 | line=line.replace('\n','') 10 | lists=line.split('||') 11 | try: 12 | allreview=int(lists[-2].replace('Reviews','')) 13 | except: 14 | allreview=0 15 | try: 16 | hostreview=int(lists[-1]) 17 | except: 18 | hostreview=0 19 | try: 20 | prereview=allreview-hostreview 21 | except: 22 | prereview='--' 23 | result='' 24 | for i in lists: 25 | result+=i+'||' 26 | result+=str(prereview) 27 | userresult.write(result+'\n') 28 | userresult.close() 29 | 30 | def replace_r(): 31 | room=open('roomtxt.txt','w') 32 | f=open('roomdata.txt','r').readlines() 33 | for line in f: 34 | line=line.replace('\r','').replace('\n','') 35 | room.write(line+'\n') 36 | room.close() 37 | 38 | def Excel(): 39 | Response_rate='Response rate:(.*?)Response' 40 | Response_time='Response time:(.*?hours)' 41 | users=open('userresult.txt','r').readlines() 42 | rooms=open('roomtxt.txt','r').readlines() 43 | excel=xlwt3.Workbook() 44 | usersheet=excel.add_sheet('user') 45 | roomsheet=excel.add_sheet('room') 46 | count=0 47 | for line in rooms: 48 | lists=line.replace('\n','').split('||') 49 | for user in users: 50 | if lists[5] in user: 51 | try: 52 | rate=re.findall(Response_rate,line)[0] 53 | except: 54 | rate='--' 55 | try: 56 | time=re.findall(Response_time,line)[0] 57 | except: 58 | time='--' 59 | num=0 60 | for i in lists: 61 | try: 62 | i=i.split('?')[0] 63 | i=i.split(':')[-1] 64 | i=i.replace('/rooms/','') 65 | i=i.replace('/users/show/','') 66 | except: 67 | pass 68 | roomsheet.write(count,num,i) 69 | num+=1 70 | roomsheet.write(count,num,rate) 71 | num+=1 72 | roomsheet.write(count,num,time) 73 | num=0 74 | for i in user.replace('\n','').split('||'): 75 | try: 76 | i=i.split('?')[0] 77 | i=i.split(':')[-1] 78 | i=i.replace('/rooms/','') 79 | i=i.replace('/users/show/','') 80 | except: 81 | pass 82 | usersheet.write(count,num,i) 83 | num+=1 84 | count+=1 85 | excel.save('result.xls') 
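# A small, hedged usage example for the Response_rate / Response_time patterns used in Excel() above;
# the sample string is hypothetical, standing in for one flattened host-profile line from roomdata.txt.
import re

sample = 'Response rate:100%Response time:within a few hours'
print(re.findall('Response rate:(.*?)Response', sample))  # -> ['100%']
print(re.findall('Response time:(.*?hours)', sample))     # -> ['within a few hours']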
86 | 87 | Excel() 88 | -------------------------------------------------------------------------------- /www.airbnb.com/rooms.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def rooms(url): 13 | html=requests.get(url,headers=headers).text 14 | try: 15 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'listings-container'}).find_all('div',attrs={'class':'listing'}) 16 | except: 17 | return False 18 | result=[] 19 | for item in table: 20 | try: 21 | price=item.find('div',attrs={'class':'price-amount-container'}).get_text() 22 | except: 23 | price='--' 24 | try: 25 | media=item.find('div',attrs={'class':'media'}) 26 | title=media.find('h3').get_text() 27 | userurl=media.find('a').get('href') 28 | roomurl=media.find('h3').find('a').get('href') 29 | except: 30 | continue 31 | a=media.find('a',attrs={'class':'text-normal link-reset'}) 32 | try: 33 | rating=a.find('div',attrs={'class':'star-rating'}).find('div').find_all('i') 34 | star=len(rating) 35 | clases=[] 36 | for i in rating: 37 | clases+=i.get('class') 38 | if 'icon-star-half' in clases: 39 | star=star-0.5 40 | except: 41 | star='--' 42 | try: 43 | review=a.get_text().replace('\r','').replace('\n','').replace(' ','') 44 | review=re.findall('(\d+)reviews',review)[0] 45 | except: 46 | review='--' 47 | text=title+'||'+price+'||'+review+'||'+str(star)+'||'+roomurl+'||'+userurl 48 | result.append(text.replace('\r','').replace('\n','').replace(' ','')) 49 | return result 50 | 51 | def getrooms(): 52 | citys="Chicago,Vancouver,Montreal,Portland,Philadelphia,Denver,Austin,D.C.,New Orleans,Phoenix,San Diego,Nashville,Paris,Berlin,Rome,Amsterdam,Barcelona,Copenhagen,Prague,Budapest,Stockholm,Florence,Edinburgh,Istanbul,Sydney,Melbourne,Cape Town,Beijing,Shanghai,Tokyo" 53 | failed=open('failed.txt','a',encoding='utf-8') 54 | for city in citys.split(','): 55 | print(city) 56 | url_f=open('urls.txt','a',encoding='utf-8') 57 | url='https://www.airbnb.com/s/'+city.replace(' ','+').replace('.','%252E') 58 | page=1 59 | pre=[] 60 | while True: 61 | result=rooms(url+'?ss_id=v5im73ob&page=%s'%page) 62 | if result==pre: 63 | break 64 | pre=result 65 | if result==False: 66 | failed.write(city+'--'+str(page)) 67 | break 68 | for item in result: 69 | url_f.write(city+'||'+item+'\n') 70 | print(city,'--',page) 71 | page+=1 72 | if(page==18): 73 | break 74 | url_f.close() 75 | url_f.close() 76 | failed.close() 77 | 78 | getrooms() 79 | -------------------------------------------------------------------------------- /www.baikemy.com/disease.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def disease_list(): 15 | page=1 16 | f=open('urls.txt','w',encoding='utf-8') 17 | while True: 18 | 
try: 19 | html=requests.get('http://www.baikemy.com/disease/list/0/0?pageIndex='+str(page),headers=headers,timeout=30).text 20 | except: 21 | continue 22 | table=BeautifulSoup(html,'lxml').find('div',{'class':'ccjb_jbli'}).find_all('li') 23 | for li in table: 24 | try: 25 | name=li.find('a').get_text() 26 | url='http://www.baikemy.com/'+li.find('a').get('href').replace('view','detail')+'/1/' 27 | f.write(name+'|'+url+'\n') 28 | except: 29 | pass 30 | if len(table)==1: 31 | break 32 | print('page %s urls get'%page) 33 | page+=1 34 | f.close() 35 | 36 | def disease_infor(name,url): 37 | html=requests.get(url,headers=headers,timeout=30).text 38 | table=BeautifulSoup(html,'lxml').find('div',{'class':'lemma-main'}).find_all('div',{'class':'lemma-main-content'}) 39 | result=[name] 40 | for item in table: 41 | try: 42 | key=item.find('span',{'class':'headline-content'}).get_text() 43 | value=item.find('div',{'class':'para'}).get_text() 44 | result.append(key+':\t '+value) 45 | except: 46 | continue 47 | return result 48 | 49 | def write_to_excel(result): 50 | excel=openpyxl.Workbook(write_only=True) 51 | sheet=excel.create_sheet() 52 | for line in result: 53 | try: 54 | sheet.append(line) 55 | except: 56 | pass 57 | excel.save('result.xlsx') 58 | 59 | def main(): 60 | disease_list() 61 | result=[] 62 | for line in open('urls.txt','r',encoding='utf-8'): 63 | line=line.replace('\n','') 64 | try: 65 | name=line.split('|')[0] 66 | url=line.split('|')[1] 67 | except: 68 | continue 69 | try: 70 | data=disease_infor(name,url) 71 | except: 72 | failed=open('failed.txt','a',encoding='utf-8') 73 | failed.write(line+'\r\n') 74 | failed.close() 75 | continue 76 | result.append(data) 77 | try: 78 | print(name,'ok') 79 | except: 80 | pass 81 | write_to_excel(result) 82 | print('完成') 83 | 84 | 85 | main() 86 | time.sleep(60) 87 | -------------------------------------------------------------------------------- /www.chazidian.com/yuwen.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 11 | 12 | 13 | def get_terms(): 14 | html=open('html.html','r').read() 15 | table=BeautifulSoup(html).find_all('span',{'class':'y-l'}) 16 | urls=[] 17 | f=open('terms.txt','w') 18 | for item in table: 19 | try: 20 | term=item.find('h4').get_text() 21 | publishers=item.find_all('p') 22 | for p in publishers: 23 | publisher=p.get_text() 24 | links=p.find_all('a') 25 | for a in links: 26 | url=a.get('href') 27 | f.write(term+'|'+publisher+'|'+a.get_text()+'|'+url+'\n') 28 | except: 29 | continue 30 | f.close() 31 | 32 | def get_article_url(term_url): 33 | html=requests.get('http://yuwen.chazidian.com'+term_url,headers=headers).text 34 | table=BeautifulSoup(html,'lxml').find('div',id='mulu').find_all('div',{'class':'mldy'}) 35 | result=[] 36 | num=1 37 | for item in table: 38 | title=item.find('a').get_text() 39 | url=item.find('a').get('href').replace('kewen','kewendetail') 40 | line=str(num)+'|'+title+'|'+url 41 | result.append(line) 42 | num+=1 43 | return result 44 | 45 | def get_urls(): 46 | for line in open('terms.txt','r'): 47 | line=line.replace('\n','') 48 | url=line.split('|')[-1] 49 | result=get_article_url(url) 50 | 
f=open('urls.txt','a') 51 | for item in result: 52 | f.write(line+'|'+item+'\n') 53 | f.close() 54 | print(line) 55 | time.sleep(1) 56 | 57 | def get_article_content(url): 58 | html=requests.get(url,headers=headers).text 59 | content=BeautifulSoup(html,'lxml').find('div',id='print_content').get_text() 60 | return content 61 | 62 | def main(): 63 | excel=openpyxl.Workbook(write_only=True) 64 | sheet=excel.create_sheet() 65 | for line in open('urls.txt','r'): 66 | line=line.replace('\n','') 67 | infor_list=line.split('|') 68 | url=infor_list[-1] 69 | try: 70 | content=get_article_content(url) 71 | except: 72 | failed=open('failed.txt','a') 73 | failed.write(line+'\n') 74 | failed.close() 75 | continue 76 | sheet.append(infor_list+[content]) 77 | print(line) 78 | time.sleep(0.5) 79 | excel.save('result.xlsx') 80 | main() 81 | -------------------------------------------------------------------------------- /www.china-10.com/china10.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_kinds(): 15 | f=open('types.txt','w') 16 | url='http://www.china-10.com/brand/' 17 | html=requests.get(url).text 18 | table=BeautifulSoup(html,'lxml').find('div',id='menubox').find('ul',id='conmenu').find_all('li',attrs={'class':'menu'}) 19 | for item in table[1:-3]: 20 | key=item.find('a').get_text().replace('\n','')+'||' 21 | for li in item.find_all('li'): 22 | f.write(key+li.find('a').get('title')+'||'+li.find('a').get('href')+'\n') 23 | f.close() 24 | 25 | def get_brands(): 26 | f=open('types.txt','r') 27 | data=open('brands.txt','w') 28 | for line in f.readlines(): 29 | print(line) 30 | line=line.replace('\n','') 31 | page=1 32 | while True: 33 | html=requests.get(line.split('||')[-1]+'?action=ajax&page='+str(page),headers).text 34 | page+=1 35 | table=BeautifulSoup(html,'lxml').find_all('li') 36 | if(table==[]): 37 | break 38 | for item in table: 39 | text=line+'||'+item.get_text()+'||'+item.find('a').get('href')+'\n' 40 | data.write(text) 41 | print(page) 42 | f.close() 43 | 44 | def get_infor(line): 45 | html=requests.get(line.split('||')[-1],headers=headers).text 46 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'brandinfo'}) 47 | des=table.find('dd').get_text() 48 | line+='||'+des 49 | table=table.find('ul').find_all('li') 50 | for li in table: 51 | line+='||'+li.get_text().replace('\r','').replace('\n','').replace('\t','').replace(' ','') 52 | return line 53 | 54 | def main(): 55 | data=open('data.txt','a') 56 | failed=open('failed.txt','a') 57 | count=0 58 | for line in open('brands.txt','r').readlines(): 59 | line=line.replace('\n','') 60 | try: 61 | line=get_infor(line) 62 | except: 63 | failed.write(line+'\n') 64 | continue 65 | data.write(line+'\n') 66 | count+=1 67 | time.sleep(1) 68 | print(count) 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /www.china-10.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import xlwt3 4 | import re 5 | 6 | def excel(): 7 | f=open('data.txt','r') 8 | ex=xlwt3.Workbook() 9 | 
sheet=ex.add_sheet('sheet') 10 | count=0 11 | rels=['品牌等级:(.*?)\|\|','关注指数:(.*?)\|\|','\|\|.*?董事.*?:(.*?)品牌创立','时间:(.*?)\|\|','发源地:(.*?)\|\|','官方网站:(.*?)\|\|','客服电话:(.*?)\|\|','告词:(.*?)\|\|','(产品\d+)]','(网点\d+)]','(新闻\d+)]','(网店.*?)]'] 12 | for line in f.readlines(): 13 | line=line.replace('\n','').replace('信用指数:','') 14 | lists=[] 15 | for rel in rels: 16 | try: 17 | i=re.findall(rel,line)[0] 18 | except: 19 | i='--' 20 | lists.append(i) 21 | strs=line.split('||') 22 | sheet.write(count,0,strs[0]) 23 | sheet.write(count,1,strs[1]) 24 | sheet.write(count,2,strs[2]) 25 | sheet.write(count,3,strs[3]) 26 | sheet.write(count,4,strs[4]) 27 | sheet.write(count,5,strs[5]) 28 | num=6 29 | for i in lists: 30 | sheet.write(count,num,i) 31 | num+=1 32 | sheet.write(count,num,strs[-1]) 33 | count+=1 34 | ex.save('data.xls') 35 | 36 | excel() 37 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/CaptchaOCR.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/CaptchaOCR.dll -------------------------------------------------------------------------------- /www.chuanlaoda.cn/py2exe_install.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from distutils.core import setup 3 | import py2exe 4 | 5 | setup(console=["chuanlaoda.py"]) 6 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/testdll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | 4 | from ctypes import * 5 | 6 | ocrpasswd = "868197D30CC624FD3C2E2EE66494DA5F" 7 | #VcodeInit 初始换引擎函数 只有一个参数 为引擎初始化密码 失败返回-1 此函数只需调用一次 切勿多次调用 。 8 | dll = windll.LoadLibrary('CaptchaOCR.dll') 9 | load_ocr = dll.VcodeInit 10 | load_ocr.argtypes = [c_char_p] 11 | load_ocr.restypes = c_int 12 | index = load_ocr(ocrpasswd.encode('utf-8')) 13 | img_string = open(imgname, "rb").read() 14 | img_buffer = create_string_buffer(img_string) 15 | #申请接收识别结果的缓冲区 一定要申请 16 | ret_buffer = create_string_buffer(15) 17 | #调用此函数之前,如果已经初始化成功过识别引擎函数 那么无需再调用初始化函数 18 | #GetVcode 识别函数 参数1为 VcodeInit 返回值 index 参数2为图片数据 参数3为图片大小 参数4为接收识别结果 需要给变量申请内存 如 ret_buffer = create_string_buffer(10) 19 | get_code_from_buffer = dll.GetVcode 20 | get_code_from_buffer(index, byref(img_buffer), len(img_buffer), byref(ret_buffer)) 21 | print (ret_buffer.value.decode('utf-8')) 22 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/x64/CaptchaOCR.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/x64/CaptchaOCR.dll -------------------------------------------------------------------------------- /www.cpbz.gov.cn/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | 4 | def load_result(): 5 | result=[] 6 | for line in open('result.txt','r'): 7 | item=eval(line) 8 | baseinfor=[item['url']] 9 | for key in ['机构名称','法定代表人','组织机构代码','邮政编码','注册地址','行政区划']: 10 | try: 11 | baseinfor.append(item['企业基本信息'][key]) 12 | except: 13 | baseinfor.append('') 14 | numbers=[] 15 | try: 16 | for num_line in item['技术指标']: 17 | numbers+=num_line 18 | except: 19 | pass 20 | for key in 
['标准名称','标准编号','公开时间','url']: 21 | try: 22 | baseinfor.append(item['标准信息'][key]) 23 | except: 24 | baseinfor.append('') 25 | try: 26 | products=item['产品信息'] 27 | except: 28 | products=[] 29 | for product in products: 30 | product[-1]=item['standardStatus'] 31 | yield baseinfor+product+numbers 32 | 33 | def write_to_excel(): 34 | excel=openpyxl.Workbook(write_only=True) 35 | sheet=excel.create_sheet() 36 | for line in load_result(): 37 | sheet.append(line) 38 | excel.save('result.xlsx') 39 | 40 | write_to_excel() 41 | -------------------------------------------------------------------------------- /www.ctrip.com/comments.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | browser=webdriver.Chrome("./chromedriver") 6 | browser.get('http://hotels.ctrip.com/hotel/zhuhai31') 7 | browser.implicitly_wait(10) 8 | hotels=[eval(line) for line in open('hotels.txt','r')] 9 | flag=True 10 | for hotel in hotels: 11 | hotel_id=hotel[2].split('.')[0].split('/')[-1] 12 | if hotel_id!='1353810' and flag: 13 | continue 14 | flag=False 15 | page=1 16 | ''' 17 | if hotel_id=='435300': 18 | page=54 19 | ''' 20 | endpage=1000 21 | while page<=endpage: 22 | try: 23 | browser.get('http://hotels.ctrip.com/hotel/dianping/%s_p%st0.html'%(hotel_id,page)) 24 | html=browser.page_source 25 | except: 26 | continue 27 | time.sleep(2) 28 | try: 29 | browser.find_element_by_class_name('comment_tab_main') 30 | comments=BeautifulSoup(html,'lxml').find('div',{'class':'comment_tab_main'}).find_all('div',{'class':'comment_block'}) 31 | except: 32 | continue 33 | if '以下为酒店3年前历史点评' in str(comments): 34 | print('以下为酒店3年前历史点评') 35 | break 36 | f=open('result_2.txt','a') 37 | for line in comments: 38 | f.write(str(hotel+[str(line)])+'\n') 39 | f.close() 40 | print(page,endpage,hotel[0]) 41 | if endpage==1000: 42 | try: 43 | endpage=BeautifulSoup(html,'lxml').find('div',{'class':'c_page_list'}).find_all('a')[-1].get('value') 44 | endpage=int(endpage) 45 | except: 46 | break 47 | page+=1 48 | -------------------------------------------------------------------------------- /www.ctrip.com/youtrip.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def getUrl(): 13 | f=open('urls.txt','a') 14 | page=1 15 | while True: 16 | html=requests.get('http://you.ctrip.com/travels/guilin28/t3-p{}.html'.format(page),headers=headers).text 17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'journalslist cf'}).find_all('a',attrs={'class':'journal-item cf'}) 18 | for item in table: 19 | title=item.find('dt').get_text().replace('\r','').replace('\n','') 20 | f.write(title+'||'+item.get('href')+'\n') 21 | print(page,'--ok') 22 | page+=1 23 | if page==991: 24 | break 25 | time.sleep(2) 26 | f.close() 27 | 28 | def getcontent(url): 29 | html=requests.get(url,headers=headers).text 30 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'ctd_content'}) 31 | body=soup.get_text() 32 | place=soup.find('div',{'class':'ctd_content_controls cf'}).get_text() 33 | result=body.replace(place,'') 34 | return 
result 35 | 36 | 37 | def main(): 38 | excel=xlwt3.Workbook() 39 | sheet=excel.add_sheet('sheet') 40 | count=0 41 | for line in open('urls.txt','r'): 42 | line=line.replace('\n','') 43 | title=line.split('||')[0] 44 | url='http://you.ctrip.com'+line.split('||')[-1] 45 | try: 46 | content=getcontent(url) 47 | except: 48 | failed=open('failed.txt','a') 49 | failed.write(line+'\n') 50 | failed.close() 51 | continue 52 | sheet.write(count,0,count) 53 | sheet.write(count,1,title) 54 | sheet.write(count,2,content) 55 | count+=1 56 | excel.save('result.xls') 57 | time.sleep(2) 58 | print(count,'--ok') 59 | 60 | -------------------------------------------------------------------------------- /www.dicos.com.cn/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | 5 | def citys(): 6 | f=open('citys.txt','a') 7 | for pid in range(6,33): 8 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getcityhtml&iscity=1&pid=%s'%pid).text 9 | table=BeautifulSoup(html,'lxml').find_all('option') 10 | for item in table: 11 | f.write(item.get_text()+'|'+item.get('value')+'\n') 12 | f.close() 13 | 14 | def get_store(citycode): 15 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getstorehtml&waimai=0&mProvince=3&mCity=%s'%citycode).text 16 | table=BeautifulSoup(html,'lxml').find_all('tr') 17 | result=[] 18 | for item in table: 19 | text='' 20 | for td in item.find_all('td')[1:4]: 21 | text+='|'+td.get_text() 22 | result.append(text.replace('\r','').replace('\n','')) 23 | return result 24 | 25 | def main(): 26 | f=open('result.txt','a') 27 | for line in open('citys.txt'): 28 | line=line.replace('\n','') 29 | try: 30 | result=get_store(line.split('|')[-1]) 31 | except: 32 | failed=open('failed.txt','a') 33 | failed.write(line+'\n') 34 | failed.close() 35 | continue 36 | for item in result: 37 | f.write(line+item+'\n') 38 | print(line,'ok') 39 | f.close() 40 | 41 | def write_to_excel(): 42 | result={} 43 | excel=openpyxl.Workbook(write_only=True) 44 | sheet1=excel.create_sheet('1') 45 | for line in open('result.txt','r'): 46 | line=line.replace('\n','') 47 | lists=line.split('|') 48 | try: 49 | result[lists[0]]+=1 50 | except: 51 | result[lists[0]]=1 52 | sheet1.append(lists) 53 | sheet2=excel.create_sheet('2') 54 | for key in result: 55 | sheet2.append([key,result[key]]) 56 | excel.save('result.xlsx') 57 | 58 | write_to_excel() 59 | -------------------------------------------------------------------------------- /www.eastmoney.com/quote.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import json 4 | 5 | 6 | def get_data(code,market): 7 | url='http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/CompatiblePage.aspx?Type=OB&stk=%s&Reference=xml&limit=0&page=%s' 8 | html=requests.get(url%(code+market,1)).text 9 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"')) 10 | if data['pages']==0: 11 | return False 12 | pages=data['pages'] 13 | page=2 14 | result=[] 15 | for item in data['data']: 16 | result.append(item.split(',')) 17 | while page<=pages: 18 | html=requests.get(url%(code+market,page)).text 19 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"')) 20 | for item in data['data']: 21 | result.append(item.split(',')) 22 | page+=1 23 | return 
result 24 | 25 | def write_to_excel(code,result): 26 | excel=openpyxl.Workbook(write_only=True) 27 | sheet=excel.create_sheet() 28 | for item in result: 29 | sheet.append(item) 30 | excel.save('%s.xlsx'%code) 31 | print(code,'OK') 32 | 33 | def main(): 34 | try: 35 | code=input('输入股票代码:') 36 | except: 37 | print("Faliled") 38 | return 39 | result=[] 40 | for market in ['1','2']: 41 | try: 42 | result=get_data(code,market) 43 | except: 44 | continue 45 | if result==False: 46 | continue 47 | break 48 | if result==[] or result==False: 49 | print('Failed') 50 | return 51 | write_to_excel(code,result) 52 | 53 | while True: 54 | main() -------------------------------------------------------------------------------- /www.eastmoney.com/transaction.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import re 4 | import time 5 | import os 6 | 7 | 8 | def get_data(code,market): 9 | url='http://nufm3.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=%s&sty=DPTTFD&st=z&sr=1&p=1&ps=&cb=&token=beb0a0047196124721f56b0f0ff5a27c' 10 | html=requests.get(url%(code+market)).text 11 | if 'false' in html: 12 | return False 13 | text=re.findall('"(.*?)"',html)[0] 14 | lines=text.split('|') 15 | result=[] 16 | for line in lines: 17 | result.append(line.split('~')) 18 | return result 19 | 20 | def write_to_excel(code,result): 21 | excel=openpyxl.Workbook(write_only=True) 22 | sheet=excel.create_sheet() 23 | for item in result: 24 | sheet.append(item) 25 | try: 26 | os.mkdir('result/'+code) 27 | except: 28 | pass 29 | date=timenow=time.strftime('%Y-%m-%d',time.localtime()) 30 | excel.save('result/'+code+'/%s.xlsx'%date) 31 | 32 | def get_transaction(code): 33 | global result 34 | for market in ['1','2']: 35 | try: 36 | data=get_data(code,market) 37 | except: 38 | continue 39 | if data==False: 40 | continue 41 | break 42 | if data==[] or data==False: 43 | print('Failed') 44 | return 45 | timenow=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime()) 46 | print(timenow,code,'ok') 47 | is_write=False 48 | for line in data: 49 | if line in result: 50 | continue 51 | result.append(line) 52 | is_write=True 53 | if is_write: 54 | write_to_excel(code,result) 55 | 56 | 57 | code=input('输入股票代码:') 58 | result=[] 59 | while True: 60 | get_transaction(code) 61 | time.sleep(0.5) -------------------------------------------------------------------------------- /www.fang.com/new_hourse.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_house(): 15 | page=1 16 | url='http://newhouse.cs.fang.com/house/s/b9' 17 | while True: 18 | html=requests.get(url+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk') 19 | table=BeautifulSoup(html,'lxml').find('div',{'class':'nhouse_list'}).find_all('li') 20 | f=open('urls.txt','a') 21 | for item in table: 22 | detail=item.find('div',{'class':'nlc_details'}) 23 | house_url=detail.find('a').get('href') 24 | name=detail.find('a').get_text() 25 | address_div=detail.find('div',{'class':'address'}) 26 | address=address_div.find('a').get('title') 27 | 
try: 28 | location=address_div.find('span').get_text() 29 | except: 30 | location='-' 31 | try: 32 | price=detail.find('div',{'class':'nhouse_price'}).find('span').get_text() 33 | except: 34 | price='-' 35 | line=name+'|'+house_url+'|'+price+'|'+location+'|'+address 36 | line=line.replace('\r','').replace('\n','').replace('\t','') 37 | f.write(line+'\n') 38 | f.close() 39 | print(page,'ok') 40 | page+=1 41 | time.sleep(1) 42 | 43 | def get_house_live_history(url): 44 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk') 45 | table=BeautifulSoup(html,'lxml').find('div',id='tc_jiaofang').find_all('tr') 46 | lines=[] 47 | for tr in table[2:-1]: 48 | tds=tr.find_all('td') 49 | date=tds[0].get_text() 50 | month=date.split('-')[1] 51 | infor=tds[1].get_text() 52 | line=month+'|'+date+'|'+infor 53 | lines.append(line.replace('\xa0','')) 54 | return lines 55 | 56 | def house_live_history(): 57 | is_ok=True 58 | for item in open('urls.txt','r'): 59 | item=item.replace('\n','') 60 | url=item.split('|')[1] 61 | if url!='http://jiulongshanjy.fang.com/' and is_ok==True: 62 | continue 63 | is_ok=False 64 | try: 65 | lines=get_house_live_history(url) 66 | except: 67 | lines=[] 68 | print(item) 69 | f=open('changsha.txt','a') 70 | if lines==[]: 71 | f.write(item+'\n') 72 | f.close() 73 | continue 74 | for line in lines: 75 | f.write(item+'|'+line+'\n') 76 | f.close() 77 | time.sleep(1) 78 | 79 | house_live_history() 80 | -------------------------------------------------------------------------------- /www.ganji.com/ganji_tel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import time 7 | import openpyxl 8 | 9 | headers = { 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | 17 | def get_tels(url): 18 | html=requests.get(url,headers=headers).text 19 | try: 20 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'list'}).find_all('li') 21 | except: 22 | return [] 23 | tels=[] 24 | for li in table: 25 | try: 26 | tel=li.find('div',attrs={'class':'list-r-area'}).find('p',attrs={'class':'tel'}).find('span').get_text() 27 | except: 28 | continue 29 | tels.append(tel) 30 | return tels 31 | 32 | 33 | def main(): 34 | url=input('输入链接:') 35 | url=re.sub('o\d+/','',url) 36 | if not url.startswith('http'): 37 | url='http://'+url 38 | page=1 39 | tels=[] 40 | while True: 41 | try: 42 | result=get_tels(url+'o'+str(page)+'/') 43 | except: 44 | continue 45 | if result==[]: 46 | break 47 | tels+=result 48 | print('第%s页--完成'%page) 49 | page+=1 50 | time.sleep(5) 51 | tels=list(set(tels)) 52 | count=0 53 | excel=openpyxl.Workbook(write_only=True) 54 | sheet=excel.create_sheet() 55 | for tel in tels: 56 | sheet.append([tel]) 57 | excel.save('tels.xls') 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /www.gewara.com/reviews.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | 7 | headers = { 8 | 'X-Requested-With':"XMLHttpRequest", 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | 
"Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | 'Referer':"http://www.gewara.com/movie/282568860", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def getreviews(page,relatedid): 17 | html=requests.get('http://www.gewara.com/activity/ajax/sns/qryComment.xhtml?pageNumber={}&relatedid={}&topic=&issue=false&hasMarks=true&isCount=true&tag=movie&isPic=true&isVideo=false&userLogo=&newWalaPage=true&isShare=false&isNew=true&maxCount=200&isWide=true&isTicket=false'.format(page,relatedid),headers=headers).text 18 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'page_wala'}) 19 | result=[] 20 | for item in table: 21 | try: 22 | grade=item.find('span',{'class':'ui_grades left ui_grade10'}).get('title') 23 | reviewid=item.find('div',{'class':'wala_txt'}).get('data-id') 24 | if reviewid==None: 25 | review=item.find('div',{'class':'wala_miniTxt'}).get_text().replace('\r','').replace('\n','').replace('\t','') 26 | result.append({'grade':grade,'review':review}) 27 | continue 28 | result.append({'grade':grade,'id':reviewid}) 29 | except: 30 | continue 31 | return result 32 | 33 | def getcontent(id): 34 | html=requests.get('http://www.gewara.com/activity/sns/ajaxCommentDetail.xhtml?id=%s&isNew=true'%id).text 35 | text=BeautifulSoup(html,'lxml').get_text().replace('\r','').replace('\n','').replace('\t','') 36 | return text 37 | 38 | def write_to_excel(): 39 | excel=openpyxl.Workbook(write_only=True) 40 | sheet=excel.create_sheet() 41 | for line in open('result.txt','r'): 42 | item=eval(line) 43 | sheet.append([item['grade'],item['review']]) 44 | excel.save('result.xlsx') 45 | 46 | def main(): 47 | f=open('result.txt','a') 48 | page=1 49 | count=0 50 | while True: 51 | try: 52 | result=getreviews(page,'282568860') 53 | except: 54 | print('failed') 55 | time.sleep(3) 56 | continue 57 | for item in result: 58 | try: 59 | dataid=item['id'] 60 | except: 61 | count+=1 62 | print(count) 63 | f.write(str(item)+'\n') 64 | continue 65 | try: 66 | review=getcontent(dataid) 67 | except: 68 | continue 69 | item['review']=review 70 | f.write(str(item)+'\n') 71 | count+=1 72 | print(count) 73 | time.sleep(0.5) 74 | print(page,'ok') 75 | page+=1 76 | if page==200: 77 | break 78 | f.close() 79 | 80 | write_to_excel() 81 | -------------------------------------------------------------------------------- /www.imdb.com/boxoffice.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_url(title): 15 | name=re.sub('\(.*?\)','',title)#.lower()#.replace(' ','') 16 | html=requests.get('http://www.boxofficemojo.com/search/?q=%s'%name,headers=headers).text.replace('\r','').replace('\n','').replace('\t','') 17 | rel='bgcolor=#FFFF99>(.*?)' 18 | tr=re.findall(rel,html)[0]#BeautifulSoup(html,'lxml').find('tr',attrs={'bgcolor':'#FFFF99'}) 19 | tds=BeautifulSoup(str(tr),'lxml').find_all('td') 20 | #tds=tr.findall('td') 21 | url='http://www.boxofficemojo.com'+tds[0].find('a').get('href') 22 | de=tds[2].get_text() 23 | html=requests.get(url,headers=headers).text 
24 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'mp_box_content'}).get_text().replace('\r','|').replace('\n','|') 25 | print(table) 26 | line=de 27 | rel='Worldwide:\|(.*?)\|' 28 | try: 29 | wl=re.findall(rel,table)[0] 30 | except: 31 | wl='-' 32 | line=de+'||'+wl 33 | return line 34 | 35 | def main(): 36 | f=open('data.txt','w') 37 | for line in open('new.txt','r').readlines(): 38 | line=line.replace('\r','').replace('\n','') 39 | try: 40 | price=get_url(line) 41 | except: 42 | price='--||--' 43 | f.write(line+'||'+price+'\n') 44 | print(price) 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /www.imdb.com/movies.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | from selenium import webdriver 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | def get_movies(): 16 | f=open('data_movies2013.txt','a') 17 | start=1 18 | while start<8519: 19 | try: 20 | html=requests.get('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&start=%s&title_type=feature&year=2013,2013'%start,headers=headers,timeout=30).text 21 | except: 22 | continue 23 | items=parser(html) 24 | for item in items: 25 | f.write(item+'\n') 26 | start+=50 27 | print(start) 28 | 29 | def parser(html): 30 | items=[] 31 | table=BeautifulSoup(html,'lxml').find('table',attrs={'class':'results'}).find_all('tr')[1:] 32 | for item in table: 33 | td=item.find('td',attrs={'class':'title'}) 34 | title=item.find('a').get('title') 35 | try: 36 | score=td.find('span',attrs={'class':'rating-rating'}).get_text() 37 | except: 38 | score='-' 39 | try: 40 | col=item.find('td',attrs={'class':'sort_col'}).get_text() 41 | except: 42 | col='-' 43 | text=title+'||'+score+'||'+col 44 | items.append(text) 45 | return items 46 | 47 | get_movies() 48 | -------------------------------------------------------------------------------- /www.imdb.com/rottentomatoes.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import threading 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | class Score(threading.Thread): 16 | def __init__(self,line): 17 | super(Score,self).__init__() 18 | self.line=line 19 | self.name=self.line.split('||')[0] 20 | 21 | def run(self): 22 | try: 23 | self.score=self.get_score(self.name) 24 | except: 25 | self.score='-' 26 | print(self.score) 27 | self.line=self.line+'||'+self.score 28 | 29 | def get_score(self,name): 30 | try: 31 | html=requests.get('http://www.rottentomatoes.com/search/?search=%s'%name.replace(' ','+'),headers=headers,timeout=40).text 32 | except: 33 | return self.get_score(name) 34 | try: 35 | table=BeautifulSoup(html,'lxml').find('ul',id='movie_results_ul').find_all('li') 36 | except: 37 | return score(html) 38 | url='' 39 | for li in table: 40 | 
title=li.find('div',attrs={'class':'nomargin media-heading bold'}).get_text().replace('\r','').replace('\n','').replace(' ','') 41 | if title.lower()==name.replace(' ','').lower(): 42 | url='http://www.rottentomatoes.com'+li.find('a').get('href') 43 | break 44 | if(url==''): 45 | return '-' 46 | html=requests.get(url,headers=headers,timeout=40).text 47 | return score(html) 48 | 49 | def score(html): 50 | text=BeautifulSoup(html,'lxml').find('div',id='scorePanel').get_text().replace('\r','').replace('\n','').replace(' ','') 51 | rel='AverageRating:(.*?)R' 52 | try: 53 | result=re.findall(rel,text)[0] 54 | return result 55 | except: 56 | return '-' 57 | 58 | 59 | def main(): 60 | f=open('movies_2013.txt','a') 61 | items=[] 62 | for line in open('data_movies2013.txt','r').readlines(): 63 | line=line.replace('\n','') 64 | items.append(line) 65 | if(len(items)<40): 66 | continue 67 | threadings=[] 68 | for item in items: 69 | work=Score(item) 70 | threadings.append(work) 71 | for work in threadings: 72 | work.start() 73 | for work in threadings: 74 | work.join() 75 | for work in threadings: 76 | f.write(work.line+'\n') 77 | items=[] 78 | threadings=[] 79 | 80 | for item in items: 81 | work=Score(item) 82 | threadings.append(work) 83 | for work in threadings: 84 | work.start() 85 | for work in threadings: 86 | work.join() 87 | for work in threadings: 88 | f.write(work.line+'\n') 89 | f.close() 90 | 91 | main() 92 | -------------------------------------------------------------------------------- /www.itjuzi.com/baseInvestevents.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | import time 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_infor(url): 16 | html=requests.get(url,headers=headers,timeout=50).text 17 | results=[] 18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li') 19 | for li in table: 20 | item={} 21 | i=li.find_all('i') 22 | item['date']=i[0].get_text().replace('\n','').replace('\t','') 23 | spans=i[2].find_all('span') 24 | item['name']=spans[0].get_text().replace('\n','').replace('\t','') 25 | item['industry']=spans[1].get_text().replace('\n','').replace('\t','') 26 | item['local']=spans[2].get_text().replace('\n','').replace('\t','') 27 | item['round']=i[3].get_text().replace('\n','').replace('\t','') 28 | item['capital']=i[4].get_text().replace('\n','').replace('\t','') 29 | companys=i[5].find_all('a') 30 | Investmenters='' 31 | if(companys==[]): 32 | Investmenters=i[5].get_text().replace('\n','').replace('\t','') 33 | else: 34 | for a in companys: 35 | Investmenters+=a.get_text().replace('\n','').replace('\t','')+';' 36 | item['Investmenters']=Investmenters 37 | results.append(item) 38 | return results 39 | 40 | def main(): 41 | excel=xlwt3.Workbook() 42 | sheet=excel.add_sheet('sheet') 43 | count=0 44 | startpage=1 45 | keys=['date','name','industry','local','round','capital','Investmenters'] 46 | while startpage<1143: 47 | try: 48 | results=get_infor('https://www.itjuzi.com/investevents?page=%s'%startpage) 49 | except: 50 | time.sleep(5) 51 | continue 52 | for item in results: 53 | num=0 54 | for key in 
keys: 55 | sheet.write(count,num,item[key]) 56 | num+=1 57 | count+=1 58 | print(startpage,'--ok') 59 | startpage+=1 60 | time.sleep(3) 61 | excel.save('investevents.xls') 62 | main() 63 | -------------------------------------------------------------------------------- /www.itjuzi.com/companylist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_companylist(page): 14 | html=requests.get('http://www.itjuzi.com/company?page=%s'%page,headers=headers,timeout=30).text 15 | table=BeautifulSoup(html,'html.parser').find_all('ul',{'class':'list-main-icnset'})[1].find_all('li') 16 | if len(table)==0: 17 | return [] 18 | result=[] 19 | for li in table: 20 | try: 21 | img=li.find('img').get('src').split('?')[0] 22 | title=li.find('p',{'class':'title'}).get_text() 23 | url=li.find('a').get('href') 24 | des=li.find('p',{'class':'des'}).get_text() 25 | tags=li.find('span',{'class':'tags'}).get_text() 26 | loca=li.find('span',{'class':'loca'}).get_text() 27 | date=li.find('i',{'class':'date'}).get_text() 28 | round=li.find('i',{'class':'round'}).get_text() 29 | except: 30 | continue 31 | result.append([img,title,url,des,tags,loca,date,round]) 32 | return result 33 | 34 | def write_to_excel(result): 35 | excel=openpyxl.Workbook(write_only=True) 36 | sheet=excel.create_sheet() 37 | filename=time.strftime("%Y%m%d_%H%M%S",time.localtime())+'.xlsx' 38 | for line in result: 39 | sheet.append(line) 40 | excel.save(filename) 41 | 42 | def loadcompany(): 43 | companys=[] 44 | for line in open('result.txt','r',encoding='utf-8'): 45 | companys.append(line.replace('\r','').replace('\n','')) 46 | return companys 47 | 48 | def main(): 49 | try: 50 | companys=loadcompany() 51 | except: 52 | companys=[] 53 | page=1 54 | f=open('result.txt','w',encoding='utf-8') 55 | flag=False 56 | new_list=[] 57 | while True: 58 | try: 59 | result=get_companylist(page) 60 | except: 61 | time.sleep(5) 62 | continue 63 | if result==[]: 64 | break 65 | for item in result: 66 | line='||'.join(item) 67 | line=line.replace('\r','').replace('\n','').replace('\t','') 68 | if line in companys: 69 | flag=True 70 | break 71 | new_list.append(item) 72 | f.write(line+'\r\n') 73 | if flag: 74 | break 75 | print(page,'ok') 76 | page+=1 77 | time.sleep(3) 78 | for company in companys: 79 | f.write(company+'\r\n') 80 | f.close() 81 | write_to_excel(new_list) 82 | 83 | main() 84 | -------------------------------------------------------------------------------- /www.itjuzi.com/investevents.py: -------------------------------------------------------------------------------- 1 | #codnig:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def get_base_infor(): 15 | f=open('data.txt','a') 16 | for page in range(1048): 17 | 
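# (annotation, not in the original file) Unlike baseInvestevents.py above, this loop issues
# roughly 1048 requests with no retry, timeout or delay. A minimal retry wrapper that could
# be dropped in here (a sketch only; the name and retry policy are assumptions):
#
#   import time
#
#   def fetch(url, retries=3, wait=5):
#       for _ in range(retries):
#           try:
#               return requests.get(url, headers=headers, timeout=30).text
#           except requests.RequestException:
#               time.sleep(wait)
#       raise RuntimeError('giving up on ' + url)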
html=requests.get('https://www.itjuzi.com/investevents?page=%s'%(page+1),headers=headers).text 18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li') 19 | for li in table: 20 | item={} 21 | i=li.find_all('i') 22 | item['date']=i[0].get_text() 23 | item['url']=i[1].find('a').get('href') 24 | spans=i[2].find_all('span') 25 | item['name']=spans[0].get_text() 26 | item['industry']=spans[1].get_text() 27 | item['local']=spans[2].get_text() 28 | item['round']=i[3].get_text() 29 | item['capital']=i[4].get_text() 30 | companys=i[5].find_all('a') 31 | lists=[] 32 | if(companys==[]): 33 | lists.append(i[5].get_text()) 34 | else: 35 | for a in companys: 36 | lists.append(a.get_text()) 37 | item['Investmenters']=lists 38 | f.write(str(item)+'\n') 39 | print(page) 40 | 41 | def main(): 42 | f=open('data.txt','r') 43 | data_f=open('investevents.txt','a') 44 | failed_f=open('failed.txt','a') 45 | for line in f.readlines(): 46 | try: 47 | item=eval(line.replace('\n','')) 48 | html=requests.get(item['url'],headers=headers).text 49 | url=BeautifulSoup(html,'lxml').find('div',attrs={'class':'block-inc-fina'}).find('a',attrs={'class':'incicon'}).get('href') 50 | html=requests.get(url,headers=headers).text 51 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'thewrap'}) 52 | table=soup.find('div',attrs={'class':'sec'}) 53 | company_url=table.find('div',attrs={'class':'rowhead'}).find('div',attrs={'class':'row c-gray-aset'}).find('div',attrs={'class':'dbi linkset c-gray'}).find('a').get('href') 54 | tags=[] 55 | for a in table.find('div',attrs={'class':'rowfoot'}).find('div',attrs={'class':'tagset dbi'}).find_all('a'): 56 | tags.append(a.get_text()) 57 | des=soup.find('div',attrs={'class':'block block-inc-info'}).find('div',attrs={'class':'des'}).get_text() 58 | item['company_url']=company_url 59 | item['tags']=tags 60 | item['des']=des 61 | data_f.write(str(item)+'\n') 62 | print(item['url']) 63 | except: 64 | failed_f.write(line) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /www.jisilu.com/jisilu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | 'Host':"www.jisilu.cn", 8 | 'Accept':"application/json, text/javascript, */*; q=0.01", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | 'Content-Type':"application/x-www-form-urlencoded; charset=UTF-8", 13 | 'X-Requested-With':"XMLHttpRequest", 14 | 'Cookie':"kbzw__Session=4sv8h9vjir144ijdh02h4nefd0; Hm_lvt_164fe01b1433a19b507595a43bf58262=1468934580; Hm_lpvt_164fe01b1433a19b507595a43bf58262=1468935752; kbz_newcookie=1; kbzw__user_login=7Obd08_P1ebax9aX5dvi0OXc5ZmcndHV7Ojg6N7bwNOM2KjZqpmgw6feqM6upamTqJmt3KbbkaKU17HXoNql2ZiXnKTs3Ny_zYylr6qgspyYnaO2uNXQo67f293l4cqooaWSlonPqKSzgcXD6efp3rSMw8vk1u-X67CXz5eotJXb76arlqSRoJe63cTb0KOrpZqpnKiSp4G94OXdx9_Zo62pl6k.", 15 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 16 | 17 | def login(): 18 | logindata=open('user','r',encoding='utf-8').read().replace('\r','').replace('\n','') 19 | logindata=eval(logindata) 20 | data={ 21 | 'user_name':logindata['user_name'], 22 | 'password':logindata['password'], 23 | 'net_auto_login':1, 24 | '_post_type':'ajax', 25 | 'return_url':'https://www.jisilu.cn' 26 | } 27 | session=requests.session() 28 | 
session.post('https://www.jisilu.cn/account/ajax/login_process/',data=data).text 29 | return session 30 | 31 | def getdata(): 32 | data={ 33 | 'is_search':"0", 34 | 'avolume':"100", 35 | 'bvolume':"100", 36 | 'market':["sh","sz"], 37 | 'ptype':"price", 38 | 'rp':"50", 39 | 'page':"1" 40 | } 41 | session=login() 42 | timestr=str(time.time()).replace('.','') 43 | html=session.post('https://www.jisilu.cn/data/sfnew/arbitrage_vip_list/?___t=%s'%timestr,data=data).text 44 | data=json.loads(html)['rows'] 45 | print(data[0]) 46 | write_to_excel(data) 47 | print('OK') 48 | 49 | def write_to_excel(data): 50 | keys=['fundA_id','fundA_nm','sell1A','increase_rtA','fundA_volume','fundA_amount_increase', 51 | 'fundB_id','fundB_nm','sell1B','increase_rtB','fundB_volume','fundB_amount_increase', 52 | 'abrate','merge_price','est_dis_rt','base_fund_id','base_fund_nm','base_nav','base_est_val', 53 | 'index_nm','idx_incr_rt','asset_ratio','asset_ratio_last','apply_fee','redeem_fee'] 54 | excel=openpyxl.Workbook(write_only=True) 55 | sheet=excel.create_sheet() 56 | for item in data: 57 | cell=[] 58 | for key in keys: 59 | try: 60 | cell.append(item['cell'][key]) 61 | except: 62 | cell.append('-') 63 | sheet.append(cell) 64 | excel.save('result.xlsx') 65 | 66 | while True: 67 | try: 68 | getdata() 69 | except: 70 | print('Failed') 71 | continue 72 | time.sleep(10) 73 | break 74 | -------------------------------------------------------------------------------- /www.kfc.com/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import json 4 | import time 5 | import openpyxl 6 | 7 | def citys(): 8 | html=open('index.html','r').read() 9 | table=BeautifulSoup(html,'lxml').find('ul',{'class':'city_info'}).find_all('li') 10 | f=open('citys.txt','w') 11 | for li in table: 12 | for item in li.find_all('a'): 13 | f.write(item.get_text()+'\n') 14 | f.close() 15 | 16 | def get_store(city): 17 | result=[] 18 | page=1 19 | while True: 20 | data={ 21 | 'cname':city, 22 | 'pid':"", 23 | 'pageIndex':page, 24 | 'pageSize':"100" 25 | } 26 | html=requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname',data=data).text 27 | stores=json.loads(html)['Table1'] 28 | if stores==[]: 29 | break 30 | page+=1 31 | for item in stores: 32 | result.append(item['storeName']+'|'+item['cityName']+'|'+item['addressDetail']+'|'+item['pro']) 33 | time.sleep(1) 34 | return result 35 | 36 | 37 | def main(): 38 | f=open('result.txt','a') 39 | for line in open('citys.txt','r'): 40 | city=line.replace('\n','') 41 | try: 42 | result=get_store(city) 43 | except: 44 | failed=open('failed.txt','a') 45 | failed.write(city+'\n') 46 | failed.close() 47 | continue 48 | for item in result: 49 | f.write(item+'\n') 50 | print(city,'ok') 51 | f.close() 52 | 53 | def write_to_excel(): 54 | result={} 55 | excel=openpyxl.Workbook(write_only=True) 56 | sheet1=excel.create_sheet('1') 57 | for line in open('result.txt','r'): 58 | line=line.replace('\n','') 59 | lists=line.split('|') 60 | lists[0]=lists[0]+'餐厅' 61 | try: 62 | result[lists[1]]+=1 63 | except: 64 | result[lists[1]]=1 65 | sheet1.append(lists) 66 | sheet2=excel.create_sheet('2') 67 | for key in result: 68 | sheet2.append([key,result[key]]) 69 | excel.save('result.xlsx') 70 | 71 | write_to_excel() 72 | -------------------------------------------------------------------------------- /www.kimiss.com/Nyspider.py: -------------------------------------------------------------------------------- 1 | 
#coding:utf-8 2 | 3 | import requests 4 | import os 5 | import sqlite3 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_html(url): 15 | html=requests.get(url,headers=headers).text 16 | return html 17 | 18 | 19 | def get_image(image_url,image_name): 20 | content=requests.get(image_url,headers=headers).content 21 | with open(image_name,'wb') as f: 22 | f.write(content) 23 | f.close 24 | -------------------------------------------------------------------------------- /www.kimiss.com/man.txt: -------------------------------------------------------------------------------- 1 | {'男士面部护理': ['http://product.kimiss.com/nanshirunchungao2/', 'http://product.kimiss.com/nanshiyanbujinghua2/', 'http://product.kimiss.com/nanshiyanshuang2/', 'http://product.kimiss.com/nanshiruye/', 'http://product.kimiss.com/nanshijiemian/', 'http://product.kimiss.com/nanshishuangfushui/', 'http://product.kimiss.com/nanshijinghua/', 'http://product.kimiss.com/nanshimianmo/', 'http://product.kimiss.com/nanshifangshai/', 'http://product.kimiss.com/nanshitaozhuang/', 'http://product.kimiss.com/nanshimianbuqujiaozhi/']} 2 | {'男士身体护理': ['http://product.kimiss.com/nanshimuyulu/', 'http://product.kimiss.com/nanshishuangshenxiangtipin/', 'http://product.kimiss.com/nanshirunfuru/', 'http://product.kimiss.com/nanshixiantichanpin/', 'http://product.kimiss.com/nanshishentimoshagao/', 'http://product.kimiss.com/nanshisichuhuli/']} 3 | {'男士剃须护理': ['http://product.kimiss.com/tixudao/', 'http://product.kimiss.com/xuhouhuli/', 'http://product.kimiss.com/xuqianhuli/']} 4 | {'男士美发护发': ['http://product.kimiss.com/nanshitoufazaoxing/', 'http://product.kimiss.com/nanshixifa/', 'http://product.kimiss.com/nanshirunfa/']} 5 | {'男士面部彩妆': ['http://product.kimiss.com/nanshibbshuang/', 'http://product.kimiss.com/nanshifendi/', 'http://product.kimiss.com/nanshigelishuang/', 'http://product.kimiss.com/nanshizhexia/', 'http://product.kimiss.com/nanshijiemaogao/', 'http://product.kimiss.com/nanshisanfen/']} 6 | -------------------------------------------------------------------------------- /www.lagou.com/lagou.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import json 4 | import time 5 | from write_sql import write2sqlite 6 | from bs4 import BeautifulSoup 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_jobs(keyword): 16 | jobs=[] 17 | page=1 18 | while True: 19 | js_data=requests.get('http://www.lagou.com/jobs/positionAjax.json?px=new&kd=%s&pn=%s&'%(keyword,page),headers=headers).text 20 | data=json.loads(js_data) 21 | data=data['content']['positionResult']['result'] 22 | for item in data: 23 | job={} 24 | job['fromsite']='拉勾' 25 | job['id']=item['positionId'] 26 | job['companyId']=item['companyId'] 27 | job['positionType']=keyword 28 | job['positionName']=item['positionName'] 29 | job['company']=item['companyFullName'] 30 | job['salary']=item.get('salary') 31 | job['workYear']=item['workYear'] 32 | 
job['education']=item['education'] 33 | job['industryField']=item['industryField'] 34 | job['companySize']=item['companySize'] 35 | job['city']=item['city'] 36 | job['financeStage']=item['financeStage'] 37 | jobs.append(job) 38 | print(page,keyword,'ok') 39 | page+=1 40 | if page==31: 41 | break 42 | time.sleep(1) 43 | return jobs 44 | 45 | def get_job_des(jobid): 46 | url='http://www.lagou.com/jobs/%s.html'%jobid 47 | html=requests.get(url,headers=headers,timeout=30).text 48 | des=BeautifulSoup(html,'lxml').find('dd',{'class':'job_bt'}).get_text() 49 | return des 50 | 51 | def get_company_rate(companyid): 52 | url='http://www.lagou.com/gongsi/%s.html'%(companyid) 53 | html=requests.get(url,headers=headers,timeout=30).text 54 | rate=BeautifulSoup(html,'lxml').find('div',{'class':'reviews-top'}).find('span',{'class':'score'}).get_text() 55 | return rate 56 | 57 | def main(): 58 | keywords=[line.replace('\n','') for line in open('type.txt','r')] 59 | for keyword in keywords: 60 | jobs=get_jobs(keyword) 61 | result=[] 62 | for job in jobs: 63 | try: 64 | des=get_job_des(job['id']) 65 | except: 66 | des='-' 67 | try: 68 | rate=get_company_rate(job['companyId']) 69 | except: 70 | rate='-' 71 | job['jobDes']=des 72 | job['rate']=rate 73 | result.append(job) 74 | time.sleep(1) 75 | write2sqlite(result,keyword) 76 | print(keyword,'ok') 77 | main() 78 | -------------------------------------------------------------------------------- /www.locoso.com/locoso.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def get_citys(): 15 | url='http://www.locoso.com/s2/js/topcity.js' 16 | html=requests.get(url,headers=headers).text.replace('\\"','') 17 | table=BeautifulSoup(html,'lxml') 18 | lists=table.find_all('div',attrs={'class':'pro_bt'}) 19 | f=open('citys.txt','a') 20 | root={} 21 | rel='prcity2(.*?)"' 22 | rel=re.compile(rel) 23 | citys={} 24 | for item in lists: 25 | try: 26 | root[str(item.get_text())]=eval(rel.findall(str(item))[0])[0] 27 | except: 28 | continue 29 | dicts={} 30 | for i in table.find('div',id=item.get('id')+'_2').find_all('li'): 31 | dicts[str(i.get_text())]=eval(rel.findall(str(i))[0])[0] 32 | citys[str(item.get_text())]=dicts 33 | for key in citys: 34 | for city in citys[key]: 35 | qu={} 36 | url='http://www.locoso.com/search/-all/c'+citys[key][city] 37 | html=requests.get(url,headers=headers).text 38 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'xiaofenlei_zhong02c2'}).find_all('li') 39 | dicts={} 40 | for i in table: 41 | dicts[i.find('a').get('title')]=i.find('a').get('href').replace('/search/-all/c','') 42 | qu[city]=dicts 43 | f.write(str(qu)+'\n') 44 | print(city) 45 | 46 | def get_industry(): 47 | html=requests.get('www.locoso.com/search/-all/',headers=headers) 48 | 49 | get_citys() 50 | -------------------------------------------------------------------------------- /www.mohurd.gov.cn/deal.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | 5 | def load_level(): 6 | level={} 7 | for line in open('Cost_qualification.txt','r'): 8 | 
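# (annotation, not in the original file) NOTE: deal() further down calls loadLevel(), but
# this function is defined as load_level(); the two names must agree (rename one of them)
# or the script stops with a NameError before it processes result.txt.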
line=line.replace('\n','').split('\t') 9 | print(line) 10 | level[line[0]]=line[1] 11 | return level 12 | 13 | def deal(): 14 | keys=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表','工程监理资质','招标代理','造价咨询','一级注册建筑师','二级注册建筑师' 15 | ,'一级注册结构工程师','二级注册结构工程师','注册土木工程师(岩土)','注册公用设备工程师(暖通空调)','注册公用设备工程师(给水排水)','注册公用设备工程师(动力)' 16 | ,'注册公用设备工程师(发输变电)','注册公用设备工程师(供配电)','注册化工工程师','监理工程师','一级建造师','二级建造师','造价工程师'] 17 | keys_two=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表'] 18 | keys_three=['工程监理资质','招标代理','造价咨询','监理工程师','一级建造师','二级建造师'] 19 | f=open('data.txt','w') 20 | level=loadLevel() 21 | for line in open('result.txt','r'): 22 | person={} 23 | item=eval(line) 24 | for key in keys: 25 | if key not in item: 26 | person[key]='N' 27 | else: 28 | person[key]='Y' 29 | for key in keys_two: 30 | person[key]=item[key] 31 | for key in keys_three: 32 | text='' 33 | try: 34 | for i in item[key]: 35 | if i not in text: 36 | text+=i+',' 37 | person[key]=text[:-1] 38 | except: 39 | person[key]=text 40 | try: 41 | person['造价咨询']=level[item['name']] 42 | except: 43 | person['造价咨询']='-' 44 | text='' 45 | for key in keys: 46 | text+=person[key]+' ||' 47 | f.write(text+'\n') 48 | f.close() 49 | 50 | deal() 51 | -------------------------------------------------------------------------------- /www.mohurd.gov.cn/registrarinfor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | headers = { 6 | 'Host':"210.12.219.18", 7 | 'X-Requested-With':"XMLHttpRequest", 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Referer':"http://210.12.219.18/jianguanfabuweb/companies.html", 13 | 'Cookie':"ASP.NET_SessionId=evkmapz1ljljsqh54siborwj", 14 | 'Connection': 'keep-alive'} 15 | 16 | def get_infor(item): 17 | url='http://210.12.219.18/jianguanfabuweb/'+item['url'] 18 | html=requests.get(url,headers=headers,timeout=30).text 19 | soup=BeautifulSoup(html,'lxml').find('div',{'class':'content'}) 20 | basic=soup.find('table',{'class':'engineer_basic_infor_table'}).get_text().replace('\r','').replace('\n','').replace(' ','') 21 | basic_re='姓名:(.*?)民族:(.*?)性别:(.*?)手.*?学历:(.*?)学位' 22 | basicinfor=re.findall(basic_re,basic)[0] 23 | item['姓名']=basicinfor[0] 24 | item['民族']=basicinfor[1] 25 | item['性别']=basicinfor[2] 26 | item['学历']=basicinfor[3] 27 | zhengshu=soup.find_all('div',{'class':'zhengshu'}) 28 | for div in zhengshu: 29 | header=div.find('div',{'class':'zhengshu_head'}).get_text() 30 | profess=div.find('table').find_all('td')[-1].get_text().split(',') 31 | item[header]=profess 32 | return item 33 | 34 | 35 | def main(): 36 | f=open('result.txt','a') 37 | count=0 38 | for line in open('person.txt','r').readlines(): 39 | count+=1 40 | person=eval(line.replace('\n','')) 41 | try: 42 | item=get_infor(person) 43 | except: 44 | failed=open('person_failed.txt','a') 45 | failed.write(line) 46 | failed.close() 47 | print(person['name'],'failed') 48 | continue 49 | print(count) 50 | f.write(str(item)+'\n') 51 | f.close() 52 | 53 | main() 54 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/gethtml.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import os 3 | import time 4 | 5 | def main(): 6 | 
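# (annotation, not in the original file) This script drives Firefox through the PubMed
# result pages and saves each page source. It uses the old find_element_by_xpath API,
# which was deprecated and later removed in Selenium 4; with a current Selenium the
# equivalent call (same XPath as below, sketch only) would be:
#
#   from selenium.webdriver.common.by import By
#   browser.find_element(By.XPATH, xpath).click()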
browser=webdriver.Firefox() 7 | browser.get('http://www.ncbi.nlm.nih.gov/pubmed') 8 | input('OK?') 9 | browser.implicitly_wait(10) 10 | count=0 11 | while True: 12 | html=browser.page_source 13 | f=open('html/%s.html'%count,'w') 14 | f.write(html) 15 | f.close() 16 | browser.find_element_by_xpath("//a[@id='EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page' and @sid=3]").click() 17 | time.sleep(5) 18 | count+=1 19 | if count==5330: 20 | break 21 | 22 | main() 23 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/parser.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def parser(): 5 | files=[] 6 | for filename in os.listdir('html'): 7 | files.append(filename) 8 | files.sort(key=lambda x:int(x.replace('.html',''))) 9 | f=open('result.txt','a') 10 | for filename in files: 11 | html=open('html/'+filename,'r').read() 12 | try: 13 | table=BeautifulSoup(html,'lxml').find('div',{'class':'rprt_all'}).find_all('div',{'class':"rprt abstract"}) 14 | except: 15 | continue 16 | for item in table: 17 | cit=item.find('div',{'class':'cit'}) 18 | try: 19 | periodical=cit.find('a').get_text() 20 | except: 21 | periodical='-' 22 | try: 23 | date=cit.get_text().replace(periodical,'') 24 | except: 25 | date='-' 26 | try: 27 | title=item.find('h1').get_text() 28 | except: 29 | continue 30 | try: 31 | auths=item.find('div',{'class':'auths'}).find_all('a') 32 | except: 33 | auths=[] 34 | auth_num=len(auths) 35 | auth_name='' 36 | for a in auths: 37 | auth_name+=a.get_text()+';' 38 | try: 39 | afflist=item.find('div',{'class':'afflist'}).find_all('li') 40 | except: 41 | afflist='' 42 | auth_infor='' 43 | for li in afflist: 44 | auth_infor+=li.get_text()+'||' 45 | try: 46 | abstract=item.find('div',{'class':'abstr'}).get_text() 47 | except: 48 | abstract='' 49 | try: 50 | pmid=item.find('div',{'class':'aux'}).find('a',{'ref':'aid_type=pmid'}).get_text() 51 | except: 52 | pmid='-' 53 | f.write(str([pmid,periodical,date,title,auth_num,auth_name,auth_infor,abstract])+'\r\n') 54 | print(filename,'-ok') 55 | f.close() 56 | parser() 57 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | def write_to_excel(): 4 | excel=openpyxl.Workbook(write_only=True) 5 | sheet=excel.create_sheet() 6 | count=0 7 | filecount=1 8 | exist=[] 9 | for line in open('result.txt','r'): 10 | line=line.replace('\r\n','') 11 | item=eval(line) 12 | if item[0] in exist: 13 | continue 14 | exist.append(item[0]) 15 | sheet.append(item) 16 | count+=1 17 | print(count) 18 | if count%100000==0: 19 | excel.save('%s.xlsx'%filecount) 20 | filecount+=1 21 | excel=openpyxl.Workbook(write_only=True) 22 | sheet=excel.create_sheet() 23 | excel.save('%s.xlsx'%filecount) 24 | 25 | write_to_excel() 26 | -------------------------------------------------------------------------------- /www.pizzahut.com.cn/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import urllib 4 | import openpyxl 5 | 6 | def citys(): 7 | html=open('index.html','r').read() 8 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'city_window'})[1].find_all('a') 9 | f=open('citys.txt','w') 10 | for item in table: 11 | f.write(item.get_text()+'\n') 12 | 
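# (annotation, not in the original file) get_store() below sends a hard-coded Cookie header
# (including a __RequestVerificationToken) with the city injected into the iplocation
# field. Such tokens are session-bound and expire, so they typically have to be refreshed
# from a live browser session before the POST to /StoreList/Index succeeds.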
f.close() 13 | 14 | def get_store(city): 15 | city=urllib.parse.quote(city) 16 | headers = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Accept-Encoding': 'gzip, deflate', 20 | 'Cookie':"NSC_CX_QfstjtufodzHspvq=ffffffff09320b0745525d5f4f58455e445a4a423660; _u_=1; __RequestVerificationToken=tOMoZty3Jp6D53oSF-NqlfyAlPa0sRNndZ7PNG5iPrWgM_ngcVFEOP79uEvHJGuqlHDoAA3WDd1MN9QA8ZEhpurYLA0WSkuyswlEO9Nj9oqeMWnu84Q1fyQQYx5-vjq-73NNZXJJLcF9jq3fjB_dsw2; iplocation={}%7C0%7C0".format(city), 21 | 'User-Agent':"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0", 22 | 'Connection': 'keep-alive'} 23 | page=1 24 | result=[] 25 | while True: 26 | data={ 27 | 'pageIndex':page, 28 | 'pageSize':"100", 29 | 'keyword':"输入餐厅地址或餐厅名称" 30 | } 31 | html=requests.post('http://www.pizzahut.com.cn/StoreList/Index',headers=headers,data=data).text 32 | soup=BeautifulSoup(html,'lxml').find_all('li') 33 | items=[] 34 | for li in soup: 35 | item='' 36 | try: 37 | for p in li.find('div',{'class':'re_RNew'}).find_all('p'): 38 | item+='|'+p.get_text() 39 | except: 40 | continue 41 | items.append(item) 42 | if items==[]: 43 | break 44 | result+=items 45 | page+=1 46 | return result 47 | 48 | def main(): 49 | f=open('result.txt','a') 50 | for line in open('citys.txt','r'): 51 | city=line.replace('\n','') 52 | try: 53 | result=get_store(city) 54 | except: 55 | failed=open('failed.txt','a') 56 | failed.write(city+'\n') 57 | failed.close() 58 | continue 59 | for item in result: 60 | f.write(city+item+'\n') 61 | print(city,'ok') 62 | f.close() 63 | 64 | def write_to_excel(): 65 | result={} 66 | excel=openpyxl.Workbook(write_only=True) 67 | sheet1=excel.create_sheet('1') 68 | for line in open('result.txt','r'): 69 | line=line.replace('\n','') 70 | lists=line.split('|') 71 | try: 72 | result[lists[1]]+=1 73 | except: 74 | result[lists[1]]=1 75 | sheet1.append(lists) 76 | sheet2=excel.create_sheet('2') 77 | for key in result: 78 | sheet2.append([key,result[key]]) 79 | excel.save('result.xlsx') 80 | 81 | write_to_excel() 82 | -------------------------------------------------------------------------------- /www.ppdai.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import xlwt3 3 | 4 | def excel(): 5 | file_d=open('data.txt','r') 6 | excel_f=xlwt3.Workbook() 7 | sheet=excel_f.add_sheet('sheet') 8 | count=0 9 | for line in file_d.readlines(): 10 | lists=line.replace('\n','').split('|') 11 | num=0 12 | for item in lists: 13 | try: 14 | text=item.split(':')[1] 15 | except: 16 | text=item 17 | sheet.write(count,num,text) 18 | num+=1 19 | count+=1 20 | excel_f.save('data.xls') 21 | 22 | excel() 23 | -------------------------------------------------------------------------------- /www.teld.cn/setting/cities.txt: -------------------------------------------------------------------------------- 1 | 广州市 2 | 上海市 3 | 杭州市 4 | 成都市 5 | 南京市 6 | -------------------------------------------------------------------------------- /www.tripadvisor.com/getpage.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import os 5 | import time 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, 
deflate'} 12 | 13 | def main(): 14 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS',headers=headers).text 15 | try: 16 | os.mkdir('page') 17 | except: 18 | pass 19 | count=0 20 | f=open('page'+str(count)+'.html','w') 21 | f.write(html) 22 | f.close() 23 | count+=1 24 | num=10 25 | while True: 26 | try: 27 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'%num,headers=headers).text 28 | except: 29 | continue 30 | f=open('page/'+str(count)+'.html','w') 31 | f.write(html) 32 | f.close() 33 | num+=10 34 | print(num) 35 | count+=1 36 | if(num==8490): 37 | break 38 | time.sleep(2) 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /www.tripadvisor.com/moredata.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | 14 | def getdata(target,viewid): 15 | html=requests.get('https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1'%(target,viewid),headers=headers).text 16 | table=BeautifulSoup(html,'lxml').find_all('div',attrs={'class':'innerBubble'}) 17 | result=[] 18 | for item in table: 19 | text=item.find('div',attrs={'class':'entry'}).get_text().replace('\r','').replace('\n','')+'||' 20 | try: 21 | text+=item.find('div',attrs={'class':'recommend'}).get_text().replace('\r','').replace('\n','') 22 | except: 23 | text+='--' 24 | result.append(text) 25 | return result 26 | 27 | def main(): 28 | f=open('result.txt','a') 29 | viewids=[] 30 | lines=[] 31 | count=0 32 | for line in open('data.txt','r'): 33 | line=line.replace('\n','') 34 | lines.append(line) 35 | viewid=line.split('||')[1].split('-')[-1].replace('SRC_','') 36 | viewids.append(viewid) 37 | if(len(viewids)<20): 38 | continue 39 | text='' 40 | for id in viewids: 41 | text+=id+',' 42 | result=getdata(viewids[0],text[:-1]) 43 | print(len(result)) 44 | for num in range(len(lines)): 45 | f.write(lines[num]+'||'+result[num]+'\n') 46 | viewids.clear() 47 | lines.clear() 48 | count+=1 49 | print(count,'--ok') 50 | text='' 51 | for id in viewids: 52 | text+=id+',' 53 | result=getdata(viewids[0],text[:-1]) 54 | for num in range(lines): 55 | f.write(lines[num]+'||'+result[num]+'\n') 56 | viewids.clear() 57 | lines.clear() 58 | f.close() 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /www.tripadvisor.com/userinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate'} 13 | 14 | 15 | class Infor(threading.Thread): 16 | def __init__(self,line): 17 | super(Infor,self).__init__() 18 | self.line=line 19 
| self.uid=self.line.split('||')[1].split('-')[0].replace('UID_','') 20 | 21 | def run(self): 22 | try: 23 | html=requests.get('https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId='%self.uid,headers=headers,timeout=50).text 24 | except: 25 | self.result='--' 26 | self.line+='||'+self.result 27 | return 28 | try: 29 | self.result=BeautifulSoup(html,'lxml').find('ul',attrs={'class':'memberdescription'}).find_all('li')[1].get_text().replace('\r','').replace('\n','') 30 | except: 31 | self.result='--' 32 | self.line+='||'+self.result 33 | 34 | 35 | def main(): 36 | f=open('re_data.txt','a') 37 | threadings=[] 38 | lines=[] 39 | count=0 40 | for line in open('result.txt','r'): 41 | line=line.replace('\n','') 42 | lines.append(line) 43 | if(len(lines)<20): 44 | continue 45 | for line in lines: 46 | work=Infor(line) 47 | threadings.append(work) 48 | for work in threadings: 49 | work.start() 50 | for work in threadings: 51 | work.join() 52 | for work in threadings: 53 | f.write(work.line+'\n') 54 | count+=1 55 | print(count,'--ok') 56 | threadings.clear() 57 | lines.clear() 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /www.variflight.com/icon/0/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/20.png -------------------------------------------------------------------------------- /www.variflight.com/icon/0/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/23.png -------------------------------------------------------------------------------- /www.variflight.com/icon/1/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/1.png -------------------------------------------------------------------------------- /www.variflight.com/icon/1/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/4.png -------------------------------------------------------------------------------- /www.variflight.com/icon/2/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/0.png -------------------------------------------------------------------------------- /www.variflight.com/icon/2/33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/33.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/117.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/117.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/304.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/304.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/783.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/783.png -------------------------------------------------------------------------------- /www.variflight.com/icon/3/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/43.png -------------------------------------------------------------------------------- /www.variflight.com/icon/3/64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/64.png -------------------------------------------------------------------------------- /www.variflight.com/icon/4/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/3.png -------------------------------------------------------------------------------- /www.variflight.com/icon/4/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/9.png -------------------------------------------------------------------------------- /www.variflight.com/icon/44/141.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/141.png -------------------------------------------------------------------------------- /www.variflight.com/icon/44/88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/88.png -------------------------------------------------------------------------------- /www.variflight.com/icon/5/71.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/71.png -------------------------------------------------------------------------------- /www.variflight.com/icon/5/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/8.png -------------------------------------------------------------------------------- /www.variflight.com/icon/6/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/19.png -------------------------------------------------------------------------------- 
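# (annotation, not part of the repository) The www.variflight.com/icon entries in this dump
# are binary PNG assets, listed only as raw.githubusercontent.com links. If one of them
# ever had to be re-fetched, the get_image() helper from www.kimiss.com/Nyspider.py earlier
# in this dump could be reused; below is a minimal self-contained sketch (the URL is copied
# from the icon/6/19.png entry above, the output filename is arbitrary, and the request
# headers from the original helper are omitted for brevity).

import requests

def get_image(image_url, image_name):
    # download the binary content and write it to disk
    content = requests.get(image_url, timeout=30).content
    with open(image_name, 'wb') as f:
        f.write(content)

get_image('https://raw.githubusercontent.com/luyishisi/Nyspider/'
          '8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/19.png',
          'icon_6_19.png')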
/www.variflight.com/icon/6/51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/51.png -------------------------------------------------------------------------------- /www.variflight.com/icon/6/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/6.png -------------------------------------------------------------------------------- /www.variflight.com/icon/7/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/16.png -------------------------------------------------------------------------------- /www.variflight.com/icon/7/26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/26.png -------------------------------------------------------------------------------- /www.variflight.com/icon/8/93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/93.png -------------------------------------------------------------------------------- /www.variflight.com/icon/8/98.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/98.png -------------------------------------------------------------------------------- /www.variflight.com/icon/9/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/21.png -------------------------------------------------------------------------------- /www.variflight.com/icon/9/31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/31.png -------------------------------------------------------------------------------- /www.variflight.com/icon/b/2202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2202.png -------------------------------------------------------------------------------- /www.variflight.com/icon/b/2248.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2248.png -------------------------------------------------------------------------------- /www.variflight.com/icon/m/2397.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2397.png 
-------------------------------------------------------------------------------- /www.variflight.com/icon/m/2408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2408.png -------------------------------------------------------------------------------- /www.variflight.com/icon/m/2419.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2419.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2245.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2245.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2413.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2413.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2424.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2424.png -------------------------------------------------------------------------------- /www.yhd.com/data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/data.xls -------------------------------------------------------------------------------- /www.yhd.com/replace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/replace.py -------------------------------------------------------------------------------- /www.yhd.com/shopinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | import re 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | def get_urls(url): 16 | try: 17 | html=requests.get(url,headers=headers,timeout=50).text 18 | except: 19 | return [] 20 | rel='(http://shop.yhd.com/m-\d+.html)' 21 | urls=re.findall(rel,html) 22 | urls=list(set(urls)) 23 | try: 24 | html=requests.get(url+'&isGetMoreProducts=1',headers=headers,timeout=50).text 25 | urls+=re.findall(rel,html) 26 | urls=list(set(urls)) 27 | except: 28 | print('--') 29 | return urls 30 | 31 | def get_infor(url): 32 | html=requests.get(url,headers=headers).text 33 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'shop-des'}).find_all('li') 34 | item={} 35 | item['url']=url 36 | try: 37 | item['name']=table[0].find('span').get_text() 38 | 
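# (annotation, not in the original file) item['name'] above and item['city']/item['tel']
# below are all extracted with the same "try to read the <span>, fall back to ''" pattern.
# A tiny helper could collapse the three try/except blocks (sketch only, helper name is an
# assumption):
#
#   def span_text(tag, default=''):
#       try:
#           return tag.find('span').get_text()
#       except Exception:
#           return default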
except: 39 | item['name']='' 40 | try: 41 | item['city']=table[1].find('span').get_text() 42 | except: 43 | item['city']='' 44 | try: 45 | item['tel']=table[2].find('span').get_text() 46 | except: 47 | item['tel']='' 48 | return item 49 | 50 | def main(): 51 | excel_f=xlwt3.Workbook() 52 | sheet=excel_f.add_sheet('sheet') 53 | count=0 54 | list_url=input("输入商铺链接:") 55 | list_url=list_url.replace('list.yhd.com/','list.yhd.com/searchPage/') 56 | page=1 57 | while True: 58 | urls=get_urls(re.sub('p\d','p'+str(page),list_url)) 59 | if(urls==[]): 60 | break 61 | for url in urls: 62 | try: 63 | item=get_infor(url) 64 | except: 65 | continue 66 | sheet.write(count,0,item['name']) 67 | sheet.write(count,1,item['city']) 68 | sheet.write(count,2,item['tel']) 69 | sheet.write(count,3,item['url']) 70 | count+=1 71 | print(count) 72 | excel_f.save('data.xls') 73 | page+=1 74 | 75 | main() 76 | -------------------------------------------------------------------------------- /www.zdic.net/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | import os 3 | from bs4 import BeautifulSoup 4 | import re 5 | 6 | 7 | def load_result_1(): 8 | result=[] 9 | for line in open('result.txt','r'): 10 | item=eval(line) 11 | baseinfor=item['baseinfor'] 12 | for word in item['words']: 13 | line=word[:-1] 14 | des='' 15 | for p in word[-1]: 16 | des+=p+'\n' 17 | result.append(line+baseinfor+[des,item['url']]) 18 | return result 19 | 20 | def load_result_2(): 21 | result=[] 22 | for line in open('result.txt','r'): 23 | item=eval(line) 24 | baseinfor=item['baseinfor'] 25 | for word in item['words']: 26 | line=word[:-1] 27 | num=1 28 | for p in word[-1]: 29 | text=BeautifulSoup(p,'lxml').get_text() 30 | text=re.sub('(\d+. 
)|◎ ','',text) 31 | result.append(line+baseinfor+[num,text,item['url']]) 32 | num+=1 33 | return result 34 | 35 | def write_to_excel(result,filename): 36 | excel=openpyxl.Workbook(write_only=True) 37 | sheet=excel.create_sheet() 38 | for line in result: 39 | sheet.append(line) 40 | excel.save(filename) 41 | 42 | result=load_result_1() 43 | write_to_excel(result,'result_1.xlsx') 44 | -------------------------------------------------------------------------------- /www.zhongchou.com/Duplicate.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import os 4 | 5 | def Duplicate(): 6 | for filename in os.listdir('.'): 7 | if filename.endswith('txt'): 8 | lines=open(filename,'r').readlines() 9 | lines=list(set(lines)) 10 | lines.sort() 11 | f=open(filename,'w') 12 | for line in lines: 13 | f.write(line) 14 | f.close() 15 | 16 | Duplicate() 17 | -------------------------------------------------------------------------------- /www.zhongchou.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import xlwt3 4 | 5 | def write(): 6 | f=xlwt3.Workbook() 7 | sheet=f.add_sheet('sheet') 8 | file_f=open('D.txt','r') 9 | num=1 10 | head=['项目','id','进展数','评论数','最小金额','人数','video','类型','地区','支持人数','已筹款','比例','目标筹资','关注'] 11 | count=0 12 | for item in head: 13 | sheet.write(0,count,item) 14 | count+=1 15 | for line in file_f.readlines(): 16 | lists=line.replace('\n','').split('|') 17 | for count in range(14): 18 | sheet.write(num,count,lists[count]) 19 | num+=1 20 | f.save('data.xls') 21 | 22 | write() 23 | -------------------------------------------------------------------------------- /www.zhongchou.com/get_id.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | def get_id(): 7 | f=open('ids.txt','a') 8 | headers = { 9 | 'Host':"www.zhongchou.com", 10 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 12 | 'Accept-Language': 'en-US,en;q=0.5', 13 | 'Accept-Encoding': 'gzip, deflate', 14 | 'Connection': 'keep-alive'} 15 | for page in range(150): 16 | html=requests.get('http://www.zhongchou.com/browse/re-p'+str(page+1),headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'sousuoListBox clearfix'}).find_all('div',attrs={'class':'ssCardItem'}) 18 | for item in table: 19 | text='' 20 | p=item.find('h3').find('a') 21 | text=p.get('title')+'|'+p.get('href').replace('http://www.zhongchou.com/deal-show/id-','')+'\n' 22 | print(text) 23 | f.write(text) 24 | print(page) 25 | f.close() 26 | 27 | get_id() 28 | -------------------------------------------------------------------------------- /www.zhongchou.com/other.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | def get_infor(text): 7 | headers = { 8 | 'Host':"www.zhongchou.com", 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | id=text.split('|')[1] 15 | try: 16 | 
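# (annotation, not in the original file) The .encode('ISO-8859-1').decode('utf-8','ignore')
# chain below undoes requests' charset guess: when the server omits a charset, requests
# falls back to ISO-8859-1 for .text, so re-encoding with that codec recovers the raw bytes,
# which are then decoded as the UTF-8 the page actually uses. An equivalent, arguably
# clearer form is to take the bytes directly:
#
#   resp = requests.get(url, headers=headers)
#   html = resp.content.decode('utf-8', 'ignore')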
html=requests.get('http://www.zhongchou.com/deal-show/id-'+id,headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 17 | except: 18 | return None 19 | table=BeautifulSoup(html,'html.parser').find('div',attrs={'class':'mainIn02Box'}) 20 | title=table.find('div',attrs={'class':'jlxqTitleText siteIlB_box'}).find_all('div') 21 | text+='|'+title[0].get_text().replace('\n','') 22 | text+='|'+title[1].get_text() 23 | right_table=table.find('div',attrs={'class':'xqDetailRight'}) 24 | su_table=right_table.find('div',attrs={'class':"xqDetailDataBox"}).find_all('div') 25 | text+='|'+su_table[0].find('p').get_text() 26 | text+='|'+su_table[1].find('p').get_text() 27 | su_table=right_table.find('div',attrs={'class':'xqRatioOuterBox'}) 28 | text+='|'+su_table.find('p').get_text()+'|'+su_table.find('b').get_text() 29 | su_table=right_table.find('div',attrs={'class':'xqDetailBtnBox'}).find('a',id='deal_detail_like') 30 | text+='|'+su_table.find('b').get_text() 31 | return text 32 | 33 | def main(): 34 | file_d=open('data.txt','r') 35 | data_f=open('other.txt','a') 36 | num=0 37 | for line in file_d.readlines(): 38 | try: 39 | text=get_infor(line.replace('\n','')) 40 | except: 41 | continue 42 | if text==None: 43 | continue 44 | data_f.write(text+'\n') 45 | num+=1 46 | print(num) 47 | data_f.close() 48 | 49 | main() 50 | -------------------------------------------------------------------------------- /wwwapps.ups.com/write2excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | def load_data(): 4 | keys=[line.replace('\n','').replace(' ','') for line in open('data','r')] 5 | data={} 6 | for line in open('result.txt','r'): 7 | line=line.replace('\n','').split('-') 8 | try: 9 | data[line[0]][line[1]]=int(line[-1]) 10 | except: 11 | data[line[0]]={} 12 | data[line[0]][line[1]]=int(line[-1]) 13 | try: 14 | data[line[1]][line[0]]=int(line[-1]) 15 | except: 16 | data[line[1]]={} 17 | data[line[1]][line[0]]=int(line[-1]) 18 | return keys,data 19 | 20 | def write_to_excel(): 21 | keys,data=load_data() 22 | excel=openpyxl.Workbook(write_only=True) 23 | sheet=excel.create_sheet() 24 | line=[''] 25 | for key in keys: 26 | if len(key)==4: 27 | key='0'+key 28 | line.append(key) 29 | sheet.append(line) 30 | for key in keys: 31 | if len(key)==4: 32 | key='0'+key 33 | line=[key] 34 | for another_key in keys: 35 | if len(another_key)==4: 36 | another_key='0'+another_key 37 | if key==another_key: 38 | line.append(1) 39 | else: 40 | try: 41 | line.append(data[key][another_key]) 42 | except: 43 | line.append('') 44 | sheet.append(line) 45 | sheet=excel.create_sheet() 46 | line=[''] 47 | for key in keys: 48 | if len(key)==4: 49 | key='0'+key 50 | line.append(key) 51 | sheet.append(line) 52 | for key in keys: 53 | if len(key)==4: 54 | key='0'+key 55 | line=[key] 56 | for another_key in keys: 57 | if len(another_key)==4: 58 | another_key='0'+another_key 59 | if key==another_key: 60 | line.append(1) 61 | else: 62 | try: 63 | value=data[key][another_key] 64 | if value!=1: 65 | value=0 66 | line.append(value) 67 | except: 68 | line.append('') 69 | sheet.append(line) 70 | excel.save('result.xlsx') 71 | 72 | write_to_excel() 73 | -------------------------------------------------------------------------------- /xxgk.jl.gov.cn/infor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import re 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 
(X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def geturls(): 15 | f=open('urls.txt','a') 16 | page=1 17 | while True: 18 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/depListDir1.jsp?department_name=%CB%F9%D3%D0&pageNo='+str(page),headers=headers).text 19 | table=BeautifulSoup(html,'lxml').find_all('div',style='display:none;') 20 | for item in table: 21 | try: 22 | pid=item.get('id').replace('_text','') 23 | item=str(item).replace('','').replace('
','
') 24 | items=BeautifulSoup(item,'lxml').find_all('a') 25 | title=items[2].get_text() 26 | date=items[3].get_text() 27 | line=title+'|| '+date+' ||'+pid 28 | f.write(line.replace('\r','').replace('\n','')+'\n') 29 | except: 30 | continue 31 | print(page,'ok') 32 | page+=1 33 | if page==937: 34 | break 35 | f.close() 36 | 37 | def getinfor(pid): 38 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/showDir.jsp?keyid='+pid,headers=headers,timeout=30).text 39 | tables=BeautifulSoup(html,'lxml').find_all('table',width=700) 40 | text=tables[0].get_text().replace('\r','').replace('\n','') 41 | try: 42 | location=re.findall('发布机构:(.*?)生成日期',text)[0] 43 | except: 44 | location='--' 45 | text=tables[1].get_text().replace('\r','').replace('\n','') 46 | return location+'||'+text 47 | 48 | def main(): 49 | f=open('result.txt','a') 50 | for line in open('urls.txt','r'): 51 | line=line.replace('\n','') 52 | try: 53 | result=getinfor(line.split('||')[-1].replace(' ','')) 54 | except: 55 | failed=open('failed','a') 56 | failed.write(line+'\n') 57 | failed.close() 58 | continue 59 | f.write(line+'||'+result+'\n') 60 | print(line) 61 | f.close() 62 | 63 | main() 64 | -------------------------------------------------------------------------------- /zhidao.baidu.com/question.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import threading 5 | 6 | headers = { 7 | 'Host':"zhidao.baidu.com", 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | class Ques(threading.Thread): 15 | def __init__(self,line): 16 | super(Ques,self).__init__() 17 | self.line=line 18 | self.url=line.split('||')[-1] 19 | self.word=line.split('||')[0] 20 | 21 | def run(self): 22 | self.status=True 23 | try: 24 | self.data=self.question() 25 | except: 26 | self.status=False 27 | 28 | def question(self): 29 | html=requests.get(self.url,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore') 30 | table=BeautifulSoup(html,'lxml').find('article',id='qb-content') 31 | header=table.find('div',id='wgt-ask') 32 | title=header.find('span',{'class':'ask-title'}).get_text() 33 | try: 34 | des=header.find('span',{'class':'con'}).get_text() 35 | except: 36 | des='-' 37 | try: 38 | answer=table.find('div',{'class':['bd','answer']}).find('pre').get_text() 39 | except: 40 | try: 41 | answer=table.find('div',{'id':'wgt-answers'}).find('span',{'class':'con'}).get_text() 42 | except: 43 | answer='-' 44 | return [title,des,answer] 45 | 46 | def main(): 47 | f=open('result.txt','a') 48 | lines=[] 49 | count=0 50 | for line in open('./urls.txt','r'): 51 | line=line.replace('\n','') 52 | lines.append(line) 53 | if len(lines)<10: 54 | continue 55 | threadings=[] 56 | for item in lines: 57 | work=Ques(item) 58 | threadings.append(work) 59 | for work in threadings: 60 | work.start() 61 | for work in threadings: 62 | work.join() 63 | for work in threadings: 64 | if work.status==False: 65 | failed=open('question_failed','a') 66 | failed.write(work.line+'\n') 67 | failed.close() 68 | continue 69 | count+=1 70 | print(count) 71 | f.write(str([work.word]+work.data)+'\n') 72 | lines.clear() 73 | threadings=[] 74 | for item in lines: 75 | work=Ques(item) 76 | threadings.append(work) 77 
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    for work in threadings:
        if work.status==False:
            failed=open('question_failed','a')
            failed.write(work.line+'\n')
            failed.close()
            continue
        f.write(str([work.word]+work.data)+'\n')
    f.close()

main()
--------------------------------------------------------------------------------
/zhidao.baidu.com/search.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import time
import threading

headers = {
    'Host':"zhidao.baidu.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

def search(key):
    # search zhidao.baidu.com for the keyword and return the first question URL found
    html=requests.get('https://zhidao.baidu.com/search?lm=0&rn=10&pn=0&fr=search&ie=utf-8&word='+key,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore')
    table=BeautifulSoup(html,'lxml').find('div',{'class':'list-wraper'}).find_all('dl')
    for dl in table:
        try:
            url=dl.find('a').get('href')
            if 'zhidao.baidu.com/question' in url:
                return url
        except:
            continue

# worker thread that runs one keyword search
class Search(threading.Thread):
    def __init__(self,key):
        super(Search,self).__init__()
        self.key=key

    def run(self):
        self.status=True
        try:
            self.url=search(self.key)
        except:
            self.status=False

def main():
    f=open('urls.txt','w')
    lines=[]
    count=0
    for line in open('./failed_words','r'):
        line=line.replace('\n','')
        lines.append(line)
        # search in batches of 5 threads
        if len(lines)<5:
            continue
        threadings=[]
        for item in lines:
            work=Search(item)
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        for work in threadings:
            if work.status==False:
                continue
            if work.url==None:
                continue
            count+=1
            print(count)
            try:
                f.write(work.key+"||"+work.url+'\n')
            except:
                continue
        lines.clear()
    # leftover batch (fewer than 5 keywords) after the loop
    threadings=[]
    for item in lines:
        work=Search(item)
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    for work in threadings:
        if work.status==False:
            continue
        if work.url==None:
            continue
        count+=1
        print(count)
        f.write(work.key+"||"+work.url+'\n')
    lines.clear()
    f.close()

main()
--------------------------------------------------------------------------------
/zhihu/get_followee.py:
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import json


headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1', 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def get_followe(ID,hashid): 17 | html=requests.get('https://www.zhihu.com/people/%s/followees'%ID,headers=headers).text 18 | xsrf=BeautifulSoup(html,'lxml').find('input',attrs={'name':'_xsrf'}).get('value') 19 | print(xsrf) 20 | count=0 21 | persons=[] 22 | while True: 23 | data={ 24 | 'method':"next", 25 | 'params':'{"offset":%s,"order_by":"created","hash_id":"%s"}'%(count,hashid), 26 | '_xsrf':xsrf 27 | } 28 | try: 29 | html=requests.post('https://www.zhihu.com/node/ProfileFolloweesListV2',headers=headers,data=data).text 30 | except: 31 | continue 32 | try: 33 | jsondata=json.loads(html)['msg'] 34 | except: 35 | return persons 36 | if(jsondata==[]): 37 | break 38 | for item in jsondata: 39 | name=BeautifulSoup(item,'lxml').find('a',attrs={'class':'zg-link'}).get('title') 40 | persons.append(name) 41 | count+=20 42 | return persons 43 | 44 | def main(): 45 | f=open('followee.txt','a',encoding='utf-8') 46 | statue=True 47 | for line in open('data.txt','r').readlines(): 48 | lists=line.split('||') 49 | name=lists[0] 50 | if(statue): 51 | if(name=='keso'): 52 | statue=False 53 | continue 54 | ID=lists[1] 55 | item={} 56 | item['name']=name 57 | item['id']=ID 58 | hashid=lists[3] 59 | item['followee']=get_followe(ID, hashid) 60 | f.write(str(item)+'\n') 61 | print(name) 62 | main() 63 | -------------------------------------------------------------------------------- /zhihu/top500.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import json 5 | 6 | def get_top(page): 7 | html=requests.get('http://api.kanzhihu.com/topuser/follower/%s/50'%page).text 8 | data=json.loads(html)['topuser'] 9 | return data 10 | 11 | def main(): 12 | f=open('persons.txt','a',encoding='utf-8') 13 | page=1 14 | while True: 15 | data=get_top(page) 16 | for item in data: 17 | text=item['name']+'||'+item['id']+'||'+str(item['follower'])+'||'+item['hash'] 18 | f.write(text+'\n') 19 | print(page) 20 | page+=1 21 | if(page==20): 22 | break 23 | f.close() 24 | 25 | def followee(): 26 | f=open('data.txt','a',encoding='utf-8') 27 | for line in open('persons.txt','r').readlines(): 28 | line=line.replace('\n','') 29 | print(line) 30 | data=requests.get('http://api.kanzhihu.com/userdetail2/'+line.split('||')[-1]).text 31 | data=json.loads(data) 32 | line=line+'|| '+str(data['signature'])+'|| '+str(data['description'])+'|| ' 33 | detail=data['detail'] 34 | line=line+str(detail['ask'])+'|| '+str(detail['answer'])+'|| '+str(detail['post'])+'|| '+str(detail['agree'])+'|| '+str(detail['thanks'])+'|| '+str(detail['fav'])+'||'+str(detail['logs']) 35 | f.write(line.replace('\r','').replace('\n','')+'\n') 36 | 37 | followee() 38 | -------------------------------------------------------------------------------- /zhihu/zhihuinfor.py: 
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import re

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

def get_topics(ID):
    # topics the user follows, joined with commas
    try:
        html=requests.get('https://www.zhihu.com/people/%s/topics'%ID,headers=headers).text
        table=BeautifulSoup(html,'lxml').find('div',id='zh-profile-topic-list').find_all('strong')
        topics=''
        for item in table:
            topics+=item.get_text()+','
        return topics[:-1]
    except:
        # retry on any failure (note: retries forever)
        return get_topics(ID)

def get_profile(ID):
    # profile fields shown on the user's page, joined with commas
    try:
        html=requests.get('https://www.zhihu.com/people/%s'%ID,headers=headers).text
        # closing </span> assumed here; without a right-hand anchor the lazy group captures nothing
        rel='class="zg-gray-darker">(.*?)</span>'
        table=re.findall(rel,html)
        profile=''
        for item in table:
            profile+=item+','
        return profile[:-1]
    except:
        return get_profile(ID)

def main():
    f=open('person.txt','a',encoding='utf-8')
    statue=True
    for line in open('data.txt','r').readlines():
        line=line.replace('\n','')
        ID=line.split('||')[1]
        # resume support: skip every user up to and including 'kun-yu'
        if(statue):
            if(ID=='kun-yu'):
                statue=False
            continue
        topics=get_topics(ID)
        profile=get_profile(ID)
        f.write(line+'||'+topics+'||'+profile+'\n')
        print(line)
    f.close()

main()
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/markhistory.py:
--------------------------------------------------------------------------------
#coding:utf-8


# import modules
import requests
from bs4 import BeautifulSoup
import re
import sqlite3
import os

# get the provinces with admission records
def get_provinces():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the province drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
    provinces={}
    # map province name -> option value
    for option in table:
        provinces[option.get_text()]=option.get('value')
    return provinces

# get the admission majors, scores and other details
def parser(year,aid,province):
    # build the url, open the page, get the source
    url='http://zsb.suda.edu.cn/view_markhistory.aspx?aa=%s年%s各专业录取分数一览表&aid=%s&ay=%s'%(year,province,aid,year)
    print(url)
    html=requests.get(url).text
    # parse the page and extract the score table
    table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
    items=[]
    # iterate over every data row (the header row is skipped above) and collect its fields
    for tr in table:
        item=[year,province]
        for td in tr.find_all('td'):
            item.append(td.get_text().replace('\n',''))
        items.append(item)
    return items

def main():
    # start from a fresh database file
    try:
        os.remove('data.db')
    except:
        pass
    # connect to the database
    conn=sqlite3.connect('data.db')
    # create a cursor
    cursor=conn.cursor()
    # create the data table
    cursor.execute("create table if not exists markhistory(year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
    # years to scrape
    need_years=['2015','2014','2013']
    # get the provinces with admission records
    provinces=get_provinces()
    # get the data for every province and year
    for year in need_years:
        for key in provinces:
            # details for every major in a given year and province
            try:
                items=parser(year,provinces[key],key)
            except:
                continue
            for item in items:
                # insert the row
                cursor.execute('insert into markhistory(year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
            # commit the transaction
            conn.commit()
            # print a progress message
            print(year,key,'--ok')
    # close the cursor
    cursor.close()
    # close the database connection
    conn.close()

main()
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/new_markhistory.py:
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import sqlite3
import os
import re


# get the provinces with admission records
def get_provinces():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the province drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
    provinces=[]
    # collect the province names
    for option in table:
        provinces.append(option.get_text())
    return provinces

def get_school():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the school drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList3').find_all('option')
    school=[]
    # collect the school names
    for option in table:
        school.append(option.get_text())
    return school

# get the admission majors, scores and other details
def parser(year,province,school):
    # build the url, open the page, get the source
    url='http://zsb.suda.edu.cn/search.aspx?nf=%s&sf=%s&xy=%s'%(year,province,school)
    html=requests.get(url).text
    # parse the page and extract the score table
    table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
    items=[]
    # iterate over every data row and collect its fields
    for tr in table:
        item=[]
        for td in tr.find_all('td'):
            item.append(td.get_text().replace('\n',''))
        items.append(item)
    return items

def main():
    # start from a fresh database file
    try:
        os.remove('data.db')
    except:
        pass
    # connect to the database
    conn=sqlite3.connect('data.db')
    # create a cursor
    cursor=conn.cursor()
    # create the data table
    cursor.execute("create table if not exists markhistory(school varchar(80),year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
    # years to scrape
    need_years=['2015','2014','2013']
    # get the provinces with admission records
    provinces=get_provinces()
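    # schools/colleges, in the same order as the form's drop-down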
    schools=get_school()
    # get the data for every year, province and school
    for year in need_years:
        for province in provinces:
            for school in schools:
                # details for every major in a given year, province and school
                # option values in the school drop-down jump by 2 after the 19th entry, so adjust the index
                index=schools.index(school)+1
                if(index>19):
                    index+=2
                try:
                    items=parser(year,provinces.index(province)+1,index)
                except:
                    continue
                for item in items:
                    item.insert(2, school)
                    # insert the row
                    cursor.execute('insert into markhistory(school,year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
                # print a progress message
                print(school,year,province,'--ok')
                # commit the transaction
                conn.commit()
    # close the cursor
    cursor.close()
    # close the database connection
    conn.close()

main()
--------------------------------------------------------------------------------