├── 1688
└── get_tel.py
├── 58.213.159.173
└── jiangsu_Atmosphere.py
├── JobGet
└── JobInforGet.py
├── Nyspider.py
├── README.md
├── ali_comments
├── fan_jian.py
├── langconv.py
├── taobao.py
├── tianmao.py
└── zh_wiki.py
├── amap
└── amap.py
├── amazon
├── get_items.py
├── items_usa.py
└── shopProducts.py
├── anjuke
├── anjuke_hourse.py
├── community.py
├── get_house.py
└── location.py
├── apk.91.com
├── Send_email.py
├── email_game.py
└── email_soft.py
├── apps.fas.usda.gov
└── psdQuery.py
├── aso100.com
├── aso100_ui.py
└── aso100_ui_v2.py
├── baidu.lecai.com
├── lottery.py
└── www.zy91.com
│ └── zndz.py
├── baidumap
├── baidumap.py
└── city_ids.txt
├── bbs.tianya.cn
└── comments.py
├── bjguahao.gov.cn
├── bjguahao.py
├── bjguahao_v2.py
└── bjguahao_v3.py
├── brokerbin.com
├── brokerbin.py
├── brokerbin_3.py
├── email_template
├── filter
│ └── filter.txt
└── send_email.py
├── buluo.qq.com
└── images.py
├── chart.cp.360.cn
└── charthistory.py
├── china.tandfonline.com
└── search_article.py
├── club.qingdaonews.com
└── article.py
├── cn.bing.com
├── bing_search.py
└── urls.txt
├── data.cma.gov.cn
├── Duplicate.py
└── get_data.py
├── datacenter.mep.gov.cn
├── air_dairy.py
└── air_dairy_aqi.py
├── dianping
├── comments.txt
├── data
│ ├── 上海.xls
│ ├── 北京.xls
│ ├── 南京.xls
│ ├── 厦门.xls
│ ├── 大连.xls
│ ├── 天津.xls
│ ├── 宁波.xls
│ ├── 广州.xls
│ ├── 成都.xls
│ ├── 无锡.xls
│ ├── 杭州.xls
│ ├── 武汉.xls
│ ├── 沈阳.xls
│ ├── 济南.xls
│ ├── 深圳.xls
│ ├── 苏州.xls
│ ├── 西安.xls
│ ├── 郑州.xls
│ ├── 重庆.xls
│ ├── 长沙.xls
│ └── 青岛.xls
├── get_info.py
├── memberlist.py
├── memberlist.txt
├── shopinfor.py
└── shoplist.py
├── douban
├── dou_movie.py
├── dou_tv.py
├── movie_grade.py
├── movieinfor.py
└── movies.txt
├── downloadbooks
└── save_into_baiduyun.py
├── duapp2.drexel.edu
├── TMS.py
├── TMSCourse_Excel.py
└── TMSCourse_Sqlite.py
├── finance.sina.com.cn
├── ManagerInfo.py
└── codes.txt
├── finance.yahoo.com
├── finance.py
└── new_finance.py
├── forecast.io
├── forecast.py
└── getData.py
├── fsfc.fsjw.gov.cn
└── house.py
├── gcjs.linfen.gov.cn
└── company.py
├── hklock.com
└── products.py
├── itslaw
└── get_anli.py
├── jbk.39.net
└── disease.py
├── job.qiaosiwang.com
└── workinfor.py
├── job
├── Job_get.py
└── REANME.md
├── landchina
├── infor.py
└── landchina.py
├── lvyou.baidu.com
└── guilin.py
├── mall.jd.com
└── jd_shop.py
├── maoyan
├── Duplicate.py
├── get_infor.py
└── maoyan.py
├── music.163.com
└── music_lists.py
├── news.sohu.com
└── news.py
├── news_get
├── cn.chinadaily.com.cn
│ └── chinadaily.com.cn.py
├── people.com.cn
│ └── people.com.cn.py
├── www.cankaoxiaoxi.com
│ └── cankaoxiaoxi.com.py
├── www.eastday.com
│ └── eastday.com.py
├── www.gmw.cn
│ └── gmw.cn.py
├── www.haiwainet.cn
│ └── haiwainet.cn.py
├── www.huanqiu.com
│ └── huanqiu.com.py
├── www.youth.cn
│ └── youth.cn.py
└── www.zaobao.com
│ └── zaobao.com.py
├── newseed.pedaily.cn
└── invest.py
├── pan.baidu.com
└── sharelink.py
├── qimingpian.com
└── qimingpian.py
├── rank.kongzhong.com
└── userInfor.py
├── stock.finance.qq.com
├── stk_holder.py
├── stkcode.py
└── stkcode.txt
├── stock.jrj.com.cn
└── flowhistory.py
├── taobao
├── suggest.py
└── sycm.py
├── tur.bizdirlib.com
└── bizdirlib.py
├── waimai.meituan.com
└── orderlist.py
├── weibo
├── weibo.md
└── weibo.py
├── weidian
└── weidian.py
├── wenda.so.com
├── question.py
└── search.py
├── wenshu.court.gov.cn
└── download.py
├── worldfreightrates
└── trates.py
├── www.18ladys.com
└── 18ladys.py
├── www.360che.com
└── products.py
├── www.3j1688.com
└── 3j1688.py
├── www.58.com
├── JobInforGet.py
├── company.py
├── companyExcel.py
└── sendemail.py
├── www.aihuishou.com
└── get_price.py
├── www.airbnb.com
├── deal.py
├── roominfor.py
├── rooms.py
└── userinfor.py
├── www.aqistudy.cn
└── aqistudy.py
├── www.autozi.com
├── carBrandLetter.py
├── products.py
└── products_infor.py
├── www.b8b8.tv
├── ballbar_mobile.py
└── ballbar_pc.py
├── www.baikemy.com
└── disease.py
├── www.cbooo.cn
└── cbooo.py
├── www.chazidian.com
└── yuwen.py
├── www.chealth.org.cn
└── disease.py
├── www.china-10.com
├── china10.py
└── excel.py
├── www.chuanlaoda.cn
├── CaptchaOCR.dll
├── chuanlaoda.py
├── py2exe_install.py
├── testdll.py
└── x64
│ └── CaptchaOCR.dll
├── www.cjsyw.com
└── ship.py
├── www.cofeed.com
└── cofeed.py
├── www.cpbz.gov.cn
├── company.py
└── write_to_excel.py
├── www.ctrip.com
├── comments.py
├── comments_bydate.py
└── youtrip.py
├── www.dicos.com.cn
├── citys.txt
└── storelist.py
├── www.eastmoney.com
├── company.py
├── guba.py
├── iguba.py
├── quote.py
├── transaction.py
└── urls.txt
├── www.fang.com
├── get_hourse.py
└── new_hourse.py
├── www.gamefaqs.com
└── gameinfor.py
├── www.ganji.com
└── ganji_tel.py
├── www.gewara.com
└── reviews.py
├── www.guahao.com
├── doctor.py
└── hospital.py
├── www.hexun.com
└── hexun.py
├── www.ifeng.com
└── fashionhealth.py
├── www.imdb.com
├── boxoffice.py
├── movies.py
└── rottentomatoes.py
├── www.itjuzi.com
├── baseInvestevents.py
├── company.py
├── companylist.py
├── investevents.py
├── itjuzi.py
└── tag_itjuzi.py
├── www.jfz.com
└── products.py
├── www.jisilu.com
├── JiSiLu.py
└── jisilu.py
├── www.kfc.com
├── citys.txt
└── storelist.py
├── www.kimiss.com
├── Nyspider.py
├── baby.txt
├── baby_pro.txt
├── get_product.py
└── man.txt
├── www.lagou.com
└── lagou.py
├── www.lianjia.com
└── lianjiahourse.py
├── www.liepin.com
└── liepin.py
├── www.locoso.com
└── locoso.py
├── www.mohurd.gov.cn
├── company.py
├── deal.py
├── registrar_thread.py
└── registrarinfor.py
├── www.ncbi.nlm.nih.gov
├── gethtml.py
├── parser.py
├── pubmed.py
└── write_to_excel.py
├── www.pizzahut.com.cn
├── citys.txt
└── storelist.py
├── www.pm25.in
└── pm25.py
├── www.ppdai.com
├── Tppdai.py
├── excel.py
├── get_data.py
├── invest.py
├── ppdai.py
└── ppdaiInfor.py
├── www.renrendai.com
└── renrendai.py
├── www.sxhouse.com.cn
└── sxhouse.py
├── www.teld.cn
├── setting
│ └── cities.txt
└── teld.py
├── www.tichk.org
└── travel_agent.py
├── www.tjcn.org
└── patent.py
├── www.trademaps.cn
└── trademaps.py
├── www.tripadvisor.com
├── deal.py
├── getpage.py
├── moredata.py
└── userinfor.py
├── www.tyshbj.com.cn
└── tyshbj.py
├── www.ukers.cn
└── ukers.py
├── www.variflight.com
├── flights_num.txt
├── icon
│ ├── 0
│ │ ├── 20.png
│ │ └── 23.png
│ ├── 1
│ │ ├── 1.png
│ │ └── 4.png
│ ├── 2
│ │ ├── 0.png
│ │ └── 33.png
│ ├── 3
│ │ ├── 43.png
│ │ └── 64.png
│ ├── 4
│ │ ├── 3.png
│ │ └── 9.png
│ ├── 5
│ │ ├── 71.png
│ │ └── 8.png
│ ├── 6
│ │ ├── 19.png
│ │ ├── 51.png
│ │ └── 6.png
│ ├── 7
│ │ ├── 16.png
│ │ └── 26.png
│ ├── 8
│ │ ├── 93.png
│ │ └── 98.png
│ ├── 9
│ │ ├── 21.png
│ │ └── 31.png
│ ├── 24
│ │ ├── 117.png
│ │ ├── 304.png
│ │ └── 783.png
│ ├── 44
│ │ ├── 141.png
│ │ └── 88.png
│ ├── b
│ │ ├── 2202.png
│ │ └── 2248.png
│ ├── m
│ │ ├── 2397.png
│ │ ├── 2408.png
│ │ └── 2419.png
│ └── s
│ │ ├── 2245.png
│ │ ├── 2413.png
│ │ └── 2424.png
├── ui_variflight.py
└── variflight.py
├── www.vvic.com
└── getitems.py
├── www.watchseries.li
└── watchseries.py
├── www.we.com
└── renrendai.py
├── www.yelp.com
├── restaurant_infor.py
└── restaurants.py
├── www.yhd.com
├── data.xls
├── replace.py
├── shopinfor.py
└── text.html
├── www.zdic.net
├── words.txt
├── write_to_excel.py
└── zdic.py
├── www.zhongchou.com
├── Duplicate.py
├── excel.py
├── get_id.py
├── get_infor.py
└── other.py
├── www.zimuzu.tv
├── movie_get.py
└── tv_get.py
├── www.zy91.com
└── zndz.py
├── wwwapps.ups.com
├── search.py
└── write2excel.py
├── xxgk.jl.gov.cn
└── infor.py
├── yangcong345.com
└── yangcong345.py
├── zhidao.baidu.com
├── question.py
└── search.py
├── zhihu
├── get_followee.py
├── top500.py
├── zhihu_search.py
└── zhihuinfor.py
└── zsb.suda.edu.cn
├── inquery.py
├── markhistory.py
└── new_markhistory.py
/Nyspider.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import os
5 | import sqlite3
6 | import xlwt3
7 | from email import encoders
8 | from email.header import Header
9 | from email.mime.text import MIMEText
10 | from email.utils import parseaddr,formataddr
11 | import smtplib
12 | import datetime
13 |
14 | headers = {
15 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
16 | "Accept-Encoding": "gzip, deflate",
17 | "Accept-Language": "en-US,en;q=0.5",
18 | "Connection": "keep-alive",
19 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
20 |
21 |
22 | def get_image(image_url,image_name):
23 | content=requests.get(image_url,headers=headers).content
24 | with open(image_name,'wb') as f:
25 | f.write(content)
26 |         f.close()
27 |
28 | def to_Excel():
29 | for filename in os.listdir('.'):
30 | if(filename.endswith('txt')):
31 | f_d=open(filename,'r')
32 | f_ex=xlwt3.Workbook()
33 | sheet=f_ex.add_sheet('one')
34 | count=0
35 | for line in f_d.readlines():
36 | lists=line.split('|')
37 | try:
38 | num=0
39 | for text in lists:
40 | sheet.write(count,num,text)
41 | num+=1
42 | count+=1
43 | except:
44 | sheet=f_ex.add_sheet('two')
45 | count=0
46 | num=0
47 | for text in lists:
48 | sheet.write(count,num,text)
49 | num+=1
50 | count+=1
51 | f_ex.save(filename.replace('txt','xls'))
52 |
53 | def send_email(email,subject,text,user,passwd):  # NOTE: relies on the _format_addr() helper defined in brokerbin.com/send_email.py
54 | smtp_server='smtp.126.com'
55 | msg = MIMEText(text, 'plain', 'utf-8')
56 | msg['Subject']=subject
57 | msg['From'] = _format_addr(user)
58 | msg['To'] = _format_addr(email)
59 | server = smtplib.SMTP(smtp_server, 25)
60 | server.set_debuglevel(1)
61 | server.login(user, passwd)
62 | server.sendmail(user, [email], msg.as_string())
63 | server.quit()
64 |
65 | def convert_html(html):
66 | return html.encode('ISO-8859-1').decode('utf-8','ignore')
67 |
68 | def Duplicate():
69 | for filename in os.listdir('.'):
70 | if filename.endswith('txt'):
71 | lines=open(filename,'r').readlines()
72 | lines=list(set(lines))
73 | lines.sort()
74 | f=open(filename,'w')
75 | for line in lines:
76 | f.write(line)
77 | f.close()
78 |
79 | def yesterday_get(today=datetime.datetime.now()):  # NOTE: the default argument is evaluated once at import time, not on every call
80 | oneday = datetime.timedelta(days=1)
81 | yesterday = today- oneday
82 | return yesterday
83 |
--------------------------------------------------------------------------------
/ali_comments/fan_jian.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from langconv import *
3 | import xlrd
4 | import xlwt3
5 |
6 | # Convert Traditional Chinese to Simplified Chinese
7 | def run():
8 | name='相机'
9 | f=xlwt3.Workbook(encoding='utf-8')
10 | sheet=f.add_sheet('sheet')
11 | data=xlrd.open_workbook(name+'.xls')
12 | table=data.sheets()[0]
13 | for i in range(table.nrows):
14 | line=table.cell(i,0).value
15 | line=fan_jian(line)
16 | sheet.write(i,0,line)
17 | f.save(name+'_.xls')
18 |
19 |
20 | def fan_jian(line):
21 |     line = Converter('zh-hans').convert(line)
22 |     # strings are already unicode in Python 3, so no encode/decode round-trip is needed
23 | return line
24 |
25 | def jian_fan(line):
26 | line = Converter('zh-hant').convert(line.decode('utf-8'))
27 | line = line.encode('utf-8')
28 | return line
29 |
30 | run()
31 |
--------------------------------------------------------------------------------
/ali_comments/taobao.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import xlwt3
5 | import re
6 |
7 | requests.packages.urllib3.disable_warnings()
8 |
9 | class Get_comments(object):
10 | """docstring for Get_comments"""
11 | def __init__(self):
12 | super(Get_comments, self).__init__()
13 | self.f=xlwt3.Workbook()
14 | self.sheet=self.f.add_sheet('sheet')
15 | self.headers = {
16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Accept-Encoding': 'gzip, deflate',
20 | 'Cookie':"isg=1895AE3ACA648D8B28455A6D1992F41F; l=AvX1ovPGHd3jI30I58r3v3IcJXuvcqmE; t=1e6dd9b5d55aacb2ca5e07cb5be03a2b; thw=cn; cna=7Dd0DgMB+HcCAXrNCByTSHxR; uc3=nk2=1pCplIlkFn7n&id2=WvAz2mB1qeE%2F&vt3=F8dASMh%2Fnu8OGgfEtGM%3D&lg2=URm48syIIVrSKA%3D%3D; tracknick=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; _cc_=URm48syIZQ%3D%3D; tg=0; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=np=&ci=-1_0&cyk=0_0; ali_ab=211.69.194.131.1444291484725.8; lgc=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; lzstat_uv=13179738183169067975|3492151@3600092@3288243@3260534; v=0; cookie2=1cdef8cc85ef4b19772fd48de808f9c0; _tb_token_=0BF8LVbNvUzT; uc1=cookie14=UoWzXLHAxnd7aw%3D%3D&existShop=true&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=WqG3DMC9Edo1SB5NB6Qtng%3D%3D&tag=2&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; hng=CN%7Czh-cn%7CCHN; existShop=MTQ0NTE1NzMzOQ%3D%3D; sg=343; cookie1=BYTvDkInmXl2wO%2F6AW0tX%2Bpb6nHX4a5Olly%2Fg4DvWfE%3D; unb=907324234; skt=ae45361e45082d58; publishItemObj=Ng%3D%3D; _l_g_=Ug%3D%3D; _nk_=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; cookie17=WvAz2mB1qeE%2F",
21 | 'Connection': 'keep-alive'}
22 | self.count=0
23 | import ssl
24 | ssl._create_default_https_context = ssl._create_unverified_context
25 | self.url='https://rate.taobao.com/feedRateList.htm?callback=jsonp_reviews_list&userNumId=84131819&auctionNumId=6774286903&siteID=3&rateType=&orderType=sort_weight&showContent=1&attribute=¤tPageNum='
26 | def run(self):
27 | cert='/home/nyloner/work/ali_comments/cert.pem'
28 | for page in range(80):
29 | html=requests.get(self.url+str(page+1),headers=self.headers,verify=False).text
30 | print(html)
31 | rel='content":"(.*?)"'
32 | comments=re.findall(rel,html)
33 | for item in comments:
34 | self.sheet.write(self.count,0,item)
35 | self.count+=1
36 | self.f.save('麻辣花生.xls')
37 | print(self.count)
38 |
39 | work=Get_comments()
40 | work.run()
41 |
--------------------------------------------------------------------------------
/amap/amap.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import time
4 | from bs4 import BeautifulSoup
5 | import random
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate'}
12 |
13 | def get_province():
14 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&city=100000&geoobj=19.198221%7C11.793397%7C-172.051779%7C53.547635&keywords=%E5%B9%B2%E6%9E%9C',headers=headers).text
15 | data=json.loads(html)
16 | table=BeautifulSoup(data['html'],'lxml').find_all('div',{'class':'sug-province'})
17 | f=open('citys.txt','a')
18 | for item in table:
19 | try:
20 | province=item.find('b').get_text()
21 | citys=item.find_all('a',{'class':'citycode'})
22 | for city in citys:
23 | f.write(province+'|'+city.get_text()+'|'+city.get('adcode')+'\n')
24 | except:
25 | continue
26 | f.close()
27 |
28 | def search(key,citycode):
29 | page=1
30 | result=[]
31 | while True:
32 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=%s&qii=true&cluster_state=5&need_utd=true&div=PC1000&addr_poi_merge=true&is_classify=true&city=%s&keywords=%s'%(page,citycode,key),headers=headers).text
33 | data=json.loads(html)['data'][0]['list']
34 | if data==[]:
35 | break
36 | for item in data:
37 | try:
38 | tel=item['templateData']['tel']
39 | address=item['address']
40 | name=item['name']
41 | result.append(name+'| '+address+' |'+tel)
42 | except:
43 | continue
44 | page+=1
45 | print(citycode,page)
46 | time.sleep(random.randint(2,8))
47 | return result
48 |
49 | def main():
50 | for line in open('citys.txt','r'):
51 | line=line.replace('\n','')
52 | code=line.split('|')[-1]
53 | try:
54 | result=search('干果',code)
55 | except:
56 | failed=open('failed.txt','a')
57 | failed.write(line+'\n')
58 | failed.close()
59 | continue
60 | f=open('result.txt','a')
61 | for item in result:
62 | f.write(line+'|'+item+'\n')
63 | f.close()
64 | print(line)
65 | main()
66 |
--------------------------------------------------------------------------------
/anjuke/location.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 |
4 |
5 | def get_location(address,city):
6 |     url='http://api.map.baidu.com/place/v2/search?query=%s&region=%s&city_limit=true&output=json&ak=fh980b9Ga64S8bl8QblSC3kq'%(address,city)
7 | html=requests.get(url).text
8 | try:
9 | data=json.loads(html)['results'][0]['location']
10 | except:
11 | return ''
12 | lng=data['lng']
13 | lat=data['lat']
14 | return str(lng)+'|'+str(lat)
15 |
16 |
17 | line=get_location('滨湖新区四川路与云谷路交口西北角','合肥')
18 |
--------------------------------------------------------------------------------
/baidumap/baidumap.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import requests
3 | import json
4 | import time
5 | import re
6 |
7 |
8 | headers = {
9 | 'Host':"map.baidu.com",
10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
11 | "Accept-Encoding": "gzip, deflate",
12 | "Accept-Language": "en-US,en;q=0.5",
13 | "Connection": "keep-alive",
14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
15 |
16 | def citys():
17 | html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=%E6%B1%BD%E8%BD%A6%E7%BE%8E%E5%AE%B9%E5%BA%97&c=1&src=0&wd2=&sug=0&l=5&b=(7002451.220000001,1994587.88;19470675.22,7343963.88)&from=webmap&biz_forward={%22scaler%22:1,%22styles%22:%22pl%22}&sug_forward=&tn=B_NORMAL_MAP&nn=0&u_loc=12736591.152491,3547888.166124&ie=utf-8&t=1459951988807',headers=headers).text
18 | f=open('city_ids.txt','a')
19 | data=json.loads(html)
20 | for item in data['content']:
21 | #for city in item['city']:
22 | f.write(str(item)+'\n')
23 | f.close()
24 |
25 | def get_infor(keyword,code,page):
26 | html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=con&from=webmap&c='+str(code)+'&wd='+keyword+'&wd2=&pn='+str(page)+'&nn='+str(page*10)+'&db=0&sug=0&addr=0&&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&tn=B_NORMAL_MAP&u_loc=12736591.152491,3547888.166124&ie=utf-8',headers=headers).text
27 | data=json.loads(html)['content']
28 | return data
29 |
30 |
31 | def main():
32 | keys=['眼镜店','视光中心']
33 | for keyword in keys:
34 | f=open(keyword+'_tels.txt','a')
35 | for line in open('city_ids.txt','r').readlines():
36 | line=line.replace('\n','')
37 | code=eval(line)['code']
38 | page=1
39 | while True:
40 | try:
41 | data=get_infor(keyword,code,page)
42 | except:
43 | break
44 | if data==[]:
45 | break
46 | for item in data:
47 | f.write(str(item)+'\n')
48 | page+=1
49 | print(code,page)
50 | time.sleep(1)
51 | f.close()
52 | main()
53 |
--------------------------------------------------------------------------------
/brokerbin.com/email_template:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Hi {name},
10 |
11 |
12 | We have the following quote matching your search on Brokerbin:
13 |
14 |
15 | {product_name} new landed {price} each, 3-day delivery
16 |
17 |
18 |
19 | Click here to buy on our website.
20 |
21 |
22 |
23 | Please note that the special price is only provided after you log in.
24 |
25 |
26 | Thanks!
27 |
28 |
29 |
30 |
31 | --
32 |
33 |
34 | *Please register at our web-store, www.sailnetwork.com, to check P&A 24/7. Regular coupons will be sent to registered customers, and product prices will be lower online.
35 |
36 | Best regards,
37 |
38 | Sales Department | Sail Network Co., Ltd.
39 |
40 | Office: +86(0)2154223056*8004
41 |
42 | E-mail: sales@sailnetwork.com
43 |
44 | E-Shop: www.sailnetwork.com
45 |
46 | No.3-318, Lane7058, Zhongchun Rd. Shanghai, China
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/brokerbin.com/filter/filter.txt:
--------------------------------------------------------------------------------
1 | nocsupply.com
2 | nfsmith.nl
3 | florinconnect.com
4 | 3c-systerms.com
5 | arbitech.com
6 | fulinetwork.com
7 | beaoncn.com
8 | marketconnections.nl
9 | globalnetworkstech.com
10 | konnect8.co.uk
11 | inventusgroup.com
12 | square1product.com
13 | squarelnc.com
14 | apexitltd.com
15 | uniontechcoop.com
16 |
--------------------------------------------------------------------------------
/brokerbin.com/send_email.py:
--------------------------------------------------------------------------------
1 | from email import encoders
2 | from email.header import Header
3 | from email.mime.text import MIMEText
4 | from email.utils import parseaddr,formataddr
5 | import smtplib
6 | import time
7 | import os
8 | import json
9 |
10 |
11 | def _format_addr(s):
12 | name, addr = parseaddr(s)
13 | return formataddr((Header(name, 'utf-8').encode(), addr))
14 |
15 | def sendEmail(fromemail,passwd,toemail,subject,text):
16 | msg = MIMEText(text, 'html', 'utf-8')
17 | msg['Subject']=subject
18 | msg['From'] = _format_addr(fromemail.replace('foxmail','sailnetwork'))
19 | msg['To'] = _format_addr(toemail)
20 | server=smtplib.SMTP_SSL('smtp.qq.com')
21 | server.ehlo('smtp.qq.com')
22 | server.login(fromemail,passwd)
23 | server.sendmail(fromemail, [toemail], msg.as_string())
24 | server.quit()
25 |
26 | def load_emails(filename):
27 | f=open('email/'+filename,'r',encoding='utf-8').read()
28 | emails=[]
29 | for item in f.split('---'*8):
30 | try:
31 | lines=item.split('***'*4)
32 | subject=lines[0].replace('\r\n','')
33 | email=lines[1].replace('\r\n','').replace(' ','')
34 | text=lines[2]
35 | emails.append([email,subject,text])
36 | except:
37 | continue
38 | return emails
39 |
40 | def load_login():
41 | f=open('./email.json','r',encoding='utf8')
42 | data=json.load(f)
43 | return data
44 |
45 | def main():
46 | try:
47 | data=load_login()
48 | fromemail=data['fromemail']
49 | passwd=data['passwd']
50 | toemail=data['toemail']
51 | except:
52 | print("帐号导入失败")
53 | return
54 | for filename in os.listdir('email'):
55 | try:
56 | emails=load_emails(filename)
57 | except:
58 |             print(filename,'load failed'); continue  # skip files that could not be parsed
59 | for i in range(len(emails)):
60 | try:
61 | email=emails[i]
62 | subject=email[1].replace('\r','').replace('\n','').replace('\t','').replace(' ','')+'\t'+email[0].replace('\r','').replace('\n','').replace('\t','').replace(' ','')
63 | except:
64 | continue
65 | try:
66 | sendEmail(fromemail,passwd,toemail,subject,email[2])
67 | time.sleep(2)
68 | print(subject,'send ok')
69 | except:
70 | print(subject,'failed')
71 | print(filename,'完成')
72 |
73 | main()
74 | time.sleep(60)
75 |
--------------------------------------------------------------------------------
/buluo.qq.com/images.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import os
4 | import json
5 | import time
6 |
7 |
8 | headers = {
9 | 'X-Requested-With': 'XMLHttpRequest',
10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
11 | "Accept-Encoding": "gzip, deflate",
12 | "Accept-Language": "en-US,en;q=0.5",
13 | "Connection": "keep-alive",
14 | 'Referer': 'http://buluo.qq.com/mobile/barindex.html?_wv=1027&_bid=128&from=recentvisited&bid=15226',
15 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"}
16 |
17 | def get_page(bid,page):
18 | data={
19 | 'bid':bid,
20 | 'num':'10',
21 | 'start':page*10,
22 | 'bkn':''
23 | }
24 | html=requests.post('http://buluo.qq.com/cgi-bin/bar/post/get_post_by_page',headers=headers,data=data).text
25 | data=json.loads(html)['result']['posts']
26 | result=[]
27 | for item in data:
28 | try:
29 | title=item['title']
30 | pic_list=item['post']['pic_list']
31 | except:
32 | continue
33 | result.append([title,pic_list])
34 | return result
35 |
36 | def save_image(filedir,filename,img_url):
37 | headers = {
38 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
39 | "Accept-Encoding": "gzip, deflate",
40 | "Accept-Language": "en-US,en;q=0.5",
41 | "Connection": "keep-alive",
42 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"}
43 | content=requests.get(img_url,headers=headers,timeout=30).content
44 | with open('images/%s/%s.jpg'%(filedir,filename),'wb') as img:
45 | img.write(content)
46 |
47 | def main():
48 | bid=input("输入bid:")
49 | try:
50 | startpage=input("起始页码:")
51 | startpage=int(startpage)-1
52 | except:
53 | startpage=0
54 | try:
55 | endpage=input("结束页码:")
56 | endpage=int(endpage)-1
57 | except:
58 | endpage=10
59 | filedir=1
60 | try:
61 | os.mkdir('images/')
62 | except:
63 | pass
64 | while startpage<=endpage:
65 | images=get_page(bid,startpage)
66 | for image in images:
67 | try:
68 | os.mkdir('images/'+str(filedir))
69 | except:
70 | pass
71 | f=open('images/%s/content.txt'%filedir,'a',encoding='utf-8')
72 | f.write(image[0])
73 | f.close()
74 | imgnum=1
75 | for img in image[1]:
76 | try:
77 | save_image(filedir,imgnum,img['url'])
78 | except:
79 | continue
80 | imgnum+=1
81 | print('page',startpage,filedir,'ok')
82 | filedir+=1
83 | startpage+=1
84 | print(startpage,'ok')
85 | time.sleep(2)
86 |
87 | main()
88 |
--------------------------------------------------------------------------------
/chart.cp.360.cn/charthistory.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import datetime
5 |
6 | headers = {
7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
9 | 'Accept-Language': 'en-US,en;q=0.5',
10 | 'Accept-Encoding': 'gzip, deflate',
11 | 'Connection': 'keep-alive'}
12 |
13 | def get_history(date):
14 | url='http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span=%s_%s'%(date,date)
15 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk')
16 | tables=BeautifulSoup(html,'lxml').find('div',id='his-tab').find('table',{'width':'100%'}).find_all('table')
17 | result=[]
18 | for table in tables:
19 | for tr in table.find_all('tr'):
20 | try:
21 | tds=tr.find_all('td')
22 | number=tds[0].get_text()
23 | if number=='':
24 | continue
25 | value=tds[1].get_text()
26 | if value=='':
27 | continue
28 | value1=value[:3]
29 | value2=value[1:4]
30 | value3=value[2:]
31 | result.append([date,number,value,value1,value2,value3])
32 | except:
33 | continue
34 | return result
35 |
36 | def nextday(d):
37 | oneday = datetime.timedelta(days=1)
38 | day = d+oneday
39 | return day
40 |
41 | def main():
42 | day=datetime.datetime.strptime('2010-01-01','%Y-%m-%d')
43 | while True:
44 | str_day=str(day).split(' ')[0]
45 | f=open('result.txt','a')
46 | try:
47 | result=get_history(str_day)
48 | except:
49 | print(str_day,'failed')
50 | time.sleep(1)
51 | continue
52 | for item in result:
53 | f.write(str(item)+'\n')
54 | f.close()
55 | day=nextday(day)
56 | print(str_day,'ok')
57 | time.sleep(1)
58 | if str_day=='2016-10-23':
59 | break
60 |
61 | main()
62 |
--------------------------------------------------------------------------------
/china.tandfonline.com/search_article.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "Connection": "keep-alive",
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 | def get_articles():
14 | page=0
15 | while True:
16 | html=requests.get('http://china.tandfonline.com/action/doSearch?AllField=urban+design&Ppub=%5B20151107+TO+20161107%5D&content=standard&countTerms=true&target=default&sortBy=&pageSize=50&subjectTitle=&startPage='+str(page),headers=headers).text
17 | table=BeautifulSoup(html,'lxml').find('ol',{'class':'search-results'}).find_all('li')
18 | f=open('titles.txt','a')
19 | for item in table:
20 | title=item.find('article').get('data-title')
21 | f.write(title+'\n')
22 | f.close()
23 | page+=1
24 | print('抓取第',page,'页')
25 | #time.sleep(1)
26 | if page==267:
27 | break
28 |
29 | def word_cut():
30 | text=open('./titles.txt','r').read()
31 | text=text.replace(':',' ').replace("?",' ').replace('.','').replace(')',' ').replace('(','').replace('+','').replace('“','').replace('”','').replace('\n','')
32 | words=text.split(' ')
33 | result={}
34 | for word in words:
35 | word=word.lower()
36 | try:
37 | result[word]+=1
38 | except:
39 | result[word]=1
40 |
41 | excel=openpyxl.Workbook(write_only=True)
42 | sheet=excel.create_sheet()
43 | for key in result:
44 | sheet.append([key,result[key]])
45 | excel.save('result.xlsx')
46 |
47 | get_articles()
48 |
--------------------------------------------------------------------------------
/club.qingdaonews.com/article.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 |
5 | headers = {
6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
7 | "Accept-Encoding": "gzip, deflate",
8 | "Accept-Language": "en-US,en;q=0.5",
9 | "Connection": "keep-alive",
10 | 'Cookie':'PHPSESSID=d2a521b9298f8691e4c37487b6657ac3; Hm_lvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199772; Hm_lpvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199852; CNZZDATA1000084976=1383072779-1482195841-null%7C1482195841; username=JarMrmn4olyPFzOAltjC0Q%3D%3D; password=jv2Y7Ga10EoO2Tn3W%2FY1plZvYz1QGqB2; NSC_dmvc=ffffffff09020e0445525d5f4f58455e445a4a423660',
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 |
14 | def get_article(endpage):
15 | page=1
16 | result=[]
17 | while True:
18 | url='http://club.qingdaonews.com/usercenter/mytopic.php?page=%s'%page
19 | try:
20 | html=requests.get(url,headers=headers,timeout=30).text
21 | except:
22 | continue
23 | table=BeautifulSoup(html,'lxml').find('div',{'class':'add_list'}).find_all('li')
24 | for li in table:
25 | try:
26 | url='http://club.qingdaonews.com'+li.find('a').get('href')
27 | title=li.find('a').get_text()
28 | result.append([title,url])
29 | except:
30 | continue
31 | if page==endpage:
32 | break
33 | print(page,'ok')
34 | page+=1
35 | return result
36 |
37 | def main():
38 | result=get_article(168)
39 | excel=openpyxl.Workbook(write_only=True)
40 | sheet=excel.create_sheet()
41 | for line in result:
42 | sheet.append(line)
43 | excel.save('urls.xlsx')
44 |
45 | main()
46 |
--------------------------------------------------------------------------------
/cn.bing.com/urls.txt:
--------------------------------------------------------------------------------
1 | www.azlyrics.com
2 | www.metrolyrics.com/
3 | lyrics.wikia.com
4 | www.songlyrics.com
5 | www.musixmatch.com/
6 | www.lyricsfreak.com/
7 | www.lyricsmode.com/
8 | www.directlyrics.com/
9 | www.darklyrics.com/
10 | www.allthelyrics.com
11 | www.sing365.com/
12 | www.lyricsg.com
13 | www.parolesmania.com/
14 | www.sweetslyrics.com
15 | azlyricdb.com
16 | www.musicsonglyrics.com/
17 | www.honeyguide.co.uk
18 | songmeanings.com/
19 | www.lyricsforsong.net/
20 | www.elyrics.com
21 | www.lyricsreg.com
22 | batlyrics.net/
23 | genius.com/
24 | www.lyricspond.com/
25 | artists.letssingit.com/
26 | www.cduniverse.com/
27 | www.leoslyrics.com/
28 | www.lyrster.com/
29 | www.smartlyrics.com/
30 | www.lyrics007.com/
31 | www.classic-country-song-lyrics.com/
32 |
--------------------------------------------------------------------------------
/data.cma.gov.cn/Duplicate.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import os
4 |
5 | def Duplicate():
6 | for filename in os.listdir('.'):
7 | if filename.endswith('txt'):
8 | lines=open(filename,'r').readlines()
9 | lines=list(set(lines))
10 | lines.sort()
11 | f=open(filename,'w')
12 | for line in lines:
13 | f.write(line)
14 | f.close()
15 |
16 | Duplicate()
17 |
--------------------------------------------------------------------------------
/datacenter.mep.gov.cn/air_dairy.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "Connection": "keep-alive",
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 |
14 | def get_table(url):
15 | html=requests.get(url,headers=headers).text
16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr')
17 | result=[]
18 | for tr in table[2:-3]:
19 | item=''
20 | for td in tr.find_all('td'):
21 | item+=td.get_text()+'|'
22 | result.append(item)
23 | return result
24 |
25 | def main():
26 | text_f=open('2014_2016.txt','w',encoding='utf-8')
27 |     startdate='2014-01-01'  # start date
28 |     enddate='2016-07-19'  # end date
29 |     startpage=1  # start page number
30 |     endpage=10  # end page number
31 | while startpage<=endpage:
32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage)
33 | try:
34 | items=get_table(url)
35 | except:
36 | time.sleep(2)
37 | print(startpage,'-failed')
38 | continue
39 | for item in items:
40 | text_f.write(item+'\n')
41 | print(startpage,'-ok')
42 | startpage+=1
43 | text_f.close()
44 | write_to_excel()
45 |
46 | def write_to_excel():
47 | excel=openpyxl.Workbook(write_only=True)
48 | sheet=excel.create_sheet()
49 | for line in open('2014_2016.txt','r',encoding='utf-8'):
50 | line=line.replace('\n','')
51 | sheet.append(line.split('|'))
52 | excel.save('2014_2016.xlsx')
53 |
54 | main()
55 |
--------------------------------------------------------------------------------
/datacenter.mep.gov.cn/air_dairy_aqi.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "Connection": "keep-alive",
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 |
14 | def get_table(url):
15 | html=requests.get(url,headers=headers).text
16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr')
17 | result=[]
18 | for tr in table[2:-3]:
19 | item=''
20 | for td in tr.find_all('td'):
21 | item+=td.get_text()+'|'
22 | result.append(item)
23 | return result
24 |
25 | def main():
26 | text_f=open('2000_2014.txt','w',encoding='utf-8')
27 |     startdate='2000-01-01'  # start date
28 |     enddate='2015-12-31'  # end date
29 |     startpage=1  # start page number
30 |     endpage=10  # end page number
31 | while startpage<=endpage:
32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy_aqi.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage)
33 | try:
34 | items=get_table(url)
35 | except:
36 | time.sleep(2)
37 | print(startpage,'-failed')
38 | continue
39 | for item in items:
40 | text_f.write(item+'\n')
41 | print(startpage,'-ok')
42 | startpage+=1
43 | text_f.close()
44 | write_to_excel()
45 |
46 | def write_to_excel():
47 | excel=openpyxl.Workbook(write_only=True)
48 | sheet=excel.create_sheet()
49 | for line in open('2000_2014.txt','r',encoding='utf-8'):
50 | line=line.replace('\n','')
51 | sheet.append(line.split('|'))
52 | excel.save('2000_2014.xlsx')
53 |
54 | main()
55 |
--------------------------------------------------------------------------------
/dianping/data/上海.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/上海.xls
--------------------------------------------------------------------------------
/dianping/data/北京.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/北京.xls
--------------------------------------------------------------------------------
/dianping/data/南京.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/南京.xls
--------------------------------------------------------------------------------
/dianping/data/厦门.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/厦门.xls
--------------------------------------------------------------------------------
/dianping/data/大连.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/大连.xls
--------------------------------------------------------------------------------
/dianping/data/天津.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/天津.xls
--------------------------------------------------------------------------------
/dianping/data/宁波.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/宁波.xls
--------------------------------------------------------------------------------
/dianping/data/广州.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/广州.xls
--------------------------------------------------------------------------------
/dianping/data/成都.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/成都.xls
--------------------------------------------------------------------------------
/dianping/data/无锡.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/无锡.xls
--------------------------------------------------------------------------------
/dianping/data/杭州.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/杭州.xls
--------------------------------------------------------------------------------
/dianping/data/武汉.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/武汉.xls
--------------------------------------------------------------------------------
/dianping/data/沈阳.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/沈阳.xls
--------------------------------------------------------------------------------
/dianping/data/济南.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/济南.xls
--------------------------------------------------------------------------------
/dianping/data/深圳.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/深圳.xls
--------------------------------------------------------------------------------
/dianping/data/苏州.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/苏州.xls
--------------------------------------------------------------------------------
/dianping/data/西安.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/西安.xls
--------------------------------------------------------------------------------
/dianping/data/郑州.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/郑州.xls
--------------------------------------------------------------------------------
/dianping/data/重庆.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/重庆.xls
--------------------------------------------------------------------------------
/dianping/data/长沙.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/长沙.xls
--------------------------------------------------------------------------------
/dianping/data/青岛.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/青岛.xls
--------------------------------------------------------------------------------
/dianping/shopinfor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | headers = {
7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
9 | 'Accept-Language': 'en-US,en;q=0.5',
10 | 'Accept-Encoding': 'gzip, deflate',
11 | 'Connection': 'keep-alive'}
12 |
13 | def get_infor():
14 | urls=['https://www.dianping.com/search/category/2/10/r2588o2p','https://www.dianping.com/search/category/2/10/r1493o2p','https://www.dianping.com/search/category/2/10/r1490o2p']
15 | f=open('haidian.txt','a',encoding='utf-8')
16 | for url in urls:
17 | page=1
18 | while page<=50:
19 | try:
20 | html=requests.get(url+str(page),headers=headers,timeout=30).text
21 | except:
22 | continue
23 | table=BeautifulSoup(html,'lxml').find('div',id='shop-all-list').find_all('li')
24 | for li in table:
25 | try:
26 | soup=li.find('div',attrs={'class':'txt'})
27 | tit=soup.find('div',attrs={'class':'tit'})
28 | comment=soup.find('div',attrs={'class':'comment'})
29 | tag_addr=soup.find('div',attrs={'class':'tag-addr'})
30 | text=tit.find('a').get_text().replace('\r','').replace('\n','')+'||'+comment.find('span').get('title')+'||'+comment.find('a',attrs={'class':'review-num'}).get_text().replace('\r','').replace('\n','')+'||'+comment.find('a',attrs={'class':'mean-price'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'tag'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'addr'}).get_text().replace('\r','').replace('\n','')+'||'
31 | comment_list=soup.find('span',attrs={'class':'comment-list'}).find_all('span')
32 | for i in comment_list:
33 | text+='||'+i.get_text().replace('\r','').replace('\n','')
34 | for i in tit.find('div',attrs={'class':'promo-icon'}).find_all('a'):
35 | try:
36 | text+='||'+i.get('class')
37 | except:
38 | text+='||'+i.get('class')[0]
39 | f.write(text.replace(' ','')+'\n')
40 | except:
41 | continue
42 | page+=1
43 | print(page)
44 | time.sleep(1)
45 | f.close()
46 |
47 | get_infor()
48 |
--------------------------------------------------------------------------------
/dianping/shoplist.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import json
5 | import xlwt3
6 | import os
7 |
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 |
15 | def get_data(url):
16 | html=requests.get(url,headers=headers).text
17 | data=json.loads(html)['shopBeans']
18 | return data
19 |
20 | def shoplist():
21 | try:
22 | os.mkdir('data')
23 | except:
24 | print('--')
25 | items={'最佳餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score&categoryId=0','人气餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=popscore&categoryId=0','口味最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score1&categoryId=0','环境最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score2&categoryId=0','服务最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score3&categoryId=0'}
26 | citys={'北京':'2','上海':'1','广州':'4','深圳':'7','成都':'8','重庆':'9','杭州':'3','南京':'5','沈阳':'18','苏州':'6','天津':'10','武汉':'16','西安':'17','长沙':'344','大连':'19','济南':'22','宁波':'11','青岛':'21','无锡':'13','厦门':'15','郑州':'160'}
27 | excel=xlwt3.Workbook()
28 | sheet=excel.add_sheet('sheet')
29 | count=0
30 | for city in citys:
31 | for key in items:
32 | try:
33 | data=get_data(items[key]%(citys[city]))
34 | except:
35 | print('Error!')
36 | continue
37 | num=1
38 | for item in data:
39 | sheet.write(count,0,str(count+1))
40 | sheet.write(count,1,key)
41 | sheet.write(count,2,city)
42 | sheet.write(count,3,num)
43 | sheet.write(count,4,item['filterFullName'])
44 | sheet.write(count,5,item['mainRegionName'])
45 | sheet.write(count,6,item['refinedScore1'])
46 | sheet.write(count,7,item['refinedScore2'])
47 | sheet.write(count,8,item['refinedScore3'])
48 | sheet.write(count,9,item['avgPrice'])
49 |                 if '(' in item['filterFullName'] or '（' in item['filterFullName']:
50 | sheet.write(count,10,'Y')
51 | else:
52 | sheet.write(count,10,'N')
53 | num+=1
54 | count+=1
55 | print(city+'--OK')
56 | excel.save('data/data.xls')
57 |
58 | shoplist()
59 |
--------------------------------------------------------------------------------
/douban/movie_grade.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 |
5 | headers = {
6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
7 | "Accept-Encoding": "gzip, deflate",
8 | "Accept-Language": "en-US,en;q=0.5",
9 | "Connection": "keep-alive",
10 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"}
11 |
12 | def comments(movieid,fromdate,todate):
13 | start=0
14 | rating={}
15 | comments={}
16 | while True:
17 | url='https://m.douban.com/rexxar/api/v2/movie/{}/interests?count=20&order_by=latest&start={}&ck=&for_mobile=1'.format(movieid,start)
18 | html=requests.get(url,headers=headers).text
19 | print(movieid,start)
20 | start+=25
21 | data=json.loads(html)['interests']
22 | if len(data)==0:
23 | break
24 | for item in data:
25 | date=item['create_time'].split(' ')[0]
26 | int_date=int(date.replace('-',''))
27 | if int_date>todate:
28 | continue
29 | if int_date500000):
55 | return data
56 | return []
57 |
58 | if __name__=='__main__':
59 | threadings=[]
60 | f=open('华语.txt','r')
61 | file_d=open('data.txt','a')
62 | for line in f.readlines():
63 | for id in eval(line.replace('\n','')):
64 | data=get_id(id)
65 | if data==[]:
66 | continue
67 | file_d.write(str(data)+'\n')
68 | print(id)
69 |
--------------------------------------------------------------------------------
/newseed.pedaily.cn/invest.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
9 | 'Accept-Language': 'en-US,en;q=0.5',
10 | 'Accept-Encoding': 'gzip, deflate',
11 | 'Connection': 'keep-alive'}
12 |
13 | def invest(page):
14 | html=requests.get('http://newseed.pedaily.cn/invest/p'+str(page),headers=headers).text
15 | table=BeautifulSoup(html,'lxml').find('table',{'class':'record-table'}).find_all('tr')
16 | result=[]
17 | for tr in table:
18 | tds=tr.find_all('td')
19 | if len(tds)==0:
20 | continue
21 | line=[]
22 | for td in tds:
23 | try:
24 | line.append(td.get_text())
25 | except:
26 | line.append('')
27 | result.append(line)
28 | return result
29 |
30 | def write_to_excel(result):
31 | excel=openpyxl.Workbook(write_only=True)
32 | sheet=excel.create_sheet()
33 | for line in result:
34 | try:
35 | sheet.append(line)
36 | except:
37 | continue
38 | excel.save('result.xlsx')
39 |
40 | def main():
41 | pagefrom=input("起始页:")
42 | pageto=input("结束页:")
43 | pagefrom=int(pagefrom)
44 | pageto=int(pageto)
45 | result=[]
46 | while pagefrom<=pageto:
47 | try:
48 | result+=invest(pagefrom)
49 | except:
50 | print(pagefrom,'failed')
51 | continue
52 | print(pagefrom,'ok')
53 | pagefrom+=1
54 | time.sleep(1)
55 | write_to_excel(result)
56 |
57 | main()
58 |
--------------------------------------------------------------------------------
/rank.kongzhong.com/userInfor.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from bs4 import BeautifulSoup
3 | import time
4 |
5 | def loadNameAndArea():  # load the list of players to look up
6 |     lines=open('names.txt','r',encoding='utf-8').readlines()  # read the name list from the text file
7 | userlist=[]
8 | for line in lines:
9 | userlist.append(line.replace('\r','').replace('\n',''))
10 | return userlist
11 |
12 |
13 | def writeToTxt(user):  # append one result row to result.txt
14 | line='\t'.join(user)
15 | f=open('result.txt','a',encoding='utf-8')
16 | f.write(line+'\r\n')
17 | f.close()
18 |
19 | def parser(html):  # parse the page with BeautifulSoup
20 | soup=BeautifulSoup(html,'html.parser').find('div',id='total')
21 | result=[]
22 | labels=['singlebattle','teambattle','totalbattle']
23 | for label in labels:
24 | table=soup.find('div',id=label)
25 | result.append(table.find('span',{'class':'value separate'}).get_text())
26 | result.append(table.find('span',{'class':'value2'}).get_text())
27 | return result
28 |
29 | def getUserInfor():
30 |     browser=webdriver.Firefox()  # launch Firefox
31 | browser.get('http://rank.kongzhong.com/wows/index.html?name=%E4%BD%BF%E5%BE%92-%E6%B8%94%E9%B6%B8&zone=north')
32 |     browser.implicitly_wait(10)  # wait up to 10s for the page to load
33 |     userlist=loadNameAndArea()  # load the list of players
34 | for user in userlist:
35 |         user=user.split('\t')  # name and region are separated by a tab in the list
36 |         if '南区' in user[-1]:  # pick the server region ('南区' means the south region)
37 | area='south'
38 | else:
39 | area='north'
40 |         url='http://rank.kongzhong.com/wows/index.html?name=%s&zone=%s'%(user[0],area)  # build the query URL
41 |         browser.get(url)  # open the page
42 |         time.sleep(2)  # wait 2s for the page to finish loading
43 |         html=browser.page_source  # grab the rendered page source
44 | try:
45 |             result=parser(html)  # parse the page
46 | except:
47 | continue
48 | result=user+result
49 |         writeToTxt(result)  # write the row to result.txt
50 | browser.quit()
51 |
52 | getUserInfor()
53 |
--------------------------------------------------------------------------------
/stock.finance.qq.com/stk_holder.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "Connection": "keep-alive",
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 | def get_stkholder(name,stkcode):
14 | html=requests.get('http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=%s'%stkcode,headers=headers).text
15 | soup=BeautifulSoup(html,'lxml').find('table',{'class':'list list_d'})
16 | date=soup.find('tr').find_all('span',{'class':'fntTahoma'})[-1].get_text()
17 | table=soup.find_all('tr')
18 | result=[]
19 | for tr in table[2:-1]:
20 | tds=tr.find_all('td')
21 | item=[name,stkcode,date]
22 | for td in tds:
23 | item.append(td.get_text())
24 | result.append(item)
25 | return result
26 |
27 | def write_to_excel(result):
28 | excel=openpyxl.Workbook(write_only=True)
29 | filename=time.strftime("%Y%m%d %H%M%S",time.localtime())+'.xlsx'
30 | sheet=excel.create_sheet()
31 | for line in result:
32 | sheet.append(line)
33 | excel.save(filename)
34 |
35 | def main():
36 | result=[]
37 | for line in open('stkcode.txt','r',encoding='utf-8'):
38 | title=line.replace('\r','').replace('\n','').split('---')
39 | try:
40 | items=get_stkholder(title[0],title[1])
41 | except:
42 | pass
43 | time.sleep(3)
44 | continue
45 | result+=items
46 | print(title[0],title[1],'ok')
47 | time.sleep(3)
48 | write_to_excel(result)
49 |
50 | main()
51 |
--------------------------------------------------------------------------------
/stock.finance.qq.com/stkcode.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 |
4 |
5 | def get_stkcode():
6 | f=open('stkcode.txt','w')
7 | page=1
8 | while True:
9 | html=requests.get('http://hq.gucheng.com/List.asp?Type=A&Sort=&Page=%s'%page).text.encode('ISO-8859-1').decode('GBK','ignore')
10 | table=BeautifulSoup(html,'lxml').find('div',{'class':'hq_big_bk md_6'}).find_all('tr')
11 | for tr in table[1:-1]:
12 | tds=tr.find_all('td')
13 | line=tds[1].get_text()+'---'+tds[0].get_text()
14 | print(line)
15 | f.write(line+'\r\n')
16 | page+=1
17 | if page==139:
18 | break
19 | f.close()
20 |
21 | get_stkcode()
22 |
--------------------------------------------------------------------------------
/stock.jrj.com.cn/flowhistory.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import openpyxl
3 | import json
4 | import time
5 |
6 |
7 | def get_flowhistory(stockid):
8 | html=requests.get('http://zj.flashdata2.jrj.com.cn/flowhistory/share/%s.js'%stockid).text
9 | data=json.loads(html.replace('var stock_flow=',''))
10 | result=[]
11 | header=['序号','日期','涨跌幅','收盘价','换手率','净流入金额','主力净流入净额','主力净流入净占比','中单净流入净额','中单净流入净占比','散户净流入净额','散户净流入净占比','第二天']
12 | result.append(header)
13 | keys=['date','pl','cp','tr','tin','zin','zpit','min','mpit','sin','spit']
14 | count=1
15 | pre_line=''
16 | for line in data:
17 | item=[count]
18 | count+=1
19 | for key in keys:
20 | item.append(line[key])
21 | try:
22 | item.append(pre_line['pl'])
23 | except:
24 | pass
25 | result.append(item)
26 | pre_line=line
27 | return result
28 |
29 | def write_to_excel(result,stockid):
30 | excel=openpyxl.Workbook(write_only=True)
31 | sheet=excel.create_sheet()
32 | for item in result:
33 | sheet.append(item)
34 | excel.save('%s.xlsx'%stockid)
35 |
36 | def main():
37 | stockid=input("输入股票代码:")
38 | try:
39 | result=get_flowhistory(stockid)
40 | except:
41 | print('Failed!')
42 | time.sleep(10)
43 | return
44 | write_to_excel(result,stockid)
45 |
46 | main()
47 |
--------------------------------------------------------------------------------
/taobao/suggest.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import time
4 | import os
5 | import chardet
6 |
7 | headers = {
8 | ':authority':'suggest.taobao.com',
9 | 'user-agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
10 | 'Accept':"*/*",
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 |
15 |
16 | def suggest(keyword):
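# query Taobao's keyword-suggest API; each entry in the JSON 'result' list starts with the suggested keyword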
17 | html=requests.get('https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null'.format(keyword),headers=headers).text
18 | data=json.loads(html)['result']
19 | result=[]
20 | for item in data:
21 | result.append(item[0].replace('','').replace('',''))
22 | return result
23 |
24 | def get_chardet(filename):
25 | data=open(filename,'rb').read()
26 | coding=chardet.detect(data)
27 | return coding['encoding']
28 |
29 | def loadkeywords():
30 | keywords={}
31 | for filename in os.listdir('keywords'):
32 | if '.txt' not in filename:
33 | continue
34 | encoding=get_chardet('keywords/'+filename)
35 | if encoding=='GB2312':
36 | encoding='GBK'
37 | keywords[filename]=[]
38 | for line in open('keywords/'+filename,'r',encoding=encoding):
39 | word=line.replace('\r','').replace('\n','')
40 | keywords[filename].append(word)
41 | return keywords
42 |
43 | def save_to_txt(filename,deep,words):
44 | f=open('result/'+filename.replace('.txt','_%s.txt'%deep),'w',encoding='utf-8')
45 | writed=[]
46 | for word in words:
47 | if word in writed:
48 | continue
49 | writed.append(word)
50 | f.write(word+'\r\n')
51 | f.close()
52 |
53 | def main():
54 | keywords=loadkeywords()
55 | while True:
56 | try:
57 | deep=input("输入采集深度:")
58 | deep=int(deep)
59 | break
60 | except:
61 | pass
62 | for filename in keywords:
63 | result=[]
64 | for word in keywords[filename]:
65 | words=[word]
66 | count=0
67 | for num in range(deep):
68 | suggest_words=[]
69 | for need_word in words:
70 | try:
71 | suggest_words+=suggest(need_word)
72 | except:
73 | continue
74 | suggest_words=list(set(suggest_words))
75 | words=suggest_words
76 | count+=len(suggest_words)
77 | result+=suggest_words
78 | print(word,'deep',num+1)
79 | print(word,'get',count,'ok')
80 | save_to_txt(filename,deep,result)
81 |
82 | main()
83 |
--------------------------------------------------------------------------------
/weibo/weibo.md:
--------------------------------------------------------------------------------
1 | ### Python Web Crawling: Sina Weibo
2 | #### 1. Simulated login
3 | Here I log in with selenium and then grab the cookies of the logged-in session, which is quick and convenient and saves the trouble of coding the login flow by hand. requests can then reuse these cookies directly for authenticated crawling.
4 |
5 | ```python
6 | from selenium import webdriver
7 | import time
8 | def login(username,password):
9 | browser=webdriver.PhantomJS('./phantomjs')
10 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')  # open the login page
11 | browser.set_page_load_timeout(10)
12 | time.sleep(5)  # wait for the page to finish loading
13 | browser.find_element_by_id('loginName').send_keys(username)  # fill in the username
14 | browser.find_element_by_id('loginPassword').send_keys(password)  # fill in the password
15 | browser.find_element_by_id('loginAction').click()  # click the login button
16 | time.sleep(5)
17 | cookies=browser.get_cookies()  # read the cookies of the logged-in session
18 | result={}
19 | for item in cookies:
20 | try:
21 | result[item['name']]=item['value']
22 | except:
23 | continue
24 | return result  # return the cookies as a dict
25 |
26 | ```
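
In weibo.py the cookie dict returned by `login()` is also cached to a local file named `cookies`, so the selenium login only has to run once. A minimal sketch of that step (same file name as in weibo.py; the helper name here is just for illustration):

```python
def login_and_cache(username, password):
    cookies = login(username, password)  # the selenium login shown above
    with open('cookies', 'w') as f:
        f.write(str(cookies))            # stored as a dict literal; weibo() reads it back with eval()
    return cookies
```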
27 | requests does not persist manually constructed cookies, so the dict-typed cookies have to be converted into a CookieJar:
28 |
29 | ```python
30 | import requests
31 | import os
32 |
33 | def weibo():
34 | if os.path.isfile('cookies'):
35 | cookies=eval(open('cookies','r').read())
36 | else:
37 | cookies=login('username','password')  # run the selenium login to get fresh cookies
38 | session=requests.session()
39 | session.cookies=requests.utils.cookiejar_from_dict(cookies)  # convert the dict into a CookieJar and attach it to the session
40 | return session
41 |
42 | ```
43 |
44 | #### 2. Fetching the home-feed weibos
45 | ```python
46 | import json
47 |
48 | headers = {
49 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
50 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
51 | 'Accept-Language': 'en-US,en;q=0.5',
52 | 'Accept-Encoding': 'gzip, deflate',
53 | 'Connection': 'keep-alive'}
54 |
55 | session=weibo()
56 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text
57 | data=json.loads(html)[0]['card_group']
58 | result=[]
59 | for item in data:
60 | user=item['mblog']['user']['screen_name']
61 | text=item['mblog']['text']
62 | weiboid=item['mblog']['idstr']
63 | result.append({'user':user,'text':text,'id':weiboid})
64 | print(result)
65 | ```
66 |
67 | #### 3. Fetching a weibo's comments
68 |
69 | ```python
70 |
71 | def get_comments(session,weiboid):
72 | page=1
73 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text
74 | data=json.loads(html)[1]['card_group']
75 | comments=[]
76 | for item in data:
77 | comment={}
78 | comment['user']=item['user']['screen_name']
79 | comment['date']=item['created_at']
80 | comment['text']=item['text']
81 | comments.append(comment)
82 | return comments
83 | ```
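
Putting the pieces together, a minimal sketch (the weibo id below is the example id used in weibo.py):

```python
session = weibo()  # logged-in requests session from section 1
for comment in get_comments(session, '4013542757481643'):
    print(comment['user'], comment['date'], comment['text'])
```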
84 |
--------------------------------------------------------------------------------
/weibo/weibo.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | from selenium import webdriver
4 | import time
5 | import os
6 | import json
7 |
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 |
15 | def login(username,password):
16 | browser=webdriver.PhantomJS('/home/nyloner/phantomjs/phantomjs')
17 | #browser=webdriver.Firefox()
18 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')
19 | browser.set_page_load_timeout(10)
20 | time.sleep(5)
21 | browser.find_element_by_id('loginName').send_keys(username)
22 | browser.find_element_by_id('loginPassword').send_keys(password)
23 | browser.find_element_by_id('loginAction').click()
24 | time.sleep(5)
25 | cookies=browser.get_cookies()
26 | result={}
27 | for item in cookies:
28 | try:
29 | result[item['name']]=item['value']
30 | except:
31 | continue
32 | f=open('cookies','w')
33 | f.write(str(result))
34 | f.close()
35 | return result
36 |
37 | def weibo():
38 | if os.path.isfile('cookies'):
39 | cookies=eval(open('cookies','r').read())
40 | else:
41 | cookies=login('username','password')
42 | session=requests.session()
43 | session.cookies=requests.utils.cookiejar_from_dict(cookies)
44 | html=session.get('http://m.weibo.cn',headers=headers).text
45 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text
46 | data=json.loads(html)[0]['card_group']
47 | result=[]
48 | for item in data:
49 | user=item['mblog']['user']['screen_name']
50 | text=item['mblog']['text']
51 | result.append({'user':user,'text':text})
52 | print(result)
53 | print(get_comments(session,'4013542757481643'))
54 |
55 | def get_comments(session,weiboid):
56 | page=1
57 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text
58 | data=json.loads(html)[1]['card_group']
59 | comments=[]
60 | for item in data:
61 | comment={}
62 | comment['user']=item['user']['screen_name']
63 | comment['date']=item['created_at']
64 | comment['text']=item['text']
65 | comments.append(comment)
66 | return comments
67 |
68 | weibo()
69 |
--------------------------------------------------------------------------------
/weidian/weidian.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | from selenium import webdriver
6 | import time
7 | import re
8 |
9 | headers = {
10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
11 | "Accept-Encoding": "gzip, deflate",
12 | "Accept-Language": "en-US,en;q=0.5",
13 | "Connection": "keep-alive",
14 | "User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}
15 |
16 | def get_place():
17 | f=open('place.txt','w')
18 | browser=webdriver.Firefox()
19 | #html=requests.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418',headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore')
20 | browser.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418')
21 | time.sleep(10)
22 | html=browser.page_source
23 | table=BeautifulSoup(html,'lxml').find('div',id='show-place').find_all('ul')
24 | places={}
25 | print(html)
26 | for item in table[1:]:
27 | for li in item.find_all('li'):
28 | places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href')
29 | for li in table[0].find_all('li'):
30 | places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href')
31 | for key in places:
32 | text=key+'||'+places[key]+'\n'
33 | f.write(text)
34 | f.close()
35 |
36 | def get_shop():
37 | f=open('shops.txt','a',encoding='utf-8')
38 | for line in open('place.txt').readlines():
39 | city=line.split('||')[0]
40 | place=re.findall('place=(.*?)&',line)[0]
41 | page=0
42 | while True:
43 | url='http://api.buyer.weidian.com/h5/appserver_nearbyShop.do?place='+place+'&seed=0&category=%E7%AE%B1%E5%8C%85&limit=50&page='+str(page)+'&callback=jsonp4&rnd=0.8898308666990978'
44 | html=requests.get(url,headers=headers).text
45 | rel='"shopid":"(.*?)","entranceName":"(.*?)","address":"(.*?)"'
46 | lists=re.findall(rel,html)
47 | if lists==[]:
48 | break
49 | for item in lists:
50 | text=item[0]+'||'+item[1]+'||'+item[2]
51 | f.write(text+'\n')
52 | print(city+place+'--'+str(page))
53 | page+=1
54 | f.close()
55 |
56 | def get_weixin():
57 | f=open('data.txt','a')
58 | for line in open('shops.txt'):
59 | line=line.replace('\n','')
60 | shopurl='http://weidian.com/?userid='+line.split('||')[0]
61 | # fetch the shop page and pull the WeChat id ("微信: ...") out of it
62 | try:
63 | html=requests.get(shopurl,headers=headers).text
64 | rel='微信: (.*?)<'
65 | weixin=re.findall(rel,html)[0]
66 | except:
67 | continue
68 | print(line+'---OK')
69 | line=line+'||'+weixin+'\n'
70 | f.write(line)
71 |
72 | def main():
73 | #get_shop()
74 | get_weixin()
75 |
76 | main()
77 |
--------------------------------------------------------------------------------
/wenda.so.com/search.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | from selenium import webdriver
5 |
6 | headers = {
7 | 'Host':"wenda.so.com",
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | browser=webdriver.Firefox()
15 | browser.get('http://wenda.so.com/')
16 | browser.implicitly_wait(10)
17 | def search(key):
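# load the so.com Q&A search results in the browser and return the first result link that points to a question page ('q/...')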
18 | #html=requests.get('http://wenda.so.com/search/?q='+key,headers=headers,timeout=30).text
19 | browser.get('http://wenda.so.com/search/?q='+key)
20 | time.sleep(0.5)
21 | html=browser.page_source
22 | table=BeautifulSoup(html,'lxml').find_all('li',{'class':'item'})
23 | for item in table:
24 | try:
25 | url=item.find('a').get('href')
26 | if 'q/' in url:
27 | return 'http://wenda.so.com/'+url
28 | except:
29 | continue
30 |
31 | def get_questions():
32 | for word in open('failed_words','r'):
33 | word=word.replace('\r','').replace('\n','')
34 | try:
35 | url=search(word)
36 | except:
37 | failed=open('failed.txt','a')
38 | failed.write(word+'\n')
39 | failed.close()
40 | continue
41 | if url==None:
42 | failed=open('failed.txt','a')
43 | failed.write(word+'\n')
44 | failed.close()
45 | continue
46 | f=open('question_','a')
47 | f.write(word+'||'+url+'\n')
48 | print(word,'ok')
49 | f.close()
50 |
51 | get_questions()
52 |
--------------------------------------------------------------------------------
/wenshu.court.gov.cn/download.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 |
5 | headers = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Language': 'en-US,en;q=0.5',
9 | 'Accept-Encoding': 'gzip, deflate',
10 | 'Connection': 'keep-alive'}
11 |
12 | def doclist(page,Param="",Order="裁判日期"):
13 | data={
14 | 'Param':Param,
15 | 'Index':page,
16 | 'Page':"20",
17 | 'Order':Order,
18 | 'Direction':"desc"
19 | }
20 | html=requests.post('http://wenshu.court.gov.cn/List/ListContent',data=data,headers=headers).text
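# ListContent returns a JSON-encoded string whose payload is itself a serialized list, hence json.loads() followed by eval()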
21 | data=json.loads(html)
22 | data=eval(data)
23 | result=[]
24 | for item in data:
25 | if 'Count' in item:
26 | continue
27 | result.append(item)
28 | return result
29 |
30 | def download(docid,title):
31 | data={
32 | 'conditions':'',
33 | 'docIds':docid+'|'+title+'|',
34 | 'keyCode':""
35 | }
36 | content=requests.post('http://wenshu.court.gov.cn/CreateContentJS/CreateListDocZip.aspx?action=1',data=data,headers=headers).content
37 | with open('result/%s.doc'%docid,'wb') as f:
38 | f.write(content)
39 |
40 | if __name__ == '__main__':
41 | docs=doclist(1)
42 | try:
43 | import os
44 | os.mkdir('result')
45 | except:
46 | pass
47 | for item in docs:
48 | download(item['文书ID'],item['案件名称'])
49 | print(item['案件名称'])
50 |
--------------------------------------------------------------------------------
/worldfreightrates/trates.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import re
5 | import xlrd
6 | import xlwt3
7 |
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | def get_port(name):
15 | name=name.replace(' ','+')
16 | count=0
17 | status=True
18 | while status:
19 | try:
20 | html=requests.get('http://worldfreightrates.com/calculator/ports?term=%s'%name,headers=headers,timeout=30).text
21 | status=False
22 | except:
23 | count+=1
24 | if count==3:
25 | return False
26 | continue
27 | try:
28 | data=eval(html)
29 | Id=data[0]['id']
30 | return Id
31 | except:
32 | return False
33 |
34 | def get_infor(fromid,toid,commodityName):
35 | url='http://worldfreightrates.com/en/calculator/ocean/rate?fromId='+fromid+'&toId='+toid+'&oceanType=FCL&commodityName='+commodityName+'&commodityValue=100&includeInsurance=false&includeReefer=false&includeHazardous=false&unit=lb&containerSize=40'
36 | html=requests.get(url,headers=headers,timeout=50).text.replace('\\','')
37 | rel='"result">(.*?)<'
38 | try:
39 | result=re.findall(rel,html)[0]
40 | except:
41 | result=''
42 | return result
43 |
44 | def main():
45 | data = xlrd.open_workbook('data/data.xlsx')
46 | table = data.sheets()[0]
47 | excel=xlwt3.Workbook()
48 | sheet=excel.add_sheet('sheet')
49 | for row in range(table.nrows):
50 | print(row)
51 | fromport=table.cell(row,0).value
52 | toport=table.cell(row,1).value
53 | commodityName=table.cell(row,2).value
54 | Load_Type=table.cell(row,3).value
55 | fromid=get_port(fromport)
56 | toid=get_port(toport)
57 | if fromid==False or toid==False:
58 | sheet.write(row,0,fromport)
59 | sheet.write(row,1,toport)
60 | sheet.write(row,2,commodityName)
61 | sheet.write(row,3,Load_Type)
62 | sheet.write(row,4,'')
63 | excel.save('data/result.xls')
64 | continue
65 | try:
66 | result=get_infor(fromid,toid,commodityName.replace('&','%26').replace(' ','+').replace(',','%2C'))
67 | except:
68 | result=''
69 | sheet.write(row,0,fromport)
70 | sheet.write(row,1,toport)
71 | sheet.write(row,2,commodityName)
72 | sheet.write(row,3,Load_Type)
73 | sheet.write(row,4,result)
74 | excel.save('data/result.xls')
75 | main()
76 |
--------------------------------------------------------------------------------
/www.18ladys.com/18ladys.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import re
5 | import openpyxl
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate'}
12 |
13 | def get_names():
14 | page=1
15 | while page<21:
16 | html=requests.get('http://www.18ladys.com/cyzy/index.asp?page='+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk')
17 | table=BeautifulSoup(html,'lxml').find('div',{'class':'tb1'}).find_all('a')
18 | f=open('names.txt','a')
19 | for item in table:
20 | try:
21 | name=item.get_text()
22 | url='http://www.18ladys.com/cyzy/'+item.get('href')
23 | f.write(name+'|'+url+'\n')
24 | except:
25 | continue
26 | f.close()
27 | print(page)
28 | page+=1
29 |
30 | def get_infor(name,url):
31 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk','ignore')
32 | text=BeautifulSoup(html,'lxml').find('dd',{'class':'f14 jl4'}).find('p').get_text().replace('【','||【').replace('\r','').replace('\n','')
33 | text=text.split('||')
34 | result={'name':name}
35 | for item in text:
36 | try:
37 | name_value=item.split('】')
38 | name=name_value[0].replace('【','')
39 | value=name_value[1]
40 | result[name]=value
41 | except:
42 | continue
43 | return result
44 |
45 | def crawler():
46 | for line in open('names.txt','r'):
47 | line=line.replace('\n','')
48 | name=line.split('|')[0]
49 | url=line.split('|')[1]
50 | try:
51 | item=get_infor(name,url)
52 | except:
53 | failed=open('failed','a')
54 | failed.write(line+'\n')
55 | failed.close()
continue
56 | f=open('result.txt','a')
57 | f.write(str(item)+'\n')
58 | f.close()
59 | print(line,'ok')
60 |
61 | def write_to_excel():
62 | excel=openpyxl.Workbook(write_only=True)
63 | sheet=excel.create_sheet()
64 | keys=['name','异名','别名','来源','植物形态','功用主治','用法与用量','炮制']
65 | sheet.append(keys)
66 | for line in open('result.txt','r'):
67 | item=eval(line)
68 | infor=[]
69 | for key in keys:
70 | try:
71 | infor.append(item[key])
72 | except:
73 | infor.append('')
74 | sheet.append(infor)
75 | excel.save('result.xlsx')
76 |
77 | crawler()
--------------------------------------------------------------------------------
/www.58.com/sendemail.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | from email.mime.text import MIMEText
3 | from email.mime.multipart import MIMEMultipart
4 | from email.header import Header
5 | import time
6 |
7 |
8 | def sendmail():
9 | sender = 'xxx@qq.com'
passwd = 'xxxxxx'  # SMTP authorization code for the sender account (placeholder)
10 | receivers = ['xxx@qq.com']  # recipients: your QQ mailbox or any other address
11 | # build a multipart message so an attachment can be added
12 | message = MIMEMultipart()
13 | message['From'] = Header("xxxx", 'utf-8')
14 | message['To'] = Header("xxx@qq.com", 'utf-8')
15 | subject = time.strftime("%Y-%m-%d %H:%M:%S")
16 | message['Subject'] = Header(subject, 'utf-8')
17 | # message body: a plain-text timestamp
18 | message.attach(MIMEText(time.strftime("%Y-%m-%d %H:%M:%S"), 'plain', 'utf-8'))
19 | att1 = MIMEText(open('result.xls', 'rb').read(), 'base64', 'utf-8')
20 | att1["Content-Type"] = 'application/octet-stream'
21 | # the filename can be anything; it is the name shown for the attachment in the email
22 | att1["Content-Disposition"] = 'attachment; filename="result.xls"'
23 | message.attach(att1)
24 | server=smtplib.SMTP_SSL('smtp.qq.com')
25 | server.ehlo('smtp.qq.com')
26 | server.login(sender,passwd)
27 | server.sendmail(sender, receivers, message.as_string())
28 |
29 | sendmail()
30 |
--------------------------------------------------------------------------------
/www.airbnb.com/deal.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import xlwt3
4 |
5 |
6 | def deal_userdata():
7 | userresult=open('userresult.txt','w')
8 | for line in open('userdata.txt','r'):
9 | line=line.replace('\n','')
10 | lists=line.split('||')
11 | try:
12 | allreview=int(lists[-2].replace('Reviews',''))
13 | except:
14 | allreview=0
15 | try:
16 | hostreview=int(lists[-1])
17 | except:
18 | hostreview=0
19 | try:
20 | prereview=allreview-hostreview
21 | except:
22 | prereview='--'
23 | result=''
24 | for i in lists:
25 | result+=i+'||'
26 | result+=str(prereview)
27 | userresult.write(result+'\n')
28 | userresult.close()
29 |
30 | def replace_r():
31 | room=open('roomtxt.txt','w')
32 | f=open('roomdata.txt','r').readlines()
33 | for line in f:
34 | line=line.replace('\r','').replace('\n','')
35 | room.write(line+'\n')
36 | room.close()
37 |
38 | def Excel():
39 | Response_rate='Response rate:(.*?)Response'
40 | Response_time='Response time:(.*?hours)'
41 | users=open('userresult.txt','r').readlines()
42 | rooms=open('roomtxt.txt','r').readlines()
43 | excel=xlwt3.Workbook()
44 | usersheet=excel.add_sheet('user')
45 | roomsheet=excel.add_sheet('room')
46 | count=0
47 | for line in rooms:
48 | lists=line.replace('\n','').split('||')
49 | for user in users:
50 | if lists[5] in user:
51 | try:
52 | rate=re.findall(Response_rate,line)[0]
53 | except:
54 | rate='--'
55 | try:
56 | time=re.findall(Response_time,line)[0]
57 | except:
58 | time='--'
59 | num=0
60 | for i in lists:
61 | try:
62 | i=i.split('?')[0]
63 | i=i.split(':')[-1]
64 | i=i.replace('/rooms/','')
65 | i=i.replace('/users/show/','')
66 | except:
67 | pass
68 | roomsheet.write(count,num,i)
69 | num+=1
70 | roomsheet.write(count,num,rate)
71 | num+=1
72 | roomsheet.write(count,num,time)
73 | num=0
74 | for i in user.replace('\n','').split('||'):
75 | try:
76 | i=i.split('?')[0]
77 | i=i.split(':')[-1]
78 | i=i.replace('/rooms/','')
79 | i=i.replace('/users/show/','')
80 | except:
81 | pass
82 | usersheet.write(count,num,i)
83 | num+=1
84 | count+=1
85 | excel.save('result.xls')
86 |
87 | Excel()
88 |
--------------------------------------------------------------------------------
/www.airbnb.com/rooms.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 |
5 | headers = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Language': 'en-US,en;q=0.5',
9 | 'Accept-Encoding': 'gzip, deflate',
10 | 'Connection': 'keep-alive'}
11 |
12 | def rooms(url):
13 | html=requests.get(url,headers=headers).text
14 | try:
15 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'listings-container'}).find_all('div',attrs={'class':'listing'})
16 | except:
17 | return False
18 | result=[]
19 | for item in table:
20 | try:
21 | price=item.find('div',attrs={'class':'price-amount-container'}).get_text()
22 | except:
23 | price='--'
24 | try:
25 | media=item.find('div',attrs={'class':'media'})
26 | title=media.find('h3').get_text()
27 | userurl=media.find('a').get('href')
28 | roomurl=media.find('h3').find('a').get('href')
29 | except:
30 | continue
31 | a=media.find('a',attrs={'class':'text-normal link-reset'})
32 | try:
33 | rating=a.find('div',attrs={'class':'star-rating'}).find('div').find_all('i')
34 | star=len(rating)
35 | clases=[]
36 | for i in rating:
37 | clases+=i.get('class')
38 | if 'icon-star-half' in clases:
39 | star=star-0.5
40 | except:
41 | star='--'
42 | try:
43 | review=a.get_text().replace('\r','').replace('\n','').replace(' ','')
44 | review=re.findall('(\d+)reviews',review)[0]
45 | except:
46 | review='--'
47 | text=title+'||'+price+'||'+review+'||'+str(star)+'||'+roomurl+'||'+userurl
48 | result.append(text.replace('\r','').replace('\n','').replace(' ',''))
49 | return result
50 |
51 | def getrooms():
52 | citys="Chicago,Vancouver,Montreal,Portland,Philadelphia,Denver,Austin,D.C.,New Orleans,Phoenix,San Diego,Nashville,Paris,Berlin,Rome,Amsterdam,Barcelona,Copenhagen,Prague,Budapest,Stockholm,Florence,Edinburgh,Istanbul,Sydney,Melbourne,Cape Town,Beijing,Shanghai,Tokyo"
53 | failed=open('failed.txt','a',encoding='utf-8')
54 | for city in citys.split(','):
55 | print(city)
56 | url_f=open('urls.txt','a',encoding='utf-8')
57 | url='https://www.airbnb.com/s/'+city.replace(' ','+').replace('.','%252E')
58 | page=1
59 | pre=[]
60 | while True:
61 | result=rooms(url+'?ss_id=v5im73ob&page=%s'%page)
62 | if result==pre:
63 | break
64 | pre=result
65 | if result==False:
66 | failed.write(city+'--'+str(page))
67 | break
68 | for item in result:
69 | url_f.write(city+'||'+item+'\n')
70 | print(city,'--',page)
71 | page+=1
72 | if(page==18):
73 | break
74 | url_f.close()
75 | url_f.close()
76 | failed.close()
77 |
78 | getrooms()
79 |
--------------------------------------------------------------------------------
/www.baikemy.com/disease.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import openpyxl
5 |
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate',
12 | 'Connection': 'keep-alive'}
13 |
14 | def disease_list():
15 | page=1
16 | f=open('urls.txt','w',encoding='utf-8')
17 | while True:
18 | try:
19 | html=requests.get('http://www.baikemy.com/disease/list/0/0?pageIndex='+str(page),headers=headers,timeout=30).text
20 | except:
21 | continue
22 | table=BeautifulSoup(html,'lxml').find('div',{'class':'ccjb_jbli'}).find_all('li')
23 | for li in table:
24 | try:
25 | name=li.find('a').get_text()
26 | url='http://www.baikemy.com/'+li.find('a').get('href').replace('view','detail')+'/1/'
27 | f.write(name+'|'+url+'\n')
28 | except:
29 | pass
30 | if len(table)==1:
31 | break
32 | print('page %s urls get'%page)
33 | page+=1
34 | f.close()
35 |
36 | def disease_infor(name,url):
37 | html=requests.get(url,headers=headers,timeout=30).text
38 | table=BeautifulSoup(html,'lxml').find('div',{'class':'lemma-main'}).find_all('div',{'class':'lemma-main-content'})
39 | result=[name]
40 | for item in table:
41 | try:
42 | key=item.find('span',{'class':'headline-content'}).get_text()
43 | value=item.find('div',{'class':'para'}).get_text()
44 | result.append(key+':\t '+value)
45 | except:
46 | continue
47 | return result
48 |
49 | def write_to_excel(result):
50 | excel=openpyxl.Workbook(write_only=True)
51 | sheet=excel.create_sheet()
52 | for line in result:
53 | try:
54 | sheet.append(line)
55 | except:
56 | pass
57 | excel.save('result.xlsx')
58 |
59 | def main():
60 | disease_list()
61 | result=[]
62 | for line in open('urls.txt','r',encoding='utf-8'):
63 | line=line.replace('\n','')
64 | try:
65 | name=line.split('|')[0]
66 | url=line.split('|')[1]
67 | except:
68 | continue
69 | try:
70 | data=disease_infor(name,url)
71 | except:
72 | failed=open('failed.txt','a',encoding='utf-8')
73 | failed.write(line+'\r\n')
74 | failed.close()
75 | continue
76 | result.append(data)
77 | try:
78 | print(name,'ok')
79 | except:
80 | pass
81 | write_to_excel(result)
82 | print('完成')
83 |
84 |
85 | main()
86 | time.sleep(60)
87 |
--------------------------------------------------------------------------------
/www.chazidian.com/yuwen.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
11 |
12 |
13 | def get_terms():
14 | html=open('html.html','r').read()
15 | table=BeautifulSoup(html,'lxml').find_all('span',{'class':'y-l'})
16 | urls=[]
17 | f=open('terms.txt','w')
18 | for item in table:
19 | try:
20 | term=item.find('h4').get_text()
21 | publishers=item.find_all('p')
22 | for p in publishers:
23 | publisher=p.get_text()
24 | links=p.find_all('a')
25 | for a in links:
26 | url=a.get('href')
27 | f.write(term+'|'+publisher+'|'+a.get_text()+'|'+url+'\n')
28 | except:
29 | continue
30 | f.close()
31 |
32 | def get_article_url(term_url):
33 | html=requests.get('http://yuwen.chazidian.com'+term_url,headers=headers).text
34 | table=BeautifulSoup(html,'lxml').find('div',id='mulu').find_all('div',{'class':'mldy'})
35 | result=[]
36 | num=1
37 | for item in table:
38 | title=item.find('a').get_text()
39 | url=item.find('a').get('href').replace('kewen','kewendetail')
40 | line=str(num)+'|'+title+'|'+url
41 | result.append(line)
42 | num+=1
43 | return result
44 |
45 | def get_urls():
46 | for line in open('terms.txt','r'):
47 | line=line.replace('\n','')
48 | url=line.split('|')[-1]
49 | result=get_article_url(url)
50 | f=open('urls.txt','a')
51 | for item in result:
52 | f.write(line+'|'+item+'\n')
53 | f.close()
54 | print(line)
55 | time.sleep(1)
56 |
57 | def get_article_content(url):
58 | html=requests.get(url,headers=headers).text
59 | content=BeautifulSoup(html,'lxml').find('div',id='print_content').get_text()
60 | return content
61 |
62 | def main():
63 | excel=openpyxl.Workbook(write_only=True)
64 | sheet=excel.create_sheet()
65 | for line in open('urls.txt','r'):
66 | line=line.replace('\n','')
67 | infor_list=line.split('|')
68 | url=infor_list[-1]
69 | try:
70 | content=get_article_content(url)
71 | except:
72 | failed=open('failed.txt','a')
73 | failed.write(line+'\n')
74 | failed.close()
75 | continue
76 | sheet.append(infor_list+[content])
77 | print(line)
78 | time.sleep(0.5)
79 | excel.save('result.xlsx')
80 | main()
81 |
--------------------------------------------------------------------------------
/www.china-10.com/china10.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 |
7 | headers = {
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | def get_kinds():
15 | f=open('types.txt','w')
16 | url='http://www.china-10.com/brand/'
17 | html=requests.get(url).text
18 | table=BeautifulSoup(html,'lxml').find('div',id='menubox').find('ul',id='conmenu').find_all('li',attrs={'class':'menu'})
19 | for item in table[1:-3]:
20 | key=item.find('a').get_text().replace('\n','')+'||'
21 | for li in item.find_all('li'):
22 | f.write(key+li.find('a').get('title')+'||'+li.find('a').get('href')+'\n')
23 | f.close()
24 |
25 | def get_brands():
26 | f=open('types.txt','r')
27 | data=open('brands.txt','w')
28 | for line in f.readlines():
29 | print(line)
30 | line=line.replace('\n','')
31 | page=1
32 | while True:
33 | html=requests.get(line.split('||')[-1]+'?action=ajax&page='+str(page),headers=headers).text
34 | page+=1
35 | table=BeautifulSoup(html,'lxml').find_all('li')
36 | if(table==[]):
37 | break
38 | for item in table:
39 | text=line+'||'+item.get_text()+'||'+item.find('a').get('href')+'\n'
40 | data.write(text)
41 | print(page)
42 | f.close()
43 |
44 | def get_infor(line):
45 | html=requests.get(line.split('||')[-1],headers=headers).text
46 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'brandinfo'})
47 | des=table.find('dd').get_text()
48 | line+='||'+des
49 | table=table.find('ul').find_all('li')
50 | for li in table:
51 | line+='||'+li.get_text().replace('\r','').replace('\n','').replace('\t','').replace(' ','')
52 | return line
53 |
54 | def main():
55 | data=open('data.txt','a')
56 | failed=open('failed.txt','a')
57 | count=0
58 | for line in open('brands.txt','r').readlines():
59 | line=line.replace('\n','')
60 | try:
61 | line=get_infor(line)
62 | except:
63 | failed.write(line+'\n')
64 | continue
65 | data.write(line+'\n')
66 | count+=1
67 | time.sleep(1)
68 | print(count)
69 |
70 | main()
71 |
--------------------------------------------------------------------------------
/www.china-10.com/excel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import xlwt3
4 | import re
5 |
6 | def excel():
7 | f=open('data.txt','r')
8 | ex=xlwt3.Workbook()
9 | sheet=ex.add_sheet('sheet')
10 | count=0
11 | rels=['品牌等级:(.*?)\|\|','关注指数:(.*?)\|\|','\|\|.*?董事.*?:(.*?)品牌创立','时间:(.*?)\|\|','发源地:(.*?)\|\|','官方网站:(.*?)\|\|','客服电话:(.*?)\|\|','告词:(.*?)\|\|','(产品\d+)]','(网点\d+)]','(新闻\d+)]','(网店.*?)]']
12 | for line in f.readlines():
13 | line=line.replace('\n','').replace('信用指数:','')
14 | lists=[]
15 | for rel in rels:
16 | try:
17 | i=re.findall(rel,line)[0]
18 | except:
19 | i='--'
20 | lists.append(i)
21 | strs=line.split('||')
22 | sheet.write(count,0,strs[0])
23 | sheet.write(count,1,strs[1])
24 | sheet.write(count,2,strs[2])
25 | sheet.write(count,3,strs[3])
26 | sheet.write(count,4,strs[4])
27 | sheet.write(count,5,strs[5])
28 | num=6
29 | for i in lists:
30 | sheet.write(count,num,i)
31 | num+=1
32 | sheet.write(count,num,strs[-1])
33 | count+=1
34 | ex.save('data.xls')
35 |
36 | excel()
37 |
--------------------------------------------------------------------------------
/www.chuanlaoda.cn/CaptchaOCR.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/CaptchaOCR.dll
--------------------------------------------------------------------------------
/www.chuanlaoda.cn/py2exe_install.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from distutils.core import setup
3 | import py2exe
4 |
5 | setup(console=["chuanlaoda.py"])
6 |
--------------------------------------------------------------------------------
/www.chuanlaoda.cn/testdll.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 |
4 | from ctypes import *
5 |
6 | ocrpasswd = "868197D30CC624FD3C2E2EE66494DA5F"
7 | # VcodeInit initializes the recognition engine; its only argument is the engine password. It returns -1 on failure and must be called exactly once, never repeatedly.
8 | dll = windll.LoadLibrary('CaptchaOCR.dll')
9 | load_ocr = dll.VcodeInit
10 | load_ocr.argtypes = [c_char_p]
11 | load_ocr.restype = c_int
12 | index = load_ocr(ocrpasswd.encode('utf-8'))
imgname = 'captcha.png'  # path of the captcha image to recognize (placeholder filename)
13 | img_string = open(imgname, "rb").read()
14 | img_buffer = create_string_buffer(img_string)
15 | # allocate a buffer that receives the recognition result (this is required)
16 | ret_buffer = create_string_buffer(15)
17 | # if the engine has already been initialized successfully, there is no need to call the init function again
18 | # GetVcode does the recognition: arg1 is the index returned by VcodeInit, arg2 the image data, arg3 the image size, arg4 a pre-allocated buffer that receives the result (e.g. ret_buffer = create_string_buffer(10))
19 | get_code_from_buffer = dll.GetVcode
20 | get_code_from_buffer(index, byref(img_buffer), len(img_buffer), byref(ret_buffer))
21 | print (ret_buffer.value.decode('utf-8'))
22 |
--------------------------------------------------------------------------------
/www.chuanlaoda.cn/x64/CaptchaOCR.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/x64/CaptchaOCR.dll
--------------------------------------------------------------------------------
/www.cpbz.gov.cn/write_to_excel.py:
--------------------------------------------------------------------------------
1 | import openpyxl
2 |
3 |
4 | def load_result():
5 | result=[]
6 | for line in open('result.txt','r'):
7 | item=eval(line)
8 | baseinfor=[item['url']]
9 | for key in ['机构名称','法定代表人','组织机构代码','邮政编码','注册地址','行政区划']:
10 | try:
11 | baseinfor.append(item['企业基本信息'][key])
12 | except:
13 | baseinfor.append('')
14 | numbers=[]
15 | try:
16 | for num_line in item['技术指标']:
17 | numbers+=num_line
18 | except:
19 | pass
20 | for key in ['标准名称','标准编号','公开时间','url']:
21 | try:
22 | baseinfor.append(item['标准信息'][key])
23 | except:
24 | baseinfor.append('')
25 | try:
26 | products=item['产品信息']
27 | except:
28 | products=[]
29 | for product in products:
30 | product[-1]=item['standardStatus']
31 | yield baseinfor+product+numbers
32 |
33 | def write_to_excel():
34 | excel=openpyxl.Workbook(write_only=True)
35 | sheet=excel.create_sheet()
36 | for line in load_result():
37 | sheet.append(line)
38 | excel.save('result.xlsx')
39 |
40 | write_to_excel()
41 |
--------------------------------------------------------------------------------
/www.ctrip.com/comments.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from bs4 import BeautifulSoup
3 | import time
4 |
5 | browser=webdriver.Chrome("./chromedriver")
6 | browser.get('http://hotels.ctrip.com/hotel/zhuhai31')
7 | browser.implicitly_wait(10)
8 | hotels=[eval(line) for line in open('hotels.txt','r')]
9 | flag=True
10 | for hotel in hotels:
11 | hotel_id=hotel[2].split('.')[0].split('/')[-1]
12 | if hotel_id!='1353810' and flag:
13 | continue
14 | flag=False
15 | page=1
16 | '''
17 | if hotel_id=='435300':
18 | page=54
19 | '''
20 | endpage=1000
21 | while page<=endpage:
22 | try:
23 | browser.get('http://hotels.ctrip.com/hotel/dianping/%s_p%st0.html'%(hotel_id,page))
24 | html=browser.page_source
25 | except:
26 | continue
27 | time.sleep(2)
28 | try:
29 | browser.find_element_by_class_name('comment_tab_main')
30 | comments=BeautifulSoup(html,'lxml').find('div',{'class':'comment_tab_main'}).find_all('div',{'class':'comment_block'})
31 | except:
32 | continue
33 | if '以下为酒店3年前历史点评' in str(comments):
34 | print('以下为酒店3年前历史点评')
35 | break
36 | f=open('result_2.txt','a')
37 | for line in comments:
38 | f.write(str(hotel+[str(line)])+'\n')
39 | f.close()
40 | print(page,endpage,hotel[0])
41 | if endpage==1000:
42 | try:
43 | endpage=BeautifulSoup(html,'lxml').find('div',{'class':'c_page_list'}).find_all('a')[-1].get('value')
44 | endpage=int(endpage)
45 | except:
46 | break
47 | page+=1
48 |
--------------------------------------------------------------------------------
/www.ctrip.com/youtrip.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import xlwt3
5 | headers = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Language': 'en-US,en;q=0.5',
9 | 'Accept-Encoding': 'gzip, deflate',
10 | 'Connection': 'keep-alive'}
11 |
12 | def getUrl():
13 | f=open('urls.txt','a')
14 | page=1
15 | while True:
16 | html=requests.get('http://you.ctrip.com/travels/guilin28/t3-p{}.html'.format(page),headers=headers).text
17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'journalslist cf'}).find_all('a',attrs={'class':'journal-item cf'})
18 | for item in table:
19 | title=item.find('dt').get_text().replace('\r','').replace('\n','')
20 | f.write(title+'||'+item.get('href')+'\n')
21 | print(page,'--ok')
22 | page+=1
23 | if page==991:
24 | break
25 | time.sleep(2)
26 | f.close()
27 |
28 | def getcontent(url):
29 | html=requests.get(url,headers=headers).text
30 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'ctd_content'})
31 | body=soup.get_text()
32 | place=soup.find('div',{'class':'ctd_content_controls cf'}).get_text()
33 | result=body.replace(place,'')
34 | return result
35 |
36 |
37 | def main():
38 | excel=xlwt3.Workbook()
39 | sheet=excel.add_sheet('sheet')
40 | count=0
41 | for line in open('urls.txt','r'):
42 | line=line.replace('\n','')
43 | title=line.split('||')[0]
44 | url='http://you.ctrip.com'+line.split('||')[-1]
45 | try:
46 | content=getcontent(url)
47 | except:
48 | failed=open('failed.txt','a')
49 | failed.write(line+'\n')
50 | failed.close()
51 | continue
52 | sheet.write(count,0,count)
53 | sheet.write(count,1,title)
54 | sheet.write(count,2,content)
55 | count+=1
56 | excel.save('result.xls')
57 | time.sleep(2)
58 | print(count,'--ok')
59 |
60 |
--------------------------------------------------------------------------------
/www.dicos.com.cn/storelist.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 |
5 | def citys():
6 | f=open('citys.txt','a')
7 | for pid in range(6,33):
8 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getcityhtml&iscity=1&pid=%s'%pid).text
9 | table=BeautifulSoup(html,'lxml').find_all('option')
10 | for item in table:
11 | f.write(item.get_text()+'|'+item.get('value')+'\n')
12 | f.close()
13 |
14 | def get_store(citycode):
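# the endpoint answers with an HTML fragment of <tr> rows; keep the 2nd-4th cell of every row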
15 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getstorehtml&waimai=0&mProvince=3&mCity=%s'%citycode).text
16 | table=BeautifulSoup(html,'lxml').find_all('tr')
17 | result=[]
18 | for item in table:
19 | text=''
20 | for td in item.find_all('td')[1:4]:
21 | text+='|'+td.get_text()
22 | result.append(text.replace('\r','').replace('\n',''))
23 | return result
24 |
25 | def main():
26 | f=open('result.txt','a')
27 | for line in open('citys.txt'):
28 | line=line.replace('\n','')
29 | try:
30 | result=get_store(line.split('|')[-1])
31 | except:
32 | failed=open('failed.txt','a')
33 | failed.write(line+'\n')
34 | failed.close()
35 | continue
36 | for item in result:
37 | f.write(line+item+'\n')
38 | print(line,'ok')
39 | f.close()
40 |
41 | def write_to_excel():
42 | result={}
43 | excel=openpyxl.Workbook(write_only=True)
44 | sheet1=excel.create_sheet('1')
45 | for line in open('result.txt','r'):
46 | line=line.replace('\n','')
47 | lists=line.split('|')
48 | try:
49 | result[lists[0]]+=1
50 | except:
51 | result[lists[0]]=1
52 | sheet1.append(lists)
53 | sheet2=excel.create_sheet('2')
54 | for key in result:
55 | sheet2.append([key,result[key]])
56 | excel.save('result.xlsx')
57 |
58 | write_to_excel()
59 |
--------------------------------------------------------------------------------
/www.eastmoney.com/quote.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import openpyxl
3 | import json
4 |
5 |
6 | def get_data(code,market):
7 | url='http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/CompatiblePage.aspx?Type=OB&stk=%s&Reference=xml&limit=0&page=%s'
8 | html=requests.get(url%(code+market,1)).text
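# the response looks like "var jsTimeSharingData={pages:...,data:[...]}"; quote the bare keys so it parses as JSON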
9 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"'))
10 | if data['pages']==0:
11 | return False
12 | pages=data['pages']
13 | page=2
14 | result=[]
15 | for item in data['data']:
16 | result.append(item.split(','))
17 | while page<=pages:
18 | html=requests.get(url%(code+market,page)).text
19 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"'))
20 | for item in data['data']:
21 | result.append(item.split(','))
22 | page+=1
23 | return result
24 |
25 | def write_to_excel(code,result):
26 | excel=openpyxl.Workbook(write_only=True)
27 | sheet=excel.create_sheet()
28 | for item in result:
29 | sheet.append(item)
30 | excel.save('%s.xlsx'%code)
31 | print(code,'OK')
32 |
33 | def main():
34 | try:
35 | code=input('输入股票代码:')
36 | except:
37 | print("Faliled")
38 | return
39 | result=[]
40 | for market in ['1','2']:
41 | try:
42 | result=get_data(code,market)
43 | except:
44 | continue
45 | if result==False:
46 | continue
47 | break
48 | if result==[] or result==False:
49 | print('Failed')
50 | return
51 | write_to_excel(code,result)
52 |
53 | while True:
54 | main()
--------------------------------------------------------------------------------
/www.eastmoney.com/transaction.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import openpyxl
3 | import re
4 | import time
5 | import os
6 |
7 |
8 | def get_data(code,market):
9 | url='http://nufm3.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=%s&sty=DPTTFD&st=z&sr=1&p=1&ps=&cb=&token=beb0a0047196124721f56b0f0ff5a27c'
10 | html=requests.get(url%(code+market)).text
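# the quoted payload is a list of records separated by '|', each record a '~'-separated list of fields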
11 | if 'false' in html:
12 | return False
13 | text=re.findall('"(.*?)"',html)[0]
14 | lines=text.split('|')
15 | result=[]
16 | for line in lines:
17 | result.append(line.split('~'))
18 | return result
19 |
20 | def write_to_excel(code,result):
21 | excel=openpyxl.Workbook(write_only=True)
22 | sheet=excel.create_sheet()
23 | for item in result:
24 | sheet.append(item)
25 | try:
26 | os.mkdir('result/'+code)
27 | except:
28 | pass
29 | date=time.strftime('%Y-%m-%d',time.localtime())
30 | excel.save('result/'+code+'/%s.xlsx'%date)
31 |
32 | def get_transaction(code):
33 | global result
data=False  # stays False if both market variants fail
34 | for market in ['1','2']:
35 | try:
36 | data=get_data(code,market)
37 | except:
38 | continue
39 | if data==False:
40 | continue
41 | break
42 | if data==[] or data==False:
43 | print('Failed')
44 | return
45 | timenow=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
46 | print(timenow,code,'ok')
47 | is_write=False
48 | for line in data:
49 | if line in result:
50 | continue
51 | result.append(line)
52 | is_write=True
53 | if is_write:
54 | write_to_excel(code,result)
55 |
56 |
57 | code=input('输入股票代码:')
58 | result=[]
59 | while True:
60 | get_transaction(code)
61 | time.sleep(0.5)
--------------------------------------------------------------------------------
/www.fang.com/new_hourse.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import openpyxl
5 |
6 | headers = {
7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
8 | "Accept-Encoding": "gzip, deflate",
9 | "Accept-Language": "en-US,en;q=0.5",
10 | "Connection": "keep-alive",
11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
12 |
13 |
14 | def get_house():
15 | page=1
16 | url='http://newhouse.cs.fang.com/house/s/b9'
17 | while True:
18 | html=requests.get(url+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk')
19 | table=BeautifulSoup(html,'lxml').find('div',{'class':'nhouse_list'}).find_all('li')
20 | f=open('urls.txt','a')
21 | for item in table:
22 | detail=item.find('div',{'class':'nlc_details'})
23 | house_url=detail.find('a').get('href')
24 | name=detail.find('a').get_text()
25 | address_div=detail.find('div',{'class':'address'})
26 | address=address_div.find('a').get('title')
27 | try:
28 | location=address_div.find('span').get_text()
29 | except:
30 | location='-'
31 | try:
32 | price=detail.find('div',{'class':'nhouse_price'}).find('span').get_text()
33 | except:
34 | price='-'
35 | line=name+'|'+house_url+'|'+price+'|'+location+'|'+address
36 | line=line.replace('\r','').replace('\n','').replace('\t','')
37 | f.write(line+'\n')
38 | f.close()
39 | print(page,'ok')
40 | page+=1
41 | time.sleep(1)
42 |
43 | def get_house_live_history(url):
44 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk')
45 | table=BeautifulSoup(html,'lxml').find('div',id='tc_jiaofang').find_all('tr')
46 | lines=[]
47 | for tr in table[2:-1]:
48 | tds=tr.find_all('td')
49 | date=tds[0].get_text()
50 | month=date.split('-')[1]
51 | infor=tds[1].get_text()
52 | line=month+'|'+date+'|'+infor
53 | lines.append(line.replace('\xa0',''))
54 | return lines
55 |
56 | def house_live_history():
57 | is_ok=True
58 | for item in open('urls.txt','r'):
59 | item=item.replace('\n','')
60 | url=item.split('|')[1]
61 | if url!='http://jiulongshanjy.fang.com/' and is_ok==True:
62 | continue
63 | is_ok=False
64 | try:
65 | lines=get_house_live_history(url)
66 | except:
67 | lines=[]
68 | print(item)
69 | f=open('changsha.txt','a')
70 | if lines==[]:
71 | f.write(item+'\n')
72 | f.close()
73 | continue
74 | for line in lines:
75 | f.write(item+'|'+line+'\n')
76 | f.close()
77 | time.sleep(1)
78 |
79 | house_live_history()
80 |
--------------------------------------------------------------------------------
/www.ganji.com/ganji_tel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 | import time
7 | import openpyxl
8 |
9 | headers = {
10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
11 | "Accept-Encoding": "gzip, deflate",
12 | "Accept-Language": "en-US,en;q=0.5",
13 | "Connection": "keep-alive",
14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
15 |
16 |
17 | def get_tels(url):
18 | html=requests.get(url,headers=headers).text
19 | try:
20 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'list'}).find_all('li')
21 | except:
22 | return []
23 | tels=[]
24 | for li in table:
25 | try:
26 | tel=li.find('div',attrs={'class':'list-r-area'}).find('p',attrs={'class':'tel'}).find('span').get_text()
27 | except:
28 | continue
29 | tels.append(tel)
30 | return tels
31 |
32 |
33 | def main():
34 | url=input('输入链接:')
35 | url=re.sub('o\d+/','',url)
36 | if not url.startswith('http'):
37 | url='http://'+url
38 | page=1
39 | tels=[]
40 | while True:
41 | try:
42 | result=get_tels(url+'o'+str(page)+'/')
43 | except:
44 | continue
45 | if result==[]:
46 | break
47 | tels+=result
48 | print('第%s页--完成'%page)
49 | page+=1
50 | time.sleep(5)
51 | tels=list(set(tels))
52 | count=0
53 | excel=openpyxl.Workbook(write_only=True)
54 | sheet=excel.create_sheet()
55 | for tel in tels:
56 | sheet.append([tel])
57 | excel.save('tels.xlsx')
58 |
59 | main()
60 |
--------------------------------------------------------------------------------
/www.gewara.com/reviews.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import time
5 |
6 |
7 | headers = {
8 | 'X-Requested-With':"XMLHttpRequest",
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "Connection": "keep-alive",
13 | 'Referer':"http://www.gewara.com/movie/282568860",
14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
15 |
16 | def getreviews(page,relatedid):
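# fetch one page of review snippets for a movie; long reviews only carry a data-id here and their full text is fetched later via getcontent()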
17 | html=requests.get('http://www.gewara.com/activity/ajax/sns/qryComment.xhtml?pageNumber={}&relatedid={}&topic=&issue=false&hasMarks=true&isCount=true&tag=movie&isPic=true&isVideo=false&userLogo=&newWalaPage=true&isShare=false&isNew=true&maxCount=200&isWide=true&isTicket=false'.format(page,relatedid),headers=headers).text
18 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'page_wala'})
19 | result=[]
20 | for item in table:
21 | try:
22 | grade=item.find('span',{'class':'ui_grades left ui_grade10'}).get('title')
23 | reviewid=item.find('div',{'class':'wala_txt'}).get('data-id')
24 | if reviewid==None:
25 | review=item.find('div',{'class':'wala_miniTxt'}).get_text().replace('\r','').replace('\n','').replace('\t','')
26 | result.append({'grade':grade,'review':review})
27 | continue
28 | result.append({'grade':grade,'id':reviewid})
29 | except:
30 | continue
31 | return result
32 |
33 | def getcontent(id):
34 | html=requests.get('http://www.gewara.com/activity/sns/ajaxCommentDetail.xhtml?id=%s&isNew=true'%id).text
35 | text=BeautifulSoup(html,'lxml').get_text().replace('\r','').replace('\n','').replace('\t','')
36 | return text
37 |
38 | def write_to_excel():
39 | excel=openpyxl.Workbook(write_only=True)
40 | sheet=excel.create_sheet()
41 | for line in open('result.txt','r'):
42 | item=eval(line)
43 | sheet.append([item['grade'],item['review']])
44 | excel.save('result.xlsx')
45 |
46 | def main():
47 | f=open('result.txt','a')
48 | page=1
49 | count=0
50 | while True:
51 | try:
52 | result=getreviews(page,'282568860')
53 | except:
54 | print('failed')
55 | time.sleep(3)
56 | continue
57 | for item in result:
58 | try:
59 | dataid=item['id']
60 | except:
61 | count+=1
62 | print(count)
63 | f.write(str(item)+'\n')
64 | continue
65 | try:
66 | review=getcontent(dataid)
67 | except:
68 | continue
69 | item['review']=review
70 | f.write(str(item)+'\n')
71 | count+=1
72 | print(count)
73 | time.sleep(0.5)
74 | print(page,'ok')
75 | page+=1
76 | if page==200:
77 | break
78 | f.close()
79 |
80 | write_to_excel()
81 |
--------------------------------------------------------------------------------
/www.imdb.com/boxoffice.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 | headers = {
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | def get_url(title):
15 | name=re.sub('\(.*?\)','',title)#.lower()#.replace(' ','')
16 | html=requests.get('http://www.boxofficemojo.com/search/?q=%s'%name,headers=headers).text.replace('\r','').replace('\n','').replace('\t','')
17 | rel='bgcolor=#FFFF99>(.*?)</tr>'
18 | tr=re.findall(rel,html)[0]#BeautifulSoup(html,'lxml').find('tr',attrs={'bgcolor':'#FFFF99'})
19 | tds=BeautifulSoup(str(tr),'lxml').find_all('td')
20 | #tds=tr.findall('td')
21 | url='http://www.boxofficemojo.com'+tds[0].find('a').get('href')
22 | de=tds[2].get_text()
23 | html=requests.get(url,headers=headers).text
24 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'mp_box_content'}).get_text().replace('\r','|').replace('\n','|')
25 | print(table)
26 | line=de
27 | rel='Worldwide:\|(.*?)\|'
28 | try:
29 | wl=re.findall(rel,table)[0]
30 | except:
31 | wl='-'
32 | line=de+'||'+wl
33 | return line
34 |
35 | def main():
36 | f=open('data.txt','w')
37 | for line in open('new.txt','r').readlines():
38 | line=line.replace('\r','').replace('\n','')
39 | try:
40 | price=get_url(line)
41 | except:
42 | price='--||--'
43 | f.write(line+'||'+price+'\n')
44 | print(price)
45 |
46 | main()
47 |
--------------------------------------------------------------------------------
/www.imdb.com/movies.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | from selenium import webdriver
7 |
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "Connection": "keep-alive",
13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
14 |
15 | def get_movies():
16 | f=open('data_movies2013.txt','a')
17 | start=1
18 | while start<8519:
19 | try:
20 | html=requests.get('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&start=%s&title_type=feature&year=2013,2013'%start,headers=headers,timeout=30).text
21 | except:
22 | continue
23 | items=parser(html)
24 | for item in items:
25 | f.write(item+'\n')
26 | start+=50
27 | print(start)
28 |
29 | def parser(html):
30 | items=[]
31 | table=BeautifulSoup(html,'lxml').find('table',attrs={'class':'results'}).find_all('tr')[1:]
32 | for item in table:
33 | td=item.find('td',attrs={'class':'title'})
34 | title=item.find('a').get('title')
35 | try:
36 | score=td.find('span',attrs={'class':'rating-rating'}).get_text()
37 | except:
38 | score='-'
39 | try:
40 | col=item.find('td',attrs={'class':'sort_col'}).get_text()
41 | except:
42 | col='-'
43 | text=title+'||'+score+'||'+col
44 | items.append(text)
45 | return items
46 |
47 | get_movies()
48 |
--------------------------------------------------------------------------------
/www.imdb.com/rottentomatoes.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 | import threading
7 |
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "Connection": "keep-alive",
13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
14 |
15 | class Score(threading.Thread):
16 | def __init__(self,line):
17 | super(Score,self).__init__()
18 | self.line=line
19 | self.name=self.line.split('||')[0]
20 |
21 | def run(self):
22 | try:
23 | self.score=self.get_score(self.name)
24 | except:
25 | self.score='-'
26 | print(self.score)
27 | self.line=self.line+'||'+self.score
28 |
29 | def get_score(self,name):
30 | try:
31 | html=requests.get('http://www.rottentomatoes.com/search/?search=%s'%name.replace(' ','+'),headers=headers,timeout=40).text
32 | except:
33 | return self.get_score(name)
34 | try:
35 | table=BeautifulSoup(html,'lxml').find('ul',id='movie_results_ul').find_all('li')
36 | except:
37 | return score(html)
38 | url=''
39 | for li in table:
40 | title=li.find('div',attrs={'class':'nomargin media-heading bold'}).get_text().replace('\r','').replace('\n','').replace(' ','')
41 | if title.lower()==name.replace(' ','').lower():
42 | url='http://www.rottentomatoes.com'+li.find('a').get('href')
43 | break
44 | if(url==''):
45 | return '-'
46 | html=requests.get(url,headers=headers,timeout=40).text
47 | return score(html)
48 |
49 | def score(html):
50 | text=BeautifulSoup(html,'lxml').find('div',id='scorePanel').get_text().replace('\r','').replace('\n','').replace(' ','')
51 | rel='AverageRating:(.*?)R'
52 | try:
53 | result=re.findall(rel,text)[0]
54 | return result
55 | except:
56 | return '-'
57 |
58 |
59 | def main():
60 | f=open('movies_2013.txt','a')
61 | items=[]
62 | for line in open('data_movies2013.txt','r').readlines():
63 | line=line.replace('\n','')
64 | items.append(line)
65 | if(len(items)<40):
66 | continue
67 | threadings=[]
68 | for item in items:
69 | work=Score(item)
70 | threadings.append(work)
71 | for work in threadings:
72 | work.start()
73 | for work in threadings:
74 | work.join()
75 | for work in threadings:
76 | f.write(work.line+'\n')
77 | items=[]
78 | threadings=[]
79 |
80 | for item in items:
81 | work=Score(item)
82 | threadings.append(work)
83 | for work in threadings:
84 | work.start()
85 | for work in threadings:
86 | work.join()
87 | for work in threadings:
88 | f.write(work.line+'\n')
89 | f.close()
90 |
91 | main()
92 |
--------------------------------------------------------------------------------
/www.itjuzi.com/baseInvestevents.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import xlwt3
6 | import time
7 |
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 |
15 | def get_infor(url):
16 | html=requests.get(url,headers=headers,timeout=50).text
17 | results=[]
18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li')
19 | for li in table:
20 | item={}
21 | i=li.find_all('i')
22 | item['date']=i[0].get_text().replace('\n','').replace('\t','')
23 | spans=i[2].find_all('span')
24 | item['name']=spans[0].get_text().replace('\n','').replace('\t','')
25 | item['industry']=spans[1].get_text().replace('\n','').replace('\t','')
26 | item['local']=spans[2].get_text().replace('\n','').replace('\t','')
27 | item['round']=i[3].get_text().replace('\n','').replace('\t','')
28 | item['capital']=i[4].get_text().replace('\n','').replace('\t','')
29 | companys=i[5].find_all('a')
30 | Investmenters=''
31 | if(companys==[]):
32 | Investmenters=i[5].get_text().replace('\n','').replace('\t','')
33 | else:
34 | for a in companys:
35 | Investmenters+=a.get_text().replace('\n','').replace('\t','')+';'
36 | item['Investmenters']=Investmenters
37 | results.append(item)
38 | return results
39 |
40 | def main():
41 | excel=xlwt3.Workbook()
42 | sheet=excel.add_sheet('sheet')
43 | count=0
44 | startpage=1
45 | keys=['date','name','industry','local','round','capital','Investmenters']
46 | while startpage<1143:
47 | try:
48 | results=get_infor('https://www.itjuzi.com/investevents?page=%s'%startpage)
49 | except:
50 | time.sleep(5)
51 | continue
52 | for item in results:
53 | num=0
54 | for key in keys:
55 | sheet.write(count,num,item[key])
56 | num+=1
57 | count+=1
58 | print(startpage,'--ok')
59 | startpage+=1
60 | time.sleep(3)
61 | excel.save('investevents.xls')
62 | main()
63 |
--------------------------------------------------------------------------------
/www.itjuzi.com/companylist.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import openpyxl
5 |
6 | headers = {
7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
9 | 'Accept-Language': 'en-US,en;q=0.5',
10 | 'Accept-Encoding': 'gzip, deflate',
11 | 'Connection': 'keep-alive'}
12 |
13 | def get_companylist(page):
14 | html=requests.get('http://www.itjuzi.com/company?page=%s'%page,headers=headers,timeout=30).text
15 | table=BeautifulSoup(html,'html.parser').find_all('ul',{'class':'list-main-icnset'})[1].find_all('li')
16 | if len(table)==0:
17 | return []
18 | result=[]
19 | for li in table:
20 | try:
21 | img=li.find('img').get('src').split('?')[0]
22 | title=li.find('p',{'class':'title'}).get_text()
23 | url=li.find('a').get('href')
24 | des=li.find('p',{'class':'des'}).get_text()
25 | tags=li.find('span',{'class':'tags'}).get_text()
26 | loca=li.find('span',{'class':'loca'}).get_text()
27 | date=li.find('i',{'class':'date'}).get_text()
28 | round=li.find('i',{'class':'round'}).get_text()
29 | except:
30 | continue
31 | result.append([img,title,url,des,tags,loca,date,round])
32 | return result
33 |
34 | def write_to_excel(result):
35 | excel=openpyxl.Workbook(write_only=True)
36 | sheet=excel.create_sheet()
37 | filename=time.strftime("%Y%m%d_%H%M%S",time.localtime())+'.xlsx'
38 | for line in result:
39 | sheet.append(line)
40 | excel.save(filename)
41 |
42 | def loadcompany():
43 | companys=[]
44 | for line in open('result.txt','r',encoding='utf-8'):
45 | companys.append(line.replace('\r','').replace('\n',''))
46 | return companys
47 |
48 | def main():
49 | try:
50 | companys=loadcompany()
51 | except:
52 | companys=[]
53 | page=1
54 | f=open('result.txt','w',encoding='utf-8')
55 | flag=False
56 | new_list=[]
57 | while True:
58 | try:
59 | result=get_companylist(page)
60 | except:
61 | time.sleep(5)
62 | continue
63 | if result==[]:
64 | break
65 | for item in result:
66 | line='||'.join(item)
67 | line=line.replace('\r','').replace('\n','').replace('\t','')
68 | if line in companys:
69 | flag=True
70 | break
71 | new_list.append(item)
72 | f.write(line+'\r\n')
73 | if flag:
74 | break
75 | print(page,'ok')
76 | page+=1
77 | time.sleep(3)
78 | for company in companys:
79 | f.write(company+'\r\n')
80 | f.close()
81 | write_to_excel(new_list)
82 |
83 | main()
84 |
--------------------------------------------------------------------------------
/www.itjuzi.com/investevents.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import xlwt3
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate',
12 | 'Connection': 'keep-alive'}
13 |
14 | def get_base_infor():
15 | f=open('data.txt','a')
16 | for page in range(1048):
17 | html=requests.get('https://www.itjuzi.com/investevents?page=%s'%(page+1),headers=headers).text
18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li')
19 | for li in table:
20 | item={}
21 | i=li.find_all('i')
22 | item['date']=i[0].get_text()
23 | item['url']=i[1].find('a').get('href')
24 | spans=i[2].find_all('span')
25 | item['name']=spans[0].get_text()
26 | item['industry']=spans[1].get_text()
27 | item['local']=spans[2].get_text()
28 | item['round']=i[3].get_text()
29 | item['capital']=i[4].get_text()
30 | companys=i[5].find_all('a')
31 | lists=[]
32 | if(companys==[]):
33 | lists.append(i[5].get_text())
34 | else:
35 | for a in companys:
36 | lists.append(a.get_text())
37 | item['Investmenters']=lists
38 | f.write(str(item)+'\n')
39 | print(page)
40 |
41 | def main():
42 | f=open('data.txt','r')
43 | data_f=open('investevents.txt','a')
44 | failed_f=open('failed.txt','a')
45 | for line in f.readlines():
46 | try:
47 | item=eval(line.replace('\n',''))
48 | html=requests.get(item['url'],headers=headers).text
49 | url=BeautifulSoup(html,'lxml').find('div',attrs={'class':'block-inc-fina'}).find('a',attrs={'class':'incicon'}).get('href')
50 | html=requests.get(url,headers=headers).text
51 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'thewrap'})
52 | table=soup.find('div',attrs={'class':'sec'})
53 | company_url=table.find('div',attrs={'class':'rowhead'}).find('div',attrs={'class':'row c-gray-aset'}).find('div',attrs={'class':'dbi linkset c-gray'}).find('a').get('href')
54 | tags=[]
55 | for a in table.find('div',attrs={'class':'rowfoot'}).find('div',attrs={'class':'tagset dbi'}).find_all('a'):
56 | tags.append(a.get_text())
57 | des=soup.find('div',attrs={'class':'block block-inc-info'}).find('div',attrs={'class':'des'}).get_text()
58 | item['company_url']=company_url
59 | item['tags']=tags
60 | item['des']=des
61 | data_f.write(str(item)+'\n')
62 | print(item['url'])
63 | except:
64 | failed_f.write(line)
65 |
66 | main()
67 |
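
main() above re-parses every saved line with eval(). A minimal sketch of the same parsing step using ast.literal_eval as a safer stand-in, assuming each line of data.txt is the dict literal written by get_base_infor():

import ast

def load_items(path='data.txt'):
    # parse each saved dict literal without evaluating arbitrary expressions
    items=[]
    for line in open(path,'r'):
        line=line.strip()
        if not line:
            continue
        try:
            items.append(ast.literal_eval(line))
        except (ValueError,SyntaxError):
            continue
    return items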
--------------------------------------------------------------------------------
/www.jisilu.com/jisilu.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import openpyxl
4 | import time
5 |
6 | headers = {
7 | 'Host':"www.jisilu.cn",
8 | 'Accept':"application/json, text/javascript, */*; q=0.01",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | 'Content-Type':"application/x-www-form-urlencoded; charset=UTF-8",
13 | 'X-Requested-With':"XMLHttpRequest",
14 | 'Cookie':"kbzw__Session=4sv8h9vjir144ijdh02h4nefd0; Hm_lvt_164fe01b1433a19b507595a43bf58262=1468934580; Hm_lpvt_164fe01b1433a19b507595a43bf58262=1468935752; kbz_newcookie=1; kbzw__user_login=7Obd08_P1ebax9aX5dvi0OXc5ZmcndHV7Ojg6N7bwNOM2KjZqpmgw6feqM6upamTqJmt3KbbkaKU17HXoNql2ZiXnKTs3Ny_zYylr6qgspyYnaO2uNXQo67f293l4cqooaWSlonPqKSzgcXD6efp3rSMw8vk1u-X67CXz5eotJXb76arlqSRoJe63cTb0KOrpZqpnKiSp4G94OXdx9_Zo62pl6k.",
15 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
16 |
17 | def login():
18 | logindata=open('user','r',encoding='utf-8').read().replace('\r','').replace('\n','')
19 | logindata=eval(logindata)
20 | data={
21 | 'user_name':logindata['user_name'],
22 | 'password':logindata['password'],
23 | 'net_auto_login':1,
24 | '_post_type':'ajax',
25 | 'return_url':'https://www.jisilu.cn'
26 | }
27 | session=requests.session()
28 | session.post('https://www.jisilu.cn/account/ajax/login_process/',data=data).text
29 | return session
30 |
31 | def getdata():
32 | data={
33 | 'is_search':"0",
34 | 'avolume':"100",
35 | 'bvolume':"100",
36 | 'market':["sh","sz"],
37 | 'ptype':"price",
38 | 'rp':"50",
39 | 'page':"1"
40 | }
41 | session=login()
42 | timestr=str(time.time()).replace('.','')
43 | html=session.post('https://www.jisilu.cn/data/sfnew/arbitrage_vip_list/?___t=%s'%timestr,data=data).text
44 | data=json.loads(html)['rows']
45 | print(data[0])
46 | write_to_excel(data)
47 | print('OK')
48 |
49 | def write_to_excel(data):
50 | keys=['fundA_id','fundA_nm','sell1A','increase_rtA','fundA_volume','fundA_amount_increase',
51 | 'fundB_id','fundB_nm','sell1B','increase_rtB','fundB_volume','fundB_amount_increase',
52 | 'abrate','merge_price','est_dis_rt','base_fund_id','base_fund_nm','base_nav','base_est_val',
53 | 'index_nm','idx_incr_rt','asset_ratio','asset_ratio_last','apply_fee','redeem_fee']
54 | excel=openpyxl.Workbook(write_only=True)
55 | sheet=excel.create_sheet()
56 | for item in data:
57 | cell=[]
58 | for key in keys:
59 | try:
60 | cell.append(item['cell'][key])
61 | except:
62 | cell.append('-')
63 | sheet.append(cell)
64 | excel.save('result.xlsx')
65 |
66 | while True:
67 | try:
68 | getdata()
69 | except:
70 | print('Failed')
71 | continue
72 | time.sleep(10)
73 | break
74 |
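
login() above eval()s a local file named user and reads its user_name and password keys, so that file is presumably a single dict literal along these lines (placeholder credentials):

# contents of the file "user" (placeholder values)
{'user_name':'your_account','password':'your_password'}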
--------------------------------------------------------------------------------
/www.kfc.com/storelist.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import time
5 | import openpyxl
6 |
7 | def citys():
8 | html=open('index.html','r').read()
9 | table=BeautifulSoup(html,'lxml').find('ul',{'class':'city_info'}).find_all('li')
10 | f=open('citys.txt','w')
11 | for li in table:
12 | for item in li.find_all('a'):
13 | f.write(item.get_text()+'\n')
14 | f.close()
15 |
16 | def get_store(city):
17 | result=[]
18 | page=1
19 | while True:
20 | data={
21 | 'cname':city,
22 | 'pid':"",
23 | 'pageIndex':page,
24 | 'pageSize':"100"
25 | }
26 | html=requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname',data=data).text
27 | stores=json.loads(html)['Table1']
28 | if stores==[]:
29 | break
30 | page+=1
31 | for item in stores:
32 | result.append(item['storeName']+'|'+item['cityName']+'|'+item['addressDetail']+'|'+item['pro'])
33 | time.sleep(1)
34 | return result
35 |
36 |
37 | def main():
38 | f=open('result.txt','a')
39 | for line in open('citys.txt','r'):
40 | city=line.replace('\n','')
41 | try:
42 | result=get_store(city)
43 | except:
44 | failed=open('failed.txt','a')
45 | failed.write(city+'\n')
46 | failed.close()
47 | continue
48 | for item in result:
49 | f.write(item+'\n')
50 | print(city,'ok')
51 | f.close()
52 |
53 | def write_to_excel():
54 | result={}
55 | excel=openpyxl.Workbook(write_only=True)
56 | sheet1=excel.create_sheet('1')
57 | for line in open('result.txt','r'):
58 | line=line.replace('\n','')
59 | lists=line.split('|')
60 | lists[0]=lists[0]+'餐厅'
61 | try:
62 | result[lists[1]]+=1
63 | except:
64 | result[lists[1]]=1
65 | sheet1.append(lists)
66 | sheet2=excel.create_sheet('2')
67 | for key in result:
68 | sheet2.append([key,result[key]])
69 | excel.save('result.xlsx')
70 |
71 | write_to_excel()
72 |
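
Only write_to_excel() is called at the bottom, so citys.txt and result.txt are assumed to already exist from earlier runs. A sketch of the full pipeline on a fresh start, assuming index.html has been saved locally from the KFC store-list page:

citys()            # build citys.txt from the saved index.html
main()             # crawl every city into result.txt (failed cities go to failed.txt)
write_to_excel()   # summarise result.txt into result.xlsx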
--------------------------------------------------------------------------------
/www.kimiss.com/Nyspider.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import os
5 | import sqlite3
6 |
7 | headers = {
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | def get_html(url):
15 | html=requests.get(url,headers=headers).text
16 | return html
17 |
18 |
19 | def get_image(image_url,image_name):
20 | content=requests.get(image_url,headers=headers).content
21 | with open(image_name,'wb') as f:
22 | f.write(content)
23 | # the with-block already closes the file
24 |
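
A short usage sketch of the two helpers above (the category URL comes from man.txt below; the image URL is a placeholder):

if __name__=='__main__':
    page=get_html('http://product.kimiss.com/nanshijiemian/')
    get_image('http://example.com/sample.jpg','sample.jpg')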
--------------------------------------------------------------------------------
/www.kimiss.com/man.txt:
--------------------------------------------------------------------------------
1 | {'男士面部护理': ['http://product.kimiss.com/nanshirunchungao2/', 'http://product.kimiss.com/nanshiyanbujinghua2/', 'http://product.kimiss.com/nanshiyanshuang2/', 'http://product.kimiss.com/nanshiruye/', 'http://product.kimiss.com/nanshijiemian/', 'http://product.kimiss.com/nanshishuangfushui/', 'http://product.kimiss.com/nanshijinghua/', 'http://product.kimiss.com/nanshimianmo/', 'http://product.kimiss.com/nanshifangshai/', 'http://product.kimiss.com/nanshitaozhuang/', 'http://product.kimiss.com/nanshimianbuqujiaozhi/']}
2 | {'男士身体护理': ['http://product.kimiss.com/nanshimuyulu/', 'http://product.kimiss.com/nanshishuangshenxiangtipin/', 'http://product.kimiss.com/nanshirunfuru/', 'http://product.kimiss.com/nanshixiantichanpin/', 'http://product.kimiss.com/nanshishentimoshagao/', 'http://product.kimiss.com/nanshisichuhuli/']}
3 | {'男士剃须护理': ['http://product.kimiss.com/tixudao/', 'http://product.kimiss.com/xuhouhuli/', 'http://product.kimiss.com/xuqianhuli/']}
4 | {'男士美发护发': ['http://product.kimiss.com/nanshitoufazaoxing/', 'http://product.kimiss.com/nanshixifa/', 'http://product.kimiss.com/nanshirunfa/']}
5 | {'男士面部彩妆': ['http://product.kimiss.com/nanshibbshuang/', 'http://product.kimiss.com/nanshifendi/', 'http://product.kimiss.com/nanshigelishuang/', 'http://product.kimiss.com/nanshizhexia/', 'http://product.kimiss.com/nanshijiemaogao/', 'http://product.kimiss.com/nanshisanfen/']}
6 |
--------------------------------------------------------------------------------
/www.lagou.com/lagou.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import requests
3 | import json
4 | import time
5 | from write_sql import write2sqlite
6 | from bs4 import BeautifulSoup
7 |
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 |
15 | def get_jobs(keyword):
16 | jobs=[]
17 | page=1
18 | while True:
19 | js_data=requests.get('http://www.lagou.com/jobs/positionAjax.json?px=new&kd=%s&pn=%s&'%(keyword,page),headers=headers).text
20 | data=json.loads(js_data)
21 | data=data['content']['positionResult']['result']
22 | for item in data:
23 | job={}
24 | job['fromsite']='拉勾'
25 | job['id']=item['positionId']
26 | job['companyId']=item['companyId']
27 | job['positionType']=keyword
28 | job['positionName']=item['positionName']
29 | job['company']=item['companyFullName']
30 | job['salary']=item.get('salary')
31 | job['workYear']=item['workYear']
32 | job['education']=item['education']
33 | job['industryField']=item['industryField']
34 | job['companySize']=item['companySize']
35 | job['city']=item['city']
36 | job['financeStage']=item['financeStage']
37 | jobs.append(job)
38 | print(page,keyword,'ok')
39 | page+=1
40 | if page==31:
41 | break
42 | time.sleep(1)
43 | return jobs
44 |
45 | def get_job_des(jobid):
46 | url='http://www.lagou.com/jobs/%s.html'%jobid
47 | html=requests.get(url,headers=headers,timeout=30).text
48 | des=BeautifulSoup(html,'lxml').find('dd',{'class':'job_bt'}).get_text()
49 | return des
50 |
51 | def get_company_rate(companyid):
52 | url='http://www.lagou.com/gongsi/%s.html'%(companyid)
53 | html=requests.get(url,headers=headers,timeout=30).text
54 | rate=BeautifulSoup(html,'lxml').find('div',{'class':'reviews-top'}).find('span',{'class':'score'}).get_text()
55 | return rate
56 |
57 | def main():
58 | keywords=[line.replace('\n','') for line in open('type.txt','r')]
59 | for keyword in keywords:
60 | jobs=get_jobs(keyword)
61 | result=[]
62 | for job in jobs:
63 | try:
64 | des=get_job_des(job['id'])
65 | except:
66 | des='-'
67 | try:
68 | rate=get_company_rate(job['companyId'])
69 | except:
70 | rate='-'
71 | job['jobDes']=des
72 | job['rate']=rate
73 | result.append(job)
74 | time.sleep(1)
75 | write2sqlite(result,keyword)
76 | print(keyword,'ok')
77 | main()
78 |
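
lagou.py imports write2sqlite from a write_sql module that is not included alongside it. A minimal sketch of what that helper might look like, assuming a single shared jobs table holding the fields built in get_jobs() and main():

# write_sql.py (assumed layout; the real module is not in this directory)
import sqlite3

def write2sqlite(jobs,keyword,dbname='lagou.db'):
    conn=sqlite3.connect(dbname)
    cur=conn.cursor()
    cur.execute('CREATE TABLE IF NOT EXISTS jobs ('
                'id TEXT,keyword TEXT,positionName TEXT,company TEXT,salary TEXT,'
                'workYear TEXT,education TEXT,city TEXT,jobDes TEXT,rate TEXT)')
    for job in jobs:
        cur.execute('INSERT INTO jobs VALUES (?,?,?,?,?,?,?,?,?,?)',
                    (str(job['id']),keyword,job['positionName'],job['company'],
                     str(job['salary']),job['workYear'],job['education'],
                     job['city'],job['jobDes'],job['rate']))
    conn.commit()
    conn.close()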
--------------------------------------------------------------------------------
/www.locoso.com/locoso.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate',
12 | 'Connection': 'keep-alive'}
13 |
14 | def get_citys():
15 | url='http://www.locoso.com/s2/js/topcity.js'
16 | html=requests.get(url,headers=headers).text.replace('\\"','')
17 | table=BeautifulSoup(html,'lxml')
18 | lists=table.find_all('div',attrs={'class':'pro_bt'})
19 | f=open('citys.txt','a')
20 | root={}
21 | rel='prcity2(.*?)"'
22 | rel=re.compile(rel)
23 | citys={}
24 | for item in lists:
25 | try:
26 | root[str(item.get_text())]=eval(rel.findall(str(item))[0])[0]
27 | except:
28 | continue
29 | dicts={}
30 | for i in table.find('div',id=item.get('id')+'_2').find_all('li'):
31 | dicts[str(i.get_text())]=eval(rel.findall(str(i))[0])[0]
32 | citys[str(item.get_text())]=dicts
33 | for key in citys:
34 | for city in citys[key]:
35 | qu={}
36 | url='http://www.locoso.com/search/-all/c'+citys[key][city]
37 | html=requests.get(url,headers=headers).text
38 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'xiaofenlei_zhong02c2'}).find_all('li')
39 | dicts={}
40 | for i in table:
41 | dicts[i.find('a').get('title')]=i.find('a').get('href').replace('/search/-all/c','')
42 | qu[city]=dicts
43 | f.write(str(qu)+'\n')
44 | print(city)
45 |
46 | def get_industry():
47 | html=requests.get('http://www.locoso.com/search/-all/',headers=headers)
48 |
49 | get_citys()
50 |
--------------------------------------------------------------------------------
/www.mohurd.gov.cn/deal.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 |
5 | def load_level():
6 | level={}
7 | for line in open('Cost_qualification.txt','r'):
8 | line=line.replace('\n','').split('\t')
9 | print(line)
10 | level[line[0]]=line[1]
11 | return level
12 |
13 | def deal():
14 | keys=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表','工程监理资质','招标代理','造价咨询','一级注册建筑师','二级注册建筑师'
15 | ,'一级注册结构工程师','二级注册结构工程师','注册土木工程师(岩土)','注册公用设备工程师(暖通空调)','注册公用设备工程师(给水排水)','注册公用设备工程师(动力)'
16 | ,'注册公用设备工程师(发输变电)','注册公用设备工程师(供配电)','注册化工工程师','监理工程师','一级建造师','二级建造师','造价工程师']
17 | keys_two=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表']
18 | keys_three=['工程监理资质','招标代理','造价咨询','监理工程师','一级建造师','二级建造师']
19 | f=open('data.txt','w')
20 | level=load_level()
21 | for line in open('result.txt','r'):
22 | person={}
23 | item=eval(line)
24 | for key in keys:
25 | if key not in item:
26 | person[key]='N'
27 | else:
28 | person[key]='Y'
29 | for key in keys_two:
30 | person[key]=item[key]
31 | for key in keys_three:
32 | text=''
33 | try:
34 | for i in item[key]:
35 | if i not in text:
36 | text+=i+','
37 | person[key]=text[:-1]
38 | except:
39 | person[key]=text
40 | try:
41 | person['造价咨询']=level[item['name']]
42 | except:
43 | person['造价咨询']='-'
44 | text=''
45 | for key in keys:
46 | text+=person[key]+' ||'
47 | f.write(text+'\n')
48 | f.close()
49 |
50 | deal()
51 |
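
load_level() splits each line of Cost_qualification.txt on a tab and maps the first field to the second, and the result.txt it reads is presumably the file written by registrarinfor.py below. A placeholder record for the former:

# Cost_qualification.txt -- one tab-separated record per line (placeholder values):
# 示例造价咨询公司	甲级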
--------------------------------------------------------------------------------
/www.mohurd.gov.cn/registrarinfor.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 |
5 | headers = {
6 | 'Host':"210.12.219.18",
7 | 'X-Requested-With':"XMLHttpRequest",
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate',
12 | 'Referer':"http://210.12.219.18/jianguanfabuweb/companies.html",
13 | 'Cookie':"ASP.NET_SessionId=evkmapz1ljljsqh54siborwj",
14 | 'Connection': 'keep-alive'}
15 |
16 | def get_infor(item):
17 | url='http://210.12.219.18/jianguanfabuweb/'+item['url']
18 | html=requests.get(url,headers=headers,timeout=30).text
19 | soup=BeautifulSoup(html,'lxml').find('div',{'class':'content'})
20 | basic=soup.find('table',{'class':'engineer_basic_infor_table'}).get_text().replace('\r','').replace('\n','').replace(' ','')
21 | basic_re='姓名:(.*?)民族:(.*?)性别:(.*?)手.*?学历:(.*?)学位'
22 | basicinfor=re.findall(basic_re,basic)[0]
23 | item['姓名']=basicinfor[0]
24 | item['民族']=basicinfor[1]
25 | item['性别']=basicinfor[2]
26 | item['学历']=basicinfor[3]
27 | zhengshu=soup.find_all('div',{'class':'zhengshu'})
28 | for div in zhengshu:
29 | header=div.find('div',{'class':'zhengshu_head'}).get_text()
30 | profess=div.find('table').find_all('td')[-1].get_text().split(',')
31 | item[header]=profess
32 | return item
33 |
34 |
35 | def main():
36 | f=open('result.txt','a')
37 | count=0
38 | for line in open('person.txt','r').readlines():
39 | count+=1
40 | person=eval(line.replace('\n',''))
41 | try:
42 | item=get_infor(person)
43 | except:
44 | failed=open('person_failed.txt','a')
45 | failed.write(line)
46 | failed.close()
47 | print(person['name'],'failed')
48 | continue
49 | print(count)
50 | f.write(str(item)+'\n')
51 | f.close()
52 |
53 | main()
54 |
--------------------------------------------------------------------------------
/www.ncbi.nlm.nih.gov/gethtml.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import os
3 | import time
4 |
5 | def main():
6 | browser=webdriver.Firefox()
7 | browser.get('http://www.ncbi.nlm.nih.gov/pubmed')
8 | input('OK?')
9 | browser.implicitly_wait(10)
10 | count=0
11 | while True:
12 | html=browser.page_source
13 | f=open('html/%s.html'%count,'w')
14 | f.write(html)
15 | f.close()
16 | browser.find_element_by_xpath("//a[@id='EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page' and @sid=3]").click()
17 | time.sleep(5)
18 | count+=1
19 | if count==5330:
20 | break
21 |
22 | main()
23 |
--------------------------------------------------------------------------------
/www.ncbi.nlm.nih.gov/parser.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import os
3 |
4 | def parser():
5 | files=[]
6 | for filename in os.listdir('html'):
7 | files.append(filename)
8 | files.sort(key=lambda x:int(x.replace('.html','')))
9 | f=open('result.txt','a')
10 | for filename in files:
11 | html=open('html/'+filename,'r').read()
12 | try:
13 | table=BeautifulSoup(html,'lxml').find('div',{'class':'rprt_all'}).find_all('div',{'class':"rprt abstract"})
14 | except:
15 | continue
16 | for item in table:
17 | cit=item.find('div',{'class':'cit'})
18 | try:
19 | periodical=cit.find('a').get_text()
20 | except:
21 | periodical='-'
22 | try:
23 | date=cit.get_text().replace(periodical,'')
24 | except:
25 | date='-'
26 | try:
27 | title=item.find('h1').get_text()
28 | except:
29 | continue
30 | try:
31 | auths=item.find('div',{'class':'auths'}).find_all('a')
32 | except:
33 | auths=[]
34 | auth_num=len(auths)
35 | auth_name=''
36 | for a in auths:
37 | auth_name+=a.get_text()+';'
38 | try:
39 | afflist=item.find('div',{'class':'afflist'}).find_all('li')
40 | except:
41 | afflist=''
42 | auth_infor=''
43 | for li in afflist:
44 | auth_infor+=li.get_text()+'||'
45 | try:
46 | abstract=item.find('div',{'class':'abstr'}).get_text()
47 | except:
48 | abstract=''
49 | try:
50 | pmid=item.find('div',{'class':'aux'}).find('a',{'ref':'aid_type=pmid'}).get_text()
51 | except:
52 | pmid='-'
53 | f.write(str([pmid,periodical,date,title,auth_num,auth_name,auth_infor,abstract])+'\r\n')
54 | print(filename,'-ok')
55 | f.close()
56 | parser()
57 |
--------------------------------------------------------------------------------
/www.ncbi.nlm.nih.gov/write_to_excel.py:
--------------------------------------------------------------------------------
1 | import openpyxl
2 |
3 | def write_to_excel():
4 | excel=openpyxl.Workbook(write_only=True)
5 | sheet=excel.create_sheet()
6 | count=0
7 | filecount=1
8 | exist=[]
9 | for line in open('result.txt','r'):
10 | line=line.replace('\r\n','')
11 | item=eval(line)
12 | if item[0] in exist:
13 | continue
14 | exist.append(item[0])
15 | sheet.append(item)
16 | count+=1
17 | print(count)
18 | if count%100000==0:
19 | excel.save('%s.xlsx'%filecount)
20 | filecount+=1
21 | excel=openpyxl.Workbook(write_only=True)
22 | sheet=excel.create_sheet()
23 | excel.save('%s.xlsx'%filecount)
24 |
25 | write_to_excel()
26 |
--------------------------------------------------------------------------------
/www.pizzahut.com.cn/storelist.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import urllib
4 | import openpyxl
5 |
6 | def citys():
7 | html=open('index.html','r').read()
8 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'city_window'})[1].find_all('a')
9 | f=open('citys.txt','w')
10 | for item in table:
11 | f.write(item.get_text()+'\n')
12 | f.close()
13 |
14 | def get_store(city):
15 | city=urllib.parse.quote(city)
16 | headers = {
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Accept-Encoding': 'gzip, deflate',
20 | 'Cookie':"NSC_CX_QfstjtufodzHspvq=ffffffff09320b0745525d5f4f58455e445a4a423660; _u_=1; __RequestVerificationToken=tOMoZty3Jp6D53oSF-NqlfyAlPa0sRNndZ7PNG5iPrWgM_ngcVFEOP79uEvHJGuqlHDoAA3WDd1MN9QA8ZEhpurYLA0WSkuyswlEO9Nj9oqeMWnu84Q1fyQQYx5-vjq-73NNZXJJLcF9jq3fjB_dsw2; iplocation={}%7C0%7C0".format(city),
21 | 'User-Agent':"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0",
22 | 'Connection': 'keep-alive'}
23 | page=1
24 | result=[]
25 | while True:
26 | data={
27 | 'pageIndex':page,
28 | 'pageSize':"100",
29 | 'keyword':"输入餐厅地址或餐厅名称"
30 | }
31 | html=requests.post('http://www.pizzahut.com.cn/StoreList/Index',headers=headers,data=data).text
32 | soup=BeautifulSoup(html,'lxml').find_all('li')
33 | items=[]
34 | for li in soup:
35 | item=''
36 | try:
37 | for p in li.find('div',{'class':'re_RNew'}).find_all('p'):
38 | item+='|'+p.get_text()
39 | except:
40 | continue
41 | items.append(item)
42 | if items==[]:
43 | break
44 | result+=items
45 | page+=1
46 | return result
47 |
48 | def main():
49 | f=open('result.txt','a')
50 | for line in open('citys.txt','r'):
51 | city=line.replace('\n','')
52 | try:
53 | result=get_store(city)
54 | except:
55 | failed=open('failed.txt','a')
56 | failed.write(city+'\n')
57 | failed.close()
58 | continue
59 | for item in result:
60 | f.write(city+item+'\n')
61 | print(city,'ok')
62 | f.close()
63 |
64 | def write_to_excel():
65 | result={}
66 | excel=openpyxl.Workbook(write_only=True)
67 | sheet1=excel.create_sheet('1')
68 | for line in open('result.txt','r'):
69 | line=line.replace('\n','')
70 | lists=line.split('|')
71 | try:
72 | result[lists[1]]+=1
73 | except:
74 | result[lists[1]]=1
75 | sheet1.append(lists)
76 | sheet2=excel.create_sheet('2')
77 | for key in result:
78 | sheet2.append([key,result[key]])
79 | excel.save('result.xlsx')
80 |
81 | write_to_excel()
82 |
--------------------------------------------------------------------------------
/www.ppdai.com/excel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import xlwt3
3 |
4 | def excel():
5 | file_d=open('data.txt','r')
6 | excel_f=xlwt3.Workbook()
7 | sheet=excel_f.add_sheet('sheet')
8 | count=0
9 | for line in file_d.readlines():
10 | lists=line.replace('\n','').split('|')
11 | num=0
12 | for item in lists:
13 | try:
14 | text=item.split(':')[1]
15 | except:
16 | text=item
17 | sheet.write(count,num,text)
18 | num+=1
19 | count+=1
20 | excel_f.save('data.xls')
21 |
22 | excel()
23 |
--------------------------------------------------------------------------------
/www.teld.cn/setting/cities.txt:
--------------------------------------------------------------------------------
1 | 广州市
2 | 上海市
3 | 杭州市
4 | 成都市
5 | 南京市
6 |
--------------------------------------------------------------------------------
/www.tripadvisor.com/getpage.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import os
5 | import time
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate'}
12 |
13 | def main():
14 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS',headers=headers).text
15 | try:
16 | os.mkdir('page')
17 | except:
18 | pass
19 | count=0
20 | f=open('page/'+str(count)+'.html','w')
21 | f.write(html)
22 | f.close()
23 | count+=1
24 | num=10
25 | while True:
26 | try:
27 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'%num,headers=headers).text
28 | except:
29 | continue
30 | f=open('page/'+str(count)+'.html','w')
31 | f.write(html)
32 | f.close()
33 | num+=10
34 | print(num)
35 | count+=1
36 | if(num==8490):
37 | break
38 | time.sleep(2)
39 |
40 | main()
41 |
--------------------------------------------------------------------------------
/www.tripadvisor.com/moredata.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 |
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate'}
12 |
13 |
14 | def getdata(target,viewid):
15 | html=requests.get('https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1'%(target,viewid),headers=headers).text
16 | table=BeautifulSoup(html,'lxml').find_all('div',attrs={'class':'innerBubble'})
17 | result=[]
18 | for item in table:
19 | text=item.find('div',attrs={'class':'entry'}).get_text().replace('\r','').replace('\n','')+'||'
20 | try:
21 | text+=item.find('div',attrs={'class':'recommend'}).get_text().replace('\r','').replace('\n','')
22 | except:
23 | text+='--'
24 | result.append(text)
25 | return result
26 |
27 | def main():
28 | f=open('result.txt','a')
29 | viewids=[]
30 | lines=[]
31 | count=0
32 | for line in open('data.txt','r'):
33 | line=line.replace('\n','')
34 | lines.append(line)
35 | viewid=line.split('||')[1].split('-')[-1].replace('SRC_','')
36 | viewids.append(viewid)
37 | if(len(viewids)<20):
38 | continue
39 | text=''
40 | for id in viewids:
41 | text+=id+','
42 | result=getdata(viewids[0],text[:-1])
43 | print(len(result))
44 | for num in range(len(lines)):
45 | f.write(lines[num]+'||'+result[num]+'\n')
46 | viewids.clear()
47 | lines.clear()
48 | count+=1
49 | print(count,'--ok')
50 | text=''
51 | for id in viewids:
52 | text+=id+','
53 | result=getdata(viewids[0],text[:-1])
54 | for num in range(len(lines)):
55 | f.write(lines[num]+'||'+result[num]+'\n')
56 | viewids.clear()
57 | lines.clear()
58 | f.close()
59 |
60 | main()
61 |
--------------------------------------------------------------------------------
/www.tripadvisor.com/userinfor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import threading
6 |
7 |
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate'}
13 |
14 |
15 | class Infor(threading.Thread):
16 | def __init__(self,line):
17 | super(Infor,self).__init__()
18 | self.line=line
19 | self.uid=self.line.split('||')[1].split('-')[0].replace('UID_','')
20 |
21 | def run(self):
22 | try:
23 | html=requests.get('https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId='%self.uid,headers=headers,timeout=50).text
24 | except:
25 | self.result='--'
26 | self.line+='||'+self.result
27 | return
28 | try:
29 | self.result=BeautifulSoup(html,'lxml').find('ul',attrs={'class':'memberdescription'}).find_all('li')[1].get_text().replace('\r','').replace('\n','')
30 | except:
31 | self.result='--'
32 | self.line+='||'+self.result
33 |
34 |
35 | def main():
36 | f=open('re_data.txt','a')
37 | threadings=[]
38 | lines=[]
39 | count=0
40 | for line in open('result.txt','r'):
41 | line=line.replace('\n','')
42 | lines.append(line)
43 | if(len(lines)<20):
44 | continue
45 | for line in lines:
46 | work=Infor(line)
47 | threadings.append(work)
48 | for work in threadings:
49 | work.start()
50 | for work in threadings:
51 | work.join()
52 | for work in threadings:
53 | f.write(work.line+'\n')
54 | count+=1
55 | print(count,'--ok')
56 | threadings.clear()
57 | lines.clear()
58 |
59 | main()
60 |
--------------------------------------------------------------------------------
/www.variflight.com/icon/0/20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/20.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/0/23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/23.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/1/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/1.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/1/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/4.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/2/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/0.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/2/33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/33.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/24/117.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/117.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/24/304.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/304.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/24/783.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/783.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/3/43.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/43.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/3/64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/64.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/4/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/3.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/4/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/9.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/44/141.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/141.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/44/88.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/88.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/5/71.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/71.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/5/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/8.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/6/19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/19.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/6/51.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/51.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/6/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/6.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/7/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/16.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/7/26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/26.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/8/93.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/93.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/8/98.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/98.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/9/21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/21.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/9/31.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/31.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/b/2202.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2202.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/b/2248.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2248.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/m/2397.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2397.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/m/2408.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2408.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/m/2419.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2419.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/s/2245.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2245.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/s/2413.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2413.png
--------------------------------------------------------------------------------
/www.variflight.com/icon/s/2424.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2424.png
--------------------------------------------------------------------------------
/www.yhd.com/data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/data.xls
--------------------------------------------------------------------------------
/www.yhd.com/replace.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/replace.py
--------------------------------------------------------------------------------
/www.yhd.com/shopinfor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import xlwt3
6 | import re
7 |
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "Connection": "keep-alive",
13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
14 |
15 | def get_urls(url):
16 | try:
17 | html=requests.get(url,headers=headers,timeout=50).text
18 | except:
19 | return []
20 | rel='(http://shop.yhd.com/m-\d+.html)'
21 | urls=re.findall(rel,html)
22 | urls=list(set(urls))
23 | try:
24 | html=requests.get(url+'&isGetMoreProducts=1',headers=headers,timeout=50).text
25 | urls+=re.findall(rel,html)
26 | urls=list(set(urls))
27 | except:
28 | print('--')
29 | return urls
30 |
31 | def get_infor(url):
32 | html=requests.get(url,headers=headers).text
33 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'shop-des'}).find_all('li')
34 | item={}
35 | item['url']=url
36 | try:
37 | item['name']=table[0].find('span').get_text()
38 | except:
39 | item['name']=''
40 | try:
41 | item['city']=table[1].find('span').get_text()
42 | except:
43 | item['city']=''
44 | try:
45 | item['tel']=table[2].find('span').get_text()
46 | except:
47 | item['tel']=''
48 | return item
49 |
50 | def main():
51 | excel_f=xlwt3.Workbook()
52 | sheet=excel_f.add_sheet('sheet')
53 | count=0
54 | list_url=input("输入商铺链接:")
55 | list_url=list_url.replace('list.yhd.com/','list.yhd.com/searchPage/')
56 | page=1
57 | while True:
58 | urls=get_urls(re.sub('p\d','p'+str(page),list_url))
59 | if(urls==[]):
60 | break
61 | for url in urls:
62 | try:
63 | item=get_infor(url)
64 | except:
65 | continue
66 | sheet.write(count,0,item['name'])
67 | sheet.write(count,1,item['city'])
68 | sheet.write(count,2,item['tel'])
69 | sheet.write(count,3,item['url'])
70 | count+=1
71 | print(count)
72 | excel_f.save('data.xls')
73 | page+=1
74 |
75 | main()
76 |
--------------------------------------------------------------------------------
/www.zdic.net/write_to_excel.py:
--------------------------------------------------------------------------------
1 | import openpyxl
2 | import os
3 | from bs4 import BeautifulSoup
4 | import re
5 |
6 |
7 | def load_result_1():
8 | result=[]
9 | for line in open('result.txt','r'):
10 | item=eval(line)
11 | baseinfor=item['baseinfor']
12 | for word in item['words']:
13 | line=word[:-1]
14 | des=''
15 | for p in word[-1]:
16 | des+=p+'\n'
17 | result.append(line+baseinfor+[des,item['url']])
18 | return result
19 |
20 | def load_result_2():
21 | result=[]
22 | for line in open('result.txt','r'):
23 | item=eval(line)
24 | baseinfor=item['baseinfor']
25 | for word in item['words']:
26 | line=word[:-1]
27 | num=1
28 | for p in word[-1]:
29 | text=BeautifulSoup(p,'lxml').get_text()
30 | text=re.sub('(\d+. )|◎ ','',text)
31 | result.append(line+baseinfor+[num,text,item['url']])
32 | num+=1
33 | return result
34 |
35 | def write_to_excel(result,filename):
36 | excel=openpyxl.Workbook(write_only=True)
37 | sheet=excel.create_sheet()
38 | for line in result:
39 | sheet.append(line)
40 | excel.save(filename)
41 |
42 | result=load_result_1()
43 | write_to_excel(result,'result_1.xlsx')
44 |
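
load_result_2() builds the one-row-per-definition layout but is never invoked; presumably the second workbook would be produced the same way:

result=load_result_2()
write_to_excel(result,'result_2.xlsx')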
--------------------------------------------------------------------------------
/www.zhongchou.com/Duplicate.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import os
4 |
5 | def Duplicate():
6 | for filename in os.listdir('.'):
7 | if filename.endswith('txt'):
8 | lines=open(filename,'r').readlines()
9 | lines=list(set(lines))
10 | lines.sort()
11 | f=open(filename,'w')
12 | for line in lines:
13 | f.write(line)
14 | f.close()
15 |
16 | Duplicate()
17 |
--------------------------------------------------------------------------------
/www.zhongchou.com/excel.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import xlwt3
4 |
5 | def write():
6 | f=xlwt3.Workbook()
7 | sheet=f.add_sheet('sheet')
8 | file_f=open('D.txt','r')
9 | num=1
10 | head=['项目','id','进展数','评论数','最小金额','人数','video','类型','地区','支持人数','已筹款','比例','目标筹资','关注']
11 | count=0
12 | for item in head:
13 | sheet.write(0,count,item)
14 | count+=1
15 | for line in file_f.readlines():
16 | lists=line.replace('\n','').split('|')
17 | for count in range(14):
18 | sheet.write(num,count,lists[count])
19 | num+=1
20 | f.save('data.xls')
21 |
22 | write()
23 |
--------------------------------------------------------------------------------
/www.zhongchou.com/get_id.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 |
6 | def get_id():
7 | f=open('ids.txt','a')
8 | headers = {
9 | 'Host':"www.zhongchou.com",
10 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
12 | 'Accept-Language': 'en-US,en;q=0.5',
13 | 'Accept-Encoding': 'gzip, deflate',
14 | 'Connection': 'keep-alive'}
15 | for page in range(150):
16 | html=requests.get('http://www.zhongchou.com/browse/re-p'+str(page+1),headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore')
17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'sousuoListBox clearfix'}).find_all('div',attrs={'class':'ssCardItem'})
18 | for item in table:
19 | text=''
20 | p=item.find('h3').find('a')
21 | text=p.get('title')+'|'+p.get('href').replace('http://www.zhongchou.com/deal-show/id-','')+'\n'
22 | print(text)
23 | f.write(text)
24 | print(page)
25 | f.close()
26 |
27 | get_id()
28 |
--------------------------------------------------------------------------------
/www.zhongchou.com/other.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 |
6 | def get_infor(text):
7 | headers = {
8 | 'Host':"www.zhongchou.com",
9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11 | 'Accept-Language': 'en-US,en;q=0.5',
12 | 'Accept-Encoding': 'gzip, deflate',
13 | 'Connection': 'keep-alive'}
14 | id=text.split('|')[1]
15 | try:
16 | html=requests.get('http://www.zhongchou.com/deal-show/id-'+id,headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore')
17 | except:
18 | return None
19 | table=BeautifulSoup(html,'html.parser').find('div',attrs={'class':'mainIn02Box'})
20 | title=table.find('div',attrs={'class':'jlxqTitleText siteIlB_box'}).find_all('div')
21 | text+='|'+title[0].get_text().replace('\n','')
22 | text+='|'+title[1].get_text()
23 | right_table=table.find('div',attrs={'class':'xqDetailRight'})
24 | su_table=right_table.find('div',attrs={'class':"xqDetailDataBox"}).find_all('div')
25 | text+='|'+su_table[0].find('p').get_text()
26 | text+='|'+su_table[1].find('p').get_text()
27 | su_table=right_table.find('div',attrs={'class':'xqRatioOuterBox'})
28 | text+='|'+su_table.find('p').get_text()+'|'+su_table.find('b').get_text()
29 | su_table=right_table.find('div',attrs={'class':'xqDetailBtnBox'}).find('a',id='deal_detail_like')
30 | text+='|'+su_table.find('b').get_text()
31 | return text
32 |
33 | def main():
34 | file_d=open('data.txt','r')
35 | data_f=open('other.txt','a')
36 | num=0
37 | for line in file_d.readlines():
38 | try:
39 | text=get_infor(line.replace('\n',''))
40 | except:
41 | continue
42 | if text==None:
43 | continue
44 | data_f.write(text+'\n')
45 | num+=1
46 | print(num)
47 | data_f.close()
48 |
49 | main()
50 |
--------------------------------------------------------------------------------
/wwwapps.ups.com/write2excel.py:
--------------------------------------------------------------------------------
1 | import openpyxl
2 |
3 | def load_data():  # read the key list from 'data' and build a symmetric key->key->value map from result.txt
4 | keys=[line.replace('\n','').replace(' ','') for line in open('data','r')]
5 | data={}
6 | for line in open('result.txt','r'):
7 | line=line.replace('\n','').split('-')
8 | try:
9 | data[line[0]][line[1]]=int(line[-1])
10 | except:
11 | data[line[0]]={}
12 | data[line[0]][line[1]]=int(line[-1])
13 | try:
14 | data[line[1]][line[0]]=int(line[-1])
15 | except:
16 | data[line[1]]={}
17 | data[line[1]][line[0]]=int(line[-1])
18 | return keys,data
19 |
20 | def write_to_excel():  # build two key-by-key matrix sheets in result.xlsx: raw values, then a 1/0 view
21 | keys,data=load_data()
22 | excel=openpyxl.Workbook(write_only=True)
23 | sheet=excel.create_sheet()
24 | line=['']
25 | for key in keys:
26 | if len(key)==4:
27 | key='0'+key
28 | line.append(key)
29 | sheet.append(line)
30 | for key in keys:
31 | if len(key)==4:
32 | key='0'+key
33 | line=[key]
34 | for another_key in keys:
35 | if len(another_key)==4:
36 | another_key='0'+another_key
37 | if key==another_key:
38 | line.append(1)
39 | else:
40 | try:
41 | line.append(data[key][another_key])
42 | except:
43 | line.append('')
44 | sheet.append(line)
45 | sheet=excel.create_sheet()
46 | line=['']
47 | for key in keys:
48 | if len(key)==4:
49 | key='0'+key
50 | line.append(key)
51 | sheet.append(line)
52 | for key in keys:
53 | if len(key)==4:
54 | key='0'+key
55 | line=[key]
56 | for another_key in keys:
57 | if len(another_key)==4:
58 | another_key='0'+another_key
59 | if key==another_key:
60 | line.append(1)
61 | else:
62 | try:
63 | value=data[key][another_key]
64 | if value!=1:
65 | value=0
66 | line.append(value)
67 | except:
68 | line.append('')
69 | sheet.append(line)
70 | excel.save('result.xlsx')
71 |
72 | write_to_excel()
73 |
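The two sheet-building loops above are identical except for how a found value is rendered (raw value vs. a 1/0 flag). A possible refactor, sketched under the assumption that keys and data keep the shapes returned by load_data(); build_sheet and write_matrix are illustrative names, not part of the original:

import openpyxl

def pad(code):
    # zero-pad 4-digit codes, as both loops above do
    return '0' + code if len(code) == 4 else code

def build_sheet(excel, keys, data, transform):
    sheet = excel.create_sheet()
    padded = [pad(k) for k in keys]
    sheet.append([''] + padded)
    for key in padded:
        row = [key]
        for other in padded:
            if key == other:
                row.append(1)
            elif other in data.get(key, {}):
                row.append(transform(data[key][other]))
            else:
                row.append('')
        sheet.append(row)

def write_matrix(keys, data):
    excel = openpyxl.Workbook(write_only=True)
    build_sheet(excel, keys, data, transform=lambda v: v)                   # sheet 1: raw values
    build_sheet(excel, keys, data, transform=lambda v: 1 if v == 1 else 0)  # sheet 2: 1/0 view
    excel.save('result.xlsx')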
--------------------------------------------------------------------------------
/xxgk.jl.gov.cn/infor.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import openpyxl
4 | import re
5 |
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
10 | 'Accept-Language': 'en-US,en;q=0.5',
11 | 'Accept-Encoding': 'gzip, deflate',
12 | 'Connection': 'keep-alive'}
13 |
14 | def geturls():  # page through the public-information directory and write 'title|| date ||id' lines to urls.txt
15 | f=open('urls.txt','a')
16 | page=1
17 | while True:
18 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/depListDir1.jsp?department_name=%CB%F9%D3%D0&pageNo='+str(page),headers=headers).text
19 | table=BeautifulSoup(html,'lxml').find_all('div',style='display:none;')
20 | for item in table:
21 | try:
22 | pid=item.get('id').replace('_text','')
23 | item=str(item).replace('\r','').replace('\n','')
24 | items=BeautifulSoup(item,'lxml').find_all('a')
25 | title=items[2].get_text()
26 | date=items[3].get_text()
27 | line=title+'|| '+date+' ||'+pid
28 | f.write(line.replace('\r','').replace('\n','')+'\n')
29 | except:
30 | continue
31 | print(page,'ok')
32 | page+=1
33 | if page==937:
34 | break
35 | f.close()
36 |
37 | def getinfor(pid):  # fetch one record's detail page and return 'issuing agency||body text'
38 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/showDir.jsp?keyid='+pid,headers=headers,timeout=30).text
39 | tables=BeautifulSoup(html,'lxml').find_all('table',width=700)
40 | text=tables[0].get_text().replace('\r','').replace('\n','')
41 | try:
42 | location=re.findall('发布机构:(.*?)生成日期',text)[0]
43 | except:
44 | location='--'
45 | text=tables[1].get_text().replace('\r','').replace('\n','')
46 | return location+'||'+text
47 |
48 | def main():
49 | f=open('result.txt','a')
50 | for line in open('urls.txt','r'):
51 | line=line.replace('\n','')
52 | try:
53 | result=getinfor(line.split('||')[-1].replace(' ',''))
54 | except:
55 | failed=open('failed','a')
56 | failed.write(line+'\n')
57 | failed.close()
58 | continue
59 | f.write(line+'||'+result+'\n')
60 | print(line)
61 | f.close()
62 |
63 | main()
64 |
--------------------------------------------------------------------------------
/zhidao.baidu.com/question.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import threading
5 |
6 | headers = {
7 | 'Host':"zhidao.baidu.com",
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | class Ques(threading.Thread):  # worker thread: fetch one question page and pull out title, description and answer
15 | def __init__(self,line):
16 | super(Ques,self).__init__()
17 | self.line=line
18 | self.url=line.split('||')[-1]
19 | self.word=line.split('||')[0]
20 |
21 | def run(self):
22 | self.status=True
23 | try:
24 | self.data=self.question()
25 | except:
26 | self.status=False
27 |
28 | def question(self):
29 | html=requests.get(self.url,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore')
30 | table=BeautifulSoup(html,'lxml').find('article',id='qb-content')
31 | header=table.find('div',id='wgt-ask')
32 | title=header.find('span',{'class':'ask-title'}).get_text()
33 | try:
34 | des=header.find('span',{'class':'con'}).get_text()
35 | except:
36 | des='-'
37 | try:
38 | answer=table.find('div',{'class':['bd','answer']}).find('pre').get_text()
39 | except:
40 | try:
41 | answer=table.find('div',{'id':'wgt-answers'}).find('span',{'class':'con'}).get_text()
42 | except:
43 | answer='-'
44 | return [title,des,answer]
45 |
46 | def main():
47 | f=open('result.txt','a')
48 | lines=[]
49 | count=0
50 | for line in open('./urls.txt','r'):
51 | line=line.replace('\n','')
52 | lines.append(line)
53 | if len(lines)<10:
54 | continue
55 | threadings=[]
56 | for item in lines:
57 | work=Ques(item)
58 | threadings.append(work)
59 | for work in threadings:
60 | work.start()
61 | for work in threadings:
62 | work.join()
63 | for work in threadings:
64 | if work.status==False:
65 | failed=open('question_failed','a')
66 | failed.write(work.line+'\n')
67 | failed.close()
68 | continue
69 | count+=1
70 | print(count)
71 | f.write(str([work.word]+work.data)+'\n')
72 | lines.clear()
73 | threadings=[]
74 | for item in lines:
75 | work=Ques(item)
76 | threadings.append(work)
77 | for work in threadings:
78 | work.start()
79 | for work in threadings:
80 | work.join()
81 | for work in threadings:
82 | if work.status==False:
83 | failed=open('question_failed','a')
84 | failed.write(work.line+'\n')
85 | failed.close()
86 | continue
87 | f.write(str([work.word]+work.data)+'\n')
88 | f.close()
89 |
90 | main()
91 |
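Both zhidao scripts run their threads in fixed-size batches, which is what forces the duplicated "leftover batch" block at the end of main(). A sketch of the same idea with concurrent.futures; fetch(line) stands in for the Ques.question() logic above and is only an assumption here:

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl(lines, fetch, workers=10):
    # run fetch() over all lines with at most `workers` concurrent requests,
    # collecting failures instead of writing a failed file inline
    results, failed = [], []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(fetch, line): line for line in lines}
        for future in as_completed(futures):
            line = futures[future]
            try:
                results.append((line, future.result()))
            except Exception:
                failed.append(line)
    return results, failed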
--------------------------------------------------------------------------------
/zhidao.baidu.com/search.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import threading
5 |
6 | headers = {
7 | 'Host':"zhidao.baidu.com",
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
13 |
14 | def search(key):  # search zhidao.baidu.com for the keyword and return the first question URL in the results
15 | html=requests.get('https://zhidao.baidu.com/search?lm=0&rn=10&pn=0&fr=search&ie=utf-8&word='+key,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore')
16 | table=BeautifulSoup(html,'lxml').find('div',{'class':'list-wraper'}).find_all('dl')
17 | for dl in table:
18 | try:
19 | url=dl.find('a').get('href')
20 | if 'zhidao.baidu.com/question' in url:
21 | return url
22 | except:
23 | continue
24 |
25 | class Search(threading.Thread):
26 | def __init__(self,key):
27 | super(Search,self).__init__()
28 | self.key=key
29 |
30 | def run(self):
31 | self.status=True
32 | try:
33 | self.url=search(self.key)
34 | except:
35 | self.status=False
36 |
37 | def main():
38 | f=open('urls.txt','w')
39 | lines=[]
40 | count=0
41 | for line in open('./failed_words','r'):
42 | line=line.replace('\n','')
43 | lines.append(line)
44 | if len(lines)<5:
45 | continue
46 | threadings=[]
47 | for item in lines:
48 | work=Search(item)
49 | threadings.append(work)
50 | for work in threadings:
51 | work.start()
52 | for work in threadings:
53 | work.join()
54 | for work in threadings:
55 | if work.status==False:
56 | continue
57 | if work.url==None:
58 | continue
59 | count+=1
60 | print(count)
61 | try:
62 | f.write(work.key+"||"+work.url+'\n')
63 | except:
64 | continue
65 | lines.clear()
66 | threadings=[]
67 | for item in lines:
68 | work=Search(item)
69 | threadings.append(work)
70 | for work in threadings:
71 | work.start()
72 | for work in threadings:
73 | work.join()
74 | for work in threadings:
75 | if work.status==False:
76 | continue
77 | if work.url==None:
78 | continue
79 | count+=1
80 | print(count)
81 | f.write(work.key+"||"+work.url+'\n')
82 | lines.clear()
83 | f.close()
84 | main()
85 |
--------------------------------------------------------------------------------
/zhihu/get_followee.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import json
6 |
7 |
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
10 | "Accept-Encoding": "gzip, deflate",
11 | "Accept-Language": "en-US,en;q=0.5",
12 | "Connection": "keep-alive",
13 | 'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
15 |
16 | def get_followe(ID,hashid):  # page through a user's followees via the ProfileFolloweesListV2 endpoint (20 per request) and return their names
17 | html=requests.get('https://www.zhihu.com/people/%s/followees'%ID,headers=headers).text
18 | xsrf=BeautifulSoup(html,'lxml').find('input',attrs={'name':'_xsrf'}).get('value')
19 | print(xsrf)
20 | count=0
21 | persons=[]
22 | while True:
23 | data={
24 | 'method':"next",
25 | 'params':'{"offset":%s,"order_by":"created","hash_id":"%s"}'%(count,hashid),
26 | '_xsrf':xsrf
27 | }
28 | try:
29 | html=requests.post('https://www.zhihu.com/node/ProfileFolloweesListV2',headers=headers,data=data).text
30 | except:
31 | continue
32 | try:
33 | jsondata=json.loads(html)['msg']
34 | except:
35 | return persons
36 | if(jsondata==[]):
37 | break
38 | for item in jsondata:
39 | name=BeautifulSoup(item,'lxml').find('a',attrs={'class':'zg-link'}).get('title')
40 | persons.append(name)
41 | count+=20
42 | return persons
43 |
44 | def main():
45 | f=open('followee.txt','a',encoding='utf-8')
46 | statue=True
47 | for line in open('data.txt','r').readlines():
48 | lists=line.split('||')
49 | name=lists[0]
50 | if(statue):
51 | if(name=='keso'):
52 | statue=False
53 | continue
54 | ID=lists[1]
55 | item={}
56 | item['name']=name
57 | item['id']=ID
58 | hashid=lists[3]
59 | item['followee']=get_followe(ID, hashid)
60 | f.write(str(item)+'\n')
61 | print(name)
62 | main()
63 |
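The script above sends one large hard-coded Cookie header with every call. A requests.Session keeps those default headers (and any cookies the server sets afterwards) in one place and reuses connections; the cookie string below is a placeholder, not a working credential:

import requests

def make_session(cookie_string):
    # one Session carries the default headers and login cookies for every request
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
        'Cookie': cookie_string,
    })
    return session

# session = make_session('z_c0=...; _xsrf=...')   # cookies copied from a logged-in browser
# html = session.get('https://www.zhihu.com/people/%s/followees' % ID).text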
--------------------------------------------------------------------------------
/zhihu/top500.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | import json
5 |
6 | def get_top(page):  # fetch one 50-user page of kanzhihu's follower ranking
7 | html=requests.get('http://api.kanzhihu.com/topuser/follower/%s/50'%page).text
8 | data=json.loads(html)['topuser']
9 | return data
10 |
11 | def main():
12 | f=open('persons.txt','a',encoding='utf-8')
13 | page=1
14 | while True:
15 | data=get_top(page)
16 | for item in data:
17 | text=item['name']+'||'+item['id']+'||'+str(item['follower'])+'||'+item['hash']
18 | f.write(text+'\n')
19 | print(page)
20 | page+=1
21 | if(page==20):
22 | break
23 | f.close()
24 |
25 | def followee():  # append kanzhihu userdetail2 stats to every line of persons.txt and write data.txt
26 | f=open('data.txt','a',encoding='utf-8')
27 | for line in open('persons.txt','r').readlines():
28 | line=line.replace('\n','')
29 | print(line)
30 | data=requests.get('http://api.kanzhihu.com/userdetail2/'+line.split('||')[-1]).text
31 | data=json.loads(data)
32 | line=line+'|| '+str(data['signature'])+'|| '+str(data['description'])+'|| '
33 | detail=data['detail']
34 | line=line+str(detail['ask'])+'|| '+str(detail['answer'])+'|| '+str(detail['post'])+'|| '+str(detail['agree'])+'|| '+str(detail['thanks'])+'|| '+str(detail['fav'])+'||'+str(detail['logs'])
35 | f.write(line.replace('\r','').replace('\n','')+'\n')
36 |
37 | followee()
38 |
--------------------------------------------------------------------------------
/zhihu/zhihuinfor.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 | headers = {
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "en-US,en;q=0.5",
11 | "Connection": "keep-alive",
12 | 'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
14 |
15 | def get_topics(ID):  # collect the names of the topics the user follows, retrying recursively on failure
16 | try:
17 | html=requests.get('https://www.zhihu.com/people/%s/topics'%ID,headers=headers).text
18 | table=BeautifulSoup(html,'lxml').find('div',id='zh-profile-topic-list').find_all('strong')
19 | topics=''
20 | for item in table:
21 | topics+=item.get_text()+','
22 | return topics[:-1]
23 | except:
24 | return get_topics(ID)
25 |
26 | def get_profile(ID):  # pull the grey profile fields from the user's homepage with a regex
27 | try:
28 | html=requests.get('https://www.zhihu.com/people/%s'%ID,headers=headers).text
29 | rel='class="zg-gray-darker">(.*?)<'  # the closing tag appears to have been stripped from this dump; matching up to the next '<' keeps the capture non-empty
30 | table=re.findall(rel,html)
31 | profile=''
32 | for item in table:
33 | profile+=item+','
34 | return profile[:-1]
35 | except:
36 | return get_profile(ID)
37 |
38 | def main():
39 | f=open('person.txt','a',encoding='utf-8')
40 | statue=True
41 | for line in open('data.txt','r').readlines():
42 | line=line.replace('\n','')
43 | ID=line.split('||')[1]
44 | if(statue):
45 | if(ID=='kun-yu'):
46 | statue=False
47 | continue
48 | topics=get_topics(ID)
49 | profile=get_profile(ID)
50 | f.write(line+'||'+topics+'||'+profile+'\n')
51 | print(line)
52 | f.close()
53 |
54 | main()
55 |
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/markhistory.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 |
4 | # import modules
5 | import requests
6 | from bs4 import BeautifulSoup
7 | import re
8 | import sqlite3
9 | import os
10 |
11 | # get the provinces the school recruits from
12 | def get_provinces():
13 | # request the page and get its HTML source
14 | html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
15 | # parse the page and locate the province dropdown (tied to the page's HTML structure)
16 | table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
17 | provinces={}
18 | # collect the province names
19 | for option in table:
20 | provinces[option.get_text()]=option.get('value')
21 | return provinces
22 |
23 | # get the admitted majors, their scores and other details
24 | def parser(year,aid,province):
25 | # build the URL, request the page and get its source
26 | url='http://zsb.suda.edu.cn/view_markhistory.aspx?aa=%s年%s各专业录取分数一览表&aid=%s&ay=%s'%(year,province,aid,year)
27 | print(url)
28 | html=requests.get(url).text
29 | # parse the page and extract the detail table
30 | table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
31 | items=[]
32 | # walk every table row and collect its cells
33 | for tr in table:
34 | item=[year,province]
35 | for td in tr.find_all('td'):
36 | item.append(td.get_text().replace('\n',''))
37 | items.append(item)
38 | return items
39 |
40 | def main():
41 | try:
42 | os.remove('data.db')
43 | except:
44 | pass
45 | # connect to the database
46 | conn=sqlite3.connect('data.db')
47 | # create a cursor
48 | cursor=conn.cursor()
49 | # create the data table
50 | cursor.execute("create table if not exists markhistory(year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
51 | # years to crawl
52 | need_years=['2015','2014','2013']
53 | # get the admission provinces
54 | provinces=get_provinces()
55 | # fetch every province's data for every year
56 | for year in need_years:
57 | for key in provinces:
58 | # fetch per-major data for the given year and province
59 | try:
60 | items=parser(year,provinces[key],key)
61 | except:
62 | continue
63 | for item in items:
64 | # insert into the database
65 | cursor.execute('insert into markhistory(year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
66 | # commit the transaction
67 | conn.commit()
68 | # print progress
69 | print(year,key,'--ok')
70 | # close the cursor
71 | cursor.close()
72 | # close the database connection
73 | conn.close()
74 |
75 | main()
76 |
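Both markhistory scripts build their INSERT statements by concatenating str(tuple(item)), which breaks as soon as a field contains a quote character. A parameterized insert, sketched here against the same markhistory table created in main(), avoids that; insert_rows is an illustrative helper, not part of the original:

import sqlite3

def insert_rows(conn, items):
    # items: an iterable of 9-element sequences matching the column order below
    cursor = conn.cursor()
    cursor.executemany(
        'insert into markhistory(year,province,professional,length,category,'
        'numbers,highest,minimum,average) values (?,?,?,?,?,?,?,?,?)',
        items)
    conn.commit()
    cursor.close()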
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/new_markhistory.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import sqlite3
6 | import os
7 | import re
8 |
9 |
10 | # get the provinces the school recruits from
11 | def get_provinces():
12 | # request the page and get its HTML source
13 | html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
14 | # parse the page and locate the province dropdown (tied to the page's HTML structure)
15 | table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
16 | provinces=[]
17 | # collect the province names
18 | for option in table:
19 | provinces.append(option.get_text())
20 | return provinces
21 |
22 | def get_school():
23 | # request the page and get its HTML source
24 | html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
25 | # parse the page and locate the college dropdown (tied to the page's HTML structure)
26 | table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList3').find_all('option')
27 | school=[]
28 | # collect the college names
29 | for option in table:
30 | school.append(option.get_text())
31 | return school
32 |
33 | # get the admitted majors, their scores and other details
34 | def parser(year,province,school):
35 | # build the URL, request the page and get its source
36 | url='http://zsb.suda.edu.cn/search.aspx?nf=%s&sf=%s&xy=%s'%(year,province,school)
37 | html=requests.get(url).text
38 | # parse the page and extract the detail table
39 | table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
40 | items=[]
41 | # walk every table row and collect its cells
42 | for tr in table:
43 | item=[]
44 | for td in tr.find_all('td'):
45 | item.append(td.get_text().replace('\n',''))
46 | items.append(item)
47 | return items
48 |
49 | def main():
50 | try:
51 | os.remove('data.db')
52 | except:
53 | pass
54 | # connect to the database
55 | conn=sqlite3.connect('data.db')
56 | # create a cursor
57 | cursor=conn.cursor()
58 | # create the data table
59 | cursor.execute("create table if not exists markhistory(school varchar(80),year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
60 | # years to crawl
61 | need_years=['2015','2014','2013']
62 | # get the admission provinces
63 | provinces=get_provinces()
64 | schools=get_school()
65 | # fetch every province's data for every year
66 | for year in need_years:
67 | for province in provinces:
68 | for school in schools:
69 | # fetch per-major data for the given year and province
70 | index=schools.index(school)+1
71 | if(index>19):
72 | index+=2
73 | try:
74 | items=parser(year,provinces.index(province)+1,index)
75 | except:
76 | continue
77 | for item in items:
78 | item.insert(2, school)
79 | # insert into the database
80 | cursor.execute('insert into markhistory(school,year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
81 | # commit the transaction
82 | # print progress
83 | print(school,year,province,'--ok')
84 | conn.commit()
85 | # close the cursor
86 | cursor.close()
87 | # close the database connection
88 | conn.close()
89 |
90 | main()
91 |
--------------------------------------------------------------------------------