├── 1688 └── get_tel.py ├── 58.213.159.173 └── jiangsu_Atmosphere.py ├── JobGet └── JobInforGet.py ├── Nyspider.py ├── README.md ├── ali_comments ├── fan_jian.py ├── langconv.py ├── taobao.py ├── tianmao.py └── zh_wiki.py ├── amap └── amap.py ├── amazon ├── get_items.py ├── items_usa.py └── shopProducts.py ├── anjuke ├── anjuke_hourse.py ├── community.py ├── get_house.py └── location.py ├── apk.91.com ├── Send_email.py ├── email_game.py └── email_soft.py ├── apps.fas.usda.gov └── psdQuery.py ├── aso100.com ├── aso100_ui.py └── aso100_ui_v2.py ├── baidu.lecai.com ├── lottery.py └── www.zy91.com │ └── zndz.py ├── baidumap ├── baidumap.py └── city_ids.txt ├── bbs.tianya.cn └── comments.py ├── bjguahao.gov.cn ├── bjguahao.py ├── bjguahao_v2.py └── bjguahao_v3.py ├── brokerbin.com ├── brokerbin.py ├── brokerbin_3.py ├── email_template ├── filter │ └── filter.txt └── send_email.py ├── buluo.qq.com └── images.py ├── chart.cp.360.cn └── charthistory.py ├── china.tandfonline.com └── search_article.py ├── club.qingdaonews.com └── article.py ├── cn.bing.com ├── bing_search.py └── urls.txt ├── data.cma.gov.cn ├── Duplicate.py └── get_data.py ├── datacenter.mep.gov.cn ├── air_dairy.py └── air_dairy_aqi.py ├── dianping ├── comments.txt ├── data │ ├── 上海.xls │ ├── 北京.xls │ ├── 南京.xls │ ├── 厦门.xls │ ├── 大连.xls │ ├── 天津.xls │ ├── 宁波.xls │ ├── 广州.xls │ ├── 成都.xls │ ├── 无锡.xls │ ├── 杭州.xls │ ├── 武汉.xls │ ├── 沈阳.xls │ ├── 济南.xls │ ├── 深圳.xls │ ├── 苏州.xls │ ├── 西安.xls │ ├── 郑州.xls │ ├── 重庆.xls │ ├── 长沙.xls │ └── 青岛.xls ├── get_info.py ├── memberlist.py ├── memberlist.txt ├── shopinfor.py └── shoplist.py ├── douban ├── dou_movie.py ├── dou_tv.py ├── movie_grade.py ├── movieinfor.py └── movies.txt ├── downloadbooks └── save_into_baiduyun.py ├── duapp2.drexel.edu ├── TMS.py ├── TMSCourse_Excel.py └── TMSCourse_Sqlite.py ├── finance.sina.com.cn ├── ManagerInfo.py └── codes.txt ├── finance.yahoo.com ├── finance.py └── new_finance.py ├── forecast.io ├── forecast.py └── getData.py ├── fsfc.fsjw.gov.cn └── house.py ├── gcjs.linfen.gov.cn └── company.py ├── hklock.com └── products.py ├── itslaw └── get_anli.py ├── jbk.39.net └── disease.py ├── job.qiaosiwang.com └── workinfor.py ├── job ├── Job_get.py └── REANME.md ├── landchina ├── infor.py └── landchina.py ├── lvyou.baidu.com └── guilin.py ├── mall.jd.com └── jd_shop.py ├── maoyan ├── Duplicate.py ├── get_infor.py └── maoyan.py ├── music.163.com └── music_lists.py ├── news.sohu.com └── news.py ├── news_get ├── cn.chinadaily.com.cn │ └── chinadaily.com.cn.py ├── people.com.cn │ └── people.com.cn.py ├── www.cankaoxiaoxi.com │ └── cankaoxiaoxi.com.py ├── www.eastday.com │ └── eastday.com.py ├── www.gmw.cn │ └── gmw.cn.py ├── www.haiwainet.cn │ └── haiwainet.cn.py ├── www.huanqiu.com │ └── huanqiu.com.py ├── www.youth.cn │ └── youth.cn.py └── www.zaobao.com │ └── zaobao.com.py ├── newseed.pedaily.cn └── invest.py ├── pan.baidu.com └── sharelink.py ├── qimingpian.com └── qimingpian.py ├── rank.kongzhong.com └── userInfor.py ├── stock.finance.qq.com ├── stk_holder.py ├── stkcode.py └── stkcode.txt ├── stock.jrj.com.cn └── flowhistory.py ├── taobao ├── suggest.py └── sycm.py ├── tur.bizdirlib.com └── bizdirlib.py ├── waimai.meituan.com └── orderlist.py ├── weibo ├── weibo.md └── weibo.py ├── weidian └── weidian.py ├── wenda.so.com ├── question.py └── search.py ├── wenshu.court.gov.cn └── download.py ├── worldfreightrates └── trates.py ├── www.18ladys.com └── 18ladys.py ├── www.360che.com └── products.py ├── www.3j1688.com └── 3j1688.py ├── www.58.com ├── JobInforGet.py ├── 
company.py ├── companyExcel.py └── sendemail.py ├── www.aihuishou.com └── get_price.py ├── www.airbnb.com ├── deal.py ├── roominfor.py ├── rooms.py └── userinfor.py ├── www.aqistudy.cn └── aqistudy.py ├── www.autozi.com ├── carBrandLetter.py ├── products.py └── products_infor.py ├── www.b8b8.tv ├── ballbar_mobile.py └── ballbar_pc.py ├── www.baikemy.com └── disease.py ├── www.cbooo.cn └── cbooo.py ├── www.chazidian.com └── yuwen.py ├── www.chealth.org.cn └── disease.py ├── www.china-10.com ├── china10.py └── excel.py ├── www.chuanlaoda.cn ├── CaptchaOCR.dll ├── chuanlaoda.py ├── py2exe_install.py ├── testdll.py └── x64 │ └── CaptchaOCR.dll ├── www.cjsyw.com └── ship.py ├── www.cofeed.com └── cofeed.py ├── www.cpbz.gov.cn ├── company.py └── write_to_excel.py ├── www.ctrip.com ├── comments.py ├── comments_bydate.py └── youtrip.py ├── www.dicos.com.cn ├── citys.txt └── storelist.py ├── www.eastmoney.com ├── company.py ├── guba.py ├── iguba.py ├── quote.py ├── transaction.py └── urls.txt ├── www.fang.com ├── get_hourse.py └── new_hourse.py ├── www.gamefaqs.com └── gameinfor.py ├── www.ganji.com └── ganji_tel.py ├── www.gewara.com └── reviews.py ├── www.guahao.com ├── doctor.py └── hospital.py ├── www.hexun.com └── hexun.py ├── www.ifeng.com └── fashionhealth.py ├── www.imdb.com ├── boxoffice.py ├── movies.py └── rottentomatoes.py ├── www.itjuzi.com ├── baseInvestevents.py ├── company.py ├── companylist.py ├── investevents.py ├── itjuzi.py └── tag_itjuzi.py ├── www.jfz.com └── products.py ├── www.jisilu.com ├── JiSiLu.py └── jisilu.py ├── www.kfc.com ├── citys.txt └── storelist.py ├── www.kimiss.com ├── Nyspider.py ├── baby.txt ├── baby_pro.txt ├── get_product.py └── man.txt ├── www.lagou.com └── lagou.py ├── www.lianjia.com └── lianjiahourse.py ├── www.liepin.com └── liepin.py ├── www.locoso.com └── locoso.py ├── www.mohurd.gov.cn ├── company.py ├── deal.py ├── registrar_thread.py └── registrarinfor.py ├── www.ncbi.nlm.nih.gov ├── gethtml.py ├── parser.py ├── pubmed.py └── write_to_excel.py ├── www.pizzahut.com.cn ├── citys.txt └── storelist.py ├── www.pm25.in └── pm25.py ├── www.ppdai.com ├── Tppdai.py ├── excel.py ├── get_data.py ├── invest.py ├── ppdai.py └── ppdaiInfor.py ├── www.renrendai.com └── renrendai.py ├── www.sxhouse.com.cn └── sxhouse.py ├── www.teld.cn ├── setting │ └── cities.txt └── teld.py ├── www.tichk.org └── travel_agent.py ├── www.tjcn.org └── patent.py ├── www.trademaps.cn └── trademaps.py ├── www.tripadvisor.com ├── deal.py ├── getpage.py ├── moredata.py └── userinfor.py ├── www.tyshbj.com.cn └── tyshbj.py ├── www.ukers.cn └── ukers.py ├── www.variflight.com ├── flights_num.txt ├── icon │ ├── 0 │ │ ├── 20.png │ │ └── 23.png │ ├── 1 │ │ ├── 1.png │ │ └── 4.png │ ├── 2 │ │ ├── 0.png │ │ └── 33.png │ ├── 3 │ │ ├── 43.png │ │ └── 64.png │ ├── 4 │ │ ├── 3.png │ │ └── 9.png │ ├── 5 │ │ ├── 71.png │ │ └── 8.png │ ├── 6 │ │ ├── 19.png │ │ ├── 51.png │ │ └── 6.png │ ├── 7 │ │ ├── 16.png │ │ └── 26.png │ ├── 8 │ │ ├── 93.png │ │ └── 98.png │ ├── 9 │ │ ├── 21.png │ │ └── 31.png │ ├── 24 │ │ ├── 117.png │ │ ├── 304.png │ │ └── 783.png │ ├── 44 │ │ ├── 141.png │ │ └── 88.png │ ├── b │ │ ├── 2202.png │ │ └── 2248.png │ ├── m │ │ ├── 2397.png │ │ ├── 2408.png │ │ └── 2419.png │ └── s │ │ ├── 2245.png │ │ ├── 2413.png │ │ └── 2424.png ├── ui_variflight.py └── variflight.py ├── www.vvic.com └── getitems.py ├── www.watchseries.li └── watchseries.py ├── www.we.com └── renrendai.py ├── www.yelp.com ├── restaurant_infor.py └── restaurants.py ├── www.yhd.com ├── data.xls ├── replace.py ├── 
shopinfor.py └── text.html ├── www.zdic.net ├── words.txt ├── write_to_excel.py └── zdic.py ├── www.zhongchou.com ├── Duplicate.py ├── excel.py ├── get_id.py ├── get_infor.py └── other.py ├── www.zimuzu.tv ├── movie_get.py └── tv_get.py ├── www.zy91.com └── zndz.py ├── wwwapps.ups.com ├── search.py └── write2excel.py ├── xxgk.jl.gov.cn └── infor.py ├── yangcong345.com └── yangcong345.py ├── zhidao.baidu.com ├── question.py └── search.py ├── zhihu ├── get_followee.py ├── top500.py ├── zhihu_search.py └── zhihuinfor.py └── zsb.suda.edu.cn ├── inquery.py ├── markhistory.py └── new_markhistory.py /Nyspider.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import os 5 | import sqlite3 6 | import xlwt3 7 | from email import encoders 8 | from email.header import Header 9 | from email.mime.text import MIMEText 10 | from email.utils import parseaddr,formataddr 11 | import smtplib 12 | import datetime 13 | 14 | headers = { 15 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 16 | "Accept-Encoding": "gzip, deflate", 17 | "Accept-Language": "en-US,en;q=0.5", 18 | "Connection": "keep-alive", 19 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 20 | 21 | 22 | def get_image(image_url,image_name): 23 | content=requests.get(image_url,headers=headers).content 24 | with open(image_name,'wb') as f: 25 | f.write(content) 26 | f.close 27 | 28 | def to_Excel(): 29 | for filename in os.listdir('.'): 30 | if(filename.endswith('txt')): 31 | f_d=open(filename,'r') 32 | f_ex=xlwt3.Workbook() 33 | sheet=f_ex.add_sheet('one') 34 | count=0 35 | for line in f_d.readlines(): 36 | lists=line.split('|') 37 | try: 38 | num=0 39 | for text in lists: 40 | sheet.write(count,num,text) 41 | num+=1 42 | count+=1 43 | except: 44 | sheet=f_ex.add_sheet('two') 45 | count=0 46 | num=0 47 | for text in lists: 48 | sheet.write(count,num,text) 49 | num+=1 50 | count+=1 51 | f_ex.save(filename.replace('txt','xls')) 52 | 53 | def send_email(email,subject,text,user,passwd): 54 | smtp_server='smtp.126.com' 55 | msg = MIMEText(text, 'plain', 'utf-8') 56 | msg['Subject']=subject 57 | msg['From'] = _format_addr(user) 58 | msg['To'] = _format_addr(email) 59 | server = smtplib.SMTP(smtp_server, 25) 60 | server.set_debuglevel(1) 61 | server.login(user, passwd) 62 | server.sendmail(user, [email], msg.as_string()) 63 | server.quit() 64 | 65 | def convert_html(html): 66 | return html.encode('ISO-8859-1').decode('utf-8','ignore') 67 | 68 | def Duplicate(): 69 | for filename in os.listdir('.'): 70 | if filename.endswith('txt'): 71 | lines=open(filename,'r').readlines() 72 | lines=list(set(lines)) 73 | lines.sort() 74 | f=open(filename,'w') 75 | for line in lines: 76 | f.write(line) 77 | f.close() 78 | 79 | def yesterday_get(today=datetime.datetime.now()): 80 | oneday = datetime.timedelta(days=1) 81 | yesterday = today- oneday 82 | return yesterday 83 | -------------------------------------------------------------------------------- /ali_comments/fan_jian.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from langconv import * 3 | import xlrd 4 | import xlwt3 5 | 6 | # 转换繁体到简体 7 | def run(): 8 | name='相机' 9 | f=xlwt3.Workbook(encoding='utf-8') 10 | sheet=f.add_sheet('sheet') 11 | data=xlrd.open_workbook(name+'.xls') 12 | table=data.sheets()[0] 13 | for i in range(table.nrows): 14 | line=table.cell(i,0).value 15 | line=fan_jian(line) 16 | 
sheet.write(i,0,line) 17 | f.save(name+'_.xls') 18 | 19 | 20 | def fan_jian(line): 21 | line = Converter('zh-hans').convert(line)#.decode('utf-8')) 22 | line = line#.encode('utf-8') 23 | return line 24 | 25 | def jian_fan(line): 26 | line = Converter('zh-hant').convert(line.decode('utf-8')) 27 | line = line.encode('utf-8') 28 | return line 29 | 30 | run() 31 | -------------------------------------------------------------------------------- /ali_comments/taobao.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import xlwt3 5 | import re 6 | import requests 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | class Get_comments(object): 10 | """docstring for Get_comments""" 11 | def __init__(self): 12 | super(Get_comments, self).__init__() 13 | self.f=xlwt3.Workbook() 14 | self.sheet=self.f.add_sheet('sheet') 15 | self.headers = { 16 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Accept-Encoding': 'gzip, deflate', 20 | 'Cookie':"isg=1895AE3ACA648D8B28455A6D1992F41F; l=AvX1ovPGHd3jI30I58r3v3IcJXuvcqmE; t=1e6dd9b5d55aacb2ca5e07cb5be03a2b; thw=cn; cna=7Dd0DgMB+HcCAXrNCByTSHxR; uc3=nk2=1pCplIlkFn7n&id2=WvAz2mB1qeE%2F&vt3=F8dASMh%2Fnu8OGgfEtGM%3D&lg2=URm48syIIVrSKA%3D%3D; tracknick=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; _cc_=URm48syIZQ%3D%3D; tg=0; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=np=&ci=-1_0&cyk=0_0; ali_ab=211.69.194.131.1444291484725.8; lgc=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; lzstat_uv=13179738183169067975|3492151@3600092@3288243@3260534; v=0; cookie2=1cdef8cc85ef4b19772fd48de808f9c0; _tb_token_=0BF8LVbNvUzT; uc1=cookie14=UoWzXLHAxnd7aw%3D%3D&existShop=true&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=WqG3DMC9Edo1SB5NB6Qtng%3D%3D&tag=2&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; hng=CN%7Czh-cn%7CCHN; existShop=MTQ0NTE1NzMzOQ%3D%3D; sg=343; cookie1=BYTvDkInmXl2wO%2F6AW0tX%2Bpb6nHX4a5Olly%2Fg4DvWfE%3D; unb=907324234; skt=ae45361e45082d58; publishItemObj=Ng%3D%3D; _l_g_=Ug%3D%3D; _nk_=%5Cu98A0%5Cu6C9B%5Cu4E4B%5Cu590F3; cookie17=WvAz2mB1qeE%2F", 21 | 'Connection': 'keep-alive'} 22 | self.count=0 23 | import ssl 24 | ssl._create_default_https_context = ssl._create_unverified_context 25 | self.url='https://rate.taobao.com/feedRateList.htm?callback=jsonp_reviews_list&userNumId=84131819&auctionNumId=6774286903&siteID=3&rateType=&orderType=sort_weight&showContent=1&attribute=¤tPageNum=' 26 | def run(self): 27 | cert='/home/nyloner/work/ali_comments/cert.pem' 28 | for page in range(80): 29 | html=requests.get(self.url+str(page+1),headers=self.headers,verify=False).text 30 | print(html) 31 | rel='content":"(.*?)"' 32 | comments=re.findall(rel,html) 33 | for item in comments: 34 | self.sheet.write(self.count,0,item) 35 | self.count+=1 36 | self.f.save('麻辣花生.xls') 37 | print(self.count) 38 | 39 | work=Get_comments() 40 | work.run() 41 | -------------------------------------------------------------------------------- /amap/amap.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | from bs4 import BeautifulSoup 5 | import random 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | def get_province(): 14 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&city=100000&geoobj=19.198221%7C11.793397%7C-172.051779%7C53.547635&keywords=%E5%B9%B2%E6%9E%9C',headers=headers).text 15 | data=json.loads(html) 16 | table=BeautifulSoup(data['html'],'lxml').find_all('div',{'class':'sug-province'}) 17 | f=open('citys.txt','a') 18 | for item in table: 19 | try: 20 | province=item.find('b').get_text() 21 | citys=item.find_all('a',{'class':'citycode'}) 22 | for city in citys: 23 | f.write(province+'|'+city.get_text()+'|'+city.get('adcode')+'\n') 24 | except: 25 | continue 26 | f.close() 27 | 28 | def search(key,citycode): 29 | page=1 30 | result=[] 31 | while True: 32 | html=requests.get('http://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=%s&qii=true&cluster_state=5&need_utd=true&div=PC1000&addr_poi_merge=true&is_classify=true&city=%s&keywords=%s'%(page,citycode,key),headers=headers).text 33 | data=json.loads(html)['data'][0]['list'] 34 | if data==[]: 35 | break 36 | for item in data: 37 | try: 38 | tel=item['templateData']['tel'] 39 | address=item['address'] 40 | name=item['name'] 41 | result.append(name+'| '+address+' |'+tel) 42 | except: 43 | continue 44 | page+=1 45 | print(citycode,page) 46 | time.sleep(random.randint(2,8)) 47 | return result 48 | 49 | def main(): 50 | for line in open('citys.txt','r'): 51 | line=line.replace('\n','') 52 | code=line.split('|')[-1] 53 | try: 54 | result=search('干果',code) 55 | except: 56 | failed=open('failed.txt','a') 57 | failed.write(line+'\n') 58 | failed.close() 59 | continue 60 | f=open('result.txt','a') 61 | for item in result: 62 | f.write(line+'|'+item+'\n') 63 | f.close() 64 | print(line) 65 | main() 66 | -------------------------------------------------------------------------------- /anjuke/location.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | def get_location(address,city): 6 | url='http://api.map.baidu.com/place/v2/search?query=%s®ion=%s&city_limit=true&output=json&ak=fh980b9Ga64S8bl8QblSC3kq'%(address,city) 7 | html=requests.get(url).text 8 | try: 9 | data=json.loads(html)['results'][0]['location'] 10 | except: 11 | return '' 12 | lng=data['lng'] 13 | lat=data['lat'] 14 | return str(lng)+'|'+str(lat) 15 | 16 | 17 | line=get_location('滨湖新区四川路与云谷路交口西北角','合肥') 18 | -------------------------------------------------------------------------------- /baidumap/baidumap.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import json 4 | import time 5 | import re 6 | 7 | 8 | headers = { 9 | 'Host':"map.baidu.com", 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def citys(): 17 | 
html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=%E6%B1%BD%E8%BD%A6%E7%BE%8E%E5%AE%B9%E5%BA%97&c=1&src=0&wd2=&sug=0&l=5&b=(7002451.220000001,1994587.88;19470675.22,7343963.88)&from=webmap&biz_forward={%22scaler%22:1,%22styles%22:%22pl%22}&sug_forward=&tn=B_NORMAL_MAP&nn=0&u_loc=12736591.152491,3547888.166124&ie=utf-8&t=1459951988807',headers=headers).text 18 | f=open('city_ids.txt','a') 19 | data=json.loads(html) 20 | for item in data['content']: 21 | #for city in item['city']: 22 | f.write(str(item)+'\n') 23 | f.close() 24 | 25 | def get_infor(keyword,code,page): 26 | html=requests.get('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=baidu&pcevaname=pc4.1&qt=con&from=webmap&c='+str(code)+'&wd='+keyword+'&wd2=&pn='+str(page)+'&nn='+str(page*10)+'&db=0&sug=0&addr=0&&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&tn=B_NORMAL_MAP&u_loc=12736591.152491,3547888.166124&ie=utf-8',headers=headers).text 27 | data=json.loads(html)['content'] 28 | return data 29 | 30 | 31 | def main(): 32 | keys=['眼镜店','视光中心'] 33 | for keyword in keys: 34 | f=open(keyword+'_tels.txt','a') 35 | for line in open('city_ids.txt','r').readlines(): 36 | line=line.replace('\n','') 37 | code=eval(line)['code'] 38 | page=1 39 | while True: 40 | try: 41 | data=get_infor(keyword,code,page) 42 | except: 43 | break 44 | if data==[]: 45 | break 46 | for item in data: 47 | f.write(str(item)+'\n') 48 | page+=1 49 | print(code,page) 50 | time.sleep(1) 51 | f.close() 52 | main() 53 | -------------------------------------------------------------------------------- /brokerbin.com/email_template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

9 | Hi {name}, 10 |

11 |

12 | We have the following quote matching your search on Brokerbin: 13 |

14 |

15 | {product_name} new landed, {price} each, 3 days delivery 16 |

17 |

18 | 19 | Click here to buy on our website. 20 | 21 |

22 |

23 | Please note that the special price is only provided after you log in. 24 |

25 |

26 | Thanks! 27 |

28 |

29 |

30 |

31 | -- 32 |

33 |

34 | *Please register at our web store, www.sailnetwork.com, to check P&A 24-7. Regular coupons will be sent to registered customers, and product prices will be lower online. 35 |
36 | Best regards, 37 |
38 | Sales Department | Sail Network Co., Ltd. 39 |
40 | Office: +86(0)2154223056*8004 41 |
42 | E-mail: sales@sailnetwork.com 43 |
44 | E-Shop: www.sailnetwork.com 45 |
46 | No. 3-318, Lane 7058, Zhongchun Rd., Shanghai, China 47 |

48 | 49 | 50 | -------------------------------------------------------------------------------- /brokerbin.com/filter/filter.txt: -------------------------------------------------------------------------------- 1 | nocsupply.com 2 | nfsmith.nl 3 | florinconnect.com 4 | 3c-systerms.com 5 | arbitech.com 6 | fulinetwork.com 7 | beaoncn.com 8 | marketconnections.nl 9 | globalnetworkstech.com 10 | konnect8.co.uk 11 | inventusgroup.com 12 | square1product.com 13 | squarelnc.com 14 | apexitltd.com 15 | uniontechcoop.com 16 | -------------------------------------------------------------------------------- /brokerbin.com/send_email.py: -------------------------------------------------------------------------------- 1 | from email import encoders 2 | from email.header import Header 3 | from email.mime.text import MIMEText 4 | from email.utils import parseaddr,formataddr 5 | import smtplib 6 | import time 7 | import os 8 | import json 9 | 10 | 11 | def _format_addr(s): 12 | name, addr = parseaddr(s) 13 | return formataddr((Header(name, 'utf-8').encode(), addr)) 14 | 15 | def sendEmail(fromemail,passwd,toemail,subject,text): 16 | msg = MIMEText(text, 'html', 'utf-8') 17 | msg['Subject']=subject 18 | msg['From'] = _format_addr(fromemail.replace('foxmail','sailnetwork')) 19 | msg['To'] = _format_addr(toemail) 20 | server=smtplib.SMTP_SSL('smtp.qq.com') 21 | server.ehlo('smtp.qq.com') 22 | server.login(fromemail,passwd) 23 | server.sendmail(fromemail, [toemail], msg.as_string()) 24 | server.quit() 25 | 26 | def load_emails(filename): 27 | f=open('email/'+filename,'r',encoding='utf-8').read() 28 | emails=[] 29 | for item in f.split('---'*8): 30 | try: 31 | lines=item.split('***'*4) 32 | subject=lines[0].replace('\r\n','') 33 | email=lines[1].replace('\r\n','').replace(' ','') 34 | text=lines[2] 35 | emails.append([email,subject,text]) 36 | except: 37 | continue 38 | return emails 39 | 40 | def load_login(): 41 | f=open('./email.json','r',encoding='utf8') 42 | data=json.load(f) 43 | return data 44 | 45 | def main(): 46 | try: 47 | data=load_login() 48 | fromemail=data['fromemail'] 49 | passwd=data['passwd'] 50 | toemail=data['toemail'] 51 | except: 52 | print("帐号导入失败") 53 | return 54 | for filename in os.listdir('email'): 55 | try: 56 | emails=load_emails(filename) 57 | except: 58 | print(filename,'load failed') 59 | for i in range(len(emails)): 60 | try: 61 | email=emails[i] 62 | subject=email[1].replace('\r','').replace('\n','').replace('\t','').replace(' ','')+'\t'+email[0].replace('\r','').replace('\n','').replace('\t','').replace(' ','') 63 | except: 64 | continue 65 | try: 66 | sendEmail(fromemail,passwd,toemail,subject,email[2]) 67 | time.sleep(2) 68 | print(subject,'send ok') 69 | except: 70 | print(subject,'failed') 71 | print(filename,'完成') 72 | 73 | main() 74 | time.sleep(60) 75 | -------------------------------------------------------------------------------- /buluo.qq.com/images.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import json 5 | import time 6 | 7 | 8 | headers = { 9 | 'X-Requested-With': 'XMLHttpRequest', 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | 'Referer': 'http://buluo.qq.com/mobile/barindex.html?_wv=1027&_bid=128&from=recentvisited&bid=15226', 15 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 
Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 16 | 17 | def get_page(bid,page): 18 | data={ 19 | 'bid':bid, 20 | 'num':'10', 21 | 'start':page*10, 22 | 'bkn':'' 23 | } 24 | html=requests.post('http://buluo.qq.com/cgi-bin/bar/post/get_post_by_page',headers=headers,data=data).text 25 | data=json.loads(html)['result']['posts'] 26 | result=[] 27 | for item in data: 28 | try: 29 | title=item['title'] 30 | pic_list=item['post']['pic_list'] 31 | except: 32 | continue 33 | result.append([title,pic_list]) 34 | return result 35 | 36 | def save_image(filedir,filename,img_url): 37 | headers = { 38 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 39 | "Accept-Encoding": "gzip, deflate", 40 | "Accept-Language": "en-US,en;q=0.5", 41 | "Connection": "keep-alive", 42 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 43 | content=requests.get(img_url,headers=headers,timeout=30).content 44 | with open('images/%s/%s.jpg'%(filedir,filename),'wb') as img: 45 | img.write(content) 46 | 47 | def main(): 48 | bid=input("输入bid:") 49 | try: 50 | startpage=input("起始页码:") 51 | startpage=int(startpage)-1 52 | except: 53 | startpage=0 54 | try: 55 | endpage=input("结束页码:") 56 | endpage=int(endpage)-1 57 | except: 58 | endpage=10 59 | filedir=1 60 | try: 61 | os.mkdir('images/') 62 | except: 63 | pass 64 | while startpage<=endpage: 65 | images=get_page(bid,startpage) 66 | for image in images: 67 | try: 68 | os.mkdir('images/'+str(filedir)) 69 | except: 70 | pass 71 | f=open('images/%s/content.txt'%filedir,'a',encoding='utf-8') 72 | f.write(image[0]) 73 | f.close() 74 | imgnum=1 75 | for img in image[1]: 76 | try: 77 | save_image(filedir,imgnum,img['url']) 78 | except: 79 | continue 80 | imgnum+=1 81 | print('page',startpage,filedir,'ok') 82 | filedir+=1 83 | startpage+=1 84 | print(startpage,'ok') 85 | time.sleep(2) 86 | 87 | main() 88 | -------------------------------------------------------------------------------- /chart.cp.360.cn/charthistory.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import datetime 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_history(date): 14 | url='http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span=%s_%s'%(date,date) 15 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk') 16 | tables=BeautifulSoup(html,'lxml').find('div',id='his-tab').find('table',{'width':'100%'}).find_all('table') 17 | result=[] 18 | for table in tables: 19 | for tr in table.find_all('tr'): 20 | try: 21 | tds=tr.find_all('td') 22 | number=tds[0].get_text() 23 | if number=='': 24 | continue 25 | value=tds[1].get_text() 26 | if value=='': 27 | continue 28 | value1=value[:3] 29 | value2=value[1:4] 30 | value3=value[2:] 31 | result.append([date,number,value,value1,value2,value3]) 32 | except: 33 | continue 34 | return result 35 | 36 | def nextday(d): 37 | oneday = datetime.timedelta(days=1) 38 | day = d+oneday 39 | return day 40 | 41 | def main(): 42 | day=datetime.datetime.strptime('2010-01-01','%Y-%m-%d') 43 | 
while True: 44 | str_day=str(day).split(' ')[0] 45 | f=open('result.txt','a') 46 | try: 47 | result=get_history(str_day) 48 | except: 49 | print(str_day,'failed') 50 | time.sleep(1) 51 | continue 52 | for item in result: 53 | f.write(str(item)+'\n') 54 | f.close() 55 | day=nextday(day) 56 | print(str_day,'ok') 57 | time.sleep(1) 58 | if str_day=='2016-10-23': 59 | break 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /china.tandfonline.com/search_article.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | def get_articles(): 14 | page=0 15 | while True: 16 | html=requests.get('http://china.tandfonline.com/action/doSearch?AllField=urban+design&Ppub=%5B20151107+TO+20161107%5D&content=standard&countTerms=true&target=default&sortBy=&pageSize=50&subjectTitle=&startPage='+str(page),headers=headers).text 17 | table=BeautifulSoup(html,'lxml').find('ol',{'class':'search-results'}).find_all('li') 18 | f=open('titles.txt','a') 19 | for item in table: 20 | title=item.find('article').get('data-title') 21 | f.write(title+'\n') 22 | f.close() 23 | page+=1 24 | print('抓取第',page,'页') 25 | #time.sleep(1) 26 | if page==267: 27 | break 28 | 29 | def word_cut(): 30 | text=open('./titles.txt','r').read() 31 | text=text.replace(':',' ').replace("?",' ').replace('.','').replace(')',' ').replace('(','').replace('+','').replace('“','').replace('”','').replace('\n','') 32 | words=text.split(' ') 33 | result={} 34 | for word in words: 35 | word=word.lower() 36 | try: 37 | result[word]+=1 38 | except: 39 | result[word]=1 40 | 41 | excel=openpyxl.Workbook(write_only=True) 42 | sheet=excel.create_sheet() 43 | for key in result: 44 | sheet.append([key,result[key]]) 45 | excel.save('result.xlsx') 46 | 47 | get_articles() 48 | -------------------------------------------------------------------------------- /club.qingdaonews.com/article.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | 5 | headers = { 6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 7 | "Accept-Encoding": "gzip, deflate", 8 | "Accept-Language": "en-US,en;q=0.5", 9 | "Connection": "keep-alive", 10 | 'Cookie':'PHPSESSID=d2a521b9298f8691e4c37487b6657ac3; Hm_lvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199772; Hm_lpvt_099a2f2a4f2c2f042dbd360b42309fc4=1482199852; CNZZDATA1000084976=1383072779-1482195841-null%7C1482195841; username=JarMrmn4olyPFzOAltjC0Q%3D%3D; password=jv2Y7Ga10EoO2Tn3W%2FY1plZvYz1QGqB2; NSC_dmvc=ffffffff09020e0445525d5f4f58455e445a4a423660', 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_article(endpage): 15 | page=1 16 | result=[] 17 | while True: 18 | url='http://club.qingdaonews.com/usercenter/mytopic.php?page=%s'%page 19 | try: 20 | html=requests.get(url,headers=headers,timeout=30).text 21 | except: 22 | continue 23 | table=BeautifulSoup(html,'lxml').find('div',{'class':'add_list'}).find_all('li') 24 | for li in table: 25 | try: 26 | 
url='http://club.qingdaonews.com'+li.find('a').get('href') 27 | title=li.find('a').get_text() 28 | result.append([title,url]) 29 | except: 30 | continue 31 | if page==endpage: 32 | break 33 | print(page,'ok') 34 | page+=1 35 | return result 36 | 37 | def main(): 38 | result=get_article(168) 39 | excel=openpyxl.Workbook(write_only=True) 40 | sheet=excel.create_sheet() 41 | for line in result: 42 | sheet.append(line) 43 | excel.save('urls.xlsx') 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /cn.bing.com/urls.txt: -------------------------------------------------------------------------------- 1 | www.azlyrics.com 2 | www.metrolyrics.com/ 3 | lyrics.wikia.com 4 | www.songlyrics.com 5 | www.musixmatch.com/ 6 | www.lyricsfreak.com/ 7 | www.lyricsmode.com/ 8 | www.directlyrics.com/ 9 | www.darklyrics.com/ 10 | www.allthelyrics.com 11 | www.sing365.com/ 12 | www.lyricsg.com 13 | www.parolesmania.com/ 14 | www.sweetslyrics.com 15 | azlyricdb.com 16 | www.musicsonglyrics.com/ 17 | www.honeyguide.co.uk 18 | songmeanings.com/ 19 | www.lyricsforsong.net/ 20 | www.elyrics.com 21 | www.lyricsreg.com 22 | batlyrics.net/ 23 | genius.com/ 24 | www.lyricspond.com/ 25 | artists.letssingit.com/ 26 | www.cduniverse.com/ 27 | www.leoslyrics.com/ 28 | www.lyrster.com/ 29 | www.smartlyrics.com/ 30 | www.lyrics007.com/ 31 | www.classic-country-song-lyrics.com/ 32 | -------------------------------------------------------------------------------- /data.cma.gov.cn/Duplicate.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import os 4 | 5 | def Duplicate(): 6 | for filename in os.listdir('.'): 7 | if filename.endswith('txt'): 8 | lines=open(filename,'r').readlines() 9 | lines=list(set(lines)) 10 | lines.sort() 11 | f=open(filename,'w') 12 | for line in lines: 13 | f.write(line) 14 | f.close() 15 | 16 | Duplicate() 17 | -------------------------------------------------------------------------------- /datacenter.mep.gov.cn/air_dairy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_table(url): 15 | html=requests.get(url,headers=headers).text 16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr') 17 | result=[] 18 | for tr in table[2:-3]: 19 | item='' 20 | for td in tr.find_all('td'): 21 | item+=td.get_text()+'|' 22 | result.append(item) 23 | return result 24 | 25 | def main(): 26 | text_f=open('2014_2016.txt','w',encoding='utf-8') 27 | startdate='2014-01-01'#起始日期 28 | enddate='2016-07-19'#结束日期 29 | startpage=1#起始页码 30 | endpage=10#结束页码 31 | while startpage<=endpage: 32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage) 33 | try: 34 | items=get_table(url) 35 | except: 36 | time.sleep(2) 37 | print(startpage,'-failed') 38 | continue 39 | for item in items: 40 | text_f.write(item+'\n') 41 | print(startpage,'-ok') 42 | startpage+=1 43 | text_f.close() 44 | write_to_excel() 45 | 46 | def write_to_excel(): 47 | 
excel=openpyxl.Workbook(write_only=True) 48 | sheet=excel.create_sheet() 49 | for line in open('2014_2016.txt','r',encoding='utf-8'): 50 | line=line.replace('\n','') 51 | sheet.append(line.split('|')) 52 | excel.save('2014_2016.xlsx') 53 | 54 | main() 55 | -------------------------------------------------------------------------------- /datacenter.mep.gov.cn/air_dairy_aqi.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_table(url): 15 | html=requests.get(url,headers=headers).text 16 | table=BeautifulSoup(html,'html.parser').find('table',id='report1').find_all('tr') 17 | result=[] 18 | for tr in table[2:-3]: 19 | item='' 20 | for td in tr.find_all('td'): 21 | item+=td.get_text()+'|' 22 | result.append(item) 23 | return result 24 | 25 | def main(): 26 | text_f=open('2000_2014.txt','w',encoding='utf-8') 27 | startdate='2000-01-01'#起始日期 28 | enddate='2015-12-31'#结束日期 29 | startpage=1#起始页码 30 | endpage=10#结束页码 31 | while startpage<=endpage: 32 | url='http://datacenter.mep.gov.cn/report/air_daily/air_dairy_aqi.jsp?city=&startdate={}&enddate={}&page={}'.format(startdate,enddate,startpage) 33 | try: 34 | items=get_table(url) 35 | except: 36 | time.sleep(2) 37 | print(startpage,'-failed') 38 | continue 39 | for item in items: 40 | text_f.write(item+'\n') 41 | print(startpage,'-ok') 42 | startpage+=1 43 | text_f.close() 44 | write_to_excel() 45 | 46 | def write_to_excel(): 47 | excel=openpyxl.Workbook(write_only=True) 48 | sheet=excel.create_sheet() 49 | for line in open('2000_2014.txt','r',encoding='utf-8'): 50 | line=line.replace('\n','') 51 | sheet.append(line.split('|')) 52 | excel.save('2000_2014.xlsx') 53 | 54 | main() 55 | -------------------------------------------------------------------------------- /dianping/data/上海.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/上海.xls -------------------------------------------------------------------------------- /dianping/data/北京.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/北京.xls -------------------------------------------------------------------------------- /dianping/data/南京.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/南京.xls -------------------------------------------------------------------------------- /dianping/data/厦门.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/厦门.xls -------------------------------------------------------------------------------- /dianping/data/大连.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/大连.xls -------------------------------------------------------------------------------- /dianping/data/天津.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/天津.xls -------------------------------------------------------------------------------- /dianping/data/宁波.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/宁波.xls -------------------------------------------------------------------------------- /dianping/data/广州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/广州.xls -------------------------------------------------------------------------------- /dianping/data/成都.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/成都.xls -------------------------------------------------------------------------------- /dianping/data/无锡.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/无锡.xls -------------------------------------------------------------------------------- /dianping/data/杭州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/杭州.xls -------------------------------------------------------------------------------- /dianping/data/武汉.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/武汉.xls -------------------------------------------------------------------------------- /dianping/data/沈阳.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/沈阳.xls -------------------------------------------------------------------------------- /dianping/data/济南.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/济南.xls -------------------------------------------------------------------------------- /dianping/data/深圳.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/深圳.xls -------------------------------------------------------------------------------- /dianping/data/苏州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/苏州.xls -------------------------------------------------------------------------------- /dianping/data/西安.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/西安.xls -------------------------------------------------------------------------------- /dianping/data/郑州.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/郑州.xls -------------------------------------------------------------------------------- /dianping/data/重庆.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/重庆.xls -------------------------------------------------------------------------------- /dianping/data/长沙.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/长沙.xls -------------------------------------------------------------------------------- /dianping/data/青岛.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/dianping/data/青岛.xls -------------------------------------------------------------------------------- /dianping/shopinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_infor(): 14 | urls=['https://www.dianping.com/search/category/2/10/r2588o2p','https://www.dianping.com/search/category/2/10/r1493o2p','https://www.dianping.com/search/category/2/10/r1490o2p'] 15 | f=open('haidian.txt','a',encoding='utf-8') 16 | for url in urls: 17 | page=1 18 | while page<=50: 19 | try: 20 | html=requests.get(url+str(page),headers=headers,timeout=30).text 21 | except: 22 | continue 23 | table=BeautifulSoup(html,'lxml').find('div',id='shop-all-list').find_all('li') 24 | for li in table: 25 | try: 26 | soup=li.find('div',attrs={'class':'txt'}) 27 | tit=soup.find('div',attrs={'class':'tit'}) 28 | comment=soup.find('div',attrs={'class':'comment'}) 29 | tag_addr=soup.find('div',attrs={'class':'tag-addr'}) 30 | text=tit.find('a').get_text().replace('\r','').replace('\n','')+'||'+comment.find('span').get('title')+'||'+comment.find('a',attrs={'class':'review-num'}).get_text().replace('\r','').replace('\n','')+'||'+comment.find('a',attrs={'class':'mean-price'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'tag'}).get_text().replace('\r','').replace('\n','')+'||'+tag_addr.find('span',attrs={'class':'addr'}).get_text().replace('\r','').replace('\n','')+'||' 31 | comment_list=soup.find('span',attrs={'class':'comment-list'}).find_all('span') 32 | for i in comment_list: 33 | text+='||'+i.get_text().replace('\r','').replace('\n','') 34 | for i in tit.find('div',attrs={'class':'promo-icon'}).find_all('a'): 35 | try: 36 | text+='||'+i.get('class') 37 | except: 38 | 
text+='||'+i.get('class')[0] 39 | f.write(text.replace(' ','')+'\n') 40 | except: 41 | continue 42 | page+=1 43 | print(page) 44 | time.sleep(1) 45 | f.close() 46 | 47 | get_infor() 48 | -------------------------------------------------------------------------------- /dianping/shoplist.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import json 5 | import xlwt3 6 | import os 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_data(url): 16 | html=requests.get(url,headers=headers).text 17 | data=json.loads(html)['shopBeans'] 18 | return data 19 | 20 | def shoplist(): 21 | try: 22 | os.mkdir('data') 23 | except: 24 | print('--') 25 | items={'最佳餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score&categoryId=0','人气餐厅':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=popscore&categoryId=0','口味最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score1&categoryId=0','环境最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score2&categoryId=0','服务最佳':'http://www.dianping.com/mylist/ajax/shoprank?cityId=%s&shopType=10&rankType=score3&categoryId=0'} 26 | citys={'北京':'2','上海':'1','广州':'4','深圳':'7','成都':'8','重庆':'9','杭州':'3','南京':'5','沈阳':'18','苏州':'6','天津':'10','武汉':'16','西安':'17','长沙':'344','大连':'19','济南':'22','宁波':'11','青岛':'21','无锡':'13','厦门':'15','郑州':'160'} 27 | excel=xlwt3.Workbook() 28 | sheet=excel.add_sheet('sheet') 29 | count=0 30 | for city in citys: 31 | for key in items: 32 | try: 33 | data=get_data(items[key]%(citys[city])) 34 | except: 35 | print('Error!') 36 | continue 37 | num=1 38 | for item in data: 39 | sheet.write(count,0,str(count+1)) 40 | sheet.write(count,1,key) 41 | sheet.write(count,2,city) 42 | sheet.write(count,3,num) 43 | sheet.write(count,4,item['filterFullName']) 44 | sheet.write(count,5,item['mainRegionName']) 45 | sheet.write(count,6,item['refinedScore1']) 46 | sheet.write(count,7,item['refinedScore2']) 47 | sheet.write(count,8,item['refinedScore3']) 48 | sheet.write(count,9,item['avgPrice']) 49 | if '(' in item['filterFullName'] or '(' in item['filterFullName']: 50 | sheet.write(count,10,'Y') 51 | else: 52 | sheet.write(count,10,'N') 53 | num+=1 54 | count+=1 55 | print(city+'--OK') 56 | excel.save('data/data.xls') 57 | 58 | shoplist() 59 | -------------------------------------------------------------------------------- /douban/movie_grade.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import json 4 | 5 | headers = { 6 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 7 | "Accept-Encoding": "gzip, deflate", 8 | "Accept-Language": "en-US,en;q=0.5", 9 | "Connection": "keep-alive", 10 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"} 11 | 12 | def comments(movieid,fromdate,todate): 13 | start=0 14 | rating={} 15 | comments={} 16 | while True: 17 | 
url='https://m.douban.com/rexxar/api/v2/movie/{}/interests?count=20&order_by=latest&start={}&ck=&for_mobile=1'.format(movieid,start) 18 | html=requests.get(url,headers=headers).text 19 | print(movieid,start) 20 | start+=25 21 | data=json.loads(html)['interests'] 22 | if len(data)==0: 23 | break 24 | for item in data: 25 | date=item['create_time'].split(' ')[0] 26 | int_date=int(date.replace('-','')) 27 | if int_date>todate: 28 | continue 29 | if int_date500000): 55 | return data 56 | return [] 57 | 58 | if __name__=='__main__': 59 | threadings=[] 60 | f=open('华语.txt','r') 61 | file_d=open('data.txt','a') 62 | for line in f.readlines(): 63 | for id in eval(line.replace('\n','')): 64 | data=get_id(id) 65 | if data==[]: 66 | continue 67 | file_d.write(str(data)+'\n') 68 | print(id) 69 | -------------------------------------------------------------------------------- /newseed.pedaily.cn/invest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def invest(page): 14 | html=requests.get('http://newseed.pedaily.cn/invest/p'+str(page),headers=headers).text 15 | table=BeautifulSoup(html,'lxml').find('table',{'class':'record-table'}).find_all('tr') 16 | result=[] 17 | for tr in table: 18 | tds=tr.find_all('td') 19 | if len(tds)==0: 20 | continue 21 | line=[] 22 | for td in tds: 23 | try: 24 | line.append(td.get_text()) 25 | except: 26 | line.append('') 27 | result.append(line) 28 | return result 29 | 30 | def write_to_excel(result): 31 | excel=openpyxl.Workbook(write_only=True) 32 | sheet=excel.create_sheet() 33 | for line in result: 34 | try: 35 | sheet.append(line) 36 | except: 37 | continue 38 | excel.save('result.xlsx') 39 | 40 | def main(): 41 | pagefrom=input("起始页:") 42 | pageto=input("结束页:") 43 | pagefrom=int(pagefrom) 44 | pageto=int(pageto) 45 | result=[] 46 | while pagefrom<=pageto: 47 | try: 48 | result+=invest(pagefrom) 49 | except: 50 | print(pagefrom,'failed') 51 | continue 52 | print(pagefrom,'ok') 53 | pagefrom+=1 54 | time.sleep(1) 55 | write_to_excel(result) 56 | 57 | main() 58 | -------------------------------------------------------------------------------- /rank.kongzhong.com/userInfor.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | def loadNameAndArea():#加载需要抓取的名单 6 | lines=open('names.txt','r',encoding='utf-8').readlines()#读入文本 7 | userlist=[] 8 | for line in lines: 9 | userlist.append(line.replace('\r','').replace('\n','')) 10 | return userlist 11 | 12 | 13 | def writeToTxt(user):#将结果写入txt 14 | line='\t'.join(user) 15 | f=open('result.txt','a',encoding='utf-8') 16 | f.write(line+'\r\n') 17 | f.close() 18 | 19 | def parser(html):#解析网页,用的是BeautifulSoup库 20 | soup=BeautifulSoup(html,'html.parser').find('div',id='total') 21 | result=[] 22 | labels=['singlebattle','teambattle','totalbattle'] 23 | for label in labels: 24 | table=soup.find('div',id=label) 25 | result.append(table.find('span',{'class':'value separate'}).get_text()) 26 | result.append(table.find('span',{'class':'value2'}).get_text()) 27 | return result 
28 | 29 | def getUserInfor(): 30 | browser=webdriver.Firefox()#调用火狐浏览器 31 | browser.get('http://rank.kongzhong.com/wows/index.html?name=%E4%BD%BF%E5%BE%92-%E6%B8%94%E9%B6%B8&zone=north') 32 | browser.implicitly_wait(10)#设置页面加载等待时间 33 | userlist=loadNameAndArea()#获取名单 34 | for user in userlist: 35 | user=user.split('\t')#名单中 名字和区域是以\t分隔 36 | if '南区' in user[-1]:#判断是那一个区域 37 | area='south' 38 | else: 39 | area='north' 40 | url='http://rank.kongzhong.com/wows/index.html?name=%s&zone=%s'%(user[0],area)#构造链接 41 | browser.get(url)#打开链接 42 | time.sleep(2)#停2s等待页面加载完成 43 | html=browser.page_source#获取页面源码 44 | try: 45 | result=parser(html)#解析页面 46 | except: 47 | continue 48 | result=user+result 49 | writeToTxt(result)#写入txt 50 | browser.quit() 51 | 52 | getUserInfor() 53 | -------------------------------------------------------------------------------- /stock.finance.qq.com/stk_holder.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | def get_stkholder(name,stkcode): 14 | html=requests.get('http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=%s'%stkcode,headers=headers).text 15 | soup=BeautifulSoup(html,'lxml').find('table',{'class':'list list_d'}) 16 | date=soup.find('tr').find_all('span',{'class':'fntTahoma'})[-1].get_text() 17 | table=soup.find_all('tr') 18 | result=[] 19 | for tr in table[2:-1]: 20 | tds=tr.find_all('td') 21 | item=[name,stkcode,date] 22 | for td in tds: 23 | item.append(td.get_text()) 24 | result.append(item) 25 | return result 26 | 27 | def write_to_excel(): 28 | excel=openpyxl.Workbook(write_only=True) 29 | filename=time.strftime("%Y%m%d %H%M%S",time.localtime())+'.xlsx' 30 | sheet=excel.create_sheet() 31 | for line in result: 32 | sheet.append(line) 33 | excel.save(filename) 34 | 35 | def main(): 36 | result=[] 37 | for line in open('stkcode.txt','r',encoding='utf-8'): 38 | title=line.replace('\r','').replace('\n','').split('---') 39 | try: 40 | items=get_stkholder(title[0],title[1]) 41 | except: 42 | pass 43 | time.sleep(3) 44 | continue 45 | result+=items 46 | print(title[0],title[1],'ok') 47 | time.sleep(3) 48 | write_to_excel(result) 49 | 50 | main() 51 | -------------------------------------------------------------------------------- /stock.finance.qq.com/stkcode.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | def get_stkcode(): 6 | f=open('stkcode.txt','w') 7 | page=1 8 | while True: 9 | html=requests.get('http://hq.gucheng.com/List.asp?Type=A&Sort=&Page=%s'%page).text.encode('ISO-8859-1').decode('GBK','ignore') 10 | table=BeautifulSoup(html,'lxml').find('div',{'class':'hq_big_bk md_6'}).find_all('tr') 11 | for tr in table[1:-1]: 12 | tds=tr.find_all('td') 13 | line=tds[1].get_text()+'---'+tds[0].get_text() 14 | print(line) 15 | f.write(line+'\r\n') 16 | page+=1 17 | if page==139: 18 | break 19 | f.close() 20 | 21 | get_stkcode() 22 | -------------------------------------------------------------------------------- /stock.jrj.com.cn/flowhistory.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import json 4 | import time 5 | 6 | 7 | def get_flowhistory(stockid): 8 | html=requests.get('http://zj.flashdata2.jrj.com.cn/flowhistory/share/%s.js'%stockid).text 9 | data=json.loads(html.replace('var stock_flow=','')) 10 | result=[] 11 | header=['序号','日期','涨跌幅','收盘价','换手率','净流入金额','主力净流入净额','主力净流入净占比','中单净流入净额','中单净流入净占比','散户净流入净额','散户净流入净占比','第二天'] 12 | result.append(header) 13 | keys=['date','pl','cp','tr','tin','zin','zpit','min','mpit','sin','spit'] 14 | count=1 15 | pre_line='' 16 | for line in data: 17 | item=[count] 18 | count+=1 19 | for key in keys: 20 | item.append(line[key]) 21 | try: 22 | item.append(pre_line['pl']) 23 | except: 24 | pass 25 | result.append(item) 26 | pre_line=line 27 | return result 28 | 29 | def write_to_excel(result,stockid): 30 | excel=openpyxl.Workbook(write_only=True) 31 | sheet=excel.create_sheet() 32 | for item in result: 33 | sheet.append(item) 34 | excel.save('%s.xlsx'%stockid) 35 | 36 | def main(): 37 | stockid=input("输入股票代码:") 38 | try: 39 | result=get_flowhistory(stockid) 40 | except: 41 | print('Failed!') 42 | time.sleep(10) 43 | return 44 | write_to_excel(result,stockid) 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /taobao/suggest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | import os 5 | import chardet 6 | 7 | headers = { 8 | ':authority':'suggest.taobao.com', 9 | 'user-agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36', 10 | 'Accept':"*/*", 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | 16 | def suggest(keyword): 17 | html=requests.get('https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null'.format(keyword),headers=headers).text 18 | data=json.loads(html)['result'] 19 | result=[] 20 | for item in data: 21 | result.append(item[0].replace('','').replace('','')) 22 | return result 23 | 24 | def get_chardet(filename): 25 | data=open(filename,'rb').read() 26 | coding=chardet.detect(data) 27 | return coding['encoding'] 28 | 29 | def loadkeywords(): 30 | keywords={} 31 | for filename in os.listdir('keywords'): 32 | if '.txt' not in filename: 33 | continue 34 | encoding=get_chardet('keywords/'+filename) 35 | if encoding=='GB2312': 36 | encoding='GBK' 37 | keywords[filename]=[] 38 | for line in open('keywords/'+filename,'r',encoding=encoding): 39 | word=line.replace('\r','').replace('\n','') 40 | keywords[filename].append(word) 41 | return keywords 42 | 43 | def save_to_txt(filename,deep,words): 44 | f=open('result/'+filename.replace('.txt','_%s.txt'%deep),'w',encoding='utf-8') 45 | writed=[] 46 | for word in words: 47 | if word in writed: 48 | continue 49 | writed.append(word) 50 | f.write(word+'\r\n') 51 | f.close() 52 | 53 | def main(): 54 | keywords=loadkeywords() 55 | while True: 56 | try: 57 | deep=input("输入采集深度:") 58 | deep=int(deep) 59 | break 60 | except: 61 | pass 62 | for filename in keywords: 63 | result=[] 64 | for word in keywords[filename]: 65 | words=[word] 66 | count=0 67 | for num in range(deep): 68 | suggest_words=[] 69 | for need_word in words: 70 | try: 71 | suggest_words+=suggest(need_word) 72 | except: 73 | continue 74 | suggest_words=list(set(suggest_words)) 75 | 
words=suggest_words 76 | count+=len(suggest_words) 77 | result+=suggest_words 78 | print(word,'deep',num+1) 79 | print(word,'get',count,'ok') 80 | save_to_txt(filename,deep,result) 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /weibo/weibo.md: -------------------------------------------------------------------------------- 1 | ###Crawling Sina Weibo with Python 2 | ####1. Simulated login 3 | Here I log in with selenium and then read the cookies from the logged-in browser, which is quick and avoids having to script the login protocol by hand. requests can then reuse these cookies to crawl as a logged-in user. 4 | 5 | ```python 6 | from selenium import webdriver 7 | 8 | def login(username,password): 9 | browser=webdriver.PhantomJS('./phantomjs') 10 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')# open the login page 11 | browser.set_page_load_timeout(10) 12 | time.sleep(5)# wait for the page to finish loading 13 | browser.find_element_by_id('loginName').send_keys(username)# fill in the username 14 | browser.find_element_by_id('loginPassword').send_keys(password)# fill in the password 15 | browser.find_element_by_id('loginAction').click()# click the login button 16 | time.sleep(5) 17 | cookies=browser.get_cookies()# read the cookies of the logged-in session 18 | result={} 19 | for item in cookies: 20 | try: 21 | result[item['name']]=item['value'] 22 | except: 23 | continue 24 | return result# return the cookies as a dict 25 | 26 | ``` 27 | requests cannot use the manually built cookies directly, so the dict-style cookies have to be converted into a CookieJar 28 | 29 | ```python 30 | import requests 31 | import os 32 | 33 | def weibo(): 34 | if os.path.isfile('cookies'): 35 | cookies=eval(open('cookies','r').read()) 36 | else: 37 | cookies=login('username','password')# log in and fetch fresh cookies 38 | session=requests.session() 39 | session.cookies=requests.utils.cookiejar_from_dict(cookies)# convert the dict into a CookieJar and attach it to the session 40 | return session 41 | 42 | ``` 43 | 44 | ####2. Fetching the home-timeline posts 45 | ```python 46 | import json 47 | 48 | headers = { 49 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 50 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | 'Accept-Language': 'en-US,en;q=0.5', 52 | 'Accept-Encoding': 'gzip, deflate', 53 | 'Connection': 'keep-alive'} 54 | 55 | session=weibo() 56 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text 57 | data=json.loads(html)[0]['card_group'] 58 | result=[] 59 | for item in data: 60 | user=item['mblog']['user']['screen_name'] 61 | text=item['mblog']['text'] 62 | weiboid=item['mblog']['idstr'] 63 | result.append({'user':user,'text':text}) 64 | print(result) 65 | ``` 66 | 67 | ####3. Fetching the comments of a post 68 | 69 | ```python 70 | 71 | def get_comments(session,weiboid): 72 | page=1 73 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text 74 | data=json.loads(html)[1]['card_group'] 75 | comments=[] 76 | for item in data: 77 | comment={} 78 | comment['user']=item['user']['screen_name'] 79 | comment['date']=item['created_at'] 80 | comment['text']=item['text'] 81 | comments.append(comment) 82 | return comments 83 | ``` 84 | -------------------------------------------------------------------------------- /weibo/weibo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from selenium import webdriver 4 | import time 5 | import os 6 | import json 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def login(username,password): 16 | browser=webdriver.PhantomJS('/home/nyloner/phantomjs/phantomjs') 17 | #browser=webdriver.Firefox() 18 | browser.get('https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F') 19 | browser.set_page_load_timeout(10) 20 | time.sleep(5) 21 | browser.find_element_by_id('loginName').send_keys(username) 22 | browser.find_element_by_id('loginPassword').send_keys(password) 23 | browser.find_element_by_id('loginAction').click() 24 | time.sleep(5) 25 | cookies=browser.get_cookies() 26 | result={} 27 | for item in cookies: 28 | try: 29 | result[item['name']]=item['value'] 30 | except: 31 | continue 32 | f=open('cookies','w') 33 | f.write(str(result)) 34 | f.close() 35 | return result 36 | 37 | def weibo(): 38 | if os.path.isfile('cookies'): 39 | cookies=eval(open('cookies','r').read()) 40 | else: 41 | cookies=login('username','password') 42 | session=requests.session() 43 | session.cookies=requests.utils.cookiejar_from_dict(cookies) 44 | html=session.get('http://m.weibo.cn',headers=headers).text 45 | html=session.get('http://m.weibo.cn/index/feed?format=cards&page=1',headers=headers).text 46 | data=json.loads(html)[0]['card_group'] 47 | result=[] 48 | for item in data: 49 | user=item['mblog']['user']['screen_name'] 50 | text=item['mblog']['text'] 51 | result.append({'user':user,'text':text}) 52 | print(result) 53 | print(get_comments(session,'4013542757481643')) 54 | 55 | def get_comments(session,weiboid): 56 | page=1 57 | html=session.get('http://m.weibo.cn/single/rcList?format=cards&id={weiboid}&type=comment&hot=0&page={page}'.format(weiboid=weiboid,page=page),headers=headers).text 58 | data=json.loads(html)[1]['card_group'] 59 | comments=[] 60 | for item in data: 61 | comment={} 62 | comment['user']=item['user']['screen_name'] 63 | comment['date']=item['created_at'] 64 | comment['text']=item['text'] 65 | comments.append(comment) 66 | return comments 67 | 68 | weibo() 69 | -------------------------------------------------------------------------------- /weidian/weidian.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from selenium import webdriver 6 | import time 7 | import re 8 | 9 | headers = { 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"} 15 | 16 | def get_place(): 17 | f=open('place.txt','w') 18 | browser=webdriver.Firefox() 19 | #html=requests.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418',headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 20 | browser.get('http://weidian.com/near_shop/chunjie/city.html?&from=weidian&userid=211106418&umk=34542211106418') 21 | time.sleep(10) 22 | html=browser.page_source 23 | table=BeautifulSoup(html,'lxml').find('div',id='show-place').find_all('ul') 24 | places={} 25 | print(html) 26 | for item in table[1:]: 27 | for li in item.find_all('li'): 28 | 
places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href') 29 | for li in table[0].find_all('li'): 30 | places[li.get_text()]='http://weidian.com/near_shop/chunjie/'+li.find('a').get('href') 31 | for key in places: 32 | text=key+'||'+places[key]+'\n' 33 | f.write(text) 34 | f.close() 35 | 36 | def get_shop(): 37 | f=open('shops.txt','a',encoding='utf-8') 38 | for line in open('place.txt').readlines(): 39 | city=line.split('||')[0] 40 | place=re.findall('place=(.*?)&',line)[0] 41 | page=0 42 | while True: 43 | url='http://api.buyer.weidian.com/h5/appserver_nearbyShop.do?place='+place+'&seed=0&category=%E7%AE%B1%E5%8C%85&limit=50&page='+str(page)+'&callback=jsonp4&rnd=0.8898308666990978' 44 | html=requests.get(url,headers=headers).text 45 | rel='"shopid":"(.*?)","entranceName":"(.*?)","address":"(.*?)"' 46 | lists=re.findall(rel,html) 47 | if lists==[]: 48 | break 49 | for item in lists: 50 | text=item[0]+'||'+item[1]+'||'+item[2] 51 | f.write(text+'\n') 52 | print(city+place+'--'+str(page)) 53 | page+=1 54 | f.close() 55 | 56 | def get_weixin(): 57 | f=open('data.txt','a') 58 | for line in open('shops.txt'): 59 | line=line.replace('\n','') 60 | shopurl='http://weidian.com/?userid='+line.split('||')[0] 61 | html=requests.get(shopurl,headers=headers).text 62 | try: 63 | html=requests.get(shopurl,headers=headers).text 64 | rel='微信: (.*?)<' 65 | weixin=re.findall(rel,html)[0] 66 | except: 67 | continue 68 | print(line+'---OK') 69 | line=line+'||'+weixin+'\n' 70 | f.write(line) 71 | 72 | def main(): 73 | #get_shop() 74 | get_weixin() 75 | 76 | main() 77 | -------------------------------------------------------------------------------- /wenda.so.com/search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | from selenium import webdriver 5 | 6 | headers = { 7 | 'Host':"wenda.so.com", 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | browser=webdriver.Firefox() 15 | browser.get('http://wenda.so.com/') 16 | browser.implicitly_wait(10) 17 | def search(key): 18 | #html=requests.get('http://wenda.so.com/search/?q='+key,headers=headers,timeout=30).text 19 | browser.get('http://wenda.so.com/search/?q='+key) 20 | time.sleep(0.5) 21 | html=browser.page_source 22 | table=BeautifulSoup(html,'lxml').find_all('li',{'class':'item'}) 23 | for item in table: 24 | try: 25 | url=item.find('a').get('href') 26 | if 'q/' in url: 27 | return 'http://wenda.so.com/'+url 28 | except: 29 | continue 30 | 31 | def get_questions(): 32 | for word in open('failed_words','r'): 33 | word=word.replace('\r','').replace('\n','') 34 | try: 35 | url=search(word) 36 | except: 37 | failed=open('failed.txt','a') 38 | failed.write(word+'\n') 39 | failed.close() 40 | continue 41 | if url==None: 42 | failed=open('failed.txt','a') 43 | failed.write(word+'\n') 44 | failed.close() 45 | continue 46 | f=open('question_','a') 47 | f.write(word+'||'+url+'\n') 48 | print(word,'ok') 49 | f.close() 50 | 51 | get_questions() 52 | -------------------------------------------------------------------------------- /wenshu.court.gov.cn/download.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import 
BeautifulSoup 3 | import json 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def doclist(page,Param="",Order="裁判日期"): 13 | data={ 14 | 'Param':Param, 15 | 'Index':page, 16 | 'Page':"20", 17 | 'Order':Order, 18 | 'Direction':"desc" 19 | } 20 | html=requests.post('http://wenshu.court.gov.cn/List/ListContent',data=data,headers=headers).text 21 | data=json.loads(html) 22 | data=eval(data) 23 | result=[] 24 | for item in data: 25 | if 'Count' in item: 26 | continue 27 | result.append(item) 28 | return result 29 | 30 | def download(docid,title): 31 | data={ 32 | 'conditions':'', 33 | 'docIds':docid+'|'+title+'|', 34 | 'keyCode':"" 35 | } 36 | content=requests.post('http://wenshu.court.gov.cn/CreateContentJS/CreateListDocZip.aspx?action=1',data=data,headers=headers).content 37 | with open('result/%s.doc'%docid,'wb') as f: 38 | f.write(content) 39 | 40 | if __name__ == '__main__': 41 | docs=doclist(1) 42 | try: 43 | import os 44 | os.mkdir('result') 45 | except: 46 | pass 47 | for item in docs: 48 | download(item['文书ID'],item['案件名称']) 49 | print(item['案件名称']) 50 | -------------------------------------------------------------------------------- /worldfreightrates/trates.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import re 5 | import xlrd 6 | import xlwt3 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_port(name): 15 | name=name.replace(' ','+') 16 | count=0 17 | statue=True 18 | while statue: 19 | try: 20 | html=requests.get('http://worldfreightrates.com/calculator/ports?term=%s'%name,headers=headers,timeout=30).text 21 | statue=False 22 | except: 23 | count+=1 24 | if count==3: 25 | return False 26 | continue 27 | try: 28 | data=eval(html) 29 | Id=data[0]['id'] 30 | return Id 31 | except: 32 | return False 33 | 34 | def get_infor(fromid,toid,commodityName): 35 | url='http://worldfreightrates.com/en/calculator/ocean/rate?fromId='+fromid+'&toId='+toid+'&oceanType=FCL&commodityName='+commodityName+'&commodityValue=100&includeInsurance=false&includeReefer=false&includeHazardous=false&unit=lb&containerSize=40' 36 | html=requests.get(url,headers=headers,timeout=50).text.replace('\\','') 37 | rel='"result">(.*?)

' 38 | try: 39 | result=re.findall(rel,html)[0] 40 | except: 41 | result='' 42 | return result 43 | 44 | def main(): 45 | data = xlrd.open_workbook('data/data.xlsx') 46 | table = data.sheets()[0] 47 | excel=xlwt3.Workbook() 48 | sheet=excel.add_sheet('sheet') 49 | for row in range(table.nrows): 50 | print(row) 51 | fromport=table.cell(row,0).value 52 | toport=table.cell(row,1).value 53 | commodityName=table.cell(row,2).value 54 | Load_Type=table.cell(row,3).value 55 | fromid=get_port(fromport) 56 | toid=get_port(toport) 57 | if fromid==False or toid==False: 58 | sheet.write(row,0,fromport) 59 | sheet.write(row,1,toport) 60 | sheet.write(row,2,commodityName) 61 | sheet.write(row,3,Load_Type) 62 | sheet.write(row,4,'') 63 | excel.save('data/result.xls') 64 | continue 65 | try: 66 | result=get_infor(fromid,toid,commodityName.replace('&','%26').replace(' ','+').replace(',','%2C')) 67 | except: 68 | result='' 69 | sheet.write(row,0,fromport) 70 | sheet.write(row,1,toport) 71 | sheet.write(row,2,commodityName) 72 | sheet.write(row,3,Load_Type) 73 | sheet.write(row,4,result) 74 | excel.save('data/result.xls') 75 | main() 76 | -------------------------------------------------------------------------------- /www.18ladys.com/18ladys.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import re 5 | import openpyxl 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | def get_names(): 14 | page=1 15 | while page<21: 16 | html=requests.get('http://www.18ladys.com/cyzy/index.asp?page='+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk') 17 | table=BeautifulSoup(html,'lxml').find('div',{'class':'tb1'}).find_all('a') 18 | f=open('names.txt','a') 19 | for item in table: 20 | try: 21 | name=item.get_text() 22 | url='http://www.18ladys.com/cyzy/'+item.get('href') 23 | f.write(name+'|'+url+'\n') 24 | except: 25 | continue 26 | f.close() 27 | print(page) 28 | page+=1 29 | 30 | def get_infor(name,url): 31 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk','ignore') 32 | text=BeautifulSoup(html,'lxml').find('dd',{'class':'f14 jl4'}).find('p').get_text().replace('【','||【').replace('\r','').replace('\n','') 33 | text=text.split('||') 34 | result={'name':name} 35 | for item in text: 36 | try: 37 | name_value=item.split('】') 38 | name=name_value[0].replace('【','') 39 | value=name_value[1] 40 | result[name]=value 41 | except: 42 | continue 43 | return result 44 | 45 | def crawler(): 46 | for line in open('names.txt','r'): 47 | line=line.replace('\n','') 48 | name=line.split('|')[0] 49 | url=line.split('|')[1] 50 | try: 51 | item=get_infor(name,url) 52 | except: 53 | failed=open('failed','a') 54 | failed.write(line+'\n') 55 | failed.close() 56 | f=open('result.txt','a') 57 | f.write(str(item)+'\n') 58 | f.close() 59 | print(line,'ok') 60 | 61 | def write_to_excel(): 62 | excel=openpyxl.Workbook(write_only=True) 63 | sheet=excel.create_sheet() 64 | keys=['name','异名','别名','来源','植物形态','功用主治','用法与用量','炮制'] 65 | sheet.append(keys) 66 | for line in open('result.txt','r'): 67 | item=eval(line) 68 | infor=[] 69 | for key in keys: 70 | try: 71 | infor.append(item[key]) 72 | except: 73 | infor.append('') 74 | sheet.append(infor) 75 | 
excel.save('result.xlsx') 76 | 77 | crawler() -------------------------------------------------------------------------------- /www.58.com/sendemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | from email.mime.text import MIMEText 3 | from email.mime.multipart import MIMEMultipart 4 | from email.header import Header 5 | import time 6 | 7 | 8 | def sendmail(): 9 | sender = 'xxx@qq.com' 10 | receivers = ['xxx@qq.com'] # 接收邮件,可设置为你的QQ邮箱或者其他邮箱 11 | #创建一个带附件的实例 12 | message = MIMEMultipart() 13 | message['From'] = Header("xxxx", 'utf-8') 14 | message['To'] = Header("xxx@qq.com", 'utf-8') 15 | subject ='time.strftime("%Y-%m-%d %H:%M:%S")' 16 | message['Subject'] = Header(subject, 'utf-8') 17 | #邮件正文内容 18 | message.attach(MIMEText('time.strftime("%Y-%m-%d %H:%M:%S")', 'plain', 'utf-8')) 19 | att1 = MIMEText(open('result.xls', 'rb').read(), 'base64', 'utf-8') 20 | att1["Content-Type"] = 'application/octet-stream' 21 | # 这里的filename可以任意写,写什么名字,邮件中显示什么名字 22 | att1["Content-Disposition"] = 'attachment; filename="result.xls"' 23 | message.attach(att1) 24 | server=smtplib.SMTP_SSL('smtp.qq.com') 25 | server.ehlo('smtp.qq.com') 26 | server.login(sender,passwd) 27 | server.sendmail(sender, receivers, message.as_string()) 28 | 29 | sendmail() 30 | -------------------------------------------------------------------------------- /www.airbnb.com/deal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import xlwt3 4 | 5 | 6 | def deal_userdata(): 7 | userresult=open('userresult.txt','w') 8 | for line in open('userdata.txt','r'): 9 | line=line.replace('\n','') 10 | lists=line.split('||') 11 | try: 12 | allreview=int(lists[-2].replace('Reviews','')) 13 | except: 14 | allreview=0 15 | try: 16 | hostreview=int(lists[-1]) 17 | except: 18 | hostreview=0 19 | try: 20 | prereview=allreview-hostreview 21 | except: 22 | prereview='--' 23 | result='' 24 | for i in lists: 25 | result+=i+'||' 26 | result+=str(prereview) 27 | userresult.write(result+'\n') 28 | userresult.close() 29 | 30 | def replace_r(): 31 | room=open('roomtxt.txt','w') 32 | f=open('roomdata.txt','r').readlines() 33 | for line in f: 34 | line=line.replace('\r','').replace('\n','') 35 | room.write(line+'\n') 36 | room.close() 37 | 38 | def Excel(): 39 | Response_rate='Response rate:(.*?)Response' 40 | Response_time='Response time:(.*?hours)' 41 | users=open('userresult.txt','r').readlines() 42 | rooms=open('roomtxt.txt','r').readlines() 43 | excel=xlwt3.Workbook() 44 | usersheet=excel.add_sheet('user') 45 | roomsheet=excel.add_sheet('room') 46 | count=0 47 | for line in rooms: 48 | lists=line.replace('\n','').split('||') 49 | for user in users: 50 | if lists[5] in user: 51 | try: 52 | rate=re.findall(Response_rate,line)[0] 53 | except: 54 | rate='--' 55 | try: 56 | time=re.findall(Response_time,line)[0] 57 | except: 58 | time='--' 59 | num=0 60 | for i in lists: 61 | try: 62 | i=i.split('?')[0] 63 | i=i.split(':')[-1] 64 | i=i.replace('/rooms/','') 65 | i=i.replace('/users/show/','') 66 | except: 67 | pass 68 | roomsheet.write(count,num,i) 69 | num+=1 70 | roomsheet.write(count,num,rate) 71 | num+=1 72 | roomsheet.write(count,num,time) 73 | num=0 74 | for i in user.replace('\n','').split('||'): 75 | try: 76 | i=i.split('?')[0] 77 | i=i.split(':')[-1] 78 | i=i.replace('/rooms/','') 79 | i=i.replace('/users/show/','') 80 | except: 81 | pass 82 | usersheet.write(count,num,i) 83 | num+=1 84 | count+=1 85 | excel.save('result.xls') 
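# A small, hedged usage example for the Response_rate / Response_time patterns used in Excel() above;
# the sample string is hypothetical, standing in for one flattened host-profile line from roomdata.txt.
import re

sample = 'Response rate:100%Response time:within a few hours'
print(re.findall('Response rate:(.*?)Response', sample))  # -> ['100%']
print(re.findall('Response time:(.*?hours)', sample))     # -> ['within a few hours']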
86 | 87 | Excel() 88 | -------------------------------------------------------------------------------- /www.airbnb.com/rooms.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def rooms(url): 13 | html=requests.get(url,headers=headers).text 14 | try: 15 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'listings-container'}).find_all('div',attrs={'class':'listing'}) 16 | except: 17 | return False 18 | result=[] 19 | for item in table: 20 | try: 21 | price=item.find('div',attrs={'class':'price-amount-container'}).get_text() 22 | except: 23 | price='--' 24 | try: 25 | media=item.find('div',attrs={'class':'media'}) 26 | title=media.find('h3').get_text() 27 | userurl=media.find('a').get('href') 28 | roomurl=media.find('h3').find('a').get('href') 29 | except: 30 | continue 31 | a=media.find('a',attrs={'class':'text-normal link-reset'}) 32 | try: 33 | rating=a.find('div',attrs={'class':'star-rating'}).find('div').find_all('i') 34 | star=len(rating) 35 | clases=[] 36 | for i in rating: 37 | clases+=i.get('class') 38 | if 'icon-star-half' in clases: 39 | star=star-0.5 40 | except: 41 | star='--' 42 | try: 43 | review=a.get_text().replace('\r','').replace('\n','').replace(' ','') 44 | review=re.findall('(\d+)reviews',review)[0] 45 | except: 46 | review='--' 47 | text=title+'||'+price+'||'+review+'||'+str(star)+'||'+roomurl+'||'+userurl 48 | result.append(text.replace('\r','').replace('\n','').replace(' ','')) 49 | return result 50 | 51 | def getrooms(): 52 | citys="Chicago,Vancouver,Montreal,Portland,Philadelphia,Denver,Austin,D.C.,New Orleans,Phoenix,San Diego,Nashville,Paris,Berlin,Rome,Amsterdam,Barcelona,Copenhagen,Prague,Budapest,Stockholm,Florence,Edinburgh,Istanbul,Sydney,Melbourne,Cape Town,Beijing,Shanghai,Tokyo" 53 | failed=open('failed.txt','a',encoding='utf-8') 54 | for city in citys.split(','): 55 | print(city) 56 | url_f=open('urls.txt','a',encoding='utf-8') 57 | url='https://www.airbnb.com/s/'+city.replace(' ','+').replace('.','%252E') 58 | page=1 59 | pre=[] 60 | while True: 61 | result=rooms(url+'?ss_id=v5im73ob&page=%s'%page) 62 | if result==pre: 63 | break 64 | pre=result 65 | if result==False: 66 | failed.write(city+'--'+str(page)) 67 | break 68 | for item in result: 69 | url_f.write(city+'||'+item+'\n') 70 | print(city,'--',page) 71 | page+=1 72 | if(page==18): 73 | break 74 | url_f.close() 75 | url_f.close() 76 | failed.close() 77 | 78 | getrooms() 79 | -------------------------------------------------------------------------------- /www.baikemy.com/disease.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def disease_list(): 15 | page=1 16 | f=open('urls.txt','w',encoding='utf-8') 17 | while True: 18 | 
try: 19 | html=requests.get('http://www.baikemy.com/disease/list/0/0?pageIndex='+str(page),headers=headers,timeout=30).text 20 | except: 21 | continue 22 | table=BeautifulSoup(html,'lxml').find('div',{'class':'ccjb_jbli'}).find_all('li') 23 | for li in table: 24 | try: 25 | name=li.find('a').get_text() 26 | url='http://www.baikemy.com/'+li.find('a').get('href').replace('view','detail')+'/1/' 27 | f.write(name+'|'+url+'\n') 28 | except: 29 | pass 30 | if len(table)==1: 31 | break 32 | print('page %s urls get'%page) 33 | page+=1 34 | f.close() 35 | 36 | def disease_infor(name,url): 37 | html=requests.get(url,headers=headers,timeout=30).text 38 | table=BeautifulSoup(html,'lxml').find('div',{'class':'lemma-main'}).find_all('div',{'class':'lemma-main-content'}) 39 | result=[name] 40 | for item in table: 41 | try: 42 | key=item.find('span',{'class':'headline-content'}).get_text() 43 | value=item.find('div',{'class':'para'}).get_text() 44 | result.append(key+':\t '+value) 45 | except: 46 | continue 47 | return result 48 | 49 | def write_to_excel(result): 50 | excel=openpyxl.Workbook(write_only=True) 51 | sheet=excel.create_sheet() 52 | for line in result: 53 | try: 54 | sheet.append(line) 55 | except: 56 | pass 57 | excel.save('result.xlsx') 58 | 59 | def main(): 60 | disease_list() 61 | result=[] 62 | for line in open('urls.txt','r',encoding='utf-8'): 63 | line=line.replace('\n','') 64 | try: 65 | name=line.split('|')[0] 66 | url=line.split('|')[1] 67 | except: 68 | continue 69 | try: 70 | data=disease_infor(name,url) 71 | except: 72 | failed=open('failed.txt','a',encoding='utf-8') 73 | failed.write(line+'\r\n') 74 | failed.close() 75 | continue 76 | result.append(data) 77 | try: 78 | print(name,'ok') 79 | except: 80 | pass 81 | write_to_excel(result) 82 | print('完成') 83 | 84 | 85 | main() 86 | time.sleep(60) 87 | -------------------------------------------------------------------------------- /www.chazidian.com/yuwen.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 11 | 12 | 13 | def get_terms(): 14 | html=open('html.html','r').read() 15 | table=BeautifulSoup(html).find_all('span',{'class':'y-l'}) 16 | urls=[] 17 | f=open('terms.txt','w') 18 | for item in table: 19 | try: 20 | term=item.find('h4').get_text() 21 | publishers=item.find_all('p') 22 | for p in publishers: 23 | publisher=p.get_text() 24 | links=p.find_all('a') 25 | for a in links: 26 | url=a.get('href') 27 | f.write(term+'|'+publisher+'|'+a.get_text()+'|'+url+'\n') 28 | except: 29 | continue 30 | f.close() 31 | 32 | def get_article_url(term_url): 33 | html=requests.get('http://yuwen.chazidian.com'+term_url,headers=headers).text 34 | table=BeautifulSoup(html,'lxml').find('div',id='mulu').find_all('div',{'class':'mldy'}) 35 | result=[] 36 | num=1 37 | for item in table: 38 | title=item.find('a').get_text() 39 | url=item.find('a').get('href').replace('kewen','kewendetail') 40 | line=str(num)+'|'+title+'|'+url 41 | result.append(line) 42 | num+=1 43 | return result 44 | 45 | def get_urls(): 46 | for line in open('terms.txt','r'): 47 | line=line.replace('\n','') 48 | url=line.split('|')[-1] 49 | result=get_article_url(url) 50 | 
f=open('urls.txt','a') 51 | for item in result: 52 | f.write(line+'|'+item+'\n') 53 | f.close() 54 | print(line) 55 | time.sleep(1) 56 | 57 | def get_article_content(url): 58 | html=requests.get(url,headers=headers).text 59 | content=BeautifulSoup(html,'lxml').find('div',id='print_content').get_text() 60 | return content 61 | 62 | def main(): 63 | excel=openpyxl.Workbook(write_only=True) 64 | sheet=excel.create_sheet() 65 | for line in open('urls.txt','r'): 66 | line=line.replace('\n','') 67 | infor_list=line.split('|') 68 | url=infor_list[-1] 69 | try: 70 | content=get_article_content(url) 71 | except: 72 | failed=open('failed.txt','a') 73 | failed.write(line+'\n') 74 | failed.close() 75 | continue 76 | sheet.append(infor_list+[content]) 77 | print(line) 78 | time.sleep(0.5) 79 | excel.save('result.xlsx') 80 | main() 81 | -------------------------------------------------------------------------------- /www.china-10.com/china10.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_kinds(): 15 | f=open('types.txt','w') 16 | url='http://www.china-10.com/brand/' 17 | html=requests.get(url).text 18 | table=BeautifulSoup(html,'lxml').find('div',id='menubox').find('ul',id='conmenu').find_all('li',attrs={'class':'menu'}) 19 | for item in table[1:-3]: 20 | key=item.find('a').get_text().replace('\n','')+'||' 21 | for li in item.find_all('li'): 22 | f.write(key+li.find('a').get('title')+'||'+li.find('a').get('href')+'\n') 23 | f.close() 24 | 25 | def get_brands(): 26 | f=open('types.txt','r') 27 | data=open('brands.txt','w') 28 | for line in f.readlines(): 29 | print(line) 30 | line=line.replace('\n','') 31 | page=1 32 | while True: 33 | html=requests.get(line.split('||')[-1]+'?action=ajax&page='+str(page),headers).text 34 | page+=1 35 | table=BeautifulSoup(html,'lxml').find_all('li') 36 | if(table==[]): 37 | break 38 | for item in table: 39 | text=line+'||'+item.get_text()+'||'+item.find('a').get('href')+'\n' 40 | data.write(text) 41 | print(page) 42 | f.close() 43 | 44 | def get_infor(line): 45 | html=requests.get(line.split('||')[-1],headers=headers).text 46 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'brandinfo'}) 47 | des=table.find('dd').get_text() 48 | line+='||'+des 49 | table=table.find('ul').find_all('li') 50 | for li in table: 51 | line+='||'+li.get_text().replace('\r','').replace('\n','').replace('\t','').replace(' ','') 52 | return line 53 | 54 | def main(): 55 | data=open('data.txt','a') 56 | failed=open('failed.txt','a') 57 | count=0 58 | for line in open('brands.txt','r').readlines(): 59 | line=line.replace('\n','') 60 | try: 61 | line=get_infor(line) 62 | except: 63 | failed.write(line+'\n') 64 | continue 65 | data.write(line+'\n') 66 | count+=1 67 | time.sleep(1) 68 | print(count) 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /www.china-10.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import xlwt3 4 | import re 5 | 6 | def excel(): 7 | f=open('data.txt','r') 8 | ex=xlwt3.Workbook() 9 | 
sheet=ex.add_sheet('sheet') 10 | count=0 11 | rels=['品牌等级:(.*?)\|\|','关注指数:(.*?)\|\|','\|\|.*?董事.*?:(.*?)品牌创立','时间:(.*?)\|\|','发源地:(.*?)\|\|','官方网站:(.*?)\|\|','客服电话:(.*?)\|\|','告词:(.*?)\|\|','(产品\d+)]','(网点\d+)]','(新闻\d+)]','(网店.*?)]'] 12 | for line in f.readlines(): 13 | line=line.replace('\n','').replace('信用指数:','') 14 | lists=[] 15 | for rel in rels: 16 | try: 17 | i=re.findall(rel,line)[0] 18 | except: 19 | i='--' 20 | lists.append(i) 21 | strs=line.split('||') 22 | sheet.write(count,0,strs[0]) 23 | sheet.write(count,1,strs[1]) 24 | sheet.write(count,2,strs[2]) 25 | sheet.write(count,3,strs[3]) 26 | sheet.write(count,4,strs[4]) 27 | sheet.write(count,5,strs[5]) 28 | num=6 29 | for i in lists: 30 | sheet.write(count,num,i) 31 | num+=1 32 | sheet.write(count,num,strs[-1]) 33 | count+=1 34 | ex.save('data.xls') 35 | 36 | excel() 37 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/CaptchaOCR.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/CaptchaOCR.dll -------------------------------------------------------------------------------- /www.chuanlaoda.cn/py2exe_install.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from distutils.core import setup 3 | import py2exe 4 | 5 | setup(console=["chuanlaoda.py"]) 6 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/testdll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | 4 | from ctypes import * 5 | 6 | ocrpasswd = "868197D30CC624FD3C2E2EE66494DA5F" 7 | #VcodeInit 初始换引擎函数 只有一个参数 为引擎初始化密码 失败返回-1 此函数只需调用一次 切勿多次调用 。 8 | dll = windll.LoadLibrary('CaptchaOCR.dll') 9 | load_ocr = dll.VcodeInit 10 | load_ocr.argtypes = [c_char_p] 11 | load_ocr.restypes = c_int 12 | index = load_ocr(ocrpasswd.encode('utf-8')) 13 | img_string = open(imgname, "rb").read() 14 | img_buffer = create_string_buffer(img_string) 15 | #申请接收识别结果的缓冲区 一定要申请 16 | ret_buffer = create_string_buffer(15) 17 | #调用此函数之前,如果已经初始化成功过识别引擎函数 那么无需再调用初始化函数 18 | #GetVcode 识别函数 参数1为 VcodeInit 返回值 index 参数2为图片数据 参数3为图片大小 参数4为接收识别结果 需要给变量申请内存 如 ret_buffer = create_string_buffer(10) 19 | get_code_from_buffer = dll.GetVcode 20 | get_code_from_buffer(index, byref(img_buffer), len(img_buffer), byref(ret_buffer)) 21 | print (ret_buffer.value.decode('utf-8')) 22 | -------------------------------------------------------------------------------- /www.chuanlaoda.cn/x64/CaptchaOCR.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.chuanlaoda.cn/x64/CaptchaOCR.dll -------------------------------------------------------------------------------- /www.cpbz.gov.cn/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | 4 | def load_result(): 5 | result=[] 6 | for line in open('result.txt','r'): 7 | item=eval(line) 8 | baseinfor=[item['url']] 9 | for key in ['机构名称','法定代表人','组织机构代码','邮政编码','注册地址','行政区划']: 10 | try: 11 | baseinfor.append(item['企业基本信息'][key]) 12 | except: 13 | baseinfor.append('') 14 | numbers=[] 15 | try: 16 | for num_line in item['技术指标']: 17 | numbers+=num_line 18 | except: 19 | pass 20 | for key in 
['标准名称','标准编号','公开时间','url']: 21 | try: 22 | baseinfor.append(item['标准信息'][key]) 23 | except: 24 | baseinfor.append('') 25 | try: 26 | products=item['产品信息'] 27 | except: 28 | products=[] 29 | for product in products: 30 | product[-1]=item['standardStatus'] 31 | yield baseinfor+product+numbers 32 | 33 | def write_to_excel(): 34 | excel=openpyxl.Workbook(write_only=True) 35 | sheet=excel.create_sheet() 36 | for line in load_result(): 37 | sheet.append(line) 38 | excel.save('result.xlsx') 39 | 40 | write_to_excel() 41 | -------------------------------------------------------------------------------- /www.ctrip.com/comments.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | browser=webdriver.Chrome("./chromedriver") 6 | browser.get('http://hotels.ctrip.com/hotel/zhuhai31') 7 | browser.implicitly_wait(10) 8 | hotels=[eval(line) for line in open('hotels.txt','r')] 9 | flag=True 10 | for hotel in hotels: 11 | hotel_id=hotel[2].split('.')[0].split('/')[-1] 12 | if hotel_id!='1353810' and flag: 13 | continue 14 | flag=False 15 | page=1 16 | ''' 17 | if hotel_id=='435300': 18 | page=54 19 | ''' 20 | endpage=1000 21 | while page<=endpage: 22 | try: 23 | browser.get('http://hotels.ctrip.com/hotel/dianping/%s_p%st0.html'%(hotel_id,page)) 24 | html=browser.page_source 25 | except: 26 | continue 27 | time.sleep(2) 28 | try: 29 | browser.find_element_by_class_name('comment_tab_main') 30 | comments=BeautifulSoup(html,'lxml').find('div',{'class':'comment_tab_main'}).find_all('div',{'class':'comment_block'}) 31 | except: 32 | continue 33 | if '以下为酒店3年前历史点评' in str(comments): 34 | print('以下为酒店3年前历史点评') 35 | break 36 | f=open('result_2.txt','a') 37 | for line in comments: 38 | f.write(str(hotel+[str(line)])+'\n') 39 | f.close() 40 | print(page,endpage,hotel[0]) 41 | if endpage==1000: 42 | try: 43 | endpage=BeautifulSoup(html,'lxml').find('div',{'class':'c_page_list'}).find_all('a')[-1].get('value') 44 | endpage=int(endpage) 45 | except: 46 | break 47 | page+=1 48 | -------------------------------------------------------------------------------- /www.ctrip.com/youtrip.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Language': 'en-US,en;q=0.5', 9 | 'Accept-Encoding': 'gzip, deflate', 10 | 'Connection': 'keep-alive'} 11 | 12 | def getUrl(): 13 | f=open('urls.txt','a') 14 | page=1 15 | while True: 16 | html=requests.get('http://you.ctrip.com/travels/guilin28/t3-p{}.html'.format(page),headers=headers).text 17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'journalslist cf'}).find_all('a',attrs={'class':'journal-item cf'}) 18 | for item in table: 19 | title=item.find('dt').get_text().replace('\r','').replace('\n','') 20 | f.write(title+'||'+item.get('href')+'\n') 21 | print(page,'--ok') 22 | page+=1 23 | if page==991: 24 | break 25 | time.sleep(2) 26 | f.close() 27 | 28 | def getcontent(url): 29 | html=requests.get(url,headers=headers).text 30 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'ctd_content'}) 31 | body=soup.get_text() 32 | place=soup.find('div',{'class':'ctd_content_controls cf'}).get_text() 33 | result=body.replace(place,'') 34 | return 
result 35 | 36 | 37 | def main(): 38 | excel=xlwt3.Workbook() 39 | sheet=excel.add_sheet('sheet') 40 | count=0 41 | for line in open('urls.txt','r'): 42 | line=line.replace('\n','') 43 | title=line.split('||')[0] 44 | url='http://you.ctrip.com'+line.split('||')[-1] 45 | try: 46 | content=getcontent(url) 47 | except: 48 | failed=open('failed.txt','a') 49 | failed.write(line+'\n') 50 | failed.close() 51 | continue 52 | sheet.write(count,0,count) 53 | sheet.write(count,1,title) 54 | sheet.write(count,2,content) 55 | count+=1 56 | excel.save('result.xls') 57 | time.sleep(2) 58 | print(count,'--ok') 59 | 60 | -------------------------------------------------------------------------------- /www.dicos.com.cn/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | 5 | def citys(): 6 | f=open('citys.txt','a') 7 | for pid in range(6,33): 8 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getcityhtml&iscity=1&pid=%s'%pid).text 9 | table=BeautifulSoup(html,'lxml').find_all('option') 10 | for item in table: 11 | f.write(item.get_text()+'|'+item.get('value')+'\n') 12 | f.close() 13 | 14 | def get_store(citycode): 15 | html=requests.get('http://www.dicos.com.cn/index.php?c=page&m=getstorehtml&waimai=0&mProvince=3&mCity=%s'%citycode).text 16 | table=BeautifulSoup(html,'lxml').find_all('tr') 17 | result=[] 18 | for item in table: 19 | text='' 20 | for td in item.find_all('td')[1:4]: 21 | text+='|'+td.get_text() 22 | result.append(text.replace('\r','').replace('\n','')) 23 | return result 24 | 25 | def main(): 26 | f=open('result.txt','a') 27 | for line in open('citys.txt'): 28 | line=line.replace('\n','') 29 | try: 30 | result=get_store(line.split('|')[-1]) 31 | except: 32 | failed=open('failed.txt','a') 33 | failed.write(line+'\n') 34 | failed.close() 35 | continue 36 | for item in result: 37 | f.write(line+item+'\n') 38 | print(line,'ok') 39 | f.close() 40 | 41 | def write_to_excel(): 42 | result={} 43 | excel=openpyxl.Workbook(write_only=True) 44 | sheet1=excel.create_sheet('1') 45 | for line in open('result.txt','r'): 46 | line=line.replace('\n','') 47 | lists=line.split('|') 48 | try: 49 | result[lists[0]]+=1 50 | except: 51 | result[lists[0]]=1 52 | sheet1.append(lists) 53 | sheet2=excel.create_sheet('2') 54 | for key in result: 55 | sheet2.append([key,result[key]]) 56 | excel.save('result.xlsx') 57 | 58 | write_to_excel() 59 | -------------------------------------------------------------------------------- /www.eastmoney.com/quote.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import json 4 | 5 | 6 | def get_data(code,market): 7 | url='http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/CompatiblePage.aspx?Type=OB&stk=%s&Reference=xml&limit=0&page=%s' 8 | html=requests.get(url%(code+market,1)).text 9 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"')) 10 | if data['pages']==0: 11 | return False 12 | pages=data['pages'] 13 | page=2 14 | result=[] 15 | for item in data['data']: 16 | result.append(item.split(',')) 17 | while page<=pages: 18 | html=requests.get(url%(code+market,page)).text 19 | data=json.loads(html.replace('var jsTimeSharingData=','').replace(';','').replace('pages','"pages"').replace('data','"data"')) 20 | for item in data['data']: 21 | result.append(item.split(',')) 22 | page+=1 23 | return 
result 24 | 25 | def write_to_excel(code,result): 26 | excel=openpyxl.Workbook(write_only=True) 27 | sheet=excel.create_sheet() 28 | for item in result: 29 | sheet.append(item) 30 | excel.save('%s.xlsx'%code) 31 | print(code,'OK') 32 | 33 | def main(): 34 | try: 35 | code=input('输入股票代码:') 36 | except: 37 | print("Faliled") 38 | return 39 | result=[] 40 | for market in ['1','2']: 41 | try: 42 | result=get_data(code,market) 43 | except: 44 | continue 45 | if result==False: 46 | continue 47 | break 48 | if result==[] or result==False: 49 | print('Failed') 50 | return 51 | write_to_excel(code,result) 52 | 53 | while True: 54 | main() -------------------------------------------------------------------------------- /www.eastmoney.com/transaction.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openpyxl 3 | import re 4 | import time 5 | import os 6 | 7 | 8 | def get_data(code,market): 9 | url='http://nufm3.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=%s&sty=DPTTFD&st=z&sr=1&p=1&ps=&cb=&token=beb0a0047196124721f56b0f0ff5a27c' 10 | html=requests.get(url%(code+market)).text 11 | if 'false' in html: 12 | return False 13 | text=re.findall('"(.*?)"',html)[0] 14 | lines=text.split('|') 15 | result=[] 16 | for line in lines: 17 | result.append(line.split('~')) 18 | return result 19 | 20 | def write_to_excel(code,result): 21 | excel=openpyxl.Workbook(write_only=True) 22 | sheet=excel.create_sheet() 23 | for item in result: 24 | sheet.append(item) 25 | try: 26 | os.mkdir('result/'+code) 27 | except: 28 | pass 29 | date=timenow=time.strftime('%Y-%m-%d',time.localtime()) 30 | excel.save('result/'+code+'/%s.xlsx'%date) 31 | 32 | def get_transaction(code): 33 | global result 34 | for market in ['1','2']: 35 | try: 36 | data=get_data(code,market) 37 | except: 38 | continue 39 | if data==False: 40 | continue 41 | break 42 | if data==[] or data==False: 43 | print('Failed') 44 | return 45 | timenow=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime()) 46 | print(timenow,code,'ok') 47 | is_write=False 48 | for line in data: 49 | if line in result: 50 | continue 51 | result.append(line) 52 | is_write=True 53 | if is_write: 54 | write_to_excel(code,result) 55 | 56 | 57 | code=input('输入股票代码:') 58 | result=[] 59 | while True: 60 | get_transaction(code) 61 | time.sleep(0.5) -------------------------------------------------------------------------------- /www.fang.com/new_hourse.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | headers = { 7 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 8 | "Accept-Encoding": "gzip, deflate", 9 | "Accept-Language": "en-US,en;q=0.5", 10 | "Connection": "keep-alive", 11 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 12 | 13 | 14 | def get_house(): 15 | page=1 16 | url='http://newhouse.cs.fang.com/house/s/b9' 17 | while True: 18 | html=requests.get(url+str(page),headers=headers).text.encode('iso-8859-1').decode('gbk') 19 | table=BeautifulSoup(html,'lxml').find('div',{'class':'nhouse_list'}).find_all('li') 20 | f=open('urls.txt','a') 21 | for item in table: 22 | detail=item.find('div',{'class':'nlc_details'}) 23 | house_url=detail.find('a').get('href') 24 | name=detail.find('a').get_text() 25 | address_div=detail.find('div',{'class':'address'}) 26 | address=address_div.find('a').get('title') 27 | 
try: 28 | location=address_div.find('span').get_text() 29 | except: 30 | location='-' 31 | try: 32 | price=detail.find('div',{'class':'nhouse_price'}).find('span').get_text() 33 | except: 34 | price='-' 35 | line=name+'|'+house_url+'|'+price+'|'+location+'|'+address 36 | line=line.replace('\r','').replace('\n','').replace('\t','') 37 | f.write(line+'\n') 38 | f.close() 39 | print(page,'ok') 40 | page+=1 41 | time.sleep(1) 42 | 43 | def get_house_live_history(url): 44 | html=requests.get(url,headers=headers).text.encode('iso-8859-1').decode('gbk') 45 | table=BeautifulSoup(html,'lxml').find('div',id='tc_jiaofang').find_all('tr') 46 | lines=[] 47 | for tr in table[2:-1]: 48 | tds=tr.find_all('td') 49 | date=tds[0].get_text() 50 | month=date.split('-')[1] 51 | infor=tds[1].get_text() 52 | line=month+'|'+date+'|'+infor 53 | lines.append(line.replace('\xa0','')) 54 | return lines 55 | 56 | def house_live_history(): 57 | is_ok=True 58 | for item in open('urls.txt','r'): 59 | item=item.replace('\n','') 60 | url=item.split('|')[1] 61 | if url!='http://jiulongshanjy.fang.com/' and is_ok==True: 62 | continue 63 | is_ok=False 64 | try: 65 | lines=get_house_live_history(url) 66 | except: 67 | lines=[] 68 | print(item) 69 | f=open('changsha.txt','a') 70 | if lines==[]: 71 | f.write(item+'\n') 72 | f.close() 73 | continue 74 | for line in lines: 75 | f.write(item+'|'+line+'\n') 76 | f.close() 77 | time.sleep(1) 78 | 79 | house_live_history() 80 | -------------------------------------------------------------------------------- /www.ganji.com/ganji_tel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import time 7 | import openpyxl 8 | 9 | headers = { 10 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 11 | "Accept-Encoding": "gzip, deflate", 12 | "Accept-Language": "en-US,en;q=0.5", 13 | "Connection": "keep-alive", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | 17 | def get_tels(url): 18 | html=requests.get(url,headers=headers).text 19 | try: 20 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'list'}).find_all('li') 21 | except: 22 | return [] 23 | tels=[] 24 | for li in table: 25 | try: 26 | tel=li.find('div',attrs={'class':'list-r-area'}).find('p',attrs={'class':'tel'}).find('span').get_text() 27 | except: 28 | continue 29 | tels.append(tel) 30 | return tels 31 | 32 | 33 | def main(): 34 | url=input('输入链接:') 35 | url=re.sub('o\d+/','',url) 36 | if not url.startswith('http'): 37 | url='http://'+url 38 | page=1 39 | tels=[] 40 | while True: 41 | try: 42 | result=get_tels(url+'o'+str(page)+'/') 43 | except: 44 | continue 45 | if result==[]: 46 | break 47 | tels+=result 48 | print('第%s页--完成'%page) 49 | page+=1 50 | time.sleep(5) 51 | tels=list(set(tels)) 52 | count=0 53 | excel=openpyxl.Workbook(write_only=True) 54 | sheet=excel.create_sheet() 55 | for tel in tels: 56 | sheet.append([tel]) 57 | excel.save('tels.xls') 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /www.gewara.com/reviews.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import time 5 | 6 | 7 | headers = { 8 | 'X-Requested-With':"XMLHttpRequest", 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | 
"Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | 'Referer':"http://www.gewara.com/movie/282568860", 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def getreviews(page,relatedid): 17 | html=requests.get('http://www.gewara.com/activity/ajax/sns/qryComment.xhtml?pageNumber={}&relatedid={}&topic=&issue=false&hasMarks=true&isCount=true&tag=movie&isPic=true&isVideo=false&userLogo=&newWalaPage=true&isShare=false&isNew=true&maxCount=200&isWide=true&isTicket=false'.format(page,relatedid),headers=headers).text 18 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'page_wala'}) 19 | result=[] 20 | for item in table: 21 | try: 22 | grade=item.find('span',{'class':'ui_grades left ui_grade10'}).get('title') 23 | reviewid=item.find('div',{'class':'wala_txt'}).get('data-id') 24 | if reviewid==None: 25 | review=item.find('div',{'class':'wala_miniTxt'}).get_text().replace('\r','').replace('\n','').replace('\t','') 26 | result.append({'grade':grade,'review':review}) 27 | continue 28 | result.append({'grade':grade,'id':reviewid}) 29 | except: 30 | continue 31 | return result 32 | 33 | def getcontent(id): 34 | html=requests.get('http://www.gewara.com/activity/sns/ajaxCommentDetail.xhtml?id=%s&isNew=true'%id).text 35 | text=BeautifulSoup(html,'lxml').get_text().replace('\r','').replace('\n','').replace('\t','') 36 | return text 37 | 38 | def write_to_excel(): 39 | excel=openpyxl.Workbook(write_only=True) 40 | sheet=excel.create_sheet() 41 | for line in open('result.txt','r'): 42 | item=eval(line) 43 | sheet.append([item['grade'],item['review']]) 44 | excel.save('result.xlsx') 45 | 46 | def main(): 47 | f=open('result.txt','a') 48 | page=1 49 | count=0 50 | while True: 51 | try: 52 | result=getreviews(page,'282568860') 53 | except: 54 | print('failed') 55 | time.sleep(3) 56 | continue 57 | for item in result: 58 | try: 59 | dataid=item['id'] 60 | except: 61 | count+=1 62 | print(count) 63 | f.write(str(item)+'\n') 64 | continue 65 | try: 66 | review=getcontent(dataid) 67 | except: 68 | continue 69 | item['review']=review 70 | f.write(str(item)+'\n') 71 | count+=1 72 | print(count) 73 | time.sleep(0.5) 74 | print(page,'ok') 75 | page+=1 76 | if page==200: 77 | break 78 | f.close() 79 | 80 | write_to_excel() 81 | -------------------------------------------------------------------------------- /www.imdb.com/boxoffice.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_url(title): 15 | name=re.sub('\(.*?\)','',title)#.lower()#.replace(' ','') 16 | html=requests.get('http://www.boxofficemojo.com/search/?q=%s'%name,headers=headers).text.replace('\r','').replace('\n','').replace('\t','') 17 | rel='bgcolor=#FFFF99>(.*?)' 18 | tr=re.findall(rel,html)[0]#BeautifulSoup(html,'lxml').find('tr',attrs={'bgcolor':'#FFFF99'}) 19 | tds=BeautifulSoup(str(tr),'lxml').find_all('td') 20 | #tds=tr.findall('td') 21 | url='http://www.boxofficemojo.com'+tds[0].find('a').get('href') 22 | de=tds[2].get_text() 23 | html=requests.get(url,headers=headers).text 
24 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'mp_box_content'}).get_text().replace('\r','|').replace('\n','|') 25 | print(table) 26 | line=de 27 | rel='Worldwide:\|(.*?)\|' 28 | try: 29 | wl=re.findall(rel,table)[0] 30 | except: 31 | wl='-' 32 | line=de+'||'+wl 33 | return line 34 | 35 | def main(): 36 | f=open('data.txt','w') 37 | for line in open('new.txt','r').readlines(): 38 | line=line.replace('\r','').replace('\n','') 39 | try: 40 | price=get_url(line) 41 | except: 42 | price='--||--' 43 | f.write(line+'||'+price+'\n') 44 | print(price) 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /www.imdb.com/movies.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | from selenium import webdriver 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | def get_movies(): 16 | f=open('data_movies2013.txt','a') 17 | start=1 18 | while start<8519: 19 | try: 20 | html=requests.get('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&start=%s&title_type=feature&year=2013,2013'%start,headers=headers,timeout=30).text 21 | except: 22 | continue 23 | items=parser(html) 24 | for item in items: 25 | f.write(item+'\n') 26 | start+=50 27 | print(start) 28 | 29 | def parser(html): 30 | items=[] 31 | table=BeautifulSoup(html,'lxml').find('table',attrs={'class':'results'}).find_all('tr')[1:] 32 | for item in table: 33 | td=item.find('td',attrs={'class':'title'}) 34 | title=item.find('a').get('title') 35 | try: 36 | score=td.find('span',attrs={'class':'rating-rating'}).get_text() 37 | except: 38 | score='-' 39 | try: 40 | col=item.find('td',attrs={'class':'sort_col'}).get_text() 41 | except: 42 | col='-' 43 | text=title+'||'+score+'||'+col 44 | items.append(text) 45 | return items 46 | 47 | get_movies() 48 | -------------------------------------------------------------------------------- /www.imdb.com/rottentomatoes.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import threading 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | class Score(threading.Thread): 16 | def __init__(self,line): 17 | super(Score,self).__init__() 18 | self.line=line 19 | self.name=self.line.split('||')[0] 20 | 21 | def run(self): 22 | try: 23 | self.score=self.get_score(self.name) 24 | except: 25 | self.score='-' 26 | print(self.score) 27 | self.line=self.line+'||'+self.score 28 | 29 | def get_score(self,name): 30 | try: 31 | html=requests.get('http://www.rottentomatoes.com/search/?search=%s'%name.replace(' ','+'),headers=headers,timeout=40).text 32 | except: 33 | return self.get_score(name) 34 | try: 35 | table=BeautifulSoup(html,'lxml').find('ul',id='movie_results_ul').find_all('li') 36 | except: 37 | return score(html) 38 | url='' 39 | for li in table: 40 | 
title=li.find('div',attrs={'class':'nomargin media-heading bold'}).get_text().replace('\r','').replace('\n','').replace(' ','') 41 | if title.lower()==name.replace(' ','').lower(): 42 | url='http://www.rottentomatoes.com'+li.find('a').get('href') 43 | break 44 | if(url==''): 45 | return '-' 46 | html=requests.get(url,headers=headers,timeout=40).text 47 | return score(html) 48 | 49 | def score(html): 50 | text=BeautifulSoup(html,'lxml').find('div',id='scorePanel').get_text().replace('\r','').replace('\n','').replace(' ','') 51 | rel='AverageRating:(.*?)R' 52 | try: 53 | result=re.findall(rel,text)[0] 54 | return result 55 | except: 56 | return '-' 57 | 58 | 59 | def main(): 60 | f=open('movies_2013.txt','a') 61 | items=[] 62 | for line in open('data_movies2013.txt','r').readlines(): 63 | line=line.replace('\n','') 64 | items.append(line) 65 | if(len(items)<40): 66 | continue 67 | threadings=[] 68 | for item in items: 69 | work=Score(item) 70 | threadings.append(work) 71 | for work in threadings: 72 | work.start() 73 | for work in threadings: 74 | work.join() 75 | for work in threadings: 76 | f.write(work.line+'\n') 77 | items=[] 78 | threadings=[] 79 | 80 | for item in items: 81 | work=Score(item) 82 | threadings.append(work) 83 | for work in threadings: 84 | work.start() 85 | for work in threadings: 86 | work.join() 87 | for work in threadings: 88 | f.write(work.line+'\n') 89 | f.close() 90 | 91 | main() 92 | -------------------------------------------------------------------------------- /www.itjuzi.com/baseInvestevents.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | import time 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_infor(url): 16 | html=requests.get(url,headers=headers,timeout=50).text 17 | results=[] 18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li') 19 | for li in table: 20 | item={} 21 | i=li.find_all('i') 22 | item['date']=i[0].get_text().replace('\n','').replace('\t','') 23 | spans=i[2].find_all('span') 24 | item['name']=spans[0].get_text().replace('\n','').replace('\t','') 25 | item['industry']=spans[1].get_text().replace('\n','').replace('\t','') 26 | item['local']=spans[2].get_text().replace('\n','').replace('\t','') 27 | item['round']=i[3].get_text().replace('\n','').replace('\t','') 28 | item['capital']=i[4].get_text().replace('\n','').replace('\t','') 29 | companys=i[5].find_all('a') 30 | Investmenters='' 31 | if(companys==[]): 32 | Investmenters=i[5].get_text().replace('\n','').replace('\t','') 33 | else: 34 | for a in companys: 35 | Investmenters+=a.get_text().replace('\n','').replace('\t','')+';' 36 | item['Investmenters']=Investmenters 37 | results.append(item) 38 | return results 39 | 40 | def main(): 41 | excel=xlwt3.Workbook() 42 | sheet=excel.add_sheet('sheet') 43 | count=0 44 | startpage=1 45 | keys=['date','name','industry','local','round','capital','Investmenters'] 46 | while startpage<1143: 47 | try: 48 | results=get_infor('https://www.itjuzi.com/investevents?page=%s'%startpage) 49 | except: 50 | time.sleep(5) 51 | continue 52 | for item in results: 53 | num=0 54 | for key in 
keys: 55 | sheet.write(count,num,item[key]) 56 | num+=1 57 | count+=1 58 | print(startpage,'--ok') 59 | startpage+=1 60 | time.sleep(3) 61 | excel.save('investevents.xls') 62 | main() 63 | -------------------------------------------------------------------------------- /www.itjuzi.com/companylist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import openpyxl 5 | 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 9 | 'Accept-Language': 'en-US,en;q=0.5', 10 | 'Accept-Encoding': 'gzip, deflate', 11 | 'Connection': 'keep-alive'} 12 | 13 | def get_companylist(page): 14 | html=requests.get('http://www.itjuzi.com/company?page=%s'%page,headers=headers,timeout=30).text 15 | table=BeautifulSoup(html,'html.parser').find_all('ul',{'class':'list-main-icnset'})[1].find_all('li') 16 | if len(table)==0: 17 | return [] 18 | result=[] 19 | for li in table: 20 | try: 21 | img=li.find('img').get('src').split('?')[0] 22 | title=li.find('p',{'class':'title'}).get_text() 23 | url=li.find('a').get('href') 24 | des=li.find('p',{'class':'des'}).get_text() 25 | tags=li.find('span',{'class':'tags'}).get_text() 26 | loca=li.find('span',{'class':'loca'}).get_text() 27 | date=li.find('i',{'class':'date'}).get_text() 28 | round=li.find('i',{'class':'round'}).get_text() 29 | except: 30 | continue 31 | result.append([img,title,url,des,tags,loca,date,round]) 32 | return result 33 | 34 | def write_to_excel(result): 35 | excel=openpyxl.Workbook(write_only=True) 36 | sheet=excel.create_sheet() 37 | filename=time.strftime("%Y%m%d_%H%M%S",time.localtime())+'.xlsx' 38 | for line in result: 39 | sheet.append(line) 40 | excel.save(filename) 41 | 42 | def loadcompany(): 43 | companys=[] 44 | for line in open('result.txt','r',encoding='utf-8'): 45 | companys.append(line.replace('\r','').replace('\n','')) 46 | return companys 47 | 48 | def main(): 49 | try: 50 | companys=loadcompany() 51 | except: 52 | companys=[] 53 | page=1 54 | f=open('result.txt','w',encoding='utf-8') 55 | flag=False 56 | new_list=[] 57 | while True: 58 | try: 59 | result=get_companylist(page) 60 | except: 61 | time.sleep(5) 62 | continue 63 | if result==[]: 64 | break 65 | for item in result: 66 | line='||'.join(item) 67 | line=line.replace('\r','').replace('\n','').replace('\t','') 68 | if line in companys: 69 | flag=True 70 | break 71 | new_list.append(item) 72 | f.write(line+'\r\n') 73 | if flag: 74 | break 75 | print(page,'ok') 76 | page+=1 77 | time.sleep(3) 78 | for company in companys: 79 | f.write(company+'\r\n') 80 | f.close() 81 | write_to_excel(new_list) 82 | 83 | main() 84 | -------------------------------------------------------------------------------- /www.itjuzi.com/investevents.py: -------------------------------------------------------------------------------- 1 | #codnig:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def get_base_infor(): 15 | f=open('data.txt','a') 16 | for page in range(1048): 17 | 
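# (annotation, not in the original file) Unlike baseInvestevents.py above, this loop issues
# roughly 1048 requests with no retry, timeout or delay. A minimal retry wrapper that could
# be dropped in here (a sketch only; the name and retry policy are assumptions):
#
#   import time
#
#   def fetch(url, retries=3, wait=5):
#       for _ in range(retries):
#           try:
#               return requests.get(url, headers=headers, timeout=30).text
#           except requests.RequestException:
#               time.sleep(wait)
#       raise RuntimeError('giving up on ' + url)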
html=requests.get('https://www.itjuzi.com/investevents?page=%s'%(page+1),headers=headers).text 18 | table=BeautifulSoup(html,'html.parser').find_all('ul',attrs={'class':'list-main-eventset'})[1].find_all('li') 19 | for li in table: 20 | item={} 21 | i=li.find_all('i') 22 | item['date']=i[0].get_text() 23 | item['url']=i[1].find('a').get('href') 24 | spans=i[2].find_all('span') 25 | item['name']=spans[0].get_text() 26 | item['industry']=spans[1].get_text() 27 | item['local']=spans[2].get_text() 28 | item['round']=i[3].get_text() 29 | item['capital']=i[4].get_text() 30 | companys=i[5].find_all('a') 31 | lists=[] 32 | if(companys==[]): 33 | lists.append(i[5].get_text()) 34 | else: 35 | for a in companys: 36 | lists.append(a.get_text()) 37 | item['Investmenters']=lists 38 | f.write(str(item)+'\n') 39 | print(page) 40 | 41 | def main(): 42 | f=open('data.txt','r') 43 | data_f=open('investevents.txt','a') 44 | failed_f=open('failed.txt','a') 45 | for line in f.readlines(): 46 | try: 47 | item=eval(line.replace('\n','')) 48 | html=requests.get(item['url'],headers=headers).text 49 | url=BeautifulSoup(html,'lxml').find('div',attrs={'class':'block-inc-fina'}).find('a',attrs={'class':'incicon'}).get('href') 50 | html=requests.get(url,headers=headers).text 51 | soup=BeautifulSoup(html,'lxml').find('div',attrs={'class':'thewrap'}) 52 | table=soup.find('div',attrs={'class':'sec'}) 53 | company_url=table.find('div',attrs={'class':'rowhead'}).find('div',attrs={'class':'row c-gray-aset'}).find('div',attrs={'class':'dbi linkset c-gray'}).find('a').get('href') 54 | tags=[] 55 | for a in table.find('div',attrs={'class':'rowfoot'}).find('div',attrs={'class':'tagset dbi'}).find_all('a'): 56 | tags.append(a.get_text()) 57 | des=soup.find('div',attrs={'class':'block block-inc-info'}).find('div',attrs={'class':'des'}).get_text() 58 | item['company_url']=company_url 59 | item['tags']=tags 60 | item['des']=des 61 | data_f.write(str(item)+'\n') 62 | print(item['url']) 63 | except: 64 | failed_f.write(line) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /www.jisilu.com/jisilu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import openpyxl 4 | import time 5 | 6 | headers = { 7 | 'Host':"www.jisilu.cn", 8 | 'Accept':"application/json, text/javascript, */*; q=0.01", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | 'Content-Type':"application/x-www-form-urlencoded; charset=UTF-8", 13 | 'X-Requested-With':"XMLHttpRequest", 14 | 'Cookie':"kbzw__Session=4sv8h9vjir144ijdh02h4nefd0; Hm_lvt_164fe01b1433a19b507595a43bf58262=1468934580; Hm_lpvt_164fe01b1433a19b507595a43bf58262=1468935752; kbz_newcookie=1; kbzw__user_login=7Obd08_P1ebax9aX5dvi0OXc5ZmcndHV7Ojg6N7bwNOM2KjZqpmgw6feqM6upamTqJmt3KbbkaKU17HXoNql2ZiXnKTs3Ny_zYylr6qgspyYnaO2uNXQo67f293l4cqooaWSlonPqKSzgcXD6efp3rSMw8vk1u-X67CXz5eotJXb76arlqSRoJe63cTb0KOrpZqpnKiSp4G94OXdx9_Zo62pl6k.", 15 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 16 | 17 | def login(): 18 | logindata=open('user','r',encoding='utf-8').read().replace('\r','').replace('\n','') 19 | logindata=eval(logindata) 20 | data={ 21 | 'user_name':logindata['user_name'], 22 | 'password':logindata['password'], 23 | 'net_auto_login':1, 24 | '_post_type':'ajax', 25 | 'return_url':'https://www.jisilu.cn' 26 | } 27 | session=requests.session() 28 | 
session.post('https://www.jisilu.cn/account/ajax/login_process/',data=data).text 29 | return session 30 | 31 | def getdata(): 32 | data={ 33 | 'is_search':"0", 34 | 'avolume':"100", 35 | 'bvolume':"100", 36 | 'market':["sh","sz"], 37 | 'ptype':"price", 38 | 'rp':"50", 39 | 'page':"1" 40 | } 41 | session=login() 42 | timestr=str(time.time()).replace('.','') 43 | html=session.post('https://www.jisilu.cn/data/sfnew/arbitrage_vip_list/?___t=%s'%timestr,data=data).text 44 | data=json.loads(html)['rows'] 45 | print(data[0]) 46 | write_to_excel(data) 47 | print('OK') 48 | 49 | def write_to_excel(data): 50 | keys=['fundA_id','fundA_nm','sell1A','increase_rtA','fundA_volume','fundA_amount_increase', 51 | 'fundB_id','fundB_nm','sell1B','increase_rtB','fundB_volume','fundB_amount_increase', 52 | 'abrate','merge_price','est_dis_rt','base_fund_id','base_fund_nm','base_nav','base_est_val', 53 | 'index_nm','idx_incr_rt','asset_ratio','asset_ratio_last','apply_fee','redeem_fee'] 54 | excel=openpyxl.Workbook(write_only=True) 55 | sheet=excel.create_sheet() 56 | for item in data: 57 | cell=[] 58 | for key in keys: 59 | try: 60 | cell.append(item['cell'][key]) 61 | except: 62 | cell.append('-') 63 | sheet.append(cell) 64 | excel.save('result.xlsx') 65 | 66 | while True: 67 | try: 68 | getdata() 69 | except: 70 | print('Failed') 71 | continue 72 | time.sleep(10) 73 | break 74 | -------------------------------------------------------------------------------- /www.kfc.com/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import json 4 | import time 5 | import openpyxl 6 | 7 | def citys(): 8 | html=open('index.html','r').read() 9 | table=BeautifulSoup(html,'lxml').find('ul',{'class':'city_info'}).find_all('li') 10 | f=open('citys.txt','w') 11 | for li in table: 12 | for item in li.find_all('a'): 13 | f.write(item.get_text()+'\n') 14 | f.close() 15 | 16 | def get_store(city): 17 | result=[] 18 | page=1 19 | while True: 20 | data={ 21 | 'cname':city, 22 | 'pid':"", 23 | 'pageIndex':page, 24 | 'pageSize':"100" 25 | } 26 | html=requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname',data=data).text 27 | stores=json.loads(html)['Table1'] 28 | if stores==[]: 29 | break 30 | page+=1 31 | for item in stores: 32 | result.append(item['storeName']+'|'+item['cityName']+'|'+item['addressDetail']+'|'+item['pro']) 33 | time.sleep(1) 34 | return result 35 | 36 | 37 | def main(): 38 | f=open('result.txt','a') 39 | for line in open('citys.txt','r'): 40 | city=line.replace('\n','') 41 | try: 42 | result=get_store(city) 43 | except: 44 | failed=open('failed.txt','a') 45 | failed.write(city+'\n') 46 | failed.close() 47 | continue 48 | for item in result: 49 | f.write(item+'\n') 50 | print(city,'ok') 51 | f.close() 52 | 53 | def write_to_excel(): 54 | result={} 55 | excel=openpyxl.Workbook(write_only=True) 56 | sheet1=excel.create_sheet('1') 57 | for line in open('result.txt','r'): 58 | line=line.replace('\n','') 59 | lists=line.split('|') 60 | lists[0]=lists[0]+'餐厅' 61 | try: 62 | result[lists[1]]+=1 63 | except: 64 | result[lists[1]]=1 65 | sheet1.append(lists) 66 | sheet2=excel.create_sheet('2') 67 | for key in result: 68 | sheet2.append([key,result[key]]) 69 | excel.save('result.xlsx') 70 | 71 | write_to_excel() 72 | -------------------------------------------------------------------------------- /www.kimiss.com/Nyspider.py: -------------------------------------------------------------------------------- 1 | 
#coding:utf-8 2 | 3 | import requests 4 | import os 5 | import sqlite3 6 | 7 | headers = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | def get_html(url): 15 | html=requests.get(url,headers=headers).text 16 | return html 17 | 18 | 19 | def get_image(image_url,image_name): 20 | content=requests.get(image_url,headers=headers).content 21 | with open(image_name,'wb') as f: 22 | f.write(content) 23 | f.close 24 | -------------------------------------------------------------------------------- /www.kimiss.com/man.txt: -------------------------------------------------------------------------------- 1 | {'男士面部护理': ['http://product.kimiss.com/nanshirunchungao2/', 'http://product.kimiss.com/nanshiyanbujinghua2/', 'http://product.kimiss.com/nanshiyanshuang2/', 'http://product.kimiss.com/nanshiruye/', 'http://product.kimiss.com/nanshijiemian/', 'http://product.kimiss.com/nanshishuangfushui/', 'http://product.kimiss.com/nanshijinghua/', 'http://product.kimiss.com/nanshimianmo/', 'http://product.kimiss.com/nanshifangshai/', 'http://product.kimiss.com/nanshitaozhuang/', 'http://product.kimiss.com/nanshimianbuqujiaozhi/']} 2 | {'男士身体护理': ['http://product.kimiss.com/nanshimuyulu/', 'http://product.kimiss.com/nanshishuangshenxiangtipin/', 'http://product.kimiss.com/nanshirunfuru/', 'http://product.kimiss.com/nanshixiantichanpin/', 'http://product.kimiss.com/nanshishentimoshagao/', 'http://product.kimiss.com/nanshisichuhuli/']} 3 | {'男士剃须护理': ['http://product.kimiss.com/tixudao/', 'http://product.kimiss.com/xuhouhuli/', 'http://product.kimiss.com/xuqianhuli/']} 4 | {'男士美发护发': ['http://product.kimiss.com/nanshitoufazaoxing/', 'http://product.kimiss.com/nanshixifa/', 'http://product.kimiss.com/nanshirunfa/']} 5 | {'男士面部彩妆': ['http://product.kimiss.com/nanshibbshuang/', 'http://product.kimiss.com/nanshifendi/', 'http://product.kimiss.com/nanshigelishuang/', 'http://product.kimiss.com/nanshizhexia/', 'http://product.kimiss.com/nanshijiemaogao/', 'http://product.kimiss.com/nanshisanfen/']} 6 | -------------------------------------------------------------------------------- /www.lagou.com/lagou.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import json 4 | import time 5 | from write_sql import write2sqlite 6 | from bs4 import BeautifulSoup 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | 15 | def get_jobs(keyword): 16 | jobs=[] 17 | page=1 18 | while True: 19 | js_data=requests.get('http://www.lagou.com/jobs/positionAjax.json?px=new&kd=%s&pn=%s&'%(keyword,page),headers=headers).text 20 | data=json.loads(js_data) 21 | data=data['content']['positionResult']['result'] 22 | for item in data: 23 | job={} 24 | job['fromsite']='拉勾' 25 | job['id']=item['positionId'] 26 | job['companyId']=item['companyId'] 27 | job['positionType']=keyword 28 | job['positionName']=item['positionName'] 29 | job['company']=item['companyFullName'] 30 | job['salary']=item.get('salary') 31 | job['workYear']=item['workYear'] 32 | 
job['education']=item['education'] 33 | job['industryField']=item['industryField'] 34 | job['companySize']=item['companySize'] 35 | job['city']=item['city'] 36 | job['financeStage']=item['financeStage'] 37 | jobs.append(job) 38 | print(page,keyword,'ok') 39 | page+=1 40 | if page==31: 41 | break 42 | time.sleep(1) 43 | return jobs 44 | 45 | def get_job_des(jobid): 46 | url='http://www.lagou.com/jobs/%s.html'%jobid 47 | html=requests.get(url,headers=headers,timeout=30).text 48 | des=BeautifulSoup(html,'lxml').find('dd',{'class':'job_bt'}).get_text() 49 | return des 50 | 51 | def get_company_rate(companyid): 52 | url='http://www.lagou.com/gongsi/%s.html'%(companyid) 53 | html=requests.get(url,headers=headers,timeout=30).text 54 | rate=BeautifulSoup(html,'lxml').find('div',{'class':'reviews-top'}).find('span',{'class':'score'}).get_text() 55 | return rate 56 | 57 | def main(): 58 | keywords=[line.replace('\n','') for line in open('type.txt','r')] 59 | for keyword in keywords: 60 | jobs=get_jobs(keyword) 61 | result=[] 62 | for job in jobs: 63 | try: 64 | des=get_job_des(job['id']) 65 | except: 66 | des='-' 67 | try: 68 | rate=get_company_rate(job['companyId']) 69 | except: 70 | rate='-' 71 | job['jobDes']=des 72 | job['rate']=rate 73 | result.append(job) 74 | time.sleep(1) 75 | write2sqlite(result,keyword) 76 | print(keyword,'ok') 77 | main() 78 | -------------------------------------------------------------------------------- /www.locoso.com/locoso.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def get_citys(): 15 | url='http://www.locoso.com/s2/js/topcity.js' 16 | html=requests.get(url,headers=headers).text.replace('\\"','') 17 | table=BeautifulSoup(html,'lxml') 18 | lists=table.find_all('div',attrs={'class':'pro_bt'}) 19 | f=open('citys.txt','a') 20 | root={} 21 | rel='prcity2(.*?)"' 22 | rel=re.compile(rel) 23 | citys={} 24 | for item in lists: 25 | try: 26 | root[str(item.get_text())]=eval(rel.findall(str(item))[0])[0] 27 | except: 28 | continue 29 | dicts={} 30 | for i in table.find('div',id=item.get('id')+'_2').find_all('li'): 31 | dicts[str(i.get_text())]=eval(rel.findall(str(i))[0])[0] 32 | citys[str(item.get_text())]=dicts 33 | for key in citys: 34 | for city in citys[key]: 35 | qu={} 36 | url='http://www.locoso.com/search/-all/c'+citys[key][city] 37 | html=requests.get(url,headers=headers).text 38 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'xiaofenlei_zhong02c2'}).find_all('li') 39 | dicts={} 40 | for i in table: 41 | dicts[i.find('a').get('title')]=i.find('a').get('href').replace('/search/-all/c','') 42 | qu[city]=dicts 43 | f.write(str(qu)+'\n') 44 | print(city) 45 | 46 | def get_industry(): 47 | html=requests.get('www.locoso.com/search/-all/',headers=headers) 48 | 49 | get_citys() 50 | -------------------------------------------------------------------------------- /www.mohurd.gov.cn/deal.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | 5 | def load_level(): 6 | level={} 7 | for line in open('Cost_qualification.txt','r'): 8 | 
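# (annotation, not in the original file) NOTE: deal() further down calls loadLevel(), but
# this function is defined as load_level(); the two names must agree (rename one of them)
# or the script stops with a NameError before it processes result.txt.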
line=line.replace('\n','').split('\t') 9 | print(line) 10 | level[line[0]]=line[1] 11 | return level 12 | 13 | def deal(): 14 | keys=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表','工程监理资质','招标代理','造价咨询','一级注册建筑师','二级注册建筑师' 15 | ,'一级注册结构工程师','二级注册结构工程师','注册土木工程师(岩土)','注册公用设备工程师(暖通空调)','注册公用设备工程师(给水排水)','注册公用设备工程师(动力)' 16 | ,'注册公用设备工程师(发输变电)','注册公用设备工程师(供配电)','注册化工工程师','监理工程师','一级建造师','二级建造师','造价工程师'] 17 | keys_two=['姓名','性别','民族','学历','name','所属省市','联系地址','法人代表'] 18 | keys_three=['工程监理资质','招标代理','造价咨询','监理工程师','一级建造师','二级建造师'] 19 | f=open('data.txt','w') 20 | level=loadLevel() 21 | for line in open('result.txt','r'): 22 | person={} 23 | item=eval(line) 24 | for key in keys: 25 | if key not in item: 26 | person[key]='N' 27 | else: 28 | person[key]='Y' 29 | for key in keys_two: 30 | person[key]=item[key] 31 | for key in keys_three: 32 | text='' 33 | try: 34 | for i in item[key]: 35 | if i not in text: 36 | text+=i+',' 37 | person[key]=text[:-1] 38 | except: 39 | person[key]=text 40 | try: 41 | person['造价咨询']=level[item['name']] 42 | except: 43 | person['造价咨询']='-' 44 | text='' 45 | for key in keys: 46 | text+=person[key]+' ||' 47 | f.write(text+'\n') 48 | f.close() 49 | 50 | deal() 51 | -------------------------------------------------------------------------------- /www.mohurd.gov.cn/registrarinfor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | headers = { 6 | 'Host':"210.12.219.18", 7 | 'X-Requested-With':"XMLHttpRequest", 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Referer':"http://210.12.219.18/jianguanfabuweb/companies.html", 13 | 'Cookie':"ASP.NET_SessionId=evkmapz1ljljsqh54siborwj", 14 | 'Connection': 'keep-alive'} 15 | 16 | def get_infor(item): 17 | url='http://210.12.219.18/jianguanfabuweb/'+item['url'] 18 | html=requests.get(url,headers=headers,timeout=30).text 19 | soup=BeautifulSoup(html,'lxml').find('div',{'class':'content'}) 20 | basic=soup.find('table',{'class':'engineer_basic_infor_table'}).get_text().replace('\r','').replace('\n','').replace(' ','') 21 | basic_re='姓名:(.*?)民族:(.*?)性别:(.*?)手.*?学历:(.*?)学位' 22 | basicinfor=re.findall(basic_re,basic)[0] 23 | item['姓名']=basicinfor[0] 24 | item['民族']=basicinfor[1] 25 | item['性别']=basicinfor[2] 26 | item['学历']=basicinfor[3] 27 | zhengshu=soup.find_all('div',{'class':'zhengshu'}) 28 | for div in zhengshu: 29 | header=div.find('div',{'class':'zhengshu_head'}).get_text() 30 | profess=div.find('table').find_all('td')[-1].get_text().split(',') 31 | item[header]=profess 32 | return item 33 | 34 | 35 | def main(): 36 | f=open('result.txt','a') 37 | count=0 38 | for line in open('person.txt','r').readlines(): 39 | count+=1 40 | person=eval(line.replace('\n','')) 41 | try: 42 | item=get_infor(person) 43 | except: 44 | failed=open('person_failed.txt','a') 45 | failed.write(line) 46 | failed.close() 47 | print(person['name'],'failed') 48 | continue 49 | print(count) 50 | f.write(str(item)+'\n') 51 | f.close() 52 | 53 | main() 54 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/gethtml.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import os 3 | import time 4 | 5 | def main(): 6 | 
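# (annotation, not in the original file) This script drives Firefox through the PubMed
# result pages and saves each page source. It uses the old find_element_by_xpath API,
# which was deprecated and later removed in Selenium 4; with a current Selenium the
# equivalent call (same XPath as below, sketch only) would be:
#
#   from selenium.webdriver.common.by import By
#   browser.find_element(By.XPATH, xpath).click()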
browser=webdriver.Firefox() 7 | browser.get('http://www.ncbi.nlm.nih.gov/pubmed') 8 | input('OK?') 9 | browser.implicitly_wait(10) 10 | count=0 11 | while True: 12 | html=browser.page_source 13 | f=open('html/%s.html'%count,'w') 14 | f.write(html) 15 | f.close() 16 | browser.find_element_by_xpath("//a[@id='EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page' and @sid=3]").click() 17 | time.sleep(5) 18 | count+=1 19 | if count==5330: 20 | break 21 | 22 | main() 23 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/parser.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def parser(): 5 | files=[] 6 | for filename in os.listdir('html'): 7 | files.append(filename) 8 | files.sort(key=lambda x:int(x.replace('.html',''))) 9 | f=open('result.txt','a') 10 | for filename in files: 11 | html=open('html/'+filename,'r').read() 12 | try: 13 | table=BeautifulSoup(html,'lxml').find('div',{'class':'rprt_all'}).find_all('div',{'class':"rprt abstract"}) 14 | except: 15 | continue 16 | for item in table: 17 | cit=item.find('div',{'class':'cit'}) 18 | try: 19 | periodical=cit.find('a').get_text() 20 | except: 21 | periodical='-' 22 | try: 23 | date=cit.get_text().replace(periodical,'') 24 | except: 25 | date='-' 26 | try: 27 | title=item.find('h1').get_text() 28 | except: 29 | continue 30 | try: 31 | auths=item.find('div',{'class':'auths'}).find_all('a') 32 | except: 33 | auths=[] 34 | auth_num=len(auths) 35 | auth_name='' 36 | for a in auths: 37 | auth_name+=a.get_text()+';' 38 | try: 39 | afflist=item.find('div',{'class':'afflist'}).find_all('li') 40 | except: 41 | afflist='' 42 | auth_infor='' 43 | for li in afflist: 44 | auth_infor+=li.get_text()+'||' 45 | try: 46 | abstract=item.find('div',{'class':'abstr'}).get_text() 47 | except: 48 | abstract='' 49 | try: 50 | pmid=item.find('div',{'class':'aux'}).find('a',{'ref':'aid_type=pmid'}).get_text() 51 | except: 52 | pmid='-' 53 | f.write(str([pmid,periodical,date,title,auth_num,auth_name,auth_infor,abstract])+'\r\n') 54 | print(filename,'-ok') 55 | f.close() 56 | parser() 57 | -------------------------------------------------------------------------------- /www.ncbi.nlm.nih.gov/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | def write_to_excel(): 4 | excel=openpyxl.Workbook(write_only=True) 5 | sheet=excel.create_sheet() 6 | count=0 7 | filecount=1 8 | exist=[] 9 | for line in open('result.txt','r'): 10 | line=line.replace('\r\n','') 11 | item=eval(line) 12 | if item[0] in exist: 13 | continue 14 | exist.append(item[0]) 15 | sheet.append(item) 16 | count+=1 17 | print(count) 18 | if count%100000==0: 19 | excel.save('%s.xlsx'%filecount) 20 | filecount+=1 21 | excel=openpyxl.Workbook(write_only=True) 22 | sheet=excel.create_sheet() 23 | excel.save('%s.xlsx'%filecount) 24 | 25 | write_to_excel() 26 | -------------------------------------------------------------------------------- /www.pizzahut.com.cn/storelist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import urllib 4 | import openpyxl 5 | 6 | def citys(): 7 | html=open('index.html','r').read() 8 | table=BeautifulSoup(html,'lxml').find_all('div',{'class':'city_window'})[1].find_all('a') 9 | f=open('citys.txt','w') 10 | for item in table: 11 | f.write(item.get_text()+'\n') 12 | 
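# (annotation, not in the original file) get_store() below sends a hard-coded Cookie header
# (including a __RequestVerificationToken) with the city injected into the iplocation
# field. Such tokens are session-bound and expire, so they typically have to be refreshed
# from a live browser session before the POST to /StoreList/Index succeeds.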
f.close() 13 | 14 | def get_store(city): 15 | city=urllib.parse.quote(city) 16 | headers = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Accept-Encoding': 'gzip, deflate', 20 | 'Cookie':"NSC_CX_QfstjtufodzHspvq=ffffffff09320b0745525d5f4f58455e445a4a423660; _u_=1; __RequestVerificationToken=tOMoZty3Jp6D53oSF-NqlfyAlPa0sRNndZ7PNG5iPrWgM_ngcVFEOP79uEvHJGuqlHDoAA3WDd1MN9QA8ZEhpurYLA0WSkuyswlEO9Nj9oqeMWnu84Q1fyQQYx5-vjq-73NNZXJJLcF9jq3fjB_dsw2; iplocation={}%7C0%7C0".format(city), 21 | 'User-Agent':"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0", 22 | 'Connection': 'keep-alive'} 23 | page=1 24 | result=[] 25 | while True: 26 | data={ 27 | 'pageIndex':page, 28 | 'pageSize':"100", 29 | 'keyword':"输入餐厅地址或餐厅名称" 30 | } 31 | html=requests.post('http://www.pizzahut.com.cn/StoreList/Index',headers=headers,data=data).text 32 | soup=BeautifulSoup(html,'lxml').find_all('li') 33 | items=[] 34 | for li in soup: 35 | item='' 36 | try: 37 | for p in li.find('div',{'class':'re_RNew'}).find_all('p'): 38 | item+='|'+p.get_text() 39 | except: 40 | continue 41 | items.append(item) 42 | if items==[]: 43 | break 44 | result+=items 45 | page+=1 46 | return result 47 | 48 | def main(): 49 | f=open('result.txt','a') 50 | for line in open('citys.txt','r'): 51 | city=line.replace('\n','') 52 | try: 53 | result=get_store(city) 54 | except: 55 | failed=open('failed.txt','a') 56 | failed.write(city+'\n') 57 | failed.close() 58 | continue 59 | for item in result: 60 | f.write(city+item+'\n') 61 | print(city,'ok') 62 | f.close() 63 | 64 | def write_to_excel(): 65 | result={} 66 | excel=openpyxl.Workbook(write_only=True) 67 | sheet1=excel.create_sheet('1') 68 | for line in open('result.txt','r'): 69 | line=line.replace('\n','') 70 | lists=line.split('|') 71 | try: 72 | result[lists[1]]+=1 73 | except: 74 | result[lists[1]]=1 75 | sheet1.append(lists) 76 | sheet2=excel.create_sheet('2') 77 | for key in result: 78 | sheet2.append([key,result[key]]) 79 | excel.save('result.xlsx') 80 | 81 | write_to_excel() 82 | -------------------------------------------------------------------------------- /www.ppdai.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import xlwt3 3 | 4 | def excel(): 5 | file_d=open('data.txt','r') 6 | excel_f=xlwt3.Workbook() 7 | sheet=excel_f.add_sheet('sheet') 8 | count=0 9 | for line in file_d.readlines(): 10 | lists=line.replace('\n','').split('|') 11 | num=0 12 | for item in lists: 13 | try: 14 | text=item.split(':')[1] 15 | except: 16 | text=item 17 | sheet.write(count,num,text) 18 | num+=1 19 | count+=1 20 | excel_f.save('data.xls') 21 | 22 | excel() 23 | -------------------------------------------------------------------------------- /www.teld.cn/setting/cities.txt: -------------------------------------------------------------------------------- 1 | 广州市 2 | 上海市 3 | 杭州市 4 | 成都市 5 | 南京市 6 | -------------------------------------------------------------------------------- /www.tripadvisor.com/getpage.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import os 5 | import time 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, 
deflate'} 12 | 13 | def main(): 14 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS',headers=headers).text 15 | try: 16 | os.mkdir('page') 17 | except: 18 | pass 19 | count=0 20 | f=open('page'+str(count)+'.html','w') 21 | f.write(html) 22 | f.close() 23 | count+=1 24 | num=10 25 | while True: 26 | try: 27 | html=requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'%num,headers=headers).text 28 | except: 29 | continue 30 | f=open('page/'+str(count)+'.html','w') 31 | f.write(html) 32 | f.close() 33 | num+=10 34 | print(num) 35 | count+=1 36 | if(num==8490): 37 | break 38 | time.sleep(2) 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /www.tripadvisor.com/moredata.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate'} 12 | 13 | 14 | def getdata(target,viewid): 15 | html=requests.get('https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1'%(target,viewid),headers=headers).text 16 | table=BeautifulSoup(html,'lxml').find_all('div',attrs={'class':'innerBubble'}) 17 | result=[] 18 | for item in table: 19 | text=item.find('div',attrs={'class':'entry'}).get_text().replace('\r','').replace('\n','')+'||' 20 | try: 21 | text+=item.find('div',attrs={'class':'recommend'}).get_text().replace('\r','').replace('\n','') 22 | except: 23 | text+='--' 24 | result.append(text) 25 | return result 26 | 27 | def main(): 28 | f=open('result.txt','a') 29 | viewids=[] 30 | lines=[] 31 | count=0 32 | for line in open('data.txt','r'): 33 | line=line.replace('\n','') 34 | lines.append(line) 35 | viewid=line.split('||')[1].split('-')[-1].replace('SRC_','') 36 | viewids.append(viewid) 37 | if(len(viewids)<20): 38 | continue 39 | text='' 40 | for id in viewids: 41 | text+=id+',' 42 | result=getdata(viewids[0],text[:-1]) 43 | print(len(result)) 44 | for num in range(len(lines)): 45 | f.write(lines[num]+'||'+result[num]+'\n') 46 | viewids.clear() 47 | lines.clear() 48 | count+=1 49 | print(count,'--ok') 50 | text='' 51 | for id in viewids: 52 | text+=id+',' 53 | result=getdata(viewids[0],text[:-1]) 54 | for num in range(lines): 55 | f.write(lines[num]+'||'+result[num]+'\n') 56 | viewids.clear() 57 | lines.clear() 58 | f.close() 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /www.tripadvisor.com/userinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate'} 13 | 14 | 15 | class Infor(threading.Thread): 16 | def __init__(self,line): 17 | super(Infor,self).__init__() 18 | self.line=line 19 
| self.uid=self.line.split('||')[1].split('-')[0].replace('UID_','') 20 | 21 | def run(self): 22 | try: 23 | html=requests.get('https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId='%self.uid,headers=headers,timeout=50).text 24 | except: 25 | self.result='--' 26 | self.line+='||'+self.result 27 | return 28 | try: 29 | self.result=BeautifulSoup(html,'lxml').find('ul',attrs={'class':'memberdescription'}).find_all('li')[1].get_text().replace('\r','').replace('\n','') 30 | except: 31 | self.result='--' 32 | self.line+='||'+self.result 33 | 34 | 35 | def main(): 36 | f=open('re_data.txt','a') 37 | threadings=[] 38 | lines=[] 39 | count=0 40 | for line in open('result.txt','r'): 41 | line=line.replace('\n','') 42 | lines.append(line) 43 | if(len(lines)<20): 44 | continue 45 | for line in lines: 46 | work=Infor(line) 47 | threadings.append(work) 48 | for work in threadings: 49 | work.start() 50 | for work in threadings: 51 | work.join() 52 | for work in threadings: 53 | f.write(work.line+'\n') 54 | count+=1 55 | print(count,'--ok') 56 | threadings.clear() 57 | lines.clear() 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /www.variflight.com/icon/0/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/20.png -------------------------------------------------------------------------------- /www.variflight.com/icon/0/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/0/23.png -------------------------------------------------------------------------------- /www.variflight.com/icon/1/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/1.png -------------------------------------------------------------------------------- /www.variflight.com/icon/1/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/1/4.png -------------------------------------------------------------------------------- /www.variflight.com/icon/2/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/0.png -------------------------------------------------------------------------------- /www.variflight.com/icon/2/33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/2/33.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/117.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/117.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/304.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/304.png -------------------------------------------------------------------------------- /www.variflight.com/icon/24/783.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/24/783.png -------------------------------------------------------------------------------- /www.variflight.com/icon/3/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/43.png -------------------------------------------------------------------------------- /www.variflight.com/icon/3/64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/3/64.png -------------------------------------------------------------------------------- /www.variflight.com/icon/4/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/3.png -------------------------------------------------------------------------------- /www.variflight.com/icon/4/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/4/9.png -------------------------------------------------------------------------------- /www.variflight.com/icon/44/141.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/141.png -------------------------------------------------------------------------------- /www.variflight.com/icon/44/88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/44/88.png -------------------------------------------------------------------------------- /www.variflight.com/icon/5/71.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/71.png -------------------------------------------------------------------------------- /www.variflight.com/icon/5/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/5/8.png -------------------------------------------------------------------------------- /www.variflight.com/icon/6/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/19.png -------------------------------------------------------------------------------- 
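# (annotation, not part of the repository) The www.variflight.com/icon entries in this dump
# are binary PNG assets, listed only as raw.githubusercontent.com links. If one of them
# ever had to be re-fetched, the get_image() helper from www.kimiss.com/Nyspider.py earlier
# in this dump could be reused; below is a minimal self-contained sketch (the URL is copied
# from the icon/6/19.png entry above, the output filename is arbitrary, and the request
# headers from the original helper are omitted for brevity).

import requests

def get_image(image_url, image_name):
    # download the binary content and write it to disk
    content = requests.get(image_url, timeout=30).content
    with open(image_name, 'wb') as f:
        f.write(content)

get_image('https://raw.githubusercontent.com/luyishisi/Nyspider/'
          '8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/19.png',
          'icon_6_19.png')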
/www.variflight.com/icon/6/51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/51.png -------------------------------------------------------------------------------- /www.variflight.com/icon/6/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/6/6.png -------------------------------------------------------------------------------- /www.variflight.com/icon/7/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/16.png -------------------------------------------------------------------------------- /www.variflight.com/icon/7/26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/7/26.png -------------------------------------------------------------------------------- /www.variflight.com/icon/8/93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/93.png -------------------------------------------------------------------------------- /www.variflight.com/icon/8/98.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/8/98.png -------------------------------------------------------------------------------- /www.variflight.com/icon/9/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/21.png -------------------------------------------------------------------------------- /www.variflight.com/icon/9/31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/9/31.png -------------------------------------------------------------------------------- /www.variflight.com/icon/b/2202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2202.png -------------------------------------------------------------------------------- /www.variflight.com/icon/b/2248.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/b/2248.png -------------------------------------------------------------------------------- /www.variflight.com/icon/m/2397.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2397.png 
-------------------------------------------------------------------------------- /www.variflight.com/icon/m/2408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2408.png -------------------------------------------------------------------------------- /www.variflight.com/icon/m/2419.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/m/2419.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2245.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2245.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2413.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2413.png -------------------------------------------------------------------------------- /www.variflight.com/icon/s/2424.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.variflight.com/icon/s/2424.png -------------------------------------------------------------------------------- /www.yhd.com/data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/data.xls -------------------------------------------------------------------------------- /www.yhd.com/replace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luyishisi/Nyspider/8aa7431f76f2796af6858d0fa0b732516b35ae53/www.yhd.com/replace.py -------------------------------------------------------------------------------- /www.yhd.com/shopinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import xlwt3 6 | import re 7 | 8 | headers = { 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "en-US,en;q=0.5", 12 | "Connection": "keep-alive", 13 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 14 | 15 | def get_urls(url): 16 | try: 17 | html=requests.get(url,headers=headers,timeout=50).text 18 | except: 19 | return [] 20 | rel='(http://shop.yhd.com/m-\d+.html)' 21 | urls=re.findall(rel,html) 22 | urls=list(set(urls)) 23 | try: 24 | html=requests.get(url+'&isGetMoreProducts=1',headers=headers,timeout=50).text 25 | urls+=re.findall(rel,html) 26 | urls=list(set(urls)) 27 | except: 28 | print('--') 29 | return urls 30 | 31 | def get_infor(url): 32 | html=requests.get(url,headers=headers).text 33 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'shop-des'}).find_all('li') 34 | item={} 35 | item['url']=url 36 | try: 37 | item['name']=table[0].find('span').get_text() 38 | 
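# (annotation, not in the original file) item['name'] above and item['city']/item['tel']
# below are all extracted with the same "try to read the <span>, fall back to ''" pattern.
# A tiny helper could collapse the three try/except blocks (sketch only, helper name is an
# assumption):
#
#   def span_text(tag, default=''):
#       try:
#           return tag.find('span').get_text()
#       except Exception:
#           return default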
except: 39 | item['name']='' 40 | try: 41 | item['city']=table[1].find('span').get_text() 42 | except: 43 | item['city']='' 44 | try: 45 | item['tel']=table[2].find('span').get_text() 46 | except: 47 | item['tel']='' 48 | return item 49 | 50 | def main(): 51 | excel_f=xlwt3.Workbook() 52 | sheet=excel_f.add_sheet('sheet') 53 | count=0 54 | list_url=input("输入商铺链接:") 55 | list_url=list_url.replace('list.yhd.com/','list.yhd.com/searchPage/') 56 | page=1 57 | while True: 58 | urls=get_urls(re.sub('p\d','p'+str(page),list_url)) 59 | if(urls==[]): 60 | break 61 | for url in urls: 62 | try: 63 | item=get_infor(url) 64 | except: 65 | continue 66 | sheet.write(count,0,item['name']) 67 | sheet.write(count,1,item['city']) 68 | sheet.write(count,2,item['tel']) 69 | sheet.write(count,3,item['url']) 70 | count+=1 71 | print(count) 72 | excel_f.save('data.xls') 73 | page+=1 74 | 75 | main() 76 | -------------------------------------------------------------------------------- /www.zdic.net/write_to_excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | import os 3 | from bs4 import BeautifulSoup 4 | import re 5 | 6 | 7 | def load_result_1(): 8 | result=[] 9 | for line in open('result.txt','r'): 10 | item=eval(line) 11 | baseinfor=item['baseinfor'] 12 | for word in item['words']: 13 | line=word[:-1] 14 | des='' 15 | for p in word[-1]: 16 | des+=p+'\n' 17 | result.append(line+baseinfor+[des,item['url']]) 18 | return result 19 | 20 | def load_result_2(): 21 | result=[] 22 | for line in open('result.txt','r'): 23 | item=eval(line) 24 | baseinfor=item['baseinfor'] 25 | for word in item['words']: 26 | line=word[:-1] 27 | num=1 28 | for p in word[-1]: 29 | text=BeautifulSoup(p,'lxml').get_text() 30 | text=re.sub('(\d+. 
)|◎ ','',text) 31 | result.append(line+baseinfor+[num,text,item['url']]) 32 | num+=1 33 | return result 34 | 35 | def write_to_excel(result,filename): 36 | excel=openpyxl.Workbook(write_only=True) 37 | sheet=excel.create_sheet() 38 | for line in result: 39 | sheet.append(line) 40 | excel.save(filename) 41 | 42 | result=load_result_1() 43 | write_to_excel(result,'result_1.xlsx') 44 | -------------------------------------------------------------------------------- /www.zhongchou.com/Duplicate.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import os 4 | 5 | def Duplicate(): 6 | for filename in os.listdir('.'): 7 | if filename.endswith('txt'): 8 | lines=open(filename,'r').readlines() 9 | lines=list(set(lines)) 10 | lines.sort() 11 | f=open(filename,'w') 12 | for line in lines: 13 | f.write(line) 14 | f.close() 15 | 16 | Duplicate() 17 | -------------------------------------------------------------------------------- /www.zhongchou.com/excel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import xlwt3 4 | 5 | def write(): 6 | f=xlwt3.Workbook() 7 | sheet=f.add_sheet('sheet') 8 | file_f=open('D.txt','r') 9 | num=1 10 | head=['项目','id','进展数','评论数','最小金额','人数','video','类型','地区','支持人数','已筹款','比例','目标筹资','关注'] 11 | count=0 12 | for item in head: 13 | sheet.write(0,count,item) 14 | count+=1 15 | for line in file_f.readlines(): 16 | lists=line.replace('\n','').split('|') 17 | for count in range(14): 18 | sheet.write(num,count,lists[count]) 19 | num+=1 20 | f.save('data.xls') 21 | 22 | write() 23 | -------------------------------------------------------------------------------- /www.zhongchou.com/get_id.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | def get_id(): 7 | f=open('ids.txt','a') 8 | headers = { 9 | 'Host':"www.zhongchou.com", 10 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 12 | 'Accept-Language': 'en-US,en;q=0.5', 13 | 'Accept-Encoding': 'gzip, deflate', 14 | 'Connection': 'keep-alive'} 15 | for page in range(150): 16 | html=requests.get('http://www.zhongchou.com/browse/re-p'+str(page+1),headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 17 | table=BeautifulSoup(html,'lxml').find('div',attrs={'class':'sousuoListBox clearfix'}).find_all('div',attrs={'class':'ssCardItem'}) 18 | for item in table: 19 | text='' 20 | p=item.find('h3').find('a') 21 | text=p.get('title')+'|'+p.get('href').replace('http://www.zhongchou.com/deal-show/id-','')+'\n' 22 | print(text) 23 | f.write(text) 24 | print(page) 25 | f.close() 26 | 27 | get_id() 28 | -------------------------------------------------------------------------------- /www.zhongchou.com/other.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | 6 | def get_infor(text): 7 | headers = { 8 | 'Host':"www.zhongchou.com", 9 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 11 | 'Accept-Language': 'en-US,en;q=0.5', 12 | 'Accept-Encoding': 'gzip, deflate', 13 | 'Connection': 'keep-alive'} 14 | id=text.split('|')[1] 15 | try: 16 | 
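# (annotation, not in the original file) The .encode('ISO-8859-1').decode('utf-8','ignore')
# chain below undoes requests' charset guess: when the server omits a charset, requests
# falls back to ISO-8859-1 for .text, so re-encoding with that codec recovers the raw bytes,
# which are then decoded as the UTF-8 the page actually uses. An equivalent, arguably
# clearer form is to take the bytes directly:
#
#   resp = requests.get(url, headers=headers)
#   html = resp.content.decode('utf-8', 'ignore')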
html=requests.get('http://www.zhongchou.com/deal-show/id-'+id,headers=headers).text.encode('ISO-8859-1').decode('utf-8','ignore') 17 | except: 18 | return None 19 | table=BeautifulSoup(html,'html.parser').find('div',attrs={'class':'mainIn02Box'}) 20 | title=table.find('div',attrs={'class':'jlxqTitleText siteIlB_box'}).find_all('div') 21 | text+='|'+title[0].get_text().replace('\n','') 22 | text+='|'+title[1].get_text() 23 | right_table=table.find('div',attrs={'class':'xqDetailRight'}) 24 | su_table=right_table.find('div',attrs={'class':"xqDetailDataBox"}).find_all('div') 25 | text+='|'+su_table[0].find('p').get_text() 26 | text+='|'+su_table[1].find('p').get_text() 27 | su_table=right_table.find('div',attrs={'class':'xqRatioOuterBox'}) 28 | text+='|'+su_table.find('p').get_text()+'|'+su_table.find('b').get_text() 29 | su_table=right_table.find('div',attrs={'class':'xqDetailBtnBox'}).find('a',id='deal_detail_like') 30 | text+='|'+su_table.find('b').get_text() 31 | return text 32 | 33 | def main(): 34 | file_d=open('data.txt','r') 35 | data_f=open('other.txt','a') 36 | num=0 37 | for line in file_d.readlines(): 38 | try: 39 | text=get_infor(line.replace('\n','')) 40 | except: 41 | continue 42 | if text==None: 43 | continue 44 | data_f.write(text+'\n') 45 | num+=1 46 | print(num) 47 | data_f.close() 48 | 49 | main() 50 | -------------------------------------------------------------------------------- /wwwapps.ups.com/write2excel.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | def load_data(): 4 | keys=[line.replace('\n','').replace(' ','') for line in open('data','r')] 5 | data={} 6 | for line in open('result.txt','r'): 7 | line=line.replace('\n','').split('-') 8 | try: 9 | data[line[0]][line[1]]=int(line[-1]) 10 | except: 11 | data[line[0]]={} 12 | data[line[0]][line[1]]=int(line[-1]) 13 | try: 14 | data[line[1]][line[0]]=int(line[-1]) 15 | except: 16 | data[line[1]]={} 17 | data[line[1]][line[0]]=int(line[-1]) 18 | return keys,data 19 | 20 | def write_to_excel(): 21 | keys,data=load_data() 22 | excel=openpyxl.Workbook(write_only=True) 23 | sheet=excel.create_sheet() 24 | line=[''] 25 | for key in keys: 26 | if len(key)==4: 27 | key='0'+key 28 | line.append(key) 29 | sheet.append(line) 30 | for key in keys: 31 | if len(key)==4: 32 | key='0'+key 33 | line=[key] 34 | for another_key in keys: 35 | if len(another_key)==4: 36 | another_key='0'+another_key 37 | if key==another_key: 38 | line.append(1) 39 | else: 40 | try: 41 | line.append(data[key][another_key]) 42 | except: 43 | line.append('') 44 | sheet.append(line) 45 | sheet=excel.create_sheet() 46 | line=[''] 47 | for key in keys: 48 | if len(key)==4: 49 | key='0'+key 50 | line.append(key) 51 | sheet.append(line) 52 | for key in keys: 53 | if len(key)==4: 54 | key='0'+key 55 | line=[key] 56 | for another_key in keys: 57 | if len(another_key)==4: 58 | another_key='0'+another_key 59 | if key==another_key: 60 | line.append(1) 61 | else: 62 | try: 63 | value=data[key][another_key] 64 | if value!=1: 65 | value=0 66 | line.append(value) 67 | except: 68 | line.append('') 69 | sheet.append(line) 70 | excel.save('result.xlsx') 71 | 72 | write_to_excel() 73 | -------------------------------------------------------------------------------- /xxgk.jl.gov.cn/infor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import openpyxl 4 | import re 5 | 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 
(X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0', 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 | 'Accept-Language': 'en-US,en;q=0.5', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Connection': 'keep-alive'} 13 | 14 | def geturls(): 15 | f=open('urls.txt','a') 16 | page=1 17 | while True: 18 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/depListDir1.jsp?department_name=%CB%F9%D3%D0&pageNo='+str(page),headers=headers).text 19 | table=BeautifulSoup(html,'lxml').find_all('div',style='display:none;') 20 | for item in table: 21 | try: 22 | pid=item.get('id').replace('_text','') 23 | item=str(item).replace('','').replace('
','
') 24 | items=BeautifulSoup(item,'lxml').find_all('a') 25 | title=items[2].get_text() 26 | date=items[3].get_text() 27 | line=title+'|| '+date+' ||'+pid 28 | f.write(line.replace('\r','').replace('\n','')+'\n') 29 | except: 30 | continue 31 | print(page,'ok') 32 | page+=1 33 | if page==937: 34 | break 35 | f.close() 36 | 37 | def getinfor(pid): 38 | html=requests.get('http://xxgk.jl.gov.cn/zwdtSjgl/Directory/showDir.jsp?keyid='+pid,headers=headers,timeout=30).text 39 | tables=BeautifulSoup(html,'lxml').find_all('table',width=700) 40 | text=tables[0].get_text().replace('\r','').replace('\n','') 41 | try: 42 | location=re.findall('发布机构:(.*?)生成日期',text)[0] 43 | except: 44 | location='--' 45 | text=tables[1].get_text().replace('\r','').replace('\n','') 46 | return location+'||'+text 47 | 48 | def main(): 49 | f=open('result.txt','a') 50 | for line in open('urls.txt','r'): 51 | line=line.replace('\n','') 52 | try: 53 | result=getinfor(line.split('||')[-1].replace(' ','')) 54 | except: 55 | failed=open('failed','a') 56 | failed.write(line+'\n') 57 | failed.close() 58 | continue 59 | f.write(line+'||'+result+'\n') 60 | print(line) 61 | f.close() 62 | 63 | main() 64 | -------------------------------------------------------------------------------- /zhidao.baidu.com/question.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import threading 5 | 6 | headers = { 7 | 'Host':"zhidao.baidu.com", 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "en-US,en;q=0.5", 11 | "Connection": "keep-alive", 12 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 13 | 14 | class Ques(threading.Thread): 15 | def __init__(self,line): 16 | super(Ques,self).__init__() 17 | self.line=line 18 | self.url=line.split('||')[-1] 19 | self.word=line.split('||')[0] 20 | 21 | def run(self): 22 | self.status=True 23 | try: 24 | self.data=self.question() 25 | except: 26 | self.status=False 27 | 28 | def question(self): 29 | html=requests.get(self.url,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore') 30 | table=BeautifulSoup(html,'lxml').find('article',id='qb-content') 31 | header=table.find('div',id='wgt-ask') 32 | title=header.find('span',{'class':'ask-title'}).get_text() 33 | try: 34 | des=header.find('span',{'class':'con'}).get_text() 35 | except: 36 | des='-' 37 | try: 38 | answer=table.find('div',{'class':['bd','answer']}).find('pre').get_text() 39 | except: 40 | try: 41 | answer=table.find('div',{'id':'wgt-answers'}).find('span',{'class':'con'}).get_text() 42 | except: 43 | answer='-' 44 | return [title,des,answer] 45 | 46 | def main(): 47 | f=open('result.txt','a') 48 | lines=[] 49 | count=0 50 | for line in open('./urls.txt','r'): 51 | line=line.replace('\n','') 52 | lines.append(line) 53 | if len(lines)<10: 54 | continue 55 | threadings=[] 56 | for item in lines: 57 | work=Ques(item) 58 | threadings.append(work) 59 | for work in threadings: 60 | work.start() 61 | for work in threadings: 62 | work.join() 63 | for work in threadings: 64 | if work.status==False: 65 | failed=open('question_failed','a') 66 | failed.write(work.line+'\n') 67 | failed.close() 68 | continue 69 | count+=1 70 | print(count) 71 | f.write(str([work.word]+work.data)+'\n') 72 | lines.clear() 73 | threadings=[] 74 | for item in lines: 75 | work=Ques(item) 76 | threadings.append(work) 77 
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    for work in threadings:
        if work.status==False:
            failed=open('question_failed','a')
            failed.write(work.line+'\n')
            failed.close()
            continue
        f.write(str([work.word]+work.data)+'\n')
    f.close()

main()
--------------------------------------------------------------------------------
/zhidao.baidu.com/search.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import time
import threading

headers = {
    'Host':"zhidao.baidu.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

def search(key):
    # search zhidao.baidu.com for the keyword and return the first question URL found
    html=requests.get('https://zhidao.baidu.com/search?lm=0&rn=10&pn=0&fr=search&ie=utf-8&word='+key,headers=headers,timeout=30).text.encode('ISO-8859-1').decode('gbk','ignore')
    table=BeautifulSoup(html,'lxml').find('div',{'class':'list-wraper'}).find_all('dl')
    for dl in table:
        try:
            url=dl.find('a').get('href')
            if 'zhidao.baidu.com/question' in url:
                return url
        except:
            continue

# worker thread that runs one keyword search
class Search(threading.Thread):
    def __init__(self,key):
        super(Search,self).__init__()
        self.key=key

    def run(self):
        self.status=True
        try:
            self.url=search(self.key)
        except:
            self.status=False

def main():
    f=open('urls.txt','w')
    lines=[]
    count=0
    for line in open('./failed_words','r'):
        line=line.replace('\n','')
        lines.append(line)
        # search in batches of 5 threads
        if len(lines)<5:
            continue
        threadings=[]
        for item in lines:
            work=Search(item)
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        for work in threadings:
            if work.status==False:
                continue
            if work.url==None:
                continue
            count+=1
            print(count)
            try:
                f.write(work.key+"||"+work.url+'\n')
            except:
                continue
        lines.clear()
    # leftover batch (fewer than 5 keywords) after the loop
    threadings=[]
    for item in lines:
        work=Search(item)
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    for work in threadings:
        if work.status==False:
            continue
        if work.url==None:
            continue
        count+=1
        print(count)
        f.write(work.key+"||"+work.url+'\n')
    lines.clear()
    f.close()

main()
--------------------------------------------------------------------------------
/zhihu/get_followee.py:
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import json


headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1', 14 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"} 15 | 16 | def get_followe(ID,hashid): 17 | html=requests.get('https://www.zhihu.com/people/%s/followees'%ID,headers=headers).text 18 | xsrf=BeautifulSoup(html,'lxml').find('input',attrs={'name':'_xsrf'}).get('value') 19 | print(xsrf) 20 | count=0 21 | persons=[] 22 | while True: 23 | data={ 24 | 'method':"next", 25 | 'params':'{"offset":%s,"order_by":"created","hash_id":"%s"}'%(count,hashid), 26 | '_xsrf':xsrf 27 | } 28 | try: 29 | html=requests.post('https://www.zhihu.com/node/ProfileFolloweesListV2',headers=headers,data=data).text 30 | except: 31 | continue 32 | try: 33 | jsondata=json.loads(html)['msg'] 34 | except: 35 | return persons 36 | if(jsondata==[]): 37 | break 38 | for item in jsondata: 39 | name=BeautifulSoup(item,'lxml').find('a',attrs={'class':'zg-link'}).get('title') 40 | persons.append(name) 41 | count+=20 42 | return persons 43 | 44 | def main(): 45 | f=open('followee.txt','a',encoding='utf-8') 46 | statue=True 47 | for line in open('data.txt','r').readlines(): 48 | lists=line.split('||') 49 | name=lists[0] 50 | if(statue): 51 | if(name=='keso'): 52 | statue=False 53 | continue 54 | ID=lists[1] 55 | item={} 56 | item['name']=name 57 | item['id']=ID 58 | hashid=lists[3] 59 | item['followee']=get_followe(ID, hashid) 60 | f.write(str(item)+'\n') 61 | print(name) 62 | main() 63 | -------------------------------------------------------------------------------- /zhihu/top500.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import requests 4 | import json 5 | 6 | def get_top(page): 7 | html=requests.get('http://api.kanzhihu.com/topuser/follower/%s/50'%page).text 8 | data=json.loads(html)['topuser'] 9 | return data 10 | 11 | def main(): 12 | f=open('persons.txt','a',encoding='utf-8') 13 | page=1 14 | while True: 15 | data=get_top(page) 16 | for item in data: 17 | text=item['name']+'||'+item['id']+'||'+str(item['follower'])+'||'+item['hash'] 18 | f.write(text+'\n') 19 | print(page) 20 | page+=1 21 | if(page==20): 22 | break 23 | f.close() 24 | 25 | def followee(): 26 | f=open('data.txt','a',encoding='utf-8') 27 | for line in open('persons.txt','r').readlines(): 28 | line=line.replace('\n','') 29 | print(line) 30 | data=requests.get('http://api.kanzhihu.com/userdetail2/'+line.split('||')[-1]).text 31 | data=json.loads(data) 32 | line=line+'|| '+str(data['signature'])+'|| '+str(data['description'])+'|| ' 33 | detail=data['detail'] 34 | line=line+str(detail['ask'])+'|| '+str(detail['answer'])+'|| '+str(detail['post'])+'|| '+str(detail['agree'])+'|| '+str(detail['thanks'])+'|| '+str(detail['fav'])+'||'+str(detail['logs']) 35 | f.write(line.replace('\r','').replace('\n','')+'\n') 36 | 37 | followee() 38 | -------------------------------------------------------------------------------- /zhihu/zhihuinfor.py: 
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import re

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    'Cookie':'q_c1=52c451e7774943a2983e4b1341af47c4|1455451362000|1449924628000; _za=b08b756f-83e2-44b8-8719-9fd22ea0e8fc; __utma=51854390.837289251.1457412853.1457412853.1457412853.1; __utmz=51854390.1457412853.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gejinyuban/topics; cap_id="MWRmMmU0NjlhMmM1NDRhMWFlYzg1MmI3OTJmYjJmN2I=|1457411531|febb54ce12ed1f54a9d134f44ad639a8d21a406a"; _xsrf=3193e002ffde3f8236b8bf0425ba0a8c; udid="AFBAu5wSlAmPTqUZ3Pnq-vBRhHF-_se18_Q="; n_c=1; __utmc=51854390; __utmb=51854390.2.10.1457412853; z_c0="QUFCQVB4azVBQUFYQUFBQVlRSlZUZERpQlZjb1l3SmlXMlVuTTVXNmMyamsyaFh0TmNZZm9BPT0=|1457411536|03555bed95004f561fc044aa14585204ce700106"; unlock_ticket="QUFCQVB4azVBQUFYQUFBQVlRSlZUZGhjM2xaVGJYbi1uVzlzS1pGODllTFZGaXpzTFZZbFZBPT0=|1457411536|9d460fe15bde1d5e5e723349745d7084654ce709"; __utmt=1; __utmv=51854390.100-1|2=registration_date=20141006=1^3=entry_date=20141006=1',
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

def get_topics(ID):
    # topics the user follows, joined with commas
    try:
        html=requests.get('https://www.zhihu.com/people/%s/topics'%ID,headers=headers).text
        table=BeautifulSoup(html,'lxml').find('div',id='zh-profile-topic-list').find_all('strong')
        topics=''
        for item in table:
            topics+=item.get_text()+','
        return topics[:-1]
    except:
        # retry on any failure (note: retries forever)
        return get_topics(ID)

def get_profile(ID):
    # profile fields shown on the user's page, joined with commas
    try:
        html=requests.get('https://www.zhihu.com/people/%s'%ID,headers=headers).text
        # closing </span> assumed here; without a right-hand anchor the lazy group captures nothing
        rel='class="zg-gray-darker">(.*?)</span>'
        table=re.findall(rel,html)
        profile=''
        for item in table:
            profile+=item+','
        return profile[:-1]
    except:
        return get_profile(ID)

def main():
    f=open('person.txt','a',encoding='utf-8')
    statue=True
    for line in open('data.txt','r').readlines():
        line=line.replace('\n','')
        ID=line.split('||')[1]
        # resume support: skip every user up to and including 'kun-yu'
        if(statue):
            if(ID=='kun-yu'):
                statue=False
            continue
        topics=get_topics(ID)
        profile=get_profile(ID)
        f.write(line+'||'+topics+'||'+profile+'\n')
        print(line)
    f.close()

main()
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/markhistory.py:
--------------------------------------------------------------------------------
#coding:utf-8


# import modules
import requests
from bs4 import BeautifulSoup
import re
import sqlite3
import os

# get the provinces with admission records
def get_provinces():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the province drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
    provinces={}
    # map province name -> option value
    for option in table:
        provinces[option.get_text()]=option.get('value')
    return provinces

# get the admission majors, scores and other details
def parser(year,aid,province):
    # build the url, open the page, get the source
    url='http://zsb.suda.edu.cn/view_markhistory.aspx?aa=%s年%s各专业录取分数一览表&aid=%s&ay=%s'%(year,province,aid,year)
    print(url)
    html=requests.get(url).text
    # parse the page and extract the score table
    table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
    items=[]
    # iterate over every data row (the header row is skipped above) and collect its fields
    for tr in table:
        item=[year,province]
        for td in tr.find_all('td'):
            item.append(td.get_text().replace('\n',''))
        items.append(item)
    return items

def main():
    # start from a fresh database file
    try:
        os.remove('data.db')
    except:
        pass
    # connect to the database
    conn=sqlite3.connect('data.db')
    # create a cursor
    cursor=conn.cursor()
    # create the data table
    cursor.execute("create table if not exists markhistory(year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
    # years to scrape
    need_years=['2015','2014','2013']
    # get the provinces with admission records
    provinces=get_provinces()
    # get the data for every province and year
    for year in need_years:
        for key in provinces:
            # details for every major in a given year and province
            try:
                items=parser(year,provinces[key],key)
            except:
                continue
            for item in items:
                # insert the row
                cursor.execute('insert into markhistory(year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
            # commit the transaction
            conn.commit()
            # print a progress message
            print(year,key,'--ok')
    # close the cursor
    cursor.close()
    # close the database connection
    conn.close()

main()
--------------------------------------------------------------------------------
/zsb.suda.edu.cn/new_markhistory.py:
--------------------------------------------------------------------------------
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import sqlite3
import os
import re


# get the provinces with admission records
def get_provinces():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the province drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList2').find_all('option')
    provinces=[]
    # collect the province names
    for option in table:
        provinces.append(option.get_text())
    return provinces

def get_school():
    # open the page and get its source
    html=requests.get('http://zsb.suda.edu.cn/markHistory.aspx').text
    # parse the page and find the school drop-down; this mirrors the page source
    table=BeautifulSoup(html,'html.parser').find('select',id='ctl00_ContentPlaceHolder1_DropDownList3').find_all('option')
    school=[]
    # collect the school names
    for option in table:
        school.append(option.get_text())
    return school

# get the admission majors, scores and other details
def parser(year,province,school):
    # build the url, open the page, get the source
    url='http://zsb.suda.edu.cn/search.aspx?nf=%s&sf=%s&xy=%s'%(year,province,school)
    html=requests.get(url).text
    # parse the page and extract the score table
    table=BeautifulSoup(html,'html.parser').find('table',id='ctl00_ContentPlaceHolder1_GridView1').find_all('tr')[1:]
    items=[]
    # iterate over every data row and collect its fields
    for tr in table:
        item=[]
        for td in tr.find_all('td'):
            item.append(td.get_text().replace('\n',''))
        items.append(item)
    return items

def main():
    # start from a fresh database file
    try:
        os.remove('data.db')
    except:
        pass
    # connect to the database
    conn=sqlite3.connect('data.db')
    # create a cursor
    cursor=conn.cursor()
    # create the data table
    cursor.execute("create table if not exists markhistory(school varchar(80),year varchar(8),province varchar(80),professional varchar(80),length varchar(20),category varchar(20),numbers varchar(20),highest varchar(20),minimum varchar(20),average varchar(20))")
    # years to scrape
    need_years=['2015','2014','2013']
    # get the provinces with admission records
    provinces=get_provinces()
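    # schools/colleges, in the same order as the form's drop-down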
    schools=get_school()
    # get the data for every year, province and school
    for year in need_years:
        for province in provinces:
            for school in schools:
                # details for every major in a given year, province and school
                # option values in the school drop-down jump by 2 after the 19th entry, so adjust the index
                index=schools.index(school)+1
                if(index>19):
                    index+=2
                try:
                    items=parser(year,provinces.index(province)+1,index)
                except:
                    continue
                for item in items:
                    item.insert(2, school)
                    # insert the row
                    cursor.execute('insert into markhistory(school,year,province,professional,length,category,numbers,highest,minimum,average) values'+str(tuple(item)))
                # print a progress message
                print(school,year,province,'--ok')
                # commit the transaction
                conn.commit()
    # close the cursor
    cursor.close()
    # close the database connection
    conn.close()

main()
--------------------------------------------------------------------------------