├── BITcourse ├── rank.py ├── stock2.0.py └── taobao2.0.py ├── BaiduStocks ├── BaiduStockInfo.txt ├── BaiduStocks │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── pipelines.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── stocks.cpython-36.pyc │ │ └── stocks.py └── scrapy.cfg ├── Bilibili ├── Bilibili.py ├── Login.py ├── README.md ├── top.png └── vedio.png ├── LPL ├── esports.py └── lpl.py ├── README.md ├── Typhoon ├── README.md ├── download1.png ├── download2.jpg ├── typhoon.py └── wztf.png ├── csdn ├── csdn.py └── csdn_login.py ├── douban ├── README.md ├── douban │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── items.cpython-37.pyc │ │ ├── middlewares.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── doubanmovie.cpython-37.pyc │ │ └── doubanmovie.py ├── getlist.py ├── scrapy.cfg └── top250.json ├── image └── vx_code.jpg ├── python123demo ├── demo.html ├── python123demo │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── demo.cpython-36.pyc │ │ └── demo.py └── scrapy.cfg ├── story ├── English_story.py ├── README.md ├── story.py ├── story1.png ├── story2.png ├── weather.py └── web1.png ├── unsplash ├── README.md ├── scrapy.cfg └── unsplash │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── items.cpython-37.pyc │ ├── pipelines.cpython-37.pyc │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── download_unspalsh.cpython-37.pyc │ └── download_unsplash.py ├── wyy ├── README.md ├── fans.png ├── scrapy.cfg └── wyy │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── items.cpython-37.pyc │ ├── pipelines.cpython-37.pyc │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── wwy_fans1.cpython-37.pyc │ ├── wyy_fans.cpython-37.pyc │ └── wyy_fans2.cpython-37.pyc │ ├── wyy_fans.py │ └── wyy_fans2.py ├── xueqiu ├── readme.md └── xueqiu.py ├── 今日头条 ├── README.md ├── download.png ├── jiepai.py └── web.png ├── 代理IP ├── README.md ├── download.png ├── getgoodip.py ├── ip.png └── proxy.py ├── 全国历史天气 ├── README.md ├── download.png ├── weather.py ├── weather │ ├── scrapy.cfg │ └── weather │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── getweather.cpython-37.pyc │ │ └── untitled3.cpython-37.pyc │ │ └── getweather.py └── web.png ├── 公交 └── hangzhou_bus_info.py ├── 大众点评 ├── README.md ├── a.woff ├── comment.png └── dzdp.py ├── 实习僧 ├── README.md ├── download.png ├── new_font.woff ├── new_font.xml └── shixiseng_crawl.py ├── 微信公众号 ├── 
README.md ├── article2pdf.py ├── download1.png ├── download2.png ├── web.png └── wechatarticle.py ├── 微博 └── weibophoto.py ├── 拉钩 ├── README.md ├── download.png ├── lagou.py └── web.png ├── 有道翻译 ├── README.md └── youdao.py ├── 梦幻西游 ├── CBG.py └── test.js ├── 汽车之家 ├── README.md ├── base.ttf ├── font_1.xml ├── luntan.py ├── new.ttf └── sourcecode.png ├── 牛客网 └── niuke.py ├── 猫眼 ├── MYcomment.py ├── README.md ├── fonts │ ├── 1d490f047e308d1cfa27df888ed679e82080.woff │ ├── 1fcc293a32feb0b86780097608f908972088.woff │ ├── 3722d3c43709bd8bf9d17cb06c9e84d62080.woff │ └── base.woff ├── link.jpg ├── maoyan.py └── maoyan_font.py ├── 百度文库 └── baiduwenku.py ├── 百度热点 └── baidu_hotspot.py ├── 知网 ├── README.md ├── cnki.png ├── code.jpg └── ocr.py ├── 研招网 ├── pku.py └── yz.py ├── 笔趣阁 └── Novel.py ├── 网易云阅读 ├── Book.png ├── BookList.png ├── README.md ├── getbook.py └── getbooklist.py ├── 网易云音乐 ├── README.md ├── download.png ├── getcomment.py └── web.png ├── 腾讯视频 ├── danmu_for_dear.py └── danmucrawl.py ├── 英雄联盟盒子 ├── README.md ├── lol.py └── url2pdf.py ├── 豆瓣 └── doubancomment.py ├── 链家 ├── README.md ├── ershoufang.png ├── lianjia │ ├── lianjia │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── lianjiacrawl.cpython-37.pyc │ │ │ └── lianjiacrawl2.cpython-37.pyc │ │ │ ├── lianjiacrawl.py │ │ │ └── lianjiacrawl2.py │ └── scrapy.cfg └── zufang.png ├── 阴阳师 ├── README.md ├── web.png └── yys_cbg.py └── 阿里文学 ├── README.md ├── book_dowanloader.py ├── booklist.txt ├── download1.png ├── download2.png ├── getbookdic.py └── test.js /BITcourse/rank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 10 10:04:18 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | import bs4 10 | from bs4 import BeautifulSoup 11 | 12 | def getHTMLText(url): 13 | try: 14 | r=requests.get(url,timeout=30) 15 | r.raise_for_status() 16 | r.encoding=r.apparent_encoding 17 | return r.text 18 | except: 19 | return "爬取失败" 20 | 21 | def fillUnivList(ulist,html): 22 | soup=BeautifulSoup(html,"html.parser") 23 | for tr in soup.find('tbody').children: 24 | #for tr in soup.find_all('tr',attr={'class':'alt'}): 25 | if isinstance(tr,bs4.element.Tag): 26 | tds=tr('td') 27 | ulist.append([tds[0].string,tds[1].string,tds[3].string]) 28 | 29 | def printUnivList(ulist,num): 30 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 31 | print(tplt.format("排名","学校名称","总分",chr(12288))) 32 | for i in range(num): #做了优化,是输出的学校名称居中对齐,chr(12288)是汉字里面的空格 33 | u=ulist[i] 34 | print(tplt.format(u[0],u[1],u[2],chr(12288))) 35 | 36 | def main(): 37 | ulist=[] 38 | url='http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html' 39 | html=getHTMLText(url) 40 | fillUnivList(ulist,html) 41 | printUnivList(ulist,30) 42 | 43 | #if __name__=='__main__': 44 | main() 45 | 46 | -------------------------------------------------------------------------------- /BITcourse/stock2.0.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 11 14:59:59 2019 4 | 5 | @author: Administrator 6 | 7 | 东方网 部分html: 8 |
The demo python introduces several python courses.
4 |Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: 5 | Basic Python and Advanced Python.
6 | -------------------------------------------------------------------------------- /python123demo/python123demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/python123demo/python123demo/__init__.py -------------------------------------------------------------------------------- /python123demo/python123demo/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/python123demo/python123demo/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /python123demo/python123demo/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/python123demo/python123demo/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /python123demo/python123demo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Python123DemoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /python123demo/python123demo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class Python123DemoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class Python123DemoDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /python123demo/python123demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class Python123DemoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /python123demo/python123demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for python123demo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'python123demo' 13 | 14 | SPIDER_MODULES = ['python123demo.spiders'] 15 | NEWSPIDER_MODULE = 'python123demo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'python123demo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'python123demo.middlewares.Python123DemoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'python123demo.middlewares.Python123DemoDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'python123demo.pipelines.Python123DemoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /python123demo/python123demo/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc -------------------------------------------------------------------------------- /python123demo/python123demo/spiders/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class DemoSpider(scrapy.Spider): 6 | name = 'demo' 7 | #allowed_domains = ['python123.io'] 8 | start_urls = ['http://python123.io/ws/demo.html'] 9 | 10 | def parse(self, response): 11 | fname=response.url.split('/')[-1] 12 | with open(fname,'wb') as f: 13 | f.write(response.body) 14 | self.log('Saved file %s.' % fname) 15 | -------------------------------------------------------------------------------- /python123demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = python123demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = python123demo 12 | -------------------------------------------------------------------------------- /story/English_story.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 4 19:02:32 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import smtplib 11 | from email.mime.text import MIMEText 12 | import datetime 13 | import time 14 | 15 | def getDays(): 16 | 17 | inlove_date=datetime.datetime(2017,8,31) 18 | today_date=datetime.datetime.today() 19 | inlove_days=(today_date-inlove_date).days 20 | return str(inlove_days) 21 | 22 | def getHTMLText(url,headers): 23 | try: 24 | r=requests.get(url,headers=headers,timeout=30) 25 | r.raise_for_status() 26 | r.encoding=r.apparent_encoding 27 | #print(r.text) 28 | return r.text 29 | 30 | except: 31 | return "爬取失败" 32 | 33 | def parsehtml(namelist,urllist,html): 34 | url='http://www.en8848.com.cn/' 35 | soup=BeautifulSoup(html,'html.parser') 36 | t=soup.find(attrs={'class':'ch_content'}) 37 | #print(t) 38 | i=t.find_all('a') 39 | #print(i) 40 | for link in i[1:59:2]: 41 | urllist.append(url+link.get('href')) 42 | namelist.append(link.get('title')) 43 | 44 | 45 | def parsehtml2(html): 46 | text=[] 47 | soup=BeautifulSoup(html,'html.parser') 48 | t=soup.find(attrs={'class':'jxa_content','id':'articlebody'}) 49 | for i in 
t.findAll('p'): 50 | text.append(i.text) 51 | #print(text) 52 | return "\n".join(text) 53 | 54 | def sendemail(url,headers,title): 55 | date_today=time.strftime("%Y-%m-%d", time.localtime()) 56 | msg_from='870407139@qq.com' #发送方邮箱 57 | passwd='' #填入发送方邮箱的授权码 58 | receivers=['870407139@qq.com'] #收件人邮箱 59 | 60 | subject="Today's story from Laofei " +str(date_today) #主题 61 | html=getHTMLText(url,headers) 62 | content='Dear Xiaofei:\n We have been in love for '+getDays()+' Days !\n\n⭐⭐⭐⭐⭐❤❤💗❤❤⭐⭐⭐⭐⭐'+parsehtml2(html) #正文 63 | msg = MIMEText(content) 64 | msg['Subject'] = subject 65 | msg['From'] = msg_from 66 | msg['To'] = ','.join(receivers) 67 | try: 68 | s=smtplib.SMTP_SSL("smtp.qq.com",465) #邮件服务器及端口号 69 | s.login(msg_from, passwd) 70 | s.sendmail(msg_from, msg['To'].split(','), msg.as_string()) 71 | print("发送成功") 72 | except: 73 | print("发送失败") 74 | finally: 75 | s.quit() 76 | 77 | def main(): 78 | 79 | 80 | headers = {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 81 | } 82 | 83 | urllist=[] 84 | namelist=[] 85 | for i in range(1,21): 86 | if i==1: 87 | url='http://www.en8848.com.cn/article/love/dating/index.html' 88 | else: 89 | url='http://www.en8848.com.cn/article/love/dating/index_'+str(i)+'.html' 90 | print ("正在爬取第%s页的英语短文链接:" % (i)) 91 | print (url+'\n') 92 | html=getHTMLText(url,headers) 93 | parsehtml(namelist,urllist,html) 94 | print("爬取链接完成") 95 | date=int(getDays())-611 96 | sendemail(urllist[date],headers,namelist[date]) 97 | 98 | 99 | if __name__=='__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /story/README.md: -------------------------------------------------------------------------------- 1 | # 睡前小故事 2 | ## [Link1](http://www.tom61.com/ertongwenxue/shuiqiangushi/) 3 | ## [Link2](http://www.en8848.com.cn/article/love/dating/index.html) 4 |  5 | ## Target 6 | * 爬取睡前小故事,并在每天晚上9点定时发送到邮箱 7 |  8 | ## Code 9 | * story.py 10 | ## Todo 11 | * 爬取的故事是随机选取一则发送,可能会出现重复发送的现象 12 | * 发送的小故事未带上响应的标题一起发送 13 | 14 | ## Tips 15 | * 之前的问题在我找到一个英文故事网站之后全部解决 16 | * 按照日期顺序每天发送一则,并且带上了相应的标题 17 | 18 |  19 | ## Code 20 | * English_story.py 21 | 22 | -------------------------------------------------------------------------------- /story/story.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 14 21:25:45 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import smtplib 11 | from email.mime.text import MIMEText 12 | import random 13 | 14 | 15 | def getHTMLText(url,headers): 16 | try: 17 | r=requests.get(url,headers=headers,timeout=30) 18 | r.raise_for_status() 19 | r.encoding=r.apparent_encoding 20 | #print(r.text) 21 | return r.text 22 | 23 | except: 24 | return "爬取失败" 25 | 26 | def parsehtml(namelist,urllist,html): 27 | url='http://www.tom61.com/' 28 | soup=BeautifulSoup(html,'html.parser') 29 | t=soup.find('dl',attrs={'class':'txt_box'}) 30 | #print(t) 31 | i=t.find_all('a') 32 | #print(i) 33 | for link in i: 34 | urllist.append(url+link.get('href')) 35 | namelist.append(link.get('title')) 36 | 37 | 38 | def parsehtml2(html): 39 | text=[] 40 | soup=BeautifulSoup(html,'html.parser') 41 | t=soup.find('div',class_='t_news_txt') 42 | for i in t.findAll('p'): 43 | text.append(i.text) 44 | #print(text) 45 | return "\n".join(text) 46 | 47 | def sendemail(url,headers): 48 | 49 | msg_from='' #发送方邮箱 50 | 
passwd='' #填入发送方邮箱的授权码 51 | receivers=[','] #收件人邮箱 52 | 53 | subject='今日份的睡前小故事' #主题 54 | html=getHTMLText(url,headers) 55 | content=parsehtml2(html) #正文 56 | msg = MIMEText(content) 57 | msg['Subject'] = subject 58 | msg['From'] = msg_from 59 | msg['To'] = ','.join(receivers) 60 | try: 61 | s=smtplib.SMTP_SSL("smtp.qq.com",465) #邮件服务器及端口号 62 | s.login(msg_from, passwd) 63 | s.sendmail(msg_from, msg['To'].split(','), msg.as_string()) 64 | print("发送成功") 65 | except: 66 | print("发送失败") 67 | finally: 68 | s.quit() 69 | 70 | def main(): 71 | 72 | 73 | headers = {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 74 | } 75 | 76 | urllist=[] 77 | namelist=[] 78 | for i in range(1,11): 79 | if i==1: 80 | url='http://www.tom61.com/ertongwenxue/shuiqiangushi/index.html' 81 | else: 82 | url='http://www.tom61.com/ertongwenxue/shuiqiangushi/index_'+str(i)+'.html' 83 | print ("正在爬取第%s页的故事链接:" % (i)) 84 | print (url+'\n') 85 | html=getHTMLText(url,headers) 86 | parsehtml(namelist,urllist,html) 87 | print("爬取链接完成") 88 | ''' 89 | for i in urllist: 90 | html=getHTMLText(i,headers) 91 | parsehtml2(html) 92 | ''' 93 | sendemail(random.choice(urllist),headers) 94 | if __name__=='__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /story/story1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/story/story1.png -------------------------------------------------------------------------------- /story/story2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/story/story2.png -------------------------------------------------------------------------------- /story/weather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 6 13:11:58 2019 4 | 5 | @author: Lee 6 | """ 7 | 8 | 9 | # import itchat 10 | import time 11 | import requests 12 | from lxml import etree 13 | import wxpy 14 | 15 | def getWeather(): 16 | # 使用BeautifulSoup获取天气信息 17 | r=requests.get('https://tianqi.sogou.com/?tid=101280601') 18 | tree=etree.HTML(r.text) 19 | today=tree.xpath('//div[@class="row2 row2-1"]/a/text()')[0] 20 | weather=tree.xpath('//p[@class="des"]/text()')[1] 21 | wind=tree.xpath('//p[@class="wind"]/text()')[1] 22 | quality=tree.xpath('//span[@class="liv-text"]/a/em/text()')[0] 23 | rank=tree.xpath('//span[@class="liv-img liv-img-cor1"]/text()')[0] 24 | high=tree.xpath('//div[@class="r-temp"]/@data-high')[0].split(',')[1] 25 | low=tree.xpath('//div[@class="r-temp"]/@data-low')[0].split(',')[1] 26 | content='早上好,亲爱的!\n今日份的天气请注意查看喔~\n今天是:'+today+'\n天气:'+weather+'\n风级:'+wind+'\n最高温度:'+high+'\n最低温度:'+low+'\n空气质量指数:'+quality+' 等级:'+rank 27 | print(content) 28 | return content 29 | 30 | 31 | def main(): 32 | 33 | message = getWeather() 34 | print('成功获取天气信息') 35 | 36 | # # 参数hotReload=True实现保持微信网页版登陆状态,下次发送无需再次扫码 37 | # itchat.auto_login() 38 | # users=itchat.search_friends('') 39 | # print(users) 40 | # userName=users[0]['UserName'] 41 | # ret=itchat.send(msg = message, toUserName = userName) 42 | # if ret: 43 | # print("成功发送") 44 | # else: 45 | # print("发送失败") 46 | # time.sleep(3) 47 | # itchat.logout() 48 | 49 | bot=wxpy.Bot() 50 | 
my_friend=bot.friends().search('Snall')[0] 51 | my_friend.send(message[0]) 52 | 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /story/web1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/story/web1.png -------------------------------------------------------------------------------- /unsplash/README.md: -------------------------------------------------------------------------------- 1 | # Unsplash海量图片 2 | ## 爬取方法一:Requests 3 | * 进入[图片网站](https://unsplash.com/),先按F12打开开发者工具,观察Network,滚动页面,向下翻页,可以发现下图photos?page=3&per_page=12 4 |  5 | * 观察其request URL,从其构造不难看出每页12张图片,当前是第三页,继续下滑网页,发现出现photos?page=4&per_page=12,观察得到参数仅有page不同,也验证了猜想,接下来继续观察这个链接,不难发现,图片的下载链接就藏在其中。 6 |  7 | * 这个网页对新人爬虫还是非常友好的嘛!立马动手展开代码书写,只要在循环之中改变page的值就可以爬取整个网页的所有图片! 8 |  9 | * 程序成功地运行!但是它的速度真是让人不敢恭维,一页12张图片都需要不少的时间代价,这10多万张图不得爬到猴年马月?于是我选择Scrapy框架来爬取图片。 10 | # 11 | 12 | # 爬取方法二:Scrapy 13 | * 首先,与昨天相同输入命令建立工程,若不记得可以参看[Scrapy实战](https://mp.weixin.qq.com/s?__biz=MzkyMTAwMjQ4NA==&tempkey=MTAxN19kSjFFYUtwSEVrSGZpODd1YlZQT0tDYVB0aEtSS0FzTDJ6V3duMDd0bnJjanhaM3NoSG41empwYmtFa1J0a2tjaU5UUnJiTndOVVgySnJ6dDFXaVdBWENHbG42ZW80MmZjS3Nua0tDZW5nTTRQdHh0MHJ4M0dPM3lfT0hQZllSc21POHJBUUYycVVIcno1V3VEakFJQWdjVnVIa0E5bG5sdi00NWp3fn4%3D&chksm=418b099e76fc808807861b5ad71c926cafca187b09530709f256f6790430fb65d490cbc2d382#rd)。然后来编写各组件的代码: 14 | ### spider 15 | * 这部分是爬虫的主要部分,start_urls设置了请求的网页链接,然后用到了json库将网页返回的内容变成json格式,提取出其中的图片下载链接。并且利用了**scrapy.Request**对unsplash网返回的内容进行二次解析,并将图片交给pipelines进行输出。 16 |  17 | ### pipelines 18 | * 这部分是进行图片的输出存储,利用了MD5生成摘要来给图片命名,这样可以完成去重存储。 19 |  20 | ### settings 21 | * 既然对pipelines函数进行了编写,需要在settings.py中取消其注释,并且加上随机的代理头,加上一定的时延,来增强其假装浏览器的能力,当然也不要忘了在items.py中设置fields。 22 |  23 | ## 爬取结果 24 | * 完成程序的编写之后,启动项目来看看成果,嗯,一大堆高清图片已收入囊中。 25 |  -------------------------------------------------------------------------------- /unsplash/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = unsplash.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = unsplash 12 | -------------------------------------------------------------------------------- /unsplash/unsplash/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/__init__.py -------------------------------------------------------------------------------- /unsplash/unsplash/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /unsplash/unsplash/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/__pycache__/items.cpython-37.pyc 
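Note on the Unsplash project above: its README describes a plain-`requests` version of the crawler ("method 1"), but only the Scrapy code is included in this dump. The sketch below illustrates that approach under the same assumptions as `download_unsplash.py` — the `napi/photos?page=N&per_page=12` endpoint returning a JSON list whose entries carry `links['download']` — and reuses the MD5-of-content naming from `UnsplashPipeline` for de-duplication. The save directory, page range and User-Agent are placeholders, not values from the repo.

```python
# Sketch of "method 1" from the README: page through the JSON API with requests
# and save each image under the MD5 of its bytes, so duplicate contents collapse
# to a single file (same idea as UnsplashPipeline).
import os
from hashlib import md5

import requests

API = 'https://unsplash.com/napi/photos?page={}&per_page=12'   # endpoint used in download_unsplash.py
HEADERS = {'User-Agent': 'Mozilla/5.0'}                        # minimal browser-like UA (placeholder)
SAVE_DIR = 'pic'                                               # placeholder (the pipeline writes to F://pic)


def crawl_page(page):
    resp = requests.get(API.format(page), headers=HEADERS, timeout=30)
    resp.raise_for_status()
    for photo in resp.json():                                  # the API returns a JSON list
        image = requests.get(photo['links']['download'],
                             headers=HEADERS, timeout=30).content
        path = os.path.join(SAVE_DIR, md5(image).hexdigest() + '.jpg')
        if not os.path.exists(path):                           # skip images already on disk
            with open(path, 'wb') as f:
                f.write(image)


if __name__ == '__main__':
    os.makedirs(SAVE_DIR, exist_ok=True)
    for page in range(1, 4):                                   # a few pages for demonstration
        crawl_page(page)
```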
-------------------------------------------------------------------------------- /unsplash/unsplash/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /unsplash/unsplash/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /unsplash/unsplash/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class UnsplashItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image=scrapy.Field() 15 | -------------------------------------------------------------------------------- /unsplash/unsplash/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class UnsplashSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class UnsplashDownloaderMiddleware(object): 60 | # Not all methods need to be defined. 
If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /unsplash/unsplash/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | from hashlib import md5 9 | 10 | class UnsplashPipeline(object): 11 | def process_item(self, item, spider): 12 | r=item['image'] 13 | name=md5(r).hexdigest() 14 | file_path="F://pic//{}.jpg".format(name) 15 | if os.path.exists(file_path): 16 | pass 17 | else: 18 | with open(file_path,"wb") as f: 19 | f.write(r) 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /unsplash/unsplash/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for unsplash project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'unsplash' 13 | 14 | SPIDER_MODULES = ['unsplash.spiders'] 15 | NEWSPIDER_MODULE = 'unsplash.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'unsplash (+http://www.yourdomain.com)' 20 | # ÉèÖÃUA 21 | import random 22 | USER_AGENT_LIST = [ 23 | 'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23', 24 | 'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)', 25 | 'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)', 26 | 'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)', 27 | 'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)', 28 | 'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)', 29 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0', 30 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0', 31 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0', 32 | 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)', 33 | 'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)' 34 | ] 35 | USER_AGENT = random.choice(USER_AGENT_LIST) 36 | # Obey robots.txt rules 37 | ROBOTSTXT_OBEY = True 38 | 39 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 40 | #CONCURRENT_REQUESTS = 32 41 | 42 | # Configure a delay for requests for the same website (default: 0) 43 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 44 | # See also autothrottle settings and docs 45 | # DOWNLOAD_DELAY = 0.5 46 | # The download delay setting will honor only one of: 47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 48 | #CONCURRENT_REQUESTS_PER_IP = 16 49 | 50 | # Disable cookies (enabled by default) 51 | #COOKIES_ENABLED = False 52 | 53 | # Disable Telnet Console (enabled by default) 54 | #TELNETCONSOLE_ENABLED = False 55 | 56 | # Override the default request headers: 57 | #DEFAULT_REQUEST_HEADERS = { 58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 59 | # 'Accept-Language': 'en', 60 | #} 61 | 62 | # Enable or disable spider middlewares 63 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 64 | #SPIDER_MIDDLEWARES = { 65 | # 'unsplash.middlewares.UnsplashSpiderMiddleware': 543, 66 | #} 67 | 68 | # Enable or disable downloader middlewares 69 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 70 | #DOWNLOADER_MIDDLEWARES = { 71 | # 'unsplash.middlewares.UnsplashDownloaderMiddleware': 543, 72 | #} 73 | 74 | # Enable or disable extensions 75 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 76 | #EXTENSIONS = { 77 | # 'scrapy.extensions.telnet.TelnetConsole': None, 78 | #} 79 | 80 | # Configure item pipelines 81 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 82 | ITEM_PIPELINES = { 83 | 'unsplash.pipelines.UnsplashPipeline': 300, 84 | } 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | #AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | #AUTOTHROTTLE_START_DELAY = 5 91 | # The maximum download delay to be set in case of high latencies 92 | #AUTOTHROTTLE_MAX_DELAY = 60 93 | # The average number of requests Scrapy should be sending in parallel to 
94 | # each remote server 95 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | -------------------------------------------------------------------------------- /unsplash/unsplash/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /unsplash/unsplash/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /unsplash/unsplash/spiders/__pycache__/download_unspalsh.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/unsplash/unsplash/spiders/__pycache__/download_unspalsh.cpython-37.pyc -------------------------------------------------------------------------------- /unsplash/unsplash/spiders/download_unsplash.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from unsplash.items import UnsplashItem 4 | import json 5 | 6 | 7 | class DownloadUnspalshSpider(scrapy.Spider): 8 | name = 'download_unsplash' 9 | allowed_domains = ['unsplash.com'] 10 | start_urls = ['http://unsplash.com/napi/photos?page={}&per_page=12'.format(n) for n in range(10224)] 11 | 12 | def parse(self,response): 13 | play_url = json.loads(response.text) 14 | for download_url in play_url: 15 | image_url=download_url['links']['download'] 16 | yield scrapy.Request(image_url,callback=self.parse_url) 17 | 18 | def parse_url(self,response): 19 | pic=UnsplashItem() 20 | image=response.body 21 | pic['image']=image 22 | yield pic 23 | 24 | 25 | -------------------------------------------------------------------------------- /wyy/README.md: -------------------------------------------------------------------------------- 1 | # 网易云音乐歌手粉丝 2 | 3 | ## Target 4 | * 解决之前评论接口限制数目的问题 5 | * 破解网易云js加密,主要是两个参数,一个是params,一个是encSecKey 6 | 7 |  8 | 9 | ## Tips 10 | 11 | * 具体详解请看[JS逆向之网易云音乐](https://mp.weixin.qq.com/s/prahlIq527XkirDE51jMjg) 12 | 13 | ## TODO 14 | * 该接口只能抓取50页,即1000个粉丝信息 15 | * 网易云音乐app可以显示所有粉丝信息 16 | 17 | -------------------------------------------------------------------------------- /wyy/fans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/fans.png -------------------------------------------------------------------------------- /wyy/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wyy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wyy 12 | -------------------------------------------------------------------------------- /wyy/wyy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/__init__.py -------------------------------------------------------------------------------- /wyy/wyy/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WyyItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | avatar = scrapy.Field() 15 | userId = scrapy.Field() 16 | # vipRights = scrapy.Field() 17 | vipType =scrapy.Field() 18 | gender = scrapy.Field() 19 | eventCount = scrapy.Field() 20 | fan_followeds = scrapy.Field() 21 | fan_follows = scrapy.Field() 22 | signature = scrapy.Field() 23 | time = scrapy.Field() 24 | nickname = scrapy.Field() 25 | playlistCount = scrapy.Field() 26 | total_record_count = scrapy.Field() 27 | week_record_count = scrapy.Field() -------------------------------------------------------------------------------- /wyy/wyy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WyySpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class WyyDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /wyy/wyy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from pymongo import MongoClient 8 | 9 | class WyyPipeline(object): 10 | 11 | def __init__(self) -> None: 12 | # 连接 13 | self.client = MongoClient(host='localhost', port=27017) 14 | # 如果设置有权限, 则需要先登录 15 | # db_auth = self.client.admin 16 | # db_auth.authenticate('root', 'root') 17 | # 需要保存到的collection 18 | self.col = self.client['wyy'] 19 | self.fans = self.col.fans2 20 | 21 | 22 | def process_item(self, item, spider): 23 | res = dict(item) 24 | self.fans.update_one({"userId":res['userId']}, {"$set": res}, upsert = True) 25 | return item 26 | 27 | def open_spider(self, spider): 28 | pass 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | -------------------------------------------------------------------------------- /wyy/wyy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wyy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wyy' 13 | 14 | SPIDER_MODULES = ['wyy.spiders'] 15 | NEWSPIDER_MODULE = 'wyy.spiders' 16 | 17 | # 设置UA 18 | import random 19 | USER_AGENT_LIST = [ 20 | 'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23', 21 | 'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)', 22 | 'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)', 23 | 'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)', 24 | 'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)', 25 | 'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)', 26 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0', 27 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0', 28 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0', 29 | 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)', 30 | 'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)' 31 | ] 32 | USER_AGENT = random.choice(USER_AGENT_LIST) 33 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 34 | #USER_AGENT = 'wyy (+http://www.yourdomain.com)' 35 | 36 | # Obey robots.txt rules 37 | ROBOTSTXT_OBEY = True 38 | 39 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 40 | #CONCURRENT_REQUESTS = 32 41 | 42 | # Configure a delay for requests for the same website (default: 0) 43 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 44 | # See also autothrottle settings and docs 45 | #DOWNLOAD_DELAY = 3 46 | # The download delay setting will 
honor only one of: 47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 48 | #CONCURRENT_REQUESTS_PER_IP = 16 49 | 50 | # Disable cookies (enabled by default) 51 | #COOKIES_ENABLED = False 52 | 53 | # Disable Telnet Console (enabled by default) 54 | #TELNETCONSOLE_ENABLED = False 55 | 56 | # Override the default request headers: 57 | #DEFAULT_REQUEST_HEADERS = { 58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 59 | # 'Accept-Language': 'en', 60 | #} 61 | 62 | # Enable or disable spider middlewares 63 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 64 | #SPIDER_MIDDLEWARES = { 65 | # 'wyy.middlewares.WyySpiderMiddleware': 543, 66 | #} 67 | 68 | # Enable or disable downloader middlewares 69 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 70 | #DOWNLOADER_MIDDLEWARES = { 71 | # 'wyy.middlewares.WyyDownloaderMiddleware': 543, 72 | #} 73 | 74 | # Enable or disable extensions 75 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 76 | #EXTENSIONS = { 77 | # 'scrapy.extensions.telnet.TelnetConsole': None, 78 | #} 79 | 80 | # Configure item pipelines 81 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 82 | ITEM_PIPELINES = { 83 | 'wyy.pipelines.WyyPipeline': 300, 84 | } 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | #AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | #AUTOTHROTTLE_START_DELAY = 5 91 | # The maximum download delay to be set in case of high latencies 92 | #AUTOTHROTTLE_MAX_DELAY = 60 93 | # The average number of requests Scrapy should be sending in parallel to 94 | # each remote server 95 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | -------------------------------------------------------------------------------- /wyy/wyy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
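Note on the wyy project: the fan spiders themselves (`wyy_fans.py`, `wyy_fans2.py`) appear in this dump only as `.pyc` entries, but `items.py` and `pipelines.py` show the data model — every fan is upserted into the `wyy` MongoDB database, collection `fans2`, keyed on `userId`, so re-running the 50-page crawl never creates duplicates. A minimal sketch of reading that data back, assuming the same local MongoDB instance the pipeline connects to:

```python
# Query the fans stored by WyyPipeline (database 'wyy', collection 'fans2').
# Host/port and field names are taken from pipelines.py and items.py.
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
fans = client['wyy'].fans2

print('fans stored:', fans.count_documents({}))

# distribution of the integer gender codes returned by the API
for row in fans.aggregate([{'$group': {'_id': '$gender', 'n': {'$sum': 1}}}]):
    print(row)

# the ten most-followed fans
for fan in (fans.find({}, {'nickname': 1, 'fan_followeds': 1, '_id': 0})
                .sort('fan_followeds', -1)
                .limit(10)):
    print(fan)

client.close()
```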
5 | -------------------------------------------------------------------------------- /wyy/wyy/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/spiders/__pycache__/wwy_fans1.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/spiders/__pycache__/wwy_fans1.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/spiders/__pycache__/wyy_fans.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/spiders/__pycache__/wyy_fans.cpython-37.pyc -------------------------------------------------------------------------------- /wyy/wyy/spiders/__pycache__/wyy_fans2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/wyy/wyy/spiders/__pycache__/wyy_fans2.cpython-37.pyc -------------------------------------------------------------------------------- /xueqiu/readme.md: -------------------------------------------------------------------------------- 1 | ### 雪球 实时股票信息统计 2 | 3 | 4 | [港股股票行情](https://xueqiu.com/hq#exchange=HK&firstName=2&secondName=2_0) 5 | 6 | tips: 雪球封的比较快,如果要实时抓的话建议上代理池。 -------------------------------------------------------------------------------- /xueqiu/xueqiu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import time 5 | 6 | _type_ = "US" #香港就填Hk 7 | 8 | headers={ 9 | "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36", 10 | "Cookie": "", #上雪球把自己的cookie复制下来 11 | } 12 | 13 | f = open("xueqiu.csv", "w") 14 | f.write("symbol, name, current, percent, market_capital, pe_ttm\n") 15 | 16 | 17 | def get_json(page): #带上cookie get请求 18 | return requests.get("https://xueqiu.com/service/v5/stock/screener/quote/list?page={}" 19 | "&size=30&order=desc&orderby=percent&order_by=percent&market={}" 20 | "&type={}&_={}".format(page, _type_, _type_, int(time.time()*1000)), 21 | headers=headers).json() 22 | 23 | 24 | def parse_json(data, _all): #解析数据 25 | _list = data["data"]["list"] 26 | for _each in _list: 27 | symbol = _each.get("symbol") #股票代码 28 | name = _each.get("name") #股票名称 29 | current = _each.get("current") #当前价格 30 | percent = _each.get("percent") #涨跌幅 31 | market_capital = _each.get("market_capital") #市值 32 | pe_ttm = _each.get("pe_ttm") #市盈率 33 | f.write(','.join(map(str, [symbol,name,current,percent,market_capital,pe_ttm]))) 34 | f.write("\n") 35 | _all += 1 36 | if _all == 100: 37 | break 38 | return _all 39 | 40 | 41 | def main(): 42 | _all = 0 43 | for i in range(1,5): 44 | data = get_json(i) 45 | _all = parse_json(data, _all) 46 | 47 | time.sleep(5) #sleep 5s, 48 | 49 | 50 | def test(): 51 | 
print(requests.get("https://xueqiu.com/service/v5/stock/screener/quote/list?page=3&size=30&order=desc&orderby=percent&order_by=percent&market=US&type=us&_=1555485032232", 52 | headers=headers).json()) 53 | 54 | if __name__ == "__main__": 55 | 56 | main() 57 | # test() 58 | -------------------------------------------------------------------------------- /今日头条/README.md: -------------------------------------------------------------------------------- 1 | # 今日头条街拍图片 2 | ## [Link](https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D) 3 |  4 | ## Target 5 | * 获取街拍图片,并下载到本地存储 6 |  7 | ## Tips 8 | * urlencode用来拼接url链接 9 | * 利用了多线程或者多进程来处理 10 | * 图片的名称使用其内容的MD5值,这样可以去除重复 11 | * 构造一个生成器,将图片链接和图片所属的标题一并返回 12 | ```python 13 | for image in images: 14 | yield { 15 | 'image': image.get('url'), 16 | 'title': title 17 | } 18 | ``` 19 | * 利用正则表达式将不符合命名规范的title去除 20 | 21 | -------------------------------------------------------------------------------- /今日头条/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/今日头条/download.png -------------------------------------------------------------------------------- /今日头条/jiepai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jun 30 17:07:29 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | from urllib.parse import urlencode 10 | import os 11 | from hashlib import md5 12 | from multiprocessing.pool import Pool 13 | import re 14 | import threading 15 | 16 | 17 | def get_page(offset): 18 | headers = { 19 | 'cookie': 'tt_webid=6667396596445660679; csrftoken=3a212e0c06e7821650315a4fecf47ac9; tt_webid=6667396596445660679; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16b846003e03d7-0dd00a2eb5ea11-353166-1fa400-16b846003e1566; CNZZDATA1259612802=2077267981-1561291030-https%253A%252F%252Fwww.baidu.com%252F%7C1561361230; __tasessionId=4vm71cznd1561363013083; sso_uid_tt=47d6f9788277e4e071f3825a3c36a294; toutiao_sso_user=e02fd616c83dff880adda691cd201aaa; login_flag=6859a0b8ffdb01687b00fe96bbeeba6e; sessionid=21f852358a845d783bdbe1236c9b385b; uid_tt=d40499ec45187c2d411cb7bf656330730d8c15a783bb6284da0f73104cd300a2; sid_tt=21f852358a845d783bdbe1236c9b385b; sid_guard="21f852358a845d783bdbe1236c9b385b|1561363028|15552000|Sat\054 21-Dec-2019 07:57:08 GMT"; s_v_web_id=6f40e192e0bdeb62ff50fca2bcdf2944', 20 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36', 21 | 'x-requested-with': 'XMLHttpRequest', 22 | 'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D', 23 | } 24 | params = { 25 | 'aid': '24', 26 | 'app_name': 'web_search', 27 | 'offset': offset, 28 | 'format': 'json', 29 | 'keyword': '街拍', 30 | 'autoload': 'true', 31 | 'count': '20', 32 | 'en_qc': '1', 33 | 'cur_tab': '1', 34 | 'from': 'search_tab', 35 | 'pd': 'synthesis', 36 | } 37 | base_url = 'https://www.toutiao.com/api/search/content/?' 
38 | url = base_url + urlencode(params) 39 | try: 40 | r = requests.get(url, headers=headers) 41 | if 200 == r.status_code: 42 | return r.json() 43 | except: 44 | return None 45 | 46 | 47 | def get_images(json): 48 | if json.get('data'): 49 | data = json.get('data') 50 | for item in data: 51 | if item.get('title') is None: 52 | continue 53 | title = re.sub('[\t\\\|]', '', item.get('title')) 54 | images = item.get('image_list') 55 | for image in images: 56 | origin_image = re.sub("list.*?pgc-image", "large/pgc-image", image.get('url')) 57 | yield { 58 | 'image': origin_image, 59 | 'title': title 60 | } 61 | 62 | 63 | def save_image(item): 64 | img_path = 'img' + os.path.sep + item.get('title') 65 | if not os.path.exists(img_path): 66 | os.makedirs(img_path) 67 | r = requests.get(item.get('image')) 68 | file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format( 69 | file_name=md5(r.content).hexdigest(), 70 | file_suffix='jpg') 71 | if not os.path.exists(file_path): 72 | with open(file_path, 'wb') as f: 73 | f.write(r.content) 74 | print("Downloaded image path is {}".format(file_path)) 75 | else: 76 | print("Already Downloaded") 77 | 78 | 79 | 80 | 81 | def main(offset): 82 | json = get_page(offset) 83 | for item in get_images(json): 84 | try: 85 | save_image(item) 86 | except: 87 | print("下载图片失败!") 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | #pool = Pool() # 进程池 93 | groups = ([x * 20 for x in range(10)]) 94 | ''' 95 | pool.map(main, groups) 96 | pool.close() 97 | pool.join() 98 | ''' 99 | tasks = [] # 线程池 100 | 101 | for group in groups: 102 | task = threading.Thread(target=main, args=(group,)) 103 | tasks.append(task) 104 | task.start() 105 | 106 | # 等待所有线程完成 107 | for _ in tasks: 108 | _.join() 109 | print("完成图片爬取并存储到本地!") 110 | -------------------------------------------------------------------------------- /今日头条/web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/今日头条/web.png -------------------------------------------------------------------------------- /代理IP/README.md: -------------------------------------------------------------------------------- 1 | # 高可用代理IP 2 | 3 | ## Source 4 |  5 | * [仓库链接](https://github.com/dxxzst/free-proxy-list) 6 | * 该仓库提供了免费的ip代理,可用性和实时性都可以接受 7 | 8 | 9 | ## Target 10 | * 爬取该repo上的代理IP列表,进行筛选并验证IP的可用性 11 | * getgoodip.py 12 |  13 | 14 | ## Tips 15 | * http://ip.tool.chinaz.com/ 是校验IP的网站 16 | * 如何使用: 17 | 18 | ```python 19 | 20 | conn=MongoClient('127.0.0.1', 27017) 21 | db=conn.proxy 22 | mongo_proxy=db.good_proxy 23 | proxy_data=mongo_proxy.find() 24 | proxies=json_normalize([ip for ip in proxy_data]) 25 | proxy_list=list(proxies['ip']) 26 | proxy=random.choice(proxy_list) 27 | r=requests.get(url,headers=headers,proxies={'https': 'https://{}'.format(proxy),'http':'http://{}'.format(proxy)}) 28 | 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /代理IP/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/代理IP/download.png -------------------------------------------------------------------------------- /代理IP/getgoodip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 30 12:27:42 2019 4 | 5 | @author: Lee 6 | """ 7 | 8 | import requests 9 | 
from bs4 import BeautifulSoup 10 | import threading 11 | from pymongo import MongoClient 12 | from lxml import etree 13 | 14 | 15 | def checkip(proxy): 16 | try: 17 | url='http://ip.tool.chinaz.com/' 18 | headers={'User-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'} 19 | r1=requests.get(url,headers=headers,proxies={'https': 'https://{}'.format(proxy),'http': 'http://{}'.format(proxy)},timeout=30) 20 | tree=etree.HTML(r1.text) 21 | ipaddress=tree.xpath('//dd[@class="fz24"]/text()') 22 | # print(ipaddress) 23 | 24 | if ipaddress[0]==proxy[:-5]: 25 | return True 26 | elif ipaddress[0]==proxy[:-6]: 27 | return True 28 | else: 29 | return False 30 | except: 31 | return False 32 | 33 | 34 | 35 | 36 | def getgoodproxy(ip,ip_type): 37 | 38 | if checkip(ip): 39 | print('{}可用,类型为{}'.format(ip,ip_type)) 40 | goodip.append(ip) 41 | handler.insert_one({'ip':ip}) 42 | 43 | 44 | if __name__ == '__main__': 45 | 46 | url='https://github.com/dxxzst/free-proxy-list' 47 | headers={ 'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"} 48 | r=requests.get(url,headers=headers) 49 | soup=BeautifulSoup(r.text,"html.parser") 50 | table=soup.find_all('table')[1] 51 | ulist1=[] 52 | ulist2=[] 53 | for tr in table.find_all('tr')[1:]: 54 | a=tr.text.split("\n") 55 | if a[4]=='high': 56 | # 'https': 'https://{}'.format(proxy) 57 | if a[3]=='http': 58 | ulist1.append("{}:{}".format(a[1],a[2])) 59 | else: 60 | ulist2.append("{}:{}".format(a[1],a[2])) 61 | 62 | goodip=[] 63 | client=MongoClient() 64 | db=client.proxy 65 | handler=db.good_proxy 66 | handler.delete_many({}) 67 | tasks=[] # 线程池 68 | 69 | for ip1 in ulist1: 70 | task=threading.Thread(target=getgoodproxy, args=(ip1,'http',)) 71 | tasks.append(task) 72 | task.start() 73 | 74 | for ip2 in ulist2: 75 | task=threading.Thread(target=getgoodproxy, args=(ip2,'https',)) 76 | tasks.append(task) 77 | task.start() 78 | 79 | 80 | # 等待所有线程完成 81 | for _ in tasks: 82 | _.join() 83 | print("完成代理ip验证并存储到本地!") 84 | -------------------------------------------------------------------------------- /代理IP/ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/代理IP/ip.png -------------------------------------------------------------------------------- /代理IP/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Feb 12 21:57:49 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import json 11 | import time 12 | from pymongo import MongoClient as Client 13 | 14 | 15 | def dict2proxy(dic): 16 | s=dic['类型']+'://'+dic['ip']+':'+str(dic['端口']) 17 | print(s) 18 | return {'http':s,'https':s} 19 | 20 | 21 | def getHTMLText(url,headers,code='utf-8'): 22 | try: 23 | r=requests.get(url,headers=headers,timeout=30) 24 | r.raise_for_status() 25 | r.encoding=code 26 | return r.text 27 | except: 28 | return "url异常" 29 | 30 | def getIP(html,ulist): 31 | 32 | soup=BeautifulSoup(html,'html.parser') 33 | items=soup.find_all('tr')[1:] 34 | #第一个不是ip 35 | 36 | for item in items: 37 | tds=item.find_all('td') 38 | ulist.append({'ip':tds[0].text,'端口':tds[1].text,'类型':tds[3].text,'位置':tds[4].text,'响应速度':tds[5].text,'最后验证时间':tds[6].text}) 39 | #print(ulist) 40 | return ulist 41 | 42 | 43 | 
44 | def saveAsJson(ulist): 45 | with open('proxy.json','w',encoding='utf-8') as f: 46 | json.dump(ulist,f,indent=7,ensure_ascii=False)#ensure_ascii参数使显示为中文 47 | 48 | def saveAsJson1(ulist): 49 | with open('goodproxy.json','w',encoding='utf-8') as f: 50 | json.dump(ulist,f,indent=7,ensure_ascii=False) 51 | 52 | 53 | ''' 54 | def write_to_mongo(ip): 55 | client=Client(host='localhost',port=27017) 56 | db=client['proxies_db'] 57 | coll=db['proxies'] 58 | for i in ip: 59 | if coll.find({'ip':i['ip']}).count==0: 60 | coll.insert_one(i) 61 | client.close() 62 | ''' 63 | 64 | def checkip(ip): 65 | try: 66 | proxies=dict2proxy(ip) 67 | url='http://www.ipip.net/' 68 | headers={'User-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'} 69 | r=requests.get(url,headers=headers,proxies=proxies,timeout=5) 70 | r.raise_for_status() 71 | except: 72 | return False 73 | else: 74 | return True 75 | 76 | def getgoodip(ip): 77 | goodip=[] 78 | for i in ip: 79 | if checkip(i): 80 | goodip.append(i) 81 | return goodip 82 | 83 | 84 | def main(): 85 | ulist=[] 86 | headers={'Accept':'image/webp,image/apng,image/*,*/*;q=0.8', 87 | 'Accept-encoding':'gzip, deflate, br', 88 | 'Accept-language':'zh-CN,zh;q=0.9', 89 | 'User-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE' 90 | } 91 | for num in range(1,11): 92 | url='https://www.kuaidaili.com/free/inha/%s' % num 93 | print("正在爬取第{}页".format(num)) 94 | html=getHTMLText(url,headers) 95 | # time.sleep(5) #增加间隔 96 | #print(html) 97 | iplist=getIP(html,ulist) 98 | saveAsJson(iplist) 99 | print("爬取完成!") 100 | print("开始检验ip") 101 | goodip=getgoodip(ulist) 102 | print("打印可以使用的ip:{}".format(goodip)) 103 | print("开始存储可以使用的ip") 104 | saveAsJson1(goodip) 105 | print("完成存储") 106 | 107 | 108 | 109 | if __name__ == '__main__': 110 | main() -------------------------------------------------------------------------------- /全国历史天气/README.md: -------------------------------------------------------------------------------- 1 | # 全国历史天气 2 | 3 | ## Target 4 | * 获取2012年-2018年全国各地历史天气信息 5 |  6 | 7 | * 包含各个地区每天的天气情况 8 |  -------------------------------------------------------------------------------- /全国历史天气/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/download.png -------------------------------------------------------------------------------- /全国历史天气/weather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Aug 31 20:08:11 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | import threading 10 | from pymongo import MongoClient 11 | from lxml import etree 12 | import time 13 | 14 | 15 | class weather(object): 16 | 17 | def __init__(self): 18 | 19 | self.origin_url='https://m.tianqi.com' 20 | self.url='https://m.tianqi.com/lishi/{}/{}{}.html' 21 | self.headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"} 22 | self.conn=MongoClient('127.0.0.1',27017) 23 | self.db=self.conn.weather 24 | 25 | def get_province(self): 26 | 27 | r=requests.get('https://m.tianqi.com/lishi.html',headers=self.headers) 28 | tree=etree.HTML(r.text) 29 | province_link=tree.xpath('//ul[@class="clear"]/li/a/@href')[:34] 30 | 
province=tree.xpath('//li/a/text()')[:34] 31 | return province_link,province 32 | 33 | def get_direct(self,directs,name): 34 | 35 | province_link,province=self.get_province() 36 | print(province) 37 | for pro in province_link: 38 | r=requests.get(self.origin_url+pro,headers=self.headers) 39 | tree=etree.HTML(r.text) 40 | direct_per_pro=tree.xpath('//ul[@class="clear"]/li/a/@href') 41 | name_per_pro=tree.xpath('//ul[@class="clear"]/li/a/text()') 42 | directs.append(direct_per_pro) 43 | name.append(name_per_pro) 44 | 45 | return province 46 | 47 | def get_weather(self,directs,name,province): 48 | 49 | # direct_name=[] 50 | # for i in range(len(province)): 51 | # for direct in directs[i]: 52 | # direct_name.append(direct[7:-11]) 53 | # print(direct_name) 54 | for i in range(len(province)): 55 | for j in range(len(directs[i])): 56 | 57 | for year in range(2012,2019): 58 | tasks=[] 59 | for month in range(1,13): 60 | task=threading.Thread(target=self.run, args=(i,j,year,month,directs[i][j][7:-11],province[i])) 61 | tasks.append(task) 62 | task.start() 63 | # 等待所有线程完成 64 | for _ in tasks: 65 | _.join() 66 | 67 | 68 | def run(self,i,j,year,month,direct_name,province_name): 69 | 70 | try: 71 | if month<10: 72 | r=requests.get(self.url.format(direct_name,year,'0'+str(month)),headers=self.headers) 73 | else: 74 | r=requests.get(self.url.format(direct_name,year,month),headers=self.headers) 75 | tree=etree.HTML(r.text) 76 | average_high_tem=tree.xpath('//h5[@class="red"]/text()')[0] 77 | max_high_tem=tree.xpath('//h5[@class="red"]/text()')[1] 78 | average_low_tem=tree.xpath('//tr/td[2]/h5/text()')[0] 79 | min_low_tem=tree.xpath('//tr/td[2]/h5/text()')[1] 80 | best_quality=tree.xpath('//td[@colspan="2"]/h5/text()')[0] 81 | worst_quality=tree.xpath('//td[@colspan="2"]/h5/text()')[1] 82 | date=tree.xpath('//dd[@class="date"]/text()') 83 | weather=tree.xpath('//dd[@class="txt1"]/text()') 84 | date=[d[:5] for d in date] 85 | dic=dict(zip(date,weather)) 86 | item1={ 87 | 'average_high_tem':average_high_tem, 88 | 'max_high_tem':max_high_tem, 89 | 'average_low_tem':average_low_tem, 90 | 'min_low_tem':min_low_tem, 91 | 'best_quality':best_quality, 92 | 'worst_quality':worst_quality, 93 | 'city':name[i][j], 94 | 'year':year, 95 | 'month':month, 96 | 'weather':dic, 97 | 'province':province_name 98 | } 99 | self.db['info'].insert_one(item1) 100 | except Exception as e: 101 | print(e) 102 | #time.sleep(2) 103 | #self.run(i,j,year,month,direct_name,province_name) 104 | 105 | 106 | if __name__=='__main__': 107 | 108 | All_weather=weather() 109 | directs=[] 110 | name=[] 111 | province=All_weather.get_direct(directs,name) 112 | print(directs) 113 | All_weather.get_weather(directs,name,province) 114 | 115 | -------------------------------------------------------------------------------- /全国历史天气/weather/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weather.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weather 12 | -------------------------------------------------------------------------------- /全国历史天气/weather/weather/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/__init__.py 
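A minimal read-back sketch (not part of the project) for the data that `weather.py` above writes to MongoDB. It assumes the crawl has already run against the local `weather` database and `info` collection used by the script; the field names come from its `item1` dict, and the city value is only an example:

```python
# -*- coding: utf-8 -*-
# Query the monthly weather summaries stored by weather.py (sketch, assumptions above).
from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)   # same host/port as the crawler
info = client['weather']['info']           # database/collection used by weather.py

# All 2018 records for one example city, ordered by month
for doc in info.find({'city': '杭州', 'year': 2018}).sort('month'):
    print(doc['month'], doc['average_high_tem'], doc['average_low_tem'], doc['best_quality'])

# Number of (city, year, month) documents saved per province
for row in info.aggregate([{'$group': {'_id': '$province', 'docs': {'$sum': 1}}}]):
    print(row['_id'], row['docs'])

client.close()
```

Because `run()` inserts one document per city per month, the second query is a quick way to check how complete the 2012-2018 crawl is.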
-------------------------------------------------------------------------------- /全国历史天气/weather/weather/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeatherItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /全国历史天气/weather/weather/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WeatherSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class WeatherDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /全国历史天气/weather/weather/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | from pymongo import MongoClient 10 | 11 | # 将爬取的内容保存到mongoDB中 12 | class WeatherPipeline(object): 13 | 14 | def __init__(self): 15 | # 连接 16 | self.client = MongoClient() 17 | 18 | self.col = self.client['weather'] 19 | self.info = self.col.info 20 | # 先清除之前保存的数据 21 | # self.ershoufang.delete_many({}) 22 | 23 | def process_item(self, item, spider): 24 | self.info.insert_one(item) 25 | return item 26 | 27 | def open_spider(self, spider): 28 | pass 29 | 30 | def close_spider(self, spider): 31 | self.client.close() -------------------------------------------------------------------------------- /全国历史天气/weather/weather/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for weather project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'weather' 13 | 14 | SPIDER_MODULES = ['weather.spiders'] 15 | NEWSPIDER_MODULE = 'weather.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'weather (+http://www.yourdomain.com)' 20 | 21 | import random 22 | USER_AGENT_LIST = [ 23 | 'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23', 24 | 'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)', 25 | 'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)', 26 | 'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)', 27 | 'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)', 28 | 'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)', 29 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0', 30 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0', 31 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0', 32 | 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)', 33 | 'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)' 34 | ] 35 | USER_AGENT = random.choice(USER_AGENT_LIST) 36 | 37 | # Obey robots.txt rules 38 | # ROBOTSTXT_OBEY = True 39 | 40 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 41 | #CONCURRENT_REQUESTS = 32 42 | 43 | # Configure a delay for requests for the same website (default: 0) 44 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 45 | # See also autothrottle settings and docs 46 | DOWNLOAD_DELAY = random.random() 47 | # The download delay setting will honor only one of: 48 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 49 | #CONCURRENT_REQUESTS_PER_IP = 16 50 | 51 | # Disable cookies (enabled by default) 52 | #COOKIES_ENABLED = False 53 | 54 | # Disable Telnet Console (enabled by default) 55 | #TELNETCONSOLE_ENABLED = False 56 | 57 | # Override the default request headers: 58 | #DEFAULT_REQUEST_HEADERS = { 59 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 60 | # 'Accept-Language': 'en', 61 | #} 62 | 63 | # Enable or disable spider middlewares 64 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 65 | #SPIDER_MIDDLEWARES = { 66 | # 'weather.middlewares.WeatherSpiderMiddleware': 543, 67 | #} 68 | 69 | # Enable or disable downloader middlewares 70 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 71 | #DOWNLOADER_MIDDLEWARES = { 72 | # 'weather.middlewares.WeatherDownloaderMiddleware': 543, 73 | #} 74 | 75 | # Enable or disable extensions 76 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 77 | #EXTENSIONS = { 78 | # 'scrapy.extensions.telnet.TelnetConsole': None, 79 | #} 80 | 81 | # Configure item pipelines 82 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 83 | ITEM_PIPELINES = { 84 | 'weather.pipelines.WeatherPipeline': 300, 85 | } 86 | 87 | # Enable and configure the AutoThrottle extension (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 89 | #AUTOTHROTTLE_ENABLED = True 90 | # The initial download delay 91 | #AUTOTHROTTLE_START_DELAY = 5 92 | # The maximum download delay to be set in case of high latencies 93 | #AUTOTHROTTLE_MAX_DELAY = 60 94 | # The average number of requests Scrapy should be sending in parallel to 95 
| # each remote server 96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 97 | # Enable showing throttling stats for every response received: 98 | #AUTOTHROTTLE_DEBUG = False 99 | 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | #HTTPCACHE_ENABLED = True 103 | #HTTPCACHE_EXPIRATION_SECS = 0 104 | #HTTPCACHE_DIR = 'httpcache' 105 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /全国历史天气/weather/weather/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /全国历史天气/weather/weather/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/spiders/__pycache__/getweather.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/spiders/__pycache__/getweather.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/spiders/__pycache__/untitled3.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/weather/weather/spiders/__pycache__/untitled3.cpython-37.pyc -------------------------------------------------------------------------------- /全国历史天气/weather/weather/spiders/getweather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class GetweatherSpider(scrapy.Spider): 6 | name = 'getweather' 7 | start_urls = ['https://lishi.tianqi.com/'] 8 | 9 | def parse(self, response): 10 | city_url=[] 11 | city_name=[] 12 | for alpha in [chr(i) for i in range(65,91)]: 13 | city_url.extend(response.xpath('//ul[@id="city_{}"]/li/a/@href'.format(alpha)).extract()[1:]) 14 | city_name.extend(response.xpath('//ul[@id="city_{}"]/li/a/text()'.format(alpha)).extract()[1:]) 15 | for j in range(len(city_url)): 16 | yield scrapy.Request(city_url[j],callback=self.parse_info1,meta={'city':city_name[j]}) 17 | 18 | def parse_info1(self,response): 19 | 20 | detail_href=response.xpath('//div[@class="tqtongji1"]/ul/li/a/@href').extract()[:-24] 21 | print(detail_href) 22 | for href in detail_href: 23 | yield scrapy.Request(href,callback=self.parse_info2,meta=response.meta) 24 | 25 | 26 | 27 | def parse_info2(self,response): 28 | 29 | date=response.xpath('//div[@class="tqtongji2"]/ul/li[1]/a/text()').extract() 30 | high_temp=response.xpath('//div[@class="tqtongji2"]/ul/li[2]/text()').extract()[1:] 31 | low_temp=response.xpath('//div[@class="tqtongji2"]/ul/li[3]/text()').extract()[1:] 32 | 
weather=response.xpath('//div[@class="tqtongji2"]/ul/li[4]/text()').extract()[1:] 33 | wind_direct=response.xpath('//div[@class="tqtongji2"]/ul/li[5]/text()').extract()[1:] 34 | wind_power=response.xpath('//div[@class="tqtongji2"]/ul/li[6]/text()').extract()[1:] 35 | 36 | for i in range(len(date)): 37 | yield { 38 | '城市': response.meta['city'], 39 | '日期': date[i], 40 | '最高气温': high_temp[i], 41 | '最低气温': low_temp[i], 42 | '天气情况': weather[i], 43 | '风向': wind_direct[i], 44 | '风力': wind_power[i], 45 | } -------------------------------------------------------------------------------- /全国历史天气/web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/全国历史天气/web.png -------------------------------------------------------------------------------- /公交/hangzhou_bus_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 22 13:24:46 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | import re 10 | from lxml import etree 11 | 12 | 13 | class Spyder_bus(object): 14 | 15 | def __init__(self): 16 | self.headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; \ 17 | x64) AppleWebKit/537.36 (KHTML, like Gecko) \ 18 | Chrome/70.0.3538.102 Safari/537.36'} 19 | self.items=[] 20 | self.url='https://hangzhou.8684.cn/' 21 | 22 | def parse_navigation(self): 23 | r = requests.get(self.url, headers=self.headers) 24 | # 解析内容,获取所有导航链接 25 | tree = etree.HTML(r.text) 26 | number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href') 27 | letter_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href') 28 | all_navigation=number_href_list + letter_href_list 29 | return all_navigation 30 | 31 | def parse_third_url(self,content): 32 | tree = etree.HTML(content) 33 | # 依次获取公交详细内容 34 | # 获取公交线路信息 35 | bus_number = tree.xpath('//div[@class="bus_i_t1"]/h1/text()')[0] 36 | bus_number = bus_number.replace(' ', '') 37 | # 获取运行时间 38 | run_time = tree.xpath('//p[@class="bus_i_t4"][1]/text()')[0] 39 | run_time = re.sub(r'(.*?:)', '', run_time) 40 | # 获取票价信息 41 | ticket_info = tree.xpath('//p[@class="bus_i_t4"][2]/text()')[0] 42 | ticket_info = re.sub(r'(.*?:)', '', ticket_info) 43 | # 该公交线路公司名称 44 | company_info = tree.xpath('//p[@class="bus_i_t4"]/a/text()')[0] 45 | # 获取更新时间 46 | update_time = tree.xpath('//p[@class="bus_i_t4"][4]/text()')[0] 47 | update_time = re.sub(r'(.*?:)', '', update_time) 48 | 49 | total_list = tree.xpath('//span[@class="bus_line_no"]/text()') 50 | # 获取上行总站数 51 | up_total = total_list[0] 52 | # 将里面空格去掉 53 | up_total = up_total.replace('\xa0', '') 54 | # 获取上行所有站名 55 | up_site_list = tree.xpath('//div[@class="bus_line_site "][1]/div/div/a/text()') 56 | 57 | #有些线路只有单线,内环外环线路 58 | try: 59 | # 获取下行总站数 60 | down_total = total_list[1] 61 | down_total = down_total.replace('\xa0','') 62 | # 获取下行所有站名 63 | down_site_list = tree.xpath('//div[@class="bus_line_site "][2]/div/div/a/text()') 64 | # 将每一条公交线路存放到字典中 65 | except Exception as e: 66 | down_total = '' 67 | down_site_list = [] 68 | 69 | item = { 70 | '线路名': bus_number, 71 | '运行时间': run_time, 72 | '票价信息': ticket_info, 73 | '公司名称': company_info, 74 | '更新时间': update_time, 75 | '上行站数': up_total, 76 | '上行站点': up_site_list, 77 | '下行站数': down_total, 78 | '下行站点': down_site_list 79 | } 80 | self.items.append(item) 81 | 82 | 83 | def parse_second_url(self,content): 84 | tree = etree.HTML(content) 85 | route_list = 
tree.xpath('//div[@id="con_site_1"]/a/@href') 86 | route_name = tree.xpath('//div[@id="con_site_1"]/a/text()') 87 | # 遍历上面的列表 88 | i = 0 89 | for route in route_list: 90 | print('开始爬取%s线路' % route_name[i]) 91 | route = self.url + route 92 | r = requests.get(url=route, headers=self.headers) 93 | print('结束爬取%s线路' % route_name[i]) 94 | # 解析内容,获取每一路公交的详细信息 95 | self.parse_third_url(r.text) 96 | i += 1 97 | 98 | 99 | def parse_first_url(self,navi_list): 100 | # 遍历列表,依次发送请求,解析内容,获取每个页面的所有公交线的url 101 | for url in navi_list: 102 | first_url =self.url + url 103 | print('开始爬取%s所有的公交信息' % first_url) 104 | r = requests.get(url=first_url, headers=self.headers) 105 | # 解析内容,获取每一路公交的详细的url 106 | self.parse_second_url(r.text) 107 | print('结束爬取%s所有的公交信息' % first_url) 108 | 109 | def save_to_txt(self): 110 | 111 | with open('hangzhou_bus_info.txt', 'w', encoding='utf-8') as f: 112 | for item in self.items: 113 | f.write(str(item)+'\n') 114 | 115 | 116 | 117 | if __name__ == '__main__': 118 | 119 | bus_info=Spyder_bus() 120 | navigation=bus_info.parse_navigation() 121 | #print(navigation) 122 | bus_info.parse_first_url(navigation) 123 | bus_info.save_to_txt() 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /大众点评/README.md: -------------------------------------------------------------------------------- 1 | # 大众点评 2 | 3 | ## Target 4 | * 破解字体反爬,获得餐厅的评论信息 5 |  6 | 7 | ## Tips 8 | 9 | * 具体详解请看[字体反爬之大众点评](https://mp.weixin.qq.com/s/q-lIhCcaCZR9L1m9r_Jmyw) 10 | 11 | -------------------------------------------------------------------------------- /大众点评/a.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/大众点评/a.woff -------------------------------------------------------------------------------- /大众点评/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/大众点评/comment.png -------------------------------------------------------------------------------- /实习僧/README.md: -------------------------------------------------------------------------------- 1 | # 实习僧 2 | 3 | ## Target 4 | * 破解字体反爬,获得相关招聘岗位信息并存入数据库 5 |  6 | 7 | ## Tips 8 | 9 | * 具体详解请看[字体反爬之实习僧](https://mp.weixin.qq.com/s/3tyPmarn_gcsn78cSKgnAQ) 10 | 11 | -------------------------------------------------------------------------------- /实习僧/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/实习僧/download.png -------------------------------------------------------------------------------- /实习僧/new_font.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/实习僧/new_font.woff -------------------------------------------------------------------------------- /微信公众号/README.md: -------------------------------------------------------------------------------- 1 | # 微信公众号文章 2 | ## [Link](https://mp.weixin.qq.com/s/Fqp9h27uwycbs_PJ3Tqggw) 3 |  4 | ## Target 5 | * 获取某微信公众号的全部内容,文章URL存入数据库,并以PDF的形式下载到本地存储 6 |  7 |  8 | ## Tips 9 | * 常见方法一:通过搜狗微信去获取,但是只能获取最新的十篇文章 10 | * 常见方法二:通过微信公众号的素材管理,获取公众号文章。但是需要有一个自己的公众号 11 | * 通过Fiddler抓包发现链接的一些必要参数,在访问链接的时候带上这些参数,参数从抓包工具中获取 12 | * 通过pdfkit这个模块导出pdf文件 13 | * 
上述模块需要安装[Wkhtmltopdf](https://wkhtmltopdf.org/downloads.html)才能使用 14 | 15 | ## TODO 16 | * 有比较大的局限性,每爬取一个公众号的所有文章,就需要通过抓包更新这些必要的参数 17 | 18 | -------------------------------------------------------------------------------- /微信公众号/article2pdf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jul 6 12:51:10 2019 4 | 5 | @author: Lee 6 | """ 7 | from pymongo import MongoClient 8 | from pandas.io.json import json_normalize 9 | import pdfkit 10 | import re 11 | 12 | 13 | 14 | # Mongo配置 15 | conn=MongoClient('127.0.0.1', 27017) 16 | db=conn.wx #连接wx数据库,没有则自动创建 17 | mongo_wx=db.article #使用article集合,没有则自动创建 18 | 19 | # 配置wkhtmltopdf 20 | config=pdfkit.configuration(wkhtmltopdf=r"F:\wkhtmltopdf\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe") 21 | wx_url_data=mongo_wx.find() 22 | data=json_normalize([comment for comment in wx_url_data]) 23 | url_list=list(data['content_url']) 24 | title_list=list(data['title']) 25 | # 修改title名,使之能够成为文件名 26 | for i in range(len(title_list)): 27 | if title_list[i]: 28 | title_list[i]=re.sub('[\t\\\|\?\*\:\<\>\"\/]', '', title_list[i]) 29 | count=0 30 | # url 转换成pdf存储 31 | for url in url_list: 32 | if url: 33 | pdfkit.from_url(url, '{}.pdf'.format(title_list[count]),configuration=config) 34 | count+=1 35 | print("已经将所有文章转换为PDF文件!") 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /微信公众号/download1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/微信公众号/download1.png -------------------------------------------------------------------------------- /微信公众号/download2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/微信公众号/download2.png -------------------------------------------------------------------------------- /微信公众号/web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/微信公众号/web.png -------------------------------------------------------------------------------- /微信公众号/wechatarticle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jul 5 21:51:21 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | import json 10 | import time 11 | from pymongo import MongoClient 12 | 13 | url='http://mp.weixin.qq.com/mp/profile_ext' 14 | 15 | # Mongo配置 16 | conn=MongoClient('127.0.0.1', 27017) 17 | db=conn.wx #连接wx数据库,没有则自动创建 18 | mongo_wx=db.article #使用article集合,没有则自动创建 19 | 20 | def get_wx_article(biz,uin,key,pass_ticket,appmsg_token,count=10): 21 | offset=1+(index+1)*11 22 | params={ 23 | '__biz':biz, 24 | 'uin':uin, 25 | 'key':key, 26 | 'offset':offset, 27 | 'count':count, 28 | 'action':'getmsg', 29 | 'f':'json', 30 | 'pass_ticket':pass_ticket, 31 | 'scene':124, 32 | 'is_ok':1, 33 | 'appmsg_token':appmsg_token, 34 | 'x5':0, 35 | } 36 | 37 | headers={ 38 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36' 39 | } 40 | 41 | r=requests.get(url=url, params=params, headers=headers) 42 | resp_json=r.json() 43 | #print(resp_json) 44 | if resp_json.get('errmsg') == 'ok': 45 | # 
是否还有分页数据,若没有更多数据则返回 46 | can_msg_continue=resp_json['can_msg_continue'] 47 | # 当前分页文章数 48 | msg_count=resp_json['msg_count'] 49 | print("当前分页共有{}篇文章".format(msg_count)) 50 | general_msg_list=json.loads(resp_json['general_msg_list']) 51 | infolist=general_msg_list.get('list') 52 | print(infolist, "\n**************") 53 | for info in infolist: 54 | app_msg_ext_info=info['app_msg_ext_info'] 55 | # 标题 56 | title=app_msg_ext_info['title'] 57 | # 文章链接 58 | content_url=app_msg_ext_info['content_url'] 59 | # 封面图 60 | cover=app_msg_ext_info['cover'] 61 | # 发布时间 62 | datetime=info['comm_msg_info']['datetime'] 63 | datetime=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(datetime)) 64 | 65 | mongo_wx.insert({ 66 | 'title': title, 67 | 'content_url': content_url, 68 | 'cover': cover, 69 | 'datetime': datetime 70 | }) 71 | if can_msg_continue==1: 72 | return True 73 | return False 74 | else: 75 | print('获取文章异常...') 76 | return False 77 | 78 | 79 | if __name__ == '__main__': 80 | # 参数通过抓包获得 81 | biz='' 82 | uin='' 83 | key='' 84 | pass_ticket='' 85 | appmsg_token='' 86 | index=-1 87 | while 1: 88 | print(f'开始抓取公众号第{index + 1} 页文章.') 89 | flag=get_wx_article(biz,uin,key,pass_ticket,appmsg_token,index=index) 90 | # 防止和谐,暂停8秒 91 | time.sleep(8) 92 | index+=1 93 | if not flag: 94 | print('该公众号文章已全部抓取并且存入本地数据库') 95 | break 96 | 97 | print('..........准备抓取公众号第 {} 页文章.'.format(index+2)) 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /拉钩/README.md: -------------------------------------------------------------------------------- 1 | # 拉钩职位信息 2 | ## [Link](https://www.lagou.com/) 3 |  4 | ## Target 5 | * 获取Python相关的职位信息,并下载到本地存储 6 |  7 | ## Tips 8 | * headers中需写入**Referer**,不然会报错,在同一次session中访问,提取cookies 9 | ```python 10 | 'status': False, 'msg': '您操作太频繁,请稍后再访问', 'clientIp': '117.136.41.41', 'state': 2402 11 | ``` 12 | * 在请求中去掉timeout,不然容易提取未加载的新页面 13 | ``` 14 | 页面加载中... 
15 | ``` 16 | * 调用csv模块,可以将数据写入csv文件 17 | ```python 18 | import csv 19 | with open('lagou_data.csv','w',encoding='gbk',newline='') as f: 20 | csv_write = csv.writer(f) 21 | title = ['id','职位','城市','学历','工作年限','薪资','第一标签','第二标签','第三标签','技能库','公司名称','融资阶段','公司规模'] 22 | csv_write.writerow(title) 23 | ``` 24 | 25 | ## TODO 26 | * 依然会出现 “页面加载中”的情况,难以获得所有的具体招聘要求和职位描述 27 | 28 | -------------------------------------------------------------------------------- /拉钩/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/拉钩/download.png -------------------------------------------------------------------------------- /拉钩/web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/拉钩/web.png -------------------------------------------------------------------------------- /有道翻译/README.md: -------------------------------------------------------------------------------- 1 | # 有道翻译 2 | ## [Link](http://fanyi.youdao.com/) 3 | ## Target 4 | * 破解有道翻译网页版的参数加密 5 | ## Tips 6 | * JS调试 7 | * 具体详解请看[JS逆向初探之有道翻译](https://mp.weixin.qq.com/s/a-ORkG5XGSAP_-6GNilBbQ) 8 | 9 | -------------------------------------------------------------------------------- /有道翻译/youdao.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 3 13:16:37 2019 4 | 5 | @author: Lee 6 | """ 7 | import requests 8 | import time 9 | import hashlib 10 | import random 11 | 12 | class youdao_crawl(): 13 | def __init__(self): 14 | self.headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 16 | 'Referer': 'http://fanyi.youdao.com/', 17 | 'Cookie': 'OUTFOX_SEARCH_USER_ID=850665018@10.169.0.83; OUTFOX_SEARCH_USER_ID_NCOO=71221285.04687975; _ntes_nnid=6f09e5c54e440a52f10b177100aa9d1d,1561431366198; JSESSIONID=aaavC3vS98F0m-IjbuAVw; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc32dNbypRD-5CwnJAVw; user-from=http://www.youdao.com/w/eng/%E8%8B%B9%E6%9E%9C/; from-page=http://www.youdao.com/w/eng/%E8%8B%B9%E6%9E%9C/; ___rl__test__cookies=1562740161910' 18 | } 19 | self.data = { 20 | 'i': None, 21 | 'from': 'AUTO', 22 | 'to': 'AUTO', 23 | 'smartresult': 'dict', 24 | 'client': 'fanyideskweb', 25 | 'salt': None, 26 | 'sign': None, 27 | 'ts': None, 28 | 'bv': None, 29 | 'doctype': 'json', 30 | 'version': '2.1', 31 | 'keyfrom': 'fanyi.web', 32 | 'action': 'FY_BY_REALTlME' 33 | } 34 | self.url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' 35 | 36 | def translate(self, word): 37 | ts = str(int(time.time()*10000)) 38 | salt = str(int(time.time()*10000) + random.random()*10 + 10) 39 | sign = 'fanyideskweb' + word + salt + '97_3(jkMYg@T[KZQmqjTK' 40 | sign = hashlib.md5(sign.encode('utf-8')).hexdigest() 41 | bv = '5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 42 | bv = hashlib.md5(bv.encode('utf-8')).hexdigest() 43 | self.data['i'] = word 44 | self.data['salt'] = salt 45 | self.data['sign'] = sign 46 | self.data['ts'] = ts 47 | self.data['bv'] = bv 48 | re = requests.post(self.url, headers=self.headers, data=self.data) 49 | return re.json()['translateResult'][0][0].get('tgt') 50 | 51 | 52 | if __name__ == '__main__': 53 | youdao = youdao_crawl() 54 
| while True: 55 | content = input("请输入您需要翻译的内容:") 56 | if content == "q": 57 | break 58 | trans = youdao.translate(content) 59 | print(trans) -------------------------------------------------------------------------------- /梦幻西游/CBG.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 27 11:20:53 2019 4 | 5 | @author: Administrator 6 | """ 7 | 8 | import requests 9 | import re 10 | import json 11 | import execjs 12 | from pymongo import MongoClient 13 | import time 14 | 15 | class CBG(object): 16 | 17 | def __init__(self): 18 | 19 | self.url='https://xyq.cbg.163.com/cgi-bin/equipquery.py?act=overall_rank&rank_type=31&page={}' 20 | self.headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'} 21 | self.conn=MongoClient('127.0.0.1', 27017) 22 | self.db=self.conn.MHXY 23 | 24 | def get_json(self,i): 25 | r=requests.get(self.url.format(i),headers=self.headers) 26 | js=re.findall(r"var data = (.*);",r.text)[0] 27 | js=json.loads(js) 28 | return js 29 | 30 | def decode(self,data): 31 | with open('test.js','r') as f: 32 | javascript=f.read() 33 | ctx=execjs.compile(javascript) 34 | real_content=ctx.call('get_g',data) 35 | return real_content 36 | 37 | def get_highlights(self,data): 38 | highlights=data.encode().decode("unicode_escape") 39 | return highlights 40 | 41 | def get_equip_info(self,i): 42 | js=self.get_json(i) 43 | equip_list=js["equip_list"] 44 | for equip in equip_list: 45 | gem_level=equip['gem_level'] 46 | large_equip_desc=self.decode(equip['large_equip_desc']) 47 | sum_dex=equip['sum_dex'] 48 | create_time=equip['create_time'] 49 | collect_num=equip['collect_num'] 50 | highlights=self.get_highlights(equip['highlights']) 51 | price=equip['price'] 52 | rank=equip['rank'] 53 | expire_time=equip['expire_time'] 54 | server_name=equip['server_name'] 55 | item={ 56 | 'gem_level':gem_level, 57 | 'large_equip_desc':large_equip_desc, 58 | 'sum_dex':sum_dex, 59 | 'create_time':create_time, 60 | 'collect_num':collect_num, 61 | 'highlights':highlights, 62 | 'price':price, 63 | 'rank':rank, 64 | 'expire_time':expire_time, 65 | 'server_name':server_name 66 | } 67 | self.db['cbg'].insert_one(item) 68 | 69 | 70 | 71 | 72 | if __name__=='__main__': 73 | 74 | mhxycbg=CBG() 75 | for i in range(1,11): 76 | mhxycbg.get_equip_info(i) 77 | time.sleep(1) 78 | print("第{}页装备信息已存入数据库!".format(i)) -------------------------------------------------------------------------------- /汽车之家/README.md: -------------------------------------------------------------------------------- 1 | # 汽车之家论坛 2 | 3 | ## Target 4 | * 破解字体反爬,获得字体解密后的页面源代码 5 |  6 | 7 | ## Tips 8 | * FontCreator观察ttf文件 9 | * 坐标近似视为同一个字 10 | * 具体详解请看[字体反爬之汽车之家](https://mp.weixin.qq.com/s/zIDHQ1iRSElfV5PBAokFJw) 11 | 12 | -------------------------------------------------------------------------------- /汽车之家/base.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/汽车之家/base.ttf -------------------------------------------------------------------------------- /汽车之家/luntan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 21 08:28:59 2019 4 | 5 | @author: Lee 6 | """ 7 | import requests 8 | import re 9 | from fontTools.ttLib import TTFont 10 | 11 | 12 | 13 | 14 | def 
get_new_ttf(url): 15 | """ 16 | 输入:网页链接 17 | 输出:新字体以及网页源代码 18 | """ 19 | r1=requests.get(url,headers=headers) 20 | ttf=re.findall(r",url\('(//.*\.ttf)'\)",r1.text)[0] 21 | r=requests.get('https:'+ttf) 22 | with open('new.ttf','wb') as f: 23 | f.write(r.content) 24 | font2=TTFont('new.ttf') 25 | # font2.saveXML('font_2.xml') 26 | return font2,r1.text 27 | 28 | 29 | def compare(c1,c2): 30 | """ 31 | 输入:某俩个对象字体的坐标列表 32 | 输出:bool类型,True则可视为是同一个字 33 | """ 34 | if len(c1)!=len(c2): 35 | return False 36 | else: 37 | for i in range(len(c1)): 38 | if abs(c1[i][0]-c2[i][0])<50 and abs(c1[i][1]-c2[i][1])<50: 39 | pass 40 | else: 41 | return False 42 | return True 43 | 44 | def decrypt_font(font1,font2,response): 45 | """ 46 | 输入:base字体,新字体以及网页源代码 47 | 输出:字体解密后的网页源代码 48 | """ 49 | word_list=['九','呢','着','地','得','的','五','六','低','右','一','二','远','更','了','好','三','多','小','长','是','坏','十','近','少','八','很','四','短','上','七','下','不','和','高','左','矮','大'] 50 | uniname_list1=['uniEC1F', 'uniEC21', 'uniEC39', 'uniEC3B', 'uniEC55', 'uniEC67', 'uniEC71', 'uniEC81', 'uniEC82', 'uniEC8B', 'uniEC9D', 'uniECAE', 'uniECB7', 'uniECB8', 'uniECD3', 'uniECE4', 'uniECED', 'uniECFE', 'uniED00', 'uniED09', 'uniED18', 'uniED1A', 'uniED34', 'uniED36', 'uniED46', 'uniED50', 'uniED61', 'uniED6A', 'uniED7C', 'uniED96', 'uniED97', 'uniEDB2', 'uniEDC3', 'uniEDCC', 'uniEDCD', 'uniEDDD', 'uniEDE8', 'uniEDF9'] 51 | uniname_list2=font2.getGlyphNames()[1:] 52 | base_dict=dict(zip(uniname_list1,word_list)) 53 | 54 | # 保存每个字符的坐标信息,分别存入coordinate_list1和coordinate_list2 55 | coordinate_list1=[] 56 | for uniname in uniname_list1: 57 | # 获取字体对象的横纵坐标信息 58 | coordinate=font1['glyf'][uniname].coordinates 59 | coordinate_list1.append(list(coordinate)) 60 | 61 | coordinate_list2=[] 62 | for i in uniname_list2: 63 | coordinate=font2['glyf'][i].coordinates 64 | coordinate_list2.append(list(coordinate)) 65 | 66 | index2=-1 67 | new_dict={} 68 | for name2 in coordinate_list2: 69 | index2+=1 70 | index1=-1 71 | for name1 in coordinate_list1: 72 | index1+=1 73 | if compare(name1,name2): 74 | new_dict[uniname_list2[index2]]=base_dict[uniname_list1[index1]] 75 | 76 | for uniname in uniname_list2: 77 | pattern=''+uniname[3:].lower()+';' 78 | response=re.sub(pattern,new_dict[uniname],response) 79 | return response 80 | 81 | 82 | 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | font1=TTFont('base.ttf') 88 | # font.saveXML('font_1.xml') 89 | headers={ 90 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", 91 | "Cookie": "fvlid=1563667948613fkOHKfG2zR; sessionip=114.213.210.86; sessionid=BDFE3D02-CE41-4D60-8D53-5277BD287ECF%7C%7C2019-07-21+08%3A12%3A27.462%7C%7Cwww.baidu.com; autoid=c3a5376fa8c6cf9ca92ff9ceb0176be2; sessionvid=CBD6AA01-530C-475E-AC39-EF6C90CE57DF; area=340111; ahpau=1; sessionuid=BDFE3D02-CE41-4D60-8D53-5277BD287ECF%7C%7C2019-07-21+08%3A12%3A27.462%7C%7Cwww.baidu.com; __ah_uuid_ng=c_BDFE3D02-CE41-4D60-8D53-5277BD287ECF; cookieCityId=110100; ahpvno=9; pvidchain=3311277,3454442,3311253,6826817,6826819; ref=www.baidu.com%7C0%7C0%7C0%7C2019-07-21+08%3A44%3A19.829%7C2019-07-21+08%3A12%3A27.462; ahrlid=1563669858351rrF6xJuLHZ-1563669896292", 92 | "Host": "club.autohome.com.cn"} 93 | url='https://club.autohome.com.cn/bbs/thread/e27f0f48dcb56de8/81875131-1.html' 94 | font2,response=get_new_ttf(url) 95 | after_decrypt_response=decrypt_font(font1,font2,response) 96 | print(after_decrypt_response) 97 | 98 | 99 | 100 | 101 | 102 | 103 | 
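A small inspection sketch (assumptions: fontTools is installed and `base.ttf` is the file in this folder) that lists the glyph names and outline coordinates `luntan.py` compares; this is essentially the information one reads off in FontCreator when building the hard-coded `uniname_list1` / `word_list` mapping:

```python
# -*- coding: utf-8 -*-
# Dump glyph names and the start of each outline from base.ttf (sketch, see note above).
from fontTools.ttLib import TTFont

font = TTFont('base.ttf')
for name in font.getGlyphNames()[1:]:   # skip the first glyph, as luntan.py does
    coords = list(font['glyf'][name].coordinates)
    # name looks like 'uniED34'; coords is the point list that compare() matches
    print(name, len(coords), coords[:3])
```

Running the same loop over `new.ttf` yields the second coordinate list; `compare()` treats two glyphs as the same character when every corresponding point differs by less than 50 units on both axes, which is why near-identical outlines with slightly shifted coordinates still decrypt correctly.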
-------------------------------------------------------------------------------- /汽车之家/new.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/汽车之家/new.ttf -------------------------------------------------------------------------------- /汽车之家/sourcecode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zevs6/Reptile/4bb3847bd4a722edd081ce2c4d24828d1bd251ca/汽车之家/sourcecode.png -------------------------------------------------------------------------------- /牛客网/niuke.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Sep 11 13:35:56 2019 4 | 5 | @author: Lee 6 | """ 7 | 8 | import requests 9 | from lxml import etree 10 | import re 11 | from bs4 import BeautifulSoup 12 | import random 13 | import time 14 | import os 15 | import threading 16 | from retry import retry 17 | 18 | 19 | 20 | class niuke(object): 21 | 22 | def __init__(self): 23 | 24 | self.url='https://www.nowcoder.com/discuss/tag/{}?type=2&page={}' 25 | self.headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"} 26 | self.prefix='https://www.nowcoder.com' 27 | self.origin_url='https://www.nowcoder.com/discuss/tags?type=2' 28 | 29 | def get_enterprise(self): 30 | 31 | r=requests.get(self.origin_url,headers=self.headers) 32 | tree=etree.HTML(r.text) 33 | enterprise=tree.xpath('//div[@data-nav="企业"]/ul[@class="discuss-tags-mod"]/li/a/@data-href') 34 | enterprise_name=tree.xpath('//div[@data-nav="企业"]/ul[@class="discuss-tags-mod"]/li/a/span[@class="discuss-tag-item"]/text()') 35 | num=tree.xpath('//div[@data-nav="企业"]/ul[@class="discuss-tags-mod"]/li/span[@class="discuss-tag-num"]/text()') 36 | enterprise=[i[13:-7] for i in enterprise] 37 | num=[int(i[:-1]) for i in num] 38 | 39 | return enterprise,enterprise_name,num 40 | 41 | 42 | def get_href(self,enterprise,page): 43 | 44 | titles_new=[] 45 | r=requests.get(self.url.format(enterprise,page),headers=self.headers) 46 | tree=etree.HTML(r.text) 47 | hrefs=tree.xpath('//div[@class="discuss-main clearfix"]/a[1]/@href') 48 | titles=tree.xpath('//div[@class="discuss-main clearfix"]/a[1]/text()') 49 | hrefs=[self.prefix+href for href in hrefs] 50 | for title in titles: 51 | if title!='\n': 52 | titles_new.append(title.replace("\n","").replace("[","").replace("]","").replace("/","").replace("|"," ").replace("*","").replace("?","").replace("\\",",").replace(":",",").replace("<","").replace(">","")) 53 | 54 | # print(hrefs) 55 | # print(titles_new) 56 | return hrefs,titles_new 57 | 58 | def get_article(self,enterprise_name,hrefs,titles): 59 | 60 | for i in range(len(hrefs)): 61 | if os.path.exists('{}/{}.txt'.format(enterprise_name,titles[i])): 62 | pass 63 | else: 64 | r=requests.get(hrefs[i],headers=self.headers) 65 | # soup=BeautifulSoup(r.text,'html.parser') 66 | # text=soup.find(attrs={"class":"post-topic-des nc-post-content"}) 67 | # text=str(text).replace("