└── 1024crawler.py /1024crawler.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import os 3 | import re 4 | import time 5 | import socket 6 | from bs4 import BeautifulSoup 7 | def crawerEach(url,urldir): 8 | resp=urllib.request.urlopen(url) 9 | html=resp.read().decode('gbk') 10 | soup = BeautifulSoup(html) 11 | items=soup.find('body').find('div',id='main').find(name='div',attrs={"class":"t","style":"margin:3px auto"} 12 | ).find('table',id='ajaxtable' 13 | ).find("tbody").findAll(name="tr",attrs={"class":"tr3 t_one"}) 14 | for item in items: 15 | target=item.find(name='td',attrs={"style":"text-align:left;padding-left:8px"}).find('h3').find('a') 16 | if target.u == None and target.b == None and target.font == None: 17 | urldir[target.text] = "http://wo.yao.cl/"+target.get('href') 18 | return urldir 19 | #爬下所有文章的标题的URL地址 20 | def crawer(): 21 | urldir={} 22 | for i in range(39): 23 | url="http://wo.yao.cl/thread0806.php?fid=20&page="+str(i+1) 24 | print("=====================正在爬取第"+str(i+1)+"页=========") 25 | urldir=crawerEach(url,urldir) 26 | f=open("all.xml",'w',encoding="utf-8") 27 | for key,url in urldir.items() : 28 | firstColumn="

" 29 | secondColumn=" "+""+url+"" 30 | thirdColumn="

" 31 | f.write(firstColumn+'\n'+secondColumn+'\n'+thirdColumn+'\n') 32 | f.close() 33 | 34 | 35 | #以文章的标题模糊搜索 36 | def search(): 37 | keyword = input("请输入关键字：") 38 | file=open("all.xml",'r',encoding='utf-8') 39 | content=file.read() 40 | soup=BeautifulSoup(content) 41 | items=soup.findAll(name="article",attrs={"title":re.compile(keyword)}) 42 | for item in items: 43 | print(item.get('title') + item.text) 44 | 45 | #获得文章内容 46 | def getContent(soup , author ,url, pageAccount): 47 | contents = soup.body.find(name='div',attrs={'id':'main'}).findAll(name='div',attrs={'class':'t t2'}) 48 | tid = url[-12:] 49 | print (tid) 50 | #获得首页的文章内容 51 | for item in contents: 52 | if(item.find('table').find(name='tr',attrs={'class':'tr3 tr1'}).find('font').b.text == author): 53 | content = item.table.find(name='tr',attrs={'class':'tr3 tr1'}).find(name='th',attrs={'class':'r_one'} 54 | ).table.tr.td.find(name='div',attrs={'class':'tpc_content'}).text 55 | writeContent(content) 56 | print(content) 57 | print("") 58 | pageInt = int(pageAccount) 59 | i = 2 60 | while i<=pageInt: 61 | pageUrl = "http://wo.yao.cl/read.php?tid=" + tid + "&page=" + str(i) 62 | print(pageUrl) 63 | getAuthorFloorContent(pageUrl,author) 64 | i=i+1 65 | print(pageUrl) 66 | 67 | #把内容写入文件 68 | def writeContent(content): 69 | f=open('content1.txt','a',encoding='utf-8') 70 | f.write(content) 71 | f.write('\n') 72 | f.write('\n') 73 | f.write('\n') 74 | f.write('\n') 75 | f.close() 76 | 77 | 78 | 79 | '''以下为获得内容所做的准备''' 80 | #获得第2页以后的页面的作者的楼层中的内容 81 | def getAuthorFloorContent(pageUrl,author): 82 | resp=urllib.request.urlopen(pageUrl) 83 | html=resp.read().decode('gbk') 84 | soup = BeautifulSoup(html) 85 | #获得所有楼层 86 | contents = soup.body.find(name='div',attrs={'id':'main'}).findAll(name='div',attrs={'class':'t t2'}) 87 | 88 | for item in contents: 89 | #在所有楼层中选出作者的楼层 90 | if(item.find('table').find(name='tr',attrs={'class':'tr1'}).find(name='th',attrs={'class':'r_two'}).b.text == author): 91 | content = item.table.find(name='tr',attrs={'class':'tr1'}).find(name='th',attrs={'class':'r_one'} 92 | ).find(name='div',attrs={'class':'tpc_content'}).text 93 | writeContent(content) 94 | print(content) 95 | print("") 96 | 97 | 98 | 99 | #获得帖子中共有多少页 100 | def getContentPage(soup): 101 | divItems = soup.body.find('div',id='main').findAll(name='div',attrs={'class':'t3'}) 102 | #获得页数的节点 103 | pageAccounts = divItems[2].table.tr.td.find(name='div',attrs={'class':'pages'}).findAll(name='a',attrs={'style':None}) 104 | pageAccount = pageAccounts[len(pageAccounts)-1].text 105 | print("页数为：" + pageAccount) 106 | return pageAccount 107 | 108 | 109 | 110 | #获得作者名字 111 | def getAuthor(soup): 112 | author = soup.body.find('div',id='main').find(name='div',attrs={'class':'t t2'} 113 | ).find('table').find(name='tr',attrs={'class':'tr3 tr1'}).find('font').b.text 114 | print("作者为：" + author) 115 | return author 116 | 117 | 118 | #获得文章 119 | def getArtilcle(url): 120 | resp=urllib.request.urlopen(url) 121 | html=resp.read().decode('gbk') 122 | soup = BeautifulSoup(html) 123 | 124 | #取得帖子的页数 125 | account = getContentPage(soup) 126 | #取得文章的作者 127 | author = getAuthor(soup) 128 | #取得内容，并将内容存入txt 129 | content = getContent(soup , author ,url ,account) 130 | 131 | 132 | #获得图片 133 | def getPicture(url): 134 | #url="http://wo.yao.cl/htm_data/8/1412/1313643.html" 135 | resp=urllib.request.urlopen(url) 136 | soup = BeautifulSoup(resp) 137 | contents = soup.body.find(name='div',attrs={'id':'main'}).findAll(name='div',attrs={'class':'t t2'}) 138 | #获得网页内容 139 | for item in contents: 140 | pictures = item.table.find(name='tr',attrs={'class':'tr3 tr1'}).find(name='th',attrs={'class':'r_one'} 141 | ).table.tr.td.find(name='div',attrs={'class':'tpc_content'}).findAll(name='input') 142 | i = 0 143 | for tag in pictures: 144 | print(tag['src']) 145 | conn = urllib.request.urlopen(tag['src']) 146 | f=open(str(i)+".jpg",'wb') 147 | i=i+1 148 | f.write(conn.read()) 149 | f.close() 150 | resp.close(); 151 | 152 | 153 | 154 | 155 | 156 | if __name__ == "__main__": 157 | print("1--更新") 158 | print("2--查询") 159 | print("3--取得文章") 160 | print("4--取得图片") 161 | choose=input("请输入结果：") 162 | if choose=="1": 163 | crawer() 164 | else : 165 | if choose=="3": 166 | url = input("请输入文章的网址:") 167 | getArtilcle(url) 168 | else: 169 | if choose=="4": 170 | url = input("请出入图片的网址:") 171 | getPicture(url) 172 | else: 173 | search() 174 | print("The End") 175 | 176 | 177 | --------------------------------------------------------------------------------