├── 00爬免费代理.py
├── 02TaogubaCrawler.py
├── 03阿布云版.py
├── MySqlConnect.py
└── README.md

--------------------------------------------------------------------------------
/00爬免费代理.py:
--------------------------------------------------------------------------------
# Reference blog: https://www.cnblogs.com/TurboWay/p/8172246.html
# http://www.xicidaili.com/nt/
# 1. Why build a pool of proxy IPs for a crawler
#
# A common anti-crawling measure is to rate-limit by IP: once an IP's request count
# passes a threshold within some time window, that IP is blacklisted and blocked for a while.
#
# You can respond either by slowing the crawler down or by switching IPs. The latter
# needs a pool of working proxy IPs that the crawler can rotate through.
#
# 2. How to build a proxy IP pool
#
# Approach: 1. Find a free proxy site (e.g. XiciDaili)
#
#           2. Scrape the IPs (plain requests + BeautifulSoup)
#
#           3. Check each IP (request a chosen URL through it and see whether the status code is 200)
#
#           4. Record the working IPs (write them to a text file)


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests, threading, datetime
from bs4 import BeautifulSoup
import random

"""
1. Scrape proxy IPs from the XiciDaili site
2. Check each scraped IP against the given target url
3. Save the working ones to the given path
"""


# ------------------------------------------------------ file helpers --------------------------------------------------
# Append one line of text to the file
def write(path, text):
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(text)
        f.write('\n')


# Empty the file
def truncatefile(path):
    with open(path, 'w', encoding='utf-8') as f:
        f.truncate()


# Read the file into a list of stripped lines
def read(path):
    with open(path, 'r', encoding='utf-8') as f:
        txt = []
        for s in f.readlines():
            txt.append(s.strip())
    return txt


# ----------------------------------------------------------------------------------------------------------------------
# Time difference between start and end, formatted as HH:MM:SS
def gettimediff(start, end):
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    diff = ("%02d:%02d:%02d" % (h, m, s))
    return diff


# ----------------------------------------------------------------------------------------------------------------------
# Return a random request header
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers
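
# Illustrative sketch (not part of the original script): besides checking for a 200
# status, a proxy can be checked for anonymity by asking an echo service such as
# http://httpbin.org/ip which origin IP it sees. The echo endpoint and the behaviour
# of free proxies here are assumptions, not something this repository guarantees.
# It reuses getheaders() defined above.
def check_anonymity(ip):
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    try:
        origin = requests.get("http://httpbin.org/ip", proxies=proxies,
                              headers=getheaders(), timeout=5).json().get("origin", "")
        # If the echoed origin matches the proxy address, the proxy is not leaking
        # the real client IP in an obvious way.
        return origin.split(",")[0].strip() == ip.split(":")[0]
    except Exception:
        return False
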
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 85 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 86 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 87 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 88 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 89 | ] 90 | UserAgent = random.choice(user_agent_list) 91 | headers = {'User-Agent': UserAgent} 92 | return headers 93 | 94 | 95 | # -----------------------------------------------------检查ip是否可用---------------------------------------------------- 96 | def checkip(targeturl, ip): 97 | headers = getheaders() # 定制请求头 98 | proxies = {"http": "http://" + ip, "https": "http://" + ip} # 代理ip 99 | try: 100 | response = requests.get(url=targeturl, proxies=proxies, headers=headers, timeout=5).status_code 101 | if response == 200: 102 | return True 103 | else: 104 | return False 105 | except: 106 | return False 107 | 108 | 109 | # -------------------------------------------------------获取代理方法---------------------------------------------------- 110 | # 免费代理 XiciDaili 111 | def findip(type, pagenum, targeturl, path): # ip类型,页码,目标url,存放ip的路径 112 | list = {'1': 'http://www.xicidaili.com/wn/', # xicidaili国内https代理 113 | '2': 'http://www.xicidaili.com/nn/', # xicidaili国内高匿代理 114 | '3': 'http://www.xicidaili.com/nt/', # xicidaili国内普通代理 115 | '4': 'http://www.xicidaili.com/wt/'} # xicidaili国外http代理 116 | url = list[str(type)] + str(pagenum) # 配置url 117 | # print("url:",url) 118 | headers = getheaders() # 定制请求头 119 | html = requests.get(url=url, headers=headers, timeout=5).text 120 | # print("html:", html) 121 | soup = BeautifulSoup(html, 'lxml') 122 | all = soup.find_all('tr', class_='odd') 123 | for i in all: 124 | t = i.find_all('td') 125 | ip = t[1].text + ':' + t[2].text 126 | is_avail = checkip(targeturl, ip) 127 | if is_avail == True: 128 | write(path=path, text=ip) 129 | print(ip) 130 | 131 | 132 | # -----------------------------------------------------多线程抓取ip入口--------------------------------------------------- 133 | def getip(targeturl, path): 134 | truncatefile(path) # 爬取前清空文档 135 | start = datetime.datetime.now() # 开始时间 136 | threads = [] 137 | for type in range(1): # 四种类型ip,每种类型取前三页,共12条线程 138 | for pagenum in range(3): 139 | t = threading.Thread(target=findip, args=(type + 1, pagenum + 1, targeturl, path)) 140 | threads.append(t) 141 | print('开始爬取代理ip') 142 | for s in threads: # 开启多线程爬取 143 | s.start() 144 | for e in threads: # 等待所有线程结束 145 | e.join() 146 | print('爬取完成') 147 | end = datetime.datetime.now() # 结束时间 148 | diff = gettimediff(start, end) # 计算耗时 149 | ips = read(path) # 读取爬到的ip数量 150 | print('一共爬取代理ip: %s 个,共耗时: %s \n' % (len(ips), diff)) 151 | 152 | 153 | # -------------------------------------------------------启动----------------------------------------------------------- 154 | if __name__ == '__main__': 155 | path = 'ip.txt' # 存放爬取ip的文档path 156 | targeturl = 'http://www.cnblogs.com/TurboWay/' # 验证ip有效性的指定url 157 | getip(targeturl, path) 158 | -------------------------------------------------------------------------------- /02TaogubaCrawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import threading 5 | import random 6 | 
--------------------------------------------------------------------------------
/02TaogubaCrawler.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import os
import threading
import random
import time
import datetime
import threadpool
import MySqlConnect


def getContent(count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    url = 'https://www.taoguba.com.cn/Article/' + str(count) + '/1'
    # proxy_dict = [  # free proxy ips from http://www.xiladaili.com/  # proxy pool
    #     # '60.255.186.169:8888', '42.176.36.251:43800', '120.198.61.126:38724', '39.105.171.101:3128',
    #     # '123.206.6.218:8888'
    #     '123.7.61.8:53281', '106.12.7.54:8118', '117.114.149.66:53281'
    # ]
    f = open(".\\ip.txt")  # file with the proxies collected by 00爬免费代理.py
    proxy_dict = f.read().strip()
    proxy_dict = proxy_dict.split("\n")
    # print("proxy_dict:", proxy_dict)
    random_ip = random.choice(proxy_dict)
    proxy_dict = {'http': 'http://' + random_ip, 'https': 'http://' + random_ip}
    # print(random_ip)
    # requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
    s = requests.session()
    s.keep_alive = False  # do not keep spare connections around
    f.close()

    try:
        res = s.get(url, headers=headers, proxies=proxy_dict)  # fetch the post through the randomly chosen proxy
        # res = requests.get(url, headers=headers)  # direct fetch without the proxy
        soup = BeautifulSoup(res.text, 'html.parser')  # parse the response
        # pull out the post
        tatime = soup.find_all('span', class_='p_tatime')  # post time
        content = soup.find_all('div', class_='p_coten')  # post body
        comment = soup.find_all('div', class_='pcnr_wz')  # replies
        # print(len(comment), type(comment))
        allcomment = ''
        number = len(comment)  # number of replies on this post
        replyid = 1
        for i in range(number):  # postid = id of the reply, parentid = id of the thread it belongs to
            allcomment += comment[i].text
            save2DB_comment(count, replyid, comment[i].text)
            replyid += 1
            # print(comment[i].text)
        # print(allcomment, type(allcomment))
        # print(str(count) + ':' + tatime[0].text + ":" + content[0].text + ":" + str(number) + ":" + allcomment)
        # save2DB_content(count, tatime[0].text, content[0].text, str(number), allcomment)  # replies stored together
        save2DB_content(count, tatime[0].text, content[0].text, str(number))
    except Exception as e:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " - " + random_ip + " : " + url + "\n" + str(e))
    finally:
        return


# def save2DB_content(id, stringTime, content, number, comment):
#     sql = "INSERT INTO taoguba (\
#     id,\
#     stringTime,\
#     time,\
#     content,\
#     number,\
#     comment\
#     )\
#     VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\",\"" + comment.strip() + "\")"
#     # print(sql)
#     MySqlConnect.edit(sql)
def save2DB_content(id, stringTime, content, number):
    sql = "INSERT INTO taoguba (\
    id,\
    stringTime,\
    time,\
    content,\
    number\
    )\
    VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


def save2DB_comment(postid, replyid, comment):
    sql = "INSERT INTO comment (\
    postid, \
    replyid, \
    comment\
    )\
    VALUES(\"" + str(postid) + "\",\"" + str(replyid) + "\",\"" + comment + "\")"
    # print(sql)
    MySqlConnect.edit(sql)
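
# Illustrative sketch (not part of the original script): the INSERT statements above
# splice scraped text straight into the SQL string, so any quote character in a post
# breaks the statement and makes it injectable. pymysql can bind the values itself
# via %s placeholders; the connection settings below simply mirror the dbInfo dict in
# MySqlConnect.py and may need adjusting.
def save2DB_comment_params(postid, replyid, comment):
    import pymysql
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         passwd='root', db='ai', charset='utf8')
    try:
        with db.cursor() as cursor:
            # the driver escapes the bound values, so quotes in the reply text are safe
            cursor.execute(
                "INSERT INTO comment (postid, replyid, comment) VALUES (%s, %s, %s)",
                (postid, replyid, comment))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
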
if __name__ == "__main__":
    # range of post ids to crawl (adjust begin/end as needed)
    begin = 79000
    end = 200000
    # thread pool: https://www.cnblogs.com/xiaozi/p/6182990.html
    pool = threadpool.ThreadPool(2)
    # for _count in range(begin, end):
    threadRequests = threadpool.makeRequests(getContent, range(begin, end))
    [pool.putRequest(req) for req in threadRequests]
    pool.wait()

--------------------------------------------------------------------------------
/03阿布云版.py:
--------------------------------------------------------------------------------
from urllib import request
import requests
from bs4 import BeautifulSoup
import datetime
import threadpool
import MySqlConnect


# Fetch a page through the Abuyun dynamic proxy tunnel, so the visible ip keeps changing
def open_web(url):
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"

    # credentials for the proxy tunnel
    proxyUser = "HGO7O08N5FL3528D"
    proxyPass = "776880CC24445926"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    proxy_handler = request.ProxyHandler({
        "http": proxyMeta,
        "https": proxyMeta,
    })

    # auth = request.HTTPBasicAuthHandler()
    # opener = request.build_opener(proxy_handler, auth, request.HTTPHandler)
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
    html = request.urlopen(url).read().decode('utf-8')
    return html


def getContent(count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    url = 'https://www.taoguba.com.cn/Article/' + str(count) + '/1'
    # proxy_dict = [  # free proxy ips from http://www.xiladaili.com/  # proxy pool
    #     # '60.255.186.169:8888', '42.176.36.251:43800', '120.198.61.126:38724', '39.105.171.101:3128',
    #     # '123.206.6.218:8888'
    #     '123.7.61.8:53281', '106.12.7.54:8118', '117.114.149.66:53281'
    # ]

    # f = open(".\\ip.txt")  # file with the proxies collected by 00爬免费代理.py
    # proxy_dict = f.read().strip()
    # proxy_dict = proxy_dict.split("\n")
    # # print("proxy_dict:", proxy_dict)
    # random_ip = random.choice(proxy_dict)
    # proxy_dict = {'https': random_ip}
    # # print(random_ip)
    # # requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
    # s = requests.session()
    # s.keep_alive = False  # do not keep spare connections around
    # f.close()

    try:
        html = open_web(url)  # fetch the post through the Abuyun tunnel
        # res = s.get(url, headers=headers, proxies=proxy_dict)
        # res = requests.get(url, headers=headers)  # direct fetch without the proxy
        soup = BeautifulSoup(html, 'html.parser')  # parse the response
        # pull out the post
        tatime = soup.find_all('span', class_='p_tatime')  # post time
        content = soup.find_all('div', class_='p_coten')  # post body
        comment = soup.find_all('div', class_='pcnr_wz')  # replies
        # print(len(comment), type(comment))
        allcomment = ''
        number = len(comment)  # number of replies on this post
        replyid = 1
        for i in range(number):  # postid = id of the reply, parentid = id of the thread it belongs to
            allcomment += comment[i].text
            save2DB_comment(count, replyid, comment[i].text)
            replyid += 1
            # print(comment[i].text)
        # print(allcomment, type(allcomment))
        # print(str(count) + ':' + tatime[0].text + ":" + content[0].text + ":" + str(number) + ":" + allcomment)
        # save2DB_content(count, tatime[0].text, content[0].text, str(number), allcomment)  # replies stored together
        save2DB_content(count, tatime[0].text, content[0].text, str(number))
    except Exception as e:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " - " + str(e))
    finally:
        return
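
# Illustrative sketch (not part of the original script): open_web() installs a urllib
# opener while the rest of the project uses requests. requests can go through the same
# Abuyun tunnel directly via its proxies argument, which also lets the User-Agent
# header be sent; the proxyMeta string is assumed to be built exactly as in open_web().
def fetch_via_abuyun(url, headers=None):
    proxyMeta = "http://%s:%s@%s:%s" % ("HGO7O08N5FL3528D", "776880CC24445926",
                                        "http-dyn.abuyun.com", "9020")
    proxies = {"http": proxyMeta, "https": proxyMeta}
    # every call goes out through the dynamic tunnel, so the exit ip rotates
    return requests.get(url, headers=headers, proxies=proxies, timeout=10).text
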
def save2DB_content(id, stringTime, content, number):
    sql = "INSERT INTO taoguba (\
    id,\
    stringTime,\
    time,\
    content,\
    number\
    )\
    VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


def save2DB_comment(postid, replyid, comment):
    sql = "INSERT INTO comment (\
    postid, \
    replyid, \
    comment\
    )\
    VALUES(\"" + str(postid) + "\",\"" + str(replyid) + "\",\"" + comment + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


if __name__ == "__main__":
    # range of post ids to crawl (adjust begin/end as needed)
    begin = 1
    end = 10000
    # thread pool: https://www.cnblogs.com/xiaozi/p/6182990.html
    pool = threadpool.ThreadPool(2)
    # for _count in range(begin, end):
    threadRequests = threadpool.makeRequests(getContent, range(begin, end))
    [pool.putRequest(req) for req in threadRequests]
    pool.wait()

--------------------------------------------------------------------------------
/MySqlConnect.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import pymysql

dbInfo = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'passwd': 'root',
    'db': 'ai',
    'charset': 'utf8'
}


# Note: Python has no overloading, so the one-argument select(sql) defined below
# replaces this two-argument version at import time; it is kept here for reference.
def select(dbInfo, sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    results = ()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # fetch every row
        results = cursor.fetchall()
    except Exception:
        print("Error: unable to fetch data")
    finally:
        # close the connection
        db.close()
    return results


def select(sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    results = ()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # fetch every row
        results = cursor.fetchall()
    except Exception:
        print("Error: unable to fetch data")
    finally:
        # close the connection
        db.close()
    return results


def edit(sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # commit the change
        db.commit()
    except Exception as e:
        print(e)
        print("Error: unable to execute the statement")
        db.rollback()
    finally:
        # close the connection
        db.close()

# results = select("select * from dt_hiddendanger_record")
# for row in results:
#     print(row)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Taoguba_Data

This project crawls post data from the Taoguba forum, largely following the approach used for 东方财富网 (Eastmoney). The research has two parts: how posting behaviour is distributed across the forum, and the lagged correlation between that behaviour and stock price movements. This was my first real coding project, written one Baidu search at a time while being thoroughly worn down by the code, and I am grateful for all the ideas and snippets people have shared there. A CNN was later used for stock prediction as well, although the results were only passable. What is uploaded here is the crawler part (the IP has since been banned). If you pass by, corrections are very welcome; many thanks.
--------------------------------------------------------------------------------
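
The crawlers in 02TaogubaCrawler.py and 03阿布云版.py insert into `taoguba` and `comment` tables that are not defined anywhere in the repository. Purely as an aid to running them, the sketch below creates tables with the columns those INSERT statements expect, using the repo's own MySqlConnect.edit(); the column types are guesses, not something the original author specified.

import MySqlConnect

# Hypothetical schema matching the INSERT statements in 02/03; adjust types as needed.
MySqlConnect.edit("""
CREATE TABLE IF NOT EXISTS taoguba (
    id INT PRIMARY KEY,      -- post id (the Article number in the url)
    stringTime VARCHAR(32),  -- post time exactly as scraped
    time DATETIME,           -- same value with ":00" appended, stored as datetime
    content TEXT,            -- post body
    number INT               -- number of replies
) DEFAULT CHARSET=utf8
""")
MySqlConnect.edit("""
CREATE TABLE IF NOT EXISTS comment (
    postid INT,              -- id of the post the reply belongs to
    replyid INT,             -- running reply index within the post
    comment TEXT             -- reply text
) DEFAULT CHARSET=utf8
""")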