├── 00爬免费代理.py
├── 02TaogubaCrawler.py
├── 03阿布云版.py
├── MySqlConnect.py
└── README.md

--------------------------------------------------------------------------------
/00爬免费代理.py:
--------------------------------------------------------------------------------
# Reference blog: https://www.cnblogs.com/TurboWay/p/8172246.html
# http://www.xicidaili.com/nt/
# 1. Why build a pool of proxy IPs for a crawler
#
# A common anti-crawling measure is to rate-limit by IP: once an IP's request count
# passes a threshold within some time window, that IP is blacklisted and blocked for a while.
#
# You can respond either by slowing the crawler down or by switching IPs. The latter
# needs a pool of working proxy IPs that the crawler can rotate through.
#
# 2. How to build a proxy IP pool
#
# Approach: 1. Find a free proxy site (e.g. XiciDaili)
#
#           2. Scrape the IPs (plain requests + BeautifulSoup)
#
#           3. Check each IP (request a chosen URL through it and see whether the status code is 200)
#
#           4. Record the working IPs (write them to a text file)


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests, threading, datetime
from bs4 import BeautifulSoup
import random

"""
1. Scrape proxy IPs from the XiciDaili site
2. Check each scraped IP against the given target url
3. Save the working ones to the given path
"""


# ------------------------------------------------------ file helpers --------------------------------------------------
# Append one line of text to the file
def write(path, text):
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(text)
        f.write('\n')


# Empty the file
def truncatefile(path):
    with open(path, 'w', encoding='utf-8') as f:
        f.truncate()


# Read the file into a list of stripped lines
def read(path):
    with open(path, 'r', encoding='utf-8') as f:
        txt = []
        for s in f.readlines():
            txt.append(s.strip())
    return txt


# ----------------------------------------------------------------------------------------------------------------------
# Time difference between start and end, formatted as HH:MM:SS
def gettimediff(start, end):
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    diff = ("%02d:%02d:%02d" % (h, m, s))
    return diff


# ----------------------------------------------------------------------------------------------------------------------
# Return a random request header
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers
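
# Illustrative sketch (not part of the original script): besides checking for a 200
# status, a proxy can be checked for anonymity by asking an echo service such as
# http://httpbin.org/ip which origin IP it sees. The echo endpoint and the behaviour
# of free proxies here are assumptions, not something this repository guarantees.
# It reuses getheaders() defined above.
def check_anonymity(ip):
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    try:
        origin = requests.get("http://httpbin.org/ip", proxies=proxies,
                              headers=getheaders(), timeout=5).json().get("origin", "")
        # If the echoed origin matches the proxy address, the proxy is not leaking
        # the real client IP in an obvious way.
        return origin.split(",")[0].strip() == ip.split(":")[0]
    except Exception:
        return False
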
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 85 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 86 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 87 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 88 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 89 | ] 90 | UserAgent = random.choice(user_agent_list) 91 | headers = {'User-Agent': UserAgent} 92 | return headers 93 | 94 | 95 | # -----------------------------------------------------检查ip是否可用---------------------------------------------------- 96 | def checkip(targeturl, ip): 97 | headers = getheaders() # 定制请求头 98 | proxies = {"http": "http://" + ip, "https": "http://" + ip} # 代理ip 99 | try: 100 | response = requests.get(url=targeturl, proxies=proxies, headers=headers, timeout=5).status_code 101 | if response == 200: 102 | return True 103 | else: 104 | return False 105 | except: 106 | return False 107 | 108 | 109 | # -------------------------------------------------------获取代理方法---------------------------------------------------- 110 | # 免费代理 XiciDaili 111 | def findip(type, pagenum, targeturl, path): # ip类型,页码,目标url,存放ip的路径 112 | list = {'1': 'http://www.xicidaili.com/wn/', # xicidaili国内https代理 113 | '2': 'http://www.xicidaili.com/nn/', # xicidaili国内高匿代理 114 | '3': 'http://www.xicidaili.com/nt/', # xicidaili国内普通代理 115 | '4': 'http://www.xicidaili.com/wt/'} # xicidaili国外http代理 116 | url = list[str(type)] + str(pagenum) # 配置url 117 | # print("url:",url) 118 | headers = getheaders() # 定制请求头 119 | html = requests.get(url=url, headers=headers, timeout=5).text 120 | # print("html:", html) 121 | soup = BeautifulSoup(html, 'lxml') 122 | all = soup.find_all('tr', class_='odd') 123 | for i in all: 124 | t = i.find_all('td') 125 | ip = t[1].text + ':' + t[2].text 126 | is_avail = checkip(targeturl, ip) 127 | if is_avail == True: 128 | write(path=path, text=ip) 129 | print(ip) 130 | 131 | 132 | # -----------------------------------------------------多线程抓取ip入口--------------------------------------------------- 133 | def getip(targeturl, path): 134 | truncatefile(path) # 爬取前清空文档 135 | start = datetime.datetime.now() # 开始时间 136 | threads = [] 137 | for type in range(1): # 四种类型ip,每种类型取前三页,共12条线程 138 | for pagenum in range(3): 139 | t = threading.Thread(target=findip, args=(type + 1, pagenum + 1, targeturl, path)) 140 | threads.append(t) 141 | print('开始爬取代理ip') 142 | for s in threads: # 开启多线程爬取 143 | s.start() 144 | for e in threads: # 等待所有线程结束 145 | e.join() 146 | print('爬取完成') 147 | end = datetime.datetime.now() # 结束时间 148 | diff = gettimediff(start, end) # 计算耗时 149 | ips = read(path) # 读取爬到的ip数量 150 | print('一共爬取代理ip: %s 个,共耗时: %s \n' % (len(ips), diff)) 151 | 152 | 153 | # -------------------------------------------------------启动----------------------------------------------------------- 154 | if __name__ == '__main__': 155 | path = 'ip.txt' # 存放爬取ip的文档path 156 | targeturl = 'http://www.cnblogs.com/TurboWay/' # 验证ip有效性的指定url 157 | getip(targeturl, path) 158 | -------------------------------------------------------------------------------- /02TaogubaCrawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import threading 5 | import random 6 | 
--------------------------------------------------------------------------------
/02TaogubaCrawler.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import os
import threading
import random
import time
import datetime
import threadpool
import MySqlConnect


def getContent(count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    url = 'https://www.taoguba.com.cn/Article/' + str(count) + '/1'
    # proxy_dict = [  # free proxy ips from http://www.xiladaili.com/  # proxy pool
    #     # '60.255.186.169:8888', '42.176.36.251:43800', '120.198.61.126:38724', '39.105.171.101:3128',
    #     # '123.206.6.218:8888'
    #     '123.7.61.8:53281', '106.12.7.54:8118', '117.114.149.66:53281'
    # ]
    f = open(".\\ip.txt")  # file with the proxies collected by 00爬免费代理.py
    proxy_dict = f.read().strip()
    proxy_dict = proxy_dict.split("\n")
    # print("proxy_dict:", proxy_dict)
    random_ip = random.choice(proxy_dict)
    proxy_dict = {'http': 'http://' + random_ip, 'https': 'http://' + random_ip}
    # print(random_ip)
    # requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
    s = requests.session()
    s.keep_alive = False  # do not keep spare connections around
    f.close()

    try:
        res = s.get(url, headers=headers, proxies=proxy_dict)  # fetch the post through the randomly chosen proxy
        # res = requests.get(url, headers=headers)  # direct fetch without the proxy
        soup = BeautifulSoup(res.text, 'html.parser')  # parse the response
        # pull out the post
        tatime = soup.find_all('span', class_='p_tatime')  # post time
        content = soup.find_all('div', class_='p_coten')  # post body
        comment = soup.find_all('div', class_='pcnr_wz')  # replies
        # print(len(comment), type(comment))
        allcomment = ''
        number = len(comment)  # number of replies on this post
        replyid = 1
        for i in range(number):  # postid = id of the reply, parentid = id of the thread it belongs to
            allcomment += comment[i].text
            save2DB_comment(count, replyid, comment[i].text)
            replyid += 1
            # print(comment[i].text)
        # print(allcomment, type(allcomment))
        # print(str(count) + ':' + tatime[0].text + ":" + content[0].text + ":" + str(number) + ":" + allcomment)
        # save2DB_content(count, tatime[0].text, content[0].text, str(number), allcomment)  # replies stored together
        save2DB_content(count, tatime[0].text, content[0].text, str(number))
    except Exception as e:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " - " + random_ip + " : " + url + "\n" + str(e))
    finally:
        return


# def save2DB_content(id, stringTime, content, number, comment):
#     sql = "INSERT INTO taoguba (\
#     id,\
#     stringTime,\
#     time,\
#     content,\
#     number,\
#     comment\
#     )\
#     VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\",\"" + comment.strip() + "\")"
#     # print(sql)
#     MySqlConnect.edit(sql)
def save2DB_content(id, stringTime, content, number):
    sql = "INSERT INTO taoguba (\
    id,\
    stringTime,\
    time,\
    content,\
    number\
    )\
    VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


def save2DB_comment(postid, replyid, comment):
    sql = "INSERT INTO comment (\
    postid, \
    replyid, \
    comment\
    )\
    VALUES(\"" + str(postid) + "\",\"" + str(replyid) + "\",\"" + comment + "\")"
    # print(sql)
    MySqlConnect.edit(sql)
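
# Illustrative sketch (not part of the original script): the INSERT statements above
# splice scraped text straight into the SQL string, so any quote character in a post
# breaks the statement and makes it injectable. pymysql can bind the values itself
# via %s placeholders; the connection settings below simply mirror the dbInfo dict in
# MySqlConnect.py and may need adjusting.
def save2DB_comment_params(postid, replyid, comment):
    import pymysql
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         passwd='root', db='ai', charset='utf8')
    try:
        with db.cursor() as cursor:
            # the driver escapes the bound values, so quotes in the reply text are safe
            cursor.execute(
                "INSERT INTO comment (postid, replyid, comment) VALUES (%s, %s, %s)",
                (postid, replyid, comment))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
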
if __name__ == "__main__":
    # range of post ids to crawl (adjust begin/end as needed)
    begin = 79000
    end = 200000
    # thread pool: https://www.cnblogs.com/xiaozi/p/6182990.html
    pool = threadpool.ThreadPool(2)
    # for _count in range(begin, end):
    threadRequests = threadpool.makeRequests(getContent, range(begin, end))
    [pool.putRequest(req) for req in threadRequests]
    pool.wait()

--------------------------------------------------------------------------------
/03阿布云版.py:
--------------------------------------------------------------------------------
from urllib import request
import requests
from bs4 import BeautifulSoup
import datetime
import threadpool
import MySqlConnect


# Fetch a page through the Abuyun dynamic proxy tunnel, so the visible ip keeps changing
def open_web(url):
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"

    # credentials for the proxy tunnel
    proxyUser = "HGO7O08N5FL3528D"
    proxyPass = "776880CC24445926"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    proxy_handler = request.ProxyHandler({
        "http": proxyMeta,
        "https": proxyMeta,
    })

    # auth = request.HTTPBasicAuthHandler()
    # opener = request.build_opener(proxy_handler, auth, request.HTTPHandler)
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
    html = request.urlopen(url).read().decode('utf-8')
    return html


def getContent(count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    url = 'https://www.taoguba.com.cn/Article/' + str(count) + '/1'
    # proxy_dict = [  # free proxy ips from http://www.xiladaili.com/  # proxy pool
    #     # '60.255.186.169:8888', '42.176.36.251:43800', '120.198.61.126:38724', '39.105.171.101:3128',
    #     # '123.206.6.218:8888'
    #     '123.7.61.8:53281', '106.12.7.54:8118', '117.114.149.66:53281'
    # ]

    # f = open(".\\ip.txt")  # file with the proxies collected by 00爬免费代理.py
    # proxy_dict = f.read().strip()
    # proxy_dict = proxy_dict.split("\n")
    # # print("proxy_dict:", proxy_dict)
    # random_ip = random.choice(proxy_dict)
    # proxy_dict = {'https': random_ip}
    # # print(random_ip)
    # # requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
    # s = requests.session()
    # s.keep_alive = False  # do not keep spare connections around
    # f.close()

    try:
        html = open_web(url)  # fetch the post through the Abuyun tunnel
        # res = s.get(url, headers=headers, proxies=proxy_dict)
        # res = requests.get(url, headers=headers)  # direct fetch without the proxy
        soup = BeautifulSoup(html, 'html.parser')  # parse the response
        # pull out the post
        tatime = soup.find_all('span', class_='p_tatime')  # post time
        content = soup.find_all('div', class_='p_coten')  # post body
        comment = soup.find_all('div', class_='pcnr_wz')  # replies
        # print(len(comment), type(comment))
        allcomment = ''
        number = len(comment)  # number of replies on this post
        replyid = 1
        for i in range(number):  # postid = id of the reply, parentid = id of the thread it belongs to
            allcomment += comment[i].text
            save2DB_comment(count, replyid, comment[i].text)
            replyid += 1
            # print(comment[i].text)
        # print(allcomment, type(allcomment))
        # print(str(count) + ':' + tatime[0].text + ":" + content[0].text + ":" + str(number) + ":" + allcomment)
        # save2DB_content(count, tatime[0].text, content[0].text, str(number), allcomment)  # replies stored together
        save2DB_content(count, tatime[0].text, content[0].text, str(number))
    except Exception as e:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " - " + str(e))
    finally:
        return
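
# Illustrative sketch (not part of the original script): open_web() installs a urllib
# opener while the rest of the project uses requests. requests can go through the same
# Abuyun tunnel directly via its proxies argument, which also lets the User-Agent
# header be sent; the proxyMeta string is assumed to be built exactly as in open_web().
def fetch_via_abuyun(url, headers=None):
    proxyMeta = "http://%s:%s@%s:%s" % ("HGO7O08N5FL3528D", "776880CC24445926",
                                        "http-dyn.abuyun.com", "9020")
    proxies = {"http": proxyMeta, "https": proxyMeta}
    # every call goes out through the dynamic tunnel, so the exit ip rotates
    return requests.get(url, headers=headers, proxies=proxies, timeout=10).text
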
def save2DB_content(id, stringTime, content, number):
    sql = "INSERT INTO taoguba (\
    id,\
    stringTime,\
    time,\
    content,\
    number\
    )\
    VALUES(\"" + str(id) + "\",\"" + stringTime + "\",\"" + stringTime + ":00\",\"" + content + "\",\"" + number + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


def save2DB_comment(postid, replyid, comment):
    sql = "INSERT INTO comment (\
    postid, \
    replyid, \
    comment\
    )\
    VALUES(\"" + str(postid) + "\",\"" + str(replyid) + "\",\"" + comment + "\")"
    # print(sql)
    MySqlConnect.edit(sql)


if __name__ == "__main__":
    # range of post ids to crawl (adjust begin/end as needed)
    begin = 1
    end = 10000
    # thread pool: https://www.cnblogs.com/xiaozi/p/6182990.html
    pool = threadpool.ThreadPool(2)
    # for _count in range(begin, end):
    threadRequests = threadpool.makeRequests(getContent, range(begin, end))
    [pool.putRequest(req) for req in threadRequests]
    pool.wait()

--------------------------------------------------------------------------------
/MySqlConnect.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import pymysql

dbInfo = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'passwd': 'root',
    'db': 'ai',
    'charset': 'utf8'
}


# Note: Python has no overloading, so the one-argument select(sql) defined below
# replaces this two-argument version at import time; it is kept here for reference.
def select(dbInfo, sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    results = ()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # fetch every row
        results = cursor.fetchall()
    except Exception:
        print("Error: unable to fetch data")
    finally:
        # close the connection
        db.close()
    return results


def select(sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    results = ()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # fetch every row
        results = cursor.fetchall()
    except Exception:
        print("Error: unable to fetch data")
    finally:
        # close the connection
        db.close()
    return results


def edit(sql):
    # open the database connection
    db = pymysql.connect(**dbInfo)
    # get a cursor
    cursor = db.cursor()
    try:
        # run the SQL statement
        cursor.execute(sql)
        # commit the change
        db.commit()
    except Exception as e:
        print(e)
        print("Error: unable to execute the statement")
        db.rollback()
    finally:
        # close the connection
        db.close()

# results = select("select * from dt_hiddendanger_record")
# for row in results:
#     print(row)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Taoguba_Data

This project crawls post data from the Taoguba forum, largely following the approach used for 东方财富网 (Eastmoney). The research has two parts: how posting behaviour is distributed across the forum, and the lagged correlation between that behaviour and stock price movements. This was my first real coding project, written one Baidu search at a time while being thoroughly worn down by the code, and I am grateful for all the ideas and snippets people have shared there. A CNN was later used for stock prediction as well, although the results were only passable. What is uploaded here is the crawler part (the IP has since been banned). If you pass by, corrections are very welcome; many thanks.
--------------------------------------------------------------------------------
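
The crawlers in 02TaogubaCrawler.py and 03阿布云版.py insert into `taoguba` and `comment` tables that are not defined anywhere in the repository. Purely as an aid to running them, the sketch below creates tables with the columns those INSERT statements expect, using the repo's own MySqlConnect.edit(); the column types are guesses, not something the original author specified.

import MySqlConnect

# Hypothetical schema matching the INSERT statements in 02/03; adjust types as needed.
MySqlConnect.edit("""
CREATE TABLE IF NOT EXISTS taoguba (
    id INT PRIMARY KEY,      -- post id (the Article number in the url)
    stringTime VARCHAR(32),  -- post time exactly as scraped
    time DATETIME,           -- same value with ":00" appended, stored as datetime
    content TEXT,            -- post body
    number INT               -- number of replies
) DEFAULT CHARSET=utf8
""")
MySqlConnect.edit("""
CREATE TABLE IF NOT EXISTS comment (
    postid INT,              -- id of the post the reply belongs to
    replyid INT,             -- running reply index within the post
    comment TEXT             -- reply text
) DEFAULT CHARSET=utf8
""")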