├── README.md
├── settings.py
└── spider.py

/README.md:
--------------------------------------------------------------------------------
# ths_spider
A crawler for stock information from 同花顺 (10jqka.com.cn).

It fetches stock listings through proxies and extracts the stock name, stock code, price change percentage, and other fields.

To keep the crawler's IP from being banned, a proxy pool is used.

Because the proxies are free, some pages fail to download, so a list is kept to store the failed URLs and retry them later.

Free proxies are unreliable (some work, some do not), so the following strategy is used when switching IPs (a standalone sketch of this policy appears after the end of the file listing):

(1) If a proxy works, keep using it until it stops working.

(2) As soon as a proxy stops working, switch to a new one.

--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# Required settings
MAX_PAGE = 165     # maximum number of pages
PAGE_TRACK = 1     # page currently being tracked
MAX_GET = 1        # maximum download attempts
MAX_PARSE = 1      # maximum parse attempts
MAX_CSV = 1        # maximum file-save attempts
MAX_PROXY = 1      # maximum proxy-fetch attempts
MAX_START = 1      # initial value of the MAX_* counters
MAX_TRY = 4        # maximum number of retries
FLAG = 0           # flag controlling whether url_omi() is used

# Base URL
URL_START = "http://q.10jqka.com.cn//index/index/board/all/field/zdf/order/desc/page/"
PARAMS = "/ajax/1/"


# URLs of pages whose HTML was missing on the first crawl
# (used as a first-in, first-out list)
PAGE_LIST = []

# Proxy pool API
PROXY_POOL_API = "http://127.0.0.1:5555/random"

headers = {
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': 'spversion=20130314; __utma=156575163.1163133091.1530233537.1530289428.1530369413.3; __utmz=156575163.1530369413.3.3.utmcsr=stockpage.10jqka.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1530444468,1530505958,1530506333,1530516152; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1530516152; historystock=300033%7C*%7C1A0001; v=AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp',
    'hexin-v': 'AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp',
    'Host': 'q.10jqka.com.cn',
    'Referer': 'http://q.10jqka.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'}

# Database settings
MONGO_URL = ''
--------------------------------------------------------------------------------
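For reference, the short sketch below (not one of the project files) shows how the settings above fit together: a page URL is built from `URL_START`, the page number, and `PARAMS`, and the address returned by the proxy pool is wrapped into a `proxies` dict for `requests`. It mirrors what `url_yield()` and `proxy_get()` do in spider.py and assumes a proxy pool service is already running at `PROXY_POOL_API` and returns a bare `ip:port` string.

```python
# Illustrative only; mirrors url_yield() and proxy_get() in spider.py.
import requests

from settings import URL_START, PARAMS, PROXY_POOL_API, headers

page = 1
url = "{}{}{}".format(URL_START, page, PARAMS)          # e.g. ".../order/desc/page/1/ajax/1/"

proxy = requests.get(PROXY_POOL_API, timeout=5).text    # the pool returns "ip:port"
proxies = {"http": "http://" + proxy, "https": "https://" + proxy}

r = requests.get(url, headers=headers, proxies=proxies, timeout=4)
print(r.status_code, len(r.text))
```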
/spider.py:
--------------------------------------------------------------------------------
import requests
from urllib.parse import quote
from settings import *
from bs4 import BeautifulSoup
import sys
import time
import random
import csv


class crawl(object):

    def __init__(self):
        self.MAX_PAGE = MAX_PAGE
        self.PAGE_TRACK = PAGE_TRACK            # page currently being processed
        self.FLAG = FLAG                        # counts pages yielded by url_yield()
        self.PAGE_LIST = PAGE_LIST              # pages whose first download/parse failed
        self.URL_START = URL_START              # base URL
        self.PARAMS = PARAMS                    # URL suffix parameters
        self.PROXY_POOL_API = PROXY_POOL_API    # proxy pool endpoint (see settings.py)
        self.proxy_save = None                  # proxy currently in use
        self.proxy_con = 0                      # 0 = fetch a new proxy, 1 = keep the current one
        self.fieldnames = ['代码', '名称', '现价', '涨跌幅']   # stock code, name, current price, change %
        self.file = open("ths.csv", 'a', newline='')          # CSV output file
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def proxy_get(self, num_retries=2):
        """Fetch a proxy from the proxy pool and wrap it in a requests proxies dict."""
        try:
            r_proxy = requests.get(self.PROXY_POOL_API, timeout=5)
            proxy = r_proxy.text                # the pool returns a bare "ip:port" string
            print("Proxy is", proxy)
            proxies = {
                "http": 'http://' + proxy,
                "https": 'https://' + proxy,
            }
            return proxies
        except requests.RequestException:
            if num_retries > 0:
                print("Failed to get a proxy, retrying")
                return self.proxy_get(num_retries - 1)   # return the retried result
            return None

    def url_yield(self):
        """Yield the page URLs for the first crawl pass."""
        for i in range(1, self.MAX_PAGE + 1):
            self.PAGE_TRACK = i                 # track the current page
            self.FLAG += 1                      # incremented once per page
            print('FLAG is:', self.FLAG)
            url = "{}{}{}".format(self.URL_START, i, self.PARAMS)
            yield url

    def url_omi(self):
        """Yield URLs for the pages that failed on the first pass."""
        print("Re-crawling the missed pages")
        length_pl = len(self.PAGE_LIST)
        if length_pl != 0:                      # nothing to do if the list is empty
            for i in range(length_pl):
                self.PAGE_TRACK = self.PAGE_LIST.pop(0)   # FIFO: pop the oldest failed page
                url = "{}{}{}".format(self.URL_START, self.PAGE_TRACK, self.PARAMS)
                yield url

    def downloader(self, url, num_retries=3):
        """Download one page, reusing the current proxy until it stops working."""
        if self.proxy_con == 0:
            proxies = self.proxy_get()          # fetch a new proxy
        else:
            proxies = self.proxy_save           # keep using the current proxy
        self.proxy_save = proxies               # remember the proxy in use
        headers_list = [{
            'Accept': 'text/html, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'log=; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1533992361,1533998469,1533998895,1533998953; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1533998953; user=MDrAz9H9akQ6Ok5vbmU6NTAwOjQ2OTU0MjIzNDo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxLDQwOzIsMSw0MDszLDEsNDA7NSwxLDQwOzgsMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDEsNDA6Ojo6NDU5NTQyMjM0OjE1MzM5OTkwNzU6OjoxNTMzOTk5MDYwOjg2NDAwOjA6MTZmOGFjOTgwMGNhMjFjZjRkMWZlMjk0NDQ4M2FhNDFkOmRlZmF1bHRfMjox; userid=459542234; u_name=%C0%CF%D1%FDjD; escapename=%25u8001%25u5996jD; ticket=7c92fb758f81dfa4399d0983f7ee5e53; v=Ajz6VIblS6HlDX_9PqmhBV0QDdH4NeBfYtn0Ixa9SCcK4daNPkWw77LpxLZl',
            'hexin-v': 'AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp',
            'Host': 'q.10jqka.com.cn',
            'Referer': 'http://q.10jqka.com.cn/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            }, {
            'Accept': 'text/html, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'user=MDq62tH9NUU6Ok5vbmU6NTAwOjQ2OTU0MjA4MDo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxLDQwOzIsMSw0MDszLDEsNDA7NSwxLDQwOzgsMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDEsNDA6Ojo6NDU5NTQyMDgwOjE1MzM5OTg4OTc6OjoxNTMzOTk4ODgwOjg2NDAwOjA6MTEwOTNhMzBkNTAxMWFlOTg0OWM1MzVjODA2NjQyMThmOmRlZmF1bHRfMjox; userid=459542080; u_name=%BA%DA%D1%FD5E; escapename=%25u9ed1%25u59965E; ticket=658289e5730da881ef99b521b65da6af; log=; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1533992361,1533998469,1533998895,1533998953; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1533998953; v=AibgksC3Qd-feBV7t0kbK7PCd5e-B2rBPEueJRDPEskkk8xLeJe60Qzb7jDj',
            'hexin-v': 'AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp',
            'Host': 'q.10jqka.com.cn',
            'Referer': 'http://q.10jqka.com.cn/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            },
            {'Accept': 'text/html, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'Cookie': 'user=MDq62sm9wM%2FR%2FVk6Ok5vbmU6NTAwOjQ2OTU0MTY4MTo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxLDQwOzIsMSw0MDszLDEsNDA7NSwxLDQwOzgsMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDEsNDA6Ojo6NDU5NTQxNjgxOjE1MzM5OTg0NjI6OjoxNTMzOTk4NDYwOjg2NDAwOjA6MTAwNjE5YWExNjc2NDQ2MGE3ZGYxYjgxNDZlNzY3ODIwOmRlZmF1bHRfMjox; userid=459541681; u_name=%BA%DA%C9%BD%C0%CF%D1%FDY; escapename=%25u9ed1%25u5c71%25u8001%25u5996Y; ticket=4def626a5a60cc1d998231d7730d2947; log=; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1533992361,1533998469; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1533998496; v=AvYwAjBHsS9PCEXLZexL20PSRyfuFzpQjFtutWDf4ll0o5zbyKeKYVzrvsAz', 'hexin-v': 'AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp', 'Host': 'q.10jqka.com.cn', 'Referer': 'http://q.10jqka.com.cn/', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest'},
            {'Accept': 'text/html, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'Cookie': 'Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1533992361; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1533992361; user=MDq62sm9SnpsOjpOb25lOjUwMDo0Njk1NDE0MTM6NywxMTExMTExMTExMSw0MDs0NCwxMSw0MDs2LDEsNDA7NSwxLDQwOzEsMSw0MDsyLDEsNDA7MywxLDQwOzUsMSw0MDs4LDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAxLDQwOjo6OjQ1OTU0MTQxMzoxNTMzOTk4MjA5Ojo6MTUzMzk5ODE2MDo4NjQwMDowOjFlYTE2YTBjYTU4MGNmYmJlZWJmZWExODQ3ODRjOTAxNDpkZWZhdWx0XzI6MQ%3D%3D; userid=459541413; u_name=%BA%DA%C9%BDJzl; escapename=%25u9ed1%25u5c71Jzl; ticket=b909a4542156f3781a86b8aaefce3007; v=ApheKMKxdxX9FluRdtjNUdGcac08gfwLXuXQj9KJ5FOGbTKxepHMm671oBoh', 'hexin-v': 'AiDRI3i0b1qEZNNemO_FOZlE8SXqKQQBpg9Y4Jox7pbOH8oZQjnUg_YdKIHp', 'Host': 'q.10jqka.com.cn', 'Referer': 'http://q.10jqka.com.cn/', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest'},
        ]

        try:
            time.sleep(random.random() * 5)     # random delay between requests
            headers = random.choice(headers_list)
            r = requests.get(url, headers=headers, proxies=proxies, timeout=4)
        except requests.RequestException:
            if num_retries > 0:
                print("Retrying the download")
                self.proxy_con = 0              # force a proxy switch
                return self.downloader(url, num_retries - 1)
            if self.PAGE_TRACK not in self.PAGE_LIST:
                # record the failed page so url_omi() can retry it later
                self.PAGE_LIST.append(self.PAGE_TRACK)
        else:
            return r.text
    def items_return(self):
        """Crawl every page, parse the stock table, and write the rows to ths.csv."""
        sys.setrecursionlimit(5000)
        count = 0
        while True:
            if self.FLAG < self.MAX_PAGE:
                url_list = self.url_yield()     # first pass over all pages
            else:
                url_list = self.url_omi()       # second pass over the failed pages
                if len(self.PAGE_LIST) == 0:
                    break
            print("Entering the download stage")

            for url in url_list:
                html = self.downloader(url)
                print('URL is:', url)
                items = {}                      # one row of stock data
                try:
                    soup = BeautifulSoup(html, 'lxml')
                    for tr in soup.find('tbody').find_all('tr'):
                        td_list = tr.find_all('td')
                        items['代码'] = td_list[1].string      # stock code
                        items['名称'] = td_list[2].string      # stock name
                        items['现价'] = td_list[3].string      # current price
                        items['涨跌幅'] = td_list[4].string    # change percentage
                        self.writer.writerow(items)
                        print(items)
                        print("Saved")
                    # the page was parsed successfully, so keep using the current proxy
                    self.proxy_con = 1
                    # yield items               # could also return the parsed rows to the caller
                except Exception:
                    print("Parse failed")
                    # the page failed to parse, so switch the proxy next time
                    self.proxy_con = 0
                    # print(html)               # uncomment to inspect the raw response
                    if self.PAGE_TRACK not in self.PAGE_LIST:
                        self.PAGE_LIST.append(self.PAGE_TRACK)
                    else:
                        count += 1

            if count == 2:
                break

        self.file.close()                       # flush and close the CSV output


if __name__ == '__main__':
    app = crawl()
    app.items_return()                          # run the crawl and write the results
--------------------------------------------------------------------------------
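The README's proxy-switching policy, reuse a proxy until it fails and switch as soon as it does, is what `proxy_con`/`proxy_save` implement in spider.py (`proxy_con = 1` keeps the saved proxy, `proxy_con = 0` forces a new `proxy_get()` call). The sketch below is a minimal standalone illustration of that policy only; the function name `fetch` and the `_current_proxy` cache are illustrative and not part of spider.py, and it again assumes the proxy pool at `PROXY_POOL_API` returns a bare `ip:port` string.

```python
# Minimal sketch of the "reuse until failure" proxy policy from the README.
import requests

from settings import PROXY_POOL_API, headers

_current_proxy = None   # proxies dict currently in use; None means fetch a fresh one


def fetch(url, retries=3):
    """Download url, keeping the current proxy while it works and replacing it on failure."""
    global _current_proxy
    for _ in range(retries):
        if _current_proxy is None:
            addr = requests.get(PROXY_POOL_API, timeout=5).text   # "ip:port"
            _current_proxy = {"http": "http://" + addr, "https": "https://" + addr}
        try:
            r = requests.get(url, headers=headers, proxies=_current_proxy, timeout=4)
            return r.text           # success: keep _current_proxy for the next call
        except requests.RequestException:
            _current_proxy = None   # failure: drop the proxy and pick a new one
    return None                     # give up after `retries` attempts
```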