├── .gitignore ├── README.md ├── bmp ├── online.jpg ├── redis.png ├── result.jpg └── worker.jpg ├── fenbus.py ├── get_gupiao.py ├── pythonData ├── Data20190117.xls └── Data20190117排名.xls ├── requirements.txt ├── run.sh └── wtask.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .buildpath 3 | .hgignore.swp 4 | .project 5 | .orig 6 | .swp 7 | *.swp 8 | *.swo 9 | .idea/ 10 | .settings/ 11 | .vscode/ 12 | vender/ 13 | log/ 14 | composer.lock 15 | gitpush.sh 16 | cbuild 17 | */.DS_Store 18 | config/config.toml 19 | config.toml 20 | main 21 | .vscode 22 | go.sum 23 | *.log 24 | /med-prescription 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fgupiao 2 | 百度股票分布式爬虫 3 | 4 | ## 开始 5 | 6 | ### 启动 redis 7 | 8 | ``` 9 | .\redis-server.exe .\redis.windows.conf 10 | ``` 11 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/redis.png) 12 | 13 | ### 启动 worker 14 | 15 | ``` 16 | celery -A wtask worker --loglevel=info 17 | ``` 18 | 19 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/worker.jpg) 20 | 21 | ### 运行客户端 22 | ``` 23 | python fenbus.py #运行客户端 24 | ``` 25 | 26 | ### 运行效果图 27 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/online.jpg) 28 | 29 | ### 运行结果 30 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/result.jpg) 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /bmp/online.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/fgupiao/0be4fb865a633edcefb74401428c3383259309f9/bmp/online.jpg -------------------------------------------------------------------------------- /bmp/redis.png: 
# _*_ coding:utf-8 _*_

"""
Distributed crawl of stock details.

Scrapes the eastmoney stock index page for ChiNext (sz300*) and
SME-board (sh002*) codes, fans the per-stock downloads out to Celery
workers (wtask.down), then writes the aggregated results to Excel.
"""

__time__ = "2019.01.15"
__author__ = "open_china"

import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from celery import group

from wtask import down


class GupiaoSpider(object):
    """Coordinates the distributed crawl and persists the results."""

    def __init__(self):
        # Stock-list index page; served as GBK, not UTF-8.
        self.baseurl = "http://quote.eastmoney.com/stocklist.html"
        self.Data = []  # accumulated per-stock result dicts from the workers
        self.Date = time.strftime('%Y%m%d')
        # os.path.join keeps the path portable: run.sh launches this on
        # Linux, where the old hardcoded '.\\pythonData\\' would break.
        self.Recordpath = os.path.join('.', 'pythonData')
        self.filename = 'Data' + self.Date
        self.limit = 800  # tasks dispatched per Celery group (batch size)
        self.session = requests.Session()
        self.timeout = 100  # seconds allowed for the index-page request
        if not os.path.exists(self.Recordpath):
            os.makedirs(self.Recordpath)

    def getTotalUrl(self):
        """Scrape the stock list page.

        Returns:
            list of {"url", "name", "code"} dicts restricted to codes
            starting with sz300 or sh002, or None on any failure.
        """
        try:
            req = self.session.get(self.baseurl, timeout=self.timeout)
            if req.status_code != 200:
                return None
            req.encoding = "gbk"  # override requests' latin-1 default
            lis = BeautifulSoup(req.text, 'lxml').select("div.quotebody li")
            data_lis = []
            for msg in lis:
                cuturl = msg.a["href"].split("/")[-1].replace(".html", "")
                # Each entry reads like "名称(代码)".
                names = msg.text.split("(")
                name = names[0]
                code = names[1].replace(")", "")
                if not (cuturl.startswith("sz300") or cuturl.startswith("sh002")):
                    continue
                data_lis.append({"url": cuturl, "name": name, "code": code})
            return data_lis
        except Exception as e:
            print("getTotalUrl failed:", e)
            return None

    def download(self, tups):
        """Dispatch the crawl in batches of self.limit Celery tasks.

        Blocks on each batch's results, collects non-None ones into
        self.Data, then saves everything to Excel.
        """
        print("start to down...")
        lis = list(tups)
        # Plain slicing replaces the old numpy floor/reshape dance and also
        # handles len(lis) < self.limit without a zero-row reshape.
        batches = [lis[i:i + self.limit] for i in range(0, len(lis), self.limit)]
        for urls in batches:
            print("len:>>>>>>>>>>>>.", len(urls))
            g = group(down.s(parms["url"], parms["name"], parms["code"])
                      for parms in urls).apply_async()
            print(">>>>>>>>>>>>... :", len(g))
            for a in g:
                results = a.get()  # blocks until the worker finishes
                if results is not None:
                    self.Data.append(results)
                print(">>>>> ", results)
        print("批量下载结束...")
        self.save()

    def save(self):
        """Write the raw file and the file ranked by 涨幅 (descending)."""
        if not self.Data:
            # Nothing downloaded: bail out instead of crashing on the
            # missing 涨幅 column below.
            print("no data to save")
            return
        df = pd.DataFrame(self.Data)
        df.to_excel(os.path.join(self.Recordpath, self.filename + '.xls'),
                    index=False)  # unranked
        # Strip the trailing "%" so the column sorts numerically.
        df["涨幅"] = df["涨幅"].apply(lambda x: float(str(x).replace("%", "")))
        df = df.sort_values(by=["涨幅"], ascending=[False])
        df["涨幅"] = df["涨幅"].apply(lambda x: str(x) + "%")
        df.to_excel(os.path.join(self.Recordpath, self.filename + '排名.xls'),
                    index=False)
        print("保存文件成功:", self.Recordpath)


if __name__ == "__main__":
    t0 = time.time()
    spider = GupiaoSpider()
    urllis = spider.getTotalUrl()
    if urllis is not None:
        spider.download(urllis)
    print("used: ", str(time.time() - t0))
import json
import pandas as pd
import time
import os


class GuPiao():
    """Downloads A-share snapshot pages from eastmoney's push2 JSONP API."""

    def __init__(self):
        self.date = time.strftime('%Y%m%d')
        # JSONP list endpoint; {page} is filled per request, 20 rows per page.
        self.url = "http://55.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406092635132097686_1628574993000&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"

    @staticmethod
    def _parse(text):
        """Strip the JSONP wrapper ``callback(...)`` and return the payload dict.

        Uses rfind for the closing paren so a ')' inside the JSON body
        cannot truncate the payload.
        """
        start = text.find('(')
        end = text.rfind(')')
        return json.loads(text[start + 1:end])

    def get_data(self, page):
        """Fetch one page and return its rows as a list of dicts.

        Rows whose change ratio (f3) or open price (f17) contains "-"
        (suspended / no-quote entries) are skipped.
        """
        import requests  # local import: keeps the module importable without requests
        resp = requests.get(self.url.format(page=page))
        inp_dict = self._parse(resp.text)
        temp_li = []
        for row in inp_dict['data']['diff']:
            if "-" in str(row["f3"]) or "-" in str(row["f17"]):
                continue
            temp_li.append({
                "名称": row['f14'],
                "代码": row['f12'],
                "收盘价": row['f2'],
                "成交量": row['f5'],
                "振幅": row['f7'],
                "最高": row['f15'],
                "最低": row['f16'],
                "开盘价": row['f17'],
                "涨幅": row['f3'],
            })
        return temp_li


if __name__ == "__main__":
    gu = GuPiao()
    li = []
    for i in range(100):
        temp_li = gu.get_data(i + 1)
        li = li + temp_li
        print("正在下载 {page} 数据 ...".format(page=len(li)))
    df = pd.DataFrame(li)

    file = "./files"
    if not os.path.exists(file):
        os.mkdir(file)

    df.to_excel("./files/" + gu.date + '排名.xlsx', index=False)
    print("save success")
# _*_ coding:utf-8_*_

"""
Worker-side Celery tasks for the distributed stock crawler.

write the code , change the world
"""

__author__ = "open_china"
__time__ = "2019.01.15"

import json
import time

import requests
from celery import Celery
import celery

app = Celery('wtask', backend='redis://127.0.0.1:6379/0', broker='redis://127.0.0.1:6379/0')
# app = Celery('wtask', backend='redis://192.168.3.2:6379/0', broker='redis://192.168.3.2:6379/0')

timeout = 10  # per-request timeout (seconds) for the quote API


class CountTask(celery.Task):
    """Task base class that counts successful completions."""

    count = 0  # successes seen by this worker process

    def on_success(self, retval, task_id, args, kwargs):
        # Celery invokes this after a task finishes successfully.
        self.count = self.count + 1
        print("on_success>>>>>>>>>>>>>>>>>>>>>>>>>>..." + str(self.count))
        return self.count

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Celery invokes this when a task raises.
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> on error...")


@app.task(bind=True)
def test_mes(self):
    """Demo task: report progress via update_state in ten 10% steps."""
    for i in range(1, 11):
        time.sleep(0.1)
        print("update_state>>>>>>>>>>>>>>>>>>>>>>>>")
        self.update_state(state="PROGRESS", meta={'p': i * 10})
    return 'finish'


@app.task(base=CountTask)
def hello():
    """Smoke-test task for the CountTask success counter."""
    return "hello world"


@app.task
def down(url, name, code):
    """Fetch the latest day-bar for one stock.

    Args:
        url: eastmoney-style code string, e.g. "sz300001".
        name: display name, copied into the result.
        code: numeric stock code, copied into the result.

    Returns:
        dict of column-name -> value for the most recent bar, or None
        when the request fails or the payload has no mashData.
    """
    record_d = {"名称": name, "代码": code}
    linkurl = "https://gupiao.baidu.com/api/stocks/stockdaybar?from=pc&os_ver=1&cuid=xxx&vv=100&format=json&stock_code=" + \
        url + "&step=3&start=&count=160&fq_type=no&timestamp=" + str(int(time.time()))
    try:
        resp = requests.get(linkurl, timeout=timeout).content
        js = json.loads(resp)
        # Explicit check instead of the old string default ("-") that
        # only failed later via lis[0] raising inside this try block.
        lis = js.get("mashData")
        if not lis:
            print("no mashData", name, code)
            return None
        msg = lis[0].get("kline")
        record_d["涨幅"] = str(format(float(msg.get("netChangeRatio", "-")), ".2f")) + "%"
        record_d["开盘"] = msg.get("open", "-")
        record_d["最高"] = msg.get("high", "-")
        record_d["最低"] = msg.get("low", "-")
        record_d["收盘"] = msg.get("close", "-")
        record_d["成交量"] = msg.get("volume", "-")
        record_d["昨收"] = msg.get("preClose", "-")
        print("完成数据: " + name, code)
        return record_d
    except Exception as e:
        print(e, name, code)
        return None