├── .gitignore ├── README.md ├── bmp ├── online.jpg ├── redis.png ├── result.jpg └── worker.jpg ├── fenbus.py ├── get_gupiao.py ├── pythonData ├── Data20190117.xls └── Data20190117排名.xls ├── requirements.txt ├── run.sh └── wtask.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .buildpath 3 | .hgignore.swp 4 | .project 5 | .orig 6 | .swp 7 | *.swp 8 | *.swo 9 | .idea/ 10 | .settings/ 11 | .vscode/ 12 | vender/ 13 | log/ 14 | composer.lock 15 | gitpush.sh 16 | cbuild 17 | */.DS_Store 18 | config/config.toml 19 | config.toml 20 | main 21 | .vscode 22 | go.sum 23 | *.log 24 | /med-prescription 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fgupiao 2 | 百度股票分布式爬虫 3 | 4 | ## 开始 5 | 6 | ### 启动 redis 7 | 8 | ``` 9 | .\redis-server.exe .\redis.windows.conf 10 | ``` 11 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/redis.png) 12 | 13 | ### 启动 worker 14 | 15 | ``` 16 | celery -A wtask worker --loglevel=info 17 | ``` 18 | 19 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/worker.jpg) 20 | 21 | ### 运行客户端 22 | ``` 23 | python fenbus.py #运行客户端 24 | ``` 25 | 26 | ### 运行效果图 27 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/online.jpg) 28 | 29 | ### 运行结果 30 | ![image](https://github.com/itzujun/fgupiao/blob/master/bmp/result.jpg) 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /bmp/online.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/fgupiao/0be4fb865a633edcefb74401428c3383259309f9/bmp/online.jpg -------------------------------------------------------------------------------- /bmp/redis.png: 
# _*_ coding:utf-8 _*_

"""
Distributed crawl of stock details.

Scrapes the eastmoney stock index page for ChiNext (sz300*) and
SME-board (sh002*) codes, fans the per-stock downloads out to Celery
workers (wtask.down), then writes the aggregated results to Excel.
"""

__time__ = "2019.01.15"
__author__ = "open_china"

import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from celery import group

from wtask import down


class GupiaoSpider(object):
    """Coordinates the distributed crawl and persists the results."""

    def __init__(self):
        # Stock-list index page; served as GBK, not UTF-8.
        self.baseurl = "http://quote.eastmoney.com/stocklist.html"
        self.Data = []  # accumulated per-stock result dicts from the workers
        self.Date = time.strftime('%Y%m%d')
        # os.path.join keeps the path portable: run.sh launches this on
        # Linux, where the old hardcoded '.\\pythonData\\' would break.
        self.Recordpath = os.path.join('.', 'pythonData')
        self.filename = 'Data' + self.Date
        self.limit = 800  # tasks dispatched per Celery group (batch size)
        self.session = requests.Session()
        self.timeout = 100  # seconds allowed for the index-page request
        if not os.path.exists(self.Recordpath):
            os.makedirs(self.Recordpath)

    def getTotalUrl(self):
        """Scrape the stock list page.

        Returns:
            list of {"url", "name", "code"} dicts restricted to codes
            starting with sz300 or sh002, or None on any failure.
        """
        try:
            req = self.session.get(self.baseurl, timeout=self.timeout)
            if req.status_code != 200:
                return None
            req.encoding = "gbk"  # override requests' latin-1 default
            lis = BeautifulSoup(req.text, 'lxml').select("div.quotebody li")
            data_lis = []
            for msg in lis:
                cuturl = msg.a["href"].split("/")[-1].replace(".html", "")
                # Each entry reads like "名称(代码)".
                names = msg.text.split("(")
                name = names[0]
                code = names[1].replace(")", "")
                if not (cuturl.startswith("sz300") or cuturl.startswith("sh002")):
                    continue
                data_lis.append({"url": cuturl, "name": name, "code": code})
            return data_lis
        except Exception as e:
            print("getTotalUrl failed:", e)
            return None

    def download(self, tups):
        """Dispatch the crawl in batches of self.limit Celery tasks.

        Blocks on each batch's results, collects non-None ones into
        self.Data, then saves everything to Excel.
        """
        print("start to down...")
        lis = list(tups)
        # Plain slicing replaces the old numpy floor/reshape dance and also
        # handles len(lis) < self.limit without a zero-row reshape.
        batches = [lis[i:i + self.limit] for i in range(0, len(lis), self.limit)]
        for urls in batches:
            print("len:>>>>>>>>>>>>.", len(urls))
            g = group(down.s(parms["url"], parms["name"], parms["code"])
                      for parms in urls).apply_async()
            print(">>>>>>>>>>>>... :", len(g))
            for a in g:
                results = a.get()  # blocks until the worker finishes
                if results is not None:
                    self.Data.append(results)
                print(">>>>> ", results)
        print("批量下载结束...")
        self.save()

    def save(self):
        """Write the raw file and the file ranked by 涨幅 (descending)."""
        if not self.Data:
            # Nothing downloaded: bail out instead of crashing on the
            # missing 涨幅 column below.
            print("no data to save")
            return
        df = pd.DataFrame(self.Data)
        df.to_excel(os.path.join(self.Recordpath, self.filename + '.xls'),
                    index=False)  # unranked
        # Strip the trailing "%" so the column sorts numerically.
        df["涨幅"] = df["涨幅"].apply(lambda x: float(str(x).replace("%", "")))
        df = df.sort_values(by=["涨幅"], ascending=[False])
        df["涨幅"] = df["涨幅"].apply(lambda x: str(x) + "%")
        df.to_excel(os.path.join(self.Recordpath, self.filename + '排名.xls'),
                    index=False)
        print("保存文件成功:", self.Recordpath)


if __name__ == "__main__":
    t0 = time.time()
    spider = GupiaoSpider()
    urllis = spider.getTotalUrl()
    if urllis is not None:
        spider.download(urllis)
    print("used: ", str(time.time() - t0))
import json
import pandas as pd
import time
import os


class GuPiao():
    """Downloads A-share snapshot pages from eastmoney's push2 JSONP API."""

    def __init__(self):
        self.date = time.strftime('%Y%m%d')
        # JSONP list endpoint; {page} is filled per request, 20 rows per page.
        self.url = "http://55.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406092635132097686_1628574993000&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"

    @staticmethod
    def _parse(text):
        """Strip the JSONP wrapper ``callback(...)`` and return the payload dict.

        Uses rfind for the closing paren so a ')' inside the JSON body
        cannot truncate the payload.
        """
        start = text.find('(')
        end = text.rfind(')')
        return json.loads(text[start + 1:end])

    def get_data(self, page):
        """Fetch one page and return its rows as a list of dicts.

        Rows whose change ratio (f3) or open price (f17) contains "-"
        (suspended / no-quote entries) are skipped.
        """
        import requests  # local import: keeps the module importable without requests
        resp = requests.get(self.url.format(page=page))
        inp_dict = self._parse(resp.text)
        temp_li = []
        for row in inp_dict['data']['diff']:
            if "-" in str(row["f3"]) or "-" in str(row["f17"]):
                continue
            temp_li.append({
                "名称": row['f14'],
                "代码": row['f12'],
                "收盘价": row['f2'],
                "成交量": row['f5'],
                "振幅": row['f7'],
                "最高": row['f15'],
                "最低": row['f16'],
                "开盘价": row['f17'],
                "涨幅": row['f3'],
            })
        return temp_li


if __name__ == "__main__":
    gu = GuPiao()
    li = []
    for i in range(100):
        temp_li = gu.get_data(i + 1)
        li = li + temp_li
        print("正在下载 {page} 数据 ...".format(page=len(li)))
    df = pd.DataFrame(li)

    file = "./files"
    if not os.path.exists(file):
        os.mkdir(file)

    df.to_excel("./files/" + gu.date + '排名.xlsx', index=False)
    print("save success")
# _*_ coding:utf-8_*_

"""
Worker-side Celery tasks for the distributed stock crawler.

write the code , change the world
"""

__author__ = "open_china"
__time__ = "2019.01.15"

import json
import time

import requests
from celery import Celery
import celery

app = Celery('wtask', backend='redis://127.0.0.1:6379/0', broker='redis://127.0.0.1:6379/0')
# app = Celery('wtask', backend='redis://192.168.3.2:6379/0', broker='redis://192.168.3.2:6379/0')

timeout = 10  # per-request timeout (seconds) for the quote API


class CountTask(celery.Task):
    """Task base class that counts successful completions."""

    count = 0  # successes seen by this worker process

    def on_success(self, retval, task_id, args, kwargs):
        # Celery invokes this after a task finishes successfully.
        self.count = self.count + 1
        print("on_success>>>>>>>>>>>>>>>>>>>>>>>>>>..." + str(self.count))
        return self.count

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Celery invokes this when a task raises.
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> on error...")


@app.task(bind=True)
def test_mes(self):
    """Demo task: report progress via update_state in ten 10% steps."""
    for i in range(1, 11):
        time.sleep(0.1)
        print("update_state>>>>>>>>>>>>>>>>>>>>>>>>")
        self.update_state(state="PROGRESS", meta={'p': i * 10})
    return 'finish'


@app.task(base=CountTask)
def hello():
    """Smoke-test task for the CountTask success counter."""
    return "hello world"


@app.task
def down(url, name, code):
    """Fetch the latest day-bar for one stock.

    Args:
        url: eastmoney-style code string, e.g. "sz300001".
        name: display name, copied into the result.
        code: numeric stock code, copied into the result.

    Returns:
        dict of column-name -> value for the most recent bar, or None
        when the request fails or the payload has no mashData.
    """
    record_d = {"名称": name, "代码": code}
    linkurl = "https://gupiao.baidu.com/api/stocks/stockdaybar?from=pc&os_ver=1&cuid=xxx&vv=100&format=json&stock_code=" + \
        url + "&step=3&start=&count=160&fq_type=no&timestamp=" + str(int(time.time()))
    try:
        resp = requests.get(linkurl, timeout=timeout).content
        js = json.loads(resp)
        # Explicit check instead of the old string default ("-") that
        # only failed later via lis[0] raising inside this try block.
        lis = js.get("mashData")
        if not lis:
            print("no mashData", name, code)
            return None
        msg = lis[0].get("kline")
        record_d["涨幅"] = str(format(float(msg.get("netChangeRatio", "-")), ".2f")) + "%"
        record_d["开盘"] = msg.get("open", "-")
        record_d["最高"] = msg.get("high", "-")
        record_d["最低"] = msg.get("low", "-")
        record_d["收盘"] = msg.get("close", "-")
        record_d["成交量"] = msg.get("volume", "-")
        record_d["昨收"] = msg.get("preClose", "-")
        print("完成数据: " + name, code)
        return record_d
    except Exception as e:
        print(e, name, code)
        return None