├── 01_get_all_data
│   ├── __init__.py
│   ├── run.py
│   └── xueqiu_spider
│       ├── config.py
│       ├── content_spider.py
│       ├── copy_data_to_30_day.py
│       └── copy_data_to_history.py
├── 02_add_new_comments
│   ├── __init__.py
│   ├── add_comments
│   │   ├── config.py
│   │   └── new_comments_spider.py
│   └── run.py
├── 03_get_cookies
│   ├── __init__.py
│   ├── login
│   │   ├── chome_config.py
│   │   ├── open_chome.py
│   │   └── save_cookie.py
│   ├── run.py
│   └── ym
│       ├── detail_info.py
│       ├── ym.py
│       └── ym_config.py
├── 04_get_history_from_wangyi
│   ├── __init__.py
│   ├── csv_to_mongodb.py
│   └── get_history_wangyi_csv.py
├── 05_MongoDB_to_MySQL
│   ├── __init__.py
│   ├── mongo_to_mysql_gonggao.py
│   ├── mongo_to_mysql_jiaoyi.py
│   ├── mongo_to_mysql_taolun.py
│   ├── mongo_to_mysql_xinwen.py
│   └── mongo_to_mysql_yanbao.py
├── xueqiu.png
└── README.md

/01_get_all_data/__init__.py:
--------------------------------------------------------------------------------
"""
(Core) Crawl the previous day's data.
"""
--------------------------------------------------------------------------------
/03_get_cookies/__init__.py:
--------------------------------------------------------------------------------
"""
Log in automatically and harvest cookies.
"""
--------------------------------------------------------------------------------
/02_add_new_comments/__init__.py:
--------------------------------------------------------------------------------
"""
(Core) Periodically refresh user-comment data.
"""
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/__init__.py:
--------------------------------------------------------------------------------
"""
Import the MongoDB data into MySQL.
"""
--------------------------------------------------------------------------------
/xueqiu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jasonhower/xueqiu/HEAD/xueqiu.png
--------------------------------------------------------------------------------
/04_get_history_from_wangyi/__init__.py:
--------------------------------------------------------------------------------
"""
Periodically crawl historical stock trading data from NetEase Finance.
"""
--------------------------------------------------------------------------------
/03_get_cookies/ym/ym_config.py:
--------------------------------------------------------------------------------
# API token for the Yima SMS-code platform (left empty here)
TOKEN = ''

# Project (item) id on the platform
ITEMID = 1232
--------------------------------------------------------------------------------
/03_get_cookies/run.py:
--------------------------------------------------------------------------------
from login.open_chome import *

if __name__ == '__main__':
    open_chrome()
--------------------------------------------------------------------------------
/03_get_cookies/ym/detail_info.py:
--------------------------------------------------------------------------------
class detail_info():
    def __init__(self):
        pass

    # TODO: fetch the account balance
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This project crawls user-comment data from Xueqiu (xueqiu.com) for stock
discussions, trades, news, announcements, and research reports.

* 01_get_all_data: (core) crawls the previous day's data
* 02_add_new_comments: (core) periodically refreshes user-comment data
* 03_get_cookies: simulates login to harvest cookies
* 04_get_history_from_wangyi: periodically crawls historical stock trading data from NetEase Finance
* 05_MongoDB_to_MySQL: imports the MongoDB data into MySQL

![Xueqiu crawling workflow](https://github.com/wang1051992187/xueqiu/blob/master/xueqiu.png)
--------------------------------------------------------------------------------
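A rough sketch of how the stages could be chained into one daily job (a hypothetical driver, not part of the repo; in practice each stage's run.py can also be scheduled separately, e.g. with cron):

# daily_job.py -- hypothetical; directory names taken from the tree above
import subprocess
import time

STAGES = [
    '03_get_cookies',       # refresh the cookie pool first
    '01_get_all_data',      # crawl yesterday's posts and their comments
    '02_add_new_comments',  # top up comments on posts from the last 7 days
]

while True:
    for stage in STAGES:
        # each run.py imports sibling packages, so run it from its own directory
        subprocess.run(['python', 'run.py'], cwd=stage, check=False)
    time.sleep(24 * 60 * 60)  # once a day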
/03_get_cookies/login/save_cookie.py:
--------------------------------------------------------------------------------
# Store a harvested cookie in the cookie pool.
import pymongo
import time


def save_cookie(cookie, agent):
    client = pymongo.MongoClient('192.168.1.108', connect=False)
    db = client['cookiesPool']
    collection = db['cookies']
    data = {
        "Host": "xueqiu.com",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": cookie,
        "User-Agent": agent,
        "time": int(time.time()),  # harvest timestamp
        "count_used": 0            # how many crawlers have picked this cookie
    }
    return collection.insert_one(data)
--------------------------------------------------------------------------------
/02_add_new_comments/add_comments/config.py:
--------------------------------------------------------------------------------
MONGODB_URL = '192.168.1.108'

COOKIE_DB = 'cookiesPool'

COOKIE_TABLE = 'cookies'

POST_DAY = ''

TAOLUN_DB_MYSQL = 'all_taolun_comments_2'
JIAOYI_DB_MYSQL = 'all_jiaoyi_comments_2'
XINWEN_DB_MYSQL = 'all_xinwen_comments_2'
GONGGAO_DB_MYSQL = 'all_gonggao_comments_2'
YANBAO_DB_MYSQL = 'all_yanbao_comments_2'


MIDDLE_TAOLUN_DB = 'day30_taolun'
MIDDLE_JIAOYI_DB = 'day30_jiaoyi'
MIDDLE_XINWEN_DB = 'day30_xinwen'
MIDDLE_GONGGAO_DB = 'day30_gonggao'
MIDDLE_YANBAO_DB = 'day30_yanbao'
--------------------------------------------------------------------------------
/02_add_new_comments/run.py:
--------------------------------------------------------------------------------
from add_comments.config import *
from add_comments.new_comments_spider import *
from multiprocessing import Pool
import time


def run_main(type, databases):
    com = add_comments(type, databases)
    com.main()


if __name__ == '__main__':
    # A pool of five worker processes, one per content type.
    pool = Pool(5)

    # add_comments.main() iterates over a list of database names, so each
    # call must receive a list (a bare string would be iterated char by char).
    pool.apply_async(run_main, (1, [MIDDLE_TAOLUN_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (2, [MIDDLE_JIAOYI_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (3, [MIDDLE_XINWEN_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (4, [MIDDLE_GONGGAO_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (5, [MIDDLE_YANBAO_DB]))

    pool.close()  # stop accepting tasks
    pool.join()   # block until every child process has finished
--------------------------------------------------------------------------------
/04_get_history_from_wangyi/csv_to_mongodb.py:
--------------------------------------------------------------------------------
import csv
import os.path

import pymongo


# Walk the download directory and import every CSV file into MongoDB,
# one collection per file (named after the file minus ".csv").
def eachFile(filepath):
    pathDir = os.listdir(filepath)
    print(pathDir)
    for allDir in pathDir:
        child = os.path.join(filepath, allDir)
        if os.path.isfile(child):
            # NetEase's CSV downloads are GBK-encoded ("ANSI" is not a
            # valid Python codec name).
            with open(child, 'rt', encoding="gbk") as csvfile:
                reader = csv.DictReader(csvfile)
                column = [row for row in reader]
                print(column)
                client = pymongo.MongoClient('192.168.1.108', connect=False)
                db = client['all_history_transaction_data']
                db[allDir[:-4]].insert_many(column)


if __name__ == '__main__':
    filenames = r'D:\history'  # root dir holding the downloaded CSV files
    eachFile(filenames)
--------------------------------------------------------------------------------
/01_get_all_data/run.py:
--------------------------------------------------------------------------------
from xueqiu_spider.content_spider import *
from xueqiu_spider.copy_data_to_30_day import *
from xueqiu_spider.copy_data_to_history import *
from multiprocessing import Pool
import time


def run_main(type, divide_db, divide_collection, add_time):
    dealhtml = DealHtml(type, divide_db, divide_collection, add_time)
    dealhtml.main()


if __name__ == '__main__':
    # A pool of twenty worker processes.
    pool = Pool(20)

    # Content types run from 1 (taolun) to 5 (yanbao); there is no type 0.
    for i in range(1, 6):
        pool.apply_async(run_main, (i, COME_DB, SHANGHAI_A, PAST_DAY))
        time.sleep(1)

    for i in range(1, 6):
        pool.apply_async(run_main, (i, COME_DB, SHENZHEN_A, PAST_DAY))
        time.sleep(1)

    pool.close()  # stop accepting tasks
    pool.join()   # block until every child process has finished

    # 24-hour databases --> 30-day databases
    cp = copy_middle_db()
    cp.copy_middle_main()

    # 30-day databases --> history databases
    cp = copy_history_db()
    cp.copy_history_main()
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_taolun.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_taolun_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    # Parameterized INSERT so the driver escapes quotes and other special
    # characters that routinely appear in comment text.
    sql = ("INSERT INTO all_taolun_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/03_get_cookies/login/chome_config.py:
--------------------------------------------------------------------------------
# Desktop User-Agent strings the login browser rotates through.
USER_AGENT = ['Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBRO',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4737.400 QQBrowser/10.0.654.400',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36 Maxthon/5.1.3.2000',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
              'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko']
--------------------------------------------------------------------------------
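save_cookie.py above writes each cookie with a timestamp and a usage counter, and the spiders in this repo treat an HTTP 302 from xueqiu.com as a blocked or logged-out request. A minimal liveness check along those lines (a sketch, not part of the repo; cookie_doc is one document from the cookiesPool collection):

import requests

def cookie_alive(cookie_doc):
    headers = {
        "Cookie": cookie_doc["Cookie"],
        "User-Agent": cookie_doc["User-Agent"],
    }
    # Only a plain 200 with redirects disabled counts as "alive".
    r = requests.get("https://xueqiu.com", headers=headers,
                     allow_redirects=False, timeout=10)
    return r.status_code == 200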
/05_MongoDB_to_MySQL/mongo_to_mysql_jiaoyi.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_jiaoyi_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_jiaoyi_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'jiaoyi'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_xinwen.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_xinwen_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_xinwen_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'xinwen'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_yanbao.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_yanbao_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_yanbao_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'yanbao'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_gonggao.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_gonggao_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_gonggao_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'gonggao'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
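The five scripts above differ only in source collection, target table, and error tag. A hedged sketch of a shared helper that could replace them (hypothetical; host, table, and column names taken from the scripts above):

import pymongo
import MySQLdb

def migrate(collection_name, table_name, come_from):
    client = pymongo.MongoClient('192.168.1.108', 27017)
    source = client['all_comments'][collection_name]

    conn = MySQLdb.connect(host='192.168.1.108', port=3306, user='',
                           passwd='', db='xueqiu', charset='utf8')
    cur = conn.cursor()
    sql = ("INSERT INTO " + table_name +
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for doc in source.find():
        try:
            cur.execute(sql, (doc['url'], doc['股票名称'], doc['评论人'],
                              doc['评论时间'], doc['评论内容']))
        except Exception:
            # mirror the scripts' behaviour: log failed rows
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (doc['url'], come_from))
        conn.commit()
    cur.close()
    conn.close()

# e.g. migrate('all_taolun_comments', 'all_taolun_comments_2', 'taolun')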
/01_get_all_data/xueqiu_spider/config.py:
--------------------------------------------------------------------------------
# MongoDB host
MONGO_URL = '192.168.1.108'

# Xueqiu base URL
XUEQIU_URL = 'https://xueqiu.com'

# IP proxy-pool endpoint
PROXY_POOL_URL = 'http://127.0.0.1:5000/get'

# Database holding the A-share company lists
COME_DB = 'company'

# A-share source collections (run.py reads exactly these two)
SHANGHAI_A = 'shanghai'  # Shanghai A shares
SHENZHEN_A = 'shenzhen'  # Shenzhen A shares

# Look-back window in seconds (days * hours * minutes * seconds);
# the daily crawl covers the previous day.
PAST_DAY = 1 * 24 * 60 * 60


# Databases for the current day's crawl
TODAY_TAOLUN_DB = 'day1_taolun'
TODAY_JIAOYI_DB = 'day1_jiaoyi'
TODAY_XINWEN_DB = 'day1_xinwen'
TODAY_GONGGAO_DB = 'day1_gonggao'
TODAY_YANBAO_DB = 'day1_yanbao'


# 30-day staging databases
MIDDLE_TIME = 30

MIDDLE_TAOLUN_DB = 'day30_taolun'
MIDDLE_JIAOYI_DB = 'day30_jiaoyi'
MIDDLE_XINWEN_DB = 'day30_xinwen'
MIDDLE_YANBAO_DB = 'day30_yanbao'
MIDDLE_GONGGAO_DB = 'day30_gonggao'

MIDDLE_COMMENTS = 'middle_comments'

# History databases
HISTORY_TAOLUN_DB = 'xueqiu_history_taolun'
HISTORY_JIAOYI_DB = 'xueqiu_history_jiaoyi'
HISTORY_XINWEN_DB = 'xueqiu_history_xinwen'
HISTORY_YANBAO_DB = 'xueqiu_history_yanbao'
HISTORY_GONGGAO_DB = 'xueqiu_history_gonggao'

HISTORY_COMMENTS = 'xueqiu_history_comments'

# MySQL target tables
TAOLUN_DB_MYSQL = 'all_taolun_comments_2'
JIAOYI_DB_MYSQL = 'all_jiaoyi_comments_2'
XINWEN_DB_MYSQL = 'all_xinwen_comments_2'
YANBAO_DB_MYSQL = 'all_yanbao_comments_2'
GONGGAO_DB_MYSQL = 'all_gonggao_comments_2'
--------------------------------------------------------------------------------
/03_get_cookies/ym/ym.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Client for the Yima (fxhyd.cn) SMS-verification platform.
from ym.ym_config import *
import requests
import re
import time
# requests raises its own ConnectionError, not the builtin one
from requests.exceptions import ConnectionError


class ym():
    def __init__(self):
        self.token = TOKEN
        self.itemid = ITEMID

    # Fetch account information.
    def get_accountinfo(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getaccountinfo&token=' + self.token
        html = self.get_html(url)
        return self.deal_html(html)

    # Rent a phone number.
    def get_mobile(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getmobile&itemid={}&token={}'.format(self.itemid, self.token)
        print(url)
        html = self.get_html(url)
        res = self.deal_html(html)  # named `res` so it cannot shadow the re module
        if res[0] == 'success':
            return res[1]

    # Release one phone number.
    def release(self, mobile):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=release&mobile={}&itemid={}&token={}'.format(mobile, self.itemid, self.token)
        html = self.get_html(url)
        return self.deal_html(html)

    # Release every held phone number.
    def release_all(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=releaseall&token={}'.format(self.token)
        html = self.get_html(url)
        return self.deal_html(html)

    # Poll for the verification SMS (up to 12 tries, 5 s apart).
    def get_sms(self, mobile):
        for n in range(1, 13):
            time.sleep(5)
            url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getsms&mobile={}&itemid={}&token={}&release=1'.format(mobile, self.itemid, self.token)
            html = self.get_html(url)
            print(html)
            if html != '3001':  # '3001' appears to mean "no SMS yet"; keep polling
                res = self.deal_html(html)
                print(res)
                if res[0] == 'success':
                    info = re.findall(r'\d+', res[1])
                    return info[0]
        return None

    # Placeholder: query SMS delivery status.
    def get_send_sms_state(self):
        pass

    # Fetch a URL.
    def get_html(self, url):
        try:
            response = requests.get(url)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    # The API returns '|'-separated fields.
    def deal_html(self, html):
        print(html)
        return html.split("|")
--------------------------------------------------------------------------------
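How open_chome.py (further below) drives this client, shown standalone (a sketch; it assumes a funded account and a valid TOKEN/ITEMID in ym_config.py):

from ym.ym import ym

y = ym()
mobile = y.get_mobile()       # rent a phone number for the Xueqiu item id
if mobile:
    code = y.get_sms(mobile)  # poll up to ~60 s for the verification SMS
    if code is None:
        y.release(mobile)     # hand the number back if no SMS arrived
    else:
        print(mobile, code)   # feed these into the registration form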
/04_get_history_from_wangyi/get_history_wangyi_csv.py:
--------------------------------------------------------------------------------
import requests
import time
from lxml import etree
import pymongo
import re


class Download_HistoryStock(object):
    def __init__(self, code, name, totalCount):
        self.code = code              # exchange-prefixed code: '0'+code for SH, '1'+code for SZ
        self.name = name
        self.totalCount = totalCount  # bare six-digit stock code
        self.start_url = "http://quotes.money.163.com/trade/lsjysj_" + totalCount + ".html#01b07"
        print(self.start_url)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        response = requests.get(self.start_url, headers=self.headers)
        print(response.status_code)
        if response.status_code == 200:
            return etree.HTML(response.content)
        return False

    def get_date(self, response):
        # Read the first and last trading dates off the page.
        start_date = ''.join(response.xpath('//input[@name="date_start_type"]/@value')[0].split('-'))
        end_date = ''.join(response.xpath('//input[@name="date_end_type"]/@value')[0].split('-'))
        return start_date, end_date

    def download(self, start_date, end_date):
        download_url = ("http://quotes.money.163.com/service/chddata.html?code=" + self.code
                        + "&start=" + start_date + "&end=" + end_date
                        + "&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP")
        print(download_url)
        data = requests.get(download_url, headers=self.headers)
        with open('D://history//' + self.name + '.csv', 'wb') as f:
            for chunk in data.iter_content(chunk_size=10000):
                if chunk:
                    f.write(chunk)
        print('Downloading history data for stock', self.code)

    def run(self):
        try:
            html = self.parse_url()
            start_date, end_date = self.get_date(html)
            self.download(start_date, end_date)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    # Iterate over the stock lists stored in MongoDB.
    client = pymongo.MongoClient('192.168.1.108', connect=False)
    db = client['company']
    for table in db.collection_names():
        if table != 'system.indexes':
            print('table name is', table)
            collection = db[table]
            cursor = collection.find(no_cursor_timeout=True)
            for item in cursor:
                name = item['A股代码']
                totalCount = re.sub(r"\D", "", name)
                # NetEase prefixes Shanghai codes with '0' and Shenzhen codes with '1'.
                if table == 'shanghai':
                    temp_code = '0' + totalCount
                elif table == 'shenzhen':
                    temp_code = '1' + totalCount
                else:
                    continue
                print(temp_code)
                time.sleep(1)
                download = Download_HistoryStock(temp_code, item['A股代码'] + '_' + item['A股简称'], totalCount)
                download.run()
            cursor.close()
--------------------------------------------------------------------------------
/03_get_cookies/login/open_chome.py:
--------------------------------------------------------------------------------
# Simulated login: drives headless Chrome through Xueqiu's phone-number
# registration flow and saves the resulting cookies to the pool.
import time
import random
from login.chome_config import *
from login.save_cookie import *
from ym.ym import ym
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

options = webdriver.ChromeOptions()

options.add_argument('--headless')     # no visible window (required on headless Linux)
options.add_argument('--disable-gpu')  # works around a Chrome headless bug

# Chinese locale
options.add_argument('lang=zh_CN.UTF-8')
uaList = USER_AGENT

# Rotate the User-Agent.
agent = random.choice(uaList)
print(agent)
options.add_argument('user-agent="{}"'.format(agent))

options.add_argument('window-size=1920x3000')  # fixed viewport
options.add_argument('--disable-extensions')
# options.add_argument('--hide-scrollbars')                   # for special pages
# options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
options.add_argument('--no-sandbox')

# options.binary_location = r'C:\Users\wangk\AppData\Local\Google\Chrome\Application\chrome.exe'
options.binary_location = r'/opt/google/chrome/chrome'

browser = webdriver.Chrome(r'/usr/local/bin/chromedriver', chrome_options=options)
wait = WebDriverWait(browser, 10)

y = ym()


def open_chrome(retries=3):
    try:
        browser.get('https://xueqiu.com/')
        print(browser.get_cookies())
        time.sleep(2)
        browser.find_element_by_class_name('nav__login__regist').click()
        # Rent a phone number from the Yima platform.
        phone = y.get_mobile()
        print(phone)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(1) > input[type="text"]').send_keys(phone)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(1) > span:nth-child(3) > a').click()

        code = y.get_sms(phone)
        print(code)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(2) > input[type="text"]').send_keys(code)
        time.sleep(2)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__btn').click()
        time.sleep(1)
        print(browser.get_cookies())
        time.sleep(4)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(2) > div > div.modal__confirm__btns > a.button.button-lg.modal__confirm__submit').click()
        time.sleep(4)
        cookies = browser.get_cookies()
        print(cookies)
        cook = ""
        for cookie in browser.get_cookies():
            cook = cook + cookie['name'] + '=' + cookie['value'] + ';'
            print(cookie['name'], cookie['value'])
        print(cook)
        if save_cookie(cook, agent):
            print("Save to Mongodb Successfully")
            browser.close()
            return True
    except Exception:
        # Retry a bounded number of times instead of recursing forever.
        if retries > 0:
            return open_chrome(retries - 1)
        return False
--------------------------------------------------------------------------------
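open_chome.py creates `wait = WebDriverWait(browser, 10)` but then relies on fixed time.sleep() calls. A sketch of the explicit-wait equivalent for the first click, reusing `wait` from the file above (expected_conditions is selenium's standard helper):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Replaces the time.sleep(2) + blind click at the top of open_chrome():
login_link = wait.until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'nav__login__regist')))
login_link.click()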
/02_add_new_comments/add_comments/new_comments_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8
# Incrementally re-crawl comment data for recent posts.
import json
import time
import pymongo
from requests.exceptions import ConnectionError
import re
import MySQLdb
import requests
from add_comments.config import *
from urllib.parse import urlencode
from json.decoder import JSONDecodeError


proxy = None


class add_comments():
    def __init__(self, type, databases_group):
        self.type = type
        self.header = self.get_header()
        self.databases_group = databases_group

    # Pick the MySQL table to write to.
    def get_mysql_databases(self):
        # taolun: 1, jiaoyi: 2, xinwen: 3, gonggao: 4, yanbao: 5
        if self.type == 1:
            return TAOLUN_DB_MYSQL
        elif self.type == 2:
            return JIAOYI_DB_MYSQL
        elif self.type == 3:
            return XINWEN_DB_MYSQL
        elif self.type == 4:
            return GONGGAO_DB_MYSQL
        elif self.type == 5:
            return YANBAO_DB_MYSQL
        else:
            return None

    # Fetch a cookie from the cookie pool.
    def get_header(self):
        client_cookie = pymongo.MongoClient(MONGODB_URL, connect=False)
        db_cookie = client_cookie[COOKIE_DB]
        collection_cookie = db_cookie[COOKIE_TABLE]
        count = collection_cookie.count_documents({})
        if count < 5:
            print("cookie pool is running low; current size:", count)
        cursor = collection_cookie.find(no_cursor_timeout=True)
        for item in cursor:
            now_time = int(time.time())
            if int(item['time']) + 172800 > now_time:  # younger than 48 hours
                if item['count_used'] < 3:
                    headers = {
                        "Cookie": item['Cookie'],
                        "Host": "xueqiu.com",
                        "Upgrade-Insecure-Requests": "1",
                        "User-Agent": item['User-Agent']
                    }
                    # bump the usage counter
                    collection_cookie.update_one({'_id': item['_id']}, {'$set': {'count_used': item['count_used'] + 1}})
                    print(headers)
                    return headers
            else:
                print(item['_id'], "has expired!")
                # expired cookies are deleted by 01_get_all_data's spider
                # collection_cookie.delete_one({"_id": item['_id']})
        cursor.close()

    # Fetch one page of comments for a post.
    def comments_get_page_index(self, comment_id, count=10, page=1):
        data = {
            'id': comment_id,
            'count': count,
            'page': page,
            'reply': 'true',
            'type': 'status',
            'split': 'true',
            'asc': 'false'
        }
        params = urlencode(data)
        base = 'https://xueqiu.com/statuses/comments.json'
        url = base + '?' + params
        return self.get_html(url)

    def parse_page_index(self, html):
        try:
            data = json.loads(html)
            if data and 'comments' in data.keys():
                for item in data.get('comments'):
                    yield item
        except JSONDecodeError:
            pass

    # Extract the interesting fields from each comment.
    def comments_parse_page_detail(self, html):
        datas = []
        items = self.parse_page_index(html)
        for it in items:
            created = self.deal_data(str(it['created_at']))  # post time (avoid shadowing the time module)
            people = it['user']['screen_name']
            content = it['text']
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("\u0026", "").replace("nbsp;", "")
            except AttributeError:
                pass
            data = {
                '评论人': people,
                '评论时间': created,
                '评论内容': content
            }
            datas.append(data)
        return datas

    # Convert a UNIX millisecond timestamp such as 1515026126000
    # into a local time string.
    def deal_data(self, data):
        data = int(data[:-3])
        format = '%Y-%m-%d %H:%M:%S'
        value = time.localtime(data)
        dt = time.strftime(format, value)
        return dt

    def get_comments(self, comment_id):
        comments = []
        ret_json = self.comments_get_page_index(comment_id)
        if not ret_json:
            return comments
        data = json.loads(ret_json)
        count = data.get('count')
        if count:
            i = int(count / 10) + 1
            for x in range(1, i + 1):
                re_json = self.comments_get_page_index(comment_id, count=10, page=x)
                res = self.comments_parse_page_detail(re_json)
                for r in res:
                    comments.append(r)
        return comments

    def deal_url(self, url):
        if self.type == 3 or self.type == 4 or self.type == 5:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/S/.*?/(\d+)', re.S), str(url))
        else:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/\d+/(\d+)', re.S), str(url))
        return comment_id[0]

    def get_html(self, url, count=1):
        global proxy
        if count >= 5:
            print('Tried Too Many Counts')
            return None
        try:
            if proxy:
                proxies = {
                    'http': 'http://' + proxy
                }
                print(proxies)
                response = requests.get(url, allow_redirects=False, headers=self.header, proxies=proxies)
            else:
                response = requests.get(url, allow_redirects=False, headers=self.header)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                # Blocked: switch to a proxy.
                print('302')
                proxy = self.get_proxy()
                if proxy:
                    print('Using Proxy', proxy)
                    return self.get_html(url)
                else:
                    print('Get Proxy Failed')
                    return None
        except ConnectionError as e:
            print('Error Occurred', e.args)
            proxy = self.get_proxy()
            count += 1
            return self.get_html(url, count)

    def get_proxy(self):
        try:
            response = requests.get('http://127.0.0.1:5000/get')
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    def start_MySQL(self):
        conn = MySQLdb.connect(
            host='192.168.1.108',
            port=3306,
            user='Andlinks',
            passwd='Andlinks2017',
            db='xueqiu',
            charset='utf8')

        cur = conn.cursor()
        myConn_list = [conn, cur]
        return myConn_list

    def close_MySQL(self, cur, conn):
        cur.close()
        conn.close()

    def main(self):
        database_groups = self.databases_group

        myConn_list = self.start_MySQL()
        cur = myConn_list[1]
        conn = myConn_list[0]

        # Parameterized so quotes in comment text cannot break the query.
        insert_sql = ("INSERT INTO " + self.get_mysql_databases() +
                      "(url,company_name,comment_people,comment_time,comment_content) "
                      "VALUES (%s, %s, %s, %s, %s)")
        for db_name in database_groups:
            client = pymongo.MongoClient(MONGODB_URL, connect=False)
            db = client[db_name]
            for table in db.collection_names():
                if table != 'system.indexes':
                    collection = db[table]
                    cursor = collection.find(no_cursor_timeout=True)
                    for item in cursor:
                        try:
                            if item['count_key'] < 8:  # only refresh posts younger than 7 days
                                url = item['url']
                                name = item['股票名称']
                                _id = item['_id']
                                print("time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), "db:", db_name, " table:", table, "url:", url)
                                comment_id = self.deal_url(url)
                                comments = self.get_comments(comment_id)
                                # Refresh the record in MongoDB.
                                collection.update_one({"_id": _id}, {"$set": {"评论": comments}})
                                collection.update_one({"_id": _id}, {"$set": {"评论量": str(len(comments))}})
                                # Mirror the new comments into MySQL.
                                for temple in comments:
                                    args = (url, name, temple['评论人'], temple['评论时间'], temple['评论内容'])
                                    print(args)
                                    try:
                                        cur.execute(insert_sql, args)
                                        conn.commit()
                                    except Exception as e:
                                        print(e)
                        except KeyError as e:
                            print(e)
                    cursor.close()
        self.close_MySQL(cur, conn)
--------------------------------------------------------------------------------
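get_comments() above pulls comments ten per page and derives the page count as int(count / 10) + 1, which requests one empty extra page whenever count is an exact multiple of ten. The ceiling-division equivalent, for reference:

import math

def pages_needed(total_comments, per_page=10):
    # math.ceil(27 / 10) == 3; math.ceil(30 / 10) == 3 (no empty extra page)
    return math.ceil(total_comments / per_page)

assert pages_needed(27) == 3
assert pages_needed(30) == 3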
/01_get_all_data/xueqiu_spider/copy_data_to_history.py:
--------------------------------------------------------------------------------
from xueqiu_spider.config import *

import pymongo


# Move records that have aged out of the 30-day window
# (count_key > MIDDLE_TIME) into the permanent history databases,
# mirroring each record's comments into the matching per-type
# collection inside HISTORY_COMMENTS.
class copy_history_db(object):
    # Fields common to every content type.
    BASE_FIELDS = ['url', 'A股代码', '股票名称', '时间', '发布者', '标题',
                   '转发量', '评论量', '点赞量']

    def _copy(self, old_db, new_db, extra_fields, comments_table, tag):
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        db_new = client[new_db]
        db = client[old_db]
        for table in db.collection_names():
            if table == 'system.indexes':
                continue
            print(tag + '_copy, table name is', table)
            collection = db[table]
            cursor = collection.find(no_cursor_timeout=True)
            for item in cursor:
                try:
                    if item['count_key'] > MIDDLE_TIME:
                        data = {field: item[field]
                                for field in self.BASE_FIELDS + extra_fields}
                        try:
                            data['评论'] = item['评论']
                        except KeyError:
                            data['评论'] = []  # record has no crawled comments yet
                        print(data)
                        db_new[table].insert_one(data)
                        db_comment = client[HISTORY_COMMENTS]
                        for da in data['评论']:
                            com = {
                                'url': item['url'],
                                '股票名称': item['股票名称'],
                                '评论人': da['评论人'],
                                '评论时间': da['评论时间'],
                                '评论内容': da['评论内容']
                            }
                            db_comment[comments_table].insert_one(com)
                except KeyError:
                    print('count_key/field error! ' + str(item['_id']))
            cursor.close()

    # Copy discussion (taolun) records.
    def copy_taolun(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'taolun_comments', 'taolun')

    # Copy trade (jiaoyi) records.
    def copy_jiaoyi(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'jiaoyi_comments', 'jiaoyi')

    # Copy news (xinwen) records.
    def copy_xinwen(self, old_db, new_db):
        self._copy(old_db, new_db, ['摘要', '外部链接'], 'xinwen_comments', 'xinwen')

    # Copy announcement (gonggao) records.
    def copy_gonggao(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文', 'PDF下载链接'], 'gonggao_comments', 'gonggao')

    # Copy research-report (yanbao) records.
    def copy_yanbao(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'yanbao_comments', 'yanbao')

    def copy_history_main(self):
        self.copy_taolun(MIDDLE_TAOLUN_DB, HISTORY_TAOLUN_DB)
        self.copy_jiaoyi(MIDDLE_JIAOYI_DB, HISTORY_JIAOYI_DB)
        self.copy_xinwen(MIDDLE_XINWEN_DB, HISTORY_XINWEN_DB)
        self.copy_gonggao(MIDDLE_GONGGAO_DB, HISTORY_GONGGAO_DB)
        self.copy_yanbao(MIDDLE_YANBAO_DB, HISTORY_YANBAO_DB)
--------------------------------------------------------------------------------
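The copy above is driven by each document's count_key, which content_spider.py sets to 1 on insert and which must be incremented daily for the > 30 test to ever fire. The incrementing step is not shown in this section; a sketch of what it presumably looks like (an assumption, with the field and database names taken from the code above):

import pymongo

client = pymongo.MongoClient('192.168.1.108', connect=False)
db = client['day30_taolun']  # one of the 30-day staging databases
for table in db.list_collection_names():
    # Age every document by one day.
    db[table].update_many({}, {'$inc': {'count_key': 1}})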
/01_get_all_data/xueqiu_spider/content_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8
# Xueqiu post spider.
import re
import json
import time
import datetime
import pymongo
import random
import requests
from xueqiu_spider.config import *
from urllib.parse import urlencode
from json.decoder import JSONDecodeError
from requests.exceptions import ConnectionError

proxy = None


class DealHtml(object):
    def __init__(self, type, divide_db, divide_collection, add_time):
        self.type = type
        self.header = self.get_header()
        self.databases = self.get_databases()          # target database
        self.past_time = int(time.time()) - add_time   # crawl cut-off timestamp

        # Database the crawled posts are written to.
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        self.db = client[self.databases]

        # Collection listing the listed companies.
        client_sjs = pymongo.MongoClient(MONGO_URL, connect=False)
        db_sjs = client_sjs[divide_db]
        self.collection_sjs = db_sjs[divide_collection]
        print('init end')

    # Pick the target database.
    def get_databases(self):
        # taolun: 1, jiaoyi: 2, xinwen: 3, gonggao: 4, yanbao: 5
        if self.type == 1:
            return TODAY_TAOLUN_DB
        elif self.type == 2:
            return TODAY_JIAOYI_DB
        elif self.type == 3:
            return TODAY_XINWEN_DB
        elif self.type == 4:
            return TODAY_GONGGAO_DB
        elif self.type == 5:
            return TODAY_YANBAO_DB
        else:
            return None

    # Fetch a cookie from the cookie pool.
    def get_header(self):
        client_cookie = pymongo.MongoClient(MONGO_URL, connect=False)
        db_cookie = client_cookie['cookiesPool']
        collection_cookie = db_cookie['cookies']
        count = collection_cookie.count_documents({})
        if count < 5:
            print("cookie pool is running low; current size:", count)
        cursor = collection_cookie.find(no_cursor_timeout=True)
        for item in cursor:
            now_time = int(time.time())
            if int(item['time']) + 172800 > now_time:  # younger than 48 hours
                if item['count_used'] < 3:
                    headers = {
                        "Cookie": item['Cookie'],
                        "Host": "xueqiu.com",
                        "Upgrade-Insecure-Requests": "1",
                        "User-Agent": item['User-Agent']
                    }
                    # bump the usage counter
                    collection_cookie.update_one({'_id': item['_id']}, {'$set': {'count_used': item['count_used'] + 1}})
                    print(headers)
                    return headers
            else:
                print(item['_id'], "has expired!")
                # drop the stale cookie
                collection_cookie.delete_one({"_id": item['_id']})
        cursor.close()

    # Fetch a proxy from the proxy pool.
    def get_proxy(self):
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    # Fetch html.
    def get_html(self, url, count=1):
        global proxy
        if count >= 5:
            print('Tried Too Many Counts')
            return None
        try:
            if proxy:
                proxies = {
                    'http': 'http://' + proxy
                }
                response = requests.get(url, allow_redirects=False, headers=self.header, proxies=proxies)
            else:
                response = requests.get(url, allow_redirects=False, headers=self.header)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                # Blocked: switch to a proxy.
                proxy = self.get_proxy()
                if proxy:
                    print('Using Proxy', proxy)
                    return self.get_html(url)
                else:
                    print('Get Proxy Failed')
                    return None
        except ConnectionError as e:
            print('Error Occurred', e.args)
            proxy = self.get_proxy()
            count += 1
            return self.get_html(url, count)

    # Build the listing URL for each content type.
    def get_page_url(self, i, ID):
        if self.type == 1:
            data = {
                'count': 10,
                'comment': 0,
                'symbol': ID,
                'hl': 0,
                'source': 'user',
                'sort': 'time',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/search.json'
            url = base + '?' + params
            return url
        elif self.type == 2:
            data = {
                'count': 10,
                'comment': 0,
                'symbol': ID,
                'hl': 0,
                'source': 'trans',
                'sort': 'time',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/search.json'
            url = base + '?' + params
            return url
        elif self.type == 3:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '自选股新闻',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        elif self.type == 4:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '公告',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        elif self.type == 5:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '研报',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        else:
            return None

    # Fetch one listing page.
    def get_page_index(self, i, ID):
        url = self.get_page_url(i, ID)
        if url:
            try:
                response = requests.get(url, headers=self.header)
                if response.status_code == 200:
                    return response.text
                return None
            except ConnectionError:
                print('Error occurred')
                return None

    # Fetch one post page.
    def get_page_detail(self, url):
        return self.get_html(url)

    # Yield post URLs from the listing JSON.
    def parse_page_index(self, text):
        try:
            data = json.loads(text)
            if data and 'list' in data.keys():
                for item in data.get('list'):
                    yield item.get('target')
        except JSONDecodeError:
            pass

    # Extract the post fields with regular expressions.
    def parse_page_detail(self, html, url, ID, Name):
        html = self.get_html_json(html)

        try:
            text_pattern = re.compile(r"id.*?user_id.*?\"title\":\"(.*?)\",\"created_at\":(\d+),\"retweet_count\":(\d+),\"reply_count\":(\d+),\"fav_count\":(\d+),.*?user.*?\"screen_name\":\"(.*?)\",.*?\"like_count\":(\d+),.*?\"is_answer.*?\"text\":\"(.*?)\",\"source\".*?}", re.S)
            text = re.findall(text_pattern, str(html))
            title = text[0][0] if text[0][0] else '无'
            created = self.deal_data(text[0][1])  # post time (avoid shadowing the time module)
            retweet_count = text[0][2]
            reply_count = text[0][3]
            screen_name = text[0][5]
            like_count = text[0][6]
            content = text[0][7]
            if self.type == 3 or self.type == 4:
                try:
                    out_url_pattern = re.compile(r'href=\\\\"(.*?)\\\\"\stitle', re.S)
                    out_url = re.findall(out_url_pattern, content)[0]
                except (AttributeError, IndexError):
                    out_url = None
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("&nbsp;", "").replace("\u0026", "")
            except AttributeError:
                pass
        except IndexError:
            print('error')
            return None
        if self.type == 3:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '摘要': content,
                '发布者': screen_name,
                '外部链接': out_url,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }
        elif self.type == 4:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '正文': content,
                '发布者': screen_name,
                'PDF下载链接': out_url,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }
        else:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '正文': content,
                '发布者': screen_name,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }

    # Pull the embedded status JSON out of the post page.
    def get_html_json(self, html):
        text_pattern = re.compile(r'window.SNOWMAN_STATUS\s=(.*?);\swindow.SNOWMAN_TARGET', re.S)
        # parse_page_detail runs its regexes over str() of this match list.
        return re.findall(text_pattern, html)

    # Convert a UNIX millisecond timestamp such as 1515026126000
    # into a local time string.
    def deal_data(self, data):
        data = int(data[:-3])
        format = '%Y-%m-%d %H:%M:%S'
        value = time.localtime(data)
        dt = time.strftime(format, value)
        return dt

    # Check whether the response carries an error code.
    def error_code(self, html):
        try:
            data = json.loads(html)
            if data and 'error_code' in data.keys():
                return data['error_code']
        except JSONDecodeError:
            pass

    # Save one post to MongoDB.
    def save_to_mongodb(self, result, table_name):
        if self.db[table_name].insert_one(result):
            print('Successfully Saved to Mongodb', result['url'])
            return True
        return False

    # Read the page count from the listing JSON.
    def get_maxPage(self, text):
        try:
            data = json.loads(text)
            if data and 'maxPage' in data.keys():
                return data.get('maxPage')
        except JSONDecodeError:
            pass

    # Crawl every post on one listing page.
    def get_connent(self, ID, Name, i):
        start_url = XUEQIU_URL
        get_json = self.get_page_index(i, ID)
        if get_json:
            if self.error_code(get_json) == "22621":  # rate-limited: rotate the cookie
                self.header = self.get_header()
            urls = self.parse_page_index(get_json)
            if urls:
                for url in urls:
                    url = start_url + url
                    html = self.get_page_detail(url)
                    if html:
                        to_mongodb = self.parse_page_detail(html, url, ID, Name)
                        if to_mongodb:
                            dt = to_mongodb['时间']
                            # back to a struct_time ...
                            timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
                            # ... and to a UNIX timestamp
                            timestamp = time.mktime(timeArray)
                            if timestamp < self.past_time:
                                return False  # reached posts older than the cut-off
                            else:
                                table_name = ID + "_" + Name
                                self.save_to_mongodb(to_mongodb, str(table_name))
        return True

    """
    Comment crawling
    """
    # Fetch one page of comments.
    def comments_get_page_index(self, comment_id, count=10, page=1):
        data = {
            'id': comment_id,
            'count': count,
            'page': page,
            'reply': 'true',
            'type': 'status',
            'split': 'true',
            'asc': 'false'
        }
        params = urlencode(data)
        base = 'https://xueqiu.com/statuses/comments.json'
        url = base + '?' + params
        return self.get_html(url)

    def parse_page_index_comment(self, html):
        try:
            data = json.loads(html)
            if data and 'comments' in data.keys():
                for item in data.get('comments'):
                    yield item
        except JSONDecodeError:
            pass

    # Extract the interesting fields from each comment.
    def comments_parse_page_detail(self, html):
        datas = []
        items = self.parse_page_index_comment(html)
        for it in items:
            created = self.deal_data(str(it['created_at']))  # post time (avoid shadowing the time module)
            people = it['user']['screen_name']
            content = it['text']
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("\u0026", "").replace("nbsp;", "").replace("\\u3000", "")
            except AttributeError:
                pass
            data = {
                '评论人': people,
                '评论时间': created,
                '评论内容': content
            }
            datas.append(data)
        return datas

    def get_comments(self, comment_id):
        comments = []
        ret_json = self.comments_get_page_index(comment_id)
        if not ret_json:
            return comments
        data = json.loads(ret_json)
        count = data.get('count')
        if count:
            i = int(count / 10) + 1
            for x in range(1, i + 1):
                re_json = self.comments_get_page_index(comment_id, count=10, page=x)
                res = self.comments_parse_page_detail(re_json)
                for r in res:
                    comments.append(r)
        return comments

    def deal_url(self, url):
        if self.type == 3 or self.type == 4 or self.type == 5:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/S/.*?/(\d+)', re.S), str(url))
        else:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/\d+/(\d+)', re.S), str(url))
        return comment_id[0]

    def comments_from(self, db_name):
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        db = client[db_name]
        for table in db.collection_names():
            if table != 'system.indexes':
                collection = db[table]
                cursor = collection.find(no_cursor_timeout=True)
                for item in cursor:
                    try:
                        item['评论']  # comments already crawled: skip
                    except KeyError:
                        try:
                            url = item['url']
                            _id = item['_id']
                            print(url)
                            print("current db:", db_name, "\n table:", table)
                            comment_id = self.deal_url(url)
                            comments = self.get_comments(comment_id)
                            collection.update_one({"_id": _id}, {"$set": {"评论": comments}})
                        except KeyError:
                            print("url error")
                cursor.close()
    """
    End of comment crawling
    """

    def main(self):
        print('total companies:', self.collection_sjs.count_documents({}))
        cursor = self.collection_sjs.find(no_cursor_timeout=True)
        for item in cursor:
            try:
                ID = item['A股代码']    # A-share code
                Name = item['A股简称']  # short company name
                get_json = self.get_page_index(1, ID)
                if get_json:
                    max_page = self.get_maxPage(get_json)
                    if not max_page:
                        continue
                    for i in range(1, max_page + 1):
                        print("time:", datetime.datetime.now(), 'table:', ID + '_' + Name, 'target db:', self.databases, 'pages:', max_page, 'page:', i)
                        if self.get_connent(ID, Name, i):
                            sleeptime = random.randint(1, 3)
                            time.sleep(sleeptime)
                        else:
                            break  # reached posts older than the cut-off
            except Exception as e:
                print(e, "KeyError!!!")
        cursor.close()

        # Posts done; now crawl their comments (retried once on failure).
        try:
            self.comments_from(self.databases)
        except Exception:
            self.comments_from(self.databases)
--------------------------------------------------------------------------------
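deal_data() above drops the last three digits of Xueqiu's millisecond timestamps before converting with time.localtime. The same conversion via datetime, with a worked value:

from datetime import datetime

ms = 1515026126000                      # example value from the comment in the source
dt = datetime.fromtimestamp(ms / 1000)  # 2018-01-04 08:35:26 when run in China Standard Time
print(dt.strftime('%Y-%m-%d %H:%M:%S'))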
/01_get_all_data/xueqiu_spider/copy_data_to_30_day.py:
--------------------------------------------------------------------------------
1 | from xueqiu_spider.config import *
2 | from multiprocessing import Pool
3 | import time
4 | import MySQLdb
5 | import pymongo
6 | 
7 | # Back up the current day's data into the 30-day databases, mirror every
8 | # comment into MongoDB and MySQL, then record the daily comment counts.
9 | 
10 | # Field names are the Chinese keys used throughout the MongoDB schema.
11 | # The base list is shared by all five content types; news (xinwen) and
12 | # announcements (gonggao) carry extra fields.
13 | BASE_FIELDS = ['url', 'A股代码', '股票名称', '时间', '发布者', '标题',
14 |                '正文', '转发量', '评论量', '点赞量']
15 | FIELDS = {
16 |     'taolun': BASE_FIELDS,
17 |     'jiaoyi': BASE_FIELDS,
18 |     'xinwen': ['url', 'A股代码', '股票名称', '时间', '发布者', '标题', '摘要',
19 |                '转发量', '评论量', '点赞量', '外部链接'],
20 |     'gonggao': BASE_FIELDS + ['PDF下载链接'],
21 |     'yanbao': BASE_FIELDS,
22 | }
23 | 
24 | 
25 | class copy_middle_db(object):
26 | 
27 |     def start_MySQL(self):
28 |         # These credentials really belong in config.py; kept inline as in the original.
29 |         conn = MySQLdb.connect(
30 |             host='192.168.1.108',
31 |             port=3306,
32 |             user='Andlinks',
33 |             passwd='Andlinks2017',
34 |             db='xueqiu',
35 |             charset='utf8')
36 |         cur = conn.cursor()
37 |         return conn, cur
38 | 
39 |     def close_MySQL(self, cur, conn):
40 |         cur.close()
41 |         conn.close()
42 | 
43 |     # Copy one day's documents into the 30-day database, mirror every comment
44 |     # into MongoDB and MySQL, and return the number of comments handled.
45 |     # The five near-identical copy_taolun/.../copy_yanbao functions of the
46 |     # old version collapse into this single parametrized method.
47 |     def copy_db(self, kind, old_db, new_db):
48 |         client = pymongo.MongoClient(MONGO_URL, connect=False)
49 |         db_new = client[new_db]
50 |         db = client[old_db]
51 |         conn, cur = self.start_MySQL()
52 |         num = 0
53 |         # Placeholders instead of string formatting: the old '{}'-formatted
54 |         # INSERTs broke on quotes inside comments and invited SQL injection.
55 |         sql = ("INSERT INTO all_{}_comments_2"
56 |                "(url,company_name,comment_people,comment_time,comment_content) "
57 |                "VALUES (%s, %s, %s, %s, %s)").format(kind)
58 |         for table in db.list_collection_names():
59 |             if table == 'system.indexes':
60 |                 continue
61 |             print('{}_copy, table name is'.format(kind), table)
62 |             collection = db[table]
63 |             cursor = collection.find(no_cursor_timeout=True)
64 |             for item in cursor:
65 |                 data = {key: item.get(key) for key in FIELDS[kind]}
66 |                 data['评论'] = item.get('评论', [])  # default when the comment crawl missed this post
67 |                 db_new[table].insert_one(data)
68 |                 for da in data['评论']:
69 |                     com = {
70 |                         'url': item['url'],
71 |                         '股票名称': item['股票名称'],
72 |                         '评论人': da['评论人'],
73 |                         '评论时间': da['评论时间'],
74 |                         '评论内容': da['评论内容']
75 |                     }
76 |                     num += 1
77 |                     client[MIDDLE_COMMENTS]['{}_comments'.format(kind)].insert_one(com)
78 |                     try:
79 |                         cur.execute(sql, (com['url'], com['股票名称'], com['评论人'],
80 |                                           com['评论时间'], com['评论内容']))
81 |                         conn.commit()
82 |                     except Exception as e:
83 |                         print(e)
84 |             cursor.close()
85 |             collection.delete_many({})  # clear the day's table once it has been copied
86 |         self.close_MySQL(cur, conn)
87 |         return num
88 | 
89 |     # Thin per-type wrappers kept for callers of the old interface
90 |     def copy_taolun(self, old_db, new_db):
91 |         return self.copy_db('taolun', old_db, new_db)
92 | 
93 |     def copy_jiaoyi(self, old_db, new_db):
94 |         return self.copy_db('jiaoyi', old_db, new_db)
95 | 
96 |     def copy_xinwen(self, old_db, new_db):
97 |         return self.copy_db('xinwen', old_db, new_db)
98 | 
99 |     def copy_gonggao(self, old_db, new_db):
100 |         return self.copy_db('gonggao', old_db, new_db)
101 | 
102 |     def copy_yanbao(self, old_db, new_db):
103 |         return self.copy_db('yanbao', old_db, new_db)
104 | 
105 |     # Walk the five 30-day databases and bump the copy marker by 1
106 |     def deal_db(self):
107 |         for DB in (MIDDLE_TAOLUN_DB, MIDDLE_JIAOYI_DB, MIDDLE_XINWEN_DB,
108 |                    MIDDLE_GONGGAO_DB, MIDDLE_YANBAO_DB):
109 |             self.mark(DB)
110 | 
111 |     def mark(self, DB):
112 |         client = pymongo.MongoClient(MONGO_URL, connect=False)
113 |         db = client[DB]
114 |         for table in db.list_collection_names():
115 |             if table != 'system.indexes':
116 |                 print('database is', DB, 'table name is', table)
117 |                 # $inc creates count_key as 1 when it is missing, so the old
118 |                 # per-document try/except KeyError loop is unnecessary.
119 |                 db[table].update_many({}, {"$inc": {"count_key": 1}})
120 | 
121 |     def copy_middle_main(self):
122 |         pool = Pool(5)
123 |         # Counters bumped inside worker processes never reach the parent, so
124 |         # each task hands its comment count back through its AsyncResult.
125 |         jobs = []
126 |         for kind, old_db, new_db in [
127 |                 ('taolun', TODAY_TAOLUN_DB, MIDDLE_TAOLUN_DB),
128 |                 ('jiaoyi', TODAY_JIAOYI_DB, MIDDLE_JIAOYI_DB),
129 |                 ('xinwen', TODAY_XINWEN_DB, MIDDLE_XINWEN_DB),
130 |                 ('gonggao', TODAY_GONGGAO_DB, MIDDLE_GONGGAO_DB),
131 |                 ('yanbao', TODAY_YANBAO_DB, MIDDLE_YANBAO_DB)]:
132 |             jobs.append(pool.apply_async(self.copy_db, (kind, old_db, new_db)))
133 |             time.sleep(1)  # stagger the workers slightly
134 |         pool.close()  # no more tasks
135 |         pool.join()   # the main process waits here until every child finishes
136 |         counts = [job.get() for job in jobs]
137 |         self.deal_db()  # bump the copy marker
138 |         # Record today's comment counts in MySQL's everyday_comments_number table
139 |         conn, cur = self.start_MySQL()
140 |         sql = ("INSERT INTO everyday_comments_number"
141 |                "(pinglun_date,taolun_num,jiaoyi_num,xinwen_num,gonggao_num,yanbao_num) "
142 |                "VALUES (%s, %s, %s, %s, %s, %s)")
143 |         try:
144 |             cur.execute(sql, [time.strftime('%Y-%m-%d %H:%M:%S')] + counts)
145 |             conn.commit()
146 |         except Exception as e:
147 |             print(e)
148 |         self.close_MySQL(cur, conn)
--------------------------------------------------------------------------------
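The repo does not show how copy_middle_main is scheduled, so the invocation below is an assumption, written in the style of the other run.py entry points:

from xueqiu_spider.copy_data_to_30_day import copy_middle_db

if __name__ == '__main__':
    # Assumed entry point: back up today's data, then log the comment counts
    copy_middle_db().copy_middle_main()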