├── 01_get_all_data
│   ├── __init__.py
│   ├── run.py
│   └── xueqiu_spider
│       ├── config.py
│       ├── content_spider.py
│       ├── copy_data_to_30_day.py
│       └── copy_data_to_history.py
├── 02_add_new_comments
│   ├── __init__.py
│   ├── add_comments
│   │   ├── config.py
│   │   └── new_comments_spider.py
│   └── run.py
├── 03_get_cookies
│   ├── __init__.py
│   ├── login
│   │   ├── chome_config.py
│   │   ├── open_chome.py
│   │   └── save_cookie.py
│   ├── run.py
│   └── ym
│       ├── detail_info.py
│       ├── ym.py
│       └── ym_config.py
├── 04_get_history_from_wangyi
│   ├── __init__.py
│   ├── csv_to_mongodb.py
│   └── get_history_wangyi_csv.py
├── 05_MongoDB_to_MySQL
│   ├── __init__.py
│   ├── mongo_to_mysql_gonggao.py
│   ├── mongo_to_mysql_jiaoyi.py
│   ├── mongo_to_mysql_taolun.py
│   ├── mongo_to_mysql_xinwen.py
│   └── mongo_to_mysql_yanbao.py
├── xueqiu.png
└── README.md

/01_get_all_data/__init__.py:
--------------------------------------------------------------------------------
"""
(Core) Crawl the previous day's data.
"""
--------------------------------------------------------------------------------
/03_get_cookies/__init__.py:
--------------------------------------------------------------------------------
"""
Log in automatically and harvest cookies.
"""
--------------------------------------------------------------------------------
/02_add_new_comments/__init__.py:
--------------------------------------------------------------------------------
"""
(Core) Periodically refresh user-comment data.
"""
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/__init__.py:
--------------------------------------------------------------------------------
"""
Import the MongoDB data into MySQL.
"""
--------------------------------------------------------------------------------
/xueqiu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jasonhower/xueqiu/HEAD/xueqiu.png
--------------------------------------------------------------------------------
/04_get_history_from_wangyi/__init__.py:
--------------------------------------------------------------------------------
"""
Periodically crawl historical stock trading data from NetEase Finance.
"""
--------------------------------------------------------------------------------
/03_get_cookies/ym/ym_config.py:
--------------------------------------------------------------------------------
# API token for the Yima SMS-code platform (left empty here)
TOKEN = ''

# Project (item) id on the platform
ITEMID = 1232
--------------------------------------------------------------------------------
/03_get_cookies/run.py:
--------------------------------------------------------------------------------
from login.open_chome import *

if __name__ == '__main__':
    open_chrome()
--------------------------------------------------------------------------------
/03_get_cookies/ym/detail_info.py:
--------------------------------------------------------------------------------
class detail_info():
    def __init__(self):
        pass

    # TODO: fetch the account balance
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This project crawls user-comment data from Xueqiu (xueqiu.com) for stock
discussions, trades, news, announcements, and research reports.

* 01_get_all_data: (core) crawls the previous day's data
* 02_add_new_comments: (core) periodically refreshes user-comment data
* 03_get_cookies: simulates login to harvest cookies
* 04_get_history_from_wangyi: periodically crawls historical stock trading data from NetEase Finance
* 05_MongoDB_to_MySQL: imports the MongoDB data into MySQL

![Xueqiu crawling workflow](https://github.com/wang1051992187/xueqiu/blob/master/xueqiu.png)
--------------------------------------------------------------------------------
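A rough sketch of how the stages could be chained into one daily job (a hypothetical driver, not part of the repo; in practice each stage's run.py can also be scheduled separately, e.g. with cron):

# daily_job.py -- hypothetical; directory names taken from the tree above
import subprocess
import time

STAGES = [
    '03_get_cookies',       # refresh the cookie pool first
    '01_get_all_data',      # crawl yesterday's posts and their comments
    '02_add_new_comments',  # top up comments on posts from the last 7 days
]

while True:
    for stage in STAGES:
        # each run.py imports sibling packages, so run it from its own directory
        subprocess.run(['python', 'run.py'], cwd=stage, check=False)
    time.sleep(24 * 60 * 60)  # once a day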
/03_get_cookies/login/save_cookie.py:
--------------------------------------------------------------------------------
# Store a harvested cookie in the cookie pool.
import pymongo
import time


def save_cookie(cookie, agent):
    client = pymongo.MongoClient('192.168.1.108', connect=False)
    db = client['cookiesPool']
    collection = db['cookies']
    data = {
        "Host": "xueqiu.com",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": cookie,
        "User-Agent": agent,
        "time": int(time.time()),  # harvest timestamp
        "count_used": 0            # how many crawlers have picked this cookie
    }
    return collection.insert_one(data)
--------------------------------------------------------------------------------
/02_add_new_comments/add_comments/config.py:
--------------------------------------------------------------------------------
MONGODB_URL = '192.168.1.108'

COOKIE_DB = 'cookiesPool'

COOKIE_TABLE = 'cookies'

POST_DAY = ''

TAOLUN_DB_MYSQL = 'all_taolun_comments_2'
JIAOYI_DB_MYSQL = 'all_jiaoyi_comments_2'
XINWEN_DB_MYSQL = 'all_xinwen_comments_2'
GONGGAO_DB_MYSQL = 'all_gonggao_comments_2'
YANBAO_DB_MYSQL = 'all_yanbao_comments_2'


MIDDLE_TAOLUN_DB = 'day30_taolun'
MIDDLE_JIAOYI_DB = 'day30_jiaoyi'
MIDDLE_XINWEN_DB = 'day30_xinwen'
MIDDLE_GONGGAO_DB = 'day30_gonggao'
MIDDLE_YANBAO_DB = 'day30_yanbao'
--------------------------------------------------------------------------------
/02_add_new_comments/run.py:
--------------------------------------------------------------------------------
from add_comments.config import *
from add_comments.new_comments_spider import *
from multiprocessing import Pool
import time


def run_main(type, databases):
    com = add_comments(type, databases)
    com.main()


if __name__ == '__main__':
    # A pool of five worker processes, one per content type.
    pool = Pool(5)

    # add_comments.main() iterates over a list of database names, so each
    # call must receive a list (a bare string would be iterated char by char).
    pool.apply_async(run_main, (1, [MIDDLE_TAOLUN_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (2, [MIDDLE_JIAOYI_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (3, [MIDDLE_XINWEN_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (4, [MIDDLE_GONGGAO_DB]))
    time.sleep(2)
    pool.apply_async(run_main, (5, [MIDDLE_YANBAO_DB]))

    pool.close()  # stop accepting tasks
    pool.join()   # block until every child process has finished
--------------------------------------------------------------------------------
/04_get_history_from_wangyi/csv_to_mongodb.py:
--------------------------------------------------------------------------------
import csv
import os.path

import pymongo


# Walk the download directory and import every CSV file into MongoDB,
# one collection per file (named after the file minus ".csv").
def eachFile(filepath):
    pathDir = os.listdir(filepath)
    print(pathDir)
    for allDir in pathDir:
        child = os.path.join(filepath, allDir)
        if os.path.isfile(child):
            # NetEase's CSV downloads are GBK-encoded ("ANSI" is not a
            # valid Python codec name).
            with open(child, 'rt', encoding="gbk") as csvfile:
                reader = csv.DictReader(csvfile)
                column = [row for row in reader]
                print(column)
                client = pymongo.MongoClient('192.168.1.108', connect=False)
                db = client['all_history_transaction_data']
                db[allDir[:-4]].insert_many(column)


if __name__ == '__main__':
    filenames = r'D:\history'  # root dir holding the downloaded CSV files
    eachFile(filenames)
--------------------------------------------------------------------------------
/01_get_all_data/run.py:
--------------------------------------------------------------------------------
from xueqiu_spider.content_spider import *
from xueqiu_spider.copy_data_to_30_day import *
from xueqiu_spider.copy_data_to_history import *
from multiprocessing import Pool
import time


def run_main(type, divide_db, divide_collection, add_time):
    dealhtml = DealHtml(type, divide_db, divide_collection, add_time)
    dealhtml.main()


if __name__ == '__main__':
    # A pool of twenty worker processes.
    pool = Pool(20)

    # Content types run from 1 (taolun) to 5 (yanbao); there is no type 0.
    for i in range(1, 6):
        pool.apply_async(run_main, (i, COME_DB, SHANGHAI_A, PAST_DAY))
        time.sleep(1)

    for i in range(1, 6):
        pool.apply_async(run_main, (i, COME_DB, SHENZHEN_A, PAST_DAY))
        time.sleep(1)

    pool.close()  # stop accepting tasks
    pool.join()   # block until every child process has finished

    # 24-hour databases --> 30-day databases
    cp = copy_middle_db()
    cp.copy_middle_main()

    # 30-day databases --> history databases
    cp = copy_history_db()
    cp.copy_history_main()
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_taolun.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_taolun_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    # Parameterized INSERT so the driver escapes quotes and other special
    # characters that routinely appear in comment text.
    sql = ("INSERT INTO all_taolun_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/03_get_cookies/login/chome_config.py:
--------------------------------------------------------------------------------
# Desktop User-Agent strings the login browser rotates through.
USER_AGENT = ['Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBRO',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.4737.400 QQBrowser/10.0.654.400',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36 Maxthon/5.1.3.2000',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
              'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko']
--------------------------------------------------------------------------------
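save_cookie.py above writes each cookie with a timestamp and a usage counter, and the spiders in this repo treat an HTTP 302 from xueqiu.com as a blocked or logged-out request. A minimal liveness check along those lines (a sketch, not part of the repo; cookie_doc is one document from the cookiesPool collection):

import requests

def cookie_alive(cookie_doc):
    headers = {
        "Cookie": cookie_doc["Cookie"],
        "User-Agent": cookie_doc["User-Agent"],
    }
    # Only a plain 200 with redirects disabled counts as "alive".
    r = requests.get("https://xueqiu.com", headers=headers,
                     allow_redirects=False, timeout=10)
    return r.status_code == 200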
/05_MongoDB_to_MySQL/mongo_to_mysql_jiaoyi.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_jiaoyi_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_jiaoyi_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'jiaoyi'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_xinwen.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_xinwen_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_xinwen_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'xinwen'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_yanbao.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_yanbao_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_yanbao_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'yanbao'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
/05_MongoDB_to_MySQL/mongo_to_mysql_gonggao.py:
--------------------------------------------------------------------------------
# coding=utf-8
import pymongo
import MySQLdb


def start_MySQL():
    conn = MySQLdb.connect(
        host='192.168.1.108',
        port=3306,
        user='',
        passwd='',
        db='xueqiu',
        charset='utf8')

    cur = conn.cursor()
    myConn_list = [conn, cur]
    return myConn_list


def close_MySQL(cur, conn):
    cur.close()
    conn.close()


if __name__ == "__main__":
    client = pymongo.MongoClient('192.168.1.108', 27017)
    TempleSpider = client['all_comments']
    temple_comment_collect = TempleSpider['all_gonggao_comments']

    myConn_list = start_MySQL()
    cur = myConn_list[1]
    conn = myConn_list[0]

    sql = ("INSERT INTO all_gonggao_comments_2"
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for temple in temple_comment_collect.find():
        args = (temple['url'], temple['股票名称'], temple['评论人'],
                temple['评论时间'], temple['评论内容'])
        print(args)
        try:
            cur.execute(sql, args)
            conn.commit()
        except Exception as e:
            print(e)
            # Log the failed row to the error table.
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (temple['url'], 'gonggao'))
            conn.commit()
    close_MySQL(cur, conn)
--------------------------------------------------------------------------------
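The five scripts above differ only in source collection, target table, and error tag. A hedged sketch of a shared helper that could replace them (hypothetical; host, table, and column names taken from the scripts above):

import pymongo
import MySQLdb

def migrate(collection_name, table_name, come_from):
    client = pymongo.MongoClient('192.168.1.108', 27017)
    source = client['all_comments'][collection_name]

    conn = MySQLdb.connect(host='192.168.1.108', port=3306, user='',
                           passwd='', db='xueqiu', charset='utf8')
    cur = conn.cursor()
    sql = ("INSERT INTO " + table_name +
           "(url,company_name,comment_people,comment_time,comment_content) "
           "VALUES (%s, %s, %s, %s, %s)")
    for doc in source.find():
        try:
            cur.execute(sql, (doc['url'], doc['股票名称'], doc['评论人'],
                              doc['评论时间'], doc['评论内容']))
        except Exception:
            # mirror the scripts' behaviour: log failed rows
            cur.execute("INSERT INTO error(url,come_from) VALUES (%s, %s)",
                        (doc['url'], come_from))
        conn.commit()
    cur.close()
    conn.close()

# e.g. migrate('all_taolun_comments', 'all_taolun_comments_2', 'taolun')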
/01_get_all_data/xueqiu_spider/config.py:
--------------------------------------------------------------------------------
# MongoDB host
MONGO_URL = '192.168.1.108'

# Xueqiu base URL
XUEQIU_URL = 'https://xueqiu.com'

# IP proxy-pool endpoint
PROXY_POOL_URL = 'http://127.0.0.1:5000/get'

# Database holding the A-share company lists
COME_DB = 'company'

# A-share source collections (run.py reads exactly these two)
SHANGHAI_A = 'shanghai'  # Shanghai A shares
SHENZHEN_A = 'shenzhen'  # Shenzhen A shares

# Look-back window in seconds (days * hours * minutes * seconds);
# the daily crawl covers the previous day.
PAST_DAY = 1 * 24 * 60 * 60


# Databases for the current day's crawl
TODAY_TAOLUN_DB = 'day1_taolun'
TODAY_JIAOYI_DB = 'day1_jiaoyi'
TODAY_XINWEN_DB = 'day1_xinwen'
TODAY_GONGGAO_DB = 'day1_gonggao'
TODAY_YANBAO_DB = 'day1_yanbao'


# 30-day staging databases
MIDDLE_TIME = 30

MIDDLE_TAOLUN_DB = 'day30_taolun'
MIDDLE_JIAOYI_DB = 'day30_jiaoyi'
MIDDLE_XINWEN_DB = 'day30_xinwen'
MIDDLE_YANBAO_DB = 'day30_yanbao'
MIDDLE_GONGGAO_DB = 'day30_gonggao'

MIDDLE_COMMENTS = 'middle_comments'

# History databases
HISTORY_TAOLUN_DB = 'xueqiu_history_taolun'
HISTORY_JIAOYI_DB = 'xueqiu_history_jiaoyi'
HISTORY_XINWEN_DB = 'xueqiu_history_xinwen'
HISTORY_YANBAO_DB = 'xueqiu_history_yanbao'
HISTORY_GONGGAO_DB = 'xueqiu_history_gonggao'

HISTORY_COMMENTS = 'xueqiu_history_comments'

# MySQL target tables
TAOLUN_DB_MYSQL = 'all_taolun_comments_2'
JIAOYI_DB_MYSQL = 'all_jiaoyi_comments_2'
XINWEN_DB_MYSQL = 'all_xinwen_comments_2'
YANBAO_DB_MYSQL = 'all_yanbao_comments_2'
GONGGAO_DB_MYSQL = 'all_gonggao_comments_2'
--------------------------------------------------------------------------------
/03_get_cookies/ym/ym.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Client for the Yima (fxhyd.cn) SMS-verification platform.
from ym.ym_config import *
import requests
import re
import time
# requests raises its own ConnectionError, not the builtin one
from requests.exceptions import ConnectionError


class ym():
    def __init__(self):
        self.token = TOKEN
        self.itemid = ITEMID

    # Fetch account information.
    def get_accountinfo(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getaccountinfo&token=' + self.token
        html = self.get_html(url)
        return self.deal_html(html)

    # Rent a phone number.
    def get_mobile(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getmobile&itemid={}&token={}'.format(self.itemid, self.token)
        print(url)
        html = self.get_html(url)
        res = self.deal_html(html)  # named `res` so it cannot shadow the re module
        if res[0] == 'success':
            return res[1]

    # Release one phone number.
    def release(self, mobile):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=release&mobile={}&itemid={}&token={}'.format(mobile, self.itemid, self.token)
        html = self.get_html(url)
        return self.deal_html(html)

    # Release every held phone number.
    def release_all(self):
        url = 'http://api.fxhyd.cn/UserInterface.aspx?action=releaseall&token={}'.format(self.token)
        html = self.get_html(url)
        return self.deal_html(html)

    # Poll for the verification SMS (up to 12 tries, 5 s apart).
    def get_sms(self, mobile):
        for n in range(1, 13):
            time.sleep(5)
            url = 'http://api.fxhyd.cn/UserInterface.aspx?action=getsms&mobile={}&itemid={}&token={}&release=1'.format(mobile, self.itemid, self.token)
            html = self.get_html(url)
            print(html)
            if html != '3001':  # '3001' appears to mean "no SMS yet"; keep polling
                res = self.deal_html(html)
                print(res)
                if res[0] == 'success':
                    info = re.findall(r'\d+', res[1])
                    return info[0]
        return None

    # Placeholder: query SMS delivery status.
    def get_send_sms_state(self):
        pass

    # Fetch a URL.
    def get_html(self, url):
        try:
            response = requests.get(url)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    # The API returns '|'-separated fields.
    def deal_html(self, html):
        print(html)
        return html.split("|")
--------------------------------------------------------------------------------
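How open_chome.py (further below) drives this client, shown standalone (a sketch; it assumes a funded account and a valid TOKEN/ITEMID in ym_config.py):

from ym.ym import ym

y = ym()
mobile = y.get_mobile()       # rent a phone number for the Xueqiu item id
if mobile:
    code = y.get_sms(mobile)  # poll up to ~60 s for the verification SMS
    if code is None:
        y.release(mobile)     # hand the number back if no SMS arrived
    else:
        print(mobile, code)   # feed these into the registration form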
/04_get_history_from_wangyi/get_history_wangyi_csv.py:
--------------------------------------------------------------------------------
import requests
import time
from lxml import etree
import pymongo
import re


class Download_HistoryStock(object):
    def __init__(self, code, name, totalCount):
        self.code = code              # exchange-prefixed code: '0'+code for SH, '1'+code for SZ
        self.name = name
        self.totalCount = totalCount  # bare six-digit stock code
        self.start_url = "http://quotes.money.163.com/trade/lsjysj_" + totalCount + ".html#01b07"
        print(self.start_url)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        response = requests.get(self.start_url, headers=self.headers)
        print(response.status_code)
        if response.status_code == 200:
            return etree.HTML(response.content)
        return False

    def get_date(self, response):
        # Read the first and last trading dates off the page.
        start_date = ''.join(response.xpath('//input[@name="date_start_type"]/@value')[0].split('-'))
        end_date = ''.join(response.xpath('//input[@name="date_end_type"]/@value')[0].split('-'))
        return start_date, end_date

    def download(self, start_date, end_date):
        download_url = ("http://quotes.money.163.com/service/chddata.html?code=" + self.code
                        + "&start=" + start_date + "&end=" + end_date
                        + "&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP")
        print(download_url)
        data = requests.get(download_url, headers=self.headers)
        with open('D://history//' + self.name + '.csv', 'wb') as f:
            for chunk in data.iter_content(chunk_size=10000):
                if chunk:
                    f.write(chunk)
        print('Downloading history data for stock', self.code)

    def run(self):
        try:
            html = self.parse_url()
            start_date, end_date = self.get_date(html)
            self.download(start_date, end_date)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    # Iterate over the stock lists stored in MongoDB.
    client = pymongo.MongoClient('192.168.1.108', connect=False)
    db = client['company']
    for table in db.collection_names():
        if table != 'system.indexes':
            print('table name is', table)
            collection = db[table]
            cursor = collection.find(no_cursor_timeout=True)
            for item in cursor:
                name = item['A股代码']
                totalCount = re.sub(r"\D", "", name)
                # NetEase prefixes Shanghai codes with '0' and Shenzhen codes with '1'.
                if table == 'shanghai':
                    temp_code = '0' + totalCount
                elif table == 'shenzhen':
                    temp_code = '1' + totalCount
                else:
                    continue
                print(temp_code)
                time.sleep(1)
                download = Download_HistoryStock(temp_code, item['A股代码'] + '_' + item['A股简称'], totalCount)
                download.run()
            cursor.close()
--------------------------------------------------------------------------------
/03_get_cookies/login/open_chome.py:
--------------------------------------------------------------------------------
# Simulated login: drives headless Chrome through Xueqiu's phone-number
# registration flow and saves the resulting cookies to the pool.
import time
import random
from login.chome_config import *
from login.save_cookie import *
from ym.ym import ym
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

options = webdriver.ChromeOptions()

options.add_argument('--headless')     # no visible window (required on headless Linux)
options.add_argument('--disable-gpu')  # works around a Chrome headless bug

# Chinese locale
options.add_argument('lang=zh_CN.UTF-8')
uaList = USER_AGENT

# Rotate the User-Agent.
agent = random.choice(uaList)
print(agent)
options.add_argument('user-agent="{}"'.format(agent))

options.add_argument('window-size=1920x3000')  # fixed viewport
options.add_argument('--disable-extensions')
# options.add_argument('--hide-scrollbars')                   # for special pages
# options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
options.add_argument('--no-sandbox')

# options.binary_location = r'C:\Users\wangk\AppData\Local\Google\Chrome\Application\chrome.exe'
options.binary_location = r'/opt/google/chrome/chrome'

browser = webdriver.Chrome(r'/usr/local/bin/chromedriver', chrome_options=options)
wait = WebDriverWait(browser, 10)

y = ym()


def open_chrome(retries=3):
    try:
        browser.get('https://xueqiu.com/')
        print(browser.get_cookies())
        time.sleep(2)
        browser.find_element_by_class_name('nav__login__regist').click()
        # Rent a phone number from the Yima platform.
        phone = y.get_mobile()
        print(phone)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(1) > input[type="text"]').send_keys(phone)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(1) > span:nth-child(3) > a').click()

        code = y.get_sms(phone)
        print(code)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__mod > div.modal__login__regist > div.modal__login__form > form > div:nth-child(2) > input[type="text"]').send_keys(code)
        time.sleep(2)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(1) > div.modal.modal__login > div.modal__login__main > div.modal__login__btn').click()
        time.sleep(1)
        print(browser.get_cookies())
        time.sleep(4)
        browser.find_element_by_css_selector('#app > div.modals.dimmer.js-shown > div:nth-child(2) > div > div.modal__confirm__btns > a.button.button-lg.modal__confirm__submit').click()
        time.sleep(4)
        cookies = browser.get_cookies()
        print(cookies)
        cook = ""
        for cookie in browser.get_cookies():
            cook = cook + cookie['name'] + '=' + cookie['value'] + ';'
            print(cookie['name'], cookie['value'])
        print(cook)
        if save_cookie(cook, agent):
            print("Save to Mongodb Successfully")
            browser.close()
            return True
    except Exception:
        # Retry a bounded number of times instead of recursing forever.
        if retries > 0:
            return open_chrome(retries - 1)
        return False
--------------------------------------------------------------------------------
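open_chome.py creates `wait = WebDriverWait(browser, 10)` but then relies on fixed time.sleep() calls. A sketch of the explicit-wait equivalent for the first click, reusing `wait` from the file above (expected_conditions is selenium's standard helper):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Replaces the time.sleep(2) + blind click at the top of open_chrome():
login_link = wait.until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'nav__login__regist')))
login_link.click()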
/02_add_new_comments/add_comments/new_comments_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8
# Incrementally re-crawl comment data for recent posts.
import json
import time
import pymongo
from requests.exceptions import ConnectionError
import re
import MySQLdb
import requests
from add_comments.config import *
from urllib.parse import urlencode
from json.decoder import JSONDecodeError


proxy = None


class add_comments():
    def __init__(self, type, databases_group):
        self.type = type
        self.header = self.get_header()
        self.databases_group = databases_group

    # Pick the MySQL table to write to.
    def get_mysql_databases(self):
        # taolun: 1, jiaoyi: 2, xinwen: 3, gonggao: 4, yanbao: 5
        if self.type == 1:
            return TAOLUN_DB_MYSQL
        elif self.type == 2:
            return JIAOYI_DB_MYSQL
        elif self.type == 3:
            return XINWEN_DB_MYSQL
        elif self.type == 4:
            return GONGGAO_DB_MYSQL
        elif self.type == 5:
            return YANBAO_DB_MYSQL
        else:
            return None

    # Fetch a cookie from the cookie pool.
    def get_header(self):
        client_cookie = pymongo.MongoClient(MONGODB_URL, connect=False)
        db_cookie = client_cookie[COOKIE_DB]
        collection_cookie = db_cookie[COOKIE_TABLE]
        count = collection_cookie.count_documents({})
        if count < 5:
            print("cookie pool is running low; current size:", count)
        cursor = collection_cookie.find(no_cursor_timeout=True)
        for item in cursor:
            now_time = int(time.time())
            if int(item['time']) + 172800 > now_time:  # younger than 48 hours
                if item['count_used'] < 3:
                    headers = {
                        "Cookie": item['Cookie'],
                        "Host": "xueqiu.com",
                        "Upgrade-Insecure-Requests": "1",
                        "User-Agent": item['User-Agent']
                    }
                    # bump the usage counter
                    collection_cookie.update_one({'_id': item['_id']}, {'$set': {'count_used': item['count_used'] + 1}})
                    print(headers)
                    return headers
            else:
                print(item['_id'], "has expired!")
                # expired cookies are deleted by 01_get_all_data's spider
                # collection_cookie.delete_one({"_id": item['_id']})
        cursor.close()

    # Fetch one page of comments for a post.
    def comments_get_page_index(self, comment_id, count=10, page=1):
        data = {
            'id': comment_id,
            'count': count,
            'page': page,
            'reply': 'true',
            'type': 'status',
            'split': 'true',
            'asc': 'false'
        }
        params = urlencode(data)
        base = 'https://xueqiu.com/statuses/comments.json'
        url = base + '?' + params
        return self.get_html(url)

    def parse_page_index(self, html):
        try:
            data = json.loads(html)
            if data and 'comments' in data.keys():
                for item in data.get('comments'):
                    yield item
        except JSONDecodeError:
            pass

    # Extract the interesting fields from each comment.
    def comments_parse_page_detail(self, html):
        datas = []
        items = self.parse_page_index(html)
        for it in items:
            created = self.deal_data(str(it['created_at']))  # post time (avoid shadowing the time module)
            people = it['user']['screen_name']
            content = it['text']
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("\u0026", "").replace("nbsp;", "")
            except AttributeError:
                pass
            data = {
                '评论人': people,
                '评论时间': created,
                '评论内容': content
            }
            datas.append(data)
        return datas

    # Convert a UNIX millisecond timestamp such as 1515026126000
    # into a local time string.
    def deal_data(self, data):
        data = int(data[:-3])
        format = '%Y-%m-%d %H:%M:%S'
        value = time.localtime(data)
        dt = time.strftime(format, value)
        return dt

    def get_comments(self, comment_id):
        comments = []
        ret_json = self.comments_get_page_index(comment_id)
        if not ret_json:
            return comments
        data = json.loads(ret_json)
        count = data.get('count')
        if count:
            i = int(count / 10) + 1
            for x in range(1, i + 1):
                re_json = self.comments_get_page_index(comment_id, count=10, page=x)
                res = self.comments_parse_page_detail(re_json)
                for r in res:
                    comments.append(r)
        return comments

    def deal_url(self, url):
        if self.type == 3 or self.type == 4 or self.type == 5:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/S/.*?/(\d+)', re.S), str(url))
        else:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/\d+/(\d+)', re.S), str(url))
        return comment_id[0]

    def get_html(self, url, count=1):
        global proxy
        if count >= 5:
            print('Tried Too Many Counts')
            return None
        try:
            if proxy:
                proxies = {
                    'http': 'http://' + proxy
                }
                print(proxies)
                response = requests.get(url, allow_redirects=False, headers=self.header, proxies=proxies)
            else:
                response = requests.get(url, allow_redirects=False, headers=self.header)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                # Blocked: switch to a proxy.
                print('302')
                proxy = self.get_proxy()
                if proxy:
                    print('Using Proxy', proxy)
                    return self.get_html(url)
                else:
                    print('Get Proxy Failed')
                    return None
        except ConnectionError as e:
            print('Error Occurred', e.args)
            proxy = self.get_proxy()
            count += 1
            return self.get_html(url, count)

    def get_proxy(self):
        try:
            response = requests.get('http://127.0.0.1:5000/get')
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    def start_MySQL(self):
        conn = MySQLdb.connect(
            host='192.168.1.108',
            port=3306,
            user='Andlinks',
            passwd='Andlinks2017',
            db='xueqiu',
            charset='utf8')

        cur = conn.cursor()
        myConn_list = [conn, cur]
        return myConn_list

    def close_MySQL(self, cur, conn):
        cur.close()
        conn.close()

    def main(self):
        database_groups = self.databases_group

        myConn_list = self.start_MySQL()
        cur = myConn_list[1]
        conn = myConn_list[0]

        # Parameterized so quotes in comment text cannot break the query.
        insert_sql = ("INSERT INTO " + self.get_mysql_databases() +
                      "(url,company_name,comment_people,comment_time,comment_content) "
                      "VALUES (%s, %s, %s, %s, %s)")
        for db_name in database_groups:
            client = pymongo.MongoClient(MONGODB_URL, connect=False)
            db = client[db_name]
            for table in db.collection_names():
                if table != 'system.indexes':
                    collection = db[table]
                    cursor = collection.find(no_cursor_timeout=True)
                    for item in cursor:
                        try:
                            if item['count_key'] < 8:  # only refresh posts younger than 7 days
                                url = item['url']
                                name = item['股票名称']
                                _id = item['_id']
                                print("time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), "db:", db_name, " table:", table, "url:", url)
                                comment_id = self.deal_url(url)
                                comments = self.get_comments(comment_id)
                                # Refresh the record in MongoDB.
                                collection.update_one({"_id": _id}, {"$set": {"评论": comments}})
                                collection.update_one({"_id": _id}, {"$set": {"评论量": str(len(comments))}})
                                # Mirror the new comments into MySQL.
                                for temple in comments:
                                    args = (url, name, temple['评论人'], temple['评论时间'], temple['评论内容'])
                                    print(args)
                                    try:
                                        cur.execute(insert_sql, args)
                                        conn.commit()
                                    except Exception as e:
                                        print(e)
                        except KeyError as e:
                            print(e)
                    cursor.close()
        self.close_MySQL(cur, conn)
--------------------------------------------------------------------------------
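get_comments() above pulls comments ten per page and derives the page count as int(count / 10) + 1, which requests one empty extra page whenever count is an exact multiple of ten. The ceiling-division equivalent, for reference:

import math

def pages_needed(total_comments, per_page=10):
    # math.ceil(27 / 10) == 3; math.ceil(30 / 10) == 3 (no empty extra page)
    return math.ceil(total_comments / per_page)

assert pages_needed(27) == 3
assert pages_needed(30) == 3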
/01_get_all_data/xueqiu_spider/copy_data_to_history.py:
--------------------------------------------------------------------------------
from xueqiu_spider.config import *

import pymongo


# Move records that have aged out of the 30-day window
# (count_key > MIDDLE_TIME) into the permanent history databases,
# mirroring each record's comments into the matching per-type
# collection inside HISTORY_COMMENTS.
class copy_history_db(object):
    # Fields common to every content type.
    BASE_FIELDS = ['url', 'A股代码', '股票名称', '时间', '发布者', '标题',
                   '转发量', '评论量', '点赞量']

    def _copy(self, old_db, new_db, extra_fields, comments_table, tag):
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        db_new = client[new_db]
        db = client[old_db]
        for table in db.collection_names():
            if table == 'system.indexes':
                continue
            print(tag + '_copy, table name is', table)
            collection = db[table]
            cursor = collection.find(no_cursor_timeout=True)
            for item in cursor:
                try:
                    if item['count_key'] > MIDDLE_TIME:
                        data = {field: item[field]
                                for field in self.BASE_FIELDS + extra_fields}
                        try:
                            data['评论'] = item['评论']
                        except KeyError:
                            data['评论'] = []  # record has no crawled comments yet
                        print(data)
                        db_new[table].insert_one(data)
                        db_comment = client[HISTORY_COMMENTS]
                        for da in data['评论']:
                            com = {
                                'url': item['url'],
                                '股票名称': item['股票名称'],
                                '评论人': da['评论人'],
                                '评论时间': da['评论时间'],
                                '评论内容': da['评论内容']
                            }
                            db_comment[comments_table].insert_one(com)
                except KeyError:
                    print('count_key/field error! ' + str(item['_id']))
            cursor.close()

    # Copy discussion (taolun) records.
    def copy_taolun(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'taolun_comments', 'taolun')

    # Copy trade (jiaoyi) records.
    def copy_jiaoyi(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'jiaoyi_comments', 'jiaoyi')

    # Copy news (xinwen) records.
    def copy_xinwen(self, old_db, new_db):
        self._copy(old_db, new_db, ['摘要', '外部链接'], 'xinwen_comments', 'xinwen')

    # Copy announcement (gonggao) records.
    def copy_gonggao(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文', 'PDF下载链接'], 'gonggao_comments', 'gonggao')

    # Copy research-report (yanbao) records.
    def copy_yanbao(self, old_db, new_db):
        self._copy(old_db, new_db, ['正文'], 'yanbao_comments', 'yanbao')

    def copy_history_main(self):
        self.copy_taolun(MIDDLE_TAOLUN_DB, HISTORY_TAOLUN_DB)
        self.copy_jiaoyi(MIDDLE_JIAOYI_DB, HISTORY_JIAOYI_DB)
        self.copy_xinwen(MIDDLE_XINWEN_DB, HISTORY_XINWEN_DB)
        self.copy_gonggao(MIDDLE_GONGGAO_DB, HISTORY_GONGGAO_DB)
        self.copy_yanbao(MIDDLE_YANBAO_DB, HISTORY_YANBAO_DB)
--------------------------------------------------------------------------------
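The copy above is driven by each document's count_key, which content_spider.py sets to 1 on insert and which must be incremented daily for the > 30 test to ever fire. The incrementing step is not shown in this section; a sketch of what it presumably looks like (an assumption, with the field and database names taken from the code above):

import pymongo

client = pymongo.MongoClient('192.168.1.108', connect=False)
db = client['day30_taolun']  # one of the 30-day staging databases
for table in db.list_collection_names():
    # Age every document by one day.
    db[table].update_many({}, {'$inc': {'count_key': 1}})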
/01_get_all_data/xueqiu_spider/content_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8
# Xueqiu post spider.
import re
import json
import time
import datetime
import pymongo
import random
import requests
from xueqiu_spider.config import *
from urllib.parse import urlencode
from json.decoder import JSONDecodeError
from requests.exceptions import ConnectionError

proxy = None


class DealHtml(object):
    def __init__(self, type, divide_db, divide_collection, add_time):
        self.type = type
        self.header = self.get_header()
        self.databases = self.get_databases()          # target database
        self.past_time = int(time.time()) - add_time   # crawl cut-off timestamp

        # Database the crawled posts are written to.
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        self.db = client[self.databases]

        # Collection listing the listed companies.
        client_sjs = pymongo.MongoClient(MONGO_URL, connect=False)
        db_sjs = client_sjs[divide_db]
        self.collection_sjs = db_sjs[divide_collection]
        print('init end')

    # Pick the target database.
    def get_databases(self):
        # taolun: 1, jiaoyi: 2, xinwen: 3, gonggao: 4, yanbao: 5
        if self.type == 1:
            return TODAY_TAOLUN_DB
        elif self.type == 2:
            return TODAY_JIAOYI_DB
        elif self.type == 3:
            return TODAY_XINWEN_DB
        elif self.type == 4:
            return TODAY_GONGGAO_DB
        elif self.type == 5:
            return TODAY_YANBAO_DB
        else:
            return None

    # Fetch a cookie from the cookie pool.
    def get_header(self):
        client_cookie = pymongo.MongoClient(MONGO_URL, connect=False)
        db_cookie = client_cookie['cookiesPool']
        collection_cookie = db_cookie['cookies']
        count = collection_cookie.count_documents({})
        if count < 5:
            print("cookie pool is running low; current size:", count)
        cursor = collection_cookie.find(no_cursor_timeout=True)
        for item in cursor:
            now_time = int(time.time())
            if int(item['time']) + 172800 > now_time:  # younger than 48 hours
                if item['count_used'] < 3:
                    headers = {
                        "Cookie": item['Cookie'],
                        "Host": "xueqiu.com",
                        "Upgrade-Insecure-Requests": "1",
                        "User-Agent": item['User-Agent']
                    }
                    # bump the usage counter
                    collection_cookie.update_one({'_id': item['_id']}, {'$set': {'count_used': item['count_used'] + 1}})
                    print(headers)
                    return headers
            else:
                print(item['_id'], "has expired!")
                # drop the stale cookie
                collection_cookie.delete_one({"_id": item['_id']})
        cursor.close()

    # Fetch a proxy from the proxy pool.
    def get_proxy(self):
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    # Fetch html.
    def get_html(self, url, count=1):
        global proxy
        if count >= 5:
            print('Tried Too Many Counts')
            return None
        try:
            if proxy:
                proxies = {
                    'http': 'http://' + proxy
                }
                response = requests.get(url, allow_redirects=False, headers=self.header, proxies=proxies)
            else:
                response = requests.get(url, allow_redirects=False, headers=self.header)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                # Blocked: switch to a proxy.
                proxy = self.get_proxy()
                if proxy:
                    print('Using Proxy', proxy)
                    return self.get_html(url)
                else:
                    print('Get Proxy Failed')
                    return None
        except ConnectionError as e:
            print('Error Occurred', e.args)
            proxy = self.get_proxy()
            count += 1
            return self.get_html(url, count)

    # Build the listing URL for each content type.
    def get_page_url(self, i, ID):
        if self.type == 1:
            data = {
                'count': 10,
                'comment': 0,
                'symbol': ID,
                'hl': 0,
                'source': 'user',
                'sort': 'time',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/search.json'
            url = base + '?' + params
            return url
        elif self.type == 2:
            data = {
                'count': 10,
                'comment': 0,
                'symbol': ID,
                'hl': 0,
                'source': 'trans',
                'sort': 'time',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/search.json'
            url = base + '?' + params
            return url
        elif self.type == 3:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '自选股新闻',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        elif self.type == 4:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '公告',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        elif self.type == 5:
            data = {
                'count': 10,
                'symbol_id': ID,
                'source': '研报',
                'page': i
            }
            params = urlencode(data)
            base = 'https://xueqiu.com/statuses/stock_timeline.json'
            url = base + '?' + params
            return url
        else:
            return None

    # Fetch one listing page.
    def get_page_index(self, i, ID):
        url = self.get_page_url(i, ID)
        if url:
            try:
                response = requests.get(url, headers=self.header)
                if response.status_code == 200:
                    return response.text
                return None
            except ConnectionError:
                print('Error occurred')
                return None

    # Fetch one post page.
    def get_page_detail(self, url):
        return self.get_html(url)

    # Yield post URLs from the listing JSON.
    def parse_page_index(self, text):
        try:
            data = json.loads(text)
            if data and 'list' in data.keys():
                for item in data.get('list'):
                    yield item.get('target')
        except JSONDecodeError:
            pass

    # Extract the post fields with regular expressions.
    def parse_page_detail(self, html, url, ID, Name):
        html = self.get_html_json(html)

        try:
            text_pattern = re.compile(r"id.*?user_id.*?\"title\":\"(.*?)\",\"created_at\":(\d+),\"retweet_count\":(\d+),\"reply_count\":(\d+),\"fav_count\":(\d+),.*?user.*?\"screen_name\":\"(.*?)\",.*?\"like_count\":(\d+),.*?\"is_answer.*?\"text\":\"(.*?)\",\"source\".*?}", re.S)
            text = re.findall(text_pattern, str(html))
            title = text[0][0] if text[0][0] else '无'
            created = self.deal_data(text[0][1])  # post time (avoid shadowing the time module)
            retweet_count = text[0][2]
            reply_count = text[0][3]
            screen_name = text[0][5]
            like_count = text[0][6]
            content = text[0][7]
            if self.type == 3 or self.type == 4:
                try:
                    out_url_pattern = re.compile(r'href=\\\\"(.*?)\\\\"\stitle', re.S)
                    out_url = re.findall(out_url_pattern, content)[0]
                except (AttributeError, IndexError):
                    out_url = None
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("&nbsp;", "").replace("\u0026", "")
            except AttributeError:
                pass
        except IndexError:
            print('error')
            return None
        if self.type == 3:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '摘要': content,
                '发布者': screen_name,
                '外部链接': out_url,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }
        elif self.type == 4:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '正文': content,
                '发布者': screen_name,
                'PDF下载链接': out_url,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }
        else:
            return {
                'A股代码': ID,
                '股票名称': Name,
                'url': url,
                '标题': title,
                '时间': created,
                '正文': content,
                '发布者': screen_name,
                '转发量': retweet_count,
                '评论量': reply_count,
                '点赞量': like_count,
                'count_key': 1
            }

    # Pull the embedded status JSON out of the post page.
    def get_html_json(self, html):
        text_pattern = re.compile(r'window.SNOWMAN_STATUS\s=(.*?);\swindow.SNOWMAN_TARGET', re.S)
        # parse_page_detail runs its regexes over str() of this match list.
        return re.findall(text_pattern, html)

    # Convert a UNIX millisecond timestamp such as 1515026126000
    # into a local time string.
    def deal_data(self, data):
        data = int(data[:-3])
        format = '%Y-%m-%d %H:%M:%S'
        value = time.localtime(data)
        dt = time.strftime(format, value)
        return dt

    # Check whether the response carries an error code.
    def error_code(self, html):
        try:
            data = json.loads(html)
            if data and 'error_code' in data.keys():
                return data['error_code']
        except JSONDecodeError:
            pass

    # Save one post to MongoDB.
    def save_to_mongodb(self, result, table_name):
        if self.db[table_name].insert_one(result):
            print('Successfully Saved to Mongodb', result['url'])
            return True
        return False

    # Read the page count from the listing JSON.
    def get_maxPage(self, text):
        try:
            data = json.loads(text)
            if data and 'maxPage' in data.keys():
                return data.get('maxPage')
        except JSONDecodeError:
            pass

    # Crawl every post on one listing page.
    def get_connent(self, ID, Name, i):
        start_url = XUEQIU_URL
        get_json = self.get_page_index(i, ID)
        if get_json:
            if self.error_code(get_json) == "22621":  # rate-limited: rotate the cookie
                self.header = self.get_header()
            urls = self.parse_page_index(get_json)
            if urls:
                for url in urls:
                    url = start_url + url
                    html = self.get_page_detail(url)
                    if html:
                        to_mongodb = self.parse_page_detail(html, url, ID, Name)
                        if to_mongodb:
                            dt = to_mongodb['时间']
                            # back to a struct_time ...
                            timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
                            # ... and to a UNIX timestamp
                            timestamp = time.mktime(timeArray)
                            if timestamp < self.past_time:
                                return False  # reached posts older than the cut-off
                            else:
                                table_name = ID + "_" + Name
                                self.save_to_mongodb(to_mongodb, str(table_name))
        return True

    """
    Comment crawling
    """
    # Fetch one page of comments.
    def comments_get_page_index(self, comment_id, count=10, page=1):
        data = {
            'id': comment_id,
            'count': count,
            'page': page,
            'reply': 'true',
            'type': 'status',
            'split': 'true',
            'asc': 'false'
        }
        params = urlencode(data)
        base = 'https://xueqiu.com/statuses/comments.json'
        url = base + '?' + params
        return self.get_html(url)

    def parse_page_index_comment(self, html):
        try:
            data = json.loads(html)
            if data and 'comments' in data.keys():
                for item in data.get('comments'):
                    yield item
        except JSONDecodeError:
            pass

    # Extract the interesting fields from each comment.
    def comments_parse_page_detail(self, html):
        datas = []
        items = self.parse_page_index_comment(html)
        for it in items:
            created = self.deal_data(str(it['created_at']))  # post time (avoid shadowing the time module)
            people = it['user']['screen_name']
            content = it['text']
            try:
                content_pattern = re.compile(r'<[^>]+>', re.S)
                content = content_pattern.sub('', content)
                content = content.replace("\u0026", "").replace("nbsp;", "").replace("\\u3000", "")
            except AttributeError:
                pass
            data = {
                '评论人': people,
                '评论时间': created,
                '评论内容': content
            }
            datas.append(data)
        return datas

    def get_comments(self, comment_id):
        comments = []
        ret_json = self.comments_get_page_index(comment_id)
        if not ret_json:
            return comments
        data = json.loads(ret_json)
        count = data.get('count')
        if count:
            i = int(count / 10) + 1
            for x in range(1, i + 1):
                re_json = self.comments_get_page_index(comment_id, count=10, page=x)
                res = self.comments_parse_page_detail(re_json)
                for r in res:
                    comments.append(r)
        return comments

    def deal_url(self, url):
        if self.type == 3 or self.type == 4 or self.type == 5:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/S/.*?/(\d+)', re.S), str(url))
        else:
            comment_id = re.findall(re.compile(r'https://xueqiu.com/\d+/(\d+)', re.S), str(url))
        return comment_id[0]

    def comments_from(self, db_name):
        client = pymongo.MongoClient(MONGO_URL, connect=False)
        db = client[db_name]
        for table in db.collection_names():
            if table != 'system.indexes':
                collection = db[table]
                cursor = collection.find(no_cursor_timeout=True)
                for item in cursor:
                    try:
                        item['评论']  # comments already crawled: skip
                    except KeyError:
                        try:
                            url = item['url']
                            _id = item['_id']
                            print(url)
                            print("current db:", db_name, "\n table:", table)
                            comment_id = self.deal_url(url)
                            comments = self.get_comments(comment_id)
                            collection.update_one({"_id": _id}, {"$set": {"评论": comments}})
                        except KeyError:
                            print("url error")
                cursor.close()
    """
    End of comment crawling
    """

    def main(self):
        print('total companies:', self.collection_sjs.count_documents({}))
        cursor = self.collection_sjs.find(no_cursor_timeout=True)
        for item in cursor:
            try:
                ID = item['A股代码']    # A-share code
                Name = item['A股简称']  # short company name
                get_json = self.get_page_index(1, ID)
                if get_json:
                    max_page = self.get_maxPage(get_json)
                    if not max_page:
                        continue
                    for i in range(1, max_page + 1):
                        print("time:", datetime.datetime.now(), 'table:', ID + '_' + Name, 'target db:', self.databases, 'pages:', max_page, 'page:', i)
                        if self.get_connent(ID, Name, i):
                            sleeptime = random.randint(1, 3)
                            time.sleep(sleeptime)
                        else:
                            break  # reached posts older than the cut-off
            except Exception as e:
                print(e, "KeyError!!!")
        cursor.close()

        # Posts done; now crawl their comments (retried once on failure).
        try:
            self.comments_from(self.databases)
        except Exception:
            self.comments_from(self.databases)
--------------------------------------------------------------------------------
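deal_data() above drops the last three digits of Xueqiu's millisecond timestamps before converting with time.localtime. The same conversion via datetime, with a worked value:

from datetime import datetime

ms = 1515026126000                      # example value from the comment in the source
dt = datetime.fromtimestamp(ms / 1000)  # 2018-01-04 08:35:26 when run in China Standard Time
print(dt.strftime('%Y-%m-%d %H:%M:%S'))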
/01_get_all_data/xueqiu_spider/copy_data_to_30_day.py:
--------------------------------------------------------------------------------
1 | from xueqiu_spider.config import *
2 | from multiprocessing import Pool
3 | import time
4 | import MySQLdb
5 | import pymongo
6 | 
7 | # Back up the current day's data into the 30-day databases, mirror every
8 | # comment into MongoDB and MySQL, then record the daily comment counts.
9 | 
10 | # Field names are the Chinese keys used throughout the MongoDB schema.
11 | # The base list is shared by all five content types; news (xinwen) and
12 | # announcements (gonggao) carry extra fields.
13 | BASE_FIELDS = ['url', 'A股代码', '股票名称', '时间', '发布者', '标题',
14 |                '正文', '转发量', '评论量', '点赞量']
15 | FIELDS = {
16 |     'taolun': BASE_FIELDS,
17 |     'jiaoyi': BASE_FIELDS,
18 |     'xinwen': ['url', 'A股代码', '股票名称', '时间', '发布者', '标题', '摘要',
19 |                '转发量', '评论量', '点赞量', '外部链接'],
20 |     'gonggao': BASE_FIELDS + ['PDF下载链接'],
21 |     'yanbao': BASE_FIELDS,
22 | }
23 | 
24 | 
25 | class copy_middle_db(object):
26 | 
27 |     def start_MySQL(self):
28 |         # These credentials really belong in config.py; kept inline as in the original.
29 |         conn = MySQLdb.connect(
30 |             host='192.168.1.108',
31 |             port=3306,
32 |             user='Andlinks',
33 |             passwd='Andlinks2017',
34 |             db='xueqiu',
35 |             charset='utf8')
36 |         cur = conn.cursor()
37 |         return conn, cur
38 | 
39 |     def close_MySQL(self, cur, conn):
40 |         cur.close()
41 |         conn.close()
42 | 
43 |     # Copy one day's documents into the 30-day database, mirror every comment
44 |     # into MongoDB and MySQL, and return the number of comments handled.
45 |     # The five near-identical copy_taolun/.../copy_yanbao functions of the
46 |     # old version collapse into this single parametrized method.
47 |     def copy_db(self, kind, old_db, new_db):
48 |         client = pymongo.MongoClient(MONGO_URL, connect=False)
49 |         db_new = client[new_db]
50 |         db = client[old_db]
51 |         conn, cur = self.start_MySQL()
52 |         num = 0
53 |         # Placeholders instead of string formatting: the old '{}'-formatted
54 |         # INSERTs broke on quotes inside comments and invited SQL injection.
55 |         sql = ("INSERT INTO all_{}_comments_2"
56 |                "(url,company_name,comment_people,comment_time,comment_content) "
57 |                "VALUES (%s, %s, %s, %s, %s)").format(kind)
58 |         for table in db.list_collection_names():
59 |             if table == 'system.indexes':
60 |                 continue
61 |             print('{}_copy, table name is'.format(kind), table)
62 |             collection = db[table]
63 |             cursor = collection.find(no_cursor_timeout=True)
64 |             for item in cursor:
65 |                 data = {key: item.get(key) for key in FIELDS[kind]}
66 |                 data['评论'] = item.get('评论', [])  # default when the comment crawl missed this post
67 |                 db_new[table].insert_one(data)
68 |                 for da in data['评论']:
69 |                     com = {
70 |                         'url': item['url'],
71 |                         '股票名称': item['股票名称'],
72 |                         '评论人': da['评论人'],
73 |                         '评论时间': da['评论时间'],
74 |                         '评论内容': da['评论内容']
75 |                     }
76 |                     num += 1
77 |                     client[MIDDLE_COMMENTS]['{}_comments'.format(kind)].insert_one(com)
78 |                     try:
79 |                         cur.execute(sql, (com['url'], com['股票名称'], com['评论人'],
80 |                                           com['评论时间'], com['评论内容']))
81 |                         conn.commit()
82 |                     except Exception as e:
83 |                         print(e)
84 |             cursor.close()
85 |             collection.delete_many({})  # clear the day's table once it has been copied
86 |         self.close_MySQL(cur, conn)
87 |         return num
88 | 
89 |     # Thin per-type wrappers kept for callers of the old interface
90 |     def copy_taolun(self, old_db, new_db):
91 |         return self.copy_db('taolun', old_db, new_db)
92 | 
93 |     def copy_jiaoyi(self, old_db, new_db):
94 |         return self.copy_db('jiaoyi', old_db, new_db)
95 | 
96 |     def copy_xinwen(self, old_db, new_db):
97 |         return self.copy_db('xinwen', old_db, new_db)
98 | 
99 |     def copy_gonggao(self, old_db, new_db):
100 |         return self.copy_db('gonggao', old_db, new_db)
101 | 
102 |     def copy_yanbao(self, old_db, new_db):
103 |         return self.copy_db('yanbao', old_db, new_db)
104 | 
105 |     # Walk the five 30-day databases and bump the copy marker by 1
106 |     def deal_db(self):
107 |         for DB in (MIDDLE_TAOLUN_DB, MIDDLE_JIAOYI_DB, MIDDLE_XINWEN_DB,
108 |                    MIDDLE_GONGGAO_DB, MIDDLE_YANBAO_DB):
109 |             self.mark(DB)
110 | 
111 |     def mark(self, DB):
112 |         client = pymongo.MongoClient(MONGO_URL, connect=False)
113 |         db = client[DB]
114 |         for table in db.list_collection_names():
115 |             if table != 'system.indexes':
116 |                 print('database is', DB, 'table name is', table)
117 |                 # $inc creates count_key as 1 when it is missing, so the old
118 |                 # per-document try/except KeyError loop is unnecessary.
119 |                 db[table].update_many({}, {"$inc": {"count_key": 1}})
120 | 
121 |     def copy_middle_main(self):
122 |         pool = Pool(5)
123 |         # Counters bumped inside worker processes never reach the parent, so
124 |         # each task hands its comment count back through its AsyncResult.
125 |         jobs = []
126 |         for kind, old_db, new_db in [
127 |                 ('taolun', TODAY_TAOLUN_DB, MIDDLE_TAOLUN_DB),
128 |                 ('jiaoyi', TODAY_JIAOYI_DB, MIDDLE_JIAOYI_DB),
129 |                 ('xinwen', TODAY_XINWEN_DB, MIDDLE_XINWEN_DB),
130 |                 ('gonggao', TODAY_GONGGAO_DB, MIDDLE_GONGGAO_DB),
131 |                 ('yanbao', TODAY_YANBAO_DB, MIDDLE_YANBAO_DB)]:
132 |             jobs.append(pool.apply_async(self.copy_db, (kind, old_db, new_db)))
133 |             time.sleep(1)  # stagger the workers slightly
134 |         pool.close()  # no more tasks
135 |         pool.join()   # the main process waits here until every child finishes
136 |         counts = [job.get() for job in jobs]
137 |         self.deal_db()  # bump the copy marker
138 |         # Record today's comment counts in MySQL's everyday_comments_number table
139 |         conn, cur = self.start_MySQL()
140 |         sql = ("INSERT INTO everyday_comments_number"
141 |                "(pinglun_date,taolun_num,jiaoyi_num,xinwen_num,gonggao_num,yanbao_num) "
142 |                "VALUES (%s, %s, %s, %s, %s, %s)")
143 |         try:
144 |             cur.execute(sql, [time.strftime('%Y-%m-%d %H:%M:%S')] + counts)
145 |             conn.commit()
146 |         except Exception as e:
147 |             print(e)
148 |         self.close_MySQL(cur, conn)
--------------------------------------------------------------------------------
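The repo does not show how copy_middle_main is scheduled, so the invocation below is an assumption, written in the style of the other run.py entry points:

from xueqiu_spider.copy_data_to_30_day import copy_middle_db

if __name__ == '__main__':
    # Assumed entry point: back up today's data, then log the comment counts
    copy_middle_db().copy_middle_main()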