├── README.md ├── Sourcefile └── Source.xml ├── Table_script_Mysql └── t_py_stocklist.sql └── stock_list.py /README.md: -------------------------------------------------------------------------------- 1 | # 使用 Python 抓股票数据的小爬虫 2 | 3 | 这个py主要是用来从东方财富抓取当天数据的小程序,因为都是来自中国的股市数据,所以全都用中文啦。 4 | 5 | ### stock_list.py 6 | 是这个Python程序的代码,抓数据然后插入到mysql数据库表。 7 | 8 | ### t_py_stocklist.sql 9 | 它是mysql表的建表语句。 10 | 11 | ### Source.xml 12 | 其实没有必要使用资源文件,只是为了将它规范下,使用xml作为数据的资源文件的尝试吧。 13 | -------------------------------------------------------------------------------- /Sourcefile/Source.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | oracle连接字符串 4 | 5 | ip地址 6 | 用户名 7 | 密码 8 | 数据库名 9 | 端口号 10 | 11 | 12 | 13 | http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/index.aspx?type=s&sortType=C&sortRule=-1&pageSize=20&page=1&jsName=quote_123&style=33 14 | http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/index.aspx?type=s&sortType=C&sortRule=-1&pageSize=20&page=%s&jsName=quote_123&style=33 15 | 16 | -------------------------------------------------------------------------------- /Table_script_Mysql/t_py_stocklist.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `t_py_stocklist` ( 2 | `id` int(11) NOT NULL AUTO_INCREMENT, 3 | `eastmoney_code` varchar(8) DEFAULT NULL, 4 | `stock_code` varchar(8) DEFAULT NULL, 5 | `stock_name` varchar(20) DEFAULT NULL, 6 | `yes_values` varchar(10) DEFAULT NULL, 7 | `open_values` varchar(10) DEFAULT NULL, 8 | `new_values` varchar(10) DEFAULT NULL, 9 | `high_values` varchar(10) DEFAULT NULL, 10 | `low_values` varchar(10) DEFAULT NULL, 11 | `turnover_value` varchar(15) DEFAULT NULL, 12 | `turnover` varchar(15) DEFAULT NULL, 13 | `pricechange_value` varchar(15) DEFAULT NULL, 14 | `pricechange_ratio` varchar(15) DEFAULT NULL, 15 | `average_price` varchar(15) DEFAULT NULL, 16 | `amplitude` varchar(15) DEFAULT NULL, 17 | `weibi` varchar(15) 
# NOTE: in the flattened repository dump this span begins with the tail of
# /Table_script_Mysql/t_py_stocklist.sql. It is kept here as a comment so that
# no content is lost:
#     (`weibi` varchar(15)) DEFAULT NULL,
#     `insert_date` varchar(15) DEFAULT NULL,
#     PRIMARY KEY (`id`)
#   ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
#
# /stock_list.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Download the East Money stock quote pages and insert them into MySQL.

Connection settings and the two quote URLs are read from
``Sourcefile/Source.xml`` (resolved against the current working directory).
"""
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import os
import time
import pymysql
from urllib import request

# XML tag -> (settings key, message raised when the tag is missing or empty).
_SOURCE_TAGS = {
    'my01_hostname': ('host', "Can't get the hostname of mysql"),
    'my01_username': ('username', "Can't get the username of mysql"),
    'my01_password': ('password', "Can't get the password of mysql"),
    'my01_db': ('db', "Can't get the db name of mysql"),
    'my01_port': ('port', "Can't get the port of mysql"),
    'stock_pagenum': ('page_num_url', "Can't get the count of page"),
    'stock_list': ('page_list_url', "Can't get the list of the stock"),
}

# After quotes are turned into commas, one stock record occupies 35
# comma-separated tokens; only the first 15 of them are stored.
_RECORD_STRIDE = 35
_FIELDS_STORED = 15

# The table name is lower-case to match the CREATE TABLE statement; MySQL table
# names are case-sensitive on case-sensitive file systems.
_INSERT_SQL = (
    "INSERT INTO t_py_stocklist (eastmoney_code,stock_code,stock_name,yes_values,open_values,new_values,"
    "high_values,low_values,turnover_value,turnover,pricechange_value,pricechange_ratio,average_price,"
    "amplitude,weibi,INSERT_DATE)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _load_settings(source_file):
    """Return a dict with the MySQL settings and URLs read from *source_file*.

    Raises:
        ValueError: if a required tag is missing from the file or has no text.
    """
    settings = {}
    # iterparse reports only 'end' events by default, so every element seen
    # here is complete and its text is available.
    for _event, elem in ET.iterparse(source_file):
        entry = _SOURCE_TAGS.get(elem.tag)
        if entry is None:
            continue
        key, error_text = entry
        if elem.text is None:
            raise ValueError(error_text)
        settings[key] = elem.text
    for key, error_text in _SOURCE_TAGS.values():
        if key not in settings:
            raise ValueError(error_text)
    return settings


def _fetch_text(url):
    """Download *url* and return its UTF-8 text with every '"' replaced by ','."""
    with request.urlopen(url) as response:
        return response.read().decode('utf-8').replace('"', ',')


def _parse_page_count(content):
    """Return the integer that follows 'pages:' in *content*.

    The final character of *content* is dropped before conversion, as the
    number is expected to be followed by exactly one closing character.

    Raises:
        ValueError: if 'pages:' is absent or the value is not an integer.
    """
    marker = content.find('pages:')
    if marker == -1:
        raise ValueError("Can't find 'pages:' in the page-count response")
    return int(content[marker + len('pages:'):len(content) - 1])


def _parse_stock_rows(content, key_time):
    """Split one quote page into row tuples ready for insertion.

    Each row holds the first 15 fields of a record followed by *key_time*.
    A trailing fragment shorter than 15 tokens is ignored rather than raising
    IndexError.
    """
    tokens = content.split(',')
    # Token 0 is the text before the first record, so records start at 1.
    last_start = len(tokens) - _FIELDS_STORED
    return [
        tuple(tokens[start:start + _FIELDS_STORED]) + (key_time,)
        for start in range(1, last_start + 1, _RECORD_STRIDE)
    ]


def get_stocklist():
    """Fetch every quote page and insert the rows into MySQL in one batch.

    Errors are printed rather than propagated, and the cursor and connection
    are closed only if they were actually opened.
    """
    # Location of the resource file, relative to the working directory.
    source_file = os.path.join(os.getcwd(), "Sourcefile", "Source.xml")
    # One timestamp shared by every row of this run.
    # NOTE(review): this value is 19 characters long, while the accompanying
    # DDL declares insert_date as varchar(15) - confirm the column width.
    key_time = time.strftime("%Y-%m-%d %H:%M:%S")

    conn = None
    sto_cursor = None
    try:
        settings = _load_settings(source_file)
        conn = pymysql.connect(host=settings['host'],
                               port=int(settings['port']),
                               user=settings['username'],
                               passwd=settings['password'],
                               db=settings['db'],
                               charset='UTF8')
        sto_cursor = conn.cursor()

        pages_count = _parse_page_count(_fetch_text(settings['page_num_url']))

        # Rows collected from all pages.
        stock_values = []
        # Pages are requested starting at 1 (the page-count URL itself uses
        # page=1), so the last page is pages_count and must be included.
        for page in range(1, pages_count + 1):
            page_url = settings['page_list_url'] % str(page)
            print(page_url)
            stock_values.extend(_parse_stock_rows(_fetch_text(page_url), key_time))

        # Insert everything in one batch.
        if stock_values:
            sto_cursor.executemany(_INSERT_SQL, stock_values)
        conn.commit()
    except Exception as e:
        print(e)
    finally:
        if sto_cursor is not None:
            sto_cursor.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    get_stocklist()