├── .gitignore ├── .idea ├── .gitignore ├── Financial-report-acquisition-and-data-processing-with-Python.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── 001_download_annual_report.py ├── 001_download_annual_report下载年报重置版本.ipynb ├── 002_import_company_introduction_to_excel.py ├── 002爬取公司介绍.ipynb ├── 003_getting_audit_firm_name_and_pay.py ├── 003_getting_audit_firm_name_and_pay_2017.py ├── 003年审事务所.ipynb ├── 004_getting_report_file_and_information_into_excel.py ├── 004爬公司年报.ipynb ├── 005_general_spider_for_finacial_data.py ├── 005_getting_internal_control_firm_name_and_pay.py ├── 006_getting_company_information_updated_on_2020_03_19.py ├── 007_spider_for_12306.py ├── 20200421zhengzhou_land.ipynb ├── 20200512下载财务报表-网易财经.py ├── 20200512获取年报中竞争力和风险信息.py ├── AB_NYC_2019.ipynb ├── Acquiring_POI_using_Python.py ├── Acquiring_POI_using_Python_updated.py ├── README.md ├── excel ├── 2020年1月中华人民共和国县以上行政区划代码.xlsx ├── kaggle_酒店预订需求预测.ipynb └── 上市公司信息和年报审计事务所-2018年_20200320_2新增锂电池.xlsx ├── get_data_from_PDF.ipynb ├── getting_data_from_eastmoney_with_Python.ipynb ├── getting_table_with_python.ipynb ├── jq_industry_data_20200609.ipynb ├── jq_工业互联网概念股_20200610.ipynb ├── jq_智能手机概念股_20200610.ipynb ├── jq_概念股人均营收_20200609.ipynb ├── jq_概念股人均营收_火电激光SaaS.ipynb ├── jq_概念股人均营收_软件互联网.ipynb ├── jq_概念股的平均员工数_20200609.ipynb ├── jq_量子计算机概念股_20200610.ipynb ├── jqdata20200601.ipynb ├── jqdata20200601.py ├── jqdata_20200608_electricity_ultility.ipynb ├── jqdata_stock_info_20200608.ipynb ├── jqdata_stock_shareholders_20200608.ipynb ├── jqdata_获取报告期财务数据_20200610.ipynb ├── jqdata_资产负债表_20200610.ipynb ├── kaggle_共享单车.ipynb ├── pachongchuangguan.py ├── pccg.py ├── pccg2 ├── stock_list ├── 1.txt ├── 2.txt ├── 20200320.txt ├── 3.txt ├── 4.txt ├── 5.txt ├── scrawler_url.txt └── 新能源电池20200320.xlsx ├── test.py ├── testqq.py ├── tieba.py ├── tushare_20200520.ipynb ├── zhihu_04_hotel_stock.py ├── zhihu_04_mask_stock.py └── 爬虫-新浪财经-重置版.ipynb /.gitignore: 
-------------------------------------------------------------------------------- 1 | 2 | *.xml 3 | *.xls 4 | jqdatasdk-master/build/lib/jqdatasdk/__init__.py 5 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/Financial-report-acquisition-and-data-processing-with-Python.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /001_download_annual_report.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import re 3 | import os 4 | import time 5 | import random 6 | file_folder= "C://TEMP/Excel/";file_name="stock-list-20200512-electricity1.txt" 7 | # file_folder= "C://TEMP/Excel/";file_name="stock-list-20200512-electricity.txt" 8 | 
f=open(file_folder+file_name,'rb') 9 | stock = [] 10 | for line in f.readlines(): 11 | line=line.decode('utf-8') 12 | print(line,end = '') 13 | line = line.replace('\n','') 14 | stock.append(line) 15 | #print(stock) 16 | f.close() 17 | print('stock is',stock[:5]) 18 | 19 | for each in stock[1:]: 20 | print('~~~~each is',each) 21 | each1=each[2:8];each0=each[0:8];each3=each 22 | print(each3) 23 | print('1股票代码',each1) 24 | url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each1+'/page_type/ndbg.phtml' 25 | # url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each+'/page_type/ndbg.phtml' 26 | req = urllib.request.Request(url) 27 | print('2',each1) 28 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0') 29 | page = urllib.request.urlopen(req) 30 | time.sleep(random.random() * 3) 31 | print('3',each1) 32 | try: 33 | html = page.read().decode('gbk') 34 | target = r'&id=[_0-9_]{7}' 35 | print('4',each1) 36 | # target = r'&id=[_0-9_]{6}' 37 | target_list = re.findall(target,html) 38 | print(target_list) 39 | # os.mkdir('./'+str(each)) 40 | # os.mkdir('./'+'electricity-'+each) 41 | sid = each1 42 | print('5',each1,'2019和2015年target_list',target_list[0]) 43 | print("6sid is",sid) 44 | year=2019 45 | try: 46 | each2=target_list[0] 47 | # for each2 in target_list[0]: 48 | # for each2 in target_list[0,3]: 49 | #print(a) 50 | print('7',each1,each2) 51 | target_url='http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='+sid+each2 52 | print('--8--',each1) 53 | print('下载链接target_url,',target_url) 54 | treq = urllib.request.Request(target_url) 55 | time.sleep(random.random() * 3) 56 | treq.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0') 57 | tpage = urllib.request.urlopen(treq) 58 | time.sleep(random.random() * 3) 59 | print("--9--file_url is") 60 | try: 61 | print("--9-1-file_url is") 62 | thtml = 
tpage.read().decode('gbk') 63 | #print(thtml) 64 | file_url = re.search('http://file.finance.sina.com.cn/211.154.219.97:9494/.*?PDF',thtml) 65 | print('--9--file_url is',file_url.group(0)) 66 | try: 67 | #print(file_url.group(0)) 68 | report_year=file_url.group(0).split("/")[-4] 69 | print('report_year',report_year) 70 | report_year=str(int(report_year)-1) 71 | print('report_year',report_year) 72 | each3 = each[0:-1] 73 | print(each3) 74 | report_file=each3+report_year 75 | print(report_file,report_file,report_file,report_file) 76 | local = file_folder+'annual_report_2019/'+'风电'+report_file+'.pdf' 77 | # local = './' +'annual_report_2019'+ '/' +each3+report_year+'.pdf' 78 | # print('local',local) 79 | # # local = './'+each+'/'+file_url.group(0).split("/")[-4]+'.pdf' 80 | # # local = './'+each+'/'+file_url.group(0).split("/")[-1]+'.pdf' 81 | # # local = './'+sid+'/'+file_url.group(0).split("/")[-1]+'.pdf' 82 | # print("10 local is",local) 83 | #调试用作文件占位 84 | #open(local, 'wb').write(b'success') 85 | # print('11',each1,'local',local) 86 | urllib.request.urlretrieve(file_url.group(0),local,None) 87 | except: 88 | print('PDF失效;'+target_url) 89 | except: 90 | print(each1,'年报下载页面编码错误;'+target_url) 91 | except: 92 | print('each2',each2) 93 | except: 94 | print('年报列表页面编码错误;'+url) -------------------------------------------------------------------------------- /001_download_annual_report下载年报重置版本.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import urllib.request\n", 10 | "import re\n", 11 | "import os\n", 12 | "import time\n", 13 | "import random" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 7, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# file_folder= \"C:/TEMP/Excel/\";file_name=\"stock-list-20200512-electricity1.txt\"\n", 23 | "# # file_folder= 
\"C://TEMP/Excel/\";file_name=\"stock-list-20200512-electricity.txt\"\n", 24 | "# f=open(file_folder+file_name,'rb')\n", 25 | "f=['万兴科技SZ300624','恒通科技sz300374']" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 13, 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [ 35 | { 36 | "output_type": "stream", 37 | "name": "stdout", 38 | "text": "\n 万兴科技SZ300624\n恒通科技sz300374\n\n" 39 | } 40 | ], 41 | "source": [ 42 | "# stock = []\n", 43 | "# for line in f.readlines():\n", 44 | "# line=line.decode('utf-8')\n", 45 | "# print(line,end = '')\n", 46 | "# line = line.replace('\\n','')\n", 47 | "# stock.append(line)\n", 48 | "# print(stock)\n", 49 | "# f.close()\n", 50 | "stock=f\n", 51 | "print(stock)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 14, 57 | "metadata": { 58 | "tags": [] 59 | }, 60 | "outputs": [ 61 | { 62 | "output_type": "stream", 63 | "name": "stdout", 64 | "text": "~~~~each is <\n<\n1股票代码 \n2 \n3 \n4 \n[]\n年报列表页面编码错误;http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid//page_type/ndbg.phtml\n" 65 | } 66 | ], 67 | "source": [ 68 | "for each in stock[1:2]:\n", 69 | " print('~~~~each is',each)\n", 70 | " each1=each[2:8];each0=each[0:8];each3=each\n", 71 | " print(each3)\n", 72 | " print('1股票代码',each1)\n", 73 | " url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each1+'/page_type/ndbg.phtml'\n", 74 | " # url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each+'/page_type/ndbg.phtml'\n", 75 | " req = urllib.request.Request(url)\n", 76 | " print('2',each1)\n", 77 | " req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')\n", 78 | " page = urllib.request.urlopen(req)\n", 79 | " time.sleep(random.random() * 3)\n", 80 | " print('3',each1)\n", 81 | " try:\n", 82 | " html = page.read().decode('gbk')\n", 83 | " target = r'&id=[_0-9_]{7}'\n", 84 | " print('4',each1)\n", 85 | " # target = 
r'&id=[_0-9_]{6}'\n", 86 | " target_list = re.findall(target,html)\n", 87 | " print(target_list)\n", 88 | " # os.mkdir('./'+str(each))\n", 89 | " # os.mkdir('./'+'electricity-'+each)\n", 90 | " sid = each1\n", 91 | " print('5',each1,'2019和2015年target_list',target_list[0])\n", 92 | " print(\"6sid is\",sid)\n", 93 | " year=2019\n", 94 | " try:\n", 95 | " each2=target_list[0]\n", 96 | " # for each2 in target_list[0]:\n", 97 | " # for each2 in target_list[0,3]:\n", 98 | " #print(a)\n", 99 | " print('7',each1,each2)\n", 100 | " target_url='http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='+sid+each2\n", 101 | " print('--8--',each1)\n", 102 | " print('下载链接target_url,',target_url)\n", 103 | " treq = urllib.request.Request(target_url)\n", 104 | " time.sleep(random.random() * 3)\n", 105 | " treq.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')\n", 106 | " tpage = urllib.request.urlopen(treq)\n", 107 | " time.sleep(random.random() * 3)\n", 108 | " print(\"--9--file_url is\")\n", 109 | " try:\n", 110 | " print(\"--9-1-file_url is\")\n", 111 | " thtml = tpage.read().decode('gbk')\n", 112 | " #print(thtml)\n", 113 | " file_url = re.search('http://file.finance.sina.com.cn/211.154.219.97:9494/.*?PDF',thtml)\n", 114 | " print('--9--file_url is',file_url.group(0))\n", 115 | " try:\n", 116 | " #print(file_url.group(0))\n", 117 | " report_year=file_url.group(0).split(\"/\")[-4]\n", 118 | " print('report_year',report_year)\n", 119 | " report_year=str(int(report_year)-1)\n", 120 | " print('report_year',report_year)\n", 121 | " each3 = each[0:-1]\n", 122 | " print(each3)\n", 123 | " report_file=each3+report_year\n", 124 | " print(report_file,report_file,report_file,report_file)\n", 125 | " local = file_folder+'annual_report_2019/'+'风电'+report_file+'.pdf'\n", 126 | " # local = './' +'annual_report_2019'+ '/' +each3+report_year+'.pdf'\n", 127 | " # print('local',local)\n", 128 | " # # local = 
'./'+each+'/'+file_url.group(0).split(\"/\")[-4]+'.pdf'\n", 129 | " # # local = './'+each+'/'+file_url.group(0).split(\"/\")[-1]+'.pdf'\n", 130 | " # # local = './'+sid+'/'+file_url.group(0).split(\"/\")[-1]+'.pdf'\n", 131 | " # print(\"10 local is\",local)\n", 132 | " #调试用作文件占位\n", 133 | " #open(local, 'wb').write(b'success')\n", 134 | " # print('11',each1,'local',local)\n", 135 | " urllib.request.urlretrieve(file_url.group(0),local,None)\n", 136 | " except:\n", 137 | " print('PDF失效;'+target_url)\n", 138 | " except:\n", 139 | " print(each1,'年报下载页面编码错误;'+target_url)\n", 140 | " except:\n", 141 | " print('each2',each2)\n", 142 | " except:\n", 143 | " print('年报列表页面编码错误;'+url)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.7.4-final" 166 | }, 167 | "orig_nbformat": 2, 168 | "kernelspec": { 169 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 170 | "display_name": "Python 3.7.4 32-bit" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } -------------------------------------------------------------------------------- /002_import_company_introduction_to_excel.py: -------------------------------------------------------------------------------- 1 | import xlrd #（excel read）来读取Excel文件 2 | import xlwt #（excel write）来生成Excel文件 3 | workbook = xlwt.Workbook() # 新建一个工作簿 4 | sheet = workbook.add_sheet("sheet_name") # 在工作簿中新建一个表格 5 | def write_excel_xls(path,value,inum): 6 | index = len(value) # 获取需要写入数据的行数 7 | # print("index is",index) 8 | for num in range(0, index): 9 | sheet.write(inum,num,value[num]) 10 | # for i in range(0, 
index): 11 | # for j in range(0, len(value[i])): 12 | # sheet.write(i, j, value[i][j]) # 像表格中写入数据（对应的行和列） 13 | 14 | print("xls格式表格写入数据成功！") 15 | # import xlutils #暂时没有办法安装xlutils 16 | # def write_excel_xls_append(path, value): 17 | # index = len(value) # 获取需要写入数据的行数 18 | # workbook = xlrd.open_workbook(path) # 打开工作簿 19 | # sheets = workbook.sheet_names() # 获取工作簿中的所有表格 20 | # worksheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格 21 | # rows_old = worksheet.nrows # 获取表格中已存在的数据的行数 22 | # new_workbook = xlutils.copy(workbook) # 将xlrd对象拷贝转化为xlwt对象 23 | # new_worksheet = new_workbook.get_sheet(0) # 获取转化后工作簿中的第一个表格 24 | # for i in range(0, index): 25 | # for j in range(0, len(value[i])): 26 | # new_worksheet.write(i+rows_old, j, value[i][j]) # 追加写入数据，注意是从i+rows_old行开始写入 27 | # new_workbook.save(path) # 保存工作簿 28 | # print("xls格式表格【追加】写入数据成功！") 29 | 30 | 31 | 32 | # coding=UTF-8 33 | file_folder= "C:/" 34 | file_name="stock_workbook.xls" 35 | file_source="stock.txt" 36 | 37 | # print(file_foler+file_name) 38 | 39 | #构造url 40 | url_source=open(file_folder+file_source) 41 | f=open(file_folder+file_source) 42 | stock = [] 43 | for line in f.readlines(): 44 | #print(line,end = '') 45 | line = line.replace('\n','') 46 | stock.append(line) 47 | #print(stock) 48 | f.close() 49 | #print(stock) 50 | iinum = 1 51 | for each in stock: 52 | # print(each[2:]) 53 | each1=each[2:8] 54 | # print(each) 55 | url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/'+each1+'.phtml' 56 | 57 | # 开始爬虫 58 | import requests 59 | headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} 60 | # print(url) 61 | response = requests.get(url, headers=headers) 62 | response.encoding = 'gbk' # 解决乱码问题 63 | html_text = response.text 64 | # 开始爬虫 65 | 66 | import lxml 67 | from lxml import etree 68 | selector = etree.HTML(html_text) 69 | title = selector.xpath('//table/tr/td//text()') 70 | 71 | # 去除其中的冒号 72 | title_1 = [] 73 | for i in title: 74 | i = 
i.strip() 75 | i = i.strip('：') 76 | title_1.append(i) 77 | print("title_1", title_1[0:40]) 78 | # 去除其中的冒号 79 | 80 | # 存入excel 81 | path1 = file_folder + '3' + file_name 82 | print(path1) 83 | write_excel_xls(path1, title_1,iinum) 84 | workbook.save(path1) # 保存工作簿 85 | # 存入excel 86 | iinum=iinum+1 -------------------------------------------------------------------------------- /002爬取公司介绍.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.8.4-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python38464bit9379dd402be645a4a272fa7043f0c61f", 18 | "display_name": "Python 3.8.4 64-bit" 19 | } 20 | }, 21 | "nbformat": 4, 22 | "nbformat_minor": 2, 23 | "cells": [ 24 | { 25 | "cell_type": "code", 26 | "execution_count": 5, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import xlrd #（excel read）来读取Excel文件\n", 31 | "import xlwt #（excel write）来生成Excel文件\n", 32 | "workbook = xlwt.Workbook() # 新建一个工作簿\n", 33 | "sheet = workbook.add_sheet(\"sheet_name\") # 在工作簿中新建一个表格\n", 34 | "def write_excel_xls(path,value,inum):\n", 35 | " index = len(value) # 获取需要写入数据的行数\n", 36 | " # print(\"index is\",index)\n", 37 | " for num in range(0, index):\n", 38 | " sheet.write(inum,num,value[num])\n", 39 | " # for i in range(0, index):\n", 40 | " # for j in range(0, len(value[i])):\n", 41 | " # sheet.write(i, j, value[i][j]) # 像表格中写入数据（对应的行和列）\n", 42 | "\n", 43 | " print(\"xls格式表格写入数据成功！\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import xlutils #暂时没有办法安装xlutils\n", 53 | "def write_excel_xls_append(path, value):\n", 54 | " index = 
len(value) # 获取需要写入数据的行数\n", 55 | " workbook = xlrd.open_workbook(path) # 打开工作簿\n", 56 | " sheets = workbook.sheet_names() # 获取工作簿中的所有表格\n", 57 | " worksheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格\n", 58 | " rows_old = worksheet.nrows # 获取表格中已存在的数据的行数\n", 59 | " new_workbook = xlutils.copy(workbook) # 将xlrd对象拷贝转化为xlwt对象\n", 60 | " new_worksheet = new_workbook.get_sheet(0) # 获取转化后工作簿中的第一个表格\n", 61 | " for i in range(0, index):\n", 62 | " for j in range(0, len(value[i])):\n", 63 | " new_worksheet.write(i+rows_old, j, value[i][j]) # 追加写入数据，注意是从i+rows_old行开始写入\n", 64 | " new_workbook.save(path) # 保存工作簿\n", 65 | " print(\"xls格式表格【追加】写入数据成功！\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 10, 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [ 75 | { 76 | "output_type": "stream", 77 | "name": "stdout", 78 | "text": "C:/Temp/stock_workbook.xls\n" 79 | } 80 | ], 81 | "source": [ 82 | "# coding=UTF-8\n", 83 | "file_folder= \"C:/Temp/\"\n", 84 | "file_name=\"stock_workbook.xls\"\n", 85 | "file_source=\"stock.txt\"\n", 86 | "\n", 87 | "print(file_folder+file_name)\n", 88 | "\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 12, 94 | "metadata": { 95 | "tags": [] 96 | }, 97 | "outputs": [ 98 | { 99 | "output_type": "stream", 100 | "name": "stdout", 101 | "text": "sz600600\nsh000123\nsh000449\nsh000666" 102 | } 103 | ], 104 | "source": [ 105 | "#构造url\n", 106 | "url_source=open(file_folder+file_source)\n", 107 | "f=open(file_folder+file_source)\n", 108 | "stock = []\n", 109 | "for line in f.readlines():\n", 110 | " print(line,end = '')\n", 111 | " line = line.replace('\\n','')\n", 112 | " stock.append(line)\n", 113 | "#print(stock)\n", 114 | "f.close()\n", 115 | "#print(stock)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 15, 121 | "metadata": { 122 | "tags": [] 123 | }, 124 | "outputs": [ 125 | { 126 | "output_type": "stream", 127 | "name": "stdout", 128 | "text": 
"title_1 ['公司名称', '青岛啤酒股份有限公司', '公司英文名称', 'Tsingtao Brewery Company Limited', '上市市场', '上海证券交易所', '上市日期', '', '1993-08-27', '发行价格', '主承销商', '', '上海申银证券公司', '', '成立日期', '1995-12-27', '注册资本', '135098万元', '机构类型', '其它', '组织形式', '董事会秘书', '张瑞祥', '公司电话', '0532-85713831', '董秘电话', '0532-85713831', '公司传真', '0532-85713240', '董秘传真', '0532-85713240', '公司电子邮箱', '', 'info@tsingtao.com.cn', '', '董秘电子邮箱', 'secretary@tsingtao.com.cn', '公司网址', 'http://www.tsingtao.com.cn', '邮政编码']\nC:/Temp/3stock_workbook.xls\nxls格式表格写入数据成功！\ntitle_1 ['公司名称', '公司英文名称', '上市市场', '深圳证券交易所', '上市日期', '--', '', '发行价格', '主承销商', '', '成立日期', '注册资本', '0万元', '机构类型', '组织形式', '董事会秘书', '公司电话', '董秘电话', '公司传真', '董秘传真', '公司电子邮箱', '', '', '董秘电子邮箱', '公司网址', 'http://', '邮政编码', '信息披露网址', '证券简称更名历史', '注册地址', '办公地址', '公司简介', '经营范围', '↑', '返回页顶', '↑', '记录登录状态一个月', '', '登录', '']\nC:/Temp/3stock_workbook.xls\nxls格式表格写入数据成功！\ntitle_1 ['公司名称', '公司英文名称', '上市市场', '深圳证券交易所', '上市日期', '--', '', '发行价格', '主承销商', '', '成立日期', '注册资本', '0万元', '机构类型', '组织形式', '董事会秘书', '公司电话', '董秘电话', '公司传真', '董秘传真', '公司电子邮箱', '', '', '董秘电子邮箱', '公司网址', 'http://', '邮政编码', '信息披露网址', '证券简称更名历史', '注册地址', '办公地址', '公司简介', '经营范围', '↑', '返回页顶', '↑', '记录登录状态一个月', '', '登录', '']\nC:/Temp/3stock_workbook.xls\nxls格式表格写入数据成功！\ntitle_1 ['公司名称', '经纬纺织机械股份有限公司', '公司英文名称', 'Jingwei Textile Machinery Company Limited', '上市市场', '深圳证券交易所', '上市日期', '', '1996-12-10', '发行价格', '主承销商', '', '成立日期', '1996-03-29', '注册资本', '70413万元', '机构类型', '其它', '组织形式', '国有企业', '董事会秘书', '叶雪华', '公司电话', '010-84534078', '董秘电话', '010-84534078-8188', '公司传真', '010-84534135', '董秘传真', '010-84534135', '公司电子邮箱', '', 'jwgf@jwgf.com', '', '董秘电子邮箱', 'jwzd@jwgf.com', '公司网址', 'http://www.jwgf.com', '邮政编码', '100125']\nC:/Temp/3stock_workbook.xls\nxls格式表格写入数据成功！\n" 129 | } 130 | ], 131 | "source": [ 132 | "iinum = 1\n", 133 | "for each in stock:\n", 134 | " # print(each[2:])\n", 135 | " each1=each[2:8]\n", 136 | " # print(each)\n", 137 | " url = 
'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/'+each1+'.phtml'\n", 138 | "\n", 139 | " # 开始爬虫\n", 140 | " import requests\n", 141 | " headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}\n", 142 | " # print(url)\n", 143 | " response = requests.get(url, headers=headers)\n", 144 | " response.encoding = 'gbk' # 解决乱码问题\n", 145 | " html_text = response.text\n", 146 | " # 开始爬虫\n", 147 | "\n", 148 | " import lxml\n", 149 | " from lxml import etree\n", 150 | " selector = etree.HTML(html_text)\n", 151 | " title = selector.xpath('//table/tr/td//text()')\n", 152 | "\n", 153 | " # 去除其中的冒号\n", 154 | " title_1 = []\n", 155 | " for i in title:\n", 156 | " i = i.strip()\n", 157 | " i = i.strip('：')\n", 158 | " title_1.append(i)\n", 159 | " print(\"title_1\", title_1[0:40])\n", 160 | " # 去除其中的冒号\n", 161 | "\n", 162 | " # 存入excel\n", 163 | " path1 = file_folder + '3' + file_name\n", 164 | " print(path1)\n", 165 | " write_excel_xls(path1, title_1,iinum)\n", 166 | " workbook.save(path1) # 保存工作簿\n", 167 | " # 存入excel\n", 168 | " iinum=iinum+1\n", 169 | "print('————————————————全部完成————————————————)" 170 | ] 171 | } 172 | ] 173 | } -------------------------------------------------------------------------------- /003_getting_audit_firm_name_and_pay.py: -------------------------------------------------------------------------------- 1 | import xlrd #（excel read）来读取Excel文件 2 | import xlwt #（excel write）来生成Excel文件 3 | workbook = xlwt.Workbook() # 新建一个工作簿 4 | sheet = workbook.add_sheet("sheet_name") # 在工作簿中新建一个表格 5 | def write_excel_xls(path,value,inum): 6 | index = len(value) # 获取需要写入数据的行数 7 | # print("index is",index) 8 | for num in range(0, index): 9 | sheet.write(inum,num,value[num]) 10 | # for i in range(0, index): 11 | # for j in range(0, len(value[i])): 12 | # sheet.write(i, j, value[i][j]) # 像表格中写入数据（对应的行和列） 13 | 14 | print("xls格式表格写入数据成功！") 15 | 16 | # coding=UTF-8 17 | file_folder= "C:/python/" 18 | file_name="stock_workbook.xls" 19 | # 
file_source="stock.txt" 20 | 21 | # print(file_foler+file_name) 22 | 23 | ######################## 24 | import urllib.request 25 | import re 26 | import os 27 | import time 28 | import random 29 | 30 | f=open(file_folder+file_name) 31 | stock = [] 32 | for line in f.readlines(): 33 | #print(line,end = '') 34 | line = line.replace('\n','') 35 | stock.append(line) 36 | #print(stock) 37 | f.close() 38 | print('stock is',stock[:5]) 39 | iinum = 1 40 | for each in stock: 41 | # print(each[2:]) 42 | each1=each[2:8] 43 | print('1股票代码',each) 44 | url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each1+'/page_type/ndbg.phtml' 45 | # url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each+'/page_type/ndbg.phtml' 46 | req = urllib.request.Request(url) 47 | print('2',each) 48 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0') 49 | page = urllib.request.urlopen(req) 50 | time.sleep(random.random() * 3) 51 | print('3',each) 52 | try: 53 | html = page.read().decode('gbk') 54 | target = r'&id=[_0-9_]{7}' 55 | print('4',each) 56 | # target = r'&id=[_0-9_]{6}' 57 | target_list = re.findall(target,html) 58 | # os.mkdir('./') 59 | # os.mkdir('./'+each) 60 | # os.mkdir('./'+'electricity-'+each) 61 | sid = each1 62 | if len(target_list)>0: 63 | print('5',each1,'2018和2015年target_list',target_list) 64 | print("6sid is",sid) 65 | year=2017 66 | each2=target_list[1] 67 | # year=2018 68 | # each2=target_list[0] 69 | target_url='http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='+sid+each2 70 | print('--8--',each1,year,'年报详情链接target_url,',target_url) 71 | 72 | url=target_url 73 | # 开始爬虫 74 | import requests 75 | headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} 76 | # print(url) 77 | response = requests.get(url, headers=headers) 78 | response.encoding = 'gbk' # 解决乱码问题 79 | html_text = response.text 80 | # 开始爬虫 81 | 82 | import lxml 83 | 
from lxml import etree 84 | selector = etree.HTML(html_text) 85 | # title = selector.xpath('//table/tr/td//text()')#2018 86 | title = selector.xpath('//tbody/tr/td//text()')#2017 87 | # 去除其中的冒号 88 | title_1 = [] 89 | for i in title: 90 | i = i.strip() 91 | i = i.strip('：') 92 | # title_1.append(i) 93 | for j in i.split(): 94 | print("--------------",j) 95 | title_1.append(j) 96 | # title_1.append(i) 97 | print("title_1", title_1[00:1500]) 98 | time_now=int(time.time()) 99 | print(type(time_now)) 100 | time_now=str(time_now) 101 | print(type(time_now)) 102 | print(time_now) 103 | path1 =file_folder+time_now+"_"+str(year)+file_name 104 | print(path1) 105 | for title in range(len(title_1)): 106 | if title_1[title].find('境内会计师事务所名称')>-1: 107 | for i in range(20): 108 | print(i,"境内会计师事务所名称",title_1[title+i]) 109 | # 存入excel 110 | write_excel_xls(path1, [each[0:7], each[8:], target_url, title_1[title],title_1[title+1],title_1[title+2],title_1[title+3],title_1[title+4],title_1[title+5],title_1[title+6],title_1[title+7],title_1[title+8],title_1[title+9],title_1[title+10],title_1[title+11],title_1[title+12]], iinum) 111 | workbook.save(path1) # 保存工作簿 112 | # 存入excel 113 | 114 | # 去除其中的冒号 115 | 116 | 117 | iinum=iinum+1 118 | except: 119 | print('年报列表页面编码错误;',path1,str(year)+title_1[0]) -------------------------------------------------------------------------------- /003_getting_audit_firm_name_and_pay_2017.py: -------------------------------------------------------------------------------- 1 | import xlrd #（excel read）来读取Excel文件 2 | import xlwt #（excel write）来生成Excel文件 3 | workbook = xlwt.Workbook() # 新建一个工作簿 4 | sheet = workbook.add_sheet("sheet_name") # 在工作簿中新建一个表格 5 | def write_excel_xls(path,value,inum): 6 | index = len(value) # 获取需要写入数据的行数 7 | # print("index is",index) 8 | for num in range(0, index): 9 | sheet.write(inum,num,value[num]) 10 | # for i in range(0, index): 11 | # for j in range(0, len(value[i])): 12 | # sheet.write(i, j, value[i][j]) # 像表格中写入数据（对应的行和列） 13 
| 14 | print("xls格式表格写入数据成功！") 15 | 16 | # coding=UTF-8 17 | file_folder= "C:/python/" 18 | file_name="stock_workbook.xls" 19 | # file_source="stock.txt" 20 | 21 | # print(file_foler+file_name) 22 | 23 | ######################## 24 | import urllib.request 25 | import re 26 | import os 27 | import time 28 | import random 29 | 30 | f=open(file_folder+file_name) 31 | stock = [] 32 | for line in f.readlines(): 33 | #print(line,end = '') 34 | line = line.replace('\n','') 35 | stock.append(line) 36 | #print(stock) 37 | f.close() 38 | print('stock is',stock[:5]) 39 | iinum = 1 40 | for each in stock: 41 | # print(each[2:]) 42 | each1=each[2:8] 43 | print('1股票代码',each) 44 | url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each1+'/page_type/ndbg.phtml' 45 | # url='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each+'/page_type/ndbg.phtml' 46 | req = urllib.request.Request(url) 47 | print('2',each) 48 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0') 49 | page = urllib.request.urlopen(req) 50 | time.sleep(random.random() * 3) 51 | print('3',each) 52 | try: 53 | html = page.read().decode('gbk') 54 | target = r'&id=[_0-9_]{7}' 55 | print('4',each) 56 | # target = r'&id=[_0-9_]{6}' 57 | target_list = re.findall(target,html) 58 | # os.mkdir('./') 59 | # os.mkdir('./'+each) 60 | # os.mkdir('./'+'electricity-'+each) 61 | sid = each1 62 | if len(target_list)>0: 63 | print('5',each1,'2018和2015年target_list',target_list) 64 | print("6sid is",sid) 65 | year=2017 66 | each2=target_list[1] 67 | # year=2018 68 | # each2=target_list[0] 69 | target_url='http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='+sid+each2 70 | print('--8--',each1,year,'年报详情链接target_url,',target_url) 71 | 72 | url=target_url 73 | # 开始爬虫 74 | import requests 75 | headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} 76 | # print(url) 77 | response = 
"""Collect company profiles, auditor data and annual-report PDFs from Sina Finance.

For every stock code in the configured list the script:

1. fetches the company-profile table from Sina,
2. looks up the annual-report list page and extracts the report ids,
3. scans the newest report (treated as 2018) for the domestic audit-firm entry,
4. locates the PDF links of the newest report and of the fourth report
   (treated as 2015) and downloads the latter.
"""
import os
import random
import re
import time
import urllib.request

import lxml
import requests
import xlrd  # (excel read) reads Excel files
import xlwt  # (excel write) creates Excel files
from lxml import etree

workbook = xlwt.Workbook()  # create a new workbook
sheet = workbook.add_sheet("sheet1")  # create a sheet inside the workbook


def write_excel_xls(path, value, inum):
    """Write the sequence *value* into row *inum* of the module-level sheet.

    *path* is accepted for interface compatibility only; saving the workbook
    is left to the caller (``workbook.save(path)``).
    """
    for column, cell in enumerate(value):
        sheet.write(inum, column, cell)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("~~xls格式表格写入数据成功！~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")


##########################################################################
file_folder1 = "C:/Users/wade z shao/Documents/202001Shennandonglu5016#29F/XXX/"
file_folder = file_folder1 + "python/"
# NOTE(review): both assignments of ``file_name`` and of the stock-list file
# were commented out, so the script stopped with NameError before doing any
# work. The last of the two commented-out alternatives is restored here;
# confirm it is the intended input.
# Alternative: file_name = "test.xls" with file_folder1 + 'stock_electricity_3.txt'
file_name = "沪深300.xls"
stock_list_path = file_folder1 + 'sh_sz_300.txt'
##########################################################################

_REQUESTS_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
_URLLIB_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'
_SINA_INTRODUCTION_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/'
_SINA_REPORT_LIST_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'
_SINA_REPORT_DETAIL_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='
_REPORT_ID_PATTERN = re.compile(r'&id=[_0-9_]{7}')
_PDF_URL_PATTERN = re.compile('http://file.finance.sina.com.cn/211.154.219.97:9494/.*?PDF')
# Network failures (urllib and requests errors derive from OSError), pages
# that are not valid GBK, and HTML that lxml cannot parse.
_SCRAPE_ERRORS = (OSError, UnicodeDecodeError, etree.LxmlError)


def _read_stock_codes(path):
    """Return the lines of the stock-list file without their newlines."""
    with open(path) as stock_file:
        return [line.replace('\n', '') for line in stock_file]


def _fetch_table_cells(url):
    """Return the stripped text of every table cell on the page at *url*."""
    response = requests.get(url, headers=_REQUESTS_HEADERS)
    response.encoding = 'gbk'  # the pages are GBK encoded; avoids garbled text
    root = etree.HTML(response.text)
    if root is None:  # lxml may yield no tree for an empty document
        return []
    # Strip surrounding whitespace and the full-width colon after labels.
    return [cell.strip().strip('：') for cell in root.xpath('//table/tr/td//text()')]


def _read_page(url):
    """Download *url* with a browser user agent and decode it as GBK."""
    req = urllib.request.Request(url)
    req.add_header('User-Agent', _URLLIB_USER_AGENT)
    with urllib.request.urlopen(req) as page:
        return page.read().decode('gbk')


def _handle_report(each1, year, detail_url, target_url, local, labels, pause, save):
    """Find the PDF link on a report detail page and optionally download it.

    *labels* holds the three prefixes of the progress messages, *pause* is the
    upper bound in seconds of the random delay after the page request, and
    *save* decides whether the PDF is actually written to *local*.
    """
    url_label, local_label, done_label = labels
    time.sleep(random.random() * 8)
    try:
        thtml = _read_page(detail_url)
    except (OSError, UnicodeDecodeError):
        print(each1, str(year) + '年报下载页面编码错误;' + target_url)
        return
    time.sleep(random.random() * pause)
    file_url = _PDF_URL_PATTERN.search(thtml)
    if file_url is None:
        # The original reached the same message through an AttributeError.
        print(each1, str(year) + '年报下载页面编码错误;' + target_url)
        return
    print(url_label, file_url.group(0))
    print(local_label, local)
    print(done_label, each1, 'local', local)
    if not save:
        return
    try:
        # The target directory was never created before, so every download
        # failed and was reported as a dead PDF link.
        os.makedirs(os.path.dirname(local), exist_ok=True)
        urllib.request.urlretrieve(file_url.group(0), local, None)
    except OSError:
        print(str(year) + 'PDF失效;' + target_url)


def _process_stock(each, iinum):
    """Scrape everything for one stock-list entry and return the updated row counter."""
    # The notebook output for this script shows entries such as 'sz600600':
    # a two-letter exchange prefix followed by the six-digit code.
    each1 = each[2:8]
    print('1', each)
    # Sina Finance pages. (NetEase, Xueqiu, Eastmoney and ifeng URLs were also
    # assembled in the original but never requested.)
    url_sina_introduction = _SINA_INTRODUCTION_URL + each1 + '.phtml'
    url_sina_report = _SINA_REPORT_LIST_URL + each1 + '/page_type/ndbg.phtml'

    print('###########公司信息##############################')
    title_1_sina_introduction = _fetch_table_cells(url_sina_introduction)
    print("title_1_sina_introduction", title_1_sina_introduction[0:4])
    print('###########公司信息##############################')
    print('###########年报审计的会计师事务所#################')

    print('2', each)
    html = _read_page(url_sina_report)
    time.sleep(random.random() * 15)
    print('3', each)

    # Sniff the report ids from the list page.
    target_list = _REPORT_ID_PATTERN.findall(html)
    target_url = _SINA_REPORT_DETAIL_URL + each1
    report_dir = './' + file_name + each[2:] + '/'

    if len(target_list) > 0:
        # NOTE(review): assumes the list is ordered newest first, so index 0
        # is the 2018 report and index 3 the 2015 report - confirm.
        year = 2018
        target_url_2018 = target_url + target_list[0]
        print('----2018----', each1, year, '年报链接,', target_url_2018)

        # Scrape the audit firm from the report text.
        time.sleep(random.random() * 15)
        title_1 = []
        for cell in _fetch_table_cells(target_url_2018):
            title_1.extend(cell.split())
        time_now = str(int(time.time()))[0:7]
        path1 = file_folder + time_now + "_" + str(year) + file_name + '.xls'
        print(path1)
        for word in title_1:
            if '境内会计师事务所名称' in word:
                print(0, word)
                # The Excel export (write_excel_xls + workbook.save(path1))
                # was disabled in the original and stays disabled here.
                # NOTE(review): the row counter is advanced per match; the
                # flattened source does not show the original nesting.
                iinum = iinum + 1

        # Locate the 2018 PDF; the download itself was disabled in the original.
        _handle_report(
            each1, 2018, target_url_2018, target_url,
            report_dir + each[2:] + '2018年年报' + '.pdf',
            ("--9--file_url is", "10 local is", '11'),
            pause=3, save=False,
        )

    if len(target_list) > 3:
        print('----2015----', each1, '2015年target_list', target_list[3])
        _handle_report(
            each1, 2015, target_url + target_list[3], target_url,
            report_dir + each[2:] + '2015年年报' + '.pdf',
            ("--2015--file_url is", "2015 local is", '2015'),
            pause=8, save=True,
        )
    return iinum


def main():
    """Process every stock in the configured list, continuing after per-stock failures."""
    stock = _read_stock_codes(stock_list_path)
    print('stock is', stock[:5])
    iinum = 1
    for each in stock:
        print("**********************************")
        print("******", iinum, "******", len(stock))
        print("**********************************")
        try:
            iinum = _process_stock(each, iinum)
        except _SCRAPE_ERRORS as err:
            # The original handler printed variables that could be unbound
            # (path1, year, title_1); report the actual error instead.
            print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
            print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
            print(each, '年报列表页面编码错误;', err)
            print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
            print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')


if __name__ == "__main__":
    main()
"outputs": [], 29 | "source": [ 30 | "import xlrd #（excel read）来读取Excel文件\n", 31 | "import xlwt #（excel write）来生成Excel文件\n", 32 | "workbook = xlwt.Workbook() # 新建一个工作簿\n", 33 | "sheet = workbook.add_sheet(\"sheet1\") # 在工作簿中新建一个表格\n", 34 | "def write_excel_xls(path,value,inum):\n", 35 | " index = len(value) # 获取需要写入数据的行数\n", 36 | " # print(\"index is\",index)\n", 37 | " for num in range(0, index):\n", 38 | " sheet.write(inum,num,value[num])\n", 39 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")\n", 40 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")\n", 41 | " print(\"~~xls格式表格写入数据成功！~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")\n", 42 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")\n", 43 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "tags": [] 51 | }, 52 | "outputs": [ 53 | { 54 | "output_type": "stream", 55 | "name": "stdout", 56 | "text": "C:/Temp/stock.txt\n" 57 | } 58 | ], 59 | "source": [ 60 | "# coding=UTF-8\n", 61 | "file_folder1= \"C:/\"\n", 62 | "file_folder= file_folder1+\"Temp/stock.txt\"\n", 63 | "# file_name=\"test.xls\";f=open(file_folder1+'stock_electricity_3.txt')\n", 64 | "# file_name=\"沪深300.xls\";f=open(file_folder1+'sh_sz_300.txt')\n", 65 | "print(file_folder)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "f=open(file_folder)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import urllib.request\n", 84 | "import re\n", 85 | "import os\n", 86 | "import time\n", 87 | "import random\n", 88 | "import requests" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 8, 94 | "metadata": { 95 | "tags": [] 96 | }, 97 | "outputs": [ 98 | { 99 | "output_type": "stream", 100 | 
"name": "stdout", 101 | "text": "stock is ['sz600600', 'sh000123', 'sh000449', 'sh000666']\n" 102 | } 103 | ], 104 | "source": [ 105 | "stock = []\n", 106 | "for line in f.readlines():\n", 107 | " #print(line,end = '')\n", 108 | " line = line.replace('\\n','')\n", 109 | " stock.append(line)\n", 110 | "f.close()\n", 111 | "print('stock is',stock[:5])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "iinum = 1\n", 121 | "for each in stock:\n", 122 | "# for each in stock:\n", 123 | " print(\"**********************************\")\n", 124 | " print(\"******\",iinum,\"******\",len(stock))\n", 125 | " print(\"**********************************\")\n", 126 | " # print(each[2:])\n", 127 | " each1=each[2:8]\n", 128 | " print('1',each)\n", 129 | " #新浪行情中心#新浪行情中心#新浪行情中心\n", 130 | " url_sina_introduction='http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/'+each1+'.phtml'\n", 131 | " url_sina_report='http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'+each1+'/page_type/ndbg.phtml'\n", 132 | " #网易财经#网易财经#网易财经#网易财经\n", 133 | " url_netease_index='http://quotes.money.163.com/'+each1+'.html' #网易个股主页\n", 134 | " url_netease_introduction='http://quotes.money.163.com/f10/gszl_'+each1+'.html#11c01#' #网易个股公司简介\n", 135 | " url_netease_report='http://quotes.money.163.com/f10/gsgg_'+each1+',dqbg.html' #网易公司定期报告，无法下载，但是有详细全文内容很全\n", 136 | " #网易财经如宁波韵升( 600366) 公司公告http://quotes.money.163.com/f10/gsgg_600366,dqbg.html\n", 137 | " url_xueqiu_index=\"https://xueqiu.com/S/\"+each[0:8]\n", 138 | " url_eastmoney_report=\"http://data.eastmoney.com/notices/stock/300644.html\"#年报混编了，不方便下载\n", 139 | " url_ifeng='http://app.finance.ifeng.com/data/stock/dqbg.php?symbol='+each1#凤凰财经的年报链接，不过无法访问了暂时\n", 140 | " print('###########公司信息##############################')\n", 141 | " headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}\n", 142 | " # 
print(url)\n", 143 | " response_sina_introduction = requests.get(url_sina_introduction, headers=headers)\n", 144 | " response_sina_introduction.encoding = 'gbk' # 解决乱码问题\n", 145 | " html_text_sina_introduction = response_sina_introduction.text\n", 146 | " # 开始爬虫# 开始爬虫# 开始爬虫# 开始爬虫# 开始爬虫\n", 147 | " import lxml\n", 148 | " from lxml import etree\n", 149 | " selector = etree.HTML(html_text_sina_introduction)\n", 150 | " title_sina_introduction = selector.xpath('//table/tr/td//text()')\n", 151 | " # 去除其中的冒号# 去除其中的冒号\n", 152 | " title_1_sina_introduction = []\n", 153 | " for i in title_sina_introduction:\n", 154 | " i = i.strip()\n", 155 | " i = i.strip('：')\n", 156 | " title_1_sina_introduction.append(i)\n", 157 | " print(\"title_1_sina_introduction\", title_1_sina_introduction[0:4])\n", 158 | " # 去除其中的冒号# 去除其中的冒号\n", 159 | " print('###########公司信息##############################')\n", 160 | " print('###########年报审计的会计师事务所#################')\n", 161 | "\n", 162 | " req = urllib.request.Request(url_sina_report)\n", 163 | " print('2',each)\n", 164 | " req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')\n", 165 | " page = urllib.request.urlopen(req)\n", 166 | "#### time.sleep(random.random() * 15)\n", 167 | " time.sleep(random.random() * 15)\n", 168 | "#### time.sleep(random.random() * 15)\n", 169 | " print('3',each)\n", 170 | "\n", 171 | "\n", 172 | " #嗅探年报的网站链接 #嗅探年报的网站链接\n", 173 | " try:\n", 174 | " html = page.read().decode('gbk');target = r'&id=[_0-9_]{7}';target_list = re.findall(target,html)\n", 175 | " sid = each1;target_url = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid=' + sid\n", 176 | " if len(target_list)>0:\n", 177 | " year = 2018;each2018 = target_list[0];target_url_2018=target_url+each2018\n", 178 | " print('----2018----',each1,year,'年报链接,',target_url_2018)\n", 179 | " # 嗅探年报的网站链接 #嗅探年报的网站链接\n", 180 | "\n", 181 | " ### 爬取会计师事务所### 爬取会计师事务所\n", 182 | " #### 
time.sleep(random.random() * 15)\n", 183 | " time.sleep(random.random() *15)\n", 184 | " #### time.sleep(random.random() * 15)\n", 185 | " import requests\n", 186 | " headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}\n", 187 | " response = requests.get(target_url_2018, headers=headers)\n", 188 | " response.encoding = 'gbk' # 解决乱码问题\n", 189 | " html_text = response.text\n", 190 | " import lxml\n", 191 | " from lxml import etree\n", 192 | " selector = etree.HTML(html_text)\n", 193 | " title = selector.xpath('//table/tr/td//text()')#2018年会计师事务所\n", 194 | " # title = selector.xpath('//tbody/tr/td//text()')#2017\n", 195 | " # 去除其中的冒号\n", 196 | " title_1 = []\n", 197 | " for i in title:\n", 198 | " i = i.strip()\n", 199 | " i = i.strip('：')\n", 200 | " # title_1.append(i)\n", 201 | " for j in i.split():\n", 202 | " # print(\"--------------\",j)\n", 203 | " title_1.append(j)\n", 204 | " # title_1.append(i)\n", 205 | " # print(\"title_1\", title_1[00:1500])\n", 206 | " time_now=int(time.time())\n", 207 | " time_now=str(time_now)[0:7]\n", 208 | " path1 =file_folder+time_now+\"_\"+str(year)+file_name+'.xls'\n", 209 | " print(path1)\n", 210 | " for title in range(len(title_1)):\n", 211 | " if title_1[title].find('境内会计师事务所名称')>-1:\n", 212 | " for i in range(1):\n", 213 | " # for i in range(20):\n", 214 | " print(i,title_1[title+i])\n", 215 | " # 存入excel\n", 216 | " # write_excel_xls(path1, [each[0:7], each[8:], year,target_url, title_1[title],title_1[title+1],title_1[title+2],title_1[title+3],title_1[title+4],title_1[title+5],title_1[title+6],title_1[title+7],title_1[title+8],title_1[title+9],title_1[title+10],title_1[title+11],title_1[title+12]]+title_1_sina_introduction, iinum)\n", 217 | " # workbook.save(path1) # 保存工作簿\n", 218 | " # 存入excel\n", 219 | " iinum=iinum+1\n", 220 | " ### 爬取会计师事务所### 爬取会计师事务所\n", 221 | "\n", 222 | "\n", 223 | "\n", 224 | " ###下载年报 ###下载年报 ###下载年报\n", 225 | " # if len(target_list) > 0:\n", 226 | " # year = 2018;\n", 227 | " 
# each2018 = target_list[0];\n", 228 | " # target_url_2018 = target_url + each2018\n", 229 | " # os.mkdir('./'+file_name+each[2:])\n", 230 | " time.sleep(random.random() * 8)\n", 231 | " treq = urllib.request.Request(target_url_2018)\n", 232 | " treq.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')\n", 233 | " tpage = urllib.request.urlopen(treq)\n", 234 | " time.sleep(random.random() * 3)\n", 235 | " try:\n", 236 | " thtml = tpage.read().decode('gbk')\n", 237 | " #print(thtml)\n", 238 | " file_url = re.search('http://file.finance.sina.com.cn/211.154.219.97:9494/.*?PDF',thtml)\n", 239 | " print(\"--9--file_url is\",file_url.group(0))\n", 240 | "\n", 241 | " try:\n", 242 | " #print(file_url.group(0))\n", 243 | " local = './'+file_name+each[2:]+'/'+each[2:]+'2018年年报'+'.pdf'\n", 244 | " # local = './'+each+'/'+file_url.group(0).split(\"/\")[-4]+'.pdf'\n", 245 | " # local = './'+each+'/'+file_url.group(0).split(\"/\")[-1]+'.pdf'\n", 246 | " # local = './'+sid+'/'+file_url.group(0).split(\"/\")[-1]+'.pdf'\n", 247 | " print(\"10 local is\",local)\n", 248 | " #调试用作文件占位\n", 249 | " #open(local, 'wb').write(b'success')\n", 250 | " print('11',each1,'local',local)\n", 251 | " # urllib.request.urlretrieve(file_url.group(0),local,None)\n", 252 | " except:\n", 253 | " print('2018PDF失效;'+target_url)\n", 254 | " except:\n", 255 | " print(each1,'2018年报下载页面编码错误;'+target_url)\n", 256 | " if len(target_list)>3:\n", 257 | " print('----2015----',each1,'2015年target_list',target_list[3])\n", 258 | " each2015 = target_list[3];target_url_2015=target_url+each2015\n", 259 | " # year=2017;each2017=target_list[1]\n", 260 | " time.sleep(random.random() * 8)\n", 261 | " treq_2015 = urllib.request.Request(target_url_2015)\n", 262 | " treq_2015.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')\n", 263 | " tpage = urllib.request.urlopen(treq_2015)\n", 264 | " time.sleep(random.random() * 8)\n", 265 | " try:\n", 
import csv
import os
import random
import re
import time


def _ensure_parent_dir(filename):
    """Create the directory that will contain *filename* if it is missing."""
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)


def sleep(n):
    """Pause for a random duration between 0 and *n* seconds."""
    print('~~~~~~缓冲中~~~~~loading~~~~~~~~')
    time.sleep(random.random() * n)


def time_now():
    """Return the current local time formatted as ``YYYY-MM-DD-HHMMSS``."""
    return time.strftime('%Y-%m-%d-%H%M%S', time.localtime(time.time()))


def html_save(data, filename=None):
    """Write the string *data* to an HTML file (UTF-8, overwriting).

    *filename* defaults to a timestamped file under ``C:/data/html/``.
    """
    if filename is None:
        filename = 'C:/data/html/' + str(time_now()) + '.html'
    _ensure_parent_dir(filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(data)
    print("保存", filename, "文件成功")


def text_save(data, filename=None):
    """Append the string *data* to a text file (UTF-8).

    *filename* defaults to a timestamped file under ``C:/data/text/``.
    """
    if filename is None:
        filename = 'C:/data/text/' + str(time_now()) + '.txt'
    _ensure_parent_dir(filename)
    # ``with`` closes the handle even when the write fails.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(data)
    print("保存", filename, "文件成功")


def write_csv(data, filename=None):
    """Write *data* as a single CSV row (UTF-8, overwriting).

    A plain string is written as one field; previously it was split into one
    column per character and the file handle was never closed. *filename*
    defaults to a timestamped file under ``C:/data/csv/``.
    """
    if filename is None:
        filename = 'C:/data/csv/' + str(time_now()) + '.csv'
    _ensure_parent_dir(filename)
    row = [data] if isinstance(data, str) else data
    # newline='' is what the csv module expects from the file it writes to.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(row)
    print("保存", filename, "文件成功")


def headers():
    """Return request headers carrying a randomly chosen User-Agent string."""
    USER_AGENT_LIST = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
    return {'user-agent': random.choice(USER_AGENT_LIST)}


def get_521_content(url):
    """Fetch *url* and return ``(script_text, cookie_header)``.

    ``cookie_header`` joins the response cookies as ``name=value; name=value``.
    """
    import requests

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    req = requests.get(url, headers=request_headers)
    cookies = '; '.join(['='.join(item) for item in req.cookies.items()])
    # NOTE(review): the pattern was an empty string, which always produced ''.
    # It is presumed to have been the inline-script extractor below (the tag
    # text looks stripped from the source) - confirm against the target site.
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', req.text))
    print('txt_521', txt_521)
    print('cookies', '\n', cookies)
    return (txt_521, cookies)


def COOKIES(url):
    """Open *url* in Chrome via Selenium and return its cookies as ``name=value;`` pairs."""
    from selenium import webdriver

    driver = webdriver.Chrome()
    try:
        # The original requested the literal string "url" instead of the argument.
        driver.get(url)
        return ''.join(c['name'] + '=' + c['value'] + ';' for c in driver.get_cookies())
    finally:
        # Previously unreachable after ``return``, which leaked the browser.
        driver.quit()


def requested(url, xpath1, *keywords):
    """Download *url*, archive the page and return the nodes matching *xpath1*.

    The page text is archived through ``write_csv``, ``html_save`` and
    ``text_save``. When *keywords* are given, every result containing at least
    one of them is printed. Returns the list of XPath results, or ``None``
    (after a short random pause) when nothing matched.
    """
    import requests
    from lxml import etree

    response = requests.get(url, headers=headers())
    if response.status_code != 200:
        print(response)
    # Fix the encoding before reading ``text`` so the archived copies are not garbled.
    response.encoding = response.apparent_encoding
    page_text = response.text
    write_csv(page_text)
    html_save(page_text)
    text_save(page_text)

    items = etree.HTML(page_text).xpath(xpath1)
    print('共找到', len(items), '条符合条件的结果')
    if not items:
        print(url, '未找到结果')
        sleep(5)
        return None

    if keywords:
        matched = False
        for i, item in enumerate(items):
            # ``keywords`` is a tuple; str.find(tuple) raised TypeError before.
            text = str(item)
            if any(keyword in text for keyword in keywords):
                matched = True
                print(i, len(items), keywords, item)
        if not matched:
            print(keywords, 'not found未找到')
    return items


##############################################################################################
############# bidchance.com tender site #################
###############################################################################################
# # url='http://www.bidchance.com/freesearch.do?filetype=&channel=zhongbiao&currentpage=1&searchtype=&queryword=&displayStyle=&pstate=&field=&leftday=&province=&bidfile=&project=&heshi=&recommend=&field=&jing=&starttime=&endtime=&attachment='
# # xpath1='//text()'
requested(url,xpath1) 141 | # url='http://www.bidchance.com/freesearch.do?filetype=&channel=&channels=zhongbiao¤tpage=1&searchtype=&queryword=%C4%DA%BF%D8&displayStyle=&pstate=&field=title&leftday=&province=&bidfile=&project=&heshi=&recommend=&field=title&jing=&starttime=&endtime=&attachment=' 142 | # ##内控标题搜索 143 | # xpath1='//div//tr/td//text()' 144 | # get_521_content(url) 145 | # requested(url,xpath1) 146 | 147 | 148 | ############################################################################################## 149 | ############# 铁路12306 ################# 150 | ############################################################################################## 151 | # url='https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc' 152 | # xpath1='//a//text()' 153 | # requested(url,xpath1) 154 | # 转到新的文件里面了 155 | 156 | 157 | 158 | ############################################################################################## 159 | ############# 京东 ################# 160 | ############################################################################################### 161 | # url='https://www.jd.com/' 162 | # xpath1='//a//text()' 163 | # requested(url,xpath1) 164 | 165 | ############################################################################################## 166 | ############# 淘宝 ################# 167 | ############################################################################################### 168 | # url='https://www.taobao.com/' 169 | # xpath1='//a//text()' 170 | # requested(url,xpath1) 171 | 172 | ############################################################################################## 173 | ############# 36kr ################# 174 | ############################################################################################### 175 | url='https://www.36kr.com/' 176 | xpath1='//a//text()' 177 | requested(url,xpath1) 178 | 179 | ############################################################################################## 180 | ############# 虎嗅网 
################# 181 | ############################################################################################### 182 | # 183 | # url='https://m.huxiu.com/' 184 | # xpath1='//div/h1//text()' 185 | # requested(url,xpath1) 186 | ############################################################################################## 187 | ############# 豆瓣网 ################# 188 | ############################################################################################### 189 | # url='https://book.douban.com/' 190 | # xpath1='//div/h4/text()'# 191 | # requested(url,xpath1) 192 | 193 | ############################################################################################## 194 | ############# 新浪网 ################# 195 | ############################################################################################## 196 | # url='https://sports.sina.cn/?vt=4&pos=108' 197 | # xpath1='//h2//em/text()' 198 | # requested(url,xpath1) 199 | 200 | ############################################################################################## 201 | ############# 应届生求职网 ################# 202 | ############################################################################################## 203 | # url_0='http://www.yingjiesheng.com/' 204 | # # city='jinan';url_1='job/list_'#济南 205 | # city='shenzhen';url_1='-morejob-' 206 | # url_2='.html' 207 | # for i in range(10): 208 | # print(city,'-------------',i+1,'--------------------') 209 | # url=url_0+city+url_1+str(i+1)+url_2 210 | # xpath1='//td/a//text()' 211 | # xpath_date = '//td[contains(@class,"date cen")]/text()' 212 | # # xpath_date='//td[@class="date cen"]/text()' 213 | # xpath_url='//td[@class="item1"]/a/@href' 214 | # requested(url,xpath1,'风险') 215 | # requested(url, xpath1, '风控') 216 | # requested(url, xpath1, '数据') 217 | # # requested(url,xpath_date) 218 | # # requested(url,xpath_url) 219 | # sleep(10) 220 | 221 | 222 | ############################################################################################## 223 | 
############# 豆瓣网 ################# 224 | ############################################################################################## 225 | # import requests 226 | # import ssl 227 | # from lxml import etree 228 | # 229 | # ssl._create_default_https_context = ssl._create_unverified_context 230 | # 231 | # session = requests.Session() 232 | # for id in range(0, 251, 25): 233 | # URL = 'https://movie.douban.com/top250/?start=' + str(id) 234 | # req = session.get(URL) 235 | # print(req) 236 | # # 设置网页编码格式 237 | # req.encoding = 'utf8' 238 | # # 将request.content 转化为 Element 239 | # root = etree.HTML(req.content) 240 | # # 选取 ol/li/div[@class="item"] 不管它们在文档中的位置 241 | # items = root.xpath('//ol/li/div[@class="item"]') 242 | # for item in items: 243 | # # 注意可能只有中文名，没有英文名；可能没有quote简评 244 | # rank, name, alias, rating_num, quote, url = "", "", "", "", "", "" 245 | # try: 246 | # url = item.xpath('./div[@class="pic"]/a/@href')[0] 247 | # rank = item.xpath('./div[@class="pic"]/em/text()')[0] 248 | # title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()') 249 | # name = title[0].encode('gb2312', 'ignore').decode('gb2312') 250 | # alias = title[1].encode('gb2312', 'ignore').decode('gb2312') if len(title) == 2 else "" 251 | # rating_num = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0] 252 | # quote_tag = item.xpath('.//div[@class="bd"]//span[@class="inq"]') 253 | # if len(quote_tag) is not 0: 254 | # quote = quote_tag[0].text.encode('gb2312', 'ignore').decode('gb2312').replace('\xa0', '') 255 | # # 输出排名，评分，简介 256 | # print(rank, rating_num, quote) 257 | # # 输出中文名，英文名 258 | # print(name.encode('gb2312', 'ignore').decode('gb2312'), 259 | # alias.encode('gb2312', 'ignore').decode('gb2312').replace('/', ',')) 260 | # except: 261 | # print('faild!') 262 | # pass 263 | # ———————————————— 264 | # 版权声明：本文为CSDN博主「jeikerxiao」的原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接及本声明。 265 | # 原文链接：https://blog.csdn.net/jeikerxiao/article/details/73530529 266 | 
##############################################################################################
############# Sina Weibo #################
##############################################################################################
# import requests
# headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
# url="https://weibo.com"
# res = requests.get(url,headers=headers)
# print(res)
# xpath1='//div//text()'
# requested(url,xpath1)
# ————————————————
# Copyright notice: original article by CSDN blogger "阿叶_", licensed under CC 4.0 BY-SA;
# any reproduction must include the original source link and this notice.
# Original link: https://blog.csdn.net/weixin_43902320/article/details/104342771
# --------------------------------------------------------------------------------
# /005_getting_internal_control_firm_name_and_pay.py:
# --------------------------------------------------------------------------------
# coding=UTF-8
# For every stock in the list file, open the first report linked on its Sina Finance
# annual-report page and store the tokens following the internal-control auditor
# heading as one row of an .xls workbook.
import random
import re
import time
import urllib.request

file_folder = 'C:/Users/wade z shao/Documents/GitHub/yunzhuan/Financial-report-acquisition-and-data-processing-with-Python/'
file_code = '5'
file_source = 'stock_list/' + file_code + ".txt"
file_name = "internal_control_firm" + file_code + '.xls'

# Both are created in main(); write_excel_xls() writes into this module-level sheet.
workbook = None
sheet = None


def write_excel_xls(path, value, inum):
    """Write the items of *value* into consecutive columns of row *inum*.

    Cells go to the module-level ``sheet``. *path* is kept for backward
    compatibility but is not used here: the caller saves the workbook.
    """
    for column, cell in enumerate(value):
        sheet.write(inum, column, cell)
    print("xls格式表格写入数据成功！")


def main():
    """Scrape the internal-control auditor section for every listed stock."""
    global workbook, sheet
    # Third-party imports are local so that importing this module has no side effects.
    import requests
    import xlwt
    from lxml import etree

    print(file_folder + file_name)
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(file_code)

    with open(file_folder + file_source) as f:
        stock = [line.replace('\n', '') for line in f]
    print('stock is', stock[:5])

    year = 2018
    # The output name (prefixed with the first 7 digits of the epoch time) is fixed
    # once per run; computing it per stock made a long run spill into several files.
    path1 = file_folder + str(int(time.time()))[0:7] + "_" + str(year) + file_name
    # (marker searched in the report text, label used in the progress output)
    markers = (
        ('内部控制审计会计师', "境内会计师事务所名称"),
        ('内部控制审计机构', "内部控制审计机构"),
    )
    iinum = 1
    for each in stock:
        each1 = each[2:8]
        print('1股票代码', each)
        url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/' + each1 + '/page_type/ndbg.phtml'
        print(url)
        try:
            req = urllib.request.Request(url)
            print('2', each)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
            page = urllib.request.urlopen(req)
            time.sleep(random.random() * 10)
            print('3', each)
            html = page.read().decode('gbk')
            target = r'&id=[_0-9_]{7}'
            print('4', each)
            target_list = re.findall(target, html)
            print('5', target_list)
            if not target_list:
                continue
            sid = each1
            print('5', each1, '2018和2015年target_list', target_list)
            print("6sid is", sid)
            each2 = target_list[0]  # first link on the list page
            target_url = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid=' + sid + each2
            print('--8--', each1, year, '年报详情链接target_url,', target_url)

            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            response = requests.get(target_url, headers=headers)
            response.encoding = 'gbk'  # the page is GBK-encoded
            selector = etree.HTML(response.text)
            # Split every table-cell text into whitespace-separated tokens,
            # dropping surrounding full-width colons.
            title_1 = []
            for cell in selector.xpath('//table/tr/td//text()'):
                title_1.extend(cell.strip().strip('：').split())
            print(path1)

            title_1_firm = [0] * 21  # placeholder row when no marker is found
            for pos, token in enumerate(title_1):
                for marker, label in markers:
                    if marker in token:
                        # Slicing keeps whatever is left near the end of the page
                        # instead of raising IndexError and losing the whole row.
                        title_1_firm = title_1[pos:pos + 20]
                        for i, item in enumerate(title_1_firm):
                            print(i, label, item)
                        print(title_1_firm[-1])
                        break
            # each[:8] is the full exchange-prefixed code; the former each[0:7]
            # cut off its last digit.
            write_excel_xls(path1, [each[:8], each[8:], target_url] + title_1_firm, iinum)
            workbook.save(path1)
            iinum = iinum + 1
        except Exception as exc:
            # One bad stock must not abort the batch; report it and move on.
            print('年报列表页面编码错误;', each, exc)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /006_getting_company_information_updated_on_2020_03_19.py:
# --------------------------------------------------------------------------------
# Collect company profile fields and the audit firm named in the first linked
# report for the stocks listed in an Excel sheet, one .xls row per stock.
import random
import re
import time
import urllib.request

# Both are created in main(); write_excel_xls() writes into this module-level sheet.
workbook = None
sheet = None


def sleep(n):
    """Pause for a random duration of up to *n* seconds."""
    time.sleep(random.random() * n)
    print('~~~~~~sleeping~~~~~~~~')


def write_excel_xls(path, value, inum):
    """Write the items of *value* into consecutive columns of row *inum*.

    Cells go to the module-level ``sheet``. *path* is kept for backward
    compatibility but is not used here: the caller saves the workbook.
    """
    for column, cell in enumerate(value):
        sheet.write(inum, column, cell)
    print("xls格式表格写入数据成功！")


def read_txt(file_folder, txt_name):
    """Return the lines of ``file_folder + txt_name`` without their newline."""
    with open(file_folder + txt_name) as f:
        return [line.replace('\n', '') for line in f]


def read_excel(file_folder, excel_name):
    """Return the first-column value of every row in the workbook's first sheet."""
    import xlrd
    data = xlrd.open_workbook(file_folder + excel_name)
    first_sheet = data.sheet_by_index(0)
    stock_list = [first_sheet.row_values(i)[0] for i in range(first_sheet.nrows)]
    print(stock_list)
    return stock_list


def requested(url, xpath1):
    """Fetch *url* and return the nodes selected by *xpath1* (``[]`` for an empty page)."""
    import requests
    from lxml import etree
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    response = requests.get(url, headers=headers)
    sleep(3)
    response.encoding = response.apparent_encoding
    root = etree.HTML(response.text)
    if root is None:
        return []
    return root.xpath(xpath1)


def get_annual_report_url(url_report_list, html=None):
    """Return the detail-page URL of the first report linked on a report-list page.

    url_report_list: list-page URL containing ``/stockid/<id>/``.
    html: list-page text; downloaded from *url_report_list* when omitted.

    The stock id is taken from the URL itself. It used to be read from the
    caller's loop variable ``each``, so the function only worked inside that loop.

    Raises ValueError when the URL holds no stock id and IndexError when the
    page links no report.
    """
    if html is None:
        req = urllib.request.Request(url_report_list)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
        with urllib.request.urlopen(req) as page:
            html = page.read().decode('gbk')
    stock_match = re.search(r'/stockid/([^/]+)/', url_report_list)
    if stock_match is None:
        raise ValueError('no stock id in ' + url_report_list)
    target = r'&id=[_0-9_]{7}'
    target_list = re.findall(target, html)
    if not target_list:
        raise IndexError('no report link found at ' + url_report_list)
    target_url_0 = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid='
    target_url = target_url_0 + stock_match.group(1) + target_list[0]
    print(target_url)
    return target_url


def _values_after_labels(tokens, labels):
    """Return, in page order, the token that follows each token containing a label."""
    cleaned = [token.strip().strip('：') for token in tokens]
    values = []
    # The last token has no successor, so it is never treated as a label.
    for pos, token in enumerate(cleaned[:-1]):
        for label in labels:
            if label in token:
                values.append(cleaned[pos + 1])
    return values


# coding=UTF-8
file_folder = 'C:/Users/wade z shao/Documents/GitHub/yunzhuan/Financial-report-acquisition-and-data-processing-with-Python/'
txt_name = "stock_list/20200320.txt"
excel_name = "stock_list/新能源电池20200320.xlsx"
file_name = 'excel/2020-03-20_' + "光伏发电（新表）20200320" + '_4' + ".xls"


def main():
    """Write profile and audit-firm data for rows 1-25 of the Excel stock list."""
    global workbook, sheet
    import xlwt
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet_name")

    # Only the Excel list is used; the text list used to be read and then
    # immediately overwritten.
    stock_list = read_excel(file_folder, excel_name)
    path1 = file_folder + file_name
    xpath1 = '//table/tr/td//text()'
    intro_labels = ('公司名称', '公司英文名称', '公司网址', '办公地址', '经营范围')

    iinum = 1
    for each in stock_list[1:26]:
        print("----", iinum, "-------", '26', "------", each[:8])
        print(len(each[2:8]), each[2:8])
        if not (each[2:8].isdigit() and len(each[2:8]) > 4):
            continue

        # Company profile page: keep the value following each wanted label.
        url_intro_0 = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/'
        url_intro = url_intro_0 + each[2:8] + '.phtml'
        title_1 = requested(url_intro, xpath1)
        sleep(5)
        title_2 = _values_after_labels(title_1, intro_labels) if title_1 else [0, 0, 0, 0, 0]

        # First report on the list page: audit firm name and fee.
        url_report_list_0 = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/'
        url_report_list = url_report_list_0 + each[2:8] + '/page_type/ndbg.phtml'
        try:
            url_2018_report = get_annual_report_url(url_report_list)
        except IndexError as exc:
            # No report listed for this stock: skip it instead of aborting the run.
            print(exc)
            continue
        report_tokens = [token.strip().strip('：') for token in requested(url_2018_report, xpath1)]
        audit_firm = [0, 0]
        for pos, token in enumerate(report_tokens[:-1]):
            if '境内会计师事务所名称' in token:
                audit_firm = [report_tokens[pos + 1]]  # a new firm name restarts the record
            if '境内会计师事务所报酬' in token:
                audit_firm.append(report_tokens[pos + 1])

        write_excel_xls(path1, [each[:8], each[8:], url_intro, url_2018_report] + audit_firm + title_2, iinum)
        workbook.save(path1)
        iinum = iinum + 1


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /007_spider_for_12306.py:
# --------------------------------------------------------------------------------
#utf-8
def main():
    """Print the raw 12306 ticket query response for the hard-coded date and stations."""
    import requests
    url = 'https://kyfw.12306.cn/otn/leftTicket/queryZ?' \
          'leftTicketDTO.train_date=2020-02-01&' \
          'leftTicketDTO.from_station=SHH&' \
          'leftTicketDTO.to_station=BJP&purpose_codes=ADULT'
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    print(response.text)
    # url='https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc'


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /20200512下载财务报表-网易财经.py:
# --------------------------------------------------------------------------------
def download_financial_table(stock_code, max_retries=5):
    """Download the zcfzb (资产负债表) and xjllb (现金流量表) CSVs for one stock.

    stock_code: list-file entry; characters 2-8 are used as the numeric code and
        the entry minus its last character (with ``*`` stripped from both ends)
        becomes the file-name stem.
        NOTE(review): the last character is presumably a trailing '\\r' - confirm.
    max_retries: attempts before giving up. The original retried forever, so a
        local error such as a missing output folder hammered the server endlessly.

    Returns None. HTTP 404 is treated as "no data" and ends the attempt silently.
    """
    # urllib.request / urllib.error are imported explicitly; a bare
    # ``import urllib`` does not guarantee the submodules are loaded.
    import urllib.error
    import urllib.request
    from time import sleep

    code = str(stock_code[2:8])
    url_zcfzb = 'http://quotes.money.163.com/service/zcfzb_' + code + '.html'
    url_xjllb = 'http://quotes.money.163.com/service/xjllb_' + code + '.html'
    file_stem = 'C://TEMP/Excel/financial_table/' + str(stock_code[0:-1].strip('*'))
    for _ in range(max_retries):
        try:
            content_zcfzb = urllib.request.urlopen(url_zcfzb, timeout=2).read()
            content_xjllb = urllib.request.urlopen(url_xjllb, timeout=2).read()
            sleep(1)
            with open(file_stem + '资产负债表.csv', 'wb') as f:
                f.write(content_zcfzb)
            sleep(1)
            with open(file_stem + '现金流量表.csv', 'wb') as f:
                f.write(content_xjllb)
            print(stock_code)
            sleep(1)
            return
        except urllib.error.HTTPError as e:
            if e.code == 404:  # status code instead of matching the message text
                return
            print(e)
        except Exception as e:
            print(e)
        sleep(1)  # brief pause before the next attempt
###########################################################
# ---------------------- main program ----------------------
###########################################################
def main():
    """Download the financial tables for every stock in the list file (first line skipped)."""
    file_folder = "C://TEMP/Excel/"
    file_name = "stock-list-20200512-electricity.txt"
    stock = []
    # Read as bytes and strip only '\n': download_financial_table() itself drops
    # the last character of each entry, so the entries must keep their shape.
    with open(file_folder + file_name, 'rb') as f:
        for raw in f:
            line = raw.decode('utf-8')
            print(line, end='')
            stock.append(line.replace('\n', ''))
    print('stock is', stock[:5])

    for each in stock[1:]:
        print('~~~~each is', each.strip('*'))
        print(each)
        # str.strip('*ST') treats its argument as a character set and only trims
        # the ends, so a '*' inside the entry survived into the file name.
        # Remove every '*' instead.
        download_financial_table(each.replace('*', ''))


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /20200512获取年报中竞争力和风险信息.py:
# --------------------------------------------------------------------------------
import xlrd  # (excel read) reads Excel files
import xlwt  # (excel write) creates Excel files

# Module-level workbook and sheet: the rest of this script writes to and saves them.
workbook = xlwt.Workbook()  # new workbook
sheet = workbook.add_sheet("sheet_name")  # new sheet in the workbook


def write_excel_xls(path, value, inum):
    """Write the items of *value* into consecutive columns of row *inum*.

    Cells go to the module-level ``sheet``. *path* is kept for backward
    compatibility but is not used here: the caller saves the workbook.
    """
    for column, cell in enumerate(value):
        sheet.write(inum, column, cell)
    print("xls格式表格写入数据成功！")


# coding=UTF-8
file_folder = "C:/Temp/Excel/"
file_name = "20200512" + "stock_report_advantage_risk风电.xls"
# file_source="stock_list/stock_list_advantage_risk.txt"
file_source = 'stock-list-20200512-electricity1.txt'
# print(file_foler+file_name)

########################
import os
import random
import re
import time
import urllib.request


def main():
    """Extract the competitiveness and risk passages of each stock's first linked report.

    Uses ``file_folder``, ``file_name``, ``workbook`` and ``write_excel_xls``
    defined at the top of this script. Each stock produces two rows.
    """
    import requests
    from lxml import etree

    stock = []
    with open(file_folder + file_source, 'rb') as f:
        for raw in f:
            line = raw.decode()
            print(line, end='')
            stock.append(line.replace('\n', ''))
    print(stock)
    print('stock is', stock[1:5])

    year = 2019
    path1 = file_folder + '工作1_' + file_name
    iinum = 1
    for each in stock[1:]:
        each1 = each[2:8]
        print('1股票代码', each)
        url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/' + each1 + '/page_type/ndbg.phtml'
        try:
            req = urllib.request.Request(url)
            print('2', each)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
            page = urllib.request.urlopen(req)
            time.sleep(random.random() * 3)
            print('3', each)
            html = page.read().decode('gbk')
            target = r'&id=[_0-9_]{7}'
            print('4', each)
            target_list = re.findall(target, html)
            if not target_list:
                continue
            sid = each1
            print('5', each1, '2018和2015年target_list', target_list)
            print("6sid is", sid)
            each2 = target_list[0]  # first link on the list page
            target_url = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletinDetail.php?stockid=' + sid + each2
            print('--8--', each1, year, '年报详情链接target_url,', target_url)

            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            response = requests.get(target_url, headers=headers)
            response.encoding = 'gbk'  # the page is GBK-encoded
            selector = etree.HTML(response.text)
            # Paragraph texts with surrounding whitespace and full-width colons removed.
            title_1 = [text.strip().strip('：') for text in selector.xpath('//p//text()')]
            print('###############', path1)

            title_1_advantage = []
            for pos, token in enumerate(title_1):
                # Slices cannot run past the end of the page, unlike the former
                # fixed-count index loops.
                if '报告期内核心竞争力分析' in token:
                    title_1_advantage.extend(title_1[pos:pos + 30])
                    print('###############', title_1_advantage)
                elif '可能面对的风险' in token:
                    title_1_advantage.extend(title_1[pos:pos + 20])
                    print('###############', title_1_advantage)

            # each[:8] is the full exchange-prefixed code; the former each[0:7]
            # cut off its last digit.
            row_head = [each[:8], each[8:], target_url]
            # Split at 20: the former [:19] / [20:] pair silently dropped item 19.
            write_excel_xls(path1, row_head + title_1_advantage[:20], iinum)
            iinum = iinum + 1
            write_excel_xls(path1, row_head + title_1_advantage[20:], iinum)
            workbook.save(path1)
            iinum = iinum + 1
        except Exception as exc:
            # The former handler applied unary '+' to a string and could reference
            # unbound names, so it failed while reporting the failure.
            print('年报列表页面编码错误;', path1, each, exc)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /Acquiring_POI_using_Python.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# The line above had to be the first line of the file, otherwise the Chinese
# characters caused a non-ASCII error.

import json
import string
import time
import urllib.request
from urllib.parse import quote

# The ak must be requested from the Baidu Maps open platform.
# NOTE(review): API keys are committed in source; consider reading them from the environment.
# ak = "Qgnt7lFOgBR2NkDN963RL26zP0USR024"
ak = "C75tUrXIZKbn3QOMB3e9cIx7g9fsMrUo"

# search keyword
query = ["社会福利院"]
page_size = 20
page_num = 0
scope = 1

# Search area:
# lower-left corner  30.379,114.118
# upper-right corner 30.703,114.665
# centre             30.541,114.3915

bounds = [
    [30.379, 114.118, 30.541, 114.3915],
    [30.379, 114.3915, 30.541, 114.665],
    [30.541, 114.118, 30.703, 114.3915],
    [30.541, 114.3915, 30.703, 114.665],
]

# Each rectangle in bounds is split into col_row x col_row cells to stay under
# the 400-result cap per searched area.
col_row = 3


def subdivide_bounds(bounds, col_row):
    """Split each ``[lat0, lon0, lat1, lon1]`` rectangle into col_row x col_row cells."""
    cells = []
    for lat0, lon0, lat1, lon1 in bounds:
        distance_lat = (lat1 - lat0) / col_row
        distance_lon = (lon1 - lon0) / col_row
        for i in range(col_row):
            for j in range(col_row):
                cells.append([
                    lat0 + distance_lat * i,
                    lon0 + distance_lon * j,
                    lat0 + distance_lat * (i + 1),
                    lon0 + distance_lon * (j + 1),
                ])
    return cells


def main():
    """Query the Place API page by page and append the raw responses to results.txt."""
    new_bounds = subdivide_bounds(bounds, col_row)
    queryResults = []

    # NOTE(review): only the first cell is queried - presumably a debugging limit; confirm.
    for bound in new_bounds[0:1]:
        page = 0
        total = 0
        while True:
            # Build the request URL from the search conditions.
            url = "http://api.map.baidu.com/place/v2/search?ak=" + str(ak) + "&output=json&query=" + str(
                query[0]) + "&page_size=" + str(page_size) + "&page_num=" + str(page) + "&bounds=" + str(
                bound[0]) + "," + str(bound[1]) + "," + str(bound[2]) + "," + str(bound[3])
            url = quote(url, safe=string.printable)

            with urllib.request.urlopen(url) as jsonf:
                s = json.loads(jsonf.read())
            page = page + 1
            if page == 1:
                # The first page's total drives paging; an error response
                # without "total" ends the loop after this page.
                total = int(s.get("total", 0))

            queryResults.append(s)
            print("queryResults\n", queryResults)
            # Throttle requests; Baidu Maps requires concurrency below 120.
            time.sleep(1)

            # Stop once the fetched pages cover the total; the former test
            # requested one or two pages beyond the last.
            if page * page_size >= total:
                break
        print("search complete")
        print("output: " + str(bound))
        print("total: " + str(total))
        print("")

    # Write the text as-is in UTF-8. The former encode('utf-8').decode('unicode_escape')
    # round trip turned every non-ASCII character into mojibake.
    with open("results.txt", 'a', encoding='utf-8') as results:
        results.write(str(queryResults))
    print("ALL DONE!")


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /Acquiring_POI_using_Python_updated.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# The line above had to be the first line of the file, otherwise the Chinese
# characters caused a non-ASCII error.

# import urllib.request
# from urllib.parse import quote
# import string
import json
import time


def sleep(n):
    """Pause for *n* seconds."""
    time.sleep(n)
    print("~~~sleeping~~~~")


def requested(url):
    """Wait five seconds, GET *url* and return the decoded JSON body."""
    import requests  # local import keeps module import free of third-party requirements
    sleep(5)
    html = requests.get(url)
    data = html.json()
    return data


def write_txt(file, data):
    """Append *data* to the UTF-8 text file *file*."""
    with open(file, 'a', encoding='utf-8') as f:
        f.write(data)
        print(data, '写入', file, '成功')


def time_now():
    """Return (and print) the local time formatted as YYYY-MM-DD-HHMMSS."""
    stamp = time.strftime('%Y-%m-%d-%H%M%S', time.localtime(time.time()))
    print(stamp)
    return str(stamp)


# NOTE(review): API key committed in source; consider reading it from the environment.
ak = "C75tUrXIZKbn3QOMB3e9cIx7g9fsMrUo"
# url='http://api.map.baidu.com/place/v2/search?query=楚雄& bounds=24.390894,102.174112,26.548645,103.678942&page_size=20&page_num=0&output=json&ak='+ak

url_1 = 'http://api.map.baidu.com/place/v2/search?query='
url_2 = '南京路'  # search keyword
# Was '®ion=': the '&reg' of '&region=' had been decoded as an HTML entity,
# so the region parameter never reached the API.
url_3 = '&region='
url_4 = '信阳'  # city
url_5 = '&page_size=20&page_num='
url_7 = '&output=json&ak=' + ak


def main():
    """Fetch up to 20 result pages and append one CSV line per place to a text file."""
    file = time_now() + url_4 + '-' + url_2 + "results.txt"
    for page in range(20):
        url = url_1 + url_2 + url_3 + url_4 + url_5 + str(page) + url_7
        data = requested(url)
        results = data.get('results') or []
        if not results:
            # Error response or past the last page: nothing more to fetch.
            print('no results on page', page, data.get('message'))
            break
        for item in results:
            jname = item['name']
            jlat = item['location']['lat']
            jlon = item['location']['lng']
            jadd = item.get('address', '')  # tolerate entries without an address
            j_data = url_4 + url_2 + ',' + str(page) + ',' + jname + ',' + str(jlat) + ',' + str(jlon) + ',' + jadd + '\n'
            write_txt(file, j_data)


if __name__ == "__main__":
    main()

# Thanks to https://blog.csdn.net/sinat_41310868/article/details/78746094


# 
page_size = 20 63 | # page_num = 0 64 | # scope = 1 65 | # 范围： 66 | # 左下坐标 30.379,114.118 67 | # 右上坐标 30.703,114.665 68 | # 中间坐标 30.541,114.3915 69 | # bounds = [ 70 | # [30.379, 114.118, 30.541, 114.3915], 71 | # [30.379, 114.3915, 30.541, 114.665], 72 | # [30.541, 114.118, 30.703, 114.3915], 73 | # [30.541, 114.3915, 30.703, 114.665] 74 | # ] 75 | # 76 | # new_bounds = [] 77 | # # col_row 将bounds的每一小块继续细分为3行3列，可以防止区域内的搜索数量上限400 78 | # col_row = 3 79 | # for lst in bounds: 80 | # distance_lat = (lst[2] - lst[0]) / col_row 81 | # distance_lon = (lst[3] - lst[1]) / col_row 82 | # for i in range(col_row): 83 | # for j in range(col_row): 84 | # lst_temp = [] 85 | # lst_temp.append(lst[0] + distance_lat * i) 86 | # lst_temp.append(lst[1] + distance_lon * j) 87 | # lst_temp.append(lst[0] + distance_lat * (i + 1)) 88 | # lst_temp.append(lst[1] + distance_lon * (j + 1)) 89 | # new_bounds.append(lst_temp) 90 | # 91 | # queryResults = [] 92 | # 93 | # for bound in new_bounds[0:1]: 94 | # np = True 95 | # a = [] 96 | # while np == True: 97 | # # 使用百度提供的url拼接条件 98 | # url = "http://api.map.baidu.com/place/v2/search?ak=" + str(ak) + "&output=json&query=" + str( 99 | # query[0]) + "&page_size=" + str(page_size) + "&page_num=" + str(page_num) + "&bounds=" + str( 100 | # bound[0]) + "," + str(bound[1]) + "," + str(bound[2]) + "," + str(bound[3]) 101 | # url = quote(url, safe=string.printable) 102 | # 103 | # # 请求url读取，创建网页对象 104 | # jsonf = urllib.request.urlopen(url) 105 | # page_num = page_num + 1 106 | # 107 | # # 判断查询翻页进程 108 | # jsonfile = jsonf.read() 109 | # s = json.loads(jsonfile) 110 | # total = int(s["total"]) 111 | # a.append(total) 112 | # 113 | # queryResults.append(s) 114 | # print("queryResults\n",queryResults) 115 | # max_page = int(a[0] / page_size) + 1 116 | # # 防止并发过高，百度地图要求并发小于120 117 | # time.sleep(1) 118 | # 119 | # if page_num > max_page: 120 | # np = False 121 | # page_num = 0 122 | # print("search complete") 123 | # print("output: " + str(bound)) 124 | # 
print("total: " + str(a[0])) 125 | # print("") 126 | # 127 | # results = open("results.txt", 'a') 128 | # results.write(str(queryResults).encode('utf-8').decode('unicode_escape')) 129 | # results.close() 130 | # print("ALL DONE!") 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | on Mar 16 2020 2 | 1. Python读年报1-下载上市公司年报https://zhuanlan.zhihu.com/p/112231778 3 | ![下载上市公司年报](http://pic1.zhimg.com/80/v2-b572e42a4b83ad9de8f49d85333fbf80_1440w.jpg) 4 | 2. Python读年报2-将上市公司信息写入Excel https://zhuanlan.zhihu.com/p/111249510 5 | ![将上市公司信息写入Excel](http://pic3.zhimg.com/v2-374b4f450b4e4b899f5bde30f9423790_1200x500.jpg) 6 | 7 | 8 Jan 2017 8 | # Spider 9 | ## Learning Python. 10 | 11 | I learned C programming as a sophomore or senior at Qingdao University. After I graduated from Peking University in about 2015, deep learning became quite popular in China. I came to know that Python was very useful and not too difficult to start with. I began to search for how to build a web spider. The following is what I wrote at that time and posted on [zhihu.com](https://www.zhihu.com/question/20899988/answer/81789394).
12 | 我大学曾学习过C语言课程，但是当时并不觉得写代码很有趣，研究生快毕业才了解数据工程师和物理其实很像，而且人工智能越来越火，仿佛不知道神经网络就有愧于学习理工科。我那段时间才开始准备自学Python，我学习的起点定在了爬虫。这是我去年这个时候发表在知乎上的学习记录。[如何入门 Python 爬虫？](https://www.zhihu.com/question/20899988/answer/81789394)
13 | 在两周前，我突然忘记为什么，又开始了学习python，最开始是购买了一本《利用Python进行数据分析》，通过安卓的app“Python Excercise”，练习使用命令行运行python文件，后来自然而然登录了Codecademy，完成了其中python语言的入门学习，自己重复这些代码大约用了两周，这两周里学习编程占据了我绝大部分时间，除了和何玉婷去张家口，以及济南高新区管委会面试之外，我几乎除了吃饭，睡觉，就是学习Python。后来发现了[Dataquest](https://www.dataquest.io/mission/6/getting-started-with-num-py/2/lists-of-lists)，[怎么用最短时间高效而踏实地学习 Python？](https://www.zhihu.com/question/28530832/answer/58656332)感谢[飞绝眷岭](https://www.zhihu.com/people/juanling/answers)的推荐。
14 | 对于现在的我来说，Python对我已经意味着不同的东西了。我改变了一些习惯，敲击键盘速度更快了，对于代码理解更深了。以前会认为学习计算机好就业，赚钱多，或者码农辛苦，不太擅长言谈，看到爬虫教程会看几眼而走掉。或者不了解编程学习方法，不知道自学需要多久，是不是适合自学，以及如何通过自己的思考，反复练习，掌握一门语言。Python对我学习能力有提高。
15 | 不过同时也了解编程是一个非常系统的学科，不仅仅需要天赋（高中获得OI金牌或者大学练习ACM），而也需要一定的熟练和努力。不再是很枯燥的黑底绿字，也可以很漂亮。
16 | 之后我希望可以继续沿着Data Scientist之路向前走，我之前很喜欢infographic，觉得很漂亮，我认为这个和python应该也有关系，通过对于data的理解加深，我应该可以做出更好的infographic。
17 | 我以前总是拿不定学习哪种语言，C++、C#、PHP、Python、Ruby、Java、Swift,甚至H5。现在我知道每一种语言最开始阶段写出hello world都是可能的，但是后面刻意练习需要很多精力，不可能同时完成这些。我以后不会鲁莽地开始新语言的学习了，我再也不会看21天精通XX语言这样的链接了。
18 | 最近有时间学习Python一方面是很好的机会，另一方面也是深深的空虚，失去了工作，重新寻找自己的职业生涯，这些让我非常痛苦无力，我现在已经两个月没有取得收入了，感觉有一道悬崖在前方，非常可怕，我想快点逃出去。之后进入工作，我的状态和时间都会受到影响，不知道是否可以保持学习的热情，如果很遗憾不再坚持，也感谢这段时间Python的陪伴。
19 | 2017-01-08 20 | -------------------------------------------------------------------------------- /excel/2020年1月中华人民共和国县以上行政区划代码.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/excel/2020年1月中华人民共和国县以上行政区划代码.xlsx -------------------------------------------------------------------------------- /excel/上市公司信息和年报审计事务所-2018年_20200320_2新增锂电池.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/excel/上市公司信息和年报审计事务所-2018年_20200320_2新增锂电池.xlsx -------------------------------------------------------------------------------- /get_data_from_PDF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.7.4-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 22 | "display_name": "Python 3.7.4 32-bit" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "本文为你展示，如何用Python把许多PDF文件的文本内容批量提取出来，并且整理存储到数据框中，以便于后续的数据分析。" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## 使用PyPDF2从pdf中提取简单文本，示例代码如下：\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "output_type": "stream", 47 
| "name": "stdout", 48 | "text": "total page number is 72\n" 49 | } 50 | ], 51 | "source": [ 52 | "# pdf='https://www.pwccn.com/zh/consulting/publications/industrial-infrastructure-digitalisation.pdf'\n", 53 | "pdf='https://www.pwc.com/gx/en/world-2050/assets/pwc-the-world-in-2050-full-report-feb-2017.pdf'\n", 54 | "# pdf_file='industrial-infrastructure-digitalisation.pdf'\n", 55 | "pdf_file='pwc-the-world-in-2050-full-report-feb-2017.pdf'\n", 56 | "\n", 57 | "import PyPDF2\n", 58 | "pdfFileObj = open(pdf_file, 'rb')\n", 59 | "pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", 60 | "print('total page number is',pdfReader.numPages)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "pdf_file='pwc-the-world-in-2050-full-report-feb-2017.pdf'\n", 70 | "import PyPDF2\n", 71 | "\n", 72 | "pdfFileObj = open(pdf_file, 'rb')\n", 73 | "pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", 74 | "pages=pdfReader.numPages\n", 75 | "for i in range(pages-72,pages-72):\n", 76 | " pageObj = pdfReader.getPage(i)\n", 77 | " print('——————————————————第',i,'页————————————————————\\n',pageObj.extractText())\n", 78 | " print('——————————————————第',i,'页————————————————————\\n')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "file_foler='C:/TEMP/Excel/'\n", 88 | "pdf1=file_folder+'华电福新：二零一八年年度报告.PDF'\n", 89 | "pdf2=file_folder+'中广核新能源：2018年年报.PDF'\n", 90 | "pdf3=file_folder+'京能清洁能源：年报2018.PDF'\n", 91 | "import PyPDF2\n", 92 | "pdfFileObj=open(pdf1,'rb')\n", 93 | "pdfReader=PyPDF2.PdfFileReader(pdfFileObj)\n", 94 | "pages=pdfReader.numPages\n", 95 | "for i in range(pages)\n", 96 | " pageObj=pdfReader.getPage(i)\n", 97 | " text=pageObj.extractText() " 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### 简体和繁体转换\n", 105 | 
"https://raw.githubusercontent.com/skydark/nstools/master/zhtools/langconv.py\n", 106 | "https://raw.githubusercontent.com/skydark/nstools/master/zhtools/zh_wiki.py" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 44, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from langconv import *\n", 116 | "def Traditional2Simplified(sentence):\n", 117 | " '''\n", 118 | " 将sentence中的繁体字转为简体字\n", 119 | " :param sentence: 待转换的句子\n", 120 | " :return: 将句子中繁体字转换为简体字之后的句子\n", 121 | " '''\n", 122 | " sentence = Converter('zh-hans').convert(sentence)\n", 123 | " return sentence\n", 124 | "# 将繁体字转换成简体字，并写入text中\n", 125 | "def time_now():\n", 126 | " import time\n", 127 | " time_now=time.strftime('%Y-%m-%d-%H%M%S',time.localtime(time.time()))\n", 128 | " time_now=str(time_now)\n", 129 | " return time_now\n", 130 | "with open('C:/TEMP/Excel/'+time_now()+'.txt','w') as f:\n", 131 | " f.write('test')\n", 132 | "\n", 133 | "" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 18, 139 | "metadata": { 140 | "tags": [ 141 | "outputPrepend", 142 | "outputPrepend", 143 | "outputPrepend", 144 | "outputPrepend", 145 | "outputPrepend", 146 | "outputPrepend", 147 | "outputPrepend", 148 | "outputPrepend" 149 | ] 150 | }, 151 | "outputs": [ 152 | { 153 | "output_type": "stream", 154 | "name": "stdout", 155 | "text": "180 - 0 忧郁的台湾乌龟\n180 - 1 忧郁的台湾乌龟\n180 - 2 忧郁的台湾乌龟\n180 - 3 忧郁的台湾乌龟\n180 - 4 忧郁的台湾乌龟\n180 - 5 忧郁的台湾乌龟\n180 - 6 忧郁的台湾乌龟\n180 - 7 忧郁的台湾乌龟\n180 - 8 忧郁的台湾乌龟\n180 - 9 忧郁的台湾乌龟\n180 - 10 忧郁的台湾乌龟\n180 - 11 忧郁的台湾乌龟\n180 - 12 忧郁的台湾乌龟\n180 - 13 忧郁的台湾乌龟\n180 - 14 忧郁的台湾乌龟\n180 - 15 忧郁的台湾乌龟\n" 156 | } 157 | ], 158 | "source": [ 159 | "# -*-coding:gbk-*-\n", 160 | "file_folder='C:/TEMP/Excel/'\n", 161 | "pdf=file_folder+'麦肯锡中国银行业CEO季刊2018年夏季刊_精简版.pdf'\n", 162 | "pdf1=file_folder+'华电福新：二零一八年年度报告.PDF'\n", 163 | "pdf2=file_folder+'中广核新能源：2018年年报.PDF'\n", 164 | "pdf3=file_folder+'京能清洁能源：年报2018.PDF'\n", 165 | "import PyPDF2\n", 166 
| "pdfFileObj=open(pdf,'rb')\n", 167 | "pdfReader=PyPDF2.PdfFileReader(pdfFileObj)\n", 168 | "pages=pdfReader.numPages\n", 169 | "content=''\n", 170 | "for i in range(int(pages/10)):\n", 171 | " pageObj=pdfReader.getPage(i)\n", 172 | " text=pageObj.extractText()\n", 173 | " content+='\\n'+str(i)+'\\n'+text\n", 174 | " print(pages,'-',i,' ',Traditional2Simplified('憂郁的臺灣烏龜'))\n", 175 | "print(content)\n", 176 | " # print(Traditional2Simplified(text))\n", 177 | "" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## 使用tabula从pdf中读取表格数据\n", 185 | "### 安装tabula\n", 186 | "pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tabula-py\n", 187 | "### 安装java\n", 188 | "https://repo.huaweicloud.com/java/jdk/\n", 189 | "### 安装openpyxl\n", 190 | "pip install -i https://pypi.tuna.tsinghua.edu.cn/simple openpyxl\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "#中国家庭金融调查报告 - Sohu \n", 200 | "pdf=\"http://money.sohu.com/upload/chinajrdcbg0510.pdf\"\n", 201 | "pdf_file='pwc-the-world-in-2050-full-report-feb-2017.pdf'\n", 202 | "import tabula\n", 203 | "# df = tabula.read_pdf(pdf_file,encoding='gbk',pages='10')\n", 204 | "df = tabula.read_pdf(pdf,encoding='gbk',multiple_tables = True,pages='5')\n", 205 | "print(df)\n", 206 | "#实现效果可以点击https://zhuanlan.zhihu.com/p/122214626" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### 将pdf中的表格导出到excel" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "import tabula\n", 223 | "pdf1='C:/TEMP/Excel/chicago.pdf'\n", 224 | "excel1='C:/TEMP/Excel/chicago.csv'\n", 225 | "# tabula.convert_into(pdf1,excel1,output_format =\"xlsx\")\n", 226 | "tabula.convert_into(pdf1,excel1,pages='all')" 227 | ] 228 | }, 229 | { 230 | "cell_type":
"markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "实现效果可以参见https://zhuanlan.zhihu.com/p/122214626" 234 | ] 235 | } 236 | ] 237 | } -------------------------------------------------------------------------------- /getting_data_from_eastmoney_with_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.7.4-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 22 | "display_name": "Python 3.7.4 32-bit" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 以下为从东方财富网获取数据的演示\n", 31 | "爬取了2017-11-29到2017年11月30日期间，股东增持数据\n", 32 | "http://data.eastmoney.com/executive/gdzjc-jzc.html，用到了以下方法\n", 33 | "- 字典slice\n", 34 | "- eval() str转dict\n", 35 | "- split() 数组分列" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 23, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "output_type": "stream", 45 | "name": "stdout", 46 | "text": 
"{\"Message\":\"\",\"Status\":0,\"Data\":[{\"TableName\":\"RptShareholdersIncreaseMap\",\"TotalPage\":1,\"ConsumeMSecond\":609,\"SplitSymbol\":\"|\",\"FieldName\":\"SHCode,CompanyCode,SCode,Close,ChangePercent,SName,ShareHdName,FX,ChangeNum,BDSLZLTB,BDZGBBL,JYFS,BDHCGZS,BDHCGBL,BDHCYLTGSL,BDHCYLTSLZLTGB,BDKS,BDJZ,NOTICEDATE\",\"Data\":[\"80628780|10000848|000506|2.79|1.45|中润资源|宁波梅山保税港区冉盛盛昌投资管理合伙企业(有限合伙)|增持|919.2401|0.99|0.9895|二级市场|4561.2401|4.91|4561.2401|4.91|1899-12-30|2017-11-29|2017-11-30\",\"70411877|10001090|000793|3.15|0.32|华闻集团|渤海国际信托股份有限公司-永盈1号单一资金信托|增持|441.6|0.24|0.2205|二级市场|4847.3577|2.42|4847.3577|2.63|2017-11-29|2017-11-29|2017-11-30\",\"80437115|10000083|600110|5.15|0.78|诺德股份|深圳市邦民创业投资有限公司|增持|133.24|0.12|0.1158|二级市场|10410.7774|9.05|10410.7774|9.05|2017-11-15|2017-11-29|2017-11-30\",\"70411465|80357739|002832|19.56|1.93|比音勒芬|兴证资管鑫众-比音勒芬1号定向资产管理计划|增持|58.8197|2.21|0.55|二级市场|58.8197|0.55|||1899-12-30|2017-11-29|2017-11-29\",\"80548023|80133697|300010|15.12|1.89|立思辰|新余绿萝投资合伙企业(有限合伙)|增持|103.4999|0.17|0.1195|二级市场|285.8079|0.33|103.4999|0.17|1899-12-30|2017-11-29|2017-11-29\",\"80548025|80133697|300010|15.12|1.89|立思辰|共青城大益祥云投资管理合伙企业(有限合伙)|增持|21.32|0.03|0.0259|二级市场|57.632|0.07|21.32|0.03|1899-12-30|2017-11-29|2017-11-29\",\"80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|30.75|0.12|0.0944|二级市场|1879.3322|5.77|1879.3322|7.3|2017-11-29|2017-11-29|2017-11-29\",\"70411891|80078721|002463|24.73|0.57|沪电股份|华泰资管沪电股份共赢1号定向资产管理计划|增持|1337.9|0.8|0.8|二级市场|1337.9|0.8|||2017-11-28|2017-11-28|2017-11-30\",\"80007936|10000052|600079|20.35|-0.05|人福医药|武汉当代科技产业集团股份有限公司|增持|229.4085|0.22|0.1694|二级市场|39020.2049|28.82|18813.8207|17.79|2017-11-28|2017-11-28|2017-11-30\",\"80028812|10000402|600595|1.97|0.00|*ST中孚|河南豫联能源集团有限责任公司|增持|3.4439|0|0.002|二级市场|93895.7844|53.92|93895.7844|53.92|2017-11-28|2017-11-28|2017-11-30\",\"80504205|10001397|002014|9.88|0.30|永新股份|黄山永佳投资有限公司|增持|279.5476|0.86|0.8326|二级市场|10615.9204|31.62|337.5476|1.04|2017-10-30|2017-11-28|2017-11-29\",\"80198314|10
001397|002014|9.88|0.30|永新股份|上海原龙投资控股(集团)有限公司|增持|177.9129|0.55|0.5299|二级市场|3290.5958|9.8|3290.5958|10.17|2017-11-28|2017-11-28|2017-11-29\",\"80088053|80088005|002083|7.96|0.13|孚日股份|孚日控股集团股份有限公司|增持|202.63|0.23|0.2232|二级市场|25101.7396|27.65|25101.7396|29.01|2017-11-28|2017-11-28|2017-11-29\",\"73118791|80132397|002422|22.70|0.89|科伦药业|创赢投资10号集合资金信托计划|增持|2366|2.3|1.6457|二级市场|3436|2.39|3436|3.34|2017-11-28|2017-11-28|2017-11-29\",\"80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|149.17|0.58|0.4575|二级市场|1848.5822|5.67|1848.5822|7.18|2017-11-28|2017-11-28|2017-11-29\",\"80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|69.51|0.27|0.2131|二级市场|1699.4122|5.21|1699.4122|6.6|2017-11-27|2017-11-27|2017-11-29\"]}]}\n" 47 | } 48 | ], 49 | "source": [ 50 | "url=\"http://datainterface3.eastmoney.com/EM_DataCenter_V3/api/GDZC/GetGDZC?tkn=eastmoney&cfg=gdzc&secucode=&fx=1&sharehdname=&pageSize=50&pageNum=1&sortFields=BDJZ&sortDirec=1&startDate=2017-11-29&endDate=2017-11-30\"\n", 51 | "# 原文链接：https://blog.csdn.net/binosun/article/details/78697332\n", 52 | "import requests\n", 53 | "response=requests.get(url)\n", 54 | "print(response.text)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 24, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "name": "stdout", 65 | "text": "\n\n0\n[{'TableName': 'RptShareholdersIncreaseMap', 'TotalPage': 1, 'ConsumeMSecond': 609, 'SplitSymbol': '|', 'FieldName': 'SHCode,CompanyCode,SCode,Close,ChangePercent,SName,ShareHdName,FX,ChangeNum,BDSLZLTB,BDZGBBL,JYFS,BDHCGZS,BDHCGBL,BDHCYLTGSL,BDHCYLTSLZLTGB,BDKS,BDJZ,NOTICEDATE', 'Data': ['80628780|10000848|000506|2.79|1.45|中润资源|宁波梅山保税港区冉盛盛昌投资管理合伙企业(有限合伙)|增持|919.2401|0.99|0.9895|二级市场|4561.2401|4.91|4561.2401|4.91|1899-12-30|2017-11-29|2017-11-30', '70411877|10001090|000793|3.15|0.32|华闻集团|渤海国际信托股份有限公司-永盈1号单一资金信托|增持|441.6|0.24|0.2205|二级市场|4847.3577|2.42|4847.3577|2.63|2017-11-29|2017-11-29|2017-11-30', 
'80437115|10000083|600110|5.15|0.78|诺德股份|深圳市邦民创业投资有限公司|增持|133.24|0.12|0.1158|二级市场|10410.7774|9.05|10410.7774|9.05|2017-11-15|2017-11-29|2017-11-30', '70411465|80357739|002832|19.56|1.93|比音勒芬|兴证资管鑫众-比音勒芬1号定向资产管理计划|增持|58.8197|2.21|0.55|二级市场|58.8197|0.55|||1899-12-30|2017-11-29|2017-11-29', '80548023|80133697|300010|15.12|1.89|立思辰|新余绿萝投资合伙企业(有限合伙)|增持|103.4999|0.17|0.1195|二级市场|285.8079|0.33|103.4999|0.17|1899-12-30|2017-11-29|2017-11-29', '80548025|80133697|300010|15.12|1.89|立思辰|共青城大益祥云投资管理合伙企业(有限合伙)|增持|21.32|0.03|0.0259|二级市场|57.632|0.07|21.32|0.03|1899-12-30|2017-11-29|2017-11-29', '80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|30.75|0.12|0.0944|二级市场|1879.3322|5.77|1879.3322|7.3|2017-11-29|2017-11-29|2017-11-29', '70411891|80078721|002463|24.73|0.57|沪电股份|华泰资管沪电股份共赢1号定向资产管理计划|增持|1337.9|0.8|0.8|二级市场|1337.9|0.8|||2017-11-28|2017-11-28|2017-11-30', '80007936|10000052|600079|20.35|-0.05|人福医药|武汉当代科技产业集团股份有限公司|增持|229.4085|0.22|0.1694|二级市场|39020.2049|28.82|18813.8207|17.79|2017-11-28|2017-11-28|2017-11-30', '80028812|10000402|600595|1.97|0.00|*ST中孚|河南豫联能源集团有限责任公司|增持|3.4439|0|0.002|二级市场|93895.7844|53.92|93895.7844|53.92|2017-11-28|2017-11-28|2017-11-30', '80504205|10001397|002014|9.88|0.30|永新股份|黄山永佳投资有限公司|增持|279.5476|0.86|0.8326|二级市场|10615.9204|31.62|337.5476|1.04|2017-10-30|2017-11-28|2017-11-29', '80198314|10001397|002014|9.88|0.30|永新股份|上海原龙投资控股(集团)有限公司|增持|177.9129|0.55|0.5299|二级市场|3290.5958|9.8|3290.5958|10.17|2017-11-28|2017-11-28|2017-11-29', '80088053|80088005|002083|7.96|0.13|孚日股份|孚日控股集团股份有限公司|增持|202.63|0.23|0.2232|二级市场|25101.7396|27.65|25101.7396|29.01|2017-11-28|2017-11-28|2017-11-29', '73118791|80132397|002422|22.70|0.89|科伦药业|创赢投资10号集合资金信托计划|增持|2366|2.3|1.6457|二级市场|3436|2.39|3436|3.34|2017-11-28|2017-11-28|2017-11-29', '80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|149.17|0.58|0.4575|二级市场|1848.5822|5.67|1848.5822|7.18|2017-11-28|2017-11-28|2017-11-29', 
'80603777|80160477|300169|6.15|0.82|天晟新材|晟衍(上海)投资管理有限公司|增持|69.51|0.27|0.2131|二级市场|1699.4122|5.21|1699.4122|6.6|2017-11-27|2017-11-27|2017-11-29']}]\n" 66 | } 67 | ], 68 | "source": [ 69 | "# dict = {'a': 1, 'b': 2, 'b': '3'}\n", 70 | "# print(dict['a'])\n", 71 | "items={}\n", 72 | "items=response.text\n", 73 | "print(type(items))\n", 74 | "items=eval(items)\n", 75 | "print(items[\"Message\"])\n", 76 | "print(items[\"Status\"])\n", 77 | "print(items[\"Data\"])\n", 78 | "# for i in range(len(items)):\n", 79 | " # print(items(i))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 27, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stdout", 90 | "text": "*************************************************************\n['80628780', '10000848', '000506', '2.79', '1.45', '中润资源', '宁波梅山保税港区冉盛盛昌投资管理合伙企业(有限合伙)', '增持', '919.2401', '0.99', '0.9895', '二级市场', '4561.2401', '4.91', '4561.2401', '4.91', '1899-12-30', '2017-11-29', '2017-11-30']\n*************************************************************\n['70411877', '10001090', '000793', '3.15', '0.32', '华闻集团', '渤海国际信托股份有限公司-永盈1号单一资金信托', '增持', '441.6', '0.24', '0.2205', '二级市场', '4847.3577', '2.42', '4847.3577', '2.63', '2017-11-29', '2017-11-29', '2017-11-30']\n*************************************************************\n['80437115', '10000083', '600110', '5.15', '0.78', '诺德股份', '深圳市邦民创业投资有限公司', '增持', '133.24', '0.12', '0.1158', '二级市场', '10410.7774', '9.05', '10410.7774', '9.05', '2017-11-15', '2017-11-29', '2017-11-30']\n*************************************************************\n['70411465', '80357739', '002832', '19.56', '1.93', '比音勒芬', '兴证资管鑫众-比音勒芬1号定向资产管理计划', '增持', '58.8197', '2.21', '0.55', '二级市场', '58.8197', '0.55', '', '', '1899-12-30', '2017-11-29', '2017-11-29']\n*************************************************************\n['80548023', '80133697', '300010', '15.12', '1.89', '立思辰', '新余绿萝投资合伙企业(有限合伙)', '增持', '103.4999', '0.17', '0.1195', '二级市场', 
'285.8079', '0.33', '103.4999', '0.17', '1899-12-30', '2017-11-29', '2017-11-29']\n*************************************************************\n['80548025', '80133697', '300010', '15.12', '1.89', '立思辰', '共青城大益祥云投资管理合伙企业(有限合伙)', '增持', '21.32', '0.03', '0.0259', '二级市场', '57.632', '0.07', '21.32', '0.03', '1899-12-30', '2017-11-29', '2017-11-29']\n*************************************************************\n['80603777', '80160477', '300169', '6.15', '0.82', '天晟新材', '晟衍(上海)投资管理有限公司', '增持', '30.75', '0.12', '0.0944', '二级市场', '1879.3322', '5.77', '1879.3322', '7.3', '2017-11-29', '2017-11-29', '2017-11-29']\n*************************************************************\n['70411891', '80078721', '002463', '24.73', '0.57', '沪电股份', '华泰资管沪电股份共赢1号定向资产管理计划', '增持', '1337.9', '0.8', '0.8', '二级市场', '1337.9', '0.8', '', '', '2017-11-28', '2017-11-28', '2017-11-30']\n*************************************************************\n['80007936', '10000052', '600079', '20.35', '-0.05', '人福医药', '武汉当代科技产业集团股份有限公司', '增持', '229.4085', '0.22', '0.1694', '二级市场', '39020.2049', '28.82', '18813.8207', '17.79', '2017-11-28', '2017-11-28', '2017-11-30']\n*************************************************************\n['80028812', '10000402', '600595', '1.97', '0.00', '*ST中孚', '河南豫联能源集团有限责任公司', '增持', '3.4439', '0', '0.002', '二级市场', '93895.7844', '53.92', '93895.7844', '53.92', '2017-11-28', '2017-11-28', '2017-11-30']\n*************************************************************\n['80504205', '10001397', '002014', '9.88', '0.30', '永新股份', '黄山永佳投资有限公司', '增持', '279.5476', '0.86', '0.8326', '二级市场', '10615.9204', '31.62', '337.5476', '1.04', '2017-10-30', '2017-11-28', '2017-11-29']\n*************************************************************\n['80198314', '10001397', '002014', '9.88', '0.30', '永新股份', '上海原龙投资控股(集团)有限公司', '增持', '177.9129', '0.55', '0.5299', '二级市场', '3290.5958', '9.8', '3290.5958', '10.17', '2017-11-28', '2017-11-28', 
'2017-11-29']\n*************************************************************\n['80088053', '80088005', '002083', '7.96', '0.13', '孚日股份', '孚日控股集团股份有限公司', '增持', '202.63', '0.23', '0.2232', '二级市场', '25101.7396', '27.65', '25101.7396', '29.01', '2017-11-28', '2017-11-28', '2017-11-29']\n*************************************************************\n['73118791', '80132397', '002422', '22.70', '0.89', '科伦药业', '创赢投资10号集合资金信托计划', '增持', '2366', '2.3', '1.6457', '二级市场', '3436', '2.39', '3436', '3.34', '2017-11-28', '2017-11-28', '2017-11-29']\n*************************************************************\n['80603777', '80160477', '300169', '6.15', '0.82', '天晟新材', '晟衍(上海)投资管理有限公司', '增持', '149.17', '0.58', '0.4575', '二级市场', '1848.5822', '5.67', '1848.5822', '7.18', '2017-11-28', '2017-11-28', '2017-11-29']\n*************************************************************\n['80603777', '80160477', '300169', '6.15', '0.82', '天晟新材', '晟衍(上海)投资管理有限公司', '增持', '69.51', '0.27', '0.2131', '二级市场', '1699.4122', '5.21', '1699.4122', '6.6', '2017-11-27', '2017-11-27', '2017-11-29']\n" 91 | } 92 | ], 93 | "source": [ 94 | "item=items[\"Data\"][0]\n", 95 | "# print(item[\"Data\"])\n", 96 | "for i in item[\"Data\"]:\n", 97 | " print('*************************************************************')\n", 98 | " # print(i)\n", 99 | " # print('--------------')\n", 100 | " i=i.split('|')\n", 101 | " print(i)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /getting_table_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.7.4-final" 12 | }, 13 | 
"orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 22 | "display_name": "Python 3.7.4 32-bit" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## python + pandas爬取网页表格数据" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "这里以工标网标准数据为例 http://www.csres.com/notice/50655.html " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 13, 43 | "metadata": { 44 | "tags": [ 45 | "outputPrepend" 46 | ] 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "url='http://www.csres.com/notice/50655.html'\n", 51 | "import requests\n", 52 | "import lxml.etree\n", 53 | "respond=requests.get(url)\n", 54 | "# print(respond.text)\n", 55 | "text=lxml.etree.HTML(respond.text)\n", 56 | "# print(text)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 15, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "name": "stdout", 67 | "text": "\n" 68 | } 69 | ], 70 | "source": [ 71 | "table = text.xpath('//table[@id=\"table1\"]')\n", 72 | "print(table[0])\n", 73 | "table = lxml.etree.tostring(table[0], encoding='utf-8').decode()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 16, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "output_type": "stream", 83 | "name": "stdout", 84 | "text": "序号标准编号标准名称代替标准号发布日期 \\\n0 1 JB/T 13517-2018 V型球阀 NaN 2018-07-04 \n1 2 JB/T 13518-2018 高压加热器用三通阀 NaN 2018-07-04 \n2 3 JB/T 13519-2018 气动摩擦片浮动式制动器 NaN 2018-07-04 \n3 4 JB/T 13520-2018 气动摩擦片浮动式离合器 NaN 2018-07-04 \n4 5 JB/T 13521-2018 热连轧地下卷取机技术条件 NaN 2018-07-04 \n.. ... ... ... ... ... 
\n178 179 SJ/T 11445.5-2018 信息技术服务外包第5部分：发包方项目管理规范 NaN 2018-07-04 \n179 180 SJ/T 11684-2018 信息技术服务信息系统服务监理规范 NaN 2018-07-04 \n180 181 SJ/T 11720-2018 高性能计算机刀片式服务器计算刀片机械技术要求 NaN 2018-07-04 \n181 182 SJ/T 11723-2018 锂离子电池用电解液 NaN 2018-07-04 \n182 183 SJ/T 11724-2018 锂原电池用电解液 NaN 2018-07-04 \n\n 实施日期 \n0 2019-05-01 \n1 2019-05-01 \n2 2019-05-01 \n3 2019-05-01 \n4 2019-05-01 \n.. ... \n178 2018-10-01 \n179 2018-10-01 \n180 2018-10-01 \n181 2018-10-01 \n182 2018-10-01 \n\n[183 rows x 6 columns]\n" 85 | } 86 | ], 87 | "source": [ 88 | "import pandas\n", 89 | "df = pandas.read_html(table, encoding='utf-8', header=0)[0]\n", 90 | "print(df)\n", 91 | "results = list(df.T.to_dict().values()) # 转换成列表嵌套字典的格式" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "这就是从网页提取表格的过程了，感谢https://blog.csdn.net/zhang862520682/article/details/86701078" 99 | ] 100 | } 101 | ] 102 | } -------------------------------------------------------------------------------- /jq_概念股人均营收_20200609.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "output_type": "stream", 10 | "name": "stdout", 11 | "text": "提示：当前环境pandas版本为0.25，get_price与get_fundamentals_continuously接口panel参数将固定为False\n注意：0.25以上版本pandas不支持panel，如使用该数据结构和相关函数请注意修改\nauth success \n" 12 | } 13 | ], 14 | "source": [ 15 | "import jqdatasdk\n", 16 | "from jqdatasdk import *\n", 17 | "jqdatasdk.auth(\"13141315365\", \"315365\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 5, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "output_type": "stream", 27 | "name": "stdout", 28 | "text": "001\n002\n003\n004\n005\n006\n007\n008\n009\n" 29 | } 30 | ], 31 | "source": [ 32 | "#!/usr/bin/env python3\n", 33 | "#-*- coding:utf-8 -*-\n", 34 | "for i in range(1,10):\n", 35 | " print (str(i).rjust(3,'0'))" 36 | ] 37 | }, 38 | { 
39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 参股金融概念" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 43, 48 | "metadata": { 49 | "tags": [ 50 | "outputPrepend", 51 | "outputPrepend", 52 | "outputPrepend", 53 | "outputPrepend", 54 | "outputPrepend", 55 | "outputPrepend", 56 | "outputPrepend", 57 | "outputPrepend", 58 | "outputPrepend", 59 | "outputPrepend", 60 | "outputPrepend", 61 | "outputPrepend", 62 | "outputPrepend", 63 | "outputPrepend", 64 | "outputPrepend", 65 | "outputPrepend", 66 | "outputPrepend", 67 | "outputPrepend", 68 | "outputPrepend", 69 | "outputPrepend", 70 | "outputPrepend", 71 | "outputPrepend", 72 | "outputPrepend", 73 | "outputPrepend" 74 | ] 75 | }, 76 | "outputs": [ 77 | { 78 | "output_type": "stream", 79 | "name": "stdout", 80 | "text": "******** GN001 ********\n name code employee total_operating_revenue ratio\n0 深圳能源 000027.XSHE 6935 2.081700e+10 NaN\n name code employee total_operating_revenue ratio\n0 国际实业 000159.XSHE 295 430842047.3 NaN\n name code employee total_operating_revenue ratio\n0 粤高速A 000429.XSHE 2154 3.057936e+09 NaN\n name code employee total_operating_revenue ratio\n0 冰山冷热 000530.XSHE 1582 1.831851e+09 NaN\n name code employee total_operating_revenue ratio\n0 佛山照明 000541.XSHE 7541 3.337577e+09 NaN\n name code employee total_operating_revenue ratio\n0 皖能电力 000543.XSHE 3498 1.609224e+10 NaN\n name code employee total_operating_revenue ratio\n0 创元科技 000551.XSHE 2636 3.237263e+09 NaN\n name code employee total_operating_revenue ratio\n0 苏常柴A 000570.XSHE 3010 2.040134e+09 NaN\n name code employee total_operating_revenue ratio\n0 宝新能源 000690.XSHE 1370 5.626823e+09 NaN\n name code employee total_operating_revenue ratio\n0 模塑科技 000700.XSHE 4239 5.489528e+09 NaN\n name code employee total_operating_revenue ratio\n0 正虹科技 000702.XSHE 1047 1.123012e+09 NaN\n name code employee total_operating_revenue ratio\n0 浙江震元 000705.XSHE 1904 3.243149e+09 NaN\n name code employee 
total_operating_revenue ratio\n0 新华制药 000756.XSHE 6399 5.606021e+09 NaN\n name code employee total_operating_revenue ratio\n0 华茂股份 000850.XSHE 5118 2.978917e+09 NaN\n name code employee total_operating_revenue ratio\n0 金陵药业 000919.XSHE 4767 2.534921e+09 NaN\n name code employee total_operating_revenue ratio\n0 华西股份 000936.XSHE 941 3.186875e+09 NaN\n name code employee total_operating_revenue ratio\n0 浙能电力 600023.XSHG 10465 5.437055e+10 NaN\n name code employee total_operating_revenue ratio\n0 黄山旅游 600054.XSHG 3605 1.606740e+09 NaN\n name code employee total_operating_revenue ratio\n0 南京高科 600064.XSHG 811 2.930211e+09 NaN\n name code employee total_operating_revenue ratio\n0 浙江东日 600113.XSHG 796 4.983711e+08 NaN\n name code employee total_operating_revenue ratio\n0 上海建工 600170.XSHG 42762 2.054967e+11 NaN\n name code employee total_operating_revenue ratio\n0 华资实业 600191.XSHG 490 1.110705e+08 NaN\n name code employee total_operating_revenue ratio\n0 海航控股 600221.XSHG 34048 7.238941e+10 NaN\n name code employee total_operating_revenue ratio\n0 两面针 600249.XSHG 2314 1.186674e+09 NaN\n name code employee total_operating_revenue ratio\n0 中恒集团 600252.XSHG 2469 3.814056e+09 NaN\n name code employee total_operating_revenue ratio\n0 阳光照明 600261.XSHG 8188 5.316197e+09 NaN\n name code employee total_operating_revenue ratio\n0 城建发展 600266.XSHG 2757 1.643188e+10 NaN\n name code employee total_operating_revenue ratio\n0 赣粤高速 600269.XSHG 3478 5.039442e+09 NaN\n name code employee total_operating_revenue ratio\n0 东方创业 600278.XSHG 1647 1.769248e+10 NaN\n name code employee total_operating_revenue ratio\n0 浦东建设 600284.XSHG 566 6.219611e+09 NaN\n name code employee total_operating_revenue ratio\n0 江苏舜天 600287.XSHG 2130 4.621764e+09 NaN\n name code employee total_operating_revenue ratio\n0 大恒科技 600288.XSHG 2440 3.305798e+09 NaN\n name code employee total_operating_revenue ratio\n0 远达环保 600292.XSHG 2230 4.067544e+09 NaN\n name code employee total_operating_revenue ratio\n0 小商品城 600415.XSHG 
4604 4.042768e+09 NaN\n name code employee total_operating_revenue ratio\n0 好当家 600467.XSHG 4298 1.226123e+09 NaN\n name code employee total_operating_revenue ratio\n0 黑牡丹 600510.XSHG 3464 7.776961e+09 NaN\n name code employee total_operating_revenue ratio\n0 龙溪股份 600592.XSHG 2544 9.460285e+08 NaN\n name code employee total_operating_revenue ratio\n0 大众交通 600611.XSHG 9293 3.802490e+09 NaN\n name code employee total_operating_revenue ratio\n0 新黄浦 600638.XSHG 740 1.316166e+09 NaN\n name code employee total_operating_revenue ratio\n0 浦东金桥 600639.XSHG 218 3.352383e+09 NaN\n name code employee total_operating_revenue ratio\n0 锦江投资 600650.XSHG 8014 2.541806e+09 NaN\n name code employee total_operating_revenue ratio\n0 南京医药 600713.XSHG 4767 3.715574e+10 NaN\n name code employee total_operating_revenue ratio\n0 天津港 600717.XSHG 7623 1.288467e+10 NaN\n name code employee total_operating_revenue ratio\n0 兰州民百 600738.XSHG 701 1.882507e+09 NaN\n name code employee total_operating_revenue ratio\n0 华域汽车 600741.XSHG 33389 1.440236e+11 NaN\n name code employee total_operating_revenue ratio\n0 综艺股份 600770.XSHG 673 4.584265e+08 NaN\n name code employee total_operating_revenue ratio\n0 友好集团 600778.XSHG 3236 5.238606e+09 NaN\n name code employee total_operating_revenue ratio\n0 轻纺城 600790.XSHG 1082 981239467.1 NaN\n name code employee total_operating_revenue ratio\n0 国电电力 600795.XSHG 36162 1.165993e+11 NaN\n name code employee total_operating_revenue ratio\n0 福建水泥 600802.XSHG 1881 3.043783e+09 NaN\n name code employee total_operating_revenue ratio\n0 百联股份 600827.XSHG 35971 5.045877e+10 NaN\n name code employee total_operating_revenue ratio\n0 上海九百 600838.XSHG 190 74232786.59 NaN\n name code employee total_operating_revenue ratio\n0 杉杉股份 600884.XSHG 4681 8.679911e+09 NaN\n name code employee total_operating_revenue ratio\n0 国芳集团 601086.XSHG 1414 2.765901e+09 NaN\n name code employee total_operating_revenue ratio\n0 重庆水务 601158.XSHG 5817 5.638549e+09 NaN\n name code employee 
total_operating_revenue ratio\n0 中国交建 601800.XSHG 124457 5.547924e+11 NaN\n name code employee total_operating_revenue ratio\n0 新华文轩 601811.XSHG 7728 8.842458e+09 NaN\n name code employee total_operating_revenue ratio\n0 南方传媒 601900.XSHG 5639 6.525319e+09 NaN\n name code employee total_operating_revenue ratio\n0 出版传媒 601999.XSHG 2374 2.710640e+09 NaN\n name code employee total_operating_revenue ratio\n0 川仪股份 603100.XSHG 4696 3.968890e+09 NaN\n name code employee total_operating_revenue ratio\n0 坤彩科技 603826.XSHG 521 6.170060e+08 NaN\n" 81 | } 82 | ], 83 | "source": [ 84 | "\n", 85 | "for i in range(1,2):\n", 86 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 87 | " print('********',conceptno,'********')\n", 88 | " # print(conceptno)\n", 89 | " try: \n", 90 | " # print('123')\n", 91 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 92 | " # print(stocks)\n", 93 | " for code1 in stocks: \n", 94 | " # print('123')\n", 95 | " # print(code)\n", 96 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 97 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 98 | " # print(conceptno,yuangong) \n", 99 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 100 | " \n", 101 | " # print(yuan_gong.fillna(1)) \n", 102 | " import pandas# 将两个表连接起来\n", 103 | " # print(pandas.merge(yuangong,yuan_gong))\n", 104 | " df=pandas.merge(yuangong,yuan_gong)\n", 105 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 
106 | " print(df.head())\n", 107 | " \n", 108 | " except:\n", 109 | " print(\"~~~~~\",conceptno)\n", 110 | " pass" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 38, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "output_type": "stream", 120 | "name": "stdout", 121 | "text": "employee name code company_name total_operating_revenue\n0 521 坤彩科技 603826.XSHG 福建坤彩材料科技股份有限公司 6.170060e+08\n" 122 | } 123 | ], 124 | "source": [ 125 | "# 将两个表连接起来\n", 126 | "import pandas\n", 127 | "print(pandas.merge(yuangong,yuan_gong))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 人工智能概念\n", 135 | "GN201\t201\t人工智能" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 46, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "output_type": "stream", 145 | "name": "stdout", 146 | "text": "******** GN201 ********\n name code employee total_operating_revenue ratio\n0 浪潮信息 000977.XSHE 5469 5.165328e+10 NaN\n name code employee total_operating_revenue ratio\n0 紫光国微 002049.XSHE 2192 3.430410e+09 NaN\n name code employee total_operating_revenue ratio\n0 东华软件 002065.XSHE 8991 8.849013e+09 NaN\n name code employee total_operating_revenue ratio\n0 软控股份 002073.XSHE 2626 2.865379e+09 NaN\n name code employee total_operating_revenue ratio\n0 广电运通 002152.XSHE 26018 6.496265e+09 NaN\n name code employee total_operating_revenue ratio\n0 海得控制 002184.XSHE 958 2.058412e+09 NaN\n name code employee total_operating_revenue ratio\n0 江南化工 002226.XSHE 4044 3.633187e+09 NaN\n name code employee total_operating_revenue ratio\n0 科大讯飞 002230.XSHE 10447 1.007869e+10 NaN\n name code employee total_operating_revenue ratio\n0 川大智胜 002253.XSHE 716 3.425091e+08 NaN\n name code employee total_operating_revenue ratio\n0 中电兴发 002298.XSHE 2840 2.755676e+09 NaN\n name code employee total_operating_revenue ratio\n0 高乐股份 002348.XSHE 1156 7.124312e+08 NaN\n name code employee total_operating_revenue ratio\n0 
汉王科技 002362.XSHE 1771 1.104508e+09 NaN\n name code employee total_operating_revenue ratio\n0 海康威视 002415.XSHE 40403 5.765811e+10 NaN\n name code employee total_operating_revenue ratio\n0 机器人 300024.XSHE 4559 2.745485e+09 NaN\n name code employee total_operating_revenue ratio\n0 赛为智能 300044.XSHE 1006 1.276630e+09 NaN\n name code employee total_operating_revenue ratio\n0 思创医惠 300078.XSHE 2246 1.573994e+09 NaN\n name code employee total_operating_revenue ratio\n0 美亚柏科 300188.XSHE 3318 2.067410e+09 NaN\nEmpty DataFrame\nColumns: [name, employee, code, total_operating_revenue, ratio]\nIndex: []\n name code employee total_operating_revenue ratio\n0 科大智能 300222.XSHE 3257 2.313319e+09 NaN\n name code employee total_operating_revenue ratio\n0 北京君正 300223.XSHE 263 3.393512e+08 NaN\n name code employee total_operating_revenue ratio\n0 拓尔思 300229.XSHE 1761 9.673060e+08 NaN\n name code employee total_operating_revenue ratio\n0 慈星股份 300307.XSHE 1650 1.521034e+09 NaN\n name code employee total_operating_revenue ratio\n0 东方网力 300367.XSHE 1159 3.535022e+08 NaN\n name code employee total_operating_revenue ratio\n0 劲拓股份 300400.XSHE 1161 4.953870e+08 NaN\n name code employee total_operating_revenue ratio\n0 昆仑万维 300418.XSHE 717 3.687884e+09 NaN\n name code employee total_operating_revenue ratio\n0 全志科技 300458.XSHE 615 1.463360e+09 NaN\n name code employee total_operating_revenue ratio\n0 景嘉微 300474.XSHE 723 5.307872e+08 NaN\n name code employee total_operating_revenue ratio\n0 神思电子 300479.XSHE 651 470389564.7 NaN\n name code employee total_operating_revenue ratio\n0 富瀚微 300613.XSHE 241 522080228.2 NaN\n name code employee total_operating_revenue ratio\n0 华胜天成 600410.XSHG 3980 4.575030e+09 NaN\n name code employee total_operating_revenue ratio\n0 恒生电子 600570.XSHG 7357 3.871840e+09 NaN\n name code employee total_operating_revenue ratio\n0 佳都科技 600728.XSHG 2336 5.011851e+09 NaN\n name code employee total_operating_revenue ratio\n0 浪潮软件 600756.XSHG 1899 1.455590e+09 NaN\n name code 
employee total_operating_revenue ratio\n0 浙大网新 600797.XSHG 4839 3.760815e+09 NaN\n name code employee total_operating_revenue ratio\n0 三六零 601360.XSHG 5824 1.284110e+10 NaN\n name code employee total_operating_revenue ratio\n0 中科曙光 603019.XSHG 3049 9.526470e+09 NaN\n" 147 | } 148 | ], 149 | "source": [ 150 | "\n", 151 | "for i in range(201,202):\n", 152 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 153 | " print('********',conceptno,'********')\n", 154 | " # print(conceptno)\n", 155 | " try: \n", 156 | " # print('123')\n", 157 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 158 | " # print(stocks)\n", 159 | " for code1 in stocks: \n", 160 | " # print('123')\n", 161 | " # print(code)\n", 162 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 163 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 164 | " # print(conceptno,yuangong) \n", 165 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 166 | " \n", 167 | " # print(yuan_gong.fillna(1)) \n", 168 | " import pandas# 将两个表连接起来\n", 169 | " # print(pandas.merge(yuangong,yuan_gong))\n", 170 | " df=pandas.merge(yuangong,yuan_gong)\n", 171 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 172 | " print(df.head())\n", 173 | " \n", 174 | " except:\n", 175 | " print(\"~~~~~\",conceptno)\n", 176 | " pass" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 45, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | 
"output_type": "stream", 186 | "name": "stdout", 187 | "text": "******** GN201 ********\nGN201\n~~~~~ GN201\n" 188 | } 189 | ], 190 | "source": [ 191 | "# 尝试另一种方式不成功\n", 192 | "\n", 193 | "for i in range(201,202):\n", 194 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 195 | " print('********',conceptno,'********')\n", 196 | " print(conceptno)\n", 197 | " try: \n", 198 | " # print('123')\n", 199 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 200 | " # print(stocks)\n", 201 | " for code1 in stocks: \n", 202 | " # print('123')\n", 203 | " # print(code)\n", 204 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.employee,finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.company_name,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01',finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(10))\n", 205 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 206 | " print(conceptno,yuangong) \n", 207 | " # yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.company_name,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 208 | " \n", 209 | " # print(yuan_gong.fillna(1))\n", 210 | " # print(conceptno,yuan_gong['employee'].mean())\n", 211 | " # ping_jun_shi_zhi=\n", 212 | " except:\n", 213 | " print(\"~~~~~\",conceptno)\n", 214 | " pass" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | } 224 | ], 225 | "metadata": { 226 | "language_info": { 227 | 
"codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.7.4-final" 237 | }, 238 | "orig_nbformat": 2, 239 | "kernelspec": { 240 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 241 | "display_name": "Python 3.7.4 32-bit" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } -------------------------------------------------------------------------------- /jq_概念股人均营收_火电激光SaaS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "output_type": "stream", 10 | "name": "stdout", 11 | "text": "提示：当前环境pandas版本为0.25，get_price与get_fundamentals_continuously接口panel参数将固定为False\n注意：0.25以上版本pandas不支持panel，如使用该数据结构和相关函数请注意修改\nauth success \n" 12 | } 13 | ], 14 | "source": [ 15 | "import jqdatasdk\n", 16 | "from jqdatasdk import *\n", 17 | "jqdatasdk.auth(\"13141315365\", \"315365\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# 火电概念\n", 25 | "GN874\t874\t火电" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "output_type": "stream", 35 | "name": "stdout", 36 | "text": "******** GN874 ********\n name code employee total_operating_revenue ratio\n0 深圳能源 000027.XSHE 6935 2.081700e+10 NaN\n name code employee total_operating_revenue ratio\n0 穗恒运A 000531.XSHE 1185 3.228594e+09 NaN\n name code employee total_operating_revenue ratio\n0 粤电力A 000539.XSHE 6688 2.936016e+10 NaN\n name code employee total_operating_revenue ratio\n0 皖能电力 000543.XSHE 3498 1.609224e+10 NaN\n name code employee total_operating_revenue ratio\n0 建投能源 000600.XSHE 4540 1.396372e+10 NaN\n name code employee 
total_operating_revenue ratio\n0 宝新能源 000690.XSHE 1370 5.626823e+09 NaN\n name code employee total_operating_revenue ratio\n0 新能泰山 000720.XSHE 889 3.580388e+09 NaN\n name code employee total_operating_revenue ratio\n0 漳泽电力 000767.XSHE 8157 1.202500e+10 NaN\n name code employee total_operating_revenue ratio\n0 吉电股份 000875.XSHE 2788 8.454148e+09 NaN\n name code employee total_operating_revenue ratio\n0 赣能股份 000899.XSHE 746 2.670328e+09 NaN\n name code employee total_operating_revenue ratio\n0 东方能源 000958.XSHE 2816 1.150051e+10 NaN\n name code employee total_operating_revenue ratio\n0 长源电力 000966.XSHE 3109 7.366107e+09 NaN\n name code employee total_operating_revenue ratio\n0 豫能控股 001896.XSHE 2834 8.089293e+09 NaN\n name code employee total_operating_revenue ratio\n0 龙源技术 300105.XSHE 572 5.133364e+08 NaN\n name code employee total_operating_revenue ratio\n0 华能国际 600011.XSHG 58263 1.734848e+11 NaN\n name code employee total_operating_revenue ratio\n0 上海电力 600021.XSHG 6329 2.369003e+10 NaN\n name code employee total_operating_revenue ratio\n0 浙能电力 600023.XSHG 10465 5.437055e+10 NaN\n name code employee total_operating_revenue ratio\n0 华电国际 600027.XSHG 27287 9.365443e+10 NaN\n name code employee total_operating_revenue ratio\n0 广州发展 600098.XSHG 5316 2.964307e+10 NaN\n name code employee total_operating_revenue ratio\n0 桂冠电力 600236.XSHG 3593 9.043440e+09 NaN\n name code employee total_operating_revenue ratio\n0 远达环保 600292.XSHG 2230 4.067544e+09 NaN\n name code employee total_operating_revenue ratio\n0 涪陵电力 600452.XSHG 782 2.621898e+09 NaN\n name code employee total_operating_revenue ratio\n0 福能股份 600483.XSHG 2983 9.945408e+09 NaN\n name code employee total_operating_revenue ratio\n0 天富能源 600509.XSHG 2535 4.894292e+09 NaN\n name code employee total_operating_revenue ratio\n0 京能电力 600578.XSHG 5736 1.843836e+10 NaN\n name code employee total_operating_revenue ratio\n0 申能股份 600642.XSHG 2645 3.884130e+10 NaN\n name code employee total_operating_revenue ratio\n0 川投能源 
600674.XSHG 611 8.383316e+08 NaN\n name code employee total_operating_revenue ratio\n0 华银电力 600744.XSHG 5891 8.993163e+09 NaN\n name code employee total_operating_revenue ratio\n0 通宝能源 600780.XSHG 4799 6.614815e+09 NaN\n name code employee total_operating_revenue ratio\n0 国电电力 600795.XSHG 36162 1.165993e+11 NaN\n name code employee total_operating_revenue ratio\n0 内蒙华电 600863.XSHG 6834 1.447716e+10 NaN\n name code employee total_operating_revenue ratio\n0 东方电气 600875.XSHG 17360 3.284032e+10 NaN\n name code employee total_operating_revenue ratio\n0 国投电力 600886.XSHG 9374 4.243346e+10 NaN\n name code employee total_operating_revenue ratio\n0 大唐发电 601991.XSHG 32976 9.545306e+10 NaN\n" 37 | } 38 | ], 39 | "source": [ 40 | "for i in range(874,875):\n", 41 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 42 | " print('********',conceptno,'********')\n", 43 | " # print(conceptno)\n", 44 | " try: \n", 45 | " # print('123')\n", 46 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 47 | " # print(stocks)\n", 48 | " for code1 in stocks: \n", 49 | " # print('123')\n", 50 | " # print(code)\n", 51 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 52 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 53 | " # print(conceptno,yuangong) \n", 54 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 55 | " \n", 56 | " # print(yuan_gong.fillna(1)) \n", 57 | " import pandas# 将两个表连接起来\n", 58 | " # print(pandas.merge(yuangong,yuan_gong))\n", 59 | " 
df=pandas.merge(yuangong,yuan_gong)\n", 60 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 61 | " print(df.head())\n", 62 | " \n", 63 | " except:\n", 64 | " print(\"~~~~~\",conceptno)\n", 65 | " pass" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# 激光\n", 73 | "GN811\t811\t激光" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "output_type": "stream", 83 | "name": "stdout", 84 | "text": "******** GN811 ********\n name code employee total_operating_revenue ratio\n0 紫光股份 000938.XSHE 14484 5.409906e+10 NaN\n name code employee total_operating_revenue ratio\n0 华工科技 000988.XSHE 6771 5.460245e+09 NaN\n name code employee total_operating_revenue ratio\n0 大族激光 002008.XSHE 13005 9.562627e+09 NaN\n name code employee total_operating_revenue ratio\n0 中光学 002189.XSHE 3225 2.552363e+09 NaN\n name code employee total_operating_revenue ratio\n0 福晶科技 002222.XSHE 1259 5.011457e+08 NaN\n name code employee total_operating_revenue ratio\n0 华明装备 002270.XSHE 1562 1.202520e+09 NaN\n name code employee total_operating_revenue ratio\n0 恒久科技 002808.XSHE 419 3.163791e+08 NaN\n name code employee total_operating_revenue ratio\n0 机器人 300024.XSHE 4559 2.745485e+09 NaN\n name code employee total_operating_revenue ratio\n0 迪威迅 300167.XSHE 975 5.252640e+08 NaN\n name code employee total_operating_revenue ratio\n0 金运激光 300220.XSHE 288 2.193012e+08 NaN\n name code employee total_operating_revenue ratio\n0 光韵达 300227.XSHE 1490 7.904286e+08 NaN\n name code employee total_operating_revenue ratio\n0 民德电子 300656.XSHE 232 3.053722e+08 NaN\n name code employee total_operating_revenue ratio\n0 锐科激光 300747.XSHE 2076 2.010159e+09 NaN\n name code employee total_operating_revenue ratio\n0 帝尔激光 300776.XSHE 440 6.999479e+08 NaN\n name code employee total_operating_revenue ratio\n0 大恒科技 600288.XSHG 2440 3.305798e+09 NaN\n name 
code employee total_operating_revenue ratio\n0 光峰科技 688007.XSHG 1246 1.979149e+09 NaN\n name code employee total_operating_revenue ratio\n0 杰普特 688025.XSHG 929 5.676799e+08 NaN\n name code employee total_operating_revenue ratio\n0 柏楚电子 688188.XSHG 224 3.760710e+08 NaN\n" 85 | } 86 | ], 87 | "source": [ 88 | "for i in range(811,812):\n", 89 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 90 | " print('********',conceptno,'********')\n", 91 | " # print(conceptno)\n", 92 | " try: \n", 93 | " # print('123')\n", 94 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 95 | " # print(stocks)\n", 96 | " for code1 in stocks: \n", 97 | " # print('123')\n", 98 | " # print(code)\n", 99 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 100 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 101 | " # print(conceptno,yuangong) \n", 102 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 103 | " \n", 104 | " # print(yuan_gong.fillna(1)) \n", 105 | " import pandas# 将两个表连接起来\n", 106 | " # print(pandas.merge(yuangong,yuan_gong))\n", 107 | " df=pandas.merge(yuangong,yuan_gong)\n", 108 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 109 | " print(df.head())\n", 110 | " \n", 111 | " except:\n", 112 | " print(\"~~~~~\",conceptno)\n", 113 | " pass" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "output_type": 
"execute_result", 123 | "data": { 124 | "text/plain": "['600288.XSHG',\n '688007.XSHG',\n '688025.XSHG',\n '688188.XSHG',\n '000988.XSHE',\n '002008.XSHE',\n '002077.XSHE',\n '002222.XSHE',\n '002559.XSHE',\n '002935.XSHE',\n '300220.XSHE',\n '300410.XSHE',\n '300620.XSHE',\n '300747.XSHE',\n '300776.XSHE']" 125 | }, 126 | "metadata": {}, 127 | "execution_count": 5 128 | } 129 | ], 130 | "source": [ 131 | "stock_laser_sina=['sh600288',\n", 132 | "'sh688007',\n", 133 | "'sh688025',\n", 134 | "'sh688188',\n", 135 | "'sz000988',\n", 136 | "'sz002008',\n", 137 | "'sz002077',\n", 138 | "'sz002222',\n", 139 | "'sz002559',\n", 140 | "'sz002935',\n", 141 | "'sz300220',\n", 142 | "'sz300410',\n", 143 | "'sz300620',\n", 144 | "'sz300747',\n", 145 | "'sz300776']\n", 146 | "\n", 147 | "stock_laser_sina=normalize_code(stock_laser_sina)\n", 148 | "stock_laser_sina" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 6, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "output_type": "stream", 158 | "name": "stdout", 159 | "text": "name code employee total_operating_revenue ratio\n0 大恒科技 600288.XSHG 2440 3.305798e+09 NaN\n name code employee total_operating_revenue ratio\n0 光峰科技 688007.XSHG 1246 1.979149e+09 NaN\n name code employee total_operating_revenue ratio\n0 杰普特 688025.XSHG 929 5.676799e+08 NaN\n name code employee total_operating_revenue ratio\n0 柏楚电子 688188.XSHG 224 3.760710e+08 NaN\n name code employee total_operating_revenue ratio\n0 华工科技 000988.XSHE 6771 5.460245e+09 NaN\n name code employee total_operating_revenue ratio\n0 大族激光 002008.XSHE 13005 9.562627e+09 NaN\n name code employee total_operating_revenue ratio\n0 *ST大港 002077.XSHE 836 9.323965e+08 NaN\n name code employee total_operating_revenue ratio\n0 福晶科技 002222.XSHE 1259 5.011457e+08 NaN\n name code employee total_operating_revenue ratio\n0 亚威股份 002559.XSHE 1685 1.468130e+09 NaN\n name code employee total_operating_revenue ratio\n0 天奥电子 002935.XSHE 579 8.669056e+08 NaN\n name code 
employee total_operating_revenue ratio\n0 金运激光 300220.XSHE 288 2.193012e+08 NaN\n name code employee total_operating_revenue ratio\n0 正业科技 300410.XSHE 1731 1.045970e+09 NaN\n name code employee total_operating_revenue ratio\n0 光库科技 300620.XSHE 1046 3.907800e+08 NaN\n name code employee total_operating_revenue ratio\n0 锐科激光 300747.XSHE 2076 2.010159e+09 NaN\n name code employee total_operating_revenue ratio\n0 帝尔激光 300776.XSHE 440 6.999479e+08 NaN\n" 160 | } 161 | ], 162 | "source": [ 163 | "for i in range(811,812):\n", 164 | " conceptno='GN'+str(i).rjust(3,'0') # 加载概念股清单\n", 165 | " # print('********',conceptno,'********')\n", 166 | " # print(conceptno)\n", 167 | " try: \n", 168 | " # print('123')\n", 169 | " stocks=get_concept_stocks(conceptno, date=None) # 员工人数\n", 170 | " # print(stocks)\n", 171 | " for code1 in stock_laser_sina: \n", 172 | " # for code1 in stocks: \n", 173 | " # print('123')\n", 174 | " # print(code)\n", 175 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 176 | " # yuan_gong=finance.run_query(query(finance.STK_EMPLOYEE_INFO).filter(finance.STK_EMPLOYEE_INFO.code==code,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(1000)) \n", 177 | " # print(conceptno,yuangong) \n", 178 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 179 | " \n", 180 | " # print(yuan_gong.fillna(1)) \n", 181 | " import pandas# 将两个表连接起来\n", 182 | " # print(pandas.merge(yuangong,yuan_gong))\n", 183 | " df=pandas.merge(yuangong,yuan_gong)\n", 184 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 185 | " print(df.head())\n", 
186 | " \n", 187 | " except:\n", 188 | " print(\"~~~~~\",conceptno)\n", 189 | " pass" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## SaaS" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 3, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "stock_saas_sina=['sh600588','sh600602','sh600756','sh603039','sz000606','sz002123','sz002137','sz002279','sz002301','sz002467','sz002530','sz300051','sz300170','sz300250','sz300271','sz300352','sz300365','sz300386']\n", 206 | "stock_saas_sina=normalize_code(stock_saas_sina)\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "output_type": "stream", 216 | "name": "stdout", 217 | "text": "name code employee total_operating_revenue ratio\n0 用友网络 600588.XSHG 17271 8.509660e+09 NaN\n name code employee total_operating_revenue ratio\n0 云赛智联 600602.XSHG 2287 4.889124e+09 NaN\n name code employee total_operating_revenue ratio\n0 浪潮软件 600756.XSHG 1899 1.455590e+09 NaN\n name code employee total_operating_revenue ratio\n0 泛微网络 603039.XSHG 1237 1.286034e+09 NaN\n name code employee total_operating_revenue ratio\n0 顺利办 000606.XSHE 2143 2.025038e+09 NaN\n name code employee total_operating_revenue ratio\n0 梦网集团 002123.XSHE 1040 3.200697e+09 NaN\n name code employee total_operating_revenue ratio\n0 麦达数字 002137.XSHE 777 7.630292e+08 NaN\n name code employee total_operating_revenue ratio\n0 久其软件 002279.XSHE 3208 3.101650e+09 NaN\n name code employee total_operating_revenue ratio\n0 齐心集团 002301.XSHE 3233 5.981270e+09 NaN\n name code employee total_operating_revenue ratio\n0 二六三 002467.XSHE 1060 1.013566e+09 NaN\n name code employee total_operating_revenue ratio\n0 金财互联 002530.XSHE 3770 1.262451e+09 NaN\n name code employee total_operating_revenue ratio\n0 三五互联 300051.XSHE 660 2.883086e+08 NaN\n name code employee total_operating_revenue ratio\n0 汉得信息 
300170.XSHE 10355 2.723441e+09 NaN\n name code employee total_operating_revenue ratio\n0 初灵信息 300250.XSHE 812 4.858349e+08 NaN\n name code employee total_operating_revenue ratio\n0 华宇软件 300271.XSHE 7339 3.510148e+09 NaN\n name code employee total_operating_revenue ratio\n0 北信源 300352.XSHE 1346 7.219824e+08 NaN\n name code employee total_operating_revenue ratio\n0 恒华科技 300365.XSHE 1059 1.123172e+09 NaN\n name code employee total_operating_revenue ratio\n0 飞天诚信 300386.XSHE 932 9.396205e+08 NaN\n" 218 | } 219 | ], 220 | "source": [ 221 | "try: \n", 222 | " for code1 in stock_saas_sina: \n", 223 | " yuangong=finance.run_query(query(finance.STK_EMPLOYEE_INFO.name,finance.STK_EMPLOYEE_INFO.code,finance.STK_EMPLOYEE_INFO.employee).filter(finance.STK_EMPLOYEE_INFO.code==code1,finance.STK_EMPLOYEE_INFO.pub_date>='2019-12-01').limit(10))\n", 224 | "\n", 225 | " yuan_gong=finance.run_query(query(finance.STK_INCOME_STATEMENT.code,finance.STK_INCOME_STATEMENT.total_operating_revenue).filter(finance.STK_INCOME_STATEMENT.code==code1,finance.STK_INCOME_STATEMENT.end_date=='2019-12-31').limit(1000))\n", 226 | " \n", 227 | " # print(yuan_gong.fillna(1)) \n", 228 | " import pandas# 将两个表连接起来\n", 229 | " # print(pandas.merge(yuangong,yuan_gong))\n", 230 | " df=pandas.merge(yuangong,yuan_gong)\n", 231 | " df['ratio'] = df['total_operating_revenue'].div(df.groupby('name')['total_operating_revenue'].shift(1))\n", 232 | " print(df.head())\n", 233 | " \n", 234 | "except:\n", 235 | " print(\"~~~~~\")\n", 236 | " pass" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.7.4-final" 252 | }, 253 | "orig_nbformat": 2, 254 | "kernelspec": { 255 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 256 | 
"display_name": "Python 3.7.4 32-bit" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 2 261 | } -------------------------------------------------------------------------------- /jqdata20200601.py: -------------------------------------------------------------------------------- 1 | import jqdatasdk 2 | jqdatasdk.auth("13141315365", "315365") 3 | # jqdatasdk.get_price("000001.XSHE") 4 | # print(jqdatasdk.get_price("000001.XSHE", start_date="2017-01-01", end_date="2017-12-31")) 5 | 6 | 7 | # # 获取所有沪深300的股票 8 | # # stocks = jqdatasdk.get_index_stocks('000300.XSHG') 9 | # # print(stocks) 10 | 11 | # normalize_code-股票代码格式转化 12 | # 将其他形式的股票代码转换为jqdatasdk函数可用的股票代码形式。仅适用于A股市场股票代码、期货以及基金代码,支持传入单只股票或一个股票list 示例 13 | # #输入 14 | # normalize_code(['000001', 'SZ000001', '000001SZ', '000001.sz', '000001.XSHE']) 15 | # #输出 16 | # ['000001.XSHE', '000001.XSHE', '000001.XSHE', '000001.XSHE', '000001.XSHE'] 17 | 18 | # # 查询'000001.XSHE'的所有市值数据, 时间是2015-10-15 19 | # q = jqdatasdk.query( 20 | # jqdatasdk.valuation 21 | # ).filter( 22 | # jqdatasdk.valuation.code == '000001.XSHE' 23 | # ) 24 | # df = jqdatasdk.get_fundamentals(q, '2015-10-15') 25 | # # 打印出总市值 26 | # print(df['market_cap'][0]) 27 | 28 | # # 查询平安银行2014年的年报 29 | # q = jqdatasdk.query( 30 | # jqdatasdk.income.statDate, 31 | # jqdatasdk.income.code, 32 | # jqdatasdk.income.basic_eps, 33 | # jqdatasdk.cash_flow.goods_sale_and_service_render_cash 34 | # ).filter( 35 | # jqdatasdk.income.code == '000001.XSHE', 36 | # ) 37 | # 38 | # ret = jqdatasdk.get_fundamentals(q, statDate='2014') 39 | # print(ret) 40 | 41 | # ####资产负债表######### 42 | # ####资产负债表######### 43 | # ####资产负债表######### 44 | # ####资产负债表######### 45 | # ####资产负债表######### 46 | # # fixed_assets 固定资产 47 | # # total_assets 资产总计 48 | # from jqdatasdk import finance 49 | # 50 | # # codeqianyuandianli="002039.xshe" 51 | # # codeqianyuandianli=jqdatasdk.normalize_code('002039') 52 | # 
codeshuidian=['sh600900','sh600025','sh600886','sh600236','002039','sh600674','sz000722','sz000883','000791.SZ','000993','000601','600868'] 53 | # for code in codeshuidian: 54 | # codeqianyuandianli=jqdatasdk.normalize_code(code) 55 | # print("\n*********",code,"***********************") 56 | # qianyuandianli=finance.run_query(jqdatasdk.query(finance.STK_BALANCE_SHEET.fixed_assets, 57 | # finance.STK_BALANCE_SHEET.total_assets, 58 | # finance.STK_BALANCE_SHEET.code, 59 | # finance.STK_BALANCE_SHEET.cash_equivalents, 60 | # finance.STK_BALANCE_SHEET.pub_date, 61 | # finance.STK_BALANCE_SHEET.start_date, 62 | # finance.STK_BALANCE_SHEET.end_date, 63 | # finance.STK_BALANCE_SHEET.company_name 64 | # ).filter(finance.STK_BALANCE_SHEET.code==codeqianyuandianli, 65 | # finance.STK_BALANCE_SHEET.pub_date>='2010-01-01', 66 | # finance.STK_BALANCE_SHEET.report_type==0 67 | # ).limit(50)) 68 | # print(qianyuandianli) 69 | 70 | ###### 有息负债 ################ 71 | ###### 有息负债 ################ 72 | ###### 有息负债 ################ 73 | ###### 有息负债 ################ 74 | # shortterm_loan 短期借款 decimal(20,4) 75 | # non_current_liability_in_one_year 一年内到期的非流动负债 decimal(20,4) 76 | # longterm_loan 长期借款 decimal(20,4) 77 | # bonds_payable 应付债券 decimal(20,4) 78 | import pandas 79 | # qianyuandianli=pandas.DataFrame(columns=['A', 'B', 'C', 'D','A', 'B', 'C', 'D','A']) 80 | from jqdatasdk import finance 81 | codeshuidian=['sh600900','sh600025','sh600886','sh600236','002039','sh600674','sz000722','sz000883','000791.SZ','000993','000601','600868'] 82 | for code in codeshuidian: 83 | codeqianyuandianli=jqdatasdk.normalize_code(code) 84 | # print("\n*********",code,"***********************") 85 | qianyuandianli1=finance.run_query(jqdatasdk.query(finance.STK_BALANCE_SHEET.shortterm_loan, 86 | finance.STK_BALANCE_SHEET.longterm_loan, 87 | finance.STK_BALANCE_SHEET.non_current_liability_in_one_year, 88 | finance.STK_BALANCE_SHEET.bonds_payable, 89 | finance.STK_BALANCE_SHEET.total_assets, 90 | 
finance.STK_BALANCE_SHEET.code,finance.STK_BALANCE_SHEET.pub_date, 91 | finance.STK_BALANCE_SHEET.start_date, 92 | finance.STK_BALANCE_SHEET.end_date, 93 | finance.STK_BALANCE_SHEET.company_name 94 | ).filter(finance.STK_BALANCE_SHEET.code==codeqianyuandianli, 95 | finance.STK_BALANCE_SHEET.pub_date>='2010-01-01', 96 | finance.STK_BALANCE_SHEET.report_type==0 97 | ).limit(50)) 98 | qianyuandianli1=qianyuandianli1.fillna(0) 99 | # qianyuandianli1.append(qianyuandianli1) 100 | print((qianyuandianli1['shortterm_loan'] 101 | +qianyuandianli1['longterm_loan'] 102 | +qianyuandianli1['non_current_liability_in_one_year'] 103 | +qianyuandianli1['bonds_payable']) 104 | /qianyuandianli1['total_assets'],qianyuandianli1['company_name']) 105 | # qianyuandianli1.to_excel('C:/TEMP/Excel/loan_2010-2020_hydro_power_20200601.xlsx') 106 | 107 | -------------------------------------------------------------------------------- /jqdata_20200608_electricity_ultility.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 使用JQData获取企业的营收、资产、所在行业" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "output_type": "stream", 17 | "name": "stdout", 18 | "text": "提示：当前环境pandas版本为0.25，get_price与get_fundamentals_continuously接口panel参数将固定为False\n注意：0.25以上版本pandas不支持panel，如使用该数据结构和相关函数请注意修改\nauth success \n" 19 | } 20 | ], 21 | "source": [ 22 | "import jqdatasdk\n", 23 | "from jqdatasdk import *\n", 24 | "jqdatasdk.auth(\"13141315365\", \"315365\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 电力" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 9, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "name": "stdout", 42 | "text": 
"http://f10.eastmoney.com/f10_v2/ShareholderResearch.aspx?code=sh600900#sdgd-0\nhttp://f10.eastmoney.com/f10_v2/ShareholderResearch.aspx?code=sz001896#sdgd-0\n" 43 | } 44 | ], 45 | "source": [ 46 | "dian_li_stocks0=['sh600900','sz001896','sh600995','sz000939','sh600780','sz000720','sh600310','sh600163','sh600868','sh600098','sh600483','sh600886','sz000531','sz000591','sz000966','sh600452','sh600972','sz000791','sh600101','sz000875','sh600642','sh600982','sz002608','sh603693','sz000155','sz000722','sh601985','sz002499','sz002039','sz000543','sh600969','sh600724','sh600396','sh600509','sh600578','sh600674','sz000883','sh600236','sh601619','sz000958','sh600505','sh600744','sz300125','sz000993','sh601016','sh600644','sz000767','sz002616']\n", 47 | "for stock0 in dian_li_stocks0[:2]:\n", 48 | " url='http://f10.eastmoney.com/f10_v2/ShareholderResearch.aspx?code='+stock0+'#sdgd-0'\n", 49 | " print(url)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 13, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "name": "stdout", 60 | "text": "{'industry_code': 'D44', 'industry_name': '电力、热力生产和供应业'}\n{'industry_code': 'D44', 'industry_name': '电力、热力生产和供应业'}\n{'industry_code': 'D44', 'industry_name': '电力、热力生产和供应业'}\n{'industry_code': 'D44', 'industry_name': '电力、热力生产和供应业'}\n{'industry_code': 'D44', 'industry_name': '电力、热力生产和供应业'}\n" 61 | } 62 | ], 63 | "source": [ 64 | "dian_li_stocks=normalize_code(dian_li_stocks0)\n", 65 | "for stock in dian_li_stocks[:5]:\n", 66 | " print(get_industry(stock, date=None)[stock]['zjw'])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 15, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "name": "stdout", 77 | "text": "operating_revenue total_assets code\n0 4.243346e+10 2.247215e+11 600886.XSHG\n operating_revenue total_assets code\n0 3.228594e+09 1.162818e+10 000531.XSHE\n operating_revenue total_assets code\n0 5.011085e+09 3.744986e+10 
000591.XSHE\n operating_revenue total_assets code\n0 7.366107e+09 9.743659e+09 000966.XSHE\n" 78 | } 79 | ], 80 | "source": [ 81 | "for stock in dian_li_stocks[11:15]:\n", 82 | " q = query(\n", 83 | " income.operating_revenue,\n", 84 | " balance.total_assets,\n", 85 | " income.code,\n", 86 | " ).filter(\n", 87 | " income.code == stock,\n", 88 | " )\n", 89 | " ret = get_fundamentals(q, statDate='2019')\n", 90 | " print(ret)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## 公用事业" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 2, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "gong_yong_stocks0=['sh603903','sh600635','sh600681','sh600617','sz000669','sh600856','sh600874','sz300388','sh601368','sz000593','sz002259','sz000421','sh600283','sh601158','sz002267','sh600917','sz000598','sh603817','sh600008','sz000605','sh603393','sh600168','sz300664','sh600167','sh600719','sh600187',]\n", 107 | "gong_yong_stocks=normalize_code(gong_yong_stocks0)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 12, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "output_type": "stream", 117 | "name": "stdout", 118 | "text": "{'industry_code': 'N77', 'industry_name': '生态保护和环境治理业'}\n{'industry_code': 'D45', 'industry_name': '燃气生产和供应业'}\n{'industry_code': 'D45', 'industry_name': '燃气生产和供应业'}\n{'industry_code': 'D45', 'industry_name': '燃气生产和供应业'}\n{'industry_code': 'D45', 'industry_name': '燃气生产和供应业'}\n" 119 | } 120 | ], 121 | "source": [ 122 | "for stock in gong_yong_stocks[:5]:\n", 123 | " print(get_industry(stock, date=None)[stock]['zjw'])" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 17, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "output_type": "stream", 133 | "name": "stdout", 134 | "text": "operating_revenue total_assets code\n0 1.336815e+09 2.924176e+09 603903.XSHG\n operating_revenue total_assets code\n0 
5.459800e+09 2.256540e+10 600635.XSHG\n operating_revenue total_assets code\n0 4.881559e+09 9.147336e+09 600681.XSHG\n operating_revenue total_assets code\n0 1.090126e+10 2.951050e+10 600617.XSHG\n operating_revenue total_assets code\n0 3.727926e+09 9.234693e+09 000669.XSHE\n" 135 | } 136 | ], 137 | "source": [ 138 | "for stock in gong_yong_stocks[:5]:\n", 139 | " q = query(\n", 140 | " income.operating_revenue,\n", 141 | " balance.total_assets,\n", 142 | " income.code,\n", 143 | " ).filter(\n", 144 | " income.code == stock,\n", 145 | " )\n", 146 | " print(get_fundamentals(q, statDate='2019'))" 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.7.4-final" 162 | }, 163 | "orig_nbformat": 2, 164 | "kernelspec": { 165 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 166 | "display_name": "Python 3.7.4 32-bit" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 2 171 | } -------------------------------------------------------------------------------- /jqdata_stock_shareholders_20200608.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 获取上市公司股东和股本信息" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "output_type": "stream", 17 | "name": "stdout", 18 | "text": "提示：当前环境pandas版本为0.25，get_price与get_fundamentals_continuously接口panel参数将固定为False\n注意：0.25以上版本pandas不支持panel，如使用该数据结构和相关函数请注意修改\nauth success \n" 19 | } 20 | ], 21 | "source": [ 22 | "import jqdatasdk\n", 23 | "from jqdatasdk import *\n", 24 | "jqdatasdk.auth(\"13141315365\", \"315365\")" 25 | ] 26 | }, 27 | { 
28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### 十大股东" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import numpy" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "output_type": "stream", 50 | "name": "stdout", 51 | "text": "id company_name company_id code end_date pub_date \\\n0 1004153 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n1 1004154 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n2 1004155 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n3 1004156 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n4 1004157 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n5 1004158 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n6 1004160 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n7 1004161 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n8 1004162 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n9 1004163 中持水务股份有限公司 300030530 603903.XSHG 2017-02-22 2017-02-22 \n\n change_reason_id change_reason shareholder_rank \\\n0 306025 招股说明书 1 \n1 306025 招股说明书 2 \n2 306025 招股说明书 3 \n3 306025 招股说明书 4 \n4 306025 招股说明书 5 \n5 306025 招股说明书 6 \n6 306025 招股说明书 7 \n7 306025 招股说明书 8 \n8 306025 招股说明书 9 \n9 306025 招股说明书 10 \n\n shareholder_name ... shareholder_id shareholder_class_id \\\n0 中持（北京）环保发展有限公司 ... 100163287 307099 \n1 启明创富投资有限公司 ... 100163270 307099 \n2 苏州纪源科星股权投资合伙企业（有限合伙） ... 100110004 307099 \n3 SCC Venture 2010（HK）Limited ... 100163290 307099 \n4 上海联新二期股权投资中心（有限合伙） ... 100163273 307099 \n5 许国栋 ... 100163283 307001 \n6 苏州启明创智股权投资合伙企业（有限合伙） ... 100128674 307099 \n7 北极光早期创业投资企业 ... 100163263 307013 \n8 启明亚洲投资有限公司 ... 100163271 307099 \n9 北极光创业投资企业 ... 
100115932 307013 \n\n shareholder_class share_number share_ratio sharesnature_id sharesnature \\\n0 其他机构 24138324.0 31.42 308001 境内法人股 \n1 其他机构 9000000.0 11.71 308002 境外法人股 \n2 其他机构 7113750.0 9.26 308001 境内法人股 \n3 其他机构 6000000.0 7.81 308002 境外法人股 \n4 其他机构 4713774.0 6.14 308001 境内法人股 \n5 自然人 4500000.0 5.86 308003 自然人持股 \n6 其他机构 3048750.0 3.97 308001 境内法人股 \n7 风险投资 2776096.0 3.61 308001 境内法人股 \n8 其他机构 2168825.0 2.82 308002 境外法人股 \n9 风险投资 2082072.0 2.71 308001 境内法人股 \n\n share_pledge_freeze share_pledge share_freeze \n0 None None None \n1 None None None \n2 None None None \n3 None None None \n4 None None None \n5 None None None \n6 None None None \n7 None None None \n8 None None None \n9 None None None \n\n[10 rows x 21 columns]\n" 52 | } 53 | ], 54 | "source": [ 55 | "#指定查询对象为中持股份的十大股东情况，限定返回条数为10条\n", 56 | "code1=normalize_code(\"603903\")#中持股份\n", 57 | "q=query(finance.STK_SHAREHOLDER_TOP10).filter(finance.STK_SHAREHOLDER_TOP10.code==code1,finance.STK_SHAREHOLDER_TOP10.pub_date>'2015-01-01').limit(10)\n", 58 | "df=finance.run_query(q)\n", 59 | "print(df)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### 十大流通股东" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "tags": [ 74 | "outputPrepend" 75 | ] 76 | }, 77 | "outputs": [ 78 | { 79 | "output_type": "stream", 80 | "name": "stdout", 81 | "text": "id company_id company_name code end_date pub_date \\\n0 4494014 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n1 4494015 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n2 4494016 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n3 4494017 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n4 4494018 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n5 4494019 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n6 4494020 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n7 4494021 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 
2019-04-19 \n8 4494022 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n9 4494023 300030530 中持水务股份有限公司 603903.XSHG 2018-12-31 2019-04-19 \n\n change_reason_id change_reason shareholder_rank shareholder_id \\\n0 306019 定期报告 1 100163270 \n1 306019 定期报告 2 100110004 \n2 306019 定期报告 3 100163273 \n3 306019 定期报告 4 100163290 \n4 306019 定期报告 5 100128674 \n5 306019 定期报告 6 100181268 \n6 306019 定期报告 7 100163271 \n7 306019 定期报告 8 100147702 \n8 306019 定期报告 9 100001386 \n9 306019 定期报告 10 100151991 \n\n shareholder_name shareholder_name_en shareholder_class_id \\\n0 启明创富投资有限公司 None 307099 \n1 苏州纪源科星股权投资合伙企业(有限合伙) None 307099 \n2 上海联新二期股权投资中心(有限合伙) None 307099 \n3 SCC VENTURE 2010 (HK) LIMITED None 307099 \n4 苏州启明创智股权投资合伙企业(有限合伙) None 307099 \n5 邵凯 None 307001 \n6 启明亚洲投资有限公司 None 307099 \n7 张翼飞 None 307001 \n8 陆晋泉 None 307001 \n9 陈德清 None 307001 \n\n shareholder_class share_number share_ratio sharesnature_id sharesnature \n0 其他机构 7800000.0 7.548 308007 流通A股 \n1 其他机构 6080751.0 5.884 308007 流通A股 \n2 其他机构 3978874.0 3.850 308007 流通A股 \n3 其他机构 3933499.0 3.807 308007 流通A股 \n4 其他机构 2448750.0 2.370 308007 流通A股 \n5 自然人 1800000.0 1.742 308007 流通A股 \n6 其他机构 1568825.0 1.518 308007 流通A股 \n7 自然人 1350000.0 1.306 308007 流通A股 \n8 自然人 1300000.0 1.258 308007 流通A股 \n9 自然人 1200000.0 1.161 308007 流通A股 \n" 82 | } 83 | ], 84 | "source": [ 85 | "q=query(finance.STK_SHAREHOLDER_FLOATING_TOP10).filter(finance.STK_SHAREHOLDER_FLOATING_TOP10.code==code1,finance.STK_SHAREHOLDER_FLOATING_TOP10.pub_date>'2019-01-01').limit(10)\n", 86 | "df=finance.run_query(q)\n", 87 | "print(df)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "了解更多请点击https://www.joinquant.com/help/api/help?name=JQData#%E5%8D%81%E5%A4%A7%E8%82%A1%E4%B8%9C" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 
| "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.7.4-final" 110 | }, 111 | "orig_nbformat": 2, 112 | "kernelspec": { 113 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 114 | "display_name": "Python 3.7.4 32-bit" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } -------------------------------------------------------------------------------- /jqdata_获取报告期财务数据_20200610.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "output_type": "stream", 10 | "name": "stdout", 11 | "text": "提示：当前环境pandas版本为0.25，get_price与get_fundamentals_continuously接口panel参数将固定为False\n注意：0.25以上版本pandas不支持panel，如使用该数据结构和相关函数请注意修改\nauth success \n" 12 | } 13 | ], 14 | "source": [ 15 | "import jqdatasdk\n", 16 | "from jqdatasdk import *\n", 17 | "jqdatasdk.auth(\"13141315365\", \"315365\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "output_type": "stream", 27 | "name": "stdout", 28 | "text": "['000016.XSHE', '000020.XSHE', '000021.XSHE', '000045.XSHE', '000050.XSHE', '000063.XSHE', '000066.XSHE', '000068.XSHE', '000100.XSHE', '000413.XSHE']\n" 29 | } 30 | ], 31 | "source": [ 32 | "# 计算机、通信和其他电子设备制造业\n", 33 | "print(get_industry_stocks(\"C39\", date=None)[:10])" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "### 业绩预告示例" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "output_type": "stream", 50 | "name": "stdout", 51 | "text": "id company_id code name end_date report_type_id \\\n0 295753 430000016 000016.XSHE 康佳集团股份有限公司 2015-06-30 304002 \n1 295754 430000016 000016.XSHE 康佳集团股份有限公司 2015-09-30 304003 \n\n report_type pub_date type_id type profit_min 
profit_max \\\n0 中报预告 2015-07-11 305007 业绩预亏 -300000000.0 -240000000.0 \n1 三季度预告 2015-10-15 305007 业绩预亏 -880000000.0 -830000000.0 \n\n profit_last profit_ratio_min profit_ratio_max \\\n0 45360110.62 -761.3740 -629.0992 \n1 47579255.05 -1949.5456 -1844.4577 \n\n content \n0 预计2015年1-6月归属于上市公司股东的净利润亏损:24,000万元–30,000万元 \n1 预计2015年1-9月归属于上市公司股东的净利润亏损:83,000万元–88,000万元 \n" 52 | } 53 | ], 54 | "source": [ 55 | "stock_electrical=get_industry_stocks(\"C39\", date=None)\n", 56 | "from jqdatasdk import finance\n", 57 | "for code in stock_electrical[:1]:\n", 58 | " q=query(finance.STK_FIN_FORCAST).filter(finance.STK_FIN_FORCAST.code==code,finance.STK_FIN_FORCAST.pub_date>='2015-01-01').limit(2)\n", 59 | " df=finance.run_query(q)\n", 60 | " print(df) " 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### 合并利润表" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "**营业收入** \n", 75 | "total_operating_revenue\t营业总收入\tdecimal(20,4) \n", 76 | "operating_revenue\t营业收入\tdecimal(20,4)\t" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 11, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "name": "stdout", 87 | "text": "total_operating_revenue operating_revenue company_name code \\\n0 1.942349e+10 1.942349e+10 康佳集团股份有限公司 000016.XSHE \n1 4.569152e+09 4.569152e+09 康佳集团股份有限公司 000016.XSHE \n\n pub_date start_date end_date report_date report_type \n0 2015-04-03 2014-01-01 2014-12-31 2014-12-31 0 \n1 2015-04-29 2015-01-01 2015-03-31 2015-03-31 0 \n" 88 | } 89 | ], 90 | "source": [ 91 | "for code in stock_electrical[:1]:\n", 92 | " q=query(finance.STK_INCOME_STATEMENT.total_operating_revenue,finance.STK_INCOME_STATEMENT.operating_revenue,finance.STK_INCOME_STATEMENT.company_name, finance.STK_INCOME_STATEMENT.code, finance.STK_INCOME_STATEMENT.pub_date, finance.STK_INCOME_STATEMENT.start_date, finance.STK_INCOME_STATEMENT.end_date, 
finance.STK_INCOME_STATEMENT.report_date, finance.STK_INCOME_STATEMENT.report_type).filter(finance.STK_INCOME_STATEMENT.code==code,finance.STK_INCOME_STATEMENT.pub_date>='2015-01-01',finance.STK_INCOME_STATEMENT.report_type==0).limit(2)\n", 93 | " df=finance.run_query(q)\n", 94 | " print(df)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "**营业税金及附加** \n", 102 | "operating_tax_surcharges\t营业税金及附加\tdecimal(20,4)\t" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 12, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "output_type": "stream", 112 | "name": "stdout", 113 | "text": "operating_tax_surcharges company_name code pub_date start_date \\\n0 60527648.50 康佳集团股份有限公司 000016.XSHE 2015-04-03 2014-01-01 \n1 13913541.44 康佳集团股份有限公司 000016.XSHE 2015-04-29 2015-01-01 \n\n end_date report_date report_type \n0 2014-12-31 2014-12-31 0 \n1 2015-03-31 2015-03-31 0 \n" 114 | } 115 | ], 116 | "source": [ 117 | "for code in stock_electrical[:1]:\n", 118 | " q=query(finance.STK_INCOME_STATEMENT.operating_tax_surcharges,finance.STK_INCOME_STATEMENT.company_name, finance.STK_INCOME_STATEMENT.code, finance.STK_INCOME_STATEMENT.pub_date, finance.STK_INCOME_STATEMENT.start_date, finance.STK_INCOME_STATEMENT.end_date, finance.STK_INCOME_STATEMENT.report_date, finance.STK_INCOME_STATEMENT.report_type).filter(finance.STK_INCOME_STATEMENT.code==code,finance.STK_INCOME_STATEMENT.pub_date>='2015-01-01',finance.STK_INCOME_STATEMENT.report_type==0).limit(2)\n", 119 | " df=finance.run_query(q)\n", 120 | " print(df)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "**营业成本** \n", 128 | "total_operating_cost\t营业总成本\tdecimal(20,4) \n", 129 | "operating_cost\t营业成本\tdecimal(20,4)\t" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "name": "stdout", 
140 | "text": "total_operating_cost operating_cost company_name code pub_date \\\n0 2.016998e+10 1.673375e+10 康佳集团股份有限公司 000016.XSHE 2015-04-03 \n1 4.578328e+09 3.861909e+09 康佳集团股份有限公司 000016.XSHE 2015-04-29 \n\n start_date end_date report_date report_type \n0 2014-01-01 2014-12-31 2014-12-31 0 \n1 2015-01-01 2015-03-31 2015-03-31 0 \n" 141 | } 142 | ], 143 | "source": [ 144 | "for code in stock_electrical[:1]:\n", 145 | " q=query(finance.STK_INCOME_STATEMENT.total_operating_cost,finance.STK_INCOME_STATEMENT.operating_cost, finance.STK_INCOME_STATEMENT.company_name, finance.STK_INCOME_STATEMENT.code, finance.STK_INCOME_STATEMENT.pub_date, finance.STK_INCOME_STATEMENT.start_date, finance.STK_INCOME_STATEMENT.end_date, finance.STK_INCOME_STATEMENT.report_date, finance.STK_INCOME_STATEMENT.report_type).filter(finance.STK_INCOME_STATEMENT.code==code,finance.STK_INCOME_STATEMENT.pub_date>='2015-01-01',finance.STK_INCOME_STATEMENT.report_type==0).limit(2)\n", 146 | " df=finance.run_query(q)\n", 147 | " print(df)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "**费用** \n", 155 | "- sale_expense\t销售费用\tdecimal(20,4) \t \n", 156 | "- administration_expense\t管理费用\tdecimal(20,4)\t \n", 157 | "- exploration_expense\t堪探费用\tdecimal(20,4)\t勘探费用用于核算企业（石油天然气开采）核算的油气勘探过程中发生的地质调查、物理化学勘探各项支出和非成功探井等支出。 \n", 158 | "- financial_expense\t财务费用\tdecimal(20,4)\t\t" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 13, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "output_type": "stream", 168 | "name": "stdout", 169 | "text": "sale_expense administration_expense exploration_expense \\\n0 2.414468e+09 6.869304e+08 None \n1 4.797445e+08 1.462263e+08 None \n\n financial_expense company_name code pub_date start_date \\\n0 1.327638e+08 康佳集团股份有限公司 000016.XSHE 2015-04-03 2014-01-01 \n1 7.655130e+07 康佳集团股份有限公司 000016.XSHE 2015-04-29 2015-01-01 \n\n end_date report_date report_type \n0 2014-12-31 2014-12-31 0 
\n1 2015-03-31 2015-03-31 0 \n" 170 | } 171 | ], 172 | "source": [ 173 | "for code in stock_electrical[:1]:\n", 174 | " q=query(finance.STK_INCOME_STATEMENT.sale_expense,finance.STK_INCOME_STATEMENT.administration_expense,finance.STK_INCOME_STATEMENT.exploration_expense,finance.STK_INCOME_STATEMENT.financial_expense,finance.STK_INCOME_STATEMENT.company_name, finance.STK_INCOME_STATEMENT.code, finance.STK_INCOME_STATEMENT.pub_date, finance.STK_INCOME_STATEMENT.start_date, finance.STK_INCOME_STATEMENT.end_date, finance.STK_INCOME_STATEMENT.report_date, finance.STK_INCOME_STATEMENT.report_type).filter(finance.STK_INCOME_STATEMENT.code==code,finance.STK_INCOME_STATEMENT.pub_date>='2015-01-01',finance.STK_INCOME_STATEMENT.report_type==0).limit(2)\n", 175 | " df=finance.run_query(q)\n", 176 | " print(df)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "**** \n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "for code in stock_electrical[:1]:\n", 195 | " q=query(finance.STK_INCOME_STATEMENT.total_operating_revenue,finance.STK_INCOME_STATEMENT.company_name, finance.STK_INCOME_STATEMENT.code, finance.STK_INCOME_STATEMENT.pub_date, finance.STK_INCOME_STATEMENT.start_date, finance.STK_INCOME_STATEMENT.end_date, finance.STK_INCOME_STATEMENT.report_date, finance.STK_INCOME_STATEMENT.report_type).filter(finance.STK_INCOME_STATEMENT.code==code,finance.STK_INCOME_STATEMENT.pub_date>='2015-01-01',finance.STK_INCOME_STATEMENT.report_type==0).limit(2)\n", 196 | " df=finance.run_query(q)\n", 197 | " print(df)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# 字段名称\t中文名称\t字段类型\t含义\n", 207 | "# company_id\t公司ID\tint\t\n", 208 | "# company_name\t公司名称\tvarchar(100)\t\n", 209 | "# 
code\t股票代码\tvarchar(12)\t\n", 210 | "# a_code\tA股代码\tvarchar(12)\t\n", 211 | "# b_code\tB股代码\tvarchar(12)\t\n", 212 | "# h_code\tH股代码\tvarchar(12)\t\n", 213 | "# pub_date\t公告日期\tdate\t\n", 214 | "# start_date\t开始日期\tdate\t\n", 215 | "# end_date\t截止日期\tdate\t\n", 216 | "# report_date\t报告期\tdate\t\n", 217 | "# report_type\t报告期类型\tint\t0：本期，1：上期\n", 218 | "# source_id\t报表来源编码\tint\t如下报表来源编码\n", 219 | "# source\t报表来源\tvarchar(60)\t选择时程序自动填入" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.7.4-final" 235 | }, 236 | "orig_nbformat": 2, 237 | "kernelspec": { 238 | "name": "python37432bit6a4013f1a8ec4766a4af9787f2730623", 239 | "display_name": "Python 3.7.4 32-bit" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } -------------------------------------------------------------------------------- /pachongchuangguan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | import re 5 | 6 | website = 'http://www.heibanke.com/lesson/crawler_ex00/' 7 | ruler = re.compile(r'数字[^\d]*(\d+)[\.<]') 8 | 9 | html = requests.get(website).content 10 | number = ruler.findall(html) 11 | index = 1 12 | while number: 13 | website2 = website + number[0] 14 | html = requests.get(website2).content 15 | number = ruler.findall(html) 16 | print "访问网页%d: %s" %(index, website2) 17 | index += 1 18 | else: 19 | print "\n下一关的入口: %s" % website2 -------------------------------------------------------------------------------- /pccg.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import urllib 3 | import re 4 | website='http://www.heibanke.com/lesson/crawler_ex00/' 5 | 
html=urllib.urlopen(website).read() 6 | index=re.findall(r'输入数字([0-9]{5})',html) 7 | while index: 8 | url='http://www.heibanke.com/lesson/crawler_ex00/%s/' % index[0] 9 | print url 10 | tmp_re=urllib.urlopen(url) 11 | html=tmp_re.read() 12 | index=re.findall(r'数字是([0-9]{5})',html) 13 | print html 14 | #print html 15 | -------------------------------------------------------------------------------- /pccg2: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import urllib 3 | import re 4 | 5 | data={'username':'Tom'} 6 | url='http://www.heibanke.com/lesson/crawler_ex01/' 7 | 8 | for num in range(1,31): 9 | data['password']=num 10 | post_data=urllib.urlencode(data) 11 | print post_data 12 | response=urllib.urlopen(url,post_data) 13 | html=response.read() 14 | result=re.findall('密码错误',html) 15 | if not result: 16 | print html 17 | break -------------------------------------------------------------------------------- /stock_list/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/1.txt -------------------------------------------------------------------------------- /stock_list/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/2.txt -------------------------------------------------------------------------------- /stock_list/20200320.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/20200320.txt 
-------------------------------------------------------------------------------- /stock_list/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/3.txt -------------------------------------------------------------------------------- /stock_list/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/4.txt -------------------------------------------------------------------------------- /stock_list/5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/5.txt -------------------------------------------------------------------------------- /stock_list/scrawler_url.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/scrawler_url.txt -------------------------------------------------------------------------------- /stock_list/新能源电池20200320.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShaoZC/Financial-report-acquisition-and-data-processing-with-Python/c20775f8448aea88be49b2204682f9331c75a085/stock_list/新能源电池20200320.xlsx -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import urllib 3 | import re 4 | 5 | 
tmp_re=urllib.urlopen('http://www.heibanke.com/lesson/crawler_ex00/') 6 | html=tmp_re.read() 7 | index=re.findall(r'输入数字([0-9]{5})',html) 8 | 9 | while index: 10 | url='http://www.heibanke.com/lesson/crawler_ex00/%s/' % index[0] 11 | print url 12 | tmp_re=urllib.urlopen(url) 13 | html=tmp_re.read() 14 | index=re.findall(r'数字是([0-9]{5})',html) 15 | print html -------------------------------------------------------------------------------- /testqq.py: -------------------------------------------------------------------------------- 1 | print('hello world') 2 | import urllib2 3 | request=urllib2.Request(url="http://www.qq.com") 4 | result=urllib2.urlopen(request).read() 5 | print result -------------------------------------------------------------------------------- /tieba.py: -------------------------------------------------------------------------------- 1 | __author__ = 'CQC' 2 | 3 | 4 | import urllib 5 | import urllib2 6 | import re 7 | import tool 8 | import os 9 | 10 | 11 | class Spider: 12 | 13 | 14 | def __init__(self): 15 | self.siteURL = 'http://mm.taobao.com/json/request_top_list.htm' 16 | self.tool = tool.Tool() 17 | 18 | 19 | def getPage(self,pageIndex): 20 | url = self.siteURL + "?page=" + str(pageIndex) 21 | request = urllib2.Request(url) 22 | response = urllib2.urlopen(request) 23 | return response.read().decode('gbk') 24 | 25 | 26 | def getContents(self,pageIndex): 27 | page = self.getPage(pageIndex) 28 | pattern = re.compile('

(.*?)