├── README.md
├── data-mongod.jpg
└── tdx_mongodb_operation.py

/README.md:
--------------------------------------------------------------------------------
# Tongdaxin (通达信) Data Cleaning and MongoDB Database Operations


-------------------------------

## * Introduction

The basic steps are:

- First download the 1-minute and 5-minute futures data inside Tongdaxin: System → After-hours Data Download (系统 → 盘后数据下载).
- Then export txt or csv files from Tongdaxin to a directory of your choice: System → Data Export → Advanced Export (系统 → 数据导出 → 高级导出). Note: the export dialog lets you pick a field separator; prefer the comma. If you choose a different separator, change the character passed to the `split` call in `gen_data_from_txt` in the py file accordingly (a sketch of that parsing step follows this section).
- Start MongoDB, then run the tdx_mongodb_operation.py script.
  Note: this assumes that MongoDB and Studio 3T (a database GUI) are already installed and properly configured.
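
If the export uses a separator other than the comma, only that parsing step has to change. Below is a minimal sketch (not part of the script; `parse_row` is purely illustrative) of the per-line parsing that `gen_data_from_txt` performs:

```python
# Illustration of the row parsing inside gen_data_from_txt; the separator passed
# to split() must match the Tongdaxin export setting (the script hard-codes ',').
def parse_row(line, sep=','):
    row = line.strip().split(sep)
    # data rows start with a numeric date (e.g. 20180105); header/footer lines do not
    return row[:8] if row and row[0].isdigit() else None
```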

Development environment `Python 3 (3.6)`:

- pandas==0.20.0
- numpy==1.13.3+mkl
- pymongo==3.6.0

## * Data Cleaning and Database Operations (`tdx_mongodb_operation.py`)

- `conn_mongodb`: connects to the database and returns a collection.
- `output_symbol_list`: returns a list in which each element holds the absolute path of a txt/csv file under the data folder and the corresponding futures symbol.
- `gen_data_from_txt`: reads a txt/csv file and returns the bar data as a DataFrame.
- `transfrom`: converts the TradingDay in the data returned by `gen_data_from_txt` into the ActionDay, i.e. the actual calendar order. In Tongdaxin exports the night session is stamped with the next trading day; for example, a hypothetical night bar stamped Date=20180108, Time=2105 actually traded on the evening of 2018-01-05 and is relabelled to 20180105.
- `cut`: removes the redundant rows from the 5-minute data.
- `data_processing`: the preprocessing entry point; returns the final cleaned data.
- `extract_info`: pulls the distinct values of the given fields from the database, used for de-duplication.
- `insert_to_database`: when the database is created for the first time, the data is inserted directly; on later maintenance runs the new data is de-duplicated before insertion (the shape of the stored documents is sketched right after this list).
- `multi_thread_run`: single-threaded processing is rather slow, so several data files are imported concurrently, which is much faster. Parameters: `max_threads_num`, the maximum number of threads, and `file_path`, the folder containing the Tongdaxin txt/csv files.
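
Each bar ends up as one document in a collection named after its symbol. All values are stored as strings and `_id` is simply the DataFrame row index, which is what `extract_info` and the de-duplication in `insert_to_database` rely on. A hypothetical stored document (illustrative values only):

```python
# Shape of one stored document; every field is a string, _id is the row index.
{
    "_id": "10231",
    "Date": "20180105",        # ActionDay after transfrom
    "Time": "2105",
    "Open": "3650.0",
    "High": "3655.0",
    "Low": "3648.0",
    "Close": "3652.0",
    "Volumn": "1203",          # column name as spelled in the script
    "OpenInterest": "201540"
}
```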

## * Usage

- Set up the runtime environment and install MongoDB; installing Studio 3T, a MongoDB GUI, is also recommended.
- Start MongoDB, then run the script. You need to set the bar interval (1-minute or 5-minute), the database name, the maximum number of threads, and the path of the Tongdaxin txt/csv data folder.
- After the run, MongoDB looks like this (a small query to spot-check the result follows the screenshot):
![image](https://github.com/DemonDamon/tongdaxin-futures-data-clearing-database-operation/blob/master/data-mongod.jpg)
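
To spot-check the import without Studio 3T, a short pymongo query also works. A minimal sketch, assuming MongoDB on localhost:27017, the 5-minute database, and a hypothetical symbol collection called `RB`:

```python
# Verification sketch: list the stored trading days and print the latest bar.
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
coll = client["futures_5min_data"]["RB"]          # hypothetical symbol collection

dates = coll.distinct("Date")                     # Date values are stored as strings
print(len(dates), "trading days stored")
last_date = max(dates, key=int)
last_bar = max(coll.find({"Date": last_date}), key=lambda d: int(d["Time"]))
print(last_bar)
```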

--------------------------------------------------------------------------------
/data-mongod.jpg:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/DemonDamon/tongdaxin-futures-data-clearing-database-operation/2ffb0c0c078dbb464ab78f2e034199c3d585be67/data-mongod.jpg
--------------------------------------------------------------------------------
/tdx_mongodb_operation.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pandas as pd
from pymongo import MongoClient
from concurrent import futures


class tdx_mongodb_operation(object):
    def __init__(self, *arg, **kwarg):
        self._IP = arg[0]
        self._PORT = arg[1]
        self._dataFrame = kwarg["dataframe"]        # bar interval: 1 or 5 (minutes)
        self._databaseName = kwarg["database"]

    def conn_mongodb(self, collectionName):
        # connect to MongoDB and return the requested collection
        self._Conn = MongoClient(self._IP, self._PORT)
        self._mydb = self._Conn[self._databaseName]
        collection = self._mydb.get_collection(collectionName)
        return collection

    def output_symbol_list(self, file_path):
        # collect [absolute file path, symbol name] pairs for every exported file;
        # symbols are derived by stripping the trailing 'L8' + extension from the file name
        symbol_info_list = []
        for root, dirs, files in os.walk(file_path):
            for file in files:
                ext = os.path.splitext(file)[1]
                if ext in ('.txt', '.csv'):
                    symbol_info_list.append([os.path.join(root, file),
                                             file.replace('L8' + ext, '')])
        return symbol_info_list

    def gen_data_from_txt(self, symbol_path):
        # parse one Tongdaxin export file into a DataFrame
        Date = []; Time = []
        Open = []; High = []; Low = []; Close = []
        Volumn = []; OpenInterest = []
        for line in open(symbol_path):
            row = line.split(',')       # change ',' here if the export used another separator
            if row[0].isdigit():        # skip the header/footer lines of the export
                Date.append(row[0])
                Time.append(row[1])
                Open.append(row[2])
                High.append(row[3])
                Low.append(row[4])
                Close.append(row[5])
                Volumn.append(row[6])
                OpenInterest.append(row[7])
        raw_data = pd.DataFrame({'Date' : np.int64(Date),
                                 'Time' : np.int32(Time),
                                 'Open' : np.double(Open),
                                 'High' : np.double(High),
                                 'Low' : np.double(Low),
                                 'Close' : np.double(Close),
                                 'Volumn' : np.int64(Volumn),
                                 'OpenInterest' : np.int64(OpenInterest)},
                                columns=['Date','Time','Open',
                                         'High','Low','Close',
                                         'Volumn','OpenInterest'])
        return raw_data

    def transfrom(self, latest_raw_data):
        # rewrite the TradingDay of night-session bars (21:0x-23:5x) to the
        # ActionDay, i.e. the date of the day session that has just closed at 15:00
        if self._dataFrame == 5:
            print(' data frame: 5 mins ')
            i = 0
            while i < len(latest_raw_data):
                if latest_raw_data.iloc[i,1] == 1500:
                    k = i + 1
                    t = []
                    date = latest_raw_data.iloc[i,0]
                    while k < len(latest_raw_data) and latest_raw_data.iloc[k,1] != 905:
                        if latest_raw_data.iloc[k,1] >= 2105 and latest_raw_data.iloc[k,1] <= 2355:
                            t.append(k)
                        k += 1
                    if len(t) != 0:
                        for j in range(len(t)):
                            latest_raw_data.iloc[t[j],0] = date
                    i = k
                else:
                    i += 1
                if i == len(latest_raw_data) - 1:
                    latest_raw_data.iloc[i,0] = latest_raw_data.iloc[i-1,0]
            print(' have transformed trading-date to action-date. ')
        elif self._dataFrame == 1:
            print(' data frame: 1 mins ')
            i = 0
            while i < len(latest_raw_data):
                if latest_raw_data.iloc[i,1] == 1500:
                    k = i + 1
                    t = []
                    date = latest_raw_data.iloc[i,0]
                    while k < len(latest_raw_data) and latest_raw_data.iloc[k,1] != 901:
                        if latest_raw_data.iloc[k,1] >= 2101 and latest_raw_data.iloc[k,1] <= 2359:
                            t.append(k)
                        k += 1
                    if len(t) != 0:
                        for j in range(len(t)):
                            latest_raw_data.iloc[t[j],0] = date
                    i = k
                else:
                    i += 1
                if i == len(latest_raw_data) - 1:
                    latest_raw_data.iloc[i,0] = latest_raw_data.iloc[i-1,0]
            print(' have transformed trading-date to action-date. ')
        latest_transformed_data = latest_raw_data
        return latest_transformed_data

    def cut(self, latest_transformed_data):
        print(' data length: ' + str(len(latest_transformed_data)))
        t5 = []; t = 1; k = 1; u = 1
        while t == k:
            for i in range(len(latest_transformed_data)):
                if (i+1) % 5 == 0:
                    t5.append(latest_transformed_data.iloc[i,1])
                    if t5[-1] % 5 != 0:
                        # the 5-minute grid is broken here: compare the time steps of the
                        # last few rows and drop the row whose step is not one bar
                        diffs = latest_transformed_data.iloc[range(i,i-3,-1),1].values - \
                                latest_transformed_data.iloc[range(i-1,i-4,-1),1].values
                        idx = i - int(np.where(diffs != 1)[0][0])
                        latest_transformed_data.drop(idx, inplace=True)      # inplace=True modifies the frame directly
                        latest_transformed_data = latest_transformed_data.reset_index(drop=True)  # rebuild a 0-based index
                        print(' have cleared redundant data ' + str(u) + ', and data length is ' + \
                              str(len(latest_transformed_data)) + ' right now. ')
                        u += 1
                        break
                if i == len(latest_transformed_data) - 1:
                    k += 1
                    print(' redundant data not exist anymore. ')
        processed_data = latest_transformed_data
        return processed_data

    def data_processing(self, symbol_path):
        latest_raw_data = self.gen_data_from_txt(symbol_path)
        # 1. transform trading-day to action-day
        latest_transformed_data = self.transfrom(latest_raw_data)
        # 2. clear redundant data (only needed for the 5-minute bars)
        if self._dataFrame == 5:
            processed_data = self.cut(latest_transformed_data)
        else:
            processed_data = latest_transformed_data
        return processed_data

    def extract_info(self, tag_list, collectionName):
        # return the distinct values of each requested field, e.g. all stored 'Date'
        # values, which insert_to_database uses for de-duplication
        collection = self.conn_mongodb(collectionName)
        tag_data = [collection.distinct(tag) for tag in tag_list]
        return tag_data

    def insert_to_database(self, symbol_path, symbol):
        collection = self.conn_mongodb(symbol)
        latest_processed_data = self.data_processing(symbol_path)
        # 1. extract the dates already stored in the database
        date_distinct_list = self.extract_info(['Date'], symbol)
        if date_distinct_list[0] == []:
            # first run: the collection is empty, insert everything
            for i in range(len(latest_processed_data)):
                data = {'_id' : str(i),
                        'Date' : str(latest_processed_data.iloc[i,0]),
                        'Time' : str(latest_processed_data.iloc[i,1]),
                        'Open' : str(latest_processed_data.iloc[i,2]),
                        'High' : str(latest_processed_data.iloc[i,3]),
                        'Low' : str(latest_processed_data.iloc[i,4]),
                        'Close' : str(latest_processed_data.iloc[i,5]),
                        'Volumn' : str(latest_processed_data.iloc[i,6]),
                        'OpenInterest' : str(latest_processed_data.iloc[i,7])}
                collection.insert_one(data)
            print(' * finish inserting ' + symbol + ' data. ')
        else:
            # 2. drop duplicates and insert only the rows newer than the last stored bar
            date = np.int64(date_distinct_list[0])
            last_date = str(np.max(date))
            data = collection.find({'Date': last_date})     # cursor over the last stored day
            last_time = max([int(k["Time"]) for k in data])
            latest_processed_data_1 = latest_processed_data[latest_processed_data.Date == int(last_date)]
            start_insert_ind = latest_processed_data_1[latest_processed_data_1.Time == last_time].index[0]
            for i in range(len(latest_processed_data)):
                if i > start_insert_ind:
                    data = {'_id' : str(i),
                            'Date' : str(latest_processed_data.iloc[i,0]),
                            'Time' : str(latest_processed_data.iloc[i,1]),
                            'Open' : str(latest_processed_data.iloc[i,2]),
                            'High' : str(latest_processed_data.iloc[i,3]),
                            'Low' : str(latest_processed_data.iloc[i,4]),
                            'Close' : str(latest_processed_data.iloc[i,5]),
                            'Volumn' : str(latest_processed_data.iloc[i,6]),
                            'OpenInterest' : str(latest_processed_data.iloc[i,7])}
                    collection.insert_one(data)
            print(' * finish removing duplication and inserting latest ' + symbol + ' data. ')

    def multi_thread_run(self, max_threads_num, file_path):
        symbol_info_list = self.output_symbol_list(file_path)
        # symbol_info[0] is the symbol path; symbol_info[1] is the symbol name
        with futures.ThreadPoolExecutor(max_workers=max_threads_num) as executor:
            future_to_symbol = {executor.submit(self.insert_to_database, symbol_info[0], symbol_info[1]): \
                                ind for ind, symbol_info in enumerate(symbol_info_list)}


if __name__ == '__main__':
    Data_Info_Dist = {"dataframe": [1, 5],
                      "databasename": ["futures_1min_data", "futures_5min_data"],
                      "filepath": ["D:\\Quant_Python\\tongdaxin_data\\1min_txt",
                                   "D:\\Quant_Python\\tongdaxin_data\\5min_txt"]}
    for i in range(len(Data_Info_Dist["dataframe"])):
        tmo = tdx_mongodb_operation("localhost", 27017,
                                    dataframe=Data_Info_Dist["dataframe"][i],
                                    database=Data_Info_Dist["databasename"][i])
        tmo.multi_thread_run(max_threads_num=4, file_path=Data_Info_Dist["filepath"][i])
--------------------------------------------------------------------------------