├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── conda-recipe └── meta.yaml ├── datamine ├── __init__.py ├── io.py ├── loaders │ ├── 1qbit.py │ ├── __init__.py │ ├── base.py │ ├── block.py │ ├── cryptocurrency.py │ ├── eod.py │ ├── eris.py │ ├── fx.py │ ├── govpx.py │ ├── liqtool.py │ ├── orbitalinsight.py │ ├── rsmetrics.py │ ├── sofr.py │ ├── sofrsr.py │ ├── telluslabs.py │ ├── tick.py │ └── voi.py └── utils.py ├── docs └── CME Query API's - EOD_Block_Tick_BBO - Google Docs.pdf ├── examples ├── Load Datamine Data Locally Example.ipynb └── images │ ├── BitcoinEndofDayValue.png │ └── BitcoinRTIndexValue.png ├── setup.cfg └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | datamine/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | examples/data/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | .ipynb_checkpoints/ 61 | node_modules/ 62 | screenshots/ 63 | *.xunit.xml 64 | 65 | # npm 66 | package-lock.json 67 | 68 | data 69 | settings.json 70 | 71 | # datamine logs 72 | datamine.log 73 | 74 | # Spyder Project Files 75 | .spyproject 76 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: generic 2 | sudo: false 3 | 4 | branches: 5 | only: 6 | - master 7 | - "/^\\d+\\.\\d+.*$/" 8 | 9 | env: 10 | os: 11 | - linux 12 | 13 | install: 14 | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 15 | - bash miniconda.sh -b -p $HOME/miniconda 16 | - source $HOME/miniconda/bin/activate 17 | - conda config --set always_yes yes --set changeps1 no --set auto_update_conda no 18 | - conda install conda conda-build conda-verify anaconda-client 19 | - conda info -a 20 | - printenv | sort 21 | 22 | script: 23 | - conda build conda-recipe 24 | - conda build --test $HOME/miniconda/conda-bld/*/*.tar.bz2 25 | # We're not uploading these, but we can at least ensure that they build. 26 | - python setup.py sdist 27 | - python setup.py bdist_wheel 28 | 29 | deploy: 30 | - provider: script 31 | skip_cleanup: true 32 | on: 33 | branch: master 34 | tags: true 35 | script: 36 | - anaconda --verbose --token $ANACONDA_TOKEN upload --user CME_Group $HOME/miniconda/conda-bld/*/*.tar.bz2 37 | --force 38 | - provider: script 39 | skip_cleanup: true 40 | on: 41 | branch: master 42 | tags: false 43 | script: 44 | - anaconda --verbose --token $ANACONDA_TOKEN upload --user CME_Group --label 45 | main $HOME/miniconda/conda-bld/*/*.tar.bz2 --force 46 | - provider: pypi 47 | skip_cleanup: true 48 | on: 49 | branch: master 50 | tags: false 51 | username: "__token__" 52 | password: $PYPI_TOKEN 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, CME Group 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CME Datamine 2 | [![Build Status](https://travis-ci.org/CMEGroup/datamine_python.svg?branch=master)](https://travis-ci.org/CMEGroup/datamine_python) 3 | # Overview 4 | 5 | CME Datamine is offered via a self-service cloud solution, where you can access more than 6 | 450 terabytes of historical data almost instantaneously, using some of the most flexible 7 | data delivery methods available. Extensively back-test strategies using real benchmark 8 | markets that date back as far as the 1970s, to help you gauge profitability and risk. 9 | 10 | This python package will support your rapid analysis by supplying a basic framework for 11 | direct iteration with CME Datamine cloud system to accomplish the following tasks. 12 | 13 | 1. Load your data item catalog which you have subscribed 14 | 2. Download your data items to your local machines from the cloud 15 | 3. Specific data items automatically structured into a Pandas dataframe from 16 | your local copy. This includes correct typing and other generic routines to support 17 | your analysis needs. 18 | 4. Examples of working with this data in Pandas via a collection of Ipyhon Notebook files. 19 | 20 | # Installation 21 | 22 | ## Conda 23 | 24 | The easiest way to install this package is to do so in a 25 | Python environment created with [Anaconda](https://www.anaconda.com/distribution/) 26 | or its minimalist alternative [Miniconda](https://docs.conda.io/en/latest/miniconda.html). 27 | Once this environment is installed and activated, simply run this command: 28 | ``` 29 | conda install -c cme_group datamine 30 | ``` 31 | 32 | ## PyPi 33 | 34 | Installation from [PyPi](https://pypi.org/project/datamine/) 35 | ``` 36 | pip install datamine 37 | ``` 38 | 39 | ## From source 40 | 41 | To install from source, clone this repository and execute 42 | ``` 43 | pip install . 44 | ``` 45 | If you wish to install the package in writable mode for development, do 46 | ``` 47 | pip install -e . 48 | ``` 49 | 50 | # Example usage 51 | 52 | The following sections quickly outline some of the simple methods to access 53 | CME Datamine data. For interactive use, we recommend the use of a 54 | [Jupyter](https://jupyter.org) notebook or the 55 | [JupyterLab](https://jupyterlab.readthedocs.io/en/latest) platform. 56 | 57 | ## Load My Data Catalog Items 58 | 59 | ```buildoutcfg 60 | myDatamine = dm.DatamineCon(username='YOUR_CME_APP_ID', password='YOUR_CME_APP_PASSWORD', path='./data/') 61 | #Get My Datamine Data Catalog 62 | myDatamine.get_catalog(limit=1000) 63 | # Review one of the data catalog items as supplied in dict format. 64 | myDatamine.data_catalog.popitem() 65 | 66 | 67 | ``` 68 | 69 | ## Download Specific Data Products 70 | You can request specific data products. Current data products supported are as follows. 71 | When requesting your data, you must specify the _dataset_ tag or leave it blank will request 72 | all items in your catalog. 73 | 74 | ### CME Data Products 75 | 76 | | Data Set Name | Data Type | _dataset_ Tag | 77 | |--- |--- |---| 78 | | CME Time and Sales | Price | TICK | 79 | | CME Market Depth MBO | Price | MBO | 80 | | CME CF Crypto Currency | Index | CRYPTOCURRENCY | 81 | | BrokerTec Top of Book | Price | NEXBROKERTECTOB | 82 | | BrokerTec Depth of Book | Price | NEXBROKERTECDOB | 83 | | BrokerTec Full Book | Price | NEXBROKERTECFOB | 84 | | Eris PAI | Market Analytics | ERIS | 85 | | SSTL INT Settlements | Price | STL | 86 | 87 | 88 | 89 | ### Third Party Data 90 | 91 | | Data Set Name | Data Type | _dataset_ Tag | 92 | |--- |--- |---| 93 | | TellusLabs | Alternative - Ags | TELLUSLABS | 94 | | Orbital Insight | Alternative - Energy | ORBITALINSIGHT | 95 | | Bantix Technologies | Market Analytics - Options | BANTIX | 96 | | RS Metrics | Alternative - Metals | RSMETRICS | 97 | | 1 Qbit | Market Analytics | 1QBIT | 98 | 99 | 100 | A complete list of data products can be reviewed on [CME Datamine]([https://datamine.cmegroup.com/#t=p&p=cme.dataHome) 101 | 102 | 103 | Example request for specific Data Sets using the _dataset_ tag. 104 | ```buildoutcfg 105 | myDatamine.get_catalog(dataset='CRYPTOCURRENCY', limit=1000) 106 | myDatamine.get_catalog(dataset='TICK', limit=1000) 107 | myDatamine.get_catalog(dataset='TELLUSLABS', limit=1000) 108 | myDatamine.get_catalog(dataset='RSMETRICS', limit=1000) 109 | ``` 110 | 111 | ## Use Bitcoin Information in Analysis 112 | The following example can be found in the [Load Datamine Data Locally Example Notebook](https://github.com/CMEGroup/datamine_python/blob/master/examples/Load%20Datamine%20Data%20Locally%20Example.ipynb) 113 | ```buildoutcfg 114 | myDatamine.get_catalog(dataset='CRYPTOCURRENCY', limit=1000) 115 | myDatamine.crypto_load() 116 | 117 | #plot second interval index values for Bitcoin 118 | indexValue = myDatamine.crypto_DF.loc[myDatamine.crypto_DF['symbol'] =='BRTI','mdEntryPx'].plot(figsize=[15,5]); 119 | plt.title('Historical Bitcoin Intraday Reference Rate') 120 | plt.xlabel('Date') 121 | plt.ylabel('USD/BTC') 122 | plt.style.use('fivethirtyeight') 123 | plt.show() 124 | 125 | ``` 126 | ![Bitcoin RT Index Plot Example](https://github.com/CMEGroup/datamine_python/blob/master/examples/images/BitcoinRTIndexValue.png "Bitcoin Logo") 127 | 128 | 129 | ## Questions and Comments? 130 | Please use the Issues feature. 131 | 132 | 133 | ## Notice 134 | The information herein has been complied by CME Group for general informational and education purposes only and does not constitute trading advice or the solicitation of purchases or sale of futures, options, or swaps. The views in this video reflect solely those of the author and not necessarily those of CME Group or its affiliated institutions. All examples discussed are hypothetical situations, used for explanation purposes only, and should not be considered investment advice of the results of actual market experience. Although every attempt has been made to ensure the accuracy of the information herein, CME Group and its affiliates assume no responsibility for any errors or omissions. All data is sourced by CME Group unless otherwise stated. All matters pertaining to rules and specification herein are made subject to and are superseded by official CME, CBOT, NYMEX, and COMEX rules. Current rules should be consulted in all cases concerning contact specifications. 135 | 136 | CME Group, the Globe Logo, CME, Globex, E-Mini, CME Direct, CME Datamine and Chicago Mercantile Exchange are trademarks of Chicago Mercantile Exchange Inc. CBOT is a trademark of the Board of Trade of the City of Chicago, Inc. NYMEX is a trademark of New York Mercantile Exchange, Inc. COMEX is a trademark of Commodity Exchange, Inc. All other trademarks are the property of their respective owners. 137 | -------------------------------------------------------------------------------- /conda-recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data() %} 2 | 3 | package: 4 | name: datamine 5 | version: {{ data.get('version') }} 6 | 7 | source: 8 | path: ../ 9 | 10 | build: 11 | number: 0 12 | noarch: python 13 | script: 14 | - {{ PYTHON }} -m pip install . --no-deps --ignore-installed --no-cache-dir 15 | 16 | requirements: 17 | host: 18 | - python {{ python }} 19 | - pip 20 | run: 21 | - python 22 | - requests 23 | - urllib3 24 | - pandas 25 | - tqdm 26 | - futures-compat 27 | 28 | test: 29 | imports: 30 | - datamine 31 | - datamine.io 32 | - datamine.utils 33 | - datamine.loaders 34 | 35 | about: 36 | home: https://github.com/CMEGroup/datamine_python 37 | license: BSD 3-Clause 38 | license_file: LICENSE 39 | summary: 'Python connector for the CME DataMine service.' 40 | -------------------------------------------------------------------------------- /datamine/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /datamine/io.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Python client for CME Group Datamine 3 | 4 | https://datamine.cmegroup.com 5 | 6 | .. moduleauthor:: Aaron Walters 7 | 8 | """ 9 | 10 | import requests 11 | import urllib3 12 | import cgi 13 | import os 14 | import sys 15 | from datetime import datetime 16 | import logging 17 | 18 | # Generate logger 19 | logging.basicConfig(filename='datamine.log', filemode='w', format='%(levelname)s - %(asctime)s - %(message)s', level=logging.ERROR) 20 | 21 | from .utils import tqdm_execute_tasks, MAX_WORKERS, logger 22 | from .loaders import Loader 23 | 24 | DEFAULT_URL = 'https://datamine.cmegroup.com/cme/api/v1' 25 | NO_LIMIT = sys.maxsize 26 | TIMEOUTS = (3.05, 60) 27 | PAGE_SIZE = 1000 28 | CHUNK_SIZE = 1024 29 | 30 | 31 | def _url_params(url): 32 | parts = url.split('?', 1) 33 | if len(parts) == 1: 34 | return parts[0], None 35 | return parts[0], dict(map(lambda x: x.split('=', 1), parts[1].split('&'))) 36 | 37 | class RequestError(RuntimeError): 38 | pass 39 | 40 | 41 | class DatamineCon(object): 42 | """ 43 | This class operates with CME Datamine to retrieve your data catalog, 44 | download specific data onto a specified path, load the data from 45 | that path, and finally structure data 46 | 47 | Example usage:: 48 | 49 | import datamine_io 50 | 51 | datamine = datamine_io.Datamine(user='CHANGE_ME', passwd='CHANGE_ME') 52 | datamine.getCatalog() 53 | datamine.loadCrypto() #for crypto data sets 54 | 55 | datamine.debug = True # turn on debug logging 56 | """ 57 | 58 | debug = False 59 | 60 | def __init__(self, path='./', username=None, password=None, 61 | url=DEFAULT_URL, threads=MAX_WORKERS): 62 | """creates the variables associated with the class 63 | 64 | :type path: string 65 | :param path: The URL path where you would files save on your local environment 66 | 67 | :type user: string 68 | :param user: CME Group Login User Name. See https://www.cmegroup.com/market-data/datamine-api.html 69 | 70 | :type password: string 71 | :param password: CME Group APP Password for Datamine Services 72 | 73 | :type url: string 74 | :param url: The primary URL for CME Datamine API. 75 | 76 | :type url: int 77 | :param url: The number of threads for downloading files. 78 | """ 79 | self.url = url 80 | 81 | # Leverage basic request/urllib3 functionality as much as possible: 82 | # Persistent sessions, connection pooling, retry management 83 | self.session = requests.Session() 84 | self.session.auth = requests.auth.HTTPBasicAuth(username, password) 85 | retry = urllib3.util.Retry(read=3, backoff_factor=2, status_forcelist=[400]) 86 | adapter = requests.adapters.HTTPAdapter(max_retries=retry) 87 | self.session.mount('', adapter) 88 | 89 | self.path = path 90 | self.data_catalog = {} 91 | self._dataset = None 92 | self._limit = -1 93 | self.threads = threads 94 | 95 | def _call_api(self, endpoint, params, stream=False): 96 | url = self.url + '/' + endpoint 97 | param_str = '&'.join('{}={}'.format(*p) for p in params.items()) 98 | logger.debug('_call_api: {}'.format(param_str)) 99 | return self.session.get(url, timeout=TIMEOUTS, params=params, stream=stream) 100 | 101 | def download_file(self, fid): 102 | """Download a single file denoted by the given FID. 103 | 104 | :type fid: string 105 | :param fid: The FID of the file to be retrieved. 106 | """ 107 | 108 | if fid not in self.data_catalog: 109 | raise RequestError('FID not found in the catalog: {}'.format(fid)) 110 | record = self.data_catalog[fid] 111 | supplied_url, params = _url_params(record['url']) 112 | assert supplied_url == self.url + '/download' 113 | response = self._call_api('download', params, stream=True) 114 | try: 115 | # The filename is embedded in the Content-Disposition header 116 | header = response.headers.get('content-disposition', '') 117 | try: 118 | filename = cgi.parse_header(header)[1]['filename'] 119 | except Exception: 120 | filename = 'error.txt' 121 | print ('''File Handling Area, looking for Content-Disposition Header and Lacks a 'header'...''') 122 | print('Expected a "filename" entry in the Content-Disposition header found:\n {}'.format(header)) 123 | print('See log file for further detail.') 124 | logging.error(str(record['dataset']) + ' ' + str(supplied_url) + ' ' + ' ' + str(params) + ' ' + ('Expected a "filename" entry in the Content-Disposition header found:\n {}'.format(header))) 125 | pass 126 | 127 | dest_path = os.path.join(self.path, record['dataset']) 128 | if not os.path.exists(dest_path): 129 | try: 130 | os.makedirs(dest_path) 131 | except: 132 | pass 133 | abs_path = os.path.join(dest_path, os.path.basename(filename)) 134 | with open(abs_path, 'wb') as target: 135 | try: 136 | for chunk in response.iter_content(chunk_size=CHUNK_SIZE): 137 | if chunk: 138 | target.write(chunk) 139 | target.flush() 140 | except: 141 | pass 142 | finally: 143 | # It would be more convenient to use the context manager idiom, 144 | # but avoiding it allows us to support older versions of requests. 145 | response.close() 146 | 147 | def download_data(self, dataset=None): 148 | """Download the entire catalog or a specific dataset to the local directory. 149 | 150 | :type dataset: string, or None 151 | :param dataset: The specific CME Datamine dataset name as retreived from catalog. 152 | If None, the entire catalog is downloaded. 153 | """ 154 | 155 | fids = [fid for fid, record in self.data_catalog.items() 156 | if dataset is None or record['dataset'] == dataset] 157 | description = 'downloading {} data'.format(dataset if dataset else 'all datasets') 158 | tqdm_execute_tasks(self.download_file, fids, description, self.threads, mode='thread') 159 | 160 | def get_catalog(self, dataset=None, limit=None, refresh=False): 161 | """Get the list of data files avaliable to you 162 | This may take time depending upon how many items are currenty 163 | have available to your login. Items are retrieved in groups of 1000 164 | per the standard call support. 165 | 166 | Parameters 167 | ---------- 168 | :type dataset: string 169 | :param dataset: The specific dataset items that you would like to retrieve. 170 | 171 | :type limit: integer 172 | :param limit: Limits the amount of catalog items you would like to retrieve. 173 | 174 | :type refresh: bool 175 | :param refresh: Set to True if you want to force a refresh of the local copy. 176 | 177 | Creates 178 | ------- 179 | :creates: python.dictionary self.data_catalog -- containing custom data catalog available. 180 | 181 | Returns 182 | ------- 183 | Returns None -- dictionary of the data catalog from Datamine 184 | """ 185 | logger.info('get_catalog: retrieving {}, limit {}'.format(dataset if dataset else 'all datasets', limit)) 186 | 187 | # No need to download more data if: 188 | # -- if the dataset matches, and the new limit is smaller 189 | # -- if the previous dataset was None, and there was no limit 190 | if limit is None: 191 | limit = NO_LIMIT 192 | elif not isinstance(limit, int) or limit < 0: 193 | raise RequestError('Invalid limit value: {!r}'.format(limit)) 194 | is_valid = (self._dataset == dataset and limit <= self._limit or 195 | self._dataset is None and self._limit == NO_LIMIT) 196 | 197 | if refresh or not is_valid: 198 | if self._limit >= 0: 199 | reason = 'by request' if refresh else 'for new parameters' 200 | logger.debug('get_catalog: refreshing {}'.format(reason)) 201 | self.data_catalog = {} 202 | self._dataset = None 203 | self._limit = 0 204 | is_valid = False 205 | 206 | if is_valid: 207 | logger.info('get_catalog: requested data already downloaded') 208 | return 209 | 210 | params = {} 211 | duplicates = 0 212 | nrecs = len(self.data_catalog) 213 | if dataset: 214 | params['dataset'] = dataset 215 | while True: 216 | params['limit'] = min(PAGE_SIZE, limit - nrecs) 217 | if params['limit'] <= 0: 218 | logger.warning('get_catalog: {}-record limit reached'.format(limit)) 219 | break 220 | 221 | resp = self._call_api('list', params) 222 | if resp.text == '"Could not initiate UNO connection"': 223 | raise RequestError('Invalid username/password combination.') 224 | try: 225 | response = resp.json() 226 | if response is None: 227 | logger.warning('get_catalog: empty record obtained, assuming end of data reached') 228 | limit = NO_LIMIT 229 | break 230 | files = response['files'] 231 | next_url = response['paging']['next'] 232 | except (ValueError, TypeError): 233 | raise RequestError('Invalid JSON data:\n URL: {}\n Text: {}\n'.format(resp.url, resp.text)) 234 | 235 | self.data_catalog.update((item['fid'], item) for item in files) 236 | orecs, nrecs = nrecs, len(self.data_catalog) 237 | duplicates += orecs + len(files) - nrecs 238 | 239 | if not next_url: 240 | logger.debug('get_catalog: end of data raeached') 241 | limit = NO_LIMIT 242 | break 243 | _, params = _url_params(next_url) 244 | 245 | logger.info('get_catalog: {} records downloaded, {} duplicates, {} saved'.format(nrecs + duplicates, duplicates, nrecs)) 246 | self._limit = max(limit, len(self.data_catalog)) 247 | self._dataset = dataset 248 | 249 | def load_dataset(self, dataset, download=True, limit=None, dataset_args = {}): 250 | """Load a dataset, optionally downloading files listed in the catalog. 251 | Parameters 252 | ---------- 253 | :param download: Attempt to download any data avaliable before loading data from local disk. 254 | :type download: bool 255 | 256 | :param limit: Limit the number of files loaded to the given number. 257 | :type limit: integer, or None 258 | 259 | Returns 260 | ------- 261 | :returns: pandas.DataFrame 262 | """ 263 | 264 | if download: 265 | self.download_data(dataset) 266 | 267 | path = os.path.join(self.path, dataset) 268 | return Loader.by_name(dataset, dataset_args).load(path, limit=limit) 269 | 270 | ''' 271 | Script consists of "load" and "download" functions. 272 | "download" functions only download files into local directory 273 | "load" functions download files into local directory, and then read + structure into a pandas DataFrame 274 | 275 | Design pattern for _download family 276 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 277 | Parameters 278 | ---------- 279 | :param download: Attempt to download any data available before loading data from local disk. 280 | :type download: bool. 281 | 282 | Creates 283 | ------- 284 | :creates: None 285 | 286 | Returns 287 | ------- 288 | :returns: None 289 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 290 | 291 | Design pattern for _load family 292 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 293 | Parameters 294 | ---------- 295 | :param download: Attempt to download any data avaliable before loading data from local disk. 296 | :type download: bool. 297 | 298 | Creates 299 | ------- 300 | :creates: pandas.DataFrame object.datasetname_DF 301 | 302 | Returns 303 | ------- 304 | :returns: None 305 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 306 | ''' 307 | 308 | def block_load(self, download=True): 309 | """ 310 | Data Set - Block Trades 311 | File Path - /BLOCK 312 | Function Type - Download & Load 313 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Block+Trades 314 | """ 315 | self.block_DF = self.load_dataset('BLOCK') 316 | 317 | 318 | def brokertech_tob_download(self, download=True): 319 | """ 320 | Data Set - Nex BrokerTech Top of Book Data Sets 321 | File Path - /NEXBROKERTECTOB 322 | Function Type - Download Only 323 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/NEX+-+BrokerTec+Historical+Data 324 | """ 325 | if download: 326 | self.download_data('NEXBROKERTECTOB') 327 | 328 | def brokertech_dob_download(self, download=True): 329 | """ 330 | Data Set - Nex BrokerTech Depth of Book Data Sets 331 | File Path - /NEXBROKERTECDOB 332 | Function Type - Download Only 333 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/NEX+-+BrokerTec+Historical+Data 334 | """ 335 | if download: 336 | self.download_data('NEXBROKERTECDOB') 337 | 338 | def brokertech_fob_download(self, download=True): 339 | """ 340 | Data Set - Nex BrokerTech Full Book Data Sets 341 | File Path - /NEXBROKERTECFOB 342 | Function Type - Download Only 343 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/NEX+-+BrokerTec+Historical+Data 344 | """ 345 | if download: 346 | self.download_data('NEXBROKERTECFOB') 347 | 348 | def crypto_load(self, download=True): 349 | """ 350 | Data Set - Crypto Data, Bitcoin & Etherium 351 | File Path - /cryptocurrency 352 | Function Type - Download & Load 353 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Liquidity+Tool+Dataset 354 | """ 355 | self.crypto_DF = self.load_dataset('CRYPTOCURRENCY') 356 | 357 | def eod_load(self, download=True): 358 | """ 359 | Data Set - End of Day Complete 360 | File Path - /EOD 361 | Function Type - Download & Load 362 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/End+of+Day 363 | """ 364 | self.eod_DF = self.load_dataset('EOD', download=download) 365 | 366 | def voi_load(self, download=True): 367 | """ 368 | Data Set - Volume and Open Interest 369 | File Path - /VOI 370 | Function Type - Download & Load 371 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Volume+and+Open+Interest 372 | """ 373 | self.voi_DF = self.load_dataset('VOI', download=download) 374 | 375 | def eris_load(self, download=True): 376 | """ 377 | Data Set - Eris PAI 378 | File Path - /ERIS 379 | Function Type - Download & Load 380 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Eris+PAI+Dataset 381 | """ 382 | self.eris_DF = self.load_dataset('ERIS') 383 | 384 | def fx_load(self, download=True): 385 | """ 386 | Data Set - FX Premium 387 | File Path - /FX 388 | Function Type - Download & Load 389 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Premium+FX+Feed+Historical+Data 390 | Warning -- Files are large when uncompressed 391 | """ 392 | self.fx_DF = self.load_dataset('FX', download=download) 393 | 394 | # def govpx_load(self, download=True, dataset_args = {}): 395 | # """ 396 | # Data Set - GovPX 397 | # File Path - /GOVPX 398 | # Function Type - Download & Load 399 | # Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/GovPX+Historical+Data# 400 | # """ 401 | # self.govpx_DF = self.load_dataset(dataset = 'GOVPX', dataset_args = dataset_args, download=download) 402 | 403 | def govpx_download(self, download=True): 404 | """ 405 | Data Set - GovPX 406 | File Path - /GOVPX 407 | Function Type - Download Only 408 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/GovPX+Historical+Data 409 | """ 410 | if download: 411 | self.download_data('GOVPX') 412 | 413 | def govpxeod_download(self, download=True): 414 | """ 415 | Data Set - GovPX End of Day 416 | File Path - /GOVPXEOD 417 | Function Type - Download Only 418 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/GovPX+End+of+Day+Historical+Data 419 | """ 420 | if download: 421 | self.download_data('GOVPXEOD') 422 | 423 | def STL_download(self, download=True): 424 | """ 425 | Data Set - STL INT Settlements 426 | File Path - /STL 427 | Function Type - Download Only 428 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/STL+INT+Settlements 429 | """ 430 | if download: 431 | self.download_data('STL') 432 | 433 | def liqtool_load(self, download=True): 434 | """ 435 | Data Set - Liquidity Tool 436 | File Path - /LIQTOOL 437 | Function Type - Download & Load 438 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Liquidity+Tool+Dataset 439 | """ 440 | self.liqtool_DF = self.load_dataset('LIQTOOL') 441 | 442 | def MD_download(self, download=True): 443 | """ 444 | Data Set - Market Depth FIX 445 | File Path - /MD 446 | Function Type - Download Only 447 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Market+Depth 448 | """ 449 | if download: 450 | self.download_data('MD') 451 | 452 | def RLC_download(self, download=True): 453 | """ 454 | Data Set - Market Depth RLC 455 | File Path - /RLC 456 | Function Type - Download Only 457 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Market+Depth 458 | """ 459 | if download: 460 | self.download_data('RLC') 461 | 462 | def RLCSECDEF_download(self, download=True): 463 | """ 464 | Data Set - SECDEF RLC 465 | File Path - /RLCSECDEF 466 | Function Type - Download Only 467 | Help URL - 468 | """ 469 | if download: 470 | self.download_data('RLCSECDEF') 471 | 472 | def MBO_download(self, download=True): 473 | """ 474 | Data Set - MBO FIX 475 | File Path - /MBO 476 | Function Type - Download Only 477 | Help URL - https://wiki.chicago.cme.com/confluence/display/EPICSANDBOX/MBO+FIX 478 | """ 479 | if download: 480 | self.download_data('MBO') 481 | 482 | def PCAP_download(self, download=True): 483 | """ 484 | Data Set - Packet Capture (PCAP) 485 | File Path - /PCAP 486 | Function Type - Download Only 487 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Packet+Capture+Dataset 488 | """ 489 | if download: 490 | self.download_data('PCAP') 491 | 492 | def sofrois_load(self, download=True): 493 | """ 494 | Data Set - SOFR OIS Index 495 | File Path - /SOFR 496 | Function Type - Download & Load 497 | Help URL - https://www.cmegroup.com/market-data/faq-sofr-third-party-data.html 498 | """ 499 | self.sofrois_DF = self.load_dataset('SOFR', download=download) 500 | 501 | def sofrstriprates_load(self, orient='long', download=True): 502 | """ 503 | Data Set - SOFR Strip Rates 504 | File Path - /SOFRSR 505 | Function Type - Download & Load 506 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/SOFR+Strip+Rates 507 | """ 508 | if orient == 'wide': 509 | 510 | self.sofrstriprates_DF = self.load_dataset('SOFRSR', 511 | download=download).pivot(index='businessDate', 512 | columns='Description', values='rate').sort_values('businessDate', ascending=False).reset_index() 513 | 514 | self.sofrstriprates_DF['businessDate'] = self.sofrstriprates_DF['businessDate'].dt.date 515 | self.sofrstriprates_DF.set_index('businessDate', inplace=True) 516 | elif orient == 'long': 517 | self.sofrstriprates_DF = self.load_dataset('SOFRSR', download=download) 518 | else: 519 | print("Incorrect orientation parameter. Defaulting to long.") 520 | self.sofrstriprates_DF = self.load_dataset('SOFRSR', download=download) 521 | 522 | def SECDEF_download(self, download=True): 523 | """ 524 | Data Set - Securities Definition (SECDEF) 525 | File Path - /SECDEF 526 | Function Type - Download Only 527 | Help URL - Not Applicable 528 | """ 529 | if download: 530 | self.download_data('SECDEF') 531 | 532 | def time_sales_load(self, download=True): 533 | """ 534 | Data Set - Time and Sales (TICK) 535 | File Path - /TICK 536 | Function Type - Download & Load 537 | Help URL - 538 | """ 539 | self.time_sales_DF = self.load_dataset('TICK') 540 | 541 | def BBO_download(self, download=True): 542 | """ 543 | Data Set - Top-of-Book (BBO) 544 | File Path - /BBO 545 | Function Type - Download Only 546 | Help URL - https://www.cmegroup.com/confluence/display/EPICSANDBOX/Top+of+Book+-+BBO 547 | """ 548 | if download: 549 | self.download_data('BBO') 550 | 551 | def bantix_download(self, download=True): 552 | """ 553 | Data Set - bantix 554 | File Path - /BANTIX 555 | Function Type - Download Only 556 | Help URL - https://www.cmegroup.com/market-data/quikstrike-via-bantix-technologies.html 557 | """ 558 | if download: 559 | self.download_data('BANTIX') 560 | 561 | def JSE_download(self, download=True): 562 | """ 563 | Data Set - Johannesburg Stock Exchange 564 | File Path - /JSE 565 | Function Type - Download Only 566 | Help URL - 567 | """ 568 | if download: 569 | self.download_data('JSE') 570 | 571 | def orbital_insights_load(self, download=True): 572 | """ 573 | Data Set - Orbital Insights (https://orbitalinsight.com/) 574 | File Path - /ORBITALINSIGHT 575 | Function Type - Download & Load 576 | Help URL - https://www.cmegroup.com/market-data/orbital-insight/faq.html 577 | """ 578 | self.orbital_insights_DF = self.load_dataset('ORBITALINSIGHT') 579 | 580 | def rsmetrics_load(self, download=True): 581 | """ 582 | Data Set - RS Metrics 583 | File Path - /RSMETRICS 584 | Function Type - Download & Load 585 | Help URL - https://www.cmegroup.com/market-data/rs-metrics/faq-rs-metrics.html 586 | """ 587 | self.rsmetrics_DF = self.load_dataset('RSMETRICS') 588 | 589 | def tellus_labs_load(self, download=True): 590 | """ 591 | Data Set - Tellus Labs (https://telluslabs.com) 592 | File Path - /TELLUSLABS 593 | Function Type - Download & Load 594 | Help URL - https://www.cmegroup.com/education/articles-and-reports/telluslabs-faq.html 595 | """ 596 | self.tellus_labs_DF = self.load_dataset('TELLUSLABS') 597 | 598 | def oneqbit_load(self, download=True): 599 | """ 600 | Data Set - 1Qbit 601 | File Path - /1QBIT 602 | Function Type - Download & Load 603 | Help URL - https://www.cmegroup.com/market-data/faq-1qbit.html 604 | """ 605 | self.oneqbit_DF = self.load_dataset('1QBIT') 606 | -------------------------------------------------------------------------------- /datamine/loaders/1qbit.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | import pandas as pd 3 | 4 | class OneQBitLoader(Loader): 5 | dataset = '1QBIT' 6 | fileglob = '1QBit_*.csv' 7 | 8 | columns = ['TRADEDATE', 'DATA_SOURCE', 'EODDESC', 'CHART_TITLE', 9 | 'YYYY', 'MM', 'DD', 'DATECODE_EXCEL', 'DATE_LABEL', 'F_PROD_CODE', 'O_PROD_CODE', 10 | 'PRICE_SETTLE_ACTIVE', 'PRICE_HIGH_ACTIVE', 'PRICE_LOW_ACTIVE', 'YYYY_ACTIVE', 'MM_ACTIVE', 'F_VOLUME_ACTIVE', 11 | 'PRICE_SETTLE_NEXT', 'PRICE_HIGH_NEXT', 'PRICE_LOW_NEXT', 'YYYY_NEXT', 'MM_NEXT', 'F_VOLUME_NEXT', 12 | 'F_VOLUME', 'IMPLIED_VOL', 'PUT_VOLUME', 'CALL_VOLUME', 'OPTIONS_VOLUME', 'PUT_OI', 'CALL_OI', 'O_OI', 13 | 'CURRENT_PRICE_MOST_ACTIVE', 'PREVIOUS_PRICE_MOST_ACTIVE', 'PRICE_PCT_CHG', 'EXCESS_RETURN_INDEX', 14 | 'IMPLIED_VOL_ST', 'IMPLIED_VOL_LT', 'DAILY_VARIANCE', 'HISTORICAL_STD_ST', 'HISTORICAL_STD_LT', 'RATIO_STD_ST_LT', 15 | 'RATIO_STD_ST_TO_IMPLIED_VOL_CURRENT', 'RATIO_HIGH_LOW_PCT', 'HIGH_LOW_PCT_ST', 'HIGH_LOW_PCT_LT', 16 | 'RATIO_HIGH_LOW_ST_LT', 'PUT_VOLUME_ST', 'PUT_VOLUME_LT', 'RATIO_PUT_VOLUME_ST_LT', 'CALL_VOLUME_ST', 17 | 'CALL_VOLUME_LT', 'RATIO_CALL_VOLUME_ST_LT', 'RATIO_PUT_CALL_VOLUME_ST', 'RATIO_PUT_CALL_VOLUME_LT', 18 | 'PCT_DIF_PUT_CALL_ST_LT_RATIO', 'MOMENTUM_ST', 'MOMENTUM_LT', 'RATIO_MOMENTUM_ST_LT', 'RATIO_MOMENTUM_TO_STD_ST', 19 | 'RATIO_MOMENTUM_TO_STD_LT', 'PRICE_20D_MA', 'PRICE_60D_MA', 'PRICE_200D_MA', 'PCT_DIF_CURRENT_200D_PRICE', 20 | 'PCT_DIF_20D_200D_PRICE', 'PEAK_PRICE', 'PEAK_200D_PRICE', '20PCT_BELOW_PEAK_200D', '20PCT_ABOVE_60DMA', 21 | '20PCT_BELOW_60DMA', 'MIX_PROB_20PCT_ABOVE_60DMA', 22 | 'MIX_PROB_20PCT_BELOW_60DMA', 'MIX_MEAN', 'MIX_MEDIAN', 'MIX_MODE_1', 'MIX_MODE_2', 23 | 'MIX_STD', 'MIX_STD_LT', 'MIX_SKEW', 'MIX_KURTOSIS', 'MIX_STATE', 'MIX_COMPLACENT', 'MIX_BALANCED', 'MIX_ANXIOUS', 24 | 'MIX_CONFLICTED', 'MIX_MODALITY', 'MIX_DISTANCE', 'MIX_INTENSITY', 'MIX_LOW_BIN', 'MIX_BIN_SIZE', 'MIX_BINS', 25 | 'MIX_BIN_NEG_100', 'MIX_BIN_NEG_99', 'MIX_BIN_NEG_98', 'MIX_BIN_NEG_97', 'MIX_BIN_NEG_96', 'MIX_BIN_NEG_95', 26 | 'MIX_BIN_NEG_94', 'MIX_BIN_NEG_93', 'MIX_BIN_NEG_92', 'MIX_BIN_NEG_91', 'MIX_BIN_NEG_90', 'MIX_BIN_NEG_89', 27 | 'MIX_BIN_NEG_88', 'MIX_BIN_NEG_87', 'MIX_BIN_NEG_86', 'MIX_BIN_NEG_85', 'MIX_BIN_NEG_84', 'MIX_BIN_NEG_83', 28 | 'MIX_BIN_NEG_82', 'MIX_BIN_NEG_81', 'MIX_BIN_NEG_80', 'MIX_BIN_NEG_79', 'MIX_BIN_NEG_78', 'MIX_BIN_NEG_77', 29 | 'MIX_BIN_NEG_76', 'MIX_BIN_NEG_75', 'MIX_BIN_NEG_74', 'MIX_BIN_NEG_73', 'MIX_BIN_NEG_72', 'MIX_BIN_NEG_71', 30 | 'MIX_BIN_NEG_70', 'MIX_BIN_NEG_69', 'MIX_BIN_NEG_68', 'MIX_BIN_NEG_67', 'MIX_BIN_NEG_66', 'MIX_BIN_NEG_65', 31 | 'MIX_BIN_NEG_64', 'MIX_BIN_NEG_63', 'MIX_BIN_NEG_62', 'MIX_BIN_NEG_61', 'MIX_BIN_NEG_60', 'MIX_BIN_NEG_59', 32 | 'MIX_BIN_NEG_58', 'MIX_BIN_NEG_57', 'MIX_BIN_NEG_56', 'MIX_BIN_NEG_55', 'MIX_BIN_NEG_54', 'MIX_BIN_NEG_53', 33 | 'MIX_BIN_NEG_52', 'MIX_BIN_NEG_51', 'MIX_BIN_NEG_50', 'MIX_BIN_NEG_49', 'MIX_BIN_NEG_48', 'MIX_BIN_NEG_47', 34 | 'MIX_BIN_NEG_46', 'MIX_BIN_NEG_45', 'MIX_BIN_NEG_44', 'MIX_BIN_NEG_43', 'MIX_BIN_NEG_42', 'MIX_BIN_NEG_41', 35 | 'MIX_BIN_NEG_40', 'MIX_BIN_NEG_39', 'MIX_BIN_NEG_38', 'MIX_BIN_NEG_37', 'MIX_BIN_NEG_36', 'MIX_BIN_NEG_35', 36 | 'MIX_BIN_NEG_34', 'MIX_BIN_NEG_33', 'MIX_BIN_NEG_32', 'MIX_BIN_NEG_31', 'MIX_BIN_NEG_30', 'MIX_BIN_NEG_29', 37 | 'MIX_BIN_NEG_28', 'MIX_BIN_NEG_27', 'MIX_BIN_NEG_26', 'MIX_BIN_NEG_25', 'MIX_BIN_NEG_24', 'MIX_BIN_NEG_23', 38 | 'MIX_BIN_NEG_22', 'MIX_BIN_NEG_21', 'MIX_BIN_NEG_20', 'MIX_BIN_NEG_19', 'MIX_BIN_NEG_18', 'MIX_BIN_NEG_17', 39 | 'MIX_BIN_NEG_16', 'MIX_BIN_NEG_15', 'MIX_BIN_NEG_14', 'MIX_BIN_NEG_13', 'MIX_BIN_NEG_12', 'MIX_BIN_NEG_11', 40 | 'MIX_BIN_NEG_10', 'MIX_BIN_NEG_09', 'MIX_BIN_NEG_08', 'MIX_BIN_NEG_07', 'MIX_BIN_NEG_06', 'MIX_BIN_NEG_05', 41 | 'MIX_BIN_NEG_04', 'MIX_BIN_NEG_03', 'MIX_BIN_NEG_02', 'MIX_BIN_NEG_01', 'MIX_BIN_POS_00', 'MIX_BIN_POS_01', 42 | 'MIX_BIN_POS_02', 'MIX_BIN_POS_03', 'MIX_BIN_POS_04', 'MIX_BIN_POS_05', 'MIX_BIN_POS_06', 'MIX_BIN_POS_07', 43 | 'MIX_BIN_POS_08', 'MIX_BIN_POS_09', 'MIX_BIN_POS_10', 'MIX_BIN_POS_11', 'MIX_BIN_POS_12', 'MIX_BIN_POS_13', 44 | 'MIX_BIN_POS_14', 'MIX_BIN_POS_15', 'MIX_BIN_POS_16', 'MIX_BIN_POS_17', 'MIX_BIN_POS_18', 'MIX_BIN_POS_19', 45 | 'MIX_BIN_POS_20', 'MIX_BIN_POS_21', 'MIX_BIN_POS_22', 'MIX_BIN_POS_23', 'MIX_BIN_POS_24', 'MIX_BIN_POS_25', 46 | 'MIX_BIN_POS_26', 'MIX_BIN_POS_27', 'MIX_BIN_POS_28', 'MIX_BIN_POS_29', 'MIX_BIN_POS_30', 'MIX_BIN_POS_31', 47 | 'MIX_BIN_POS_32', 'MIX_BIN_POS_33', 'MIX_BIN_POS_34', 'MIX_BIN_POS_35', 'MIX_BIN_POS_36', 'MIX_BIN_POS_37', 48 | 'MIX_BIN_POS_38', 'MIX_BIN_POS_39', 'MIX_BIN_POS_40', 'MIX_BIN_POS_41', 'MIX_BIN_POS_42', 'MIX_BIN_POS_43', 49 | 'MIX_BIN_POS_44', 'MIX_BIN_POS_45', 'MIX_BIN_POS_46', 'MIX_BIN_POS_47', 'MIX_BIN_POS_48', 'MIX_BIN_POS_49', 50 | 'MIX_BIN_POS_50', 'MIX_BIN_POS_51', 'MIX_BIN_POS_52', 'MIX_BIN_POS_53', 'MIX_BIN_POS_54', 'MIX_BIN_POS_55', 51 | 'MIX_BIN_POS_56', 'MIX_BIN_POS_57', 'MIX_BIN_POS_58', 'MIX_BIN_POS_59', 'MIX_BIN_POS_60', 'MIX_BIN_POS_61', 52 | 'MIX_BIN_POS_62', 'MIX_BIN_POS_63', 'MIX_BIN_POS_64', 'MIX_BIN_POS_65', 'MIX_BIN_POS_66', 'MIX_BIN_POS_67', 53 | 'MIX_BIN_POS_68', 'MIX_BIN_POS_69', 'MIX_BIN_POS_70', 'MIX_BIN_POS_71', 'MIX_BIN_POS_72', 'MIX_BIN_POS_73', 54 | 'MIX_BIN_POS_74', 'MIX_BIN_POS_75', 'MIX_BIN_POS_76', 'MIX_BIN_POS_77', 'MIX_BIN_POS_78', 'MIX_BIN_POS_79', 55 | 'MIX_BIN_POS_80', 'MIX_BIN_POS_81', 'MIX_BIN_POS_82', 'MIX_BIN_POS_83', 'MIX_BIN_POS_84', 'MIX_BIN_POS_85', 56 | 'MIX_BIN_POS_86', 'MIX_BIN_POS_87', 'MIX_BIN_POS_88', 'MIX_BIN_POS_89', 'MIX_BIN_POS_90', 'MIX_BIN_POS_91', 57 | 'MIX_BIN_POS_92', 'MIX_BIN_POS_93', 'MIX_BIN_POS_94', 'MIX_BIN_POS_95', 'MIX_BIN_POS_96', 'MIX_BIN_POS_97', 58 | 'MIX_BIN_POS_98', 'MIX_BIN_POS_99', 'MIX_BIN_POS_100', 'MIX_BIN_POS_101', 'MIX_BIN_POS_102', 'MIX_BIN_POS_103', 59 | 'MIX_BIN_POS_104', 'MIX_BIN_POS_105', 'MIX_BIN_POS_106', 'MIX_BIN_POS_107', 'MIX_BIN_POS_108', 'MIX_BIN_POS_109', 60 | 'MIX_BIN_POS_110', 'MIX_BIN_POS_111', 'MIX_BIN_POS_112', 'MIX_BIN_POS_113', 'MIX_BIN_POS_114', 'MIX_BIN_POS_115', 61 | 'MIX_BIN_POS_116', 'MIX_BIN_POS_117', 'MIX_BIN_POS_118', 'MIX_BIN_POS_119', 'MIX_BIN_POS_120', 'MIX_BIN_POS_121', 62 | 'MIX_BIN_POS_122', 'MIX_BIN_POS_123', 'MIX_BIN_POS_124', 'MIX_BIN_POS_125', 'MIX_BIN_POS_126', 'MIX_BIN_POS_127', 63 | 'MIX_BIN_POS_128', 'MIX_BIN_POS_129', 'MIX_BIN_POS_130', 'MIX_BIN_POS_131', 'MIX_BIN_POS_132', 'MIX_BIN_POS_133', 64 | 'MIX_BIN_POS_134', 'MIX_BIN_POS_135', 'MIX_BIN_POS_136', 'MIX_BIN_POS_137', 'MIX_BIN_POS_138', 'MIX_BIN_POS_139', 65 | 'MIX_BIN_POS_140', 'MIX_BIN_POS_141', 'MIX_BIN_POS_142', 'MIX_BIN_POS_143', 'MIX_BIN_POS_144', 'MIX_BIN_POS_145', 66 | 'MIX_BIN_POS_146', 'MIX_BIN_POS_147', 'MIX_BIN_POS_148', 'MIX_BIN_POS_149', 'MIX_BIN_POS_150', 'MIX_BIN_POS_151', 67 | 'MIX_BIN_POS_152', 'MIX_BIN_POS_153', 'MIX_BIN_POS_154', 'MIX_BIN_POS_155'] 68 | 69 | dtypes = {'category': ('DATA_SOURCE', 'EODDESC', 'CHART_TITLE', 'F_PROD_CODE', 'O_PROD_CODE','MIX_STATE','MIX_MODALITY'), 70 | 'int64': ('YYYY', 'MM', 'DD', 'YYYY_ACTIVE', 'MM_ACTIVE', 'F_VOLUME_ACTIVE', 71 | 'YYYY_NEXT', 'MM_NEXT', 'F_VOLUME_NEXT', 'F_VOLUME', 'PUT_VOLUME', 'CALL_VOLUME', 72 | 'OPTIONS_VOLUME', 'PUT_OI', 'CALL_OI','O_OI', 73 | 'MIX_COMPLACENT', 'MIX_BALANCED', 'MIX_ANXIOUS', 'MIX_CONFLICTED','MIX_DISTANCE'), 74 | 75 | 'float': ('DATECODE_EXCEL','CURRENT_PRICE_MOST_ACTIVE', 'PREVIOUS_PRICE_MOST_ACTIVE', 'PRICE_PCT_CHG', 'EXCESS_RETURN_INDEX', 76 | 'IMPLIED_VOL_ST', 'IMPLIED_VOL_LT', 'DAILY_VARIANCE', 'HISTORICAL_STD_ST', 'HISTORICAL_STD_LT', 'RATIO_STD_ST_LT', 77 | 'RATIO_STD_ST_TO_IMPLIED_VOL_CURRENT', 'RATIO_HIGH_LOW_PCT', 'HIGH_LOW_PCT_ST', 'HIGH_LOW_PCT_LT', 78 | 'RATIO_HIGH_LOW_ST_LT', 'PUT_VOLUME_ST', 'PUT_VOLUME_LT', 'RATIO_PUT_VOLUME_ST_LT', 'CALL_VOLUME_ST', 79 | 'CALL_VOLUME_LT', 'RATIO_CALL_VOLUME_ST_LT', 'RATIO_PUT_CALL_VOLUME_ST', 'RATIO_PUT_CALL_VOLUME_LT', 80 | 'PCT_DIF_PUT_CALL_ST_LT_RATIO', 'MOMENTUM_ST', 'MOMENTUM_LT', 'RATIO_MOMENTUM_ST_LT', 'RATIO_MOMENTUM_TO_STD_ST', 81 | 'RATIO_MOMENTUM_TO_STD_LT', 'PRICE_20D_MA', 'PRICE_60D_MA', 'PRICE_200D_MA', 'PCT_DIF_CURRENT_200D_PRICE', 82 | 'PCT_DIF_20D_200D_PRICE', 'PEAK_PRICE', 'PEAK_200D_PRICE', '20PCT_BELOW_PEAK_200D', '20PCT_ABOVE_60DMA', 83 | '20PCT_BELOW_60DMA', 'MIX_PROB_20PCT_ABOVE_60DMA', 84 | 'MIX_PROB_20PCT_BELOW_60DMA', 'MIX_MEAN', 'MIX_MEDIAN', 'MIX_MODE_1', 'MIX_MODE_2', 85 | 'MIX_STD', 'MIX_STD_LT', 'MIX_SKEW', 'MIX_KURTOSIS','MIX_INTENSITY', 'MIX_LOW_BIN', 'MIX_BIN_SIZE', 'MIX_BINS', 86 | 'MIX_BIN_NEG_100', 'MIX_BIN_NEG_99', 'MIX_BIN_NEG_98', 'MIX_BIN_NEG_97', 'MIX_BIN_NEG_96', 'MIX_BIN_NEG_95', 87 | 'MIX_BIN_NEG_94', 'MIX_BIN_NEG_93', 'MIX_BIN_NEG_92', 'MIX_BIN_NEG_91', 'MIX_BIN_NEG_90', 'MIX_BIN_NEG_89', 88 | 'MIX_BIN_NEG_88', 'MIX_BIN_NEG_87', 'MIX_BIN_NEG_86', 'MIX_BIN_NEG_85', 'MIX_BIN_NEG_84', 'MIX_BIN_NEG_83', 89 | 'MIX_BIN_NEG_82', 'MIX_BIN_NEG_81', 'MIX_BIN_NEG_80', 'MIX_BIN_NEG_79', 'MIX_BIN_NEG_78', 'MIX_BIN_NEG_77', 90 | 'MIX_BIN_NEG_76', 'MIX_BIN_NEG_75', 'MIX_BIN_NEG_74', 'MIX_BIN_NEG_73', 'MIX_BIN_NEG_72', 'MIX_BIN_NEG_71', 91 | 'MIX_BIN_NEG_70', 'MIX_BIN_NEG_69', 'MIX_BIN_NEG_68', 'MIX_BIN_NEG_67', 'MIX_BIN_NEG_66', 'MIX_BIN_NEG_65', 92 | 'MIX_BIN_NEG_64', 'MIX_BIN_NEG_63', 'MIX_BIN_NEG_62', 'MIX_BIN_NEG_61', 'MIX_BIN_NEG_60', 'MIX_BIN_NEG_59', 93 | 'MIX_BIN_NEG_58', 'MIX_BIN_NEG_57', 'MIX_BIN_NEG_56', 'MIX_BIN_NEG_55', 'MIX_BIN_NEG_54', 'MIX_BIN_NEG_53', 94 | 'MIX_BIN_NEG_52', 'MIX_BIN_NEG_51', 'MIX_BIN_NEG_50', 'MIX_BIN_NEG_49', 'MIX_BIN_NEG_48', 'MIX_BIN_NEG_47', 95 | 'MIX_BIN_NEG_46', 'MIX_BIN_NEG_45', 'MIX_BIN_NEG_44', 'MIX_BIN_NEG_43', 'MIX_BIN_NEG_42', 'MIX_BIN_NEG_41', 96 | 'MIX_BIN_NEG_40', 'MIX_BIN_NEG_39', 'MIX_BIN_NEG_38', 'MIX_BIN_NEG_37', 'MIX_BIN_NEG_36', 'MIX_BIN_NEG_35', 97 | 'MIX_BIN_NEG_34', 'MIX_BIN_NEG_33', 'MIX_BIN_NEG_32', 'MIX_BIN_NEG_31', 'MIX_BIN_NEG_30', 'MIX_BIN_NEG_29', 98 | 'MIX_BIN_NEG_28', 'MIX_BIN_NEG_27', 'MIX_BIN_NEG_26', 'MIX_BIN_NEG_25', 'MIX_BIN_NEG_24', 'MIX_BIN_NEG_23', 99 | 'MIX_BIN_NEG_22', 'MIX_BIN_NEG_21', 'MIX_BIN_NEG_20', 'MIX_BIN_NEG_19', 'MIX_BIN_NEG_18', 'MIX_BIN_NEG_17', 100 | 'MIX_BIN_NEG_16', 'MIX_BIN_NEG_15', 'MIX_BIN_NEG_14', 'MIX_BIN_NEG_13', 'MIX_BIN_NEG_12', 'MIX_BIN_NEG_11', 101 | 'MIX_BIN_NEG_10', 'MIX_BIN_NEG_09', 'MIX_BIN_NEG_08', 'MIX_BIN_NEG_07', 'MIX_BIN_NEG_06', 'MIX_BIN_NEG_05', 102 | 'MIX_BIN_NEG_04', 'MIX_BIN_NEG_03', 'MIX_BIN_NEG_02', 'MIX_BIN_NEG_01', 'MIX_BIN_POS_00', 'MIX_BIN_POS_01', 103 | 'MIX_BIN_POS_02', 'MIX_BIN_POS_03', 'MIX_BIN_POS_04', 'MIX_BIN_POS_05', 'MIX_BIN_POS_06', 'MIX_BIN_POS_07', 104 | 'MIX_BIN_POS_08', 'MIX_BIN_POS_09', 'MIX_BIN_POS_10', 'MIX_BIN_POS_11', 'MIX_BIN_POS_12', 'MIX_BIN_POS_13', 105 | 'MIX_BIN_POS_14', 'MIX_BIN_POS_15', 'MIX_BIN_POS_16', 'MIX_BIN_POS_17', 'MIX_BIN_POS_18', 'MIX_BIN_POS_19', 106 | 'MIX_BIN_POS_20', 'MIX_BIN_POS_21', 'MIX_BIN_POS_22', 'MIX_BIN_POS_23', 'MIX_BIN_POS_24', 'MIX_BIN_POS_25', 107 | 'MIX_BIN_POS_26', 'MIX_BIN_POS_27', 'MIX_BIN_POS_28', 'MIX_BIN_POS_29', 'MIX_BIN_POS_30', 'MIX_BIN_POS_31', 108 | 'MIX_BIN_POS_32', 'MIX_BIN_POS_33', 'MIX_BIN_POS_34', 'MIX_BIN_POS_35', 'MIX_BIN_POS_36', 'MIX_BIN_POS_37', 109 | 'MIX_BIN_POS_38', 'MIX_BIN_POS_39', 'MIX_BIN_POS_40', 'MIX_BIN_POS_41', 'MIX_BIN_POS_42', 'MIX_BIN_POS_43', 110 | 'MIX_BIN_POS_44', 'MIX_BIN_POS_45', 'MIX_BIN_POS_46', 'MIX_BIN_POS_47', 'MIX_BIN_POS_48', 'MIX_BIN_POS_49', 111 | 'MIX_BIN_POS_50', 'MIX_BIN_POS_51', 'MIX_BIN_POS_52', 'MIX_BIN_POS_53', 'MIX_BIN_POS_54', 'MIX_BIN_POS_55', 112 | 'MIX_BIN_POS_56', 'MIX_BIN_POS_57', 'MIX_BIN_POS_58', 'MIX_BIN_POS_59', 'MIX_BIN_POS_60', 'MIX_BIN_POS_61', 113 | 'MIX_BIN_POS_62', 'MIX_BIN_POS_63', 'MIX_BIN_POS_64', 'MIX_BIN_POS_65', 'MIX_BIN_POS_66', 'MIX_BIN_POS_67', 114 | 'MIX_BIN_POS_68', 'MIX_BIN_POS_69', 'MIX_BIN_POS_70', 'MIX_BIN_POS_71', 'MIX_BIN_POS_72', 'MIX_BIN_POS_73', 115 | 'MIX_BIN_POS_74', 'MIX_BIN_POS_75', 'MIX_BIN_POS_76', 'MIX_BIN_POS_77', 'MIX_BIN_POS_78', 'MIX_BIN_POS_79', 116 | 'MIX_BIN_POS_80', 'MIX_BIN_POS_81', 'MIX_BIN_POS_82', 'MIX_BIN_POS_83', 'MIX_BIN_POS_84', 'MIX_BIN_POS_85', 117 | 'MIX_BIN_POS_86', 'MIX_BIN_POS_87', 'MIX_BIN_POS_88', 'MIX_BIN_POS_89', 'MIX_BIN_POS_90', 'MIX_BIN_POS_91', 118 | 'MIX_BIN_POS_92', 'MIX_BIN_POS_93', 'MIX_BIN_POS_94', 'MIX_BIN_POS_95', 'MIX_BIN_POS_96', 'MIX_BIN_POS_97', 119 | 'MIX_BIN_POS_98', 'MIX_BIN_POS_99', 'MIX_BIN_POS_100', 'MIX_BIN_POS_101', 'MIX_BIN_POS_102', 'MIX_BIN_POS_103', 120 | 'MIX_BIN_POS_104', 'MIX_BIN_POS_105', 'MIX_BIN_POS_106', 'MIX_BIN_POS_107', 'MIX_BIN_POS_108', 'MIX_BIN_POS_109', 121 | 'MIX_BIN_POS_110', 'MIX_BIN_POS_111', 'MIX_BIN_POS_112', 'MIX_BIN_POS_113', 'MIX_BIN_POS_114', 'MIX_BIN_POS_115', 122 | 'MIX_BIN_POS_116', 'MIX_BIN_POS_117', 'MIX_BIN_POS_118', 'MIX_BIN_POS_119', 'MIX_BIN_POS_120', 'MIX_BIN_POS_121', 123 | 'MIX_BIN_POS_122', 'MIX_BIN_POS_123', 'MIX_BIN_POS_124', 'MIX_BIN_POS_125', 'MIX_BIN_POS_126', 'MIX_BIN_POS_127', 124 | 'MIX_BIN_POS_128', 'MIX_BIN_POS_129', 'MIX_BIN_POS_130', 'MIX_BIN_POS_131', 'MIX_BIN_POS_132', 'MIX_BIN_POS_133', 125 | 'MIX_BIN_POS_134', 'MIX_BIN_POS_135', 'MIX_BIN_POS_136', 'MIX_BIN_POS_137', 'MIX_BIN_POS_138', 'MIX_BIN_POS_139', 126 | 'MIX_BIN_POS_140', 'MIX_BIN_POS_141', 'MIX_BIN_POS_142', 'MIX_BIN_POS_143', 'MIX_BIN_POS_144', 'MIX_BIN_POS_145', 127 | 'MIX_BIN_POS_146', 'MIX_BIN_POS_147', 'MIX_BIN_POS_148', 'MIX_BIN_POS_149', 'MIX_BIN_POS_150', 'MIX_BIN_POS_151', 128 | 'MIX_BIN_POS_152', 'MIX_BIN_POS_153', 'MIX_BIN_POS_154', 'MIX_BIN_POS_155'), 129 | 'date': ('DATE_LABEL'), 130 | 'date:%Y%m%d': ('TRADEDATE')} 131 | 132 | def _load(self, file): 133 | df = pd.read_csv(file, skiprows = [1,2], low_memory=False) 134 | return df 135 | 136 | oneqbitloader = OneQBitLoader() 137 | -------------------------------------------------------------------------------- /datamine/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Loader 2 | -------------------------------------------------------------------------------- /datamine/loaders/base.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import glob 4 | import sys 5 | 6 | from importlib import import_module 7 | from importlib import reload 8 | from ..utils import tqdm_execute_tasks, logger 9 | 10 | __all__ = ['Loader'] 11 | 12 | 13 | class Loader(object): 14 | columns = None 15 | dtypes = None 16 | dataset = None 17 | dataset_args = None 18 | fileglob = '*.csv' 19 | index = None 20 | 21 | _by_name = None 22 | 23 | @classmethod 24 | def _load_datasets(cls): 25 | cls._by_name = {} 26 | pkg = __name__.rsplit('.', 1)[0] 27 | fpath, base = os.path.split(__file__) 28 | for fname in glob.glob(os.path.join(fpath, '*.py')): 29 | fname = os.path.basename(fname) 30 | if fname in (base, '__init__.py'): 31 | continue 32 | module = import_module('.' + fname[:-3], pkg) 33 | for key, value in module.__dict__.items(): 34 | if isinstance(value, cls): 35 | if not isinstance(value.dataset, str): 36 | raise RuntimeError('Invalid Loader: dataset must be a string, not {}'.format(type(value.dataset))) 37 | elif value.dataset in cls._by_name: 38 | raise RuntimeError('Invalid Loader: duplicate loader for {} dataset'.format(value.dataset)) 39 | else: 40 | cls._by_name[value.dataset] = value 41 | # {'BLOCK' : } 42 | 43 | @classmethod 44 | def datasets(cls): 45 | if cls._by_name is None: 46 | cls._load_datasets() 47 | return list(cls._by_name.keys()) 48 | 49 | @classmethod 50 | def by_name(cls, dataset, dataset_args = {}): 51 | cls.dataset_args = dataset_args 52 | if cls._by_name is None: 53 | cls._load_datasets() 54 | if dataset not in cls._by_name: 55 | raise RuntimeError('Dataset not found: {}'.format(dataset)) 56 | return cls._by_name[dataset] 57 | 58 | def _set_dtypes(self, df): 59 | if self.dtypes is None: 60 | return 61 | 62 | column_check = [] 63 | for k, v in self.dtypes.items(): 64 | for value in v: 65 | column_check.append(value) 66 | if self.columns is not None: 67 | if set(self.columns).difference(column_check): 68 | print("Mismatched column names & dtypes. Mismatches:", set(self.columns).difference(column_check)) 69 | logger.error(("Mismatched column names & dtypes. Mismatches:", set(self.columns).difference(column_check))) 70 | for dtype, cols in self.dtypes.items(): 71 | for col in ((cols,) if isinstance(cols, str) else cols): 72 | if col in df: 73 | if dtype.startswith('date'): 74 | format = None if dtype == 'date' else dtype[5:] 75 | df[col] = pd.to_datetime(df[col], format=format, utc=True, errors='ignore') 76 | else: 77 | df[col] = df[col].astype(dtype, errors='ignore') 78 | 79 | def _glob(self, path): 80 | return glob.glob(os.path.join(path, self.fileglob)) 81 | 82 | def _load(self, filename): 83 | '''Return a raw, unprocessed dataframe.''' 84 | return pd.read_csv(filename, low_memory=False) 85 | 86 | def _load_single(self, filename): 87 | '''Use _load to read a dataframe from disk, then assign new column 88 | names and coerce the datatypes, as appropriate.''' 89 | df = self._load(filename) 90 | if self.columns is not None: 91 | df.columns = self.columns 92 | self._set_dtypes(df) 93 | if self.index is not None: 94 | df = df.set_index(self.index) 95 | return df 96 | 97 | def _finalize(self, df): 98 | return df 99 | 100 | def load(self, filenames, limit=None, max_workers=None): 101 | '''Load a composite dataframe by concatenating individual files.''' 102 | if isinstance(filenames, str): 103 | if os.path.isdir(filenames): 104 | filenames = self._glob(filenames) 105 | elif '*' in filenames: 106 | filenames = glob.glob(filenames) 107 | else: 108 | filenames = [filenames] 109 | nframes = len(filenames) 110 | if limit and nframes > limit: 111 | logger.info('limiting to {}/{} files'.format(limit, nframes)) 112 | filenames = filenames[-limit:] 113 | nframes = limit 114 | if nframes == 0: 115 | result = pd.DataFrame(columns=self.columns) 116 | self._set_dtypes(result) 117 | elif nframes == 1: 118 | result = self._load_single(filenames[0]) 119 | else: 120 | result = tqdm_execute_tasks(self._load_single, filenames, 121 | 'reading {} data'.format(self.dataset), max_workers) 122 | logger.info('concatenating {} dataframes'.format(nframes)) 123 | result = pd.concat(result, ignore_index=self.index is None) 124 | # Set the categorical columns again, because concatenation often 125 | # results in a reversion to object dtype 126 | cols = self.dtypes.get('category', ()) 127 | for col in ((cols,) if isinstance(cols, str) else cols): 128 | if col in result: 129 | result[col] = result[col].astype('category', errors='ignore') 130 | return self._finalize(result) 131 | -------------------------------------------------------------------------------- /datamine/loaders/block.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import datetime 5 | import pytz 6 | 7 | 8 | class BlockLoader(Loader): 9 | dataset = 'BLOCK' 10 | fileglob = '*.csv.gz' 11 | 12 | # Column "Product Type 2" has an extra space after the name. 13 | # columns = ['Trade Datetime', 'Reported Datetime', 14 | # 'Contract Symbol', 'Product Code', 'Asset Class', 'Market Sector', 'Description ', 'Product Type ', 'Contract Year', 'Contract Month', 'Strike Price', 'Put/Call', 'Exchange Code', 'Trade Price', 'Trade Quantity', 'Trade Source', 'Spread Type', 'Spread Description', 15 | # 'Contract Symbol 2', 'Product Code 2', 'Asset Class 2', 'Market Sector 2', 'Description 2', 'Product Type 2 ', 'Contract Year 2', 'Contract Month 2', 'Strike Price 2', 'Put/Call 2', 'Exchange Code 2','Trade Price 2', 'Trade Quantity 2', 16 | # 'Contract Symbol 3', 'Product Code 3', 'Asset Class 3', 'Market Sector 3', 'Description 3', 'Product Type 3 ', 'Contract Year 3', 'Contract Month 3', 'Strike Price 3', 'Put/Call 3', 'Exchange Code 3', 17 | # 'Contract Symbol 4', 'Product Code 4', 'Asset Class 4', 'Market Sector 4', 'Description 4', 'Product Type 4 ', 'Contract Year 4', 'Contract Month 4', 'Strike Price 4', 'Put/Call 4', 'Exchange Code 4'] 18 | 19 | dtypes = {'category': ('Contract Symbol', 'Product Code', 'Asset Class', 'Market Sector', 'Description ', 'Product Type ', 'Put/Call', 'Exchange Code', 'Trade Source', 'Spread Type', 'Spread Description', 20 | 'Contract Symbol 2', 'Product Code 2', 'Asset Class 2', 'Market Sector 2', 'Description 2', 'Product Type 2 ', 'Put/Call 2', 'Exchange Code 2', 21 | 'Contract Symbol 3', 'Product Code 3', 'Asset Class 3', 'Market Sector 3', 'Description 3', 'Product Type 3 ', 'Put/Call 3', 'Exchange Code 3', 22 | 'Contract Symbol 4', 'Product Code 4', 'Asset Class 4', 'Market Sector 4', 'Description 4', 'Product Type 4 ', 'Put/Call 4', 'Exchange Code 4'), 23 | 'int64': ('Contract Year', 'Contract Month', 24 | 'Contract Year 2', 'Contract Month 2', 25 | 'Contract Year 3', 'Contract Month 3', 26 | 'Contract Year 4', 'Contract Month 4',), 27 | 'float': ('Strike Price', 'Trade Price', 'Trade Quantity', 28 | 'Strike Price 2', 'Trade Price 2', 'Trade Quantity 2', 29 | 'Strike Price 3', 30 | 'Strike Price 4'), 31 | 'date': ()} 32 | 33 | def _load(self, file): 34 | df = pd.read_csv(file, low_memory = False) 35 | 36 | df['Trade Datetime'] = df['Trade Date'].astype('str') + ' ' + df['Trade Time'].astype('str') 37 | df['Reported Datetime'] = df['Trade Date'].astype('str') + ' ' + df['Reported Time'].astype('str') 38 | 39 | timezone = df['Trade Datetime'].str[-2:] 40 | if timezone.unique()[0] == "ET": 41 | sub_string = " ET" 42 | timezone = pytz.timezone("US/Eastern") 43 | elif timezone.unique()[0] == "CT": 44 | sub_string = " CT" 45 | timezone = pytz.timezone("US/Central") 46 | else: 47 | pass 48 | 49 | df['Trade Datetime'] = df['Trade Datetime'].str.replace(sub_string, "") 50 | df['Reported Datetime'] = df['Reported Datetime'].str.replace(sub_string, "") 51 | 52 | df['Trade Datetime'] = df['Trade Datetime'].apply(datetime.datetime.strptime, args=('%Y%m%d %H:%M:%S',)) 53 | df['Trade Datetime'] = df['Trade Datetime'].apply(timezone.localize) 54 | df['Reported Datetime'] = df['Reported Datetime'].apply(datetime.datetime.strptime, args=('%Y%m%d %H:%M',)) 55 | df['Reported Datetime'] = df['Reported Datetime'].apply(timezone.localize) 56 | 57 | df = df.drop(['Trade Date', 'Trade Time', 'Reported Time'], axis=1) 58 | return(df) 59 | 60 | 61 | blockLoader = BlockLoader() 62 | -------------------------------------------------------------------------------- /datamine/loaders/cryptocurrency.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import gzip 5 | import json 6 | 7 | class CryptocurrencyLoader(Loader): 8 | dataset = 'CRYPTOCURRENCY' 9 | fileglob = '*_btcIndexJson.gz' 10 | index = 'mdEntryDateTime' 11 | 12 | dtypes = {'category': ('mdEntryCode', 'mdEntryType', 'mdUpdateAction', 13 | 'symbol', 'openCloseSettlFlag'), 14 | 'int64': ('rptSeq',), 15 | 'float': ('netChgPrevDay', 'netPctChg', 'mdEntryPx'), 16 | 'date:%Y%m%d_%H:%M:%S.%f': 'mdEntryDateTime'} 17 | 18 | def _load(self, filename): 19 | result = [] 20 | with gzip.open(filename, 'rt', encoding='utf-8') as f: 21 | for line in f: 22 | line = json.loads(line) 23 | if 'mdEntries' in line: 24 | result.append(line['mdEntries'][0]) 25 | result = pd.DataFrame(result) 26 | result['mdEntryDateTime'] = result['mdEntryDate'] + '_' + result['mdEntryTime'] 27 | return result.drop(['mdEntryDate', 'mdEntryType'], axis=1) 28 | 29 | cryptocurrencyLoader = CryptocurrencyLoader() 30 | -------------------------------------------------------------------------------- /datamine/loaders/eod.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | class EODLoader(Loader): 7 | dataset = 'EOD' 8 | fileglob = '*.gz' 9 | 10 | columns = ['Trade Date','Exchange Code', 'Asset Class', 'Product Code', 'Clearing Code', 11 | 'Product Description', 'Product Type', 'Underlying Product Code', 12 | 'Put/Call', 'Strike Price', 'Contract Year', 'Contract Month', 13 | 'Contract Day', 'Settlement', 'Settlement Cabinet Indicator', 14 | 'Open Interest', 'Total Volume', 'Globex Volume', 'Floor Volume', 15 | 'PNT Volume', 'Block Volume', 'EFP Volume', 'EOO Volume', 'EFR Volume', 16 | 'EFS Volume', 'EFB Volume', 'EFM Volume', 'SUB Volume', 'OPNT Volume', 17 | 'TAS Volume', 'TAS Block Volume', 'TAM Singapore Volume', 18 | 'TAM Singapore Block Volume', 'TAM London Volume', 19 | 'TAM London Block Volume', 'Globex Open Price', 20 | 'Globex Open Price Bid/Ask Indicator', 21 | 'Globex Open Price Cabinet Indicator', 'Globex High Price', 22 | 'Globex High Price Bid/Ask Indicator', 23 | 'Globex High Price Cabinet Indicator', 'Globex Low Price', 24 | 'Globex Low Price Bid/Ask Indicator', 25 | 'Globex Low Price Cabinet Indicator', 'Globex Close Price', 26 | 'Globex Close Price Bid/Ask Indicator', 27 | 'Globex Close Price Cabinet Indicator', 'Floor Open Price', 28 | 'Floor Open Price Bid/Ask Indicator', 29 | 'Floor Open Price Cabinet Indicator', 'Floor Open Second Price', 30 | 'Floor Open Second Price Bid/Ask Indicator', 'Floor High Price', 31 | 'Floor High Price Bid/Ask Indicator', 32 | 'Floor High Price Cabinet Indicator', 'Floor Low Price', 33 | 'Floor Low Price Bid/Ask Indicator', 34 | 'Floor Low Price Cabinet Indicator', 'Floor Close Price', 35 | 'Floor Close Price Bid/Ask Indicator', 36 | 'Floor Close Price Cabinet Indicator', 'Floor Close Second Price', 37 | 'Floor Close Second Price Bid/Ask Indicator', 'Floor Post-Close Price', 38 | 'Floor Post-Close Price Bid/Ask Indicator', 39 | 'Floor Post-Close Second Price', 40 | 'Floor Post-Close Second Price Bid/Ask Indicator', 'Delta', 41 | 'Implied Volatility', 'Last Trade Date', 'TAM (Trade At Marker)'] 42 | 43 | dtypes = {'category': ('Settlement Cabinet Indicator', 'Asset Class', 'Product Code', 'Clearing Code', 44 | 'Product Description', 'Product Type', 'Underlying Product Code', 45 | 'Put/Call', 'Strike Price', 'Contract Year', 'Contract Month', 46 | 'Contract Day','Exchange Code','Globex Open Price Bid/Ask Indicator', 47 | 'Globex Open Price Cabinet Indicator','Globex High Price Bid/Ask Indicator', 48 | 'Globex High Price Cabinet Indicator','Globex Close Price Bid/Ask Indicator', 49 | 'Globex Close Price Cabinet Indicator','Floor Open Price Bid/Ask Indicator', 50 | 'Floor Open Price Cabinet Indicator','Globex Low Price Bid/Ask Indicator', 51 | 'Globex Low Price Cabinet Indicator', 'Floor Open Second Price Bid/Ask Indicator', 52 | 'Floor High Price Bid/Ask Indicator', 53 | 'Floor High Price Cabinet Indicator','Floor Low Price Bid/Ask Indicator', 54 | 'Floor Low Price Cabinet Indicator','Floor Close Price Bid/Ask Indicator', 55 | 'Floor Close Price Cabinet Indicator','Floor Post-Close Price Bid/Ask Indicator', 56 | 'Floor Post-Close Second Price Bid/Ask Indicator','Floor Close Second Price Bid/Ask Indicator', 57 | ), 58 | 'int64': ('Open Interest', 'Total Volume', 'Globex Volume', 'Floor Volume', 59 | 'PNT Volume', 'Block Volume', 'EFP Volume', 'EOO Volume', 'EFR Volume', 60 | 'EFS Volume', 'EFB Volume', 'EFM Volume', 'SUB Volume', 'OPNT Volume', 61 | 'TAS Volume', 'TAS Block Volume', 'TAM Singapore Volume', 62 | 'TAM Singapore Block Volume', 'TAM London Volume', 63 | 'TAM London Block Volume'), 64 | 'float': ('Settlement', 65 | 'Globex Open Price', 66 | 'Globex High Price', 67 | 'Globex Low Price', 68 | 'Globex Close Price', 69 | 'Floor Open Price', 70 | 'Floor Open Second Price', 71 | 'Floor High Price', 72 | 'Floor Low Price', 73 | 'Floor Close Price', 74 | 'Floor Close Second Price', 75 | 'Floor Post-Close Price', 76 | 'Floor Post-Close Second Price', 77 | 'Delta', 78 | 'Implied Volatility', 'TAM (Trade At Marker)'), 79 | 'date:%Y%m%d': ('Trade Date','Last Trade Date'), 80 | } 81 | 82 | def _load(self, file): 83 | df = pd.read_csv(file, skiprows=1, header=None, low_memory=False) 84 | if len(df.columns) == 70: 85 | df.insert(len(df.columns), "TAM (Trade At Marker)", float(np.nan)) 86 | return df 87 | 88 | eodLoader = EODLoader() 89 | -------------------------------------------------------------------------------- /datamine/loaders/eris.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | class ErisLoader(Loader): 7 | dataset = 'ERIS' 8 | fileglob = 'ERIS_*.csv' 9 | 10 | columns = ['Symbol', 'FinalSettlementPrice', 'EvaluationDate', 'FirstTradeDate', 11 | 'ErisPAIDate', 'EffectiveDate', 'CashFlowAlignmentDate', 'MaturityDate', 'NPV (A)', 12 | 'FixedNPV', 'FloatingNPV', 'Coupon (%)', 'FairCoupon (%)', 'FixedPayment', 'FloatingPayment', 13 | 'NextFixedPaymentDate', 'NextFixedPaymentAmount', 'PreviousFixingDate', 'PreviousFixingRate', 14 | 'NextFloatingPaymentDate', 'NextFloatingPaymentAmount', 'NextFixingDate', 'PreviousSettlementDate', 15 | 'PreviousSettlementPrice', 'PreviousErisPAI', 'FedFundsDate', 'FedFundsRate (%)', 'AccrualDays', 16 | 'DailyIncrementalErisPAI', 'AccruedCoupons (B)', 'ErisPAI (C)', 'SettlementPrice (100+A+B-C)', 17 | 'RFQ NPV TickSize ($)', 'Nominal', 'ResetRateDescriptor', 'InterpolationFactor', 'HighTradePrice', 18 | 'LowTradePrice', 'LastTradePrice', 'DailyContractVolume', 'Tag55(T)', 'Tag65(T)', 'Tag55(T+1)', 19 | 'Tag65(T+1)', 'LastTradeDate', 'InitialSpeculatorMargin', 'SecondarySpeculatorMargin', 20 | 'InitialHedgerMargin', 'SecondaryHedgerMargin', 'ExchangeSymbol (EX005)', 'BloombergTicker', 21 | 'FirstFixingDate', 'Category', 'BenchmarkContractName', 'PV01', 'DV01', 'ShortName', 22 | 'EffectiveYearMonth', 'UnpaidFixedAccrualStartDate', 'UnpaidFixedAccrual', 'UnpaidFloatingAccrualStartDate', 'UnpaidFloatingAccrual', 'NetUnpaidFixedFloatingAccrual', 'NPV(A)lessNetUnpaidFixedFloatingAccrual', 'AccruedCoupons(B)plusNetUnpaidFixedFloatingAccrual'] 23 | 24 | dtypes = {'category': ('Symbol', 'ResetRateDescriptor', 'ExchangeSymbol (EX005)', 'BloombergTicker', 'EffectiveYearMonth'), 25 | 'int64': ('AccrualDays', 'EffectiveYearMonth', 'Nominal'), 26 | 'float': ('FinalSettlementPrice', 'NPV (A)', 'FixedNPV', 'FloatingNPV', 'Coupon (%)', 27 | 'FairCoupon (%)', 'FixedPayment', 'FloatingPayment', 'NextFixedPaymentAmount', 28 | 'PreviousFixingRate', 'NextFloatingPaymentAmount', 'PreviousSettlementPrice', 29 | 'PreviousErisPAI', 'FedFundsRate (%)', 'DailyIncrementalErisPAI', 'AccruedCoupons (B)', 30 | 'ErisPAI (C)', 'SettlementPrice (100+A+B-C)', 'InterpolationFactor', 31 | 'HighTradePrice', 'PV01', 'DV01', 32 | 'UnpaidFixedAccrual','UnpaidFloatingAccrual','NetUnpaidFixedFloatingAccrual', 33 | 'NPV(A)lessNetUnpaidFixedFloatingAccrual', 'AccruedCoupons(B)plusNetUnpaidFixedFloatingAccrual'), 34 | 'date:%m/%d/%Y': ('EvaluationDate', 'FirstTradeDate', 'ErisPAIDate', 35 | 'EffectiveDate', 'CashFlowAlignmentDate', 'MaturityDate', 36 | 'NextFixedPaymentDate', 'PreviousFixingDate', 'NextFloatingPaymentDate', 37 | 'NextFixingDate', 'PreviousSettlementDate', 38 | 'FedFundsDate', 'LastTradeDate', 'FirstFixingDate', 39 | 'UnpaidFixedAccrualStartDate', 'UnpaidFloatingAccrualStartDate')} 40 | 41 | def _load(self, file): 42 | df = pd.read_csv(file, low_memory=False) 43 | if len(df.columns) == 58: 44 | col_adjustment = {'UnpaidFixedAccrualStartDate' : np.datetime64(), 'UnpaidFixedAccrual' : float(), 'UnpaidFloatingAccrualStartDate' : np.datetime64(), 'UnpaidFloatingAccrual' : float(), 'NetUnpaidFixedFloatingAccrual' : float(), 'NPV(A)lessNetUnpaidFixedFloatingAccrual' : float(), 'AccruedCoupons(B)plusNetUnpaidFixedFloatingAccrual' : float()} 45 | for k, v in col_adjustment.items(): 46 | df.insert(len(df.columns), k, v) 47 | return df 48 | 49 | erisLoader = ErisLoader() 50 | -------------------------------------------------------------------------------- /datamine/loaders/fx.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | 5 | class FXLoader(Loader): 6 | dataset = 'FX' 7 | fileglob = '*.gz' 8 | 9 | columns = ['Timestamp', 'Pair', 'Ask', 'Bid'] 10 | 11 | dtypes = {'category': ('Pair',), 12 | 'int64': (), 13 | 'float': ('Ask','Bid'), 14 | 'date': ('Timestamp',), 15 | } 16 | 17 | def _load(self, file): 18 | df = pd.read_csv(file, skiprows=1, header=None, low_memory=False) 19 | 20 | return df 21 | 22 | fxLoader = FXLoader() 23 | -------------------------------------------------------------------------------- /datamine/loaders/govpx.py: -------------------------------------------------------------------------------- 1 | from datamine.loaders import Loader 2 | 3 | import pandas as pd 4 | 5 | class GOVPXLoader(Loader): 6 | 7 | dataset = 'GOVPX' 8 | 9 | govpx_us_treasury_cols = ['Timestamp','Producer','Record','Ask','AskType','AskYield','Bid','BidType','BidYield','BidYieldChg','CashAskPrice','CashBidPrice','CashMidPrice','Change','Coupon','CUSIP','Description','DollarFlow','High','ICAPVOL','IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice','IndicativeBidYield','IssueDate','Last','LastHitorTake','LastYield','Low','MaturityDate','Mid','MidChg','MidSnapChg','MidYield','MidYldSnapChg','Open','SettlementDate','ShortDescription','TreasuryType','VoiceAskPrice','VoiceAskSize','VoiceAskYield','VoiceBidPrice','VoiceBidSize','VoiceBidYield','VoiceTradeSize','VWAP','VWAP10AM-11AM','VWAP11AM-12PM','VWAP12PM-1PM','VWAP1PM-2PM','VWAP2PM-3PM','VWAP3PM-4PM','VWAP8AM-9AM','VWAP9AM-10AM','VWAY','VWAY10AM-11AM','VWAY11AM-12PM','VWAY12PM-1PM','VWAY1PM-2PM','VWAY2PM-3PM','VWAY3PM-4PM','VWAY8AM-9AM','VWAY9AM-10AM'] 10 | govpx_us_tips_cols = ['Timestamp','Producer','Record','Ask','AskYield','Bid','BidYield','BidYieldChg','BidYieldChg','Coupon','CUSIP','Description','High','ICAPVOL','IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice','IndicativeBidYield','IssueDate','Last','LastHitorTake','LastYield','Low','MaturityDate','Mid','MidChg','MidSnapChg','MidYield','MidYldSnapChg','Open','SettlementDate','ShortDescription','Spread','TreasuryType','VoiceAskPrice','VoiceAskSize','VoiceAskYield','VoiceBidPrice','VoiceBidSize','VoiceBidYield','VoiceTradeSize'] 11 | govpx_us_frn_cols = ['Date','Producer','Record','Ask','AskYield','Bid','BidYield','CashAskPrice','CashBidPrice','CashMidPrice','Coupon','CUSIP','Description','FirstCouponDate','FRNIndexRate','High','IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice','IssueDate','Last','LastHitorTake','LastYield','Low','MaturityDate','Mid','MidSnapChg','MidYield','MidYldSnapChg','ModifiedDuration','Open','PriceDuration','SettlementDate','TreasuryType','VoiceAskPrice','VoiceAskSize','VoiceAskYield','VoiceBidPrice','VoiceBidSize','VoiceBidYield','VoiceTradeSize'] 12 | govpx_us_agencies_cols = ['Timestamp','Producer','Record','AgencySwapSpd','AgencySwapSprdChg','Ask','AskSpread','AskYield','AskYTMSpread','Bid','BidSpread','BidYield','BidYTMSpread','Change','Coupon','CUSIP','Description','IndicativeAskYield','IndicativeAskSpd','IndicativeBidYield','IndicativeBidSpd','IndicativeBidYield','MaturityDate'] 13 | 14 | govpx_us_treasury_dtypes = {'category': ('Producer','Record','CUSIP','Description','LastHitorTake','ShortDescription',), 15 | 'int64': ('ICAPVOL','TreasuryType','VoiceAskSize','VoiceBidSize',), 16 | 'float': ('Ask','AskType','AskYield', 17 | 'Bid','BidType','BidYield','BidYieldChg', 18 | 'CashAskPrice','CashBidPrice','CashMidPrice', 19 | 'Change','Coupon','DollarFlow','High', 20 | 'IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice','IndicativeBidYield', 21 | 'Last', 'LastYield', 'Low', 22 | 'Mid','MidChg','MidSnapChg','MidYield','MidYldSnapChg', 23 | 'Open','VoiceAskPrice','VoiceAskYield','VoiceBidPrice','VoiceBidYield','VoiceTradeSize', 24 | 'VWAP','VWAP10AM-11AM','VWAP11AM-12PM','VWAP12PM-1PM','VWAP1PM-2PM','VWAP2PM-3PM','VWAP3PM-4PM','VWAP8AM-9AM','VWAP9AM-10AM', 25 | 'VWAY','VWAY10AM-11AM','VWAY11AM-12PM','VWAY12PM-1PM','VWAY1PM-2PM','VWAY2PM-3PM','VWAY3PM-4PM','VWAY8AM-9AM','VWAY9AM-10AM',), 26 | 'date': ('Timestamp','IssueDate','MaturityDate','SettlementDate'), 27 | } 28 | 29 | govpx_us_tips_dtypes = {'category': ('Producer','Record','CUSIP','Description','LastHitorTake','ShortDescription',), 30 | 'int64': ('ICAPVOL','TreasuryType','VoiceAskSize','VoiceBidSize',), 31 | 'float': ('Ask','AskYield', 32 | 'Bid','BidYield','BidYieldChg', 33 | 'Coupon','High', 34 | 'IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice','IndicativeBidYield', 35 | 'Last', 'LastYield', 'Low', 36 | 'Mid','MidChg','MidSnapChg','MidYield','MidYldSnapChg', 37 | 'Open','Spread','VoiceAskPrice','VoiceAskYield','VoiceBidPrice','VoiceBidYield','VoiceTradeSize',), 38 | 'date': ('Timestamp','IssueDate','MaturityDate','SettlementDate'), 39 | } 40 | 41 | govpx_us_frn_dtypes = {'category': ('Producer','Record','CUSIP','Description','LastHitorTake',), 42 | 'int64': ('TreasuryType','VoiceAskSize','VoiceBidSize',), 43 | 'float': ('Ask','AskYield', 44 | 'Bid','BidYield', 45 | 'CashAskPrice','CashBidPrice','CashMidPrice', 46 | 'Coupon','High', 47 | 'IndicativeAskPrice','IndicativeAskYield','IndicativeBidPrice', 48 | 'Last', 'LastYield', 'Low', 49 | 'Mid','MidSnapChg','MidYield','MidYldSnapChg', 50 | 'Open','VoiceAskPrice','VoiceAskYield','VoiceBidPrice','VoiceBidYield','VoiceTradeSize', 51 | 'FRNIndexRate','ModifiedDuration','PriceDuration'), 52 | 'date': ('Date', 'IssueDate','MaturityDate','SettlementDate', 'FirstCouponDate'), 53 | } 54 | 55 | govpx_us_agencies_dtypes = {'category': ('Producer','Record','CUSIP','Description',), 56 | 'int64': (), 57 | 'float': ('Ask','AskYield', 58 | 'Bid','BidYield', 59 | 'Change','Coupon', 60 | 'IndicativeAskYield','IndicativeBidYield', 61 | 'AgencySwapSpd','AgencySwapSprdChg', 62 | 'AskSpread','AskYTMSpread', 63 | 'BidSpread','BidYTMSpread', 64 | 'IndicativeAskSpd','IndicativeBidSpd',), 65 | 'date': ('Timestamp','MaturityDate',), 66 | } 67 | 68 | if Loader.dataset_args == None: 69 | print("Specify a dataset for the GovPX loader.") 70 | else: 71 | for k, v in Loader.dataset_args.items(): 72 | if k == 'dataset': 73 | if v == 'treasury': 74 | columns = govpx_us_treasury_cols 75 | dtypes = govpx_us_treasury_dtypes 76 | fileglob = "*_UST_*.csv" 77 | elif v == 'tips': 78 | columns = govpx_us_tips_cols 79 | dtypes = govpx_us_tips_dtypes 80 | fileglob = "*_TIPS_*.csv" 81 | elif v == 'frn': 82 | columns = govpx_us_frn_cols 83 | dtypes = govpx_us_frn_dtypes 84 | fileglob = "*_FRN_*.csv" 85 | elif v == 'agencies': 86 | columns = govpx_us_agencies_cols 87 | dtypes = govpx_us_agencies_dtypes 88 | fileglob = "*_Agencies_*.csv" 89 | print("Complete reload") 90 | 91 | def _load(self, file): 92 | df = pd.read_csv(file, skiprows=1, header=None, low_memory=False) 93 | return df 94 | 95 | govpxLoader = GOVPXLoader() 96 | -------------------------------------------------------------------------------- /datamine/loaders/liqtool.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | from datetime import datetime, timedelta 5 | start = datetime(1970, 1, 1) # Unix epoch start time 6 | 7 | class LiqLoader(Loader): 8 | dataset = 'LIQTOOL' 9 | fileglob = 'LIQTOOL_*.csv.gz' 10 | index = 'tradedate' 11 | 12 | dtypes = {'category': ('symbol', 'time_zone'), 13 | 'int64': ('lot_1_size', 'lot_2_size', 'lot_3_size', 'lot_4_size', 'lot_5_size', 14 | 'lot_6_size', 'lot_7_size', 'lot_8_size', 'lot_9_size', 'lot_10_size', 15 | 'lot_11_size', 'lot_12_size', 'lot_13_size', 'lot_14_size', 'lot_15_size', 16 | 'lot_16_size', 'lot_17_size', 'lot_18_size', 'lot_19_size', 'lot_20_size', 17 | 'lot_21_size', 'lot_22_size', 'lot_23_size', 'lot_24_size', 'lot_25_size', 'frontmonth'), 18 | 'float': ('avg_level_1_spread', 'avg_level_1_midprice', 'avg_level_1_weightedprice', 'avg_level_1_ask_price', 'avg_level_1_bid_price', 'avg_level_1_ask_quantity', 'avg_level_1_bid_quantity', 'avg_level_1_ask_orders', 'avg_level_1_bid_orders', 19 | 'avg_level_2_spread', 'avg_level_2_midprice', 'avg_level_2_weightedprice', 'avg_level_2_ask_price', 'avg_level_2_bid_price', 'avg_level_2_ask_quantity', 'avg_level_2_bid_quantity', 'avg_level_2_ask_orders', 'avg_level_2_bid_orders', 20 | 'avg_level_3_spread', 'avg_level_3_midprice', 'avg_level_3_weightedprice', 'avg_level_3_ask_price', 'avg_level_3_bid_price', 'avg_level_3_ask_quantity', 'avg_level_3_bid_quantity', 'avg_level_3_ask_orders', 'avg_level_3_bid_orders', 21 | 'avg_level_4_spread', 'avg_level_4_midprice', 'avg_level_4_weightedprice', 'avg_level_4_ask_price', 'avg_level_4_bid_price', 'avg_level_4_ask_quantity', 'avg_level_4_bid_quantity', 'avg_level_4_ask_orders', 'avg_level_4_bid_orders', 22 | 'avg_level_5_spread', 'avg_level_5_midprice', 'avg_level_5_weightedprice', 'avg_level_5_ask_price', 'avg_level_5_bid_price', 'avg_level_5_ask_quantity', 'avg_level_5_bid_quantity', 'avg_level_5_ask_orders', 'avg_level_5_bid_orders', 23 | 'avg_level_6_spread', 'avg_level_6_midprice', 'avg_level_6_weightedprice', 'avg_level_6_ask_price', 'avg_level_6_bid_price', 'avg_level_6_ask_quantity', 'avg_level_6_bid_quantity', 'avg_level_6_ask_orders', 'avg_level_6_bid_orders', 24 | 'avg_level_7_spread', 'avg_level_7_midprice', 'avg_level_7_weightedprice', 'avg_level_7_ask_price', 'avg_level_7_bid_price', 'avg_level_7_ask_quantity', 'avg_level_7_bid_quantity', 'avg_level_7_ask_orders', 'avg_level_7_bid_orders', 25 | 'avg_level_8_spread', 'avg_level_8_midprice', 'avg_level_8_weightedprice', 'avg_level_8_ask_price', 'avg_level_8_bid_price', 'avg_level_8_ask_quantity', 'avg_level_8_bid_quantity', 'avg_level_8_ask_orders', 'avg_level_8_bid_orders', 26 | 'avg_level_9_spread', 'avg_level_9_midprice', 'avg_level_9_weightedprice', 'avg_level_9_ask_price', 'avg_level_9_bid_price', 'avg_level_9_ask_quantity', 'avg_level_9_bid_quantity', 'avg_level_9_ask_orders', 'avg_level_9_bid_orders', 27 | 'avg_level_10_spread', 'avg_level_10_midprice', 'avg_level_10_weightedprice', 'avg_level_10_ask_price', 'avg_level_10_bid_price', 'avg_level_10_ask_quantity', 'avg_level_10_bid_quantity', 'avg_level_10_ask_orders', 'avg_level_10_bid_orders', 28 | 'lot_1_buy_ctt', 'lot_1_sell_ctt', 'lot_1_buy_depth', 'lot_1_sell_depth', 29 | 'lot_2_buy_ctt', 'lot_2_sell_ctt', 'lot_2_buy_depth', 'lot_2_sell_depth', 30 | 'lot_3_buy_ctt', 'lot_3_sell_ctt', 'lot_3_buy_depth', 'lot_3_sell_depth', 31 | 'lot_4_buy_ctt', 'lot_4_sell_ctt', 'lot_4_buy_depth', 'lot_4_sell_depth', 32 | 'lot_5_buy_ctt', 'lot_5_sell_ctt', 'lot_5_buy_depth', 'lot_5_sell_depth', 33 | 'lot_6_buy_ctt', 'lot_6_sell_ctt', 'lot_6_buy_depth', 'lot_6_sell_depth', 34 | 'lot_7_buy_ctt', 'lot_7_sell_ctt', 'lot_7_buy_depth', 'lot_7_sell_depth', 35 | 'lot_8_buy_ctt', 'lot_8_sell_ctt', 'lot_8_buy_depth', 'lot_8_sell_depth', 36 | 'lot_9_buy_ctt', 'lot_9_sell_ctt', 'lot_9_buy_depth', 'lot_9_sell_depth', 37 | 'lot_10_buy_ctt', 'lot_10_sell_ctt', 'lot_10_buy_depth', 'lot_10_sell_depth', 38 | 'lot_11_buy_ctt', 'lot_11_sell_ctt', 'lot_11_buy_depth', 'lot_11_sell_depth', 39 | 'lot_12_buy_ctt', 'lot_12_sell_ctt', 'lot_12_buy_depth', 'lot_12_sell_depth', 40 | 'lot_13_buy_ctt', 'lot_13_sell_ctt', 'lot_13_buy_depth', 'lot_13_sell_depth', 41 | 'lot_14_buy_ctt', 'lot_14_sell_ctt', 'lot_14_buy_depth', 'lot_14_sell_depth', 42 | 'lot_15_buy_ctt', 'lot_15_sell_ctt', 'lot_15_buy_depth', 'lot_15_sell_depth', 43 | 'lot_16_buy_ctt', 'lot_16_sell_ctt', 'lot_16_buy_depth', 'lot_16_sell_depth', 44 | 'lot_17_buy_ctt', 'lot_17_sell_ctt', 'lot_17_buy_depth', 'lot_17_sell_depth', 45 | 'lot_18_buy_ctt', 'lot_18_sell_ctt', 'lot_18_buy_depth', 'lot_18_sell_depth', 46 | 'lot_19_buy_ctt', 'lot_19_sell_ctt', 'lot_19_buy_depth', 'lot_19_sell_depth', 47 | 'lot_20_buy_ctt', 'lot_20_sell_ctt', 'lot_20_buy_depth', 'lot_20_sell_depth', 48 | 'lot_21_buy_ctt', 'lot_21_sell_ctt', 'lot_21_buy_depth', 'lot_21_sell_depth', 49 | 'lot_22_buy_ctt', 'lot_22_sell_ctt', 'lot_22_buy_depth', 'lot_22_sell_depth', 50 | 'lot_23_buy_ctt', 'lot_23_sell_ctt', 'lot_23_buy_depth', 'lot_23_sell_depth', 51 | 'lot_24_buy_ctt', 'lot_24_sell_ctt', 'lot_24_buy_depth', 'lot_24_sell_depth', 52 | 'lot_25_buy_ctt', 'lot_25_sell_ctt', 'lot_25_buy_depth', 'lot_25_sell_depth',), 53 | 'date': ('unixtime',), 54 | 'date:%Y%m%d': ('tradedate',)} 55 | 56 | def _load(self, file): 57 | df = pd.read_csv(file, low_memory = False) 58 | df['unixtime'] = df['unix_in_sec'].apply(lambda x: start + timedelta(seconds=x)) 59 | df = df.drop(['unix_in_sec'], axis=1) 60 | return(df) 61 | 62 | liqLoader = LiqLoader() 63 | -------------------------------------------------------------------------------- /datamine/loaders/orbitalinsight.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import os 5 | 6 | class OrbitalInsightLoader(Loader): 7 | dataset = 'ORBITALINSIGHT' 8 | fileglob = 'ORBITALINSIGHT_*.csv' 9 | 10 | columns = ['storage.capacity.estimate', 'volume.estimate.stderr', 'scaled.estimate.stderr', 11 | 'total.available.tanks', 'smoothed.estimate', 'sampled.tanks.1w', 12 | 'sampled.tanks.1d', 'volume.estimate', 'scaled.estimate', 'truth_value_mb', 13 | 'sampled.tanks', 'date', 'location'] 14 | 15 | dtypes = {'category': ('location',), 16 | 'int64': ('sampled.tanks', 'sampled.tanks.1d', 'sampled.tanks.1w', 'total.available.tanks'), 17 | 'float': ('smoothed.estimate', 'storage.capacity.estimate', 18 | 'truth_value_mb', 'volume.estimate', 'volume.estimate.stderr', 19 | 'scaled.estimate', 'scaled.estimate.stderr'), 20 | 'date': 'date'} 21 | 22 | def _load(self, file): 23 | _, location, sublocation, _ = os.path.basename(file).split('_', 3) 24 | if sublocation != '0': 25 | location = location + '_' + sublocation 26 | df = pd.read_csv(file, low_memory=False) 27 | df['location'] = location 28 | return df 29 | 30 | orbitalInsightLoader = OrbitalInsightLoader() 31 | -------------------------------------------------------------------------------- /datamine/loaders/rsmetrics.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import glob 4 | import os 5 | 6 | class RSMetricsLoader(Loader): 7 | dataset = 'RSMETRICS' 8 | 9 | names = ['Order', 'Ticker', 'Type', 'Full.Name', 'Name', 'Location.Type', 'Smelter.Storage', 10 | 'Metal.Shape', 'Metal.Type', 'YearMonthDayUTC', 'Address', 'City', 'State', 'Zip', 11 | 'Country', 'Employee.Cars', 'Containers', 'Trucks', 'Tippers', 'Total.Area.Metal.stocks.m2', 12 | 'Area.Piles.m2', 'Area.Concentrate.Bags.m2', 'Area.Cathodes.m2', 'Area.Anodes.m2', 13 | 'Comments', 'Notes', 'Time_Date', 'Time', 'Month', 'Day', 'Year', 'PrePost', 'DOW', 14 | 'Week.End', 'Region', 'Subregion', 'Latitude', 'Longitude', 'DIRECTORY', 'GMP', 15 | 'Location', 'Metal', 'YearMonth', 'Tot.Area', 'Drop'] 16 | 17 | dtypes = {'category': ('Ticker', 'Type', 'Full.Name', 'Name', 'Location.Type', 18 | 'Smelter.Storage', 'Metal.Shape', 'Metal.Type', 'Country', 'PrePost', 'PrePost', 19 | 'Location', 'Metal'), 20 | 'int64': ('Employee.Cars', 'Containers', 'Trucks', 'Tippers', 'Total.Area.Metal.stocks.m2', 21 | 'Area.Piles.m2', 'Area.Concentrate.Bags.m2', 'Area.Cathodes.m2', 22 | 'Area.Anodes.m2', 'Tot.Area'), 23 | 'date:%Y-%m-%d': ('Notes', ), 24 | 'date:%H:%M %m-%d-%Y': ('Time_Date', )} 25 | 26 | # Return the weekly data first, then the daily 27 | def _glob(self, path): 28 | base = os.path.join(path, 'RSMETRICS_*') 29 | return glob.glob(base + '_WEEKLY_*.csv') + glob.glob(base + '_DAILY_*.csv') 30 | 31 | rsMetricsLoader = RSMetricsLoader() 32 | -------------------------------------------------------------------------------- /datamine/loaders/sofr.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | 5 | class SOFROISLoader(Loader): 6 | dataset = 'SOFR' 7 | fileglob = 'SOFR_OIS_*.csv' 8 | columns = ['Trade Date', 'Exchange Code', 'Currency','Commodity Code', 9 | 'Short Description','Long Description', 'Curve Date', 'Offset', 10 | 'Discount Factor', 'Forward rate', 'Rate'] 11 | 12 | 13 | dtypes = {'category': ('Exchange Code', 'Currency', 'Commodity Code', 14 | 'Short Description', 'Long Description','Curve Date','Forward rate'), 15 | 'int64': ('Offset',), 16 | 'float': ('Discount Factor','Rate'), 17 | 'date:%Y%m%d': ('Trade Date',)} 18 | 19 | def _load(self, file): 20 | # Assumption: the header from the value column provides 21 | # the name of the measure for that CSV file. 22 | df = pd.read_csv(file, low_memory=False) 23 | return df 24 | 25 | sofroisLoader = SOFROISLoader() 26 | -------------------------------------------------------------------------------- /datamine/loaders/sofrsr.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | import gzip 5 | import json 6 | 7 | class SOFRStripRatesLoader(Loader): 8 | dataset = 'SOFRSR' 9 | fileglob = 'SOFRSR_TermRate_Fixings_*.JSON' 10 | 11 | columns = ['rate','transactionTime','businessDate','productCode','securityId','productDescription'] 12 | 13 | dtypes = { 14 | 'category': ('productCode', 'productDescription', 'securityId',), 15 | 'float': ('rate',), 16 | 'date:%m-%d-%Y' : ('businessDate'), 17 | 'date:%m-%d-%Y:%H:%M:%S' : ('transactionTime') 18 | } 19 | 20 | def _load(self, filename): 21 | result = [] 22 | with open(filename, 'rt', encoding='utf-8') as f: 23 | for line in f: 24 | line = json.loads(line) 25 | result = pd.json_normalize(line['payload']) 26 | 27 | return result 28 | 29 | SOFRstripratesLoader = SOFRStripRatesLoader() 30 | -------------------------------------------------------------------------------- /datamine/loaders/telluslabs.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | 5 | class TellusLabsLoader(Loader): 6 | dataset = 'TELLUSLABS' 7 | fileglob = 'TELLUSLABS_*.csv' 8 | index = 'metric_date' 9 | columns = ['crop', 'country_iso', 'geo_level', 'geo_id', 10 | 'geo_display_name', 'metric_date', 11 | 'value', 'measure'] 12 | dtypes = {'category': ('crop', 'country_iso', 'geo_level', 13 | 'geo_display_name', 'measure'), 14 | 'int64': ('geo_id',), 15 | 'float': ('value',), 16 | 'date:%Y-%m-%d': ('metric_date',)} 17 | 18 | def _load(self, file): 19 | # Assumption: the header from the value column provides 20 | # the name of the measure for that CSV file. 21 | df = pd.read_csv(file, low_memory=False) 22 | df['measure'] = df.columns[-1] 23 | return df 24 | 25 | tellusLabsLoader = TellusLabsLoader() 26 | -------------------------------------------------------------------------------- /datamine/loaders/tick.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | 5 | class TickLoader(Loader): 6 | dataset = 'TICK' 7 | fileglob = '*.gz' 8 | 9 | columns = ['trade_date_time', 'trade_date', 'trade_time', 10 | 'trade_sequence_number', 'session_indicator', 11 | 'ticker_symbol', 'future_option_index_indicator', 'contract_delivery_date', 12 | 'trade_quantity', 'strike_price', 'trade_price', 'ask_bid_type', 13 | 'indicative_quote_type', 'market_quote', 'close_open_type', 14 | 'valid_open_exception', 'post_close', 'cancel_code_type', 15 | 'insert_code_type', 'fast_late_indicator', 'cabinet_indicator', 16 | 'book_indicator', 'entry_date', 'exchange_code'] 17 | 18 | dtypes = {'category': ('session_indicator', 'ticker_symbol', 'future_option_index_indicator', 19 | 'close_open_type', 'exchange_code', 'ask_bid_type', 'indicative_quote_type', 20 | 'valid_open_exception', 'post_close', 'cancel_code_type', 21 | 'insert_code_type', 'fast_late_indicator', 'cabinet_indicator', 'book_indicator'), 22 | 'int64': ('trade_sequence_number', 'contract_delivery_date', 'trade_quantity'), 23 | 'float': ('strike_price', 'trade_price'), 24 | 'date:%H:%M:%s': ('trade_time'), 25 | 'date:%Y%m%d': ('trade_date', 'entry_date'), 26 | 'date': ('trade_date_time')} 27 | 28 | def _load(self, file): 29 | df = pd.read_csv(file, header=None, low_memory=False) 30 | 31 | # Make trade_date_time the first column 32 | df.insert(0, -1, df[0].astype(str) + 'T' + df[1].astype(str)) 33 | 34 | return(df) 35 | 36 | tickLoader = TickLoader() 37 | -------------------------------------------------------------------------------- /datamine/loaders/voi.py: -------------------------------------------------------------------------------- 1 | from . import Loader 2 | 3 | import pandas as pd 4 | 5 | class VOILoader(Loader): 6 | dataset = 'VOI' 7 | fileglob = '*.gz' 8 | 9 | columns = ['Trade Date','Exchange Code','Product Code','Product Description', 10 | 'Product Type','Put/Call','Strike Price', 11 | 'Contract Year','Contract Month','Open Interest', 12 | 'Total Volume','Globex Volume','Floor Volume','PNT Volume', 13 | 'Block Volume','DataType'] 14 | 15 | dtypes = {'category': ('Exchange Code','Product Code','Product Description', 16 | 'Product Type','Put/Call','Strike Price', 17 | 'Contract Year','Contract Month','DataType'), 18 | 'int64': ('Open Interest', 19 | 'Total Volume','Globex Volume','Floor Volume','PNT Volume', 20 | 'Block Volume'), 21 | 'float': (), 22 | 'date:%Y%m%d:%s': ('Trade Date',), 23 | } 24 | 25 | def _load(self, file): 26 | df = pd.read_csv(file, skiprows=1, header=None, low_memory=False) 27 | 28 | #Need to extract the timing of the data from the file name. 29 | if file[-17] == 'p': 30 | df['DataType'] = 'Preliminary' 31 | if file[-17] == 'f': 32 | df['DataType'] = 'Final' 33 | return df 34 | 35 | voiLoader = VOILoader() 36 | -------------------------------------------------------------------------------- /datamine/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from tqdm import tqdm 4 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed 5 | 6 | MAX_WORKERS = 4 7 | 8 | logger = logging.getLogger(__name__.rsplit('.', 1)[0]) 9 | 10 | # If we're in a Jupyter notebook, we need to play some tricks 11 | # in order to get the logger output to show up in the notebook. 12 | try: 13 | from IPython import get_ipython 14 | if 'IPKernelApp' in get_ipython().config: 15 | import sys 16 | logger.handlers = [logging.StreamHandler(sys.stderr)] 17 | logger.setLevel(logging.INFO) 18 | except Exception: 19 | pass 20 | 21 | def tqdm_execute_tasks(fn, keys, desc, max_workers=MAX_WORKERS, mode='process'): 22 | """ 23 | Equivalent to executor.map(fn, values), but uses a tqdm-based progress bar 24 | """ 25 | if max_workers == 1: 26 | return [fn(key) for key in tqdm(keys, desc=desc)] 27 | # Processes are better for the dataframe loading tasks, but 28 | # threads are significantly better for downloads 29 | Executor = ThreadPoolExecutor if mode == 'thread' else ProcessPoolExecutor 30 | with Executor(max_workers=max_workers) as executor: 31 | futures = [executor.submit(fn, key) for key in keys] 32 | for f in tqdm(as_completed(futures), total=len(keys), desc=desc): 33 | pass 34 | return [f.result() for f in futures] 35 | -------------------------------------------------------------------------------- /docs/CME Query API's - EOD_Block_Tick_BBO - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMEGroup/datamine_python/0454c8f04cb379de10c2949590bf6b92479af520/docs/CME Query API's - EOD_Block_Tick_BBO - Google Docs.pdf -------------------------------------------------------------------------------- /examples/images/BitcoinEndofDayValue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMEGroup/datamine_python/0454c8f04cb379de10c2949590bf6b92479af520/examples/images/BitcoinEndofDayValue.png -------------------------------------------------------------------------------- /examples/images/BitcoinRTIndexValue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMEGroup/datamine_python/0454c8f04cb379de10c2949590bf6b92479af520/examples/images/BitcoinRTIndexValue.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [flake8] 5 | ignore=W504,E501 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md') as fp: 4 | long_description = fp.read() 5 | 6 | setup( 7 | name="datamine", 8 | version="0.21.post2", 9 | description="CME Group DataMine Python Package", 10 | url="https://github.com/CMEGroup/datamine_python", 11 | author="Aaron Walters", 12 | author_email="aaron.walters@cmegroup.com", 13 | maintainer="Hamza Amjad", 14 | maintainer_email="hamza.amjad@cmegroup.com", 15 | license="BSD 3-Clause", 16 | install_requires=['requests', 'urllib3', 'pandas', 'tqdm', 'futures'], 17 | packages=find_packages(exclude=['tests']), 18 | long_description=long_description, 19 | long_description_content_type="text/markdown", 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Intended Audience :: Financial and Insurance Industry", 23 | "License :: OSI Approved :: BSD License", 24 | "Programming Language :: Python :: 3.5", 25 | "Programming Language :: Python :: 3.6", 26 | "Programming Language :: Python :: 3.7", 27 | ]) 28 | --------------------------------------------------------------------------------