├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
└── scrape.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Bryant Moscon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BITMEX Historical Data Scraper

Bitmex no longer offers historical trade data on their REST API.
They do have the data in a public AWS bucket, which this tool scrapes and converts to CSV files (one file per year).


### Installation
1. Clone/download repository
2. Install requirements: `pip install -r requirements.txt`


### Usage
* `python scrape.py` - Scrape all available data
* `python scrape.py --start YYYYMMDD` - Scrape data from start date through yesterday
* `python scrape.py --start YYYYMMDD --end YYYYMMDD` - Scrape data from start date through end date (inclusive)
* `python scrape.py --end YYYYMMDD` - Scrape data from start of data through end date
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pandas
--------------------------------------------------------------------------------
/scrape.py:
--------------------------------------------------------------------------------
from datetime import datetime as dt
from datetime import timedelta
import argparse
import glob
import gzip
import os
import shutil
import time

import requests


# Daily trade dumps are listed at https://public.bitmex.com/?prefix=data/trade/
endpoint = 'https://s3-eu-west-1.amazonaws.com/public.bitmex.com/data/trade/{}.csv.gz'


def scrape(year, date, end):
    """Download daily trade files from `date` through the end of `year`,
    stopping at `end` or yesterday, whichever comes first. Each day is
    written to a file named YYYYMMDD containing the decompressed CSV."""
    end_date = min(dt(year, 12, 31), dt.today() - timedelta(days=1))

    while date <= end_date and date <= end:
        date_str = date.strftime('%Y%m%d')
        print("Processing {}...".format(date_str))

        # Retry a failed download up to 10 times before giving up
        count = 0
        while True:
            r = requests.get(endpoint.format(date_str))
            if r.status_code == 200:
                break
            count += 1
            if count == 10:
                r.raise_for_status()
            print("Error processing {} - {}, trying again".format(date_str, r.status_code))
            time.sleep(10)

        # Save the gzipped response to disk, then decompress it in place
        with open(date_str, 'wb') as fp:
            fp.write(r.content)

        with gzip.open(date_str, 'rb') as fp:
            data = fp.read()

        with open(date_str, 'wb') as fp:
            fp.write(data)

        date += timedelta(days=1)


def merge(year):
    """Concatenate the daily files for `year` into a single {year}.csv,
    keeping only the first file's header row, then delete the daily files."""
    print("Generating CSV for {}".format(year))
    files = sorted(glob.glob("{}*".format(year)))
    first = True
    with open("{}.csv".format(year), 'wb') as out:
        for f in files:
            with open(f, 'rb') as fp:
                if first is False:
                    fp.readline()  # skip the header row on all but the first file
                first = False
                shutil.copyfileobj(fp, out)
    for f in files:
        os.unlink(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='BitMEX historical data scraper. Scrapes daily files into single-year CSVs')
    parser.add_argument('--start', default="20141122", help='start date, in YYYYMMDD format. Default is 2014-11-22, the earliest data date for BitMEX')
    parser.add_argument('--end', default=None, help='end date, in YYYYMMDD format. Default is yesterday')
    args = parser.parse_args()

    start = dt.strptime(args.start, '%Y%m%d')
    # scrape() never goes past yesterday, so defaulting to "now" effectively means "through yesterday"
    end = dt.strptime(args.end, '%Y%m%d') if args.end else dt.utcnow()

    # One scrape/merge pass per calendar year; the first year starts at --start,
    # later years start on January 1
    years = list(range(start.year, end.year + 1))

    starts = [dt(year, 1, 1) for year in years]
    starts[0] = start

    for year, start in zip(years, starts):
        scrape(year, start, end)
        merge(year)
--------------------------------------------------------------------------------
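
`scrape()` buffers each day's full response and its decompressed CSV in memory. Below is a minimal sketch of a more memory-friendly variant for a single day: stream the `.csv.gz` to disk with `requests`, then decompress it with `shutil.copyfileobj`. The `download_day` helper and its `.csv.gz`/`.csv` file-naming scheme are illustrative assumptions, not part of the repository.

```python
# Sketch only - not part of scrape.py. Downloads one day's trades without
# holding the whole file in memory.
import gzip
import shutil
import requests

ENDPOINT = 'https://s3-eu-west-1.amazonaws.com/public.bitmex.com/data/trade/{}.csv.gz'


def download_day(date_str):
    """Download one day's trades (e.g. '20190101') and return the CSV filename."""
    gz_name = date_str + '.csv.gz'
    csv_name = date_str + '.csv'

    # Stream the gzipped dump to disk in 1 MiB chunks
    with requests.get(ENDPOINT.format(date_str), stream=True) as r:
        r.raise_for_status()
        with open(gz_name, 'wb') as fp:
            for chunk in r.iter_content(chunk_size=1 << 20):
                fp.write(chunk)

    # Decompress without reading the whole file into memory
    with gzip.open(gz_name, 'rb') as src, open(csv_name, 'wb') as dst:
        shutil.copyfileobj(src, dst)

    return csv_name
```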
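requirements.txt lists `pandas` even though `scrape.py` never imports it, suggesting it is meant for working with the generated files. A small usage sketch follows, assuming a yearly file such as `2019.csv` has already been produced; the file name and the presence of a `symbol` column are assumptions about the dump, not something the repository guarantees.

```python
# Hypothetical usage sketch, not part of the repository.
# Assumes scrape.py has already produced 2019.csv in the working directory.
import pandas as pd

trades = pd.read_csv('2019.csv')    # one header row, one trade per row
print(trades.shape)
print(trades.columns.tolist())      # inspect which columns the dump actually has

# If the dump exposes a 'symbol' column (an assumption), filter one instrument
if 'symbol' in trades.columns:
    xbtusd = trades[trades['symbol'] == 'XBTUSD']
    print(len(xbtusd), 'XBTUSD rows')
```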