├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
└── scrape.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Bryant Moscon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BITMEX Historical Data Scraper

Bitmex no longer offers historical trade data on their REST API.
They do have the data in a public AWS bucket, which this tool scrapes and converts to CSV files (one file per year).


### Installation
1. Clone/download repository
2. Install requirements: `pip install -r requirements.txt`


### Usage
* `python scrape.py` - Scrape all available data
* `python scrape.py --start YYYYMMDD` - Scrape data from start date through yesterday
* `python scrape.py --start YYYYMMDD --end YYYYMMDD` - Scrape data from start date through end date (inclusive)
* `python scrape.py --end YYYYMMDD` - Scrape data from start of data through end date
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pandas
--------------------------------------------------------------------------------
/scrape.py:
--------------------------------------------------------------------------------
from datetime import datetime as dt
from datetime import timedelta
import argparse
import glob
import gzip
import os
import shutil
import time

import requests


# Daily trade dumps are listed at https://public.bitmex.com/?prefix=data/trade/
endpoint = 'https://s3-eu-west-1.amazonaws.com/public.bitmex.com/data/trade/{}.csv.gz'


def scrape(year, date, end):
    """Download daily trade files from `date` through the end of `year`,
    stopping at `end` or yesterday, whichever comes first. Each day is
    written to a file named YYYYMMDD containing the decompressed CSV."""
    end_date = min(dt(year, 12, 31), dt.today() - timedelta(days=1))

    while date <= end_date and date <= end:
        date_str = date.strftime('%Y%m%d')
        print("Processing {}...".format(date_str))

        # Retry a failed download up to 10 times before giving up
        count = 0
        while True:
            r = requests.get(endpoint.format(date_str))
            if r.status_code == 200:
                break
            count += 1
            if count == 10:
                r.raise_for_status()
            print("Error processing {} - {}, trying again".format(date_str, r.status_code))
            time.sleep(10)

        # Save the gzipped response to disk, then decompress it in place
        with open(date_str, 'wb') as fp:
            fp.write(r.content)

        with gzip.open(date_str, 'rb') as fp:
            data = fp.read()

        with open(date_str, 'wb') as fp:
            fp.write(data)

        date += timedelta(days=1)


def merge(year):
    """Concatenate the daily files for `year` into a single {year}.csv,
    keeping only the first file's header row, then delete the daily files."""
    print("Generating CSV for {}".format(year))
    files = sorted(glob.glob("{}*".format(year)))
    first = True
    with open("{}.csv".format(year), 'wb') as out:
        for f in files:
            with open(f, 'rb') as fp:
                if first is False:
                    fp.readline()  # skip the header row on all but the first file
                first = False
                shutil.copyfileobj(fp, out)
    for f in files:
        os.unlink(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='BitMEX historical data scraper. Scrapes daily files into single-year CSVs')
    parser.add_argument('--start', default="20141122", help='start date, in YYYYMMDD format. Default is 2014-11-22, the earliest data date for BitMEX')
    parser.add_argument('--end', default=None, help='end date, in YYYYMMDD format. Default is yesterday')
    args = parser.parse_args()

    start = dt.strptime(args.start, '%Y%m%d')
    # scrape() never goes past yesterday, so defaulting to "now" effectively means "through yesterday"
    end = dt.strptime(args.end, '%Y%m%d') if args.end else dt.utcnow()

    # One scrape/merge pass per calendar year; the first year starts at --start,
    # later years start on January 1
    years = list(range(start.year, end.year + 1))

    starts = [dt(year, 1, 1) for year in years]
    starts[0] = start

    for year, start in zip(years, starts):
        scrape(year, start, end)
        merge(year)
--------------------------------------------------------------------------------
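
`scrape()` buffers each day's full response and its decompressed CSV in memory. Below is a minimal sketch of a more memory-friendly variant for a single day: stream the `.csv.gz` to disk with `requests`, then decompress it with `shutil.copyfileobj`. The `download_day` helper and its `.csv.gz`/`.csv` file-naming scheme are illustrative assumptions, not part of the repository.

```python
# Sketch only - not part of scrape.py. Downloads one day's trades without
# holding the whole file in memory.
import gzip
import shutil
import requests

ENDPOINT = 'https://s3-eu-west-1.amazonaws.com/public.bitmex.com/data/trade/{}.csv.gz'


def download_day(date_str):
    """Download one day's trades (e.g. '20190101') and return the CSV filename."""
    gz_name = date_str + '.csv.gz'
    csv_name = date_str + '.csv'

    # Stream the gzipped dump to disk in 1 MiB chunks
    with requests.get(ENDPOINT.format(date_str), stream=True) as r:
        r.raise_for_status()
        with open(gz_name, 'wb') as fp:
            for chunk in r.iter_content(chunk_size=1 << 20):
                fp.write(chunk)

    # Decompress without reading the whole file into memory
    with gzip.open(gz_name, 'rb') as src, open(csv_name, 'wb') as dst:
        shutil.copyfileobj(src, dst)

    return csv_name
```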
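requirements.txt lists `pandas` even though `scrape.py` never imports it, suggesting it is meant for working with the generated files. A small usage sketch follows, assuming a yearly file such as `2019.csv` has already been produced; the file name and the presence of a `symbol` column are assumptions about the dump, not something the repository guarantees.

```python
# Hypothetical usage sketch, not part of the repository.
# Assumes scrape.py has already produced 2019.csv in the working directory.
import pandas as pd

trades = pd.read_csv('2019.csv')    # one header row, one trade per row
print(trades.shape)
print(trades.columns.tolist())      # inspect which columns the dump actually has

# If the dump exposes a 'symbol' column (an assumption), filter one instrument
if 'symbol' in trades.columns:
    xbtusd = trades[trades['symbol'] == 'XBTUSD']
    print(len(xbtusd), 'XBTUSD rows')
```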