├── readme.md
├── listcreator.py
├── log.txt
├── unpack.py
└── scraper.py


/readme.md:
--------------------------------------------------------------------------------
# betfairdata

Code for downloading, unzipping and aggregating Betfair data.

http://data.betfair.com/

1. Truncate log.txt (or start with an empty one)
2. Run scraper.py (add your Betfair username / password)
3. Run unpack.py (add your MySQL host / user / password)
4. Repeat steps 2 and 3 to load the latest data


Don't judge me on the quality of the code.
--------------------------------------------------------------------------------
/listcreator.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import datetime
import csv


class ListCreator(object):

    def __init__(self, directory):
        self.directory = directory
        self.downloaded_list = []
        self.to_download_list = []

    def create_URL_list(self):
        """Read log.txt and collect the file hashes that have already been downloaded."""
        self.downloaded_list = []
        with open(self.directory + 'log.txt', 'r') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the first line
            for row in reader:
                self.downloaded_list.append(row[2])  # third field is the 32-char file hash
        print('AlreadyDownloaded', len(self.downloaded_list))
        return self.downloaded_list

    def load_URLs(self):
        """Scrape data.betfair.com and return the file hashes that have not been downloaded yet."""
        self.to_download_list = []
        date_home_page = 'http://data.betfair.com/#null'
        r = requests.get(date_home_page)
        soup = BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all('a'):
            if '#null' in a.get('href', ''):
                a = str(a)
                # The real link sits inside an onclick handler: take the quoted URL between
                # the brackets, then keep the 32-character hash that follows '?file='.
                url = a[a.index("(") + 1:a.rindex(")")].split(',')[0].replace("'", '')
                filename = url[33:]
                if filename not in self.downloaded_list and len(filename) == 32:
                    self.to_download_list.append(filename)
        return self.to_download_list

    def log_download(self, URL, file, filename):
        """Append a record (URL, server path, hash, timestamp) to log.txt."""
        with open(self.directory + 'log.txt', 'a', newline="\n") as f:
            writer = csv.writer(f)
            writer.writerow((URL, file, filename, datetime.datetime.now()))
--------------------------------------------------------------------------------
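A minimal sketch (not a file in the repository) of how ListCreator is meant to be driven; the
directory argument is the prefix where log.txt lives, and the zip path passed to log_download
below is only an illustrative placeholder:

    from listcreator import ListCreator

    lc = ListCreator('')                 # '' = log.txt sits in the working directory
    already = lc.create_URL_list()       # hashes recorded in log.txt by earlier runs
    pending = lc.load_URLs()             # 32-character hashes still to download
    for filename in pending:
        url = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filename
        # ...download the file here, then record it so the next run skips it...
        lc.log_download(url, '/datastore/data/example.zip', filename)
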
/log.txt:
--------------------------------------------------------------------------------
http://data.betfair.com/datastore/downloadfile.aspx?file=c5a08a62bbddf7b592bf948d97f63f10,/datastore/data/bfinf_horse_161219to161225_161228120000.zip,c5a08a62bbddf7b592bf948d97f63f10,2017-01-23 19:46:49.734778
http://data.betfair.com/datastore/downloadfile.aspx?file=c5a08a62bbddf7b592bf948d97f63f10,/datastore/data/bfinf_horse_161219to161225_161228120000.zip,c5a08a62bbddf7b592bf948d97f63f10,2017-01-23 19:47:57.509700
http://data.betfair.com/datastore/downloadfile.aspx?file=a66f71e4fcc4c328ef35f0299b02c142,/datastore/data/bfinf_horse_170102to170108_170111120004.zip,a66f71e4fcc4c328ef35f0299b02c142,2017-01-23 19:48:14.947477
http://data.betfair.com/datastore/downloadfile.aspx?file=81183fddf41d19a4bd4517856bce4096,/datastore/data/bfinf_horse_170109to170115_170118120002.zip,81183fddf41d19a4bd4517856bce4096,2017-01-23 19:48:15.358642
http://data.betfair.com/datastore/downloadfile.aspx?file=8d2979d671312e740f8a419551d462d5,/datastore/data/bfinf_horse_161212to161218_161221120001.zip,8d2979d671312e740f8a419551d462d5,2017-01-23 19:48:24.616177
http://data.betfair.com/datastore/downloadfile.aspx?file=3130cd351454e326576bfbe2ac6fb03b,/datastore/data/bfinf_horse_161226to170101_170104120002.zip,3130cd351454e326576bfbe2ac6fb03b,2017-01-23 19:48:29.757314
http://data.betfair.com/datastore/downloadfile.aspx?file=b2c6b7c99a2d9700c3b04f10e62b74da,/datastore/data/bfinf_horse_161205to161211_161214120005.zip,b2c6b7c99a2d9700c3b04f10e62b74da,2017-01-23 19:49:13.378715
http://data.betfair.com/datastore/downloadfile.aspx?file=302838b20658be3c729b8d597cea9aee,/datastore/data/bfinf_horse_161121to161127_161130120001.zip,302838b20658be3c729b8d597cea9aee,2017-01-23 19:49:13.795990
http://data.betfair.com/datastore/downloadfile.aspx?file=6ddad5266c2f6314785ec0f0a8409d14,/datastore/data/bfinf_horse_161128to161204_161207120000.zip,6ddad5266c2f6314785ec0f0a8409d14,2017-01-23 19:49:15.988496
http://data.betfair.com/datastore/downloadfile.aspx?file=ffdfd293f32c03f7725179106a3b3b36,/datastore/data/bfinf_horse_161107to161113_161116120004.zip,ffdfd293f32c03f7725179106a3b3b36,2017-01-23 19:49:25.843002
http://data.betfair.com/datastore/downloadfile.aspx?file=f56b08e22f32a4d315625e2628efc19a,/datastore/data/bfinf_horse_161114to161120_161123120003.zip,f56b08e22f32a4d315625e2628efc19a,2017-01-23 19:49:27.980456
http://data.betfair.com/datastore/downloadfile.aspx?file=333c6196115fef1c16a8b4b8f0fad4a0,/datastore/data/bfinf_horse_161031to161106_161109120004.zip,333c6196115fef1c16a8b4b8f0fad4a0,2017-01-23 19:54:54.853264
--------------------------------------------------------------------------------
/unpack.py:
--------------------------------------------------------------------------------
import os
import zipfile
from io import StringIO
import pandas as pd
import numpy as np
import pandas.io.sql as pd_sql
import pymysql
import queue
import threading
import datetime
pymysql.install_as_MySQLdb()


def create_data_frame(data, loadId):
    """Aggregate one raw Betfair CSV into volume-matched totals per course, date and in-play flag."""
    # error_bad_lines was removed in newer pandas; use on_bad_lines='skip' there.
    df = pd.read_csv(data, header=0, error_bad_lines=False)
    df['SETTLED_DATE'] = df['SETTLED_DATE'].str[:10]  # keep the date part only
    # pd.to_datetime(df['SETTLED_DATE'], format='%d-%m-%Y %H:%M:%S')
    df = df.drop(['ODDS', 'SELECTION_ID', 'WIN_FLAG', 'EVENT_ID'], axis=1)
    # Some files carry COUNTRY/COURSE columns; fall back to a coarser grouping when they are absent.
    if 'COURSE' in df.columns:
        t = df.groupby(['SPORTS_ID', 'COUNTRY', 'COURSE', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
    else:
        t = df.groupby(['SPORTS_ID', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
    t['LOAD_ID'] = loadId
    t['VOLUME_MATCHED'] = np.round(t['VOLUME_MATCHED'], 2)
    t['DATE_LOADED'] = datetime.datetime.now()
    return t
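
# To make the aggregation above concrete, a toy example with invented values
# (illustration only, not real Betfair data):
#
#     toy = pd.DataFrame({
#         'SPORTS_ID':      [7, 7, 7],
#         'COUNTRY':        ['GBR', 'GBR', 'IRE'],
#         'COURSE':         ['Ascot', 'Ascot', 'Naas'],
#         'SETTLED_DATE':   ['19-12-2016', '19-12-2016', '19-12-2016'],
#         'IN_PLAY':        ['PE', 'PE', 'PE'],
#         'VOLUME_MATCHED': [100.50, 40.25, 12.00],
#     })
#     toy.groupby(['SPORTS_ID', 'COUNTRY', 'COURSE', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
#
# yields two rows: the Ascot pair collapses to VOLUME_MATCHED = 140.75 and Naas keeps 12.00.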


def write_to_sql(dataframe, tablename='BETFAIR_ALL_DATA'):
    # flavor='mysql' only exists in older pandas (< 0.23); newer versions expect an
    # SQLAlchemy engine rather than a raw DBAPI connection.
    pd_sql.to_sql(dataframe, tablename, conn, if_exists='append', flavor='mysql')


def test_if(loadId):
    """Return True if no rows for this LOAD_ID have been written yet."""
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*) FROM BETFAIR_ALL_DATA WHERE LOAD_ID = %s", (loadId,))
    return cur.fetchone()[0] == 0


def load_to_list():
    """Return the LOAD_IDs already present in BETFAIR_ALL_DATA."""
    cur = conn.cursor()
    cur.execute("SELECT DISTINCT(LOAD_ID) FROM BETFAIR_ALL_DATA")
    return [row[0] for row in cur.fetchall()]


def my(myQu):
    """Consumer thread: take aggregated frames off the queue and write them to MySQL."""
    print('starting mysql worker')
    while True:
        df = myQu.get()
        write_to_sql(df)
        myQu.task_done()
        print('written')


def worker(fileQu, myQu):
    """Read each queued file, aggregate it and hand the result to the MySQL writer."""
    print('starting worker')
    while not fileQu.empty():
        file = fileQu.get()
        print('file', file)
        fn = file

        if os.path.splitext(fn)[1] == '.zip':
            try:
                filehandle = open(file, 'rb')
                zfile = zipfile.ZipFile(filehandle)

                for f in zfile.infolist():
                    try:
                        # Most archives hold a .csv; some older ones hold a .txt instead.
                        csv_file = '.'.join([os.path.splitext(f.filename)[0], 'csv'])
                        data = StringIO(zfile.read(csv_file).decode('utf-8'))
                        df = create_data_frame(data, fn)
                        myQu.put(df)
                    except KeyError:
                        csv_file = '.'.join([os.path.splitext(f.filename)[0], 'txt'])
                        data = StringIO(zfile.read(csv_file).decode('utf-8'))
                        df = create_data_frame(data, fn)
                        myQu.put(df)
            except Exception as e:
                print()
                print('error ', file, e)
                print()
        elif os.path.splitext(fn)[1] == '.csv':
            # Loose CSVs (e.g. extracted by hand from a .rar) are read from download/.
            fn = os.path.join('download', os.path.basename(os.path.splitext(fn)[0]) + '.csv')
            data = open(fn)
            df = create_data_frame(data, fn)
            myQu.put(df)
        elif os.path.splitext(fn)[1] == '.rar':
            pass  # .rar archives have to be extracted manually
        else:
            print('error', file)


conn = pymysql.connect(host='', port=3306,
                       user='', passwd='', db='DATA')
myQu = queue.Queue()
fileQu = queue.Queue()
# The MySQL writer runs on a daemon thread fed by myQu.
threading.Thread(target=my, args=(myQu,), daemon=True).start()

loads = load_to_list()
print(len(loads))
toloads = []

for subdir, dirs, files in os.walk('download'):
    files = [f for f in files if not f[0] == '.']
    for fn in files:
        ext = os.path.splitext(fn)[1].lower()
        if ext not in ('.zip', '.csv'):
            continue  # .rar and anything else is skipped here
        file = os.path.join('download', fn)
        if file not in loads:
            fileQu.put(file)
            toloads.append(file)
        else:
            os.remove(file)  # already loaded, so the file is no longer needed

print('toload', len(toloads), toloads)


worker(fileQu, myQu)
myQu.join()  # wait for the writer thread to drain its queue before exiting
--------------------------------------------------------------------------------
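unpack.py assumes a BETFAIR_ALL_DATA table already exists in the DATA schema (load_to_list()
queries it before anything is written). The repository does not ship the table definition, so
the sketch below is an assumption inferred purely from the columns the script writes; any other
numeric columns present in the Betfair CSVs are summed by the groupby as well and would need
matching columns:

    # Hypothetical DDL -- not part of the original repo; every name and type here is a guess.
    import pymysql

    conn = pymysql.connect(host='', port=3306, user='', passwd='', db='DATA')
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS BETFAIR_ALL_DATA (
                SPORTS_ID      INT,
                COUNTRY        VARCHAR(8),
                COURSE         VARCHAR(64),
                SETTLED_DATE   VARCHAR(10),  -- stored as raw date text, e.g. '19-12-2016'
                IN_PLAY        VARCHAR(8),
                VOLUME_MATCHED DOUBLE,
                LOAD_ID        VARCHAR(255),
                DATE_LOADED    DATETIME
            )
        """)
    conn.commit()
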
/scraper.py:
--------------------------------------------------------------------------------
import os
import requests
import queue
import threading
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

from listcreator import ListCreator


class MyAdapter(HTTPAdapter):
    """Transport adapter that pins TLSv1, which the old Betfair data site expects."""
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            ssl_version=ssl.PROTOCOL_TLSv1
        )


# Request header values must be strings, so the numeric/boolean entries are quoted.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '109',
    'EnableViewState': 'false'
}


class Downloader(object):

    def __init__(self):
        self.viewstate = None
        self.eventvalidation = None
        self.viewstategenerator = None

    def login(self, session, URL, filename, username, password):
        """Log in to the Betfair data console by replaying the ASP.NET hidden form fields."""
        login = 'https://bdpconsole.betfair.com/login/data/login.aspx?ReturnUrl=%2fdatastore%2fdownloadfile.' \
                'aspx%3ffile%3d' + filename + '&file=' + filename
        params = {'file': filename}
        r = session.get(URL, params=params)

        # The login page is ASP.NET: scrape __VIEWSTATE / __EVENTVALIDATION /
        # __VIEWSTATEGENERATOR and post them back together with the credentials.
        soup = BeautifulSoup(r.text, 'html.parser')
        self.viewstate = soup.select("#__VIEWSTATE")[0]['value']
        self.eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
        self.viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0]['value']

        payload = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            '__EVENTVALIDATION': self.eventvalidation,
            'txtUser': username,
            'txtPass': password,
            'btnAccept': 'Login'
        }
        params = {
            'ReturnUrl': URL[27:],
            'file': filename
        }
        r = session.post(login, data=payload, params=params, cookies=session.cookies, headers=headers)

        # Keep the post-login viewstate: the download postbacks need it.
        soup = BeautifulSoup(r.text, 'html.parser')
        viewstate = soup.find_all("input", {"type": "hidden", "name": "__VIEWSTATE"})
        viewstategenerator = soup.find_all("input", {"type": "hidden", "name": "__VIEWSTATEGENERATOR"})
        self.viewstate = viewstate[0]['value']
        self.viewstategenerator = viewstategenerator[0]['value']
        print('login', self.viewstate, self.viewstategenerator)

    def get_location(self, session, URL):
        """Post the download form and return the redirect location of the actual file."""
        payload = {
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            'btnAccept': 'Download'
        }
        req = session.post(URL, data=payload, allow_redirects=False)
        return req.headers['Location']

    def download_file(self, session, loc):
        """Stream the file behind `loc` into the download/ directory."""
        name = loc.split('/')[-1].split('.')[0]
        filetype = loc.split('.')[-1]
        payload = {
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            'btnAccept': 'Download'
        }
        URL = 'http://data.betfair.com' + loc
        req = session.get(URL, data=payload, allow_redirects=False, stream=True)

        if filetype in ['zip', 'rar']:
            with open('download/' + name + '.' + filetype, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
            return True
        else:
            print('error', loc, name, filetype)
            return False
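

# For orientation, the call sequence for a single file looks roughly like this
# (illustration only -- the credentials are placeholders and the hash is one taken
# from log.txt):
#
#     d = Downloader()
#     with requests.Session() as s:
#         s.mount('https://', MyAdapter())
#         file_hash = 'c5a08a62bbddf7b592bf948d97f63f10'
#         url = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + file_hash
#         d.login(s, url, file_hash, username='...', password='...')
#         loc = d.get_location(s, url)   # e.g. '/datastore/data/bfinf_horse_....zip'
#         d.download_file(s, loc)        # streams the zip into download/
#
# worker() below does exactly this in a loop fed from file_queue.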


def log(logQu):
    """Logger thread: record each completed download in log.txt and print progress."""
    zm = 0
    while True:
        (URL, loc, filename) = logQu.get()
        d.log_download(URL, loc, filename)
        logQu.task_done()
        zm += 1
        print(str(zm) + '/' + str(total))


def worker(s, fileQu):
    """Download every file hash on the queue and push a record onto the log queue."""
    print('Starting worker thread')
    while not fileQu.empty():
        filename = fileQu.get()
        URL = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filename

        loc = download.get_location(s, URL)
        ok = download.download_file(s, loc)
        if ok:  # only record downloads that actually reached disk
            log_queue.put((URL, loc, filename))


d = ListCreator('')
d.create_URL_list()
filenames = d.load_URLs()
total = len(filenames)
print('toDownload', len(filenames))
if not filenames:
    raise SystemExit('nothing new to download')

log_queue = queue.Queue()
file_queue = queue.Queue()

threading.Thread(target=log, args=(log_queue,), daemon=True).start()
for file in filenames:
    file_queue.put(file)

os.makedirs('download', exist_ok=True)  # downloaded archives are written here
download = Downloader()

with requests.Session() as s:
    s.cookies.clear()
    s.mount('https://', MyAdapter())

    URL = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filenames[0]
    download.login(s, URL, filenames[0], username='', password='')

    threads = [threading.Thread(target=worker, args=(s, file_queue)) for i in range(0, 5)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()       # keep the session open until every worker has finished
    log_queue.join()   # and let the logger flush its queue
--------------------------------------------------------------------------------