├── readme.md
├── listcreator.py
├── log.txt
├── unpack.py
└── scraper.py


/readme.md:
--------------------------------------------------------------------------------
# betfairdata

Code for downloading, unzipping and aggregating Betfair data.

http://data.betfair.com/

1. Truncate log.txt (or start with an empty one)
2. Run scraper.py (add your Betfair username / password)
3. Run unpack.py (add your MySQL host / user / password)
4. Repeat steps 2 and 3 to load the latest data


Don't judge me on the quality of the code.
--------------------------------------------------------------------------------
/listcreator.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import datetime
import csv


class ListCreator(object):

    def __init__(self, directory):
        self.directory = directory
        self.downloaded_list = []
        self.to_download_list = []

    def create_URL_list(self):
        """Read log.txt and collect the file hashes that have already been downloaded."""
        self.downloaded_list = []
        with open(self.directory + 'log.txt', 'r') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the first line
            for row in reader:
                self.downloaded_list.append(row[2])  # third field is the 32-char file hash
        print('AlreadyDownloaded', len(self.downloaded_list))
        return self.downloaded_list

    def load_URLs(self):
        """Scrape data.betfair.com and return the file hashes that have not been downloaded yet."""
        self.to_download_list = []
        date_home_page = 'http://data.betfair.com/#null'
        r = requests.get(date_home_page)
        soup = BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all('a'):
            if '#null' in a.get('href', ''):
                a = str(a)
                # The real link sits inside an onclick handler: take the quoted URL between
                # the brackets, then keep the 32-character hash that follows '?file='.
                url = a[a.index("(") + 1:a.rindex(")")].split(',')[0].replace("'", '')
                filename = url[33:]
                if filename not in self.downloaded_list and len(filename) == 32:
                    self.to_download_list.append(filename)
        return self.to_download_list

    def log_download(self, URL, file, filename):
        """Append a record (URL, server path, hash, timestamp) to log.txt."""
        with open(self.directory + 'log.txt', 'a', newline="\n") as f:
            writer = csv.writer(f)
            writer.writerow((URL, file, filename, datetime.datetime.now()))
--------------------------------------------------------------------------------
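A minimal sketch (not a file in the repository) of how ListCreator is meant to be driven; the
directory argument is the prefix where log.txt lives, and the zip path passed to log_download
below is only an illustrative placeholder:

    from listcreator import ListCreator

    lc = ListCreator('')                 # '' = log.txt sits in the working directory
    already = lc.create_URL_list()       # hashes recorded in log.txt by earlier runs
    pending = lc.load_URLs()             # 32-character hashes still to download
    for filename in pending:
        url = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filename
        # ...download the file here, then record it so the next run skips it...
        lc.log_download(url, '/datastore/data/example.zip', filename)
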
/log.txt:
--------------------------------------------------------------------------------
http://data.betfair.com/datastore/downloadfile.aspx?file=c5a08a62bbddf7b592bf948d97f63f10,/datastore/data/bfinf_horse_161219to161225_161228120000.zip,c5a08a62bbddf7b592bf948d97f63f10,2017-01-23 19:46:49.734778
http://data.betfair.com/datastore/downloadfile.aspx?file=c5a08a62bbddf7b592bf948d97f63f10,/datastore/data/bfinf_horse_161219to161225_161228120000.zip,c5a08a62bbddf7b592bf948d97f63f10,2017-01-23 19:47:57.509700
http://data.betfair.com/datastore/downloadfile.aspx?file=a66f71e4fcc4c328ef35f0299b02c142,/datastore/data/bfinf_horse_170102to170108_170111120004.zip,a66f71e4fcc4c328ef35f0299b02c142,2017-01-23 19:48:14.947477
http://data.betfair.com/datastore/downloadfile.aspx?file=81183fddf41d19a4bd4517856bce4096,/datastore/data/bfinf_horse_170109to170115_170118120002.zip,81183fddf41d19a4bd4517856bce4096,2017-01-23 19:48:15.358642
http://data.betfair.com/datastore/downloadfile.aspx?file=8d2979d671312e740f8a419551d462d5,/datastore/data/bfinf_horse_161212to161218_161221120001.zip,8d2979d671312e740f8a419551d462d5,2017-01-23 19:48:24.616177
http://data.betfair.com/datastore/downloadfile.aspx?file=3130cd351454e326576bfbe2ac6fb03b,/datastore/data/bfinf_horse_161226to170101_170104120002.zip,3130cd351454e326576bfbe2ac6fb03b,2017-01-23 19:48:29.757314
http://data.betfair.com/datastore/downloadfile.aspx?file=b2c6b7c99a2d9700c3b04f10e62b74da,/datastore/data/bfinf_horse_161205to161211_161214120005.zip,b2c6b7c99a2d9700c3b04f10e62b74da,2017-01-23 19:49:13.378715
http://data.betfair.com/datastore/downloadfile.aspx?file=302838b20658be3c729b8d597cea9aee,/datastore/data/bfinf_horse_161121to161127_161130120001.zip,302838b20658be3c729b8d597cea9aee,2017-01-23 19:49:13.795990
http://data.betfair.com/datastore/downloadfile.aspx?file=6ddad5266c2f6314785ec0f0a8409d14,/datastore/data/bfinf_horse_161128to161204_161207120000.zip,6ddad5266c2f6314785ec0f0a8409d14,2017-01-23 19:49:15.988496
http://data.betfair.com/datastore/downloadfile.aspx?file=ffdfd293f32c03f7725179106a3b3b36,/datastore/data/bfinf_horse_161107to161113_161116120004.zip,ffdfd293f32c03f7725179106a3b3b36,2017-01-23 19:49:25.843002
http://data.betfair.com/datastore/downloadfile.aspx?file=f56b08e22f32a4d315625e2628efc19a,/datastore/data/bfinf_horse_161114to161120_161123120003.zip,f56b08e22f32a4d315625e2628efc19a,2017-01-23 19:49:27.980456
http://data.betfair.com/datastore/downloadfile.aspx?file=333c6196115fef1c16a8b4b8f0fad4a0,/datastore/data/bfinf_horse_161031to161106_161109120004.zip,333c6196115fef1c16a8b4b8f0fad4a0,2017-01-23 19:54:54.853264
--------------------------------------------------------------------------------
/unpack.py:
--------------------------------------------------------------------------------
import os
import zipfile
from io import StringIO
import pandas as pd
import numpy as np
import pandas.io.sql as pd_sql
import pymysql
import queue
import threading
import datetime
pymysql.install_as_MySQLdb()


def create_data_frame(data, loadId):
    """Aggregate one raw Betfair CSV into volume-matched totals per course, date and in-play flag."""
    # error_bad_lines was removed in newer pandas; use on_bad_lines='skip' there.
    df = pd.read_csv(data, header=0, error_bad_lines=False)
    df['SETTLED_DATE'] = df['SETTLED_DATE'].str[:10]  # keep the date part only
    # pd.to_datetime(df['SETTLED_DATE'], format='%d-%m-%Y %H:%M:%S')
    df = df.drop(['ODDS', 'SELECTION_ID', 'WIN_FLAG', 'EVENT_ID'], axis=1)
    # Some files carry COUNTRY/COURSE columns; fall back to a coarser grouping when they are absent.
    if 'COURSE' in df.columns:
        t = df.groupby(['SPORTS_ID', 'COUNTRY', 'COURSE', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
    else:
        t = df.groupby(['SPORTS_ID', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
    t['LOAD_ID'] = loadId
    t['VOLUME_MATCHED'] = np.round(t['VOLUME_MATCHED'], 2)
    t['DATE_LOADED'] = datetime.datetime.now()
    return t
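
# To make the aggregation above concrete, a toy example with invented values
# (illustration only, not real Betfair data):
#
#     toy = pd.DataFrame({
#         'SPORTS_ID':      [7, 7, 7],
#         'COUNTRY':        ['GBR', 'GBR', 'IRE'],
#         'COURSE':         ['Ascot', 'Ascot', 'Naas'],
#         'SETTLED_DATE':   ['19-12-2016', '19-12-2016', '19-12-2016'],
#         'IN_PLAY':        ['PE', 'PE', 'PE'],
#         'VOLUME_MATCHED': [100.50, 40.25, 12.00],
#     })
#     toy.groupby(['SPORTS_ID', 'COUNTRY', 'COURSE', 'SETTLED_DATE', 'IN_PLAY'], sort=True).sum()
#
# yields two rows: the Ascot pair collapses to VOLUME_MATCHED = 140.75 and Naas keeps 12.00.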


def write_to_sql(dataframe, tablename='BETFAIR_ALL_DATA'):
    # flavor='mysql' only exists in older pandas (< 0.23); newer versions expect an
    # SQLAlchemy engine rather than a raw DBAPI connection.
    pd_sql.to_sql(dataframe, tablename, conn, if_exists='append', flavor='mysql')


def test_if(loadId):
    """Return True if no rows for this LOAD_ID have been written yet."""
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*) FROM BETFAIR_ALL_DATA WHERE LOAD_ID = %s", (loadId,))
    return cur.fetchone()[0] == 0


def load_to_list():
    """Return the LOAD_IDs already present in BETFAIR_ALL_DATA."""
    cur = conn.cursor()
    cur.execute("SELECT DISTINCT(LOAD_ID) FROM BETFAIR_ALL_DATA")
    return [row[0] for row in cur.fetchall()]


def my(myQu):
    """Consumer thread: take aggregated frames off the queue and write them to MySQL."""
    print('starting mysql worker')
    while True:
        df = myQu.get()
        write_to_sql(df)
        myQu.task_done()
        print('written')


def worker(fileQu, myQu):
    """Read each queued file, aggregate it and hand the result to the MySQL writer."""
    print('starting worker')
    while not fileQu.empty():
        file = fileQu.get()
        print('file', file)
        fn = file

        if os.path.splitext(fn)[1] == '.zip':
            try:
                filehandle = open(file, 'rb')
                zfile = zipfile.ZipFile(filehandle)

                for f in zfile.infolist():
                    try:
                        # Most archives hold a .csv; some older ones hold a .txt instead.
                        csv_file = '.'.join([os.path.splitext(f.filename)[0], 'csv'])
                        data = StringIO(zfile.read(csv_file).decode('utf-8'))
                        df = create_data_frame(data, fn)
                        myQu.put(df)
                    except KeyError:
                        csv_file = '.'.join([os.path.splitext(f.filename)[0], 'txt'])
                        data = StringIO(zfile.read(csv_file).decode('utf-8'))
                        df = create_data_frame(data, fn)
                        myQu.put(df)
            except Exception as e:
                print()
                print('error ', file, e)
                print()
        elif os.path.splitext(fn)[1] == '.csv':
            # Loose CSVs (e.g. extracted by hand from a .rar) are read from download/.
            fn = os.path.join('download', os.path.basename(os.path.splitext(fn)[0]) + '.csv')
            data = open(fn)
            df = create_data_frame(data, fn)
            myQu.put(df)
        elif os.path.splitext(fn)[1] == '.rar':
            pass  # .rar archives have to be extracted manually
        else:
            print('error', file)


conn = pymysql.connect(host='', port=3306,
                       user='', passwd='', db='DATA')
myQu = queue.Queue()
fileQu = queue.Queue()
# The MySQL writer runs on a daemon thread fed by myQu.
threading.Thread(target=my, args=(myQu,), daemon=True).start()

loads = load_to_list()
print(len(loads))
toloads = []

for subdir, dirs, files in os.walk('download'):
    files = [f for f in files if not f[0] == '.']
    for fn in files:
        ext = os.path.splitext(fn)[1].lower()
        if ext not in ('.zip', '.csv'):
            continue  # .rar and anything else is skipped here
        file = os.path.join('download', fn)
        if file not in loads:
            fileQu.put(file)
            toloads.append(file)
        else:
            os.remove(file)  # already loaded, so the file is no longer needed

print('toload', len(toloads), toloads)


worker(fileQu, myQu)
myQu.join()  # wait for the writer thread to drain its queue before exiting
--------------------------------------------------------------------------------
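unpack.py assumes a BETFAIR_ALL_DATA table already exists in the DATA schema (load_to_list()
queries it before anything is written). The repository does not ship the table definition, so
the sketch below is an assumption inferred purely from the columns the script writes; any other
numeric columns present in the Betfair CSVs are summed by the groupby as well and would need
matching columns:

    # Hypothetical DDL -- not part of the original repo; every name and type here is a guess.
    import pymysql

    conn = pymysql.connect(host='', port=3306, user='', passwd='', db='DATA')
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS BETFAIR_ALL_DATA (
                SPORTS_ID      INT,
                COUNTRY        VARCHAR(8),
                COURSE         VARCHAR(64),
                SETTLED_DATE   VARCHAR(10),  -- stored as raw date text, e.g. '19-12-2016'
                IN_PLAY        VARCHAR(8),
                VOLUME_MATCHED DOUBLE,
                LOAD_ID        VARCHAR(255),
                DATE_LOADED    DATETIME
            )
        """)
    conn.commit()
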
/scraper.py:
--------------------------------------------------------------------------------
import os
import requests
import queue
import threading
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

from listcreator import ListCreator


class MyAdapter(HTTPAdapter):
    """Transport adapter that pins TLSv1, which the old Betfair data site expects."""
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            ssl_version=ssl.PROTOCOL_TLSv1
        )


# Request header values must be strings, so the numeric/boolean entries are quoted.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '109',
    'EnableViewState': 'false'
}


class Downloader(object):

    def __init__(self):
        self.viewstate = None
        self.eventvalidation = None
        self.viewstategenerator = None

    def login(self, session, URL, filename, username, password):
        """Log in to the Betfair data console by replaying the ASP.NET hidden form fields."""
        login = 'https://bdpconsole.betfair.com/login/data/login.aspx?ReturnUrl=%2fdatastore%2fdownloadfile.' \
                'aspx%3ffile%3d' + filename + '&file=' + filename
        params = {'file': filename}
        r = session.get(URL, params=params)

        # The login page is ASP.NET: scrape __VIEWSTATE / __EVENTVALIDATION /
        # __VIEWSTATEGENERATOR and post them back together with the credentials.
        soup = BeautifulSoup(r.text, 'html.parser')
        self.viewstate = soup.select("#__VIEWSTATE")[0]['value']
        self.eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
        self.viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0]['value']

        payload = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            '__EVENTVALIDATION': self.eventvalidation,
            'txtUser': username,
            'txtPass': password,
            'btnAccept': 'Login'
        }
        params = {
            'ReturnUrl': URL[27:],
            'file': filename
        }
        r = session.post(login, data=payload, params=params, cookies=session.cookies, headers=headers)

        # Keep the post-login viewstate: the download postbacks need it.
        soup = BeautifulSoup(r.text, 'html.parser')
        viewstate = soup.find_all("input", {"type": "hidden", "name": "__VIEWSTATE"})
        viewstategenerator = soup.find_all("input", {"type": "hidden", "name": "__VIEWSTATEGENERATOR"})
        self.viewstate = viewstate[0]['value']
        self.viewstategenerator = viewstategenerator[0]['value']
        print('login', self.viewstate, self.viewstategenerator)

    def get_location(self, session, URL):
        """Post the download form and return the redirect location of the actual file."""
        payload = {
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            'btnAccept': 'Download'
        }
        req = session.post(URL, data=payload, allow_redirects=False)
        return req.headers['Location']

    def download_file(self, session, loc):
        """Stream the file behind `loc` into the download/ directory."""
        name = loc.split('/')[-1].split('.')[0]
        filetype = loc.split('.')[-1]
        payload = {
            '__VIEWSTATE': self.viewstate,
            '__VIEWSTATEGENERATOR': self.viewstategenerator,
            'btnAccept': 'Download'
        }
        URL = 'http://data.betfair.com' + loc
        req = session.get(URL, data=payload, allow_redirects=False, stream=True)

        if filetype in ['zip', 'rar']:
            with open('download/' + name + '.' + filetype, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
            return True
        else:
            print('error', loc, name, filetype)
            return False
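

# For orientation, the call sequence for a single file looks roughly like this
# (illustration only -- the credentials are placeholders and the hash is one taken
# from log.txt):
#
#     d = Downloader()
#     with requests.Session() as s:
#         s.mount('https://', MyAdapter())
#         file_hash = 'c5a08a62bbddf7b592bf948d97f63f10'
#         url = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + file_hash
#         d.login(s, url, file_hash, username='...', password='...')
#         loc = d.get_location(s, url)   # e.g. '/datastore/data/bfinf_horse_....zip'
#         d.download_file(s, loc)        # streams the zip into download/
#
# worker() below does exactly this in a loop fed from file_queue.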


def log(logQu):
    """Logger thread: record each completed download in log.txt and print progress."""
    zm = 0
    while True:
        (URL, loc, filename) = logQu.get()
        d.log_download(URL, loc, filename)
        logQu.task_done()
        zm += 1
        print(str(zm) + '/' + str(total))


def worker(s, fileQu):
    """Download every file hash on the queue and push a record onto the log queue."""
    print('Starting worker thread')
    while not fileQu.empty():
        filename = fileQu.get()
        URL = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filename

        loc = download.get_location(s, URL)
        ok = download.download_file(s, loc)
        if ok:  # only record downloads that actually reached disk
            log_queue.put((URL, loc, filename))


d = ListCreator('')
d.create_URL_list()
filenames = d.load_URLs()
total = len(filenames)
print('toDownload', len(filenames))
if not filenames:
    raise SystemExit('nothing new to download')

log_queue = queue.Queue()
file_queue = queue.Queue()

threading.Thread(target=log, args=(log_queue,), daemon=True).start()
for file in filenames:
    file_queue.put(file)

os.makedirs('download', exist_ok=True)  # downloaded archives are written here
download = Downloader()

with requests.Session() as s:
    s.cookies.clear()
    s.mount('https://', MyAdapter())

    URL = 'http://data.betfair.com/datastore/downloadfile.aspx?file=' + filenames[0]
    download.login(s, URL, filenames[0], username='', password='')

    threads = [threading.Thread(target=worker, args=(s, file_queue)) for i in range(0, 5)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()       # keep the session open until every worker has finished
    log_queue.join()   # and let the logger flush its queue
--------------------------------------------------------------------------------