├── .gitignore
├── Readme.md
└── scripts
    ├── scraper.py
    └── parser.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/html_files/
dev/
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
## Get Cricket Data

Scripts for scraping [espncricinfo.com](http://www.espncricinfo.com/), along with data on over 43,000 cricket matches.

### Scripts

* [scraper.py](scripts/scraper.py) scrapes match pages from espncricinfo and downloads them to a local directory.
* [parser.py](scripts/parser.py) parses the downloaded pages and creates a [CSV with match-level data](data/final_output.csv).

### Data

The scripts produce [final_output.csv](data/final_output.csv).

### Application

* An article based on the current scripts/data: [Fairly Random](https://github.com/dwillis/toss-up)
* An article based on a previous version of the data: [Cricket: An Unfairly Random Game?](http://gbytes.gsood.com/2011/05/07/cricket-an-unfairly-random-game/)

### License

The license covers only the scripts, not the data. The scripts are released under the [MIT License](https://opensource.org/licenses/MIT).
--------------------------------------------------------------------------------
/scripts/scraper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''

Download Cricket Data

'''
import urllib2
import time
import os
import unicodedata
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'

if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')

for i in range(0, 6019):
    # Swap the URL below to scrape a different match type:
    #odi:    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=odi;all=1;page=' + str(i)).read())
    #test:   soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    #t20i:   soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20i;all=1;page=' + str(i)).read())
    #t20:    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20;all=1;page=' + str(i)).read())
    #list a: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=list%20a;all=1;page=' + str(i)).read())
    #first class:
    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page=' + str(i)).read())

    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:
            continue
        odiurl = BASE_URL + urlparse(new_host).geturl()
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print new_host
        print str.split(new_host, "/")[4]
        html = urllib2.urlopen(odiurl).read()
        if html:
            # Save the page under the last component of the match URL
            with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
                f.write(html)
--------------------------------------------------------------------------------
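Note: scraper.py above targets Python 2 (urllib2, print statements, BeautifulSoup 3). For readers on Python 3, here is a minimal sketch of the same download loop; it assumes the third-party requests and beautifulsoup4 packages, and reuses the search URL format and the srchPlyrNmTxt link class from the original script.

#!/usr/bin/env python3
"""Minimal Python 3 sketch of scraper.py's download loop (assumes requests
and beautifulsoup4 are installed)."""
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
SEARCH_URL = ('http://search.espncricinfo.com/ci/content/match/search.html'
              '?search=first%20class;all=1;page={0}')
OUT_DIR = 'espncricinfo-fc'

os.makedirs(OUT_DIR, exist_ok=True)

for page in range(6019):
    soup = BeautifulSoup(requests.get(SEARCH_URL.format(page)).text, 'html.parser')
    time.sleep(1)  # be polite to the server, as in the original
    for link in soup.find_all('a', {'class': 'srchPlyrNmTxt'}):
        href = link.get('href')
        if not href:
            continue
        match_url = urljoin(BASE_URL, href)
        filename = href.split('/')[4]  # same path component the original uses
        html = requests.get(match_url).content
        if html:
            with open(os.path.join(OUT_DIR, filename), 'wb') as f:
                f.write(html)

--------------------------------------------------------------------------------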
/scripts/parser.py:
--------------------------------------------------------------------------------
'''

Parse Downloaded Cricket Data

'''

import os

INPUT_FOLDER = ["espncricinfo-t20", "espncricinfo-lista", "espncricinfo-fc", "espncricinfo-odi", "espncricinfo-t20i", "espncricinfo-test"]
FINAL_OUTPUT_FILE = "final_output.csv"

HEADER = "url,team1,team2,win_toss,bat_or_bowl,outcome,win_game,date,day_n_night,ground,rain,duckworth_lewis,match_id,type_of_match"

# Tags stripped by refineHTMLToText(). The original definition did not survive
# rendering; these entries are placeholders.
USELESS_TAGS = ["<b>", "</b>", "<i>", "</i>", "<br />"]

def removeEndLineCharacter(mystr):
    # Strip a trailing newline, if any.
    if mystr.endswith("\n"):
        return mystr[:-1]
    return mystr

def removeDoubleQuote(myStr):
    return myStr.replace("\"", "").strip()

def removeDivInText(myStr, divs):
    for div in divs:
        myStr = myStr.replace(div, "")
    return myStr

def refineHTMLToText(myStr):
    for div in USELESS_TAGS:
        myStr = myStr.replace(div, "")
    myStr = myStr.replace("&nbsp;", " ").strip()
    return myStr


def findWithPattern(mystr, startPattern, endPattern):
    """
    Find the substring delimited by startPattern and endPattern in the original string.
    Args:
        + mystr: original string.
        + startPattern: pattern marking the start of the match.
        + endPattern: pattern marking the end of the match.
    Returns:
        + The found string,
        + and the remaining part of the original string.
    """
    x = mystr.find(startPattern)
    if x == -1:
        return "", mystr
    mystr = mystr[x + len(startPattern):]
    y = mystr.find(endPattern)
    if y == -1:
        return "", mystr
    return mystr[:y], mystr[y + len(endPattern):]

def extractDataFrom(data):
    # NOTE: several closing-tag patterns below were lost when this file was
    # rendered as HTML; the '</a>', '</div>' and '<div ...>' markers are
    # reconstructions and may not match the original source exactly.

    # Extract the two team names
    team1, tmp = findWithPattern(data, 'class="teamLink">', '</a>')
    team2, tmp = findWithPattern(tmp, 'class="teamLink">', '</a>')

    # Extract win_toss
    toss_div, tmp = findWithPattern(data, "Toss - ", "</div>")
    if toss_div.find(team1) > -1:
        win_toss = team1
    elif toss_div.find(team2) > -1:
        win_toss = team2
    else:
        win_toss = ""  #!!!

    # Extract bat_or_bowl
    if toss_div.find("bat") > -1:
        bat_or_bowl = "bat"
    elif toss_div.find("field") > -1:
        bat_or_bowl = "bowl"
    else:
        bat_or_bowl = ""  #!!!

    # Extract outcome and the winning team, if any
    # (reconstructed pattern -- see NOTE above)
    outcome, tmp = findWithPattern(data, '<div class="match-result">', '</div>')
    #print outcome
    k = outcome.find("won")
    win_game = ""
    if k > -1 and outcome.find("drawn") == -1:
        if outcome[:k].find(team1) > -1:
            win_game = team1
        elif outcome[:k].find(team2) > -1:
            win_game = team2
        else:
            win_game = ""  #!!!

    # Extract the date from the page title
    titleDiv, tmp = findWithPattern(data, '<title>', '</title>')
    date = "".join(titleDiv.split("|")[0].split(",")[-2:])

    # Extract day_n_night
    day_n_night = int(data.find("day/night") > -1)

    # Extract ground
    ground, tmp = findWithPattern(data, 'title="view the ground profile for', '"')

    # Extract rain
    matchNotesDiv, tmp = findWithPattern(data, ">Match Notes<", "</div>")
    rain = int(matchNotesDiv.find(" rain") > -1 or matchNotesDiv.find("Rain") > -1)

    # Extract duckworth_lewis
    duckworth_lewis = int(data.find("D/L method") > -1)

    """
    id1, tmp = findWithPattern(data, 'Test no. ', '<')
    id2, tmp = findWithPattern(data, 'ODI no. ', '<')
    id3, tmp = findWithPattern(data, '>List A ', '<')
    id4, tmp = findWithPattern(data, 'unofficial ODI ', '<')
    match_id = "NA"
    for mid in [id1, id2, id3, id4]:
        if mid:
            match_id = mid
            break
    """
    # Extract match_id
    match_id, tmp = findWithPattern(data, 'data-matchId="', '"')

    # Return result
    return team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id


################################## START PROCESSING DATA #########################################
outputFile = open(FINAL_OUTPUT_FILE, "w")
outputFile.write(HEADER + "\n")
for folder in INPUT_FOLDER:
    print "---PROCESSING FOLDER {0!s}---".format(folder)
    counter = 0
    for url in os.listdir(folder):
        data = open(folder + "/" + url).read()
        team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id = extractDataFrom(data)
        url_new = folder + "/" + url
        type_of_match = folder.split("-")[-1].upper()
        rowStr = '"{0!s}","{1!s}","{2!s}","{3!s}","{4!s}","{5!s}","{6!s}","{7!s}","{8!s}","{9!s}","{10!s}","{11!s}","{12!s}","{13!s}"\n'.format(url_new, team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id, type_of_match)
        outputFile.write(rowStr)
        counter = counter + 1
        if counter % 1000 == 0:
            print " + Processed {0:d} files".format(counter)
outputFile.close()

################################## FINISHED #########################################
print "DONE. Wrote output to {0!s}".format(FINAL_OUTPUT_FILE)
--------------------------------------------------------------------------------
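Usage note: once parser.py has written final_output.csv, the file can be read back with Python's standard csv module. A minimal sketch, assuming the CSV sits in the working directory and uses the column names from HEADER above; the toss-versus-outcome count mirrors the question explored in the articles linked from the Readme.

import csv

# Count how often the toss winner also won the match, using the
# final_output.csv written by parser.py. Column names follow HEADER.
toss_and_match = decided = 0
with open('final_output.csv') as f:
    for row in csv.DictReader(f):
        if row['win_game']:  # skip draws and matches with no recorded winner
            decided += 1
            if row['win_game'] == row['win_toss']:
                toss_and_match += 1
print('Toss winner won {0} of {1} decided matches'.format(toss_and_match, decided))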