├── .gitignore
├── Readme.md
└── scripts
    ├── scraper.py
    └── parser.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/html_files/
dev/
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
## Get Cricket Data

Scripts for scraping [espncricinfo.com](http://www.espncricinfo.com/), along with data on over 43,000 cricket matches.

### Scripts

* [scraper.py](scripts/scraper.py) scrapes match pages from espncricinfo and downloads them to a local directory.
* [parser.py](scripts/parser.py) parses the downloaded pages and creates a [CSV with match-level data](data/final_output.csv).

### Data

The scripts produce [final_output.csv](data/final_output.csv).

### Application

* An article based on the current scripts/data: [Fairly Random](https://github.com/dwillis/toss-up)
* An article based on a previous version of the data: [Cricket: An Unfairly Random Game?](http://gbytes.gsood.com/2011/05/07/cricket-an-unfairly-random-game/)

### License

The license covers only the scripts, not the data. The scripts are released under the [MIT License](https://opensource.org/licenses/MIT).
--------------------------------------------------------------------------------
/scripts/scraper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''

Download Cricket Data

'''
import urllib2
import time
import os
import unicodedata
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'

if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')

for i in range(0, 6019):
    # Swap the URL below to scrape a different match type:
    #odi:    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=odi;all=1;page=' + str(i)).read())
    #test:   soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    #t20i:   soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20i;all=1;page=' + str(i)).read())
    #t20:    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20;all=1;page=' + str(i)).read())
    #list a: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=list%20a;all=1;page=' + str(i)).read())
    #first class:
    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page=' + str(i)).read())

    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:
            continue
        odiurl = BASE_URL + urlparse(new_host).geturl()
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print new_host
        print str.split(new_host, "/")[4]
        html = urllib2.urlopen(odiurl).read()
        if html:
            # Save the page under the last component of the match URL
            with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
                f.write(html)
--------------------------------------------------------------------------------
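Note: scraper.py above targets Python 2 (urllib2, print statements, BeautifulSoup 3). For readers on Python 3, here is a minimal sketch of the same download loop; it assumes the third-party requests and beautifulsoup4 packages, and reuses the search URL format and the srchPlyrNmTxt link class from the original script.

#!/usr/bin/env python3
"""Minimal Python 3 sketch of scraper.py's download loop (assumes requests
and beautifulsoup4 are installed)."""
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
SEARCH_URL = ('http://search.espncricinfo.com/ci/content/match/search.html'
              '?search=first%20class;all=1;page={0}')
OUT_DIR = 'espncricinfo-fc'

os.makedirs(OUT_DIR, exist_ok=True)

for page in range(6019):
    soup = BeautifulSoup(requests.get(SEARCH_URL.format(page)).text, 'html.parser')
    time.sleep(1)  # be polite to the server, as in the original
    for link in soup.find_all('a', {'class': 'srchPlyrNmTxt'}):
        href = link.get('href')
        if not href:
            continue
        match_url = urljoin(BASE_URL, href)
        filename = href.split('/')[4]  # same path component the original uses
        html = requests.get(match_url).content
        if html:
            with open(os.path.join(OUT_DIR, filename), 'wb') as f:
                f.write(html)

--------------------------------------------------------------------------------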
/scripts/parser.py:
--------------------------------------------------------------------------------
'''

Parse Downloaded Cricket Data

'''

import os

INPUT_FOLDER = ["espncricinfo-t20", "espncricinfo-lista", "espncricinfo-fc", "espncricinfo-odi", "espncricinfo-t20i", "espncricinfo-test"]
FINAL_OUTPUT_FILE = "final_output.csv"

HEADER = "url,team1,team2,win_toss,bat_or_bowl,outcome,win_game,date,day_n_night,ground,rain,duckworth_lewis,match_id,type_of_match"

# Tags stripped by refineHTMLToText(). The original definition did not survive
# rendering; these entries are placeholders.
USELESS_TAGS = ["<b>", "</b>", "<i>", "</i>", "<br />"]

def removeEndLineCharacter(mystr):
    # Strip a trailing newline, if any.
    if mystr.endswith("\n"):
        return mystr[:-1]
    return mystr

def removeDoubleQuote(myStr):
    return myStr.replace("\"", "").strip()

def removeDivInText(myStr, divs):
    for div in divs:
        myStr = myStr.replace(div, "")
    return myStr

def refineHTMLToText(myStr):
    for div in USELESS_TAGS:
        myStr = myStr.replace(div, "")
    myStr = myStr.replace("&nbsp;", " ").strip()
    return myStr


def findWithPattern(mystr, startPattern, endPattern):
    """
    Find the substring delimited by startPattern and endPattern in the original string.
    Args:
        + mystr: original string.
        + startPattern: pattern marking the start of the match.
        + endPattern: pattern marking the end of the match.
    Returns:
        + The found string,
        + and the remaining part of the original string.
    """
    x = mystr.find(startPattern)
    if x == -1:
        return "", mystr
    mystr = mystr[x + len(startPattern):]
    y = mystr.find(endPattern)
    if y == -1:
        return "", mystr
    return mystr[:y], mystr[y + len(endPattern):]

def extractDataFrom(data):
    # NOTE: several closing-tag patterns below were lost when this file was
    # rendered as HTML; the '</a>', '</div>' and '<div ...>' markers are
    # reconstructions and may not match the original source exactly.

    # Extract the two team names
    team1, tmp = findWithPattern(data, 'class="teamLink">', '</a>')
    team2, tmp = findWithPattern(tmp, 'class="teamLink">', '</a>')

    # Extract win_toss
    toss_div, tmp = findWithPattern(data, "Toss - ", "</div>")
    if toss_div.find(team1) > -1:
        win_toss = team1
    elif toss_div.find(team2) > -1:
        win_toss = team2
    else:
        win_toss = ""  #!!!

    # Extract bat_or_bowl
    if toss_div.find("bat") > -1:
        bat_or_bowl = "bat"
    elif toss_div.find("field") > -1:
        bat_or_bowl = "bowl"
    else:
        bat_or_bowl = ""  #!!!

    # Extract outcome and the winning team, if any
    # (reconstructed pattern -- see NOTE above)
    outcome, tmp = findWithPattern(data, '<div class="match-result">', '</div>')
    #print outcome
    k = outcome.find("won")
    win_game = ""
    if k > -1 and outcome.find("drawn") == -1:
        if outcome[:k].find(team1) > -1:
            win_game = team1
        elif outcome[:k].find(team2) > -1:
            win_game = team2
        else:
            win_game = ""  #!!!

    # Extract the date from the page title
    titleDiv, tmp = findWithPattern(data, '<title>', '</title>')
    date = "".join(titleDiv.split("|")[0].split(",")[-2:])

    # Extract day_n_night
    day_n_night = int(data.find("day/night") > -1)

    # Extract ground
    ground, tmp = findWithPattern(data, 'title="view the ground profile for', '"')

    # Extract rain
    matchNotesDiv, tmp = findWithPattern(data, ">Match Notes<", "</div>")
    rain = int(matchNotesDiv.find(" rain") > -1 or matchNotesDiv.find("Rain") > -1)

    # Extract duckworth_lewis
    duckworth_lewis = int(data.find("D/L method") > -1)

    """
    id1, tmp = findWithPattern(data, 'Test no. ', '<')
    id2, tmp = findWithPattern(data, 'ODI no. ', '<')
    id3, tmp = findWithPattern(data, '>List A ', '<')
    id4, tmp = findWithPattern(data, 'unofficial ODI ', '<')
    match_id = "NA"
    for mid in [id1, id2, id3, id4]:
        if mid:
            match_id = mid
            break
    """
    # Extract match_id
    match_id, tmp = findWithPattern(data, 'data-matchId="', '"')

    # Return result
    return team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id


################################## START PROCESSING DATA #########################################
outputFile = open(FINAL_OUTPUT_FILE, "w")
outputFile.write(HEADER + "\n")
for folder in INPUT_FOLDER:
    print "---PROCESSING FOLDER {0!s}---".format(folder)
    counter = 0
    for url in os.listdir(folder):
        data = open(folder + "/" + url).read()
        team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id = extractDataFrom(data)
        url_new = folder + "/" + url
        type_of_match = folder.split("-")[-1].upper()
        rowStr = '"{0!s}","{1!s}","{2!s}","{3!s}","{4!s}","{5!s}","{6!s}","{7!s}","{8!s}","{9!s}","{10!s}","{11!s}","{12!s}","{13!s}"\n'.format(url_new, team1, team2, win_toss, bat_or_bowl, outcome, win_game, date, day_n_night, ground, rain, duckworth_lewis, match_id, type_of_match)
        outputFile.write(rowStr)
        counter = counter + 1
        if counter % 1000 == 0:
            print " + Processed {0:d} files".format(counter)
outputFile.close()

################################## FINISHED #########################################
print "DONE. Wrote output to {0!s}".format(FINAL_OUTPUT_FILE)
--------------------------------------------------------------------------------
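Usage note: once parser.py has written final_output.csv, the file can be read back with Python's standard csv module. A minimal sketch, assuming the CSV sits in the working directory and uses the column names from HEADER above; the toss-versus-outcome count mirrors the question explored in the articles linked from the Readme.

import csv

# Count how often the toss winner also won the match, using the
# final_output.csv written by parser.py. Column names follow HEADER.
toss_and_match = decided = 0
with open('final_output.csv') as f:
    for row in csv.DictReader(f):
        if row['win_game']:  # skip draws and matches with no recorded winner
            decided += 1
            if row['win_game'] == row['win_toss']:
                toss_and_match += 1
print('Toss winner won {0} of {1} decided matches'.format(toss_and_match, decided))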