├── README ├── linkedin.py ├── webometric.py └── world_university_names.sql /README: -------------------------------------------------------------------------------- 1 | World University Names Database 2 | --------------------------------- 3 | 4 | A world uinversity names database is collected from LinkedIn and Webometric using 5 | python script. 6 | 7 | linkedin.py 8 | is a python script to grab university names data from LinkedIn 9 | 10 | webometric.py 11 | is a python script to gram university names data from Webometric 12 | 13 | world_university_names.sql 14 | MySQLdump for countries, universities from LinkedIn and Webometric. 15 | 16 | -------------------------------------------------------------------------------- /linkedin.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from xml.dom import minidom 3 | import MySQLdb 4 | 5 | WS_URL = 'http://www.linkedin.com/wsSchoolDir?q=&country=%s' 6 | DB_HOST = 'localhost' 7 | DB_NAME = 'bell' 8 | DB_USER = 'root' 9 | DB_PASSWD = '123456' 10 | TBL_COUNTRY = 'countries' 11 | COUNTRY_ID = 'id' 12 | COUNTRY_CODE = 'iso2' 13 | TBL_SCHOOL = 'universities' 14 | 15 | def buildSchools(): 16 | conn = MySQLdb.connect( 17 | host = DB_HOST, 18 | user = DB_USER, 19 | passwd = DB_PASSWD, 20 | db = DB_NAME, 21 | charset = 'utf8' 22 | ) 23 | print "Creating table %s\n" % TBL_SCHOOL 24 | cursor = conn.cursor() 25 | cursor.execute("DROP TABLE IF EXISTS %s" % (TBL_SCHOOL)) 26 | cursor.execute("CREATE TABLE %s(`id` INT(11) NOT NULL AUTO_INCREMENT,`country_id` INT(5) NOT NULL,`name` VARCHAR(150) NOT NULL, PRIMARY KEY (`id`), KEY `country_id` (`country_id`)) ENGINE=MyISAM DEFAULT CHARSET=utf8" % (TBL_SCHOOL)) 27 | 28 | print "Get countries from table %s\n\n" % TBL_COUNTRY 29 | cursor.execute("SELECT %s, %s FROM %s WHERE 1" % (COUNTRY_ID, COUNTRY_CODE, TBL_COUNTRY)) 30 | countries = cursor.fetchall() 31 | 32 | print "Trying to build schools..\n" 33 | for country in countries: 34 | print "Populate schools in %s\n" % country[1] 35 | schools = getSchools(country[1].lower()) 36 | for school in schools: 37 | if len(school.getAttribute('v')) > 0 and school.getAttribute('v') != '0': 38 | univ = school.childNodes[0].data.replace('"', '\\"') 39 | cursor.execute('''INSERT INTO %s VALUES(NULL, %d, "%s")''' % (TBL_SCHOOL, country[0], univ)) 40 | print "\nEnd building schools." 41 | cursor.close() 42 | conn.close() 43 | 44 | def getSchools(code): 45 | dom = minidom.parse(urllib.urlopen(WS_URL % code)) 46 | return dom.getElementsByTagName('s') 47 | 48 | if __name__ == "__main__": 49 | buildSchools() -------------------------------------------------------------------------------- /webometric.py: -------------------------------------------------------------------------------- 1 | from sgmllib import SGMLParser 2 | from urllib2 import urlopen, Request, BaseHandler 3 | from httplib import BadStatusLine 4 | import time 5 | import MySQLdb 6 | 7 | WS_URL = 'http://www.webometrics.info/university_by_country.asp?country=%s' 8 | DB_HOST = 'localhost' 9 | DB_NAME = 'bell' 10 | DB_USER = 'root' 11 | DB_PASSWD = '123456' 12 | TBL_COUNTRY = 'countries' 13 | COUNTRY_ID = 'id' 14 | COUNTRY_CODE = 'iso2' 15 | TBL_SCHOOL = 'webometric_universities' 16 | 17 | class UnivParser(SGMLParser): 18 | domain = 'http://www.webometrics.info/' 19 | path = '/university_by_country.asp?country=%s' 20 | univ = [] 21 | errorURL = [] 22 | 23 | insideRowData = 0 24 | insideColOfUnivName = 0 25 | passedColOfUnivName = 0 26 | insideColOfUnivLink = 0 27 | insideUnivLink = 0 28 | currentUniv = {} 29 | 30 | sleepCount = 1; 31 | insideColNav = 0 32 | insideLinkOfNav = 0 33 | currentLinkOfNav = '' 34 | 35 | nextPage = '' 36 | pageNumber = 1 37 | endOfPage = 0 38 | 39 | def __init__(self): 40 | SGMLParser.__init__(self, verbose=0) 41 | 42 | def parse(self, url): 43 | self.univ = [] 44 | self.pageNumber = 1 45 | self.goToNextPage(url) 46 | return self.univ 47 | 48 | def goToNextPage(self, url): 49 | print "crawl page %d" % self.pageNumber 50 | 51 | self.insideRowData = 0 52 | self.insideColOfUnivName = 0 53 | self.passedColOfUnivName = 0 54 | self.insideColOfUnivLink = 0 55 | self.insideUnivLink = 0 56 | self.currentUniv = {} 57 | 58 | self.insideColNav = 0 59 | self.insideLinkOfNav = 0 60 | self.currentLinkOfNav = '' 61 | 62 | self.nextPage = '' 63 | self.endOfPage = 0 64 | 65 | try: 66 | req = urlopen(url) 67 | except IOError, e: 68 | print "Oops, we got HTTPError." 69 | reason = "" 70 | if hasattr(e, 'reason'): 71 | reason = 'Failed to reach a server. Reason: %d' % e.reason 72 | elif hasattr(e, 'code'): 73 | reason = 'The server couldn\'t fulfill the request. Error code: %d' % e.code 74 | print reason 75 | self.errorURL.append({'url': url, 'reason': reason}) 76 | return 77 | except BadStatusLine, e: 78 | reason = "" 79 | if hasattr(e, 'reason'): 80 | reason = 'Failed to reach a server. Reason: %d' % e.reason 81 | elif hasattr(e, 'code'): 82 | reason = 'The server couldn\'t fulfill the request. Error code: %d' % e.code 83 | print reason 84 | self.errorURL.append({'url': url, 'reason': reason}) 85 | return 86 | finally: 87 | self.sleepCount = 1 88 | 89 | self.feed(req.read()) 90 | req.close() 91 | 92 | if len(self.nextPage): 93 | self.goToNextPage(self.domain + self.nextPage) 94 | 95 | def __sleep(self): 96 | time.sleep(0.5 * self.sleepCount) 97 | 98 | def getErrorURL(self): 99 | return self.errorURL 100 | 101 | def start_tr(self, attrs): 102 | if self.insideRowData == 0: 103 | for name, val in attrs: 104 | if name == 'class' and val == 'nav6a': 105 | self.insideRowData = 1 106 | 107 | def start_td(self, attrs): 108 | if self.insideRowData: 109 | if self.passedColOfUnivName == 0: 110 | self.insideColOfUnivName = 1 111 | else: 112 | self.insideColOfUnivLink = 1 113 | else: # inside col of nav 114 | for name, val in attrs: 115 | if name == 'class' and val == 'nav6a': 116 | self.insideColNav = 1 117 | 118 | def start_a(self, attrs): 119 | if self.insideColOfUnivName: 120 | self.insideUnivLink = 1 121 | elif self.insideColNav: 122 | for name, val in attrs: 123 | if name == 'class' and val == 'nav6a': 124 | self.insideLinkOfNav = 1 125 | if name == 'href': 126 | self.currentLinkOfNav = val 127 | 128 | def end_tr(self): 129 | if self.insideRowData: 130 | self.insideRowData = 0 131 | self.passedColOfUnivName = 0 132 | self.univ.append( self.currentUniv ) 133 | self.currentUniv = {} 134 | 135 | def end_td(self): 136 | if self.insideRowData: 137 | if self.insideColOfUnivName: 138 | self.insideColOfUnivName = 0 139 | self.passedColOfUnivName = 1 140 | else: 141 | self.insideColOfUnivLink = 0 142 | elif self.insideColNav: 143 | self.insideColNav = 0 144 | 145 | def end_a(self): 146 | if self.insideUnivLink: 147 | self.insideUnivLink = 0 148 | elif self.insideLinkOfNav: 149 | self.insideLinkOfNav = 0 150 | self.currentLinkOfNav = '' 151 | 152 | def handle_data(self, data): 153 | if self.insideUnivLink: 154 | self.currentUniv['name'] = unicode( data.strip().replace('"', '\\"'), 'latin-1') 155 | print self.currentUniv['name'] 156 | elif self.insideColOfUnivLink: 157 | self.currentUniv['link'] = u'%s' % data.strip() 158 | elif self.insideLinkOfNav and data.lower() == 'next' and self.nextPage == '': 159 | self.nextPage = self.currentLinkOfNav 160 | self.pageNumber += 1 161 | 162 | def buildSchools(): 163 | conn = MySQLdb.connect( 164 | host = DB_HOST, 165 | user = DB_USER, 166 | passwd = DB_PASSWD, 167 | db = DB_NAME, 168 | charset = 'latin1' 169 | ) 170 | print "Creating table %s\n" % TBL_SCHOOL 171 | cursor = conn.cursor() 172 | cursor.execute("DROP TABLE IF EXISTS %s" % (TBL_SCHOOL)) 173 | cursor.execute("CREATE TABLE %s(`id` INT(11) NOT NULL AUTO_INCREMENT,`country_id` INT(5) NOT NULL,`name` VARCHAR(150) NOT NULL, `url` VARCHAR(150) NOT NULL, PRIMARY KEY (`id`), KEY `country_id` (`country_id`)) ENGINE=MyISAM DEFAULT CHARSET=utf8" % (TBL_SCHOOL)) 174 | 175 | print "Get countries from table %s\n\n" % TBL_COUNTRY 176 | cursor.execute("SELECT %s, %s FROM %s WHERE 1" % (COUNTRY_ID, COUNTRY_CODE, TBL_COUNTRY)) 177 | countries = cursor.fetchall() 178 | 179 | w = UnivParser() 180 | 181 | print "Trying to build schools from webometric..\n" 182 | for country in countries: 183 | print "\nPopulate schools in %s" % country[1] 184 | schools = w.parse(WS_URL % country[1].lower()) 185 | for school in schools: 186 | cursor.execute('''INSERT INTO %s VALUES(NULL, %d, "%s", "%s")''' % (TBL_SCHOOL, country[0], school['name'], school['link'])) 187 | print "\nEnd building schools." 188 | 189 | """ print error """ 190 | for err in w.getErrorURL(): 191 | print "%s\n" % err 192 | 193 | w.close() 194 | cursor.close() 195 | conn.close() 196 | 197 | def insertSchools(url, cc): 198 | conn = MySQLdb.connect( 199 | host = DB_HOST, 200 | user = DB_USER, 201 | passwd = DB_PASSWD, 202 | db = DB_NAME, 203 | charset = 'latin1' 204 | ) 205 | cursor = conn.cursor() 206 | cursor.execute("SELECT %s FROM %s WHERE iso2 = '%s'" % (COUNTRY_ID, TBL_COUNTRY, cc.upper())) 207 | country = cursor.fetchall() 208 | 209 | w = UnivParser() 210 | schools = w.parse(url) 211 | for school in schools: 212 | cursor.execute('''INSERT INTO %s VALUES(NULL, %d, "%s", "%s")''' % (TBL_SCHOOL, country[0][0], school['name'], school['link'])) 213 | print "\nEnd building schools." 214 | 215 | """ print error """ 216 | for err in w.getErrorURL(): 217 | print "%s\n" % err 218 | 219 | w.close() 220 | cursor.close() 221 | conn.close() 222 | 223 | if __name__ == "__main__": 224 | buildSchools() --------------------------------------------------------------------------------