├── crawler.log
├── simple_crawler
│   ├── README
│   └── crawler.py
├── weights.py
├── README
├── classes.py
├── utils.py
└── github-crawler.py

/crawler.log:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/simple_crawler/README:
--------------------------------------------------------------------------------
1 | This project contains a simple Python crawler. For now things are kept simple: the crawler visits all the links on a page up to a certain depth. It may be extended later.
2 | 
3 | To crawl a particular url, pass it as a command line argument.
4 | For example, to crawl mycareerstack.com run the script as
5 | 
6 | python crawler.py http://mycareerstack.com
7 | 
8 | The crawler follows links up to the depth set by search_depth in crawler.py. Depth n means the crawler does a breadth first search going n levels down from the root url: since it is a breadth first search, all the links on the root url are collected first, then each of those pages is visited, and so on.
9 | 
--------------------------------------------------------------------------------
/weights.py:
--------------------------------------------------------------------------------
1 | # Weights used when calculating the statistics of the user who is being crawled.
2 | # True division, so that ratios such as 1/100 do not silently truncate to 0 on Python 2.
3 | from __future__ import division
4 | 
5 | follower_weight = 1
6 | forker_with_commit = 1
7 | forker_without_commit = 1
8 | watcher_weight = 1
9 | weight_for_code = 1/100
10 | weight_for_others_code = 1/10
11 | 
12 | # weight assigned to the owner whose repository is forked and
13 | # a pull request is accepted
14 | fork_with_commit_owner = 1
15 | fork_with_commit_code = 1/10
16 | 
17 | # weight assigned to the lines of code that a user has written in a repository
18 | # that he/she has forked but is editing for his/her own purposes
19 | fork_without_commit_code = 1/10
20 | 
21 | # weight for the clone count read from a repository page
22 | # (referenced by github-crawler.py; a default value of 1 is assumed here)
23 | clone_weight = 1
24 | 
25 | forked_repo_watcher_weight = 1
26 | forked_repo_forker_wight = 1
27 | forked_repo_lines_of_code = 1/100
28 | 
--------------------------------------------------------------------------------
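For reference, calculateRepoStats in github-crawler.py combines watcher counts, fork counts and lines of code for each of the user's own repositories using the weights above. A minimal, simplified sketch of that combination (the counts below are invented placeholders, and the per-forker impact values that the real crawler computes are ignored here):

from __future__ import division
from weights import *

# invented placeholder numbers for one repository
watchers, forkers, own_loc, others_loc = 4, 2, 1200, 300

score = 0
score += watchers * watcher_weight              # people watching the repository
score += forkers * forker_without_commit        # forkers (the real code scales this by each forker's impact)
score += own_loc * weight_for_code              # lines of code written by the user
score += others_loc * weight_for_others_code    # lines contributed by others
print "points from this repository:", score     # 4 + 2 + 12 + 30 = 48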
/README:
--------------------------------------------------------------------------------
1 | The crawler does the following things:
2 | 
3 | 1) Finds out the languages in which the person has written code
4 | 2) Finds out the number of owned repositories and forked repositories
5 | 3) Finds out the number of followers
6 | 4) Calculates the statistics of the repositories
7 | 
8 | The statistics are calculated from the following criteria:
9 | 
10 | 1) Number of lines of code
11 | 2) Number of forks, taking into account the statistics of the users who have forked the repository
12 | 3) Number of watchers, taking into account the users who are watching the repository
13 | 
14 | For forked repositories, the crawler checks the user's contribution to the original repository
15 | by checking whether his/her pull requests have been accepted or not.
16 | 
17 | At the end it generates a metric that depends on all the values
18 | calculated above.
19 | 
20 | Usage:
21 | python github-crawler.py <github username>
22 | e.g.
23 | python github-crawler.py sachingupta006
24 | 
25 | Dependencies
26 | 
27 | install libxml2 from here
28 | http://www.linuxfromscratch.org/blfs/view/cvs/general/libxml2.html
29 | 
30 | install libxslt from here
31 | http://www.linuxfromscratch.org/blfs/view/6.3/general/libxslt.html
32 | 
33 | sudo apt-get install python2.7-dev
34 | easy_install --allow-hosts=lxml.de,*.python.org lxml
35 | easy_install iso8601
36 | easy_install BeautifulSoup
--------------------------------------------------------------------------------
/classes.py:
--------------------------------------------------------------------------------
1 | # Stores information about a user (other than the one being crawled)
2 | class otherUser(object):
3 | 
4 |     def __init__(self, name=None, link=None):
5 | 
6 |         self.name = name
7 |         self.link = link
8 |         self.followers = 0
9 |         self.own_repos = []
10 |         self.forked_repos = []
11 |         self.impact_value = 0
12 | 
13 | # Stores information about a repository owned by the crawled user
14 | class ownRepository(object):
15 | 
16 |     def __init__(self, name=None, link=None, repo_type=None, lang=None):
17 | 
18 |         self.name = name
19 |         self.type = repo_type
20 |         self.lang = lang
21 |         self.link = link
22 |         self.forks = []
23 |         self.watchers = 0
24 |         # clones over the last four weeks
25 |         self.clones = 0
26 |         self.own_commits = []
27 |         self.other_commits = []
28 |         self.activity = 0
29 |         self.ownLinesOfCode = 0
30 |         self.otherLinesOfCode = 0
31 | 
32 | # Stores information about a repository that the crawled user has forked
33 | class forkRepository(object):
34 | 
35 |     def __init__(self, name=None, link=None, repo_type=None, lang=None, forked_from=None):
36 | 
37 |         self.name = name
38 |         self.link = link
39 | 
40 |         # name of the person whose repository was forked
41 |         self.owner = None
42 |         # link of the original repo
43 |         self.forked_from = forked_from
44 | 
45 |         self.type = repo_type
46 |         self.lang = lang
47 | 
48 |         # if a forked repository is forked again,
49 |         # the fork is attributed to the original repo;
50 |         # the same goes for watchers
51 |         self.forks = []
52 |         self.watchers = 0
53 |         # contributors listed on the repository's contributors page
54 |         self.contributors = []
55 | 
56 |         # the commits here refer to pull requests that have been accepted
57 |         # by the owner into his/her own repository
58 |         self.own_commits = []
59 |         self.other_commits = []
60 |         self.activity = 0
61 |         self.ownLinesOfCode = 0
62 |         self.otherLinesOfCode = 0
63 | 
64 |         # the person may not send any pull requests and instead develop the
65 |         # fork independently
66 |         self.selfLinesOfCode = 0
67 | 
--------------------------------------------------------------------------------
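To make the data model above concrete, here is a small hypothetical example of how github-crawler.py fills in these objects while crawling; the repository names and numbers are invented for illustration only:

from classes import ownRepository, forkRepository

repo = ownRepository(name="mycrawler",
                     link="https://github.com/someuser/mycrawler",
                     repo_type="own", lang="Python")
repo.watchers = 3
repo.forks.append({'name': 'otheruser',
                   'url': 'https://github.com/otheruser',
                   'hasCommitted': False})
repo.ownLinesOfCode += 120   # additions from one of the user's commits
repo.ownLinesOfCode -= 15    # deletions from that same commit

fork = forkRepository(name="somelib", repo_type="fork", lang="Python",
                      link="https://github.com/someuser/somelib",
                      forked_from="https://github.com/upstream/somelib")
fork.owner = "upstream"
fork.selfLinesOfCode = 40    # work kept only in the fork, never sent upstream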
/utils.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import urlparse
3 | from lxml.html import parse
4 | from BeautifulSoup import BeautifulSoup
5 | from locale import *
6 | setlocale(LC_NUMERIC, '')
7 | 
8 | # base url that relative github links are joined against
9 | domain = "https://github.com/"
10 | 
11 | # Returns an lxml document from a url
12 | def doc_from_url(url, shouldPrint=True):
13 | 
14 |     if shouldPrint:
15 |         print "fetching: " + url
16 |     page = urllib2.urlopen(url)
17 |     doc = parse(page).getroot()
18 |     return doc
19 | 
20 | # Returns a BeautifulSoup element from a url
21 | def soup_from_url(url, shouldPrint=True):
22 | 
23 |     if shouldPrint:
24 |         print "fetching: " + url
25 |     response = urllib2.urlopen(url)
26 |     page = response.read()
27 |     soup = BeautifulSoup(page)
28 |     return soup
29 | 
30 | # Takes the url of a repository and returns the total lines of code committed to it
31 | def totalLinesOfCode(url):
32 | 
33 |     linesOfCode = 0
34 | 
35 |     doc = doc_from_url(url)
36 |     history = doc.cssselect('div.history a')[0].get('href')
37 |     # go to the commits page and extract info about each commit
38 |     history = urlparse.urljoin(domain, history)
39 | 
40 |     page_value = 0
41 | 
42 |     while True:
43 | 
44 |         page_value += 1
45 |         page_number = '?page=' + str(page_value)
46 |         history_url = urlparse.urljoin(history, page_number)
47 | 
48 |         try:
49 |             history_doc = doc_from_url(history_url)
50 |             commits = history_doc.cssselect('li.commit-group-item')
51 | 
52 |             # stop once a page has no more commits
53 |             if not commits:
54 |                 break
55 | 
56 |             for commit in commits:
57 | 
58 |                 a = commit.cssselect('a.message')[0].get('href')
59 |                 # just prepend the github domain
60 |                 a = urlparse.urljoin(domain, a)
61 | 
62 |                 # fetch the commit page to calculate the number of lines of code
63 |                 commit_doc = doc_from_url(a)
64 | 
65 |                 # Find out the number of parents of this commit; if there are 2, this is a merge
66 |                 # commit and should not be attributed to the repository author
67 |                 parents = int(commit_doc.cssselect('span.sha-block')[1].text_content().split()[0])
68 | 
69 |                 # this commit has only one parent and hence is not a merge commit
70 |                 if parents == 1:
71 | 
72 |                     authors = commit_doc.cssselect('span.author-name')
73 | 
74 |                     # can the length be greater than 2 in any case?
75 |                     if len(authors) <= 2:
76 | 
77 |                         # this p element contains the text describing the commit diff
78 |                         commit_info = commit_doc.cssselect('p.explain')[0].text_content()
79 |                         numbers = []
80 |                         # it contains 3 numbers
81 |                         # 1 - number of files changed
82 |                         # 2 - number of lines added
83 |                         # 3 - number of lines deleted
84 |                         for string in commit_info.split():
85 |                             try:
86 |                                 number = atoi(string)
87 |                                 numbers.append(number)
88 |                             except Exception:
89 |                                 pass
90 | 
91 |                         linesOfCode += numbers[1]
92 |                         linesOfCode -= numbers[2]
93 | 
94 |         except Exception:
95 |             break
96 | 
97 |     return linesOfCode
98 | 
--------------------------------------------------------------------------------
/simple_crawler/crawler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import urllib2
3 | import re
4 | import urlparse
5 | from collections import deque
6 | 
7 | # This regex is incomplete (and unused); Beautiful Soup is used instead to extract links from the page
8 | linkregex = re.compile(r'', re.IGNORECASE)
9 | 
10 | # Maximum breadth-first-search depth for the input url
11 | search_depth = 1
12 | 
13 | from BeautifulSoup import BeautifulSoup
14 | 
15 | class Crawler(object):
16 | 
17 |     def __init__(self, root, depth):
18 | 
19 |         self.root = root
20 |         self.depth = depth
21 |         self.host = urlparse.urlparse(self.root).netloc
22 |         self.crawled = []
23 |         self.links = 1  # including the root url
24 |         self.externalLinks = []
25 |         self.uncrawled = []
26 | 
27 |     def crawl(self):
28 | 
29 |         page = GetLinks(self.root)
30 |         page.get()
31 |         parentQ = deque()
32 |         childQ = deque()
33 | 
34 |         parentQ.append(self.root)
35 |         level = 0
36 | 
37 |         while True:
38 | 
39 |             try:
40 |                 url = parentQ.popleft()
41 |             except IndexError:
42 |                 level += 1
43 |                 print("\n")
44 |                 if level == self.depth:
45 |                     break
46 | 
47 |                 else:
48 | 
49 |                     # transfer all urls from the child queue to the parent queue
50 |                     while childQ:
51 |                         url = childQ.popleft()
52 |                         parentQ.append(url)
53 | 
54 | 
55 |                     # break if the queue is empty
56 |                     if not parentQ:
57 |                         print "No more links found"
58 |                         print "Finishing...."
59 | break 60 | else: 61 | continue 62 | 63 | if url not in self.crawled: 64 | 65 | try: 66 | 67 | # extract the host out of the new url 68 | host = urlparse.urlparse(url).netloc 69 | # if it matches with the current root .* includes any subdomains 70 | if re.match(".*%s" % self.host, host): 71 | 72 | print "crawling: " + url 73 | self.links+=1 74 | self.crawled.append(url) 75 | page = GetLinks(url) 76 | page.get() 77 | for new_url in page.urls: 78 | if new_url not in self.crawled: 79 | childQ.append(new_url) 80 | else: 81 | self.externalLinks.append(url) 82 | 83 | except Exception, e: 84 | print "ERROR: Can't process url '%s' (%s)" % (url, e) 85 | 86 | while childQ: 87 | link = childQ.popleft() 88 | self.uncrawled.append(link) 89 | 90 | class GetLinks(object): 91 | 92 | def __init__(self,url): 93 | self.url = url 94 | self.urls = [] 95 | 96 | def get(self): 97 | 98 | # Fetch the page contents 99 | url = urlparse.urlparse(self.url) 100 | request = urllib2.Request(self.url) 101 | response = urllib2.urlopen(request) 102 | page = response.read() 103 | 104 | # Extract urls from the page 105 | # links = linkregex.findall(page) 106 | # can't use regex here, some problems with that using beautiful soup 107 | soup = BeautifulSoup(page) 108 | tags = soup('a') 109 | for tag in tags: 110 | link = tag.get("href") 111 | if link.startswith('/'): 112 | link = url.scheme + '://' + url.netloc + link 113 | elif link.startswith('#'): 114 | if link == '#': 115 | tags.remove(tag) 116 | continue 117 | else: 118 | link = url.scheme + '://' + url.netloc + url.path 119 | elif not link.startswith('http'): 120 | link = 'http://' + url[1] + '/' + link 121 | 122 | # specific to mycareerstack.com 123 | # remove this 124 | if not "accounts" in link: 125 | self.urls.append(link) 126 | 127 | def main(): 128 | 129 | if len(sys.argv) < 2: 130 | print 'No start url was given' 131 | sys.exit() 132 | 133 | url = sys.argv[1] 134 | print "Crawling %s (Max Depth: %d)" % (url, search_depth) 135 | crawler = Crawler(url,search_depth) 136 | crawler.crawl() 137 | print "Total internal links found " + str(crawler.links) 138 | print "Total links crawled " + str(len(crawler.crawled)) 139 | 140 | print "\nUncrawled links " 141 | print "\n".join(crawler.uncrawled) 142 | 143 | print "\nExternal links:" 144 | print "\n".join(crawler.externalLinks) 145 | 146 | if __name__ == "__main__": 147 | main() 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /github-crawler.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import logging 4 | logging.basicConfig(filename='crawler.log', level=logging.DEBUG) 5 | import urllib2 6 | import urlparse 7 | from collections import deque 8 | from lxml.html import parse 9 | from BeautifulSoup import BeautifulSoup 10 | import iso8601 11 | 12 | from locale import * 13 | setlocale(LC_NUMERIC, '') 14 | 15 | from weights import * 16 | from utils import * 17 | from classes import ownRepository, forkRepository 18 | 19 | # This will contain other objects that will be used to 20 | # calculate the statistics of a user 21 | UserRepo = {} 22 | 23 | # An object of this class is instantiated whenever a user is to be crawled 24 | class Crawler(object): 25 | 26 | def __init__(self, username): 27 | 28 | self.username = username 29 | self.domain = "https://github.com/" 30 | self.root = urlparse.urljoin(self.domain, self.username) 31 | print self.root 32 | self.skills = set([]) 33 | 
self.own_repo = [] 34 | self.forked_repo = [] 35 | self.followers = [] 36 | self.stat = 0 37 | 38 | # Once a cralwer object is instantiated, this function is called from main 39 | def crawl(self): 40 | 41 | print "\nGetting the repositories" 42 | self.getRepositories() 43 | print "\nGetting the followers" 44 | self.getFollowers() 45 | print "\nGetting own repo info" 46 | self.getOwnRepoInfo() 47 | print "\nGetting forked repo info" 48 | self.getForkRepoInfo() 49 | print "\nPrinting follower stats" 50 | self.calculateFollowerStats() 51 | print "\nCalculating repo stats" 52 | self.calculateRepoStats() 53 | 54 | # makes a list of all the repositories in the user's profile 55 | def getRepositories(self): 56 | 57 | url = self.root + '/repositories' 58 | soup = soup_from_url(url) 59 | 60 | try: 61 | 62 | own_repos = soup.findAll('li',{"class":"simple public source"}) 63 | fork_repos = soup.findAll('li',{"class":"simple public fork"}) 64 | 65 | for r in own_repos: 66 | 67 | repo_type = "own" 68 | lang = r.find('li').string 69 | a_link = r.find('h3').find('a') 70 | name = a_link.string 71 | link = urlparse.urljoin(self.domain, a_link['href']) 72 | repo = ownRepository(name,link,repo_type,lang) 73 | self.skills.add(lang) 74 | self.own_repo.append(repo) 75 | 76 | for r in fork_repos: 77 | 78 | repo_type = "fork" 79 | # language of the repository 80 | lang = r.find('li').string 81 | # link of he forked repository 82 | a_link = r.find('h3').find('a') 83 | name = a_link.string 84 | link = urlparse.urljoin(self.domain, a_link['href']) 85 | # link of the repository from where it is forked 86 | forked_from = r.find('p', {"class":"fork-flag"}).find('a')['href'] 87 | forked_from = urlparse.urljoin(self.domain, forked_from) 88 | # create an object of the repository 89 | repo = forkRepository(name,link,repo_type,lang,forked_from) 90 | self.skills.add(lang) 91 | self.forked_repo.append(repo) 92 | 93 | except AttributeError: 94 | print "User does not have any repository" 95 | 96 | # Makes a list of followers 97 | def getFollowers(self): 98 | 99 | url = self.root + '/followers' 100 | soup = soup_from_url(url) 101 | 102 | try: 103 | followers = soup.find(id="watchers").findAll('li') 104 | for f in followers: 105 | link = f.findAll('a')[1] 106 | name = link.string 107 | url = urlparse.urljoin(self.domain,link['href']) 108 | follower = {'name':name, 'url':url} 109 | self.followers.append(follower) 110 | 111 | except AttributeError: 112 | print "User does not have any followers" 113 | 114 | # collect statistics of a repository that is user's own repository 115 | def getOwnRepoInfo(self): 116 | 117 | # collect stats from the users own repo 118 | # TODO do we have to give any weightage to the fact that other users have contributed 119 | # to this repository 120 | for r in self.own_repo: 121 | 122 | doc = doc_from_url(r.link) 123 | 124 | # TODO Do we need to see who all are watching or just the number would suffice 125 | watchers = doc.cssselect('li.watchers a')[0].text_content().strip() 126 | r.watchers = int(watchers) 127 | 128 | # have to get the name of all those who have forked this repository 129 | # link is of type //network/ 130 | fork_link = doc.cssselect('li.forks a')[0].get('href') 131 | # members needs to be appended to get to the actual page that contains the forkers 132 | fork_link = fork_link +"/members" 133 | fork_link = urlparse.urljoin(self.domain,fork_link) 134 | fork_page = urllib2.urlopen(fork_link) 135 | fork_doc = parse(fork_page).getroot() 136 | 137 | # Get the name and link of each forker 138 | 
forkers = fork_doc.xpath(forker_xpath) 139 | for fork in forkers: 140 | name = fork.text_content() 141 | link = urlparse.urljoin(self.domain, name) 142 | # if a forker has also commited then higher weigthage needs to be given 143 | name_and_link = {'name':name, 'url':link, 'hasCommitted': False} 144 | r.forks.append(name_and_link) 145 | 146 | clone_doc = doc_from_url(r.link + clone_link) 147 | clone_text = clone_doc.cssselect("div#path")[0].text_content().split() 148 | 149 | # The first number is the number of clones over the last 4 weeks 150 | # TODO save this count for the forked repository as well 151 | for string in clone_text: 152 | try: 153 | a = atoi(string) 154 | self.clones = a 155 | break 156 | except Exception: 157 | pass 158 | 159 | # word history is a bit misleading contains the link as //commits/master/ 160 | try: 161 | history = doc.cssselect('div.history a')[0].get('href') 162 | except IndexError: 163 | print "Repository ("+r.name+") is empty" 164 | 165 | # goes to the commit page, and extracts info about each commit 166 | history = urlparse.urljoin(self.root,history) 167 | 168 | page_value = 0 169 | 170 | while(True): 171 | 172 | page_value += 1 173 | page_number = '?page='+str(page_value) 174 | history_url = urlparse.urljoin(history,page_number) 175 | 176 | try: 177 | history_doc = doc_from_url(history_url) 178 | commits = history_doc.cssselect('li.commit-group-item') 179 | 180 | for commit in commits: 181 | 182 | # this link contains the user name as well 183 | a = commit.cssselect('a.message')[0].get('href') 184 | # just append the github domain in the beginning 185 | a = urlparse.urljoin(self.domain,a) 186 | 187 | # Calculate the number of lines of code 188 | commit_doc = doc_from_url(a) 189 | 190 | # Find out the number of parents of this commit, if there are 2, then this is a merge 191 | # commit and we do not have to attribute to the repository author 192 | parents = int(commit_doc.cssselect('span.sha-block')[1].text_content().split()[0]) 193 | 194 | # this commit has only parent and hence is not a merge commit 195 | if parents <= 1: 196 | 197 | time = commit_doc.cssselect('time')[0].get('datetime') 198 | authors = commit_doc.cssselect('span.author-name') 199 | 200 | name = "" 201 | 202 | # can the length be greater than 2 in any case 203 | if len(authors) > 1: 204 | 205 | # This should not happend, a commit cannot have more than 1 author 206 | if len(authors) > 2: 207 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 208 | + commit_doc.cssselect('span.sha').text_content() + ' which has more than 2 authors') 209 | 210 | commiter = commit_doc.cssselect('span.committer span.author-name') 211 | 212 | if len(commiter) == 1: 213 | commiter_name = commiter[0].text_content().strip() 214 | 215 | for author in authors: 216 | author_name = author.text_content().strip() 217 | if not author_name == commiter_name: 218 | name = author_name 219 | else: 220 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 221 | + commit_doc.cssselect('span.sha').text_content() + ' which has more than 2 authors and no commiter') 222 | 223 | else: 224 | name = authors[0].text_content().strip() 225 | 226 | data = {'link':a, 'time':time, 'author':name } 227 | 228 | # this p element contains the text where the info about the commit is written 229 | commit_info = commit_doc.cssselect('p.explain')[0].text_content() 230 | numbers = [] 231 | # it contains 3 numbers 232 | # 1 - number of files changed 233 | # 2 - number of lines added 234 | # 3 - number of lines 
deleted 235 | for string in commit_info.split(): 236 | try: 237 | a = atoi(string) 238 | numbers.append(a) 239 | except Exception: 240 | pass 241 | 242 | additions = numbers[1] 243 | deletions = numbers[2] 244 | data['additions'] = additions 245 | data['deletions'] = deletions 246 | 247 | # This is user's own commit 248 | if name == self.username: 249 | 250 | # Update the lines of code by the user for that repo 251 | r.ownLinesOfCode += additions 252 | r.ownLinesOfCode -= deletions 253 | 254 | # add this commit as own commit 255 | r.own_commits.append(data) 256 | 257 | # somebody else has committed 258 | else: 259 | 260 | # TODO look for some better method of finding whether a user 261 | # TODO hasCommitted flag is set or not 262 | 263 | # If this author has also forked the repository 264 | # then store this info, it will be used in assigning weight 265 | # to the forker 266 | for forker in r.forks: 267 | if forker['name'] == name: 268 | forker['hasCommitted'] = True 269 | break 270 | 271 | r.otherLinesOfCode += additions 272 | r.otherLinesOfCode -= deletions 273 | r.other_commits.append(data) 274 | 275 | 276 | except Exception: 277 | break 278 | 279 | # TODO see why is this not working 280 | try: 281 | 282 | start_date = iso8601.parse_date(r.own_commits[-1]['time']) 283 | end_date = iso8601.parse_date(r.own_commits[0]['time']) 284 | diff = start_date - end_date 285 | self.activity = diff.days 286 | 287 | except Exception: 288 | pass 289 | 290 | # collect statistics of a repository that a user has forked from somehwher 291 | def getForkRepoInfo(self): 292 | 293 | for r in self.forked_repo: 294 | 295 | print "\n" 296 | # first we will collect information from the repository 297 | # which the user has forked, this info is useful only 298 | # if the pull requests have been accepted by the owner 299 | 300 | # set hasCommitted to True if the user does a commit 301 | # to this repository 302 | contributor_doc = doc_from_url(r.forked_from + contributor_link) 303 | contributor_element = contributor_doc.cssselect('ul.members li') 304 | 305 | hasCommitted = False 306 | for contributor in contributor_element: 307 | name = contributor.cssselect('a')[1].text_content() 308 | r.contributors.append(name) 309 | if name == self.username: 310 | hasCommitted = True 311 | 312 | #### If the user has committed then we need to find out those commits which the user 313 | #### has done and calcualte the number of lines of code and other statistics of the repo 314 | 315 | if hasCommitted: 316 | 317 | doc = doc_from_url(r.forked_from) 318 | 319 | # word history is a bit misleading contains the link as //commits/master/ 320 | history = doc.cssselect('div.history a')[0].get('href') 321 | 322 | # goes to the commit page, and extracts info about each commit 323 | history = urlparse.urljoin(self.root,history) 324 | 325 | page_value = 0 326 | 327 | while(True): 328 | 329 | page_value += 1 330 | page_number = '?page='+str(page_value) 331 | history_url = urlparse.urljoin(history,page_number) 332 | 333 | try: 334 | history_doc = doc_from_url(history_url) 335 | commits = history_doc.cssselect('li.commit-group-item') 336 | 337 | for commit in commits: 338 | 339 | authors = commit.cssselect('span.author-name') 340 | name = "" 341 | 342 | # can the length be greater than 2 in any case 343 | if len(authors) > 1: 344 | 345 | # This should not happend, a commit cannot have more than 1 author 346 | if len(authors) > 2: 347 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 348 | + 
commit.cssselect('span.sha').text_content() + ' which has more than 2 authors') 349 | 350 | commiter = commit.cssselect('span.committer span.author-name') 351 | 352 | if len(commiter) == 1: 353 | commiter_name = commiter[0].text_content().strip() 354 | 355 | for author in authors: 356 | author_name = author.text_content().strip() 357 | if not author_name == commiter_name: 358 | name = author_name 359 | else: 360 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 361 | + commit.cssselect('span.sha').text_content() + ' which has more than 2 authors and no commiter') 362 | 363 | else: 364 | name = authors[0].text_content().strip() 365 | 366 | # The name of the committer is the same as that of the 367 | if name == self.username: 368 | 369 | # this link contains the user name as well 370 | a = commit.cssselect('a.message')[0].get('href') 371 | # just append the github domain in the beginning 372 | a = urlparse.urljoin(self.domain,a) 373 | 374 | # Calculate the number of lines of code 375 | commit_doc = doc_from_url(a) 376 | 377 | time = commit_doc.cssselect('time')[0].get('datetime') 378 | 379 | data = {'link':a, 'time':time, 'author':name } 380 | 381 | # this p element contains the text where the info about the commit is written 382 | commit_info = commit_doc.cssselect('p.explain')[0].text_content() 383 | 384 | numbers = [] 385 | # it contains 3 numbers 386 | # 1 - number of files changed 387 | # 2 - number of lines added 388 | # 3 - number of lines deleted 389 | for string in commit_info.split(): 390 | try: 391 | a = atoi(string) 392 | numbers.append(a) 393 | except Exception: 394 | pass 395 | 396 | additions = numbers[1] 397 | deletions = numbers[2] 398 | data['additions'] = additions 399 | data['deletions'] = deletions 400 | 401 | # Update the lines of code by the user for that repo 402 | r.ownLinesOfCode += additions 403 | r.ownLinesOfCode -= deletions 404 | 405 | # add this commit as own commit 406 | r.own_commits.append(data) 407 | 408 | except Exception: 409 | #print Exception 410 | break 411 | 412 | # if the user has committed then other contents of the repository are 413 | # of use to us, need to decide if we need to maintain a list of all the forkers 414 | 415 | # find out the owner of the repository that has been forked 416 | owner = doc.cssselect('div.title-actions-bar span')[0].text_content().strip() 417 | r.owner = owner 418 | 419 | # TODO Do we need to see who all are watching or just the number would suffice 420 | watchers = doc.cssselect('li.watchers a')[0].text_content().strip() 421 | r.watchers = int(watchers) 422 | 423 | # TODO do we need to get all the names or just the number would be sufficient 424 | # link is of type //network/ 425 | fork_link = doc.cssselect('li.forks a')[0].get('href') 426 | # members needs to be appended to get to the actual page that contains the forkers 427 | fork_link = fork_link +"/members" 428 | fork_link = urlparse.urljoin(self.domain,fork_link) 429 | fork_page = urllib2.urlopen(fork_link) 430 | fork_doc = parse(fork_page).getroot() 431 | 432 | # Get the name and link of each forker 433 | forkers = fork_doc.xpath(forker_xpath) 434 | for fork in forkers: 435 | name = fork.text_content() 436 | link = urlparse.urljoin(self.domain, name) 437 | # if a forker has also commited then higher weigthage needs to be given 438 | name_and_link = {'name':name, 'url':link, 'hasCommitted': False} 439 | r.forks.append(name_and_link) 440 | 441 | else: 442 | print "User has not committed to the forked repository" 443 | 444 | 
############################################################################################### 445 | 446 | # Now we will collect info from the repository that is in the users 447 | # a/c, it may contain some work that the user either did request to be pulled 448 | # or was not acceppted, but since some work has been done some weight should be attributed 449 | 450 | contributor_doc = doc_from_url(r.link + contributor_link) 451 | contributor_element = contributor_doc.cssselect('ul.members li') 452 | 453 | hasCommitted = False 454 | for contributor in contributor_element: 455 | name = contributor.csseselect('a')[1].text_content() 456 | r.contributors.append(name) 457 | if name == self.username: 458 | hasCommitted = True 459 | 460 | # TODO How to take into account those commits which have been both pulled and are in the local repository 461 | # TODO as well. Is it worth so much of trouble? 462 | ## If the user has committed and these commits have not been pulled in the other repository then 463 | if hasCommitted: 464 | 465 | clone_doc = doc_from_url(r.link + clone_link) 466 | clone_text = clone_doc.cssselect("div#path")[0].text_content().split() 467 | 468 | # The first number is the number of clones over the last 4 weeks 469 | # TODO save this count for the forked repository as well 470 | for string in clone_text: 471 | try: 472 | a = atoi(string) 473 | self.clones = a 474 | break 475 | except Exception: 476 | pass 477 | 478 | doc = doc_from_url(r.link) 479 | 480 | # word history is a bit misleading contains the link as //commits/master/ 481 | history = doc.cssselect('div.history a')[0].get('href') 482 | 483 | # goes to the commit page, and extracts info about each commit 484 | history = urlparse.urljoin(self.root,history) 485 | 486 | page_value = 0 487 | 488 | while(True): 489 | 490 | page_value += 1 491 | page_number = '?page='+str(page_value) 492 | history_url = urlparse.urljoin(history,page_number) 493 | 494 | try: 495 | history_doc = doc_from_url(history_url) 496 | commits = history_doc.cssselect('li.commit-group-item') 497 | 498 | for commit in commits: 499 | 500 | authors = commit.cssselect('span.author-name') 501 | name = "" 502 | 503 | # can the length be greater than 2 in any case 504 | if len(authors) > 1: 505 | 506 | # This should not happend, a commit cannot have more than 1 author 507 | if len(authors) > 2: 508 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 509 | + commit_doc.cssselect('span.sha').text_content() + ' which has more than 2 authors') 510 | 511 | commiter = commit.cssselect('span.committer span.author-name') 512 | if len(commiter) == 1: 513 | commiter_name = commiter[0].text_content().strip() 514 | 515 | for author in authors: 516 | author_name = author.text_content().strip() 517 | if not author_name == commiter_name: 518 | name = author_name 519 | else: 520 | logging.warning('Repository ' + r.name + ' has a commit with sha ' \ 521 | + commit_doc.cssselect('span.sha').text_content() + ' which has more than 2 authors and no commiter') 522 | 523 | else: 524 | name = authors[0].text_content().strip() 525 | 526 | # The name of the committer is the same as that of the 527 | if name == self.username: 528 | 529 | # this link contains the user name as well 530 | a = commit.cssselect('a.message')[0].get('href') 531 | # just append the github domain in the beginning 532 | a = urlparse.urljoin(self.domain,a) 533 | 534 | # Calculate the number of lines of code 535 | commit_doc = doc_from_url(a) 536 | 537 | time = 
commit_doc.cssselect('time')[0].get('datetime') 538 | 539 | data = {'link':a, 'time':time, 'author':name } 540 | 541 | # this p element contains the text where the info about the commit is written 542 | commit_info = commit_doc.cssselect('p.explain')[0].text_content() 543 | 544 | numbers = [] 545 | # it contains 3 numbers 546 | # 1 - number of files changed 547 | # 2 - number of lines added 548 | # 3 - number of lines deleted 549 | for string in commit_info.split(): 550 | try: 551 | a = atoi(string) 552 | numbers.append(a) 553 | except Exception: 554 | pass 555 | 556 | additions = numbers[1] 557 | deletions = numbers[2] 558 | data['additions'] = additions 559 | data['deletions'] = deletions 560 | 561 | # Update the lines of code for the user, that has written in the 562 | # forked repository but not sent for pull request 563 | r.selfLinesOfCode += additions 564 | r.selfLinesOfCode -= deletions 565 | 566 | except Exception: 567 | break 568 | 569 | else: 570 | print "User has not committed locally to the fork" 571 | 572 | # Calculate the follower stats 573 | def calculateFollowerStats(self): 574 | 575 | for follower in self.followers: 576 | name = follower['name'] 577 | old_value = self.stat 578 | if name in UserRepo: 579 | self.stat += follower_weight * UserRepo[name].impact_value 580 | else: 581 | self.stat += follower_weight * getUserStats(follower,UserRepo) 582 | print name + " following you, added points: " + str(self.stat - old_value) 583 | 584 | def calculateRepoStats(self): 585 | 586 | # points for own repositories 587 | for r in self.own_repo: 588 | 589 | print r.name 590 | old_value = self.stat 591 | # points from the people who have forked 592 | for forker in r.forks: 593 | 594 | name = forker['name'] 595 | 596 | if name in UserRepo: 597 | impact = UserRepo[name].impact_value 598 | else: 599 | impact = getUserStats(forker,UserRepo) 600 | 601 | if forker['hasCommitted']: 602 | self.stat += impact * forker_with_commit 603 | else: 604 | self.stat += impact * forker_without_commit 605 | 606 | # points from the watchers 607 | self.stat += r.watchers * watcher_weight 608 | 609 | # points from clones 610 | self.stat += r.clones * clone_weight 611 | 612 | # points from the number of lines of code 613 | self.stat += r.ownLinesOfCode * weight_for_code 614 | self.stat += r.otherLinesOfCode * weight_for_others_code 615 | print "Points added from repository("+r.name+"): " + str(self.stat - old_value) 616 | 617 | # points for forked repositories 618 | for r in self.forked_repo: 619 | 620 | old_value = self.stat 621 | # link of the repository from where it is forked 622 | forked_from = r.forked_from 623 | 624 | # it means the user has contributed to the repository 625 | # we do not need to calculate the repo stats because they 626 | # are already available in the repository object 627 | # however we need to calculate the stats for the person who owns the repo 628 | if r.ownLinesOfCode > 0: 629 | 630 | owner = r.owner 631 | owner_link = urlparse.urljoin(self.domain,owner) 632 | 633 | if owner in UserRepo: 634 | self.stat += fork_with_commit_owner * UserRepo[name].impact_value 635 | else: 636 | name_and_link = {'name':owner, 'url':owner_link} 637 | self.stat += fork_with_commit_owner * getUserStats(name_and_link,UserRepo) 638 | 639 | # points awarded for the lines of code commited in others repository 640 | linesOfCode = totalLinesOfCode(forked_from) 641 | 642 | # points awarded due to the quality of the forked repo 643 | repo_points = forked_repo_watcher_weight* r.watchers + 
forked_repo_forker_wight*len(r.forks) + forked_repo_lines_of_code*linesOfCode 644 | 645 | # points awarded due to contribution to the repo 646 | code_points = fork_with_commit_code * r.ownLinesOfCode/linesOfCode 647 | 648 | self.stat += code_points + repo_points 649 | 650 | # points awarded for lines of code written in forked repository that have not been pulled 651 | if r.selfLinesOfCode > 0: 652 | self.stat += fork_without_commit_code * r.selfLinesOfCode 653 | 654 | print "Points added from forked repository("+r.name+"): " + str(self.stat - old_value) 655 | 656 | def main(): 657 | 658 | if len(sys.argv) < 2: 659 | print 'No start url was given' 660 | sys.exit() 661 | 662 | url = sys.argv[1] 663 | print "Crawling the github profile %s " % url 664 | crawler = Crawler(url) 665 | crawler.crawl() 666 | 667 | print "\nOwn repositories: " + str(len(crawler.own_repo)) 668 | print "Forked repositories: " + str(len(crawler.forked_repo)) 669 | print "Followers: " + str(len(crawler.followers)) 670 | print "Skills: " + "\t".join(crawler.skills) 671 | 672 | print "\nStats about own repository" 673 | for repo in crawler.own_repo: 674 | print "Name: " + repo.name 675 | print "Language: " + repo.lang 676 | 677 | if repo.watchers > 0: 678 | print "Watchers: %d " % repo.watchers 679 | 680 | if len(repo.forks) > 0: 681 | print "Forks: %d " % len(repo.forks) 682 | 683 | print "Self commits: %d" % len(repo.own_commits) 684 | print "Lines of Code: %d" % repo.ownLinesOfCode 685 | print "Activity: %d" % repo.activity 686 | 687 | print "\n" 688 | 689 | print "\nStats about forked repository" 690 | for repo in crawler.forked_repo: 691 | print "Name: " + repo.name 692 | print "Language: " + repo.lang 693 | print "Forks: " + str(len(repo.forks)) 694 | 695 | print "Own commits: " + str(len(repo.own_commits)) 696 | print "\n" 697 | 698 | 699 | if __name__ == "__main__": 700 | main() 701 | --------------------------------------------------------------------------------
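As a worked example of the forked-repository scoring in calculateRepoStats above, assuming the default weights from weights.py and invented counts (4 watchers, 2 forkers, 5000 lines of code in the upstream repository, of which 250 were contributed by the user through accepted pull requests):

from __future__ import division
from weights import *

# invented placeholder counts for one forked repository
watchers, forkers, upstream_loc, user_loc = 4, 2, 5000, 250

repo_points = (forked_repo_watcher_weight * watchers
               + forked_repo_forker_wight * forkers
               + forked_repo_lines_of_code * upstream_loc)     # 4 + 2 + 50 = 56
code_points = fork_with_commit_code * user_loc / upstream_loc  # 0.1 * 0.05 = 0.005
print "points from this forked repository:", repo_points + code_points

The real crawler also adds fork_with_commit_owner times the repository owner's impact value, which depends on getUserStats and is omitted from this sketch.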