├── .gitignore ├── .project ├── .pydevproject ├── LICENSE.txt ├── args_parser.py ├── crawler.py ├── dependencies.txt ├── github ├── __init__.py ├── __init__.pyc ├── data_manager.py ├── exceptions.py ├── git_downloader.py ├── oauthManager.py ├── repository.py ├── repository_list.py └── session.py ├── main.py └── parallel_cloning.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | convert.py* 3 | .fuse* 4 | crawled/ 5 | backups/ 6 | authentication 7 | github_crawl.sh 8 | *~ 9 | push_token 10 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | githubSpider 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | 6 | python 2.7 7 | Default 8 | 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This project is licensed under the terms of the MIT license. 2 | 3 | Copyright (c) 2018 Tommi Unruh 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | -------------------------------------------------------------------------------- /args_parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 19, 2015 3 | 4 | @author: Tommi Unruh 5 | ''' 6 | 7 | import re 8 | import copy 9 | 10 | class ModeArgsParser(object): 11 | ''' 12 | classdocs 13 | ''' 14 | 15 | KEY_MODE = "mode" 16 | KEY_ORDER = "order" 17 | KEY_EXPLANATION = "key_explanation" 18 | KEY_ARGS_OPTIONAL = "optional_args" 19 | KEY_ARGS_NECESSARY = "necessary_args" 20 | KEY_ARGS_OPTIONAL_WVAL = "optional_args_w_value" 21 | KEY_ARGS_NECESSARY_WVAL = "necessary_args_w_value" 22 | 23 | def __init__(self): 24 | ''' 25 | Constructor 26 | ''' 27 | self.combinations = {} 28 | 29 | 30 | def addArgumentsCombination(self, mode, necessary_args=None, 31 | optional_args=None, order=None, 32 | explanation=None): 33 | """ 34 | Prepare a dictionary of necessary and optional values, 35 | with and without values respectively. 36 | """ 37 | self.combinations[mode] = { 38 | self.KEY_ORDER: [], 39 | self.KEY_EXPLANATION: None, 40 | self.KEY_ARGS_OPTIONAL: [], 41 | self.KEY_ARGS_NECESSARY: [], 42 | self.KEY_ARGS_OPTIONAL_WVAL: [], 43 | self.KEY_ARGS_NECESSARY_WVAL: [], 44 | } 45 | 46 | # Parse necessary arguments. 47 | if necessary_args: 48 | # Parse short versions first 49 | for s_arg, l_arg in necessary_args: 50 | # If a key ends in "=", we expect it to 51 | # be a key-value pair. 
52 | if s_arg: 53 | if s_arg[-1] == "=": 54 | (self.combinations[mode] 55 | [self.KEY_ARGS_NECESSARY_WVAL].append( 56 | [s_arg[:-1], l_arg] 57 | )) 58 | 59 | else: 60 | # Key does not end in "=". 61 | (self.combinations[mode] 62 | [self.KEY_ARGS_NECESSARY].append( 63 | [s_arg, l_arg] 64 | )) 65 | 66 | elif not l_arg: 67 | # s_arg and l_arg are both None, which is not correct. 68 | raise NoneTypeCombinationException() 69 | 70 | # Parse optional arguments. 71 | if optional_args: 72 | # Parse short versions first 73 | for s_arg, l_arg in optional_args: 74 | # If a key ends in "=", we expect it to 75 | # be a key-value pair. 76 | if s_arg: 77 | if s_arg[-1] == "=": 78 | (self.combinations[mode] 79 | [self.KEY_ARGS_OPTIONAL_WVAL].append( 80 | [s_arg[:-1], l_arg] 81 | )) 82 | 83 | else: 84 | # Key does not end in "=". 85 | (self.combinations[mode] 86 | [self.KEY_ARGS_OPTIONAL].append( 87 | [s_arg, l_arg] 88 | )) 89 | 90 | elif not l_arg: 91 | # s_arg and l_arg are both None, which is not correct. 92 | raise NoneTypeCombinationException() 93 | 94 | # Setup order of arguments. 95 | # This is important for returning the results. 96 | # Arguments on the command line can be mixed up! 97 | if order: 98 | self.combinations[mode][self.KEY_ORDER] = order 99 | 100 | else: 101 | # No order specified, so build the default one: 102 | # Necessary arguments first, as specified. Then optional ones. 
103 | if necessary_args: 104 | for s_arg, l_arg in necessary_args: 105 | if s_arg[-1] == "=": 106 | self.combinations[mode][self.KEY_ORDER].append( 107 | s_arg[:-1] 108 | ) 109 | else: 110 | self.combinations[mode][self.KEY_ORDER].append( 111 | s_arg 112 | ) 113 | 114 | if optional_args: 115 | for s_arg, l_arg in optional_args: 116 | if s_arg[-1] == "=": 117 | self.combinations[mode][self.KEY_ORDER].append( 118 | s_arg[:-1] 119 | ) 120 | else: 121 | self.combinations[mode][self.KEY_ORDER].append( 122 | s_arg 123 | ) 124 | 125 | if explanation: 126 | self.combinations[mode][self.KEY_EXPLANATION] = explanation 127 | 128 | # Create a duplicate of combinations as a helper variable. 129 | # It is necessary to construct the usage() message. 130 | self.combinations_helper = copy.deepcopy(self.combinations) 131 | 132 | def parseMode(self, arg): 133 | """ 134 | Check if mode ('arg') is implemented. 135 | """ 136 | mode = None 137 | arg = arg.strip() 138 | 139 | if arg[0] == "-": 140 | raise WrongFormatException(arg) 141 | 142 | else: 143 | # Check if this mode is available. 144 | for key in self.combinations: 145 | if key == arg: 146 | mode = arg 147 | break 148 | 149 | if mode: 150 | return mode 151 | 152 | else: 153 | raise WrongModeException(arg) 154 | 155 | 156 | def parseArgs(self, mode, args): 157 | # Expects args[0] to be a mode value, 158 | # i.e. it should not have a minus sign in front of it. 159 | mode = self.parseMode(mode) 160 | return self.getOpts(mode, args) 161 | 162 | def getOpts(self, mode, args): 163 | """ 164 | Parse args and return them in order, as specified by self.combinations. 165 | """ 166 | # Remark: re_short_option will also match long options. 167 | # Therefore, look for long options first, then for short options. 
168 | re_long_option = re.compile("--([a-zA-Z]+)") 169 | re_short_option = re.compile("-([a-zA-Z]+)") 170 | 171 | result = {} 172 | skip = False 173 | parsed_vals = {} 174 | 175 | for i, _ in enumerate(args): 176 | if not skip: 177 | key = None 178 | full_key = None 179 | 180 | # Check for long option. 181 | long_hit = re_long_option.match(args[i]) 182 | 183 | if long_hit: 184 | key = long_hit.group(1) 185 | full_key = long_hit.group(0) 186 | 187 | else: 188 | # No long option found, check for short option. 189 | short_hit = re_short_option.match(args[i]) 190 | if short_hit: 191 | key = short_hit.group(1) 192 | full_key = short_hit.group(0) 193 | 194 | if not key: 195 | # No short, no long option found. 196 | raise WrongFormatException(args[i]) 197 | 198 | val = self.parseNextKeyValue(args, i) 199 | 200 | if val: 201 | skip = True 202 | 203 | # Check if key-val pair is correct for this command. 204 | is_permitted = self.argPermitted(full_key, val, mode) 205 | 206 | if is_permitted: 207 | result[key] = val 208 | 209 | else: 210 | skip = False 211 | 212 | # Are necessary arguments still missing? 213 | if self.isMissingArgs(self.combinations[mode]): 214 | raise MissingParameterException(self.combinations[mode]) 215 | 216 | # Add mode to result 217 | parsed_vals[self.KEY_MODE] = mode 218 | 219 | # Bring arguments in order. 220 | # for elem in self.combinations[mode][self.KEY_ORDER]: 221 | # if elem in result: 222 | for key in result: 223 | parsed_vals[key] = result[key] 224 | 225 | return parsed_vals 226 | 227 | def parseNextKeyValue(self, args, i): 228 | """ 229 | Check next argument for a given value for this key. 
230 | """ 231 | val = None 232 | 233 | if len(args) > i + 1: 234 | parsed_val = args[i+1] 235 | if len(parsed_val) > 1 and parsed_val[0:2] != "--" and parsed_val[0] != "-": 236 | val = parsed_val 237 | 238 | elif len(parsed_val) == 1 and parsed_val != "-": 239 | val = parsed_val 240 | 241 | return val 242 | 243 | def isMissingArgs(self, combination): 244 | if ( 245 | combination[self.KEY_ARGS_NECESSARY] or 246 | combination[self.KEY_ARGS_NECESSARY_WVAL] 247 | ): 248 | return True 249 | 250 | def argPermitted(self, key, val, mode): 251 | """ 252 | Check if a given key-val pair is correctly specified. 253 | If so, remove it from the combination dictionary, so that 254 | it will be ignored for further parsing. 255 | """ 256 | KEY_SHORT = 0 257 | KEY_LONG = 1 258 | 259 | combination = self.combinations[mode] 260 | 261 | found_permitted_arg = False 262 | orig_key = key 263 | key_type = -1 264 | 265 | # clear key from leading minuses. (e.g. --abc or -abc = abc) 266 | if key[0] == "-": 267 | key = key[1:] 268 | key_type = KEY_SHORT 269 | 270 | if key[0] == "-": 271 | key = key[1:] 272 | key_type = KEY_LONG 273 | 274 | # Check if value is permitted in keys which do not need a value. 275 | for i, combinations_key in enumerate( 276 | combination[self.KEY_ARGS_NECESSARY] 277 | ): 278 | if ( 279 | key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or 280 | key_type == KEY_LONG and combinations_key[KEY_LONG] == key 281 | ): 282 | # Key found. 283 | # Was a value given? 284 | if val: 285 | raise UnneccessaryValueException(orig_key) 286 | else: 287 | combination[self.KEY_ARGS_NECESSARY].pop(i) 288 | found_permitted_arg = True 289 | 290 | if not found_permitted_arg: 291 | # Check if value is permitted in keys which do need a value. 
292 | for i, combinations_key in enumerate( 293 | combination[self.KEY_ARGS_NECESSARY_WVAL] 294 | ): 295 | if ( 296 | key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or 297 | key_type == KEY_LONG and combinations_key[KEY_LONG] == key 298 | ): 299 | # Key found. 300 | # Was a value given? 301 | if val: 302 | combination[self.KEY_ARGS_NECESSARY_WVAL].pop(i) 303 | found_permitted_arg = True 304 | else: 305 | raise MissingValueException(orig_key) 306 | 307 | if not found_permitted_arg: 308 | # Check if value is permitted in optional keys 309 | # which do not need a value. 310 | for i, combinations_key in enumerate( 311 | combination[self.KEY_ARGS_OPTIONAL] 312 | ): 313 | if ( 314 | key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or 315 | key_type == KEY_LONG and combinations_key[KEY_LONG] == key 316 | ): 317 | # Key found. 318 | # Was a value given? 319 | if val: 320 | raise UnneccessaryValueException(orig_key) 321 | else: 322 | combination[self.KEY_ARGS_OPTIONAL].pop(i) 323 | found_permitted_arg = True 324 | 325 | if not found_permitted_arg: 326 | # Check if value is permitted in optional keys 327 | # which do need a value. 328 | for i, combinations_key in enumerate( 329 | combination[self.KEY_ARGS_OPTIONAL_WVAL] 330 | ): 331 | if ( 332 | key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or 333 | key_type == KEY_LONG and combinations_key[KEY_LONG] == key 334 | ): 335 | # Key found. 336 | # Was a value given? 337 | if val: 338 | combination[self.KEY_ARGS_OPTIONAL_WVAL].pop(i) 339 | found_permitted_arg = True 340 | else: 341 | raise MissingValueException(orig_key) 342 | 343 | if not found_permitted_arg: 344 | raise WrongParameterException(mode, orig_key) 345 | 346 | return found_permitted_arg 347 | 348 | def printHelp(self, arg0): 349 | """ 350 | Print usage. 351 | """ 352 | # Construct usage string 353 | usage = ( 354 | "Usage: python " + str(arg0) + " MODE necessary_arg0, necessary_arg1" 355 | ", .. 
optional_arg0, optional_arg1, ...\n" 356 | ) 357 | 358 | # Print all modes. 359 | modes = "\nMODES: " 360 | for key in self.combinations_helper: 361 | modes += str(key) + ", " 362 | 363 | modes = modes[:-2] + "\n" 364 | 365 | args = "\nMODE ARGS [OPTIONAL_ARGS]:\n" 366 | 367 | 368 | # Construct mode-argument combination-strings. 369 | for mode in self.combinations_helper: 370 | counter = 0 371 | arg = "\t" + mode + "\t\t" 372 | for key in self.combinations_helper[mode][self.KEY_ARGS_NECESSARY_WVAL]: 373 | 374 | arg += "-" + str(key[0]) 375 | if key[1]: 376 | arg += "/--" + str(key[1]) 377 | 378 | arg += " arg" + str(counter) + " " 379 | counter += 1 380 | 381 | for key in self.combinations_helper[mode][self.KEY_ARGS_NECESSARY]: 382 | arg += "-" + str(key[0]) 383 | if key[1]: 384 | arg += "/--" + str(key[1]) + " " 385 | 386 | if ( 387 | self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL] or 388 | self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL] 389 | ): 390 | arg += "[" 391 | 392 | for key in self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL]: 393 | arg += "-" + str(key[0]) 394 | if key[1]: 395 | arg += "/--" + str(key[1]) 396 | 397 | arg += " arg" + str(counter) + ", " 398 | counter += 1 399 | 400 | for key in self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL]: 401 | arg += "-" + str(key[0]) 402 | if key[1]: 403 | arg += "/--" + str(key[1]) + ", " 404 | 405 | if ( 406 | self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL] or 407 | self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL] 408 | ): 409 | arg = arg[:-2] + "]" 410 | 411 | args += arg + "\n" 412 | 413 | # Also print explanations for each mode. 
414 | explanations = "\nDESCRIPTION:\n" 415 | tabulator = "\t" 416 | for key in self.combinations_helper: 417 | if self.combinations_helper[key][self.KEY_EXPLANATION]: 418 | explanation = "Mode: " + str(key) + "\n" + tabulator 419 | explanation += self.combinations_helper[key][self.KEY_EXPLANATION] 420 | 421 | explanations += explanation + "\n\n" 422 | 423 | print (usage + modes + args + explanations), 424 | 425 | class WrongModeException(BaseException): 426 | def __init__(self, val=None): 427 | self.val = val 428 | 429 | def __str__(self): 430 | if self.val: 431 | return "Mode '%s' is not implemented." % self.val 432 | 433 | else: 434 | return "Given mode is not implemented." 435 | 436 | class WrongFormatException(BaseException): 437 | def __init__(self, val=None): 438 | self.val = val 439 | 440 | def __str__(self): 441 | if self.val: 442 | return "Argument '%s' is malformed." % self.val 443 | 444 | else: 445 | return "An argument is malformed." 446 | 447 | class NoneTypeCombinationException(BaseException): 448 | def __str__(self): 449 | return "Combination cannot contain combination [None, None]." 450 | 451 | class MissingValueException(BaseException): 452 | def __init__(self, val=None): 453 | self.val = val 454 | 455 | def __str__(self): 456 | if self.val: 457 | return "You did not specify a value for key '%s'." % self.val 458 | 459 | else: 460 | return "You did not specify a necessary value." 
461 | 462 | class MissingParameterException(BaseException): 463 | def __init__(self, combinations=None): 464 | self.combinations = combinations 465 | 466 | def __str__(self): 467 | KEY_ARGS_NECESSARY = "necessary_args" 468 | KEY_ARGS_NECESSARY_WVAL = "necessary_args_w_value" 469 | 470 | if self.combinations: 471 | missing = "" 472 | for _list in self.combinations[KEY_ARGS_NECESSARY]: 473 | if _list[1] != None: 474 | missing += "-%s/--%s, " % (_list[0], _list[1]) 475 | else: 476 | missing += "-%s, " % (_list[0]) 477 | 478 | for _list in self.combinations[KEY_ARGS_NECESSARY_WVAL]: 479 | if _list[1] != None: 480 | missing += "-%s/--%s, " % (_list[0], _list[1]) 481 | else: 482 | missing += "-%s, " % (_list[0]) 483 | 484 | missing = missing[:-2] 485 | return "Missing parameters: %s" % missing 486 | 487 | else: 488 | return "Missing parameters. Aborting..." 489 | 490 | class UnneccessaryValueException(BaseException): 491 | def __init__(self, val=None): 492 | self.val = val 493 | 494 | def __str__(self): 495 | if self.val: 496 | return ( 497 | "You did specify a value for key '%s'," 498 | " but it does not need one." % self.val 499 | ) 500 | 501 | else: 502 | return ( 503 | "You did specify a value for a key," 504 | " which does not need one." 505 | ) 506 | 507 | class WrongParameterException(BaseException): 508 | def __init__(self, mode, param): 509 | self.mode = mode 510 | self.param = param 511 | 512 | def __str__(self): 513 | return ( 514 | "Parameter '%s' is not allowed for command '%s'." 
% ( 515 | self.param, self.mode 516 | ) 517 | ) 518 | -------------------------------------------------------------------------------- /crawler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 4, 2015 3 | 4 | @author: Tommi Unruh 5 | ''' 6 | 7 | import sys 8 | import re 9 | import os 10 | import shutil 11 | 12 | from github.session import Session as GithubSession 13 | from github.repository_list import RepositoryList 14 | from github.exceptions import RatelimitExceededException 15 | import signal 16 | from github.oauthManager import * 17 | import errno 18 | from github.data_manager import DataManager 19 | from time import sleep 20 | from threading import Thread 21 | 22 | class Crawler(object): 23 | ''' 24 | classdocs 25 | ''' 26 | 27 | # constants 28 | FILE_AUTHENTICATION = "authentication" 29 | 30 | LINK_API = "https://api.github.com" 31 | LINK_REPO_API = LINK_API + "/repositories" 32 | LINK_SEARCH_API = LINK_API + "/search/repositories" 33 | LINK_RATE_LIMIT = LINK_API + "/rate_limit" 34 | HEADER_USER_AGENT = None 35 | HEADER_XRATELIMIT_LIMIT = "X-RateLimit-Limit" 36 | HEADER_XRATELIMIT_REMAINING = "X-RateLimit-Remaining" 37 | 38 | KEY_NEXT = "next" 39 | KEY_SINCE = "since" 40 | KEY_COUNT = "count" 41 | KEY_START = "start" 42 | KEY_CLONE_URL = "clone_url" 43 | KEY_RL_REMAIN = "X-RateLimit-Remaining" 44 | KEY_STATUS_CODE = "status_code" 45 | KEY_CRAWLED_LINKS = "crawled_links" 46 | 47 | # GitHub Session object 48 | s = None 49 | 50 | def __init__(self, file_path): 51 | ''' 52 | Constructor 53 | ''' 54 | # DataManager handles file reading/writing. 55 | self.datamanager = DataManager() 56 | 57 | # Get OAuth from file 'authentication'. 58 | auth_file = file_path 59 | auth_manager = OAuthManager(filename=auth_file) 60 | auth = None 61 | try: 62 | auth = auth_manager.getAuthData() 63 | 64 | except (AuthFileNotFoundException, AuthException): 65 | # Authentication file not found or malformatted. Recreate it. 
66 | auth = self.initiateAuthCreation(auth_manager) 67 | 68 | except NoCredentialsException: 69 | oauth = None 70 | user_agent = None 71 | 72 | if auth: 73 | oauth = auth[auth_manager.KEY_OAUTH] 74 | user_agent = auth[auth_manager.KEY_USER_AGENT] 75 | 76 | self.OAUTH = oauth 77 | self.HEADER_USER_AGENT = user_agent 78 | 79 | self.HEADERS = { 80 | 'User-Agent': self.HEADER_USER_AGENT, 81 | 'Authorization': "token %s" % self.OAUTH, 82 | } 83 | 84 | # Setup authentication and settings 85 | self.s = GithubSession(self.OAUTH, self.HEADER_USER_AGENT) 86 | 87 | def initiateAuthCreation(self, auth_manager): 88 | try: 89 | auth_manager.createAuth() 90 | auth = auth_manager.getAuthData() 91 | print "Authentication process done. Continuing..." 92 | 93 | except OAuthCreationException: 94 | # OAuth error. Maybe the OAuth token could not be created, because 95 | # it already exists. 96 | print ( 97 | "OAuth error. Maybe authentication file could not be written " 98 | "because of missing write-privilege." 99 | ) 100 | sys.exit() 101 | 102 | return auth 103 | 104 | def crawlReposWUpdate(self, data_filename): 105 | self.crawlRepos(data_filename, skip=False) 106 | 107 | def crawlRepos(self, file_links, skip=True, _filter=None): 108 | current_ratelimit = self.getRateLimit()["core"]["remaining"] 109 | if current_ratelimit == 0: 110 | self.endExecution() 111 | 112 | url = None 113 | copy_only = False 114 | 115 | file_links_backup = "" 116 | 117 | # Filehandle for writing. 118 | fw = None 119 | f_links = None 120 | 121 | 122 | TEXT_PROCESSING = "Processing contents of file: " 123 | # If a links file already exists from earlier crawls, then parse it. 124 | if os.path.isfile(file_links): 125 | print "File '%s' exists already. Will be appending to it." % (file_links) 126 | 127 | file_links_backup = file_links + "_backup" 128 | 129 | def restoreBackup(signum, frame): 130 | """ 131 | Inner function: Restore original file from backup upon 132 | termination in backup process. 
133 | """ 134 | msg = "Got exit signal. Restoring original file from backup..." 135 | print "\n%s\r" % (msg), 136 | 137 | if fw: 138 | fw.close() 139 | 140 | if f_links: 141 | f_links.close() 142 | 143 | # Copy backup file back. 144 | shutil.copyfile(file_links_backup, file_links) 145 | 146 | print "%s Done." % (msg) 147 | 148 | sys.exit() 149 | 150 | # Catch process-kill signal. 151 | signal.signal(signal.SIGTERM, restoreBackup) 152 | 153 | # Also catch Ctrl-C/D. 154 | signal.signal(signal.SIGINT, restoreBackup) 155 | 156 | os.rename(file_links, file_links_backup) 157 | 158 | f_links = open(file_links_backup, 'r') 159 | 160 | if skip: 161 | # We do not want to recrawl old data, so 162 | # just copy-paste it. 163 | shutil.copyfile(file_links_backup, file_links) 164 | 165 | # Open fh for writing. 166 | fw = open(file_links, 'a') 167 | 168 | print TEXT_PROCESSING + str(file_links) + "..." 169 | sys.stdout.flush() 170 | 171 | if skip: 172 | # We do not want to recrawl old data. 173 | # Therefore, get the last next-link from the old data, 174 | # so that we can continue crawling from there. 175 | data = self.datamanager.getDataLikeTail(file_links, 176 | 1, stepsize=65) 177 | 178 | url = self.datamanager.extractNextURL(data) 179 | else: 180 | old_data = f_links 181 | 182 | etag = None 183 | repos = None 184 | next_url = None 185 | 186 | file_pos = None 187 | # Parse old data if skip was not specified. 188 | while 1 and not skip: 189 | try: 190 | file_pos = old_data.tell() 191 | parsed_data = self.datamanager.parseNextBlock(old_data) 192 | 193 | if parsed_data: 194 | _repos, url, etag, next_url = parsed_data 195 | 196 | repos = RepositoryList( 197 | url, etag, repos=_repos, 198 | next_url=next_url 199 | ) 200 | 201 | if not skip: 202 | try: 203 | # Update data, by requesting Github API. 
204 | self.nextBackupCrawl(fw, repos, 205 | copy_only=copy_only, 206 | _filter=_filter) 207 | 208 | except RatelimitExceededException: 209 | # No ratelimit remaining, continue 210 | # to only copy the old data and finish. 211 | copy_only = True 212 | 213 | # We finished parsing the old data. 214 | else: 215 | break 216 | 217 | # Encountered malformatted block, probably because 218 | # the original data file was cut/edited. 219 | # Rewind the file position and skip one line. 220 | except IOError as err: 221 | old_data.seek(file_pos, os.SEEK_SET) 222 | old_data.readline() 223 | print err, " Skipping this line!" 224 | 225 | if repos: 226 | url = repos.getNextURL() 227 | 228 | # Remove backup signal handlers. 229 | # SIG_DFL is the standard signal handle for any signal. 230 | signal.signal(signal.SIGTERM, signal.SIG_DFL) 231 | signal.signal(signal.SIGINT, signal.SIG_DFL) 232 | print "Done parsing old data." 233 | 234 | if copy_only: 235 | self.endExecution() 236 | 237 | repos = None 238 | 239 | try: 240 | # Parsing finished or no backup file found. Start crawling new data. 241 | if not fw: 242 | # There was no backup file 243 | fw = open(file_links, 'a') 244 | 245 | if not url: 246 | # We do not have a URL to start form yet. 247 | # Start crawling from the beginning. 248 | repos = self.nextCrawl(fw, _filter=_filter) 249 | url = repos.getNextURL() 250 | 251 | # Parse until ratelimit is reached. 252 | while url: 253 | # Crawl next page 254 | repos = self.nextCrawl(fw, url=url, _filter=_filter) 255 | url = repos.getNextURL() 256 | 257 | fw.close() 258 | 259 | except RatelimitExceededException: 260 | self.endExecution() 261 | 262 | def nextBackupCrawl(self, fh, repository_list, 263 | copy_only=False, _filter=None): 264 | """ 265 | Get up-to-date data for already crawled repositories. 266 | If 'copy_only' is specified, we only copy old data from 267 | the backup file to not lose any already crawled data. 
268 | """ 269 | result = None 270 | 271 | if not copy_only: 272 | # We do not want to simply copy the old data - 273 | # check for an update. 274 | print "Updating from: %s" % repository_list.getURL() 275 | 276 | result = self.s.update(repository_list) 277 | 278 | if result: 279 | print "Found update!" 280 | 281 | if _filter: 282 | # Filter results 283 | repository_list.filter(self.s, self.DEFAULT_REPO_FILTER) 284 | 285 | self.datamanager.writeRepositoryList(fh, repository_list) 286 | 287 | return result 288 | 289 | def nextCrawl(self, fh, url=None, _filter=None): 290 | """ 291 | Crawl repositories from GitHub. 292 | 'url' is used to specify the next parse-URL. 293 | """ 294 | result = None 295 | 296 | _format = "Crawling: %s" 297 | 298 | # Setup visual feedback thread. 299 | visual_feedback = visualCrawlingFeedback() 300 | 301 | if url: 302 | _format = _format % url 303 | sys.stdout.write(_format + "\r") 304 | sys.stdout.flush() 305 | 306 | visual_feedback.setMsg(_format) 307 | visual_feedback.start() 308 | result = self.s.getRepos(url=url) 309 | 310 | else: 311 | _format = _format % "From beginning." 312 | sys.stdout.write(_format + "\r") 313 | sys.stdout.flush() 314 | 315 | visual_feedback.setMsg(_format) 316 | visual_feedback.start() 317 | result = self.s.getRepos() 318 | 319 | if _filter: 320 | # Filter results 321 | result.filter(self.s, _filter) 322 | 323 | # Write new results from Github. 324 | self.datamanager.writeRepositoryList(fh, result) 325 | 326 | visual_feedback.stopFeedback() 327 | 328 | print visual_feedback.getMsg() + "Saved to file." 329 | 330 | return result 331 | 332 | @staticmethod 333 | def getKeyFromCrawlData(input_file, output_file, 334 | keys=KEY_CLONE_URL): 335 | """ 336 | Extract the value for 'key' from every crawled repository in file 337 | 'input_file'. 338 | Output is redirected into 'output_file'. 
339 | """ 340 | DataManager.getKeysFromCrawlData(input_file, output_file, keys) 341 | 342 | @staticmethod 343 | def extractReposFiltered(input_file, output_file, 344 | _filter=None): 345 | """ 346 | Extract any repository from 'input_file' that matches 'filter', 347 | into 'output_file'. 348 | """ 349 | DataManager.extractReposFiltered(input_file, output_file, _filter) 350 | 351 | def endExecution(self): 352 | print "Ratelimit reached. Quitting..." 353 | sys.exit() 354 | 355 | def getNextURL(self, _dict, next_link=None): 356 | """ 357 | Find the URL in _dict and return it. 358 | Empty string if it does not exist. 359 | 'next_link' can be used to specify an alternative if there is no 360 | link in _dict. 361 | """ 362 | if self.KEY_NEXT_URL in _dict: 363 | return _dict[self.KEY_NEXT_URL] 364 | else: 365 | if next_link: 366 | return next_link 367 | else: 368 | return "" 369 | 370 | def search(self, q="language:PHP", sort=None, order=None): 371 | """ 372 | Search GitHub for 'q'. 373 | Any search is limited to 1000 results. 374 | """ 375 | # Could yield problems, because no deep copy is done. 376 | # TODO: (maybe) 377 | resp = r.get(self.addOAuth(self.LINK_SEARCH_API + "?q=" + q), 378 | headers=self.HEADERS) 379 | 380 | decoded = json.loads(resp.text) 381 | 382 | for _dict in decoded["items"]: 383 | print _dict["clone_url"] 384 | 385 | return decoded 386 | 387 | def getRateLimit(self): 388 | return self.s.getRatelimit() 389 | 390 | def addOAuth(self, url): 391 | """ 392 | Add the OAuth get-parameter to the specified 'url'. 393 | """ 394 | token_query = "access_token=" + self.OAUTH 395 | if url.find('?') != -1: 396 | url += "&" + token_query 397 | else: 398 | url += "?" + token_query 399 | 400 | return url 401 | 402 | ### LEGACY CODE 403 | ### ~~~~~~~~~~~ 404 | def crawlSearchDays(self, start, end, q="langauge:PHP", sort=None, order=None): 405 | """ 406 | Crawl the clone urls for the search query 'q'. 
407 | However, the query will be modified to only show results of 408 | a certain day. 409 | This will be repeated until each day in [start, end] was queried. 410 | Therefore, 'start' and 'end' have to be dates of format YYYY-MM-DD. 411 | 412 | Some days may be skipped due to different length of months. 413 | """ 414 | # Check start and end format first. 415 | r = re.compile('^[0-9]{4}-[0-9]{2}-[0-9]{2}$') 416 | if not r.match(start) or not r.match(end): 417 | # 'start' or 'end' have a wrong format. 418 | print ( 419 | "'start' and 'end' are expected to be of format YYYY-MM-DD." 420 | "'%s' and '%s' were given." % (start, end) 421 | ) 422 | return -1 423 | 424 | else: 425 | # Parameters are ok, continue 426 | pass 427 | 428 | def crawlSearching(self, q="language:PHP", sort=None, order=None): 429 | """ 430 | Crawl the clone urls for the search query 'q'. 431 | The response is split into 10 URLs with 100 repositories each. 432 | """ 433 | per_page = 100 434 | page = 0 435 | 436 | for page in range(1, 11): 437 | resp = self.search(q + "&per_page=" + str(per_page) + 438 | "&page=" + str(page)) 439 | 440 | # Check if the response was empty, so that we can reduce 441 | # the load on the GitHub API servers. 442 | if not resp["items"]: 443 | break 444 | 445 | class visualCrawlingFeedback(Thread): 446 | def __init__(self): 447 | super(visualCrawlingFeedback, self).__init__() 448 | self.done = False 449 | 450 | # Set every new thread to a 'daemon'-thread, so that it is killed 451 | # upon exiting parent, i.e. in case of CTRL-C. 452 | self.daemon = True 453 | 454 | def run(self): 455 | counter = 0 456 | self.msg += "." 457 | sys.stdout.write(self.msg + "\r") 458 | sys.stdout.flush() 459 | sleep(1) 460 | 461 | while not self.done: 462 | if counter < 3: 463 | self.msg += "." 
464 | counter += 1 465 | else: 466 | self.msg = self.msg[:-3] + " " 467 | counter = 0 468 | 469 | sys.stdout.write(self.msg + "\r") 470 | sys.stdout.flush() 471 | 472 | if counter == 0: 473 | self.msg = self.msg[:-3] 474 | 475 | sleep(1) 476 | 477 | def setMsg(self, msg): 478 | self.msg = msg 479 | 480 | def stopFeedback(self): 481 | self.done = True 482 | 483 | def getMsg(self): 484 | return self.msg -------------------------------------------------------------------------------- /dependencies.txt: -------------------------------------------------------------------------------- 1 | installed custom python modules: requests, pexpect 2 | pip install requests 3 | pip install pexpect 4 | (equivalent: easy_install) 5 | 6 | for the git cloning feature, git has to be installed and accessible in $PATH. 7 | -------------------------------------------------------------------------------- /github/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tommiu/GithubSpider/72ec160b24c416411ad752c92a6489b5cbcdabe2/github/__init__.py -------------------------------------------------------------------------------- /github/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tommiu/GithubSpider/72ec160b24c416411ad752c92a6489b5cbcdabe2/github/__init__.pyc -------------------------------------------------------------------------------- /github/data_manager.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 29, 2015 3 | 4 | @author: tommi 5 | ''' 6 | import os 7 | import errno 8 | import sys 9 | from github.repository_list import RepositoryList 10 | 11 | class DataManager(object): 12 | ''' 13 | Manages the saving and loading of data. 
import os
import errno
import sys

# NOTE(review): project-local import, only needed by the methods that build
# RepositoryList objects; guarded so the module also loads standalone
# (the original crashed at import time instead).
try:
    from github.repository_list import RepositoryList
except ImportError:
    pass


class DataManager(object):
    '''
    Manages the saving and loading of crawled repository data.

    Crawled data files consist of repeating 4-line blocks:
        1. JSON list of repository dictionaries
        2. "# url: <url of this result page>"
        3. "# ETag: <ETag header of this result page>"
        4. "# next_url: <url of the next result page>"
    '''
    COMMENT_CHAR = "#"

    KEY_ETAG = "ETag"
    KEY_THIS_URL = "url"
    KEY_NEXT_URL = "next_url"

    FILTERKEY_SIZE = "size"
    FILTERKEY_STARS = "stars"
    FILTERKEY_EMPTY = "nofilter"

    def __init__(self):
        '''
        Constructor
        '''

    def parseNextBlock(self, fh):
        """
        Parse the next block of data from filehandle 'fh'. Expect:
        1. List of dictionaries.
        2. # url: https://api.github.com/repositories?since=XXX
        3. # ETag: W/"unique_string"
        4. # next_url: https://api.github.com/repositories?since=XXX

        Returns (repos, url, etag, next_url) as stripped strings, or None
        when 'fh' is exhausted. Raises IOError on a malformatted file.
        """
        url = None
        etag = None
        repos = None
        # BUGFIX: was 'url_link = None' while the code below reads and
        # returns 'next_url' -- a malformatted block therefore raised
        # NameError instead of the intended IOError.
        next_url = None

        # 'counter' determines the correct sequence/file-format of
        # the given links-file.
        counter = 0
        # Parse four lines of data.
        for l in fh:
            counter += 1

            # Does the line start with '#', indicating a comment?
            if self.isComment(l):
                # IMPORTANT: By specifying counter < 4, any order of
                # url, next_url and etag is allowed.
                # The speedloss of having to do extra checks of
                # isURL() and isNext() is negligible.
                if self.isURL(l) and counter == 2:
                    url = self.getVal(l, sep=' ', index=2)

                elif self.isEtag(l) and counter == 3:
                    etag = self.getVal(l)

                elif self.isNext(l) and counter == 4:
                    next_url = self.getVal(l, sep=' ', index=2)

                else:
                    raise IOError("File is malformatted, stopping at line: "
                                  "%s" % l)

            else:
                if l != "" and counter == 1:
                    repos = l.strip()

            # We are done with parsing a single block of data.
            if counter == 4:
                if url and etag and repos and next_url:
                    return (
                        repos.strip(), url.strip(),
                        etag.strip(), next_url.strip()
                    )

                else:
                    raise IOError("Encountered an error: "
                                  "Data in file is malformatted.\n"
                                  "found repos? %s\n"
                                  "url: %s\n"
                                  "etag: %s\n"
                                  "next url: %s" % (
                                      "Yes" if repos else "No",
                                      str(url),
                                      str(etag),
                                      str(next_url)
                                  ))

        # For loop exited before returning, indicating the end of 'fh'.
        return None

    def getDataLikeTail(self, filename, count, stepsize=2048):
        """
        Efficient way to read the last 'count' lines of a huge file,
        yielding them one by one (like 'tail -n count').
        """
        # BUGFIX (py3 compatibility): the file is opened in binary mode, so
        # count bytes, not str. b"\n" behaves identically under Python 2.
        sep = b"\n"

        with open(filename, 'rb') as fh:
            # Go to end of file.
            pos = 0
            linecount = 0
            fh.seek(0, os.SEEK_END)

            while linecount <= count:
                try:
                    # Go backwards in file.
                    fh.seek(-stepsize, os.SEEK_CUR)

                    # Count found newlines.
                    linecount += fh.read(stepsize).count(sep)

                    # We just went forwards, so go back again.
                    fh.seek(-stepsize, os.SEEK_CUR)

                except IOError as e:
                    # NOTE(review): assumes seeking before the start of the
                    # file raises EINVAL, as it does for binary buffered
                    # files -- verify on the target interpreter.
                    if e.errno == errno.EINVAL:
                        # Attempted to seek past the start while stepping
                        # back; clamp to the file head instead.
                        stepsize = fh.tell()
                        fh.seek(0, os.SEEK_SET)

                        # Read from beginning.
                        linecount += fh.read(stepsize).count(sep)

                        pos = 0
                        break

                pos = fh.tell()

        # Now read data.
        with open(filename, 'r') as fh:
            fh.seek(pos, os.SEEK_SET)

            for line in fh:
                # We found n (or even more) lines,
                # so we could need to skip some lines.
                if linecount > count:
                    linecount -= 1
                    continue

                # Otherwise return data.
                yield line

    def writeRepositoryList(self, fh, repository_list):
        """
        Write crawled 'repository_list' (one 4-line block, see class
        docstring) to filehandler 'fh' and flush.
        """
        fh.write(str(repository_list) + "\n")
        fh.write(self.COMMENT_CHAR + " " + self.KEY_THIS_URL + ": %s\n" %
                 repository_list.getURL())
        fh.write(self.COMMENT_CHAR + " " + self.KEY_ETAG + ": %s\n" %
                 repository_list.getEtag())
        fh.write(self.COMMENT_CHAR + " " + self.KEY_NEXT_URL + ": %s\n" %
                 repository_list.getNextURL())

        fh.flush()

    @staticmethod
    def isComment(_str):
        # A line starting with '#' carries crawl metadata, not repo data.
        return _str.startswith(DataManager.COMMENT_CHAR)

    @staticmethod
    def getKeysFromCrawlData(input_file, output_file, keys):
        """
        Extract the value for each key in 'keys' from every crawled
        repository in file 'input_file'.
        Output is redirected into 'output_file'.

        'keys' can be a single key or multiple keys separated by commas.
        """
        filter_keys = []

        if "," in keys:
            filter_keys = keys.split(",")
            filter_keys = [key.strip() for key in filter_keys]
        else:
            filter_keys.append(keys)

        header = "# "
        header += " ".join(filter_keys)
        header += "\n"
        header += "#-----------------------------"

        # Extract values
        with open(input_file, 'r') as fr:
            with open(output_file, 'w') as fw:
                # Write "header" line first.
                fw.write(header + "\n")

                for l in fr:
                    if not DataManager.isComment(l):
                        if l != "":
                            repos = RepositoryList(repos=l)

                            if not repos.isEmpty():
                                # Found a list of repo dictionaries.
                                # Read it and get its value for each key.
                                for repo in repos:
                                    _output = ""
                                    for key in filter_keys:
                                        _output += str(repo[key]) + " "
                                    fw.write(_output.strip() + "\n")

    @staticmethod
    def extractReposFiltered(input_file, output_file,
                             _filter=None):
        """
        Extract any repository from 'input_file' that matches '_filter'
        (see parseFilter() for the filter syntax) into 'output_file'.
        """
        flow = []
        try:
            flow = DataManager.parseFilter(_filter)

        except Exception as err:
            print(err)
            sys.exit()

        if flow[0] == -1:
            print("Could not parse filter correctly. Quitting...")
            sys.exit()

        elif flow[0] == DataManager.FILTERKEY_EMPTY:
            print("Empty filter specified, copying all repositories.")

        # BUGFIX: the file handles were opened without 'with' and leaked
        # whenever an exception occurred while filtering.
        with open(input_file, 'r') as fr, open(output_file, 'w') as fw:
            filtered_repos = RepositoryList()
            for l in fr:
                if not DataManager.isComment(l):
                    if l != "" and l != "[]\n":
                        # Found a list of repo dictionaries. Read it.
                        repos = RepositoryList(repos=l)

                        for repo in repos:
                            is_suitable = True

                            # Apply filter and append
                            # suitable repos to the result.
                            if flow[0] == DataManager.FILTERKEY_STARS:
                                # Extract stars value
                                stars = repo.getStars()

                                if flow[1] != -1:
                                    # specified filter: stars == flow[1]
                                    if stars != flow[1]:
                                        is_suitable = False
                                else:
                                    if flow[2] != -1:
                                        # specified filter: stars > flow[2]
                                        if stars <= flow[2]:
                                            is_suitable = False
                                    if flow[3] != -1:
                                        # specified filter: stars < flow[3]
                                        if stars >= flow[3]:
                                            is_suitable = False

                            elif flow[0] == DataManager.FILTERKEY_SIZE:
                                # Extract size value
                                size = repo.getSize()

                                if flow[1] != -1:
                                    # specified filter: size > flow[1]
                                    if size <= flow[1]:
                                        is_suitable = False
                                else:
                                    if flow[2] != -1:
                                        # specified filter: size < flow[2]
                                        if size >= flow[2]:
                                            is_suitable = False

                            elif flow[0] == DataManager.FILTERKEY_EMPTY:
                                pass

                            if is_suitable:
                                filtered_repos += repo

            # Print out the number of matched repositories.
            _len = len(filtered_repos)
            _str = "repository" if _len == 1 else "repositories"
            print("%d %s matched and written to file." % (_len, _str))
            fw.write(str(filtered_repos))

    @staticmethod
    def parseFilter(_filter):
        """
        Parse a given filter and extract interesting values.

        Returns a 4-element 'flow' list:
            flow[0]: filter keyword ("stars", "size", "nofilter") or -1
            stars:   flow[1] = exact value, flow[2] = '>', flow[3] = '<'
            size:    flow[1] = '>',        flow[2] = '<'
        Unset positions stay -1. Raises ValueError for unparsable or
        contradictory filters.
        """
        flow = [-1, -1, -1, -1]

        if _filter:
            # Expecting filter of type 'keyword:"values"'. A value can be
            # "=5", so do not just .split("=").
            index = _filter.find(":")

            if index > 0:
                key = _filter[0:index].strip()
                val = _filter[index+1:].strip()
            else:
                raise ValueError("Filter format is wrong. You gave: %s. "
                                 "However, expected is '%s'!" % (
                                     _filter, "key:\"values\""
                                 ))

            if key == DataManager.FILTERKEY_STARS and val:
                flow[0] = key

                # Expecting "=int", ">int", "<int" or ">int <int".
                for _val in val.split(" "):
                    # Ignore empty values
                    if _val:
                        # Check for "=int"
                        index = _val.find("=")
                        if index != -1:
                            # Found "="

                            # Ignore values found earlier.
                            flow[1] = int(_val[index+1:].strip())

                            # Break and ignore rest.
                            break

                        # Check for ">int"
                        index = _val.find(">")
                        if index != -1:
                            # Found ">"

                            flow[2] = int(_val[index+1:].strip())

                            continue

                        # Check for "<int"
                        index = _val.find("<")
                        if index != -1:
                            # Found "<"

                            flow[3] = int(_val[index+1:].strip())

                if (
                    flow[2] > -1 and flow[3] > -1
                    and flow[2] >= flow[3]
                ):
                    raise ValueError("Filter will not yield "
                                     "any results: >%d <%d." % (
                                         flow[2], flow[3]
                                     ))
                elif (
                    flow[1] == -1 and flow[2] == -1 and flow[3] == -1
                ):
                    raise ValueError(
                        "Filter could not be parsed. \nExample filters: "
                        "stars:\"=2\", stars:\">2 <5\", stars:\"<10\""
                    )

            elif key == DataManager.FILTERKEY_SIZE and val:
                flow[0] = key

                # Expecting ">int", "<int" or ">int <int".
                for _val in val.split(" "):
                    # Ignore empty values
                    if _val:
                        # Check for ">int"
                        index = _val.find(">")
                        if index != -1:
                            # Found ">"

                            flow[1] = int(_val[index+1:].strip())

                            continue

                        # Check for "<int"
                        index = _val.find("<")
                        if index != -1:
                            # Found "<"

                            flow[2] = int(_val[index+1:].strip())

                if flow[1] > -1 and flow[2] > -1:
                    if flow[1] >= flow[2] - 1:
                        raise ValueError(
                            "Filter will not yield any results: >%d <%d." % (
                                flow[1], flow[2]
                            )
                        )

                elif flow[1] == -1 and flow[2] == -1:
                    raise ValueError(
                        "Filter could not be parsed. \nExample filters: "
                        "size:\">50 <1000\", size=\"<500\", size:\">1000\""
                    )

            elif key == DataManager.FILTERKEY_EMPTY:
                flow[0] = key

            else:
                raise ValueError("Filter not known: %s" % (key))

        return flow

    def isEtag(self, _str):
        """
        True if '_str' is an "# ETag: ..." metadata line.
        NOTE: returns False when the value itself contains ':' (the
        two-way split fails) -- matches the original behavior.
        """
        try:
            key, _ = _str.split(":")
            if key[2:] == self.KEY_ETAG:
                return True

        except ValueError:
            pass

        return False

    def isURL(self, _str):
        # True if '_str' is an "# url: ..." metadata line.
        try:
            _, key, _ = _str.split(" ")
            if key.startswith(self.KEY_THIS_URL):
                return True

        except ValueError:
            pass

        return False

    def isNext(self, _str):
        # True if '_str' is an "# next_url: ..." metadata line.
        try:
            _, key, _ = _str.split(" ")
            if key.startswith(self.KEY_NEXT_URL):
                return True

        except ValueError:
            pass

        return False

    def extractNextURL(self, generator):
        """
        Return the first next_url found in 'generator' of lines.
        Raises IOError if none is found.
        """
        for l in generator:
            if self.isNext(l):
                return self.getVal(l, sep=' ', index=2)

        # No next URL found.
        raise IOError("next_url not found.")

    def getVal(self, _str, sep=':', index=1):
        """
        Return the stripped field 'index' of '_str' split by 'sep'.
        Otherwise return False.
        """
        try:
            _arr = _str.split(sep)
            return _arr[index].strip()
        # BUGFIX: a missing field raises IndexError, not ValueError -- the
        # documented 'return False' never happened before.
        except (IndexError, ValueError):
            return False


### github/exceptions.py
class RatelimitExceededException(BaseException):
    # Raised when the GitHub API ratelimit of the session is used up.
    def __str__(self):
        return "Your ratelimit is exceeded!"

class UnavailableRepoException(BaseException):
    # Raised when a single repository cannot be queried.
    def __str__(self):
        return "Repository is unavailable."

class DidNotCrawlRepoDetailsException(BaseException):
    # Raised when a key exists only in the detailed repo representation.
    def __init__(self, _key=None):
        self._key = _key

    def __str__(self):
        if self._key:
            return (
                "This repository object does not contain the specified key '%s', "
                "because its detailed representation was not requested "
                "beforehand." % self._key
            )
        else:
            return (
                "This repository object does not contain the specified key, "
                "because its detailed representation was not requested "
                "beforehand."
            )

class KeyNotFoundException(BaseException):
    # Raised when a requested key is missing from a repository dict.
    def __init__(self, _key=None):
        self._key = _key

    def __str__(self):
        if self._key:
            return (
                "This repository object does not contain the specified key: %s" % (
                    self._key
                )
            )

        else:
            return (
                "This repository object does not contain the specified key."
            )
import subprocess
import os
from time import sleep
import sys
import signal

import shutil

# NOTE(review): 'imp' was removed in Python 3.12 and 'pexpect' is a
# third-party dependency; both are only needed at call time, so the imports
# are guarded to keep the module importable everywhere.
try:
    import imp
except ImportError:
    imp = None

try:
    import pexpect
except ImportError:
    pexpect = None


class GitDownloader(object):
    """
    Manages the download (cloning) of git repositories.
    """
    def __init__(self, dir_path):
        # Normalized output directory; always ends with a slash.
        self.OUT_DIR = dir_path

        if self.OUT_DIR[-1] != "/":
            self.OUT_DIR += "/"

        # Maps plugin path -> loaded plugin module (success handlers).
        self.plugins = {}

    def cloneAllFromFile(self, filename, linenumber=0, delete=False):
        """
        Clone repositories from links, that are read from 'filename',
        starting at linenumber 'linenumber'. If 'delete' is set, each
        repository is removed again after its success handlers ran.
        """
        clone_count = 0
        linenumber = int(linenumber)
        self.interrupt = False

        if delete:
            print(
                "Cloning was called with 'delete' specified. After cloning "
                "and processing a repository, it will be deleted again to "
                "free space."
            )

        def catchInterrupt(signum, frame):
            """
            Catch SIGTERM/SIGINT and exit in a safe manner.
            """
            file_path = self.OUT_DIR + "cloning_interrupted"

            # Write linenumber to file, so that the user can continue there
            # next time.
            with open(file_path, 'w') as fh:
                fh.write(str(filename) + "\n")
                fh.write(str(linenumber) + "\n")

            print((
                "Stopped at line '%d'. Cloned %d repositories.\n"
                "Also wrote path of the link file "
                " and the linenumber to file '%s'."
            ) % (linenumber, clone_count, file_path))

            self.interrupt = True

        with open(filename, 'r') as fh:
            # If specified skip lines in links-file.
            if linenumber > 1:
                self.goToLine(fh, linenumber)

            # Catch process-kill signal.
            signal.signal(signal.SIGTERM, catchInterrupt)

            # Also catch Ctrl-C/D.
            signal.signal(signal.SIGINT, catchInterrupt)

            l = fh.readline()

            while l and not self.interrupt:
                out_dir = None
                try:
                    print("Trying link on line %d in file '%s'" % (linenumber,
                                                                   filename))
                    out_dir = self.cloneRepoLink(l.strip(), linenumber)
                    clone_count += 1
                    # If any success handler was specified by the user,
                    # execute it using the path of the
                    # downloaded repository as an argument.
                    try:
                        if not self.interrupt:
                            # If a plugin was specified to process
                            # the repository, it will be run.
                            self.runSuccessHandler(out_dir)

                    except OSError as err:
                        print(err)

                except pexpect.TIMEOUT:
                    print("Timed out.")
                    print("Skipping...")

                # EOF = process finished in unhandled way. This is also the
                # normal success path, since expect() only lists failures.
                except pexpect.EOF:
                    clone_count += 1

                except (
                    RepositoryExistsException,
                    RepositoryDoesNotExistException,
                    CredentialsExpectedException
                ) as err:
                    print(err.message)
                    print("Skipping...")

                    try:
                        out_dir = err.out_dir

                    # BUGFIX: was a bare 'except: pass', which also swallowed
                    # KeyboardInterrupt/SystemExit. Only
                    # CredentialsExpectedException lacks 'out_dir'.
                    except AttributeError:
                        pass

                finally:
                    linenumber += 1
                    l = fh.readline()

                    if delete and out_dir:
                        # Delete repository again to free disk space.
                        print("Deleting directory '%s'." % out_dir)
                        shutil.rmtree(out_dir)

        # Remove backup signal handlers.
        # SIG_DFL is the standard signal handler for any signal.
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        signal.signal(signal.SIGINT, signal.SIG_DFL)

        if not self.interrupt:
            print("End of file reached, my work is done!")

    def cloneRepoLink(self, link, int_test):
        """
        Clone a single repository from 'link' into
        OUT_DIR/<author>_<repo> and return that directory.

        'int_test' is the line number of the link (unused here; kept for
        interface compatibility).

        Raises CredentialsExpectedException, RepositoryExistsException or
        RepositoryDoesNotExistException depending on git's output. On a
        successful clone pexpect raises EOF, handled by the caller.
        """
        msg = "Cloning repository: %s..." % link

        last_slash_index = link.rfind("/")
        second_last_index = link.rfind("/", 0, last_slash_index)

        # Links look like .../author/repo.git -> strip trailing ".git".
        repo_name = link[last_slash_index + 1 : -4]
        author_name = link[second_last_index + 1 : last_slash_index]

        # reponame_authorname-format enables us to clone repositories of
        # the same name, but of different authors.
        out_dir = self.OUT_DIR + author_name + "_" + repo_name

        print("%s" % msg)
        sys.stdout.flush()

        # Start cloning the repository from 'link' simply using 'git' from
        # the user's system PATH variable.
        # 1 hour max. per repository until timeout.
        process = pexpect.spawn("git", ["clone", link, out_dir], 3600)
        expectation = process.expect([
            'Username',
            'already exists and is not an empty directory',
            'does not exist'
        ])

        if expectation == 0:
            raise CredentialsExpectedException()

        elif expectation == 1:
            raise RepositoryExistsException(
                process.before + process.after,
                out_dir
            )

        elif expectation == 2:
            raise RepositoryDoesNotExistException(
                process.before + process.after,
                out_dir
            )

        return out_dir

    def goToLine(self, fh, linenumber):
        """
        Go to 'linenumber' of a huge text file in a (memory-)efficient way.
        Raises IOError for linenumber < 1 and OutOfScopeException when the
        file has fewer lines.
        """
        if linenumber < 1:
            raise IOError(
                "Specified linenumber '%d' is smaller than 1." % linenumber
            )

        fh.seek(0, os.SEEK_SET)

        # Skip lines until desired line is reached.
        for _ in range(0, linenumber - 1):
            read = fh.readline()
            if read == "":
                # Empty string represents EOF.
                raise OutOfScopeException(msg="goToLine error: ",
                                          line=linenumber)

    def setSuccessHandler(self, package_path):
        """
        Load a python module, that will be executed each time a repository
        was successfully downloaded.

        NOTE(review): uses the deprecated 'imp' module (removed in Python
        3.12) -- consider importlib when porting.
        """
        # Get module infos from module in 'package_path'.
        # For that, we need to split the path into its package and module.
        # Example: example/dir/module.py
        #          -> Name: module
        #          -> [Path: example/dir]
        try:
            plugin_name = package_path[package_path.rfind("/")+1:-3]
            plugin_dir = package_path[:package_path.rfind("/")]

            info = imp.find_module(plugin_name, [plugin_dir])

            self.plugins[package_path] = imp.load_module(plugin_name, *info)

        except Exception as err:
            raise OSError(err)

    def runSuccessHandler(self, dir_path):
        """
        Execute each specified success handler with the file listing of
        'dir_path'. A no-op when no plugin was registered.
        """
        if self.plugins:
            _files = os.listdir(dir_path)
            for key in self.plugins:
                self.plugins[key].run(_files)


class CredentialsExpectedException(BaseException):
    # git asked for a username/password, i.e. the repo is not public.
    def __init__(self, msg=None):
        if msg:
            self.message = msg

        else:
            self.message = (
                "Login credentials were requested."
            )

class RepositoryExistsException(BaseException):
    # The clone target directory already exists and is not empty.
    def __init__(self, msg=None, out_dir=None):
        if msg:
            self.message = msg

        else:
            self.message = (
                "Repository does exist already."
            )

        if out_dir:
            self.out_dir = out_dir

class RepositoryDoesNotExistException(BaseException):
    # The remote repository is gone or private.
    def __init__(self, msg=None, out_dir=None):
        if msg:
            self.message = msg

        else:
            self.message = (
                "Repository is not accessible on GitHub.com."
            )

        if out_dir:
            self.out_dir = out_dir


class OutOfScopeException(BaseException):
    # goToLine() was asked for a line beyond the end of the file.
    def __init__(self, msg=None, line=None):
        if msg:
            self.message = msg

            if line:
                self.message += "Line %d is out of scope." % line

        else:
            self.message = (
                "goToLine() was called with a linenumber, "
                "which was out of scope."
            )

    def __str__(self):
        return self.message
import getpass
import json
import os

# NOTE(review): third-party dependency, only needed when actually talking
# to the GitHub API; guarded so the module can be imported without it.
try:
    import requests as r
except ImportError:
    r = None


class OAuthManager(object):
    """
    Manages creation and loading/parsing of authorization data for
    Github.com.
    """

    KEY_OAUTH = "OAuth"
    KEY_USER_AGENT = "user_agent"

    def __init__(self, filename=None):
        '''
        Constructor
        '''
        # Path of the authentication file (token + user agent).
        self.FILE = filename
        # Parsed auth dict, lazily filled by getAuthData().
        self.AUTH = None

    def getAuthData(self):
        """
        Return the auth dict, parsing it from self.FILE on first use.
        """
        if not self.AUTH:
            # OAuth not found, try to parse it from file.
            self.parseAuthentication(self.FILE)

        return self.AUTH

    def parseAuthentication(self, filename):
        """
        Parse OAuth token (line 1) and user agent (line 2) from 'filename'.
        Raises AuthException on empty fields, AuthFileNotFoundException
        when the file is missing.
        """
        try:
            with open(filename, 'r') as fh:
                # Parse first line, should be OAuth token.
                oauth = fh.readline().strip()
                # Parse second line, should be user agent.
                user_agent = fh.readline().strip()

                if oauth == "" or user_agent == "":
                    raise AuthException()

                self.setAuth(oauth, user_agent)

        except IOError:
            raise AuthFileNotFoundException()

    def createAuth(self):
        """
        Interactively create the authentication file: either take a
        manually entered token or create one via the GitHub API.
        """
        print(
            "Authentication file not found! This is probably your first use.\n"
            "We need to install an OAuth token for this crawler to work.\n"
            "This token does not need ANY access to your Github account.\n"
            "You can create one manually on https://github.com/settings/tokens\n"
            "or let me create one for you. However, you will need to specify\n"
            "your github username and password once. It will not be remembered "
            "or transfered somewhere else than github."
        )

        manual_oauth = False
        user_input = self.getValidUserInput(
            "Do you want to enter one manually? [y/N]",
            ["y", "Y", "N", "n"],
            default="N"
        )

        if user_input.lower() == "y":
            manual_oauth = True

        oauth = None
        username = None

        if manual_oauth:
            oauth = raw_input("Please enter your OAuth token: ").strip()
            username = raw_input("Please enter your Github email: ").strip()
        else:
            print(
                "Alright, let's create an OAuth token for your "
                "Github account and this application!"
            )

            oauth, username = self.createOAuthUntilSuccess()

        with open(self.FILE, 'w') as fh:
            fh.write(oauth.strip() + "\n")
            fh.write(username.strip() + "\n")

        print(
            "OAuth file \"authentication\" successfully written!\n"
            "Future executions will automatically read your authentication data"
            " from that file."
        )

        self.setAuth(oauth, username)

    def createOAuthUntilSuccess(self):
        """
        Repeat asking the user for username/password, until a valid
        combination is specified. This data will be used to create an OAuth
        token for the 'username' account. Returns (oauth, username).
        """
        username = raw_input("Please enter your Github email: ")
        password = getpass.getpass("Please enter your Github password: ")

        oauth = self.createOAuthToken(username, password)

        return (oauth, username)

    def createOAuthToken(self, username, password, header=None):
        """
        Request Github API for OAuth token creation.
        'header' can be used to pass extra headers, which are necessary for
        two-factor authentication.
        """
        url = "https://api.github.com/authorizations"

        payload = {
            "scopes": [],
            "note": "githubSpider token."
        }

        resp = r.post(url,
                      auth=(username, password),
                      data=json.dumps(payload),
                      headers=header)

        oauth = self.processOAuthResponse(resp, username, password)

        return oauth

    def processOAuthResponse(self, resp, username, password):
        """
        Evaluate the API response of a token-creation request. Handles
        success (201), existing token (422), bad credentials/2FA (401) and
        exceeded ratelimit (403). Returns the token or raises
        OAuthCreationException.
        """
        decoded = json.loads(resp.text)
        oauth = None

        if resp.status_code == 201:
            # Success.
            print(
                "OAuth successfully created in file 'authentication'.\n"
                "Remember: Do not transfer your OAuth token to anybody!"
            )
            oauth = decoded["token"]

        elif resp.status_code == 422:
            # OAuth already exists.
            print(
                "Error: OAuth already exists for this application.\n"
                "Visit https://github.com/settings/tokens and delete\n"
                "the githubSpider token. Then, please try again."
            )

        elif resp.status_code == 401:
            # Bad credentials or two-factor authentication.
            # Check for two-factor authentication header:
            # "X-GitHub-OTP: required; :2fa-type",
            # where 2fa-type = "sms" or other case

            KEY_TWO_FACTOR = "X-GitHub-OTP"
            if KEY_TWO_FACTOR in resp.headers:
                two_factor_header = resp.headers[KEY_TWO_FACTOR]

                # Check if two-factor-authentication is done via SMS or App.
                method = None
                if two_factor_header.find("sms") != -1:
                    method = "via SMS"
                else:
                    method = "via your Github application"

                print(
                    "You setup two-factor authentication. You should get "
                    "the one-time password %s shortly." % method
                )

                two_factor_pw = raw_input(
                    "Please enter your one-time password: "
                )

                header = {KEY_TWO_FACTOR: two_factor_pw}

                # Query OAuth creation again, this time send username,
                # password and one-time password.
                oauth = self.createOAuthToken(username, password, header)

            else:
                # Bad credentials.
                print(
                    "Error: Bad credentials, try again."
                )

                # BUGFIX: the retried token was discarded before
                # ('self.createOAuthUntilSuccess()' without assignment), so
                # even a successful retry still raised
                # OAuthCreationException below.
                oauth, _ = self.createOAuthUntilSuccess()

        elif resp.status_code == 403:
            # API rate limit exceeded.
            print(
                "Your Github API rate limit is already exceeded. "
                "Cannot query API for OAuth creation until rate limit is reset."
            )

        if not oauth:
            raise OAuthCreationException()

        return oauth

    def getValidUserInput(self, msg, valid_answers, default=None):
        """
        Ask user to input data until he entered a valid input.
        If 'default' is given, it will be returned on no user input (=user
        just input "\\n").
        """
        if default:
            valid_answers.append("")

        user_input = raw_input(msg)
        while not self.isValidUserInput(user_input, valid_answers):
            user_input = raw_input(msg)

        if user_input == "" and default:
            user_input = default

        return user_input

    def isValidUserInput(self, user_input, valid_answers):
        # True iff 'user_input' exactly matches one of 'valid_answers'.
        for answer in valid_answers:
            if user_input == answer:
                return True

        return False

    def setAuth(self, oauth, user_agent):
        """
        Validate 'oauth' against the API and remember the auth dict.
        """
        self.testAuth(oauth)

        self.AUTH = {
            self.KEY_OAUTH: oauth,
            self.KEY_USER_AGENT: user_agent
        }

    def testAuth(self, oauth_token):
        """
        Probe the ratelimit endpoint with 'oauth_token'. On bad
        credentials, offer to delete the authentication file; raises
        AuthException (deleted) or NoCredentialsException (kept).
        """
        url = "https://api.github.com/rate_limit"
        header = {
            "Authorization": "token %s" % (oauth_token)
        }

        resp = r.get(url, headers=header)

        if resp.status_code != 200:
            print(
                "Found bad credentials in authentication "
                "file 'authentication'."
            )

            user_input = self.getValidUserInput(
                "Do you want to delete it? [Y/n]",
                ["y", "Y", "N", "n"],
                default="Y"
            )

            if user_input.lower() == "y":
                msg = "Deleting authentication file..."
                # Progress message without a newline (was "print msg,\r").
                import sys
                sys.stdout.write("%s\r" % (msg))

                os.remove(self.FILE)

                print("%s Done." % (msg))

                raise AuthException()

            else:
                print("You chose to not delete the authentication data.")

            raise NoCredentialsException()
### Exceptions (github/oauthManager.py)
class AuthException(BaseException):
    # The authentication file exists but holds no usable data.
    def __str__(self):
        return "No allowed authentication found in file 'authentication'."

class AuthFileNotFoundException(BaseException):
    # The authentication file is missing entirely.
    def __str__(self):
        return "Authentication file not found. Expecting file 'authentication'."

class OAuthCreationException(BaseException):
    # Token creation via the GitHub API failed.
    def __str__(self):
        return "Failed to create OAuth token."

class NoAuthException(BaseException):
    # Auth data was requested before it was specified or parsed.
    def __str__(self):
        return (
            "No OAuth or user agent available. "
            "Did you specify or parse them before?"
        )

class NoCredentialsException(BaseException):
    # The user declined to provide (or delete bad) credentials.
    def __str__(self):
        return "No credentials given."


### github/repository.py
import json

# NOTE(review): project-local import; guarded so the class is also usable
# standalone (the exception types are only needed at call time).
try:
    from github.exceptions import *
except ImportError:
    pass

# COMPAT: 'basestring' only exists on Python 2; fall back to 'str' on
# Python 3 so the class works on both interpreters.
try:
    _string_types = basestring
except NameError:
    _string_types = str


class Repository(object):
    """
    Class representing a single repository from Github, wrapping the
    (possibly partial) JSON dictionary returned by the API.
    """

    def __init__(self, _dict):
        """
        '_dict' may be a JSON string or an already-decoded dict.
        """
        if isinstance(_dict, _string_types):
            # '_dict' is given as a string.
            self._dict = json.loads(_dict)

        elif isinstance(_dict, dict):
            # '_dict' is given as a dict (=already json-decoded).
            self._dict = _dict

        else:
            raise Exception("Given value for '_dict' is not valid: '%s'." % (
                _dict
            ))

    def filter(self, _filter):
        """
        Return True if every key of '_filter' exists here and its value
        matches (case-insensitive string comparison), False otherwise.
        """
        for key in _filter:
            if key in self._dict:
                if (
                    str(self._dict[key]).lower() != str(_filter[key]).lower()
                ):
                    return False

            else:
                return False

        return True

    def __str__(self):
        # JSON representation of the wrapped dictionary.
        return json.dumps(self._dict)

    def __getitem__(self, _key):
        return self.getValue(_key)

    def getValue(self, _key):
        """
        General method to acquire values associated with '_key'.
        Raises KeyNotFoundException when the key is missing.
        """
        if _key in self._dict:
            return self._dict[_key]
        else:
            raise KeyNotFoundException(_key)

    def getStars(self):
        # Only present when the detailed representation was crawled.
        try:
            KEY = "stargazers_count"
            return self.getValue(KEY)
        except KeyNotFoundException:
            raise DidNotCrawlRepoDetailsException(KEY)

    def getSize(self):
        # Only present when the detailed representation was crawled.
        try:
            KEY = "size"
            return self.getValue(KEY)
        except KeyNotFoundException:
            raise DidNotCrawlRepoDetailsException(KEY)

    def getURL(self):
        KEY = "url"
        return self.getValue(KEY)

    def getDict(self):
        return self._dict
import json

# NOTE(review): project-local imports; guarded so the class is also usable
# standalone (Repository/UnavailableRepoException are only needed at call
# time with populated input).
try:
    from github.repository import Repository
    from github.exceptions import UnavailableRepoException
except ImportError:
    pass

# COMPAT: 'basestring' only exists on Python 2; fall back to 'str' on
# Python 3 so the class works on both interpreters.
try:
    _string_types = basestring
except NameError:
    _string_types = str


class RepositoryList(object):
    """
    Class representing a list of repositories from Github, together with
    the crawl metadata of the result page (url, ETag, next url).
    """

    def __init__(self, url=None, etag=None, repos="[]", next_url=None):
        """
        'repos' may be a JSON string, a list of dicts, or a list of
        Repository objects.
        """
        self.url = url
        self.etag = etag
        self.next_url = next_url

        self.setRepos(repos)

    def filter(self, session, _filter):
        """
        Remove repositories from list, that do not match '_filter'
        ('_filter' is a dict stating a value for each required key, e.g.
        {"language": "PHP", "stargazers_count": 5}).

        Additionally, each surviving repository carries more details,
        because every repository is re-queried individually through
        'session'. Unavailable repositories are silently skipped.
        """
        matching = []
        for repo in self.repos:
            # Query repo and check filter.
            try:
                detailed = session.getRepo(repo.getURL())
                if detailed.filter(_filter):
                    matching.append(detailed)

            except UnavailableRepoException:
                # Skip repository
                pass

        self.repos = matching

    def __iadd__(self, other):
        # 'repo_list += repo' appends a single repository.
        self.repos.append(other)
        return self

    def __str__(self):
        """
        JSON representation of the contained repository dictionaries.
        """
        return json.dumps([repo.getDict() for repo in self.repos])

    def __iter__(self):
        # Iterate over the contained repositories.
        return iter(self.repos)

    def __len__(self):
        return len(self.repos)

    def isEmpty(self):
        return not self.repos

    def getURL(self):
        return self.url

    def setURL(self, url):
        self.url = url

    def getEtag(self):
        return self.etag

    def setETag(self, etag):
        self.etag = etag

    def getNextURL(self):
        return self.next_url

    def setNextURL(self, next_url):
        self.next_url = next_url

    def setRepos(self, repos):
        """
        Normalize 'repos' into a list of Repository objects. Accepts a
        JSON string or a list of dicts/Repository instances. Returns True
        on success, raises Exception on invalid input.
        """
        # BUGFIX: the list was initialized twice in the string branch.
        self.repos = []

        if isinstance(repos, _string_types):
            # 'repos' is given as a string; transform each decoded
            # dictionary into a Repository object.
            for _dict in json.loads(repos):
                self.repos.append(Repository(_dict))

            return True

        elif isinstance(repos, list):
            # 'repos' is given as a list (=already json-decoded).
            # Check if the list is populated with dictionaries or
            # Repository objects.
            for _obj in repos:
                if isinstance(_obj, dict):
                    # _obj is dict, transform it to Repository.
                    self.repos.append(Repository(_obj))

                elif isinstance(_obj, Repository):
                    # _obj already is Repository, just append it.
                    self.repos.append(_obj)

            return True

        raise Exception("Given value for 'repos' is not valid: '%s'." % (
            repos
        ))
36 | """ 37 | self.HEADERS = {} 38 | 39 | if OAuth and user_agent: 40 | self.setOAuth(OAuth) 41 | self.setUserAgent(user_agent) 42 | 43 | self.HEADERS = { 44 | 'User-Agent': user_agent, 45 | 'Authorization': "token %s" % OAuth 46 | } 47 | 48 | elif not OAuth: 49 | print ( 50 | "No authorization token given, continuing unauthenticated.\n" 51 | "Unauthenticated requests are limited to 60 per hour, while\n" 52 | "authenticated requests are limited to 5000 per hour." 53 | ) 54 | 55 | def getRatelimit(self): 56 | """ 57 | Request Github API for ratelimit info for this session. 58 | """ 59 | resp = self.sessionRequestGet(self.URL_RATE_LIMIT) 60 | _dict = json.loads(resp.text) 61 | 62 | if resp.status_code == 200: 63 | return _dict["resources"] 64 | else: 65 | raise Exception("Encountered a problem. Github answered with" 66 | ":\n%s" % _dict) 67 | 68 | return _dict 69 | 70 | def getRepos(self, since=0, url=None): 71 | """ 72 | Get a list of repositories. 73 | """ 74 | response = None 75 | if url: 76 | response = self.sessionRequestGet(url) 77 | else: 78 | url = self.URL_REPOS + "?since=" + str(since) 79 | response = self.sessionRequestGet(url) 80 | 81 | etag = response.headers[self.KEY_ETAG] 82 | repos = json.loads(response.text) 83 | next_url = response.links["next"]["url"] 84 | 85 | repos = RepositoryList(url, etag, repos, next_url) 86 | 87 | return repos 88 | 89 | def getRepo(self, url): 90 | """ 91 | Query a single repository. 92 | """ 93 | response = self.sessionRequestGet(url) 94 | 95 | return Repository(response.text) 96 | 97 | def update(self, repository_list): 98 | """ 99 | Query API for an updated list of 'repository_list'. 
100 | """ 101 | header = {"If-None-Match": repository_list.getEtag()} 102 | response = self.sessionRequestGet(repository_list.getURL(), header) 103 | 104 | if response.status_code == 200: 105 | # Found update 106 | 107 | etag = response.headers[self.KEY_ETAG] 108 | repos = json.loads(response.text) 109 | next_url = response.links["next"]["url"] 110 | 111 | repository_list.setETag(etag) 112 | repository_list.setRepos(repos) 113 | repository_list.setNextURL(next_url) 114 | 115 | return True 116 | 117 | return False 118 | 119 | def sessionRequestGet(self, url, headers=None): 120 | """ 121 | Send a get-request with all session-headers. 122 | """ 123 | try: 124 | if headers: 125 | header = self.HEADERS.copy() 126 | header.update(headers) 127 | 128 | response = requests.get(url, headers=header) 129 | else: 130 | response = requests.get(url, headers=self.HEADERS) 131 | 132 | if response.status_code == self.STATUS_UNAVAILABLE: 133 | if response.headers[self.KEY_RL_REMAIN] == 0: 134 | # Ratelimit 0 reached. 135 | raise RatelimitExceededException() 136 | 137 | else: 138 | # Unavailable resource 139 | raise UnavailableRepoException() 140 | 141 | except requests.exceptions.ConnectionError as err: 142 | print err 143 | print "Sleeping %d seconds and retrying with same URL." % self.SLEEP 144 | sleep(0.5) 145 | response = self.sessionRequestGet(url, headers) 146 | 147 | return response 148 | 149 | def addOAuth(self, url): 150 | """ 151 | Add the OAuth get-parameter to the specified 'url'. 152 | """ 153 | token_query = "access_token=" + self.OAUTH["token"] 154 | if url.find('?') != -1: 155 | url += "&" + token_query 156 | else: 157 | url += "?" 
+ token_query 158 | 159 | return url 160 | 161 | def setOAuth(self, OAuth): 162 | self.OAuth = OAuth 163 | 164 | def setUserAgent(self, user_agent): 165 | self.user_agent = user_agent 166 | 167 | def setPerPage(self, per_page): 168 | per_page = int(per_page) 169 | 170 | if per_page: 171 | self.per_page = per_page 172 | else: 173 | raise ValueError("'per_page' parameter could not be set.") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 4, 2015 3 | 4 | @author: Tommi Unruh 5 | ''' 6 | 7 | from crawler import Crawler 8 | import sys 9 | from args_parser import ModeArgsParser 10 | from github.git_downloader import GitDownloader, OutOfScopeException 11 | import json 12 | 13 | ARGS_HELP = "help" 14 | ARGS_RATELIMIT = "ratelimit" 15 | ARGS_CRAWL_REPOS = "crawl" 16 | ARGS_CLONE_REPOS = "clone" 17 | ARGS_EXTRACT_KEYDATA = "extract" 18 | ARGS_EXTRACTREPOS_FILTERED = "filter" 19 | 20 | REPO_KEY_LANGUAGE = "language" 21 | DEFAULT_REPO_FILTER = {REPO_KEY_LANGUAGE: "PHP"} 22 | 23 | REPO_ALLOWED_KEYS = [ 24 | 'issues_url', 'stargazers_count', 'forks_url', 'mirror_url', 25 | 'subscription_url', 'notifications_url', 'collaborators_url', 26 | 'updated_at', 'private', 'pulls_url', 'issue_comment_url', 27 | 'labels_url', 'has_wiki', 'full_name', 'owner', 'statuses_url', 28 | 'id', 'keys_url', 'description', 'subscribers_count', 29 | 'tags_url', 'network_count', 'downloads_url', 'assignees_url', 30 | 'contents_url', 'has_pages', 'git_refs_url', 31 | 'open_issues_count', 'clone_url', 'watchers_count', 32 | 'git_tags_url', 'milestones_url', 'languages_url', 'size', 33 | 'homepage', 'fork', 'commits_url', 'releases_url', 34 | 'issue_events_url', 'archive_url', 'comments_url', 35 | 'events_url', 'contributors_url', 'html_url', 'forks', 36 | 'compare_url', 'open_issues', 'git_url', 'svn_url', 37 | 'merges_url', 'has_issues', 'ssh_url', 
'blobs_url', 38 | 'git_commits_url', 'hooks_url', 'has_downloads', 'watchers', 39 | 'name', 'language', 'url', 'created_at', 'pushed_at', 40 | 'forks_count', 'default_branch', 'teams_url', 'trees_url', 41 | 'branches_url', 'subscribers_url', 'stargazers_url'] 42 | 43 | def main(argv): 44 | """ 45 | Entry point of execution. Handles program arguments and 46 | acts accordingly. 47 | """ 48 | auth_file = "authentication" 49 | 50 | # Setup command line arguments. 51 | parser = ModeArgsParser() 52 | setupArgs(parser) 53 | 54 | flow = None 55 | crawler = None 56 | 57 | try: 58 | flow = parser.parseArgs(argv[1], argv[2:]) 59 | 60 | # Check if authentication file was specified. 61 | if "a" in flow: 62 | auth_file = flow["a"] 63 | elif "auth" in flow: 64 | auth_file = flow["auth"] 65 | 66 | except: 67 | parser.printHelp(argv[0]) 68 | sys.exit() 69 | 70 | # Evaluate program arguments and start program. 71 | if flow[parser.KEY_MODE] == ARGS_HELP: 72 | parser.printHelp(argv[0]) 73 | 74 | if flow[parser.KEY_MODE] == ARGS_RATELIMIT: 75 | crawler = Crawler(auth_file) 76 | _dict = crawler.getRateLimit() 77 | print "Rate Limits:" 78 | print "core:" , _dict["core"] 79 | print "search:", _dict["search"] 80 | 81 | elif flow[parser.KEY_MODE] == ARGS_CRAWL_REPOS: 82 | crawler = Crawler(auth_file) 83 | 84 | if "ds" in flow or "dontskip" in flow: 85 | skip = False 86 | else: 87 | skip = True 88 | 89 | try: 90 | if "f" in flow: 91 | _filter = flow["f"] 92 | _filter = convertIntoDict(_filter) 93 | 94 | elif "filter" in flow: 95 | _filter = flow["filter"] 96 | _filter = convertIntoDict(_filter) 97 | 98 | else: 99 | _filter = DEFAULT_REPO_FILTER 100 | 101 | except Exception as err: 102 | print err 103 | 104 | finally: 105 | crawler.crawlRepos(flow["in"], skip, _filter=_filter) 106 | 107 | elif flow[parser.KEY_MODE] == ARGS_EXTRACT_KEYDATA: 108 | if "k" in flow or "key" in flow: 109 | try: 110 | key = flow["k"] 111 | except: 112 | key = flow["key"] 113 | finally: 114 | 
Crawler.getKeyFromCrawlData(flow["in"], flow["out"], key) 115 | 116 | else: 117 | Crawler.getKeyFromCrawlData(flow["in"], flow["out"]) 118 | 119 | elif flow[parser.KEY_MODE] == ARGS_EXTRACTREPOS_FILTERED: 120 | try: 121 | _filter = flow["f"] 122 | except: 123 | _filter = flow["filter"] 124 | finally: 125 | Crawler.extractReposFiltered(flow["in"], flow["out"], _filter) 126 | 127 | # cloning repos 128 | elif flow[parser.KEY_MODE] == ARGS_CLONE_REPOS: 129 | downloader = GitDownloader(flow["out"]) 130 | 131 | try: 132 | _line = flow["l"] 133 | except: 134 | try: 135 | _line = flow["_line"] 136 | except: 137 | _line = 0 138 | 139 | delete = False 140 | if "d" in flow or "delete" in flow: 141 | delete = True 142 | 143 | plugin = False 144 | try: 145 | downloader.setSuccessHandler(flow["p"]) 146 | plugin = True 147 | 148 | except Exception as err: 149 | try: 150 | downloader.setSuccessHandler(flow["plugin"]) 151 | plugin = True 152 | except: 153 | pass 154 | 155 | if delete and not plugin: 156 | print ( 157 | "A combination of -d/--delete without -p/--plugin is " 158 | "not allowed." 159 | ) 160 | sys.exit() 161 | 162 | try: 163 | downloader.cloneAllFromFile( 164 | flow["in"], 165 | linenumber=_line, 166 | delete=delete 167 | ) 168 | 169 | except OutOfScopeException as err: 170 | print ( 171 | "The specified line number '%s' in parameter '-l/--line' is " 172 | "out of scope for file '%s'." 
% (_line, flow["in"]) 173 | ) 174 | 175 | def convertIntoDict(_str): 176 | try: 177 | _dict = json.loads(_str) 178 | 179 | except: 180 | _dict = None 181 | 182 | if isinstance(_dict, dict): 183 | valid = True 184 | for key in _dict: 185 | if key not in REPO_ALLOWED_KEYS: 186 | valid = False 187 | invalid_key = key 188 | break 189 | 190 | if valid: 191 | return _dict 192 | 193 | else: 194 | raise ValueError("Dictionary key '%s' is not a valid " 195 | "key of a repository" % invalid_key) 196 | 197 | raise ValueError("Filter should be specified as a " 198 | "JSON-decoded python dictionary.") 199 | 200 | def setupArgs(parser): 201 | """ 202 | Setup command line arguments combinations. 203 | """ 204 | # Ratelimit: ratelimit 205 | explanation = "Check your ratelimit." 206 | parser.addArgumentsCombination(ARGS_RATELIMIT, 207 | optional_args=[["a=", "auth"]], 208 | explanation=explanation) 209 | 210 | # Help: help 211 | explanation = "Print this help." 212 | parser.addArgumentsCombination(ARGS_HELP, explanation=explanation) 213 | 214 | # Crawl repos: crawl -in file -out file (-s/--skip, -a/--auth, -f/--filter) 215 | explanation = ( 216 | "Crawl repositories from Github.com " 217 | "to file specified with \"-in\". " 218 | "-ds/--dontskip can be used to first check for updates " 219 | "for already crawled repositories in file. " 220 | "The input file will be renamed to input_file_backup. " 221 | "Use -f/--filter followed by a python dictionary to " 222 | "specify a filter to only save information of repositories " 223 | "which apply to that filter. " 224 | "The default filter is {\"language\": \"PHP\"}, but any " 225 | "python dictionary is allowed." 
226 | ) 227 | parser.addArgumentsCombination( 228 | ARGS_CRAWL_REPOS, 229 | [["in=", None]], 230 | [ 231 | ["ds", "dontskip"], 232 | ["a=", "auth"], 233 | ["f=", "filter"] 234 | ], 235 | explanation=explanation 236 | ) 237 | 238 | explanation = ( 239 | "Extract the value associated with '-k/--key' from " 240 | "crawled repositories in '-in' and write it to '-out'." 241 | "Default for 'k/--key' is 'clone_url', which " 242 | "specifies the URL for cloning a repository. " 243 | "However, '-k/--key' can take a list of keys to extract, " 244 | "separated by commas. Example: -k \"id, clone_url\"" 245 | ) 246 | # Extract key data: extract -in file -out file (-k/--key) 247 | parser.addArgumentsCombination(ARGS_EXTRACT_KEYDATA, 248 | [["in=", None], ["out=", None]], 249 | [["k=", "key"]], 250 | explanation=explanation 251 | ) 252 | 253 | explanation = ( 254 | "Filter the repositories from file '-in' and write " 255 | "filtered repositories to '-out'. '-f/--filter' specifies " 256 | "the filter criterion. Currently supported: stars:=x, stars:>x " 257 | "stars:x x, size:x