├── .gitignore
├── .project
├── .pydevproject
├── LICENSE.txt
├── args_parser.py
├── crawler.py
├── dependencies.txt
├── github
├── __init__.py
├── __init__.pyc
├── data_manager.py
├── exceptions.py
├── git_downloader.py
├── oauthManager.py
├── repository.py
├── repository_list.py
└── session.py
├── main.py
└── parallel_cloning.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | convert.py*
3 | .fuse*
4 | crawled/
5 | backups/
6 | authentication
7 | github_crawl.sh
8 | *~
9 | push_token
10 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | githubSpider
4 |
5 |
6 |
7 |
8 |
9 | org.python.pydev.PyDevBuilder
10 |
11 |
12 |
13 |
14 |
15 | org.python.pydev.pythonNature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | /${PROJECT_DIR_NAME}
5 |
6 | python 2.7
7 | Default
8 |
9 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | This project is licensed under the terms of the MIT license.
2 |
3 | Copyright (c) 2018 Tommi Unruh
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9 |
--------------------------------------------------------------------------------
/args_parser.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 19, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import re
8 | import copy
9 |
class ModeArgsParser(object):
    '''
    Parser for "mode-style" command lines: the first argument selects a
    mode (e.g. "crawl") and the remaining arguments are validated against
    the necessary/optional argument combination registered for that mode
    via addArgumentsCombination().
    '''

    # Keys used inside each per-mode combination dictionary.
    KEY_MODE = "mode"
    KEY_ORDER = "order"
    KEY_EXPLANATION = "key_explanation"
    KEY_ARGS_OPTIONAL = "optional_args"
    KEY_ARGS_NECESSARY = "necessary_args"
    KEY_ARGS_OPTIONAL_WVAL = "optional_args_w_value"
    KEY_ARGS_NECESSARY_WVAL = "necessary_args_w_value"

    def __init__(self):
        '''
        Constructor
        '''
        # Maps mode name -> combination dict (see addArgumentsCombination).
        self.combinations = {}


    def addArgumentsCombination(self, mode, necessary_args=None,
                                optional_args=None, order=None,
                                explanation=None):
        """
        Prepare a dictionary of necessary and optional values,
        with and without values respectively.

        'necessary_args'/'optional_args' are lists of
        [short_name, long_name] pairs; a short name ending in "=" marks
        an argument that expects a value. 'order' optionally fixes the
        result ordering, 'explanation' is shown by printHelp().
        """
        self.combinations[mode] = {
                        self.KEY_ORDER: [],
                        self.KEY_EXPLANATION: None,
                        self.KEY_ARGS_OPTIONAL: [],
                        self.KEY_ARGS_NECESSARY: [],
                        self.KEY_ARGS_OPTIONAL_WVAL: [],
                        self.KEY_ARGS_NECESSARY_WVAL: [],
                        }

        # Parse necessary arguments.
        if necessary_args:
            # Parse short versions first
            for s_arg, l_arg in necessary_args:
                # If a key ends in "=", we expect it to
                # be a key-value pair.
                if s_arg:
                    if s_arg[-1] == "=":
                        (self.combinations[mode]
                                [self.KEY_ARGS_NECESSARY_WVAL].append(
                                        [s_arg[:-1], l_arg]
                                        ))

                    else:
                        # Key does not end in "=".
                        (self.combinations[mode]
                                [self.KEY_ARGS_NECESSARY].append(
                                        [s_arg, l_arg]
                                        ))

                elif not l_arg:
                    # s_arg and l_arg are both None, which is not correct.
                    raise NoneTypeCombinationException()

        # Parse optional arguments.
        if optional_args:
            # Parse short versions first
            for s_arg, l_arg in optional_args:
                # If a key ends in "=", we expect it to
                # be a key-value pair.
                if s_arg:
                    if s_arg[-1] == "=":
                        (self.combinations[mode]
                                [self.KEY_ARGS_OPTIONAL_WVAL].append(
                                        [s_arg[:-1], l_arg]
                                        ))

                    else:
                        # Key does not end in "=".
                        (self.combinations[mode]
                                [self.KEY_ARGS_OPTIONAL].append(
                                        [s_arg, l_arg]
                                        ))

                elif not l_arg:
                    # s_arg and l_arg are both None, which is not correct.
                    raise NoneTypeCombinationException()

        # Setup order of arguments.
        # This is important for returning the results.
        # Arguments on the command line can be mixed up!
        if order:
            self.combinations[mode][self.KEY_ORDER] = order

        else:
            # No order specified, so build the default one:
            # Necessary arguments first, as specified. Then optional ones.
            # NOTE(review): unlike the loops above, the code below assumes
            # s_arg is never None - a [None, "long"] pair raises TypeError
            # here. Confirm whether long-only arguments are ever used.
            if necessary_args:
                for s_arg, l_arg in necessary_args:
                    if s_arg[-1] == "=":
                        self.combinations[mode][self.KEY_ORDER].append(
                                                    s_arg[:-1]
                                                    )
                    else:
                        self.combinations[mode][self.KEY_ORDER].append(
                                                    s_arg
                                                    )

            if optional_args:
                for s_arg, l_arg in optional_args:
                    if s_arg[-1] == "=":
                        self.combinations[mode][self.KEY_ORDER].append(
                                                    s_arg[:-1]
                                                    )
                    else:
                        self.combinations[mode][self.KEY_ORDER].append(
                                                    s_arg
                                                    )

        if explanation:
            self.combinations[mode][self.KEY_EXPLANATION] = explanation

        # Create a duplicate of combinations as a helper variable.
        # It is necessary to construct the usage() message.
        # (parseArgs/argPermitted consume self.combinations destructively,
        # so printHelp needs this untouched copy.)
        self.combinations_helper = copy.deepcopy(self.combinations)

    def parseMode(self, arg):
        """
        Check if mode ('arg') is implemented.
        """
        mode = None
        arg = arg.strip()

        # A mode must not look like an option.
        if arg[0] == "-":
            raise WrongFormatException(arg)

        else:
            # Check if this mode is available.
            for key in self.combinations:
                if key == arg:
                    mode = arg
                    break

        if mode:
            return mode

        else:
            raise WrongModeException(arg)


    def parseArgs(self, mode, args):
        """
        Validate 'mode' and parse 'args' against its registered
        combination. Returns a dict of argument-name -> value plus the
        mode itself under KEY_MODE.
        """
        # Expects args[0] to be a mode value,
        # i.e. it should not have a minus sign in front of it.
        mode = self.parseMode(mode)
        return self.getOpts(mode, args)

    def getOpts(self, mode, args):
        """
        Parse args and return them in order, as specified by self.combinations.
        """
        # Remark: re_short_option will also match long options.
        # Therefore, look for long options first, then for short options.
        # NOTE(review): [a-zA-Z]+ rejects digits/hyphens inside option
        # names - confirm no registered option needs them.
        re_long_option = re.compile("--([a-zA-Z]+)")
        re_short_option = re.compile("-([a-zA-Z]+)")

        result = {}
        skip = False
        parsed_vals = {}

        for i, _ in enumerate(args):
            if not skip:
                key = None
                full_key = None

                # Check for long option.
                long_hit = re_long_option.match(args[i])

                if long_hit:
                    key = long_hit.group(1)
                    full_key = long_hit.group(0)

                else:
                    # No long option found, check for short option.
                    short_hit = re_short_option.match(args[i])
                    if short_hit:
                        key = short_hit.group(1)
                        full_key = short_hit.group(0)

                if not key:
                    # No short, no long option found.
                    raise WrongFormatException(args[i])

                val = self.parseNextKeyValue(args, i)

                if val:
                    # Next element was consumed as this key's value;
                    # skip it in the following iteration.
                    skip = True

                # Check if key-val pair is correct for this command.
                is_permitted = self.argPermitted(full_key, val, mode)

                if is_permitted:
                    result[key] = val

            else:
                skip = False

        # Are necessary arguments still missing?
        # (argPermitted pops consumed args, so anything left is missing.)
        if self.isMissingArgs(self.combinations[mode]):
            raise MissingParameterException(self.combinations[mode])

        # Add mode to result
        parsed_vals[self.KEY_MODE] = mode

        # Bring arguments in order.
        # for elem in self.combinations[mode][self.KEY_ORDER]:
        #     if elem in result:
        for key in result:
            parsed_vals[key] = result[key]

        return parsed_vals

    def parseNextKeyValue(self, args, i):
        """
        Check next argument for a given value for this key.
        Returns the value string, or None if args[i+1] is absent or is
        itself an option (starts with "-").
        """
        val = None

        if len(args) > i + 1:
            parsed_val = args[i+1]
            # NOTE(review): the "--" test is redundant - any string
            # starting with "-" already fails the second condition.
            if len(parsed_val) > 1 and parsed_val[0:2] != "--" and parsed_val[0] != "-":
                val = parsed_val

            elif len(parsed_val) == 1 and parsed_val != "-":
                val = parsed_val

        return val

    def isMissingArgs(self, combination):
        # True if any necessary argument was not yet consumed by
        # argPermitted(); implicitly returns None (falsy) otherwise.
        if (
            combination[self.KEY_ARGS_NECESSARY] or
            combination[self.KEY_ARGS_NECESSARY_WVAL]
            ):
            return True

    def argPermitted(self, key, val, mode):
        """
        Check if a given key-val pair is correctly specified.
        If so, remove it from the combination dictionary, so that
        it will be ignored for further parsing.
        """
        KEY_SHORT = 0
        KEY_LONG = 1

        combination = self.combinations[mode]

        found_permitted_arg = False
        orig_key = key
        key_type = -1

        # clear key from leading minuses. (e.g. --abc or -abc = abc)
        if key[0] == "-":
            key = key[1:]
            key_type = KEY_SHORT

        if key[0] == "-":
            key = key[1:]
            key_type = KEY_LONG

        # Check if value is permitted in keys which do not need a value.
        for i, combinations_key in enumerate(
                        combination[self.KEY_ARGS_NECESSARY]
                        ):
            if (
                key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or
                key_type == KEY_LONG and combinations_key[KEY_LONG] == key
                ):
                # Key found.
                # Was a value given?
                if val:
                    raise UnneccessaryValueException(orig_key)
                else:
                    combination[self.KEY_ARGS_NECESSARY].pop(i)
                    found_permitted_arg = True

        if not found_permitted_arg:
            # Check if value is permitted in keys which do need a value.
            for i, combinations_key in enumerate(
                            combination[self.KEY_ARGS_NECESSARY_WVAL]
                            ):
                if (
                    key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or
                    key_type == KEY_LONG and combinations_key[KEY_LONG] == key
                    ):
                    # Key found.
                    # Was a value given?
                    if val:
                        combination[self.KEY_ARGS_NECESSARY_WVAL].pop(i)
                        found_permitted_arg = True
                    else:
                        raise MissingValueException(orig_key)

        if not found_permitted_arg:
            # Check if value is permitted in optional keys
            # which do not need a value.
            for i, combinations_key in enumerate(
                            combination[self.KEY_ARGS_OPTIONAL]
                            ):
                if (
                    key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or
                    key_type == KEY_LONG and combinations_key[KEY_LONG] == key
                    ):
                    # Key found.
                    # Was a value given?
                    if val:
                        raise UnneccessaryValueException(orig_key)
                    else:
                        combination[self.KEY_ARGS_OPTIONAL].pop(i)
                        found_permitted_arg = True

        if not found_permitted_arg:
            # Check if value is permitted in optional keys
            # which do need a value.
            for i, combinations_key in enumerate(
                            combination[self.KEY_ARGS_OPTIONAL_WVAL]
                            ):
                if (
                    key_type == KEY_SHORT and combinations_key[KEY_SHORT] == key or
                    key_type == KEY_LONG and combinations_key[KEY_LONG] == key
                    ):
                    # Key found.
                    # Was a value given?
                    if val:
                        combination[self.KEY_ARGS_OPTIONAL_WVAL].pop(i)
                        found_permitted_arg = True
                    else:
                        raise MissingValueException(orig_key)

        if not found_permitted_arg:
            raise WrongParameterException(mode, orig_key)

        return found_permitted_arg

    def printHelp(self, arg0):
        """
        Print usage.

        Uses self.combinations_helper (the pristine deep copy), because
        self.combinations may already be partially consumed by parsing.
        """
        # Construct usage string
        usage = (
            "Usage: python " + str(arg0) + " MODE necessary_arg0, necessary_arg1"
            ", .. optional_arg0, optional_arg1, ...\n"
            )

        # Print all modes.
        modes = "\nMODES: "
        for key in self.combinations_helper:
            modes += str(key) + ", "

        # Strip the trailing ", ".
        modes = modes[:-2] + "\n"

        args = "\nMODE ARGS [OPTIONAL_ARGS]:\n"


        # Construct mode-argument combination-strings.
        for mode in self.combinations_helper:
            counter = 0
            arg = "\t" + mode + "\t\t"
            for key in self.combinations_helper[mode][self.KEY_ARGS_NECESSARY_WVAL]:

                arg += "-" + str(key[0])
                if key[1]:
                    arg += "/--" + str(key[1])

                arg += " arg" + str(counter) + " "
                counter += 1

            for key in self.combinations_helper[mode][self.KEY_ARGS_NECESSARY]:
                arg += "-" + str(key[0])
                if key[1]:
                    arg += "/--" + str(key[1]) + " "

            if (
                self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL] or
                self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL]
                ):
                arg += "["

            for key in self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL]:
                arg += "-" + str(key[0])
                if key[1]:
                    arg += "/--" + str(key[1])

                arg += " arg" + str(counter) + ", "
                counter += 1

            for key in self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL]:
                arg += "-" + str(key[0])
                if key[1]:
                    arg += "/--" + str(key[1]) + ", "

            if (
                self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL_WVAL] or
                self.combinations_helper[mode][self.KEY_ARGS_OPTIONAL]
                ):
                arg = arg[:-2] + "]"

            args += arg + "\n"

        # Also print explanations for each mode.
        explanations = "\nDESCRIPTION:\n"
        tabulator = "\t"
        for key in self.combinations_helper:
            if self.combinations_helper[key][self.KEY_EXPLANATION]:
                explanation = "Mode: " + str(key) + "\n" + tabulator
                explanation += self.combinations_helper[key][self.KEY_EXPLANATION]

                explanations += explanation + "\n\n"

        # Trailing comma: Python 2 print statement, suppresses newline.
        print (usage + modes + args + explanations),
424 |
class WrongModeException(Exception):
    """Raised when the requested command-line mode is not registered.

    Derives from Exception (not BaseException) so generic
    'except Exception' handlers and normal error reporting catch it.
    """
    def __init__(self, val=None):
        # val: the unrecognized mode string, if known.
        self.val = val

    def __str__(self):
        if self.val:
            return "Mode '%s' is not implemented." % self.val

        else:
            return "Given mode is not implemented."
435 |
class WrongFormatException(Exception):
    """Raised when a command-line argument is malformed.

    Derives from Exception (not BaseException) so generic
    'except Exception' handlers and normal error reporting catch it.
    """
    def __init__(self, val=None):
        # val: the offending argument string, if known.
        self.val = val

    def __str__(self):
        if self.val:
            return "Argument '%s' is malformed." % self.val

        else:
            return "An argument is malformed."
446 |
class NoneTypeCombinationException(Exception):
    """Raised when an argument pair is registered as [None, None].

    Derives from Exception (not BaseException) so generic
    'except Exception' handlers catch it.
    """
    def __str__(self):
        return "Combination cannot contain combination [None, None]."
450 |
class MissingValueException(Exception):
    """Raised when a key that requires a value was given without one.

    Derives from Exception (not BaseException) so generic
    'except Exception' handlers catch it.
    """
    def __init__(self, val=None):
        # val: the key that is missing its value, if known.
        self.val = val

    def __str__(self):
        if self.val:
            return "You did not specify a value for key '%s'." % self.val

        else:
            return "You did not specify a necessary value."
461 |
class MissingParameterException(Exception):
    """Raised when necessary command-line parameters were not supplied.

    'combinations' is the (partially consumed) combination dict of the
    mode; whatever remains in its necessary-argument lists is reported
    as missing. Derives from Exception (not BaseException) so generic
    'except Exception' handlers catch it.
    """
    def __init__(self, combinations=None):
        self.combinations = combinations

    def __str__(self):
        # Local copies of ModeArgsParser's key constants, so the
        # exception can render itself without a parser instance.
        KEY_ARGS_NECESSARY = "necessary_args"
        KEY_ARGS_NECESSARY_WVAL = "necessary_args_w_value"

        if self.combinations:
            missing = ""
            for _list in self.combinations[KEY_ARGS_NECESSARY]:
                if _list[1] is not None:
                    missing += "-%s/--%s, " % (_list[0], _list[1])
                else:
                    missing += "-%s, " % (_list[0])

            for _list in self.combinations[KEY_ARGS_NECESSARY_WVAL]:
                if _list[1] is not None:
                    missing += "-%s/--%s, " % (_list[0], _list[1])
                else:
                    missing += "-%s, " % (_list[0])

            # Strip the trailing ", ".
            missing = missing[:-2]
            return "Missing parameters: %s" % missing

        else:
            return "Missing parameters. Aborting..."
489 |
class UnneccessaryValueException(Exception):
    """Raised when a value was supplied for a key that takes none.

    (Class name spelling is kept as-is: it is the public interface
    callers catch.) Derives from Exception (not BaseException) so
    generic 'except Exception' handlers catch it.
    """
    def __init__(self, val=None):
        # val: the key that wrongly received a value, if known.
        self.val = val

    def __str__(self):
        if self.val:
            return (
                "You did specify a value for key '%s',"
                " but it does not need one." % self.val
                )

        else:
            return (
                "You did specify a value for a key,"
                " which does not need one."
                )
506 |
class WrongParameterException(Exception):
    """Raised when a parameter is not registered for the given mode.

    Derives from Exception (not BaseException) so generic
    'except Exception' handlers catch it.
    """
    def __init__(self, mode, param):
        self.mode = mode
        self.param = param

    def __str__(self):
        return (
            "Parameter '%s' is not allowed for command '%s'." % (
                                self.param, self.mode
                                )
            )
518 |
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 4, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import sys
8 | import re
9 | import os
10 | import shutil
11 |
12 | from github.session import Session as GithubSession
13 | from github.repository_list import RepositoryList
14 | from github.exceptions import RatelimitExceededException
15 | import signal
16 | from github.oauthManager import *
17 | import errno
18 | from github.data_manager import DataManager
19 | from time import sleep
20 | from threading import Thread
21 |
class Crawler(object):
    '''
    Crawls the GitHub repositories API page by page via a GithubSession
    and writes results (plus paging/ETag metadata) to a links file
    through DataManager. Supports resuming from, and updating, a
    previously crawled file. Python 2 code (print statements).
    '''

    # constants
    FILE_AUTHENTICATION = "authentication"

    LINK_API = "https://api.github.com"
    LINK_REPO_API = LINK_API + "/repositories"
    LINK_SEARCH_API = LINK_API + "/search/repositories"
    LINK_RATE_LIMIT = LINK_API + "/rate_limit"
    HEADER_USER_AGENT = None
    HEADER_XRATELIMIT_LIMIT = "X-RateLimit-Limit"
    HEADER_XRATELIMIT_REMAINING = "X-RateLimit-Remaining"

    KEY_NEXT = "next"
    KEY_SINCE = "since"
    KEY_COUNT = "count"
    KEY_START = "start"
    KEY_CLONE_URL = "clone_url"
    KEY_RL_REMAIN = "X-RateLimit-Remaining"
    KEY_STATUS_CODE = "status_code"
    KEY_CRAWLED_LINKS = "crawled_links"

    # GitHub Session object
    s = None

    def __init__(self, file_path):
        '''
        Constructor

        'file_path' is the authentication file holding OAuth token and
        User-Agent; it is (re)created interactively if missing or
        malformatted.
        '''
        # DataManager handles file reading/writing.
        self.datamanager = DataManager()

        # Get OAuth from file 'authentication'.
        auth_file = file_path
        auth_manager = OAuthManager(filename=auth_file)
        auth = None
        try:
            auth = auth_manager.getAuthData()

        except (AuthFileNotFoundException, AuthException):
            # Authentication file not found or malformatted. Recreate it.
            auth = self.initiateAuthCreation(auth_manager)

        except NoCredentialsException:
            # No credentials available: fall back to anonymous access.
            oauth = None
            user_agent = None

        if auth:
            oauth = auth[auth_manager.KEY_OAUTH]
            user_agent = auth[auth_manager.KEY_USER_AGENT]

        self.OAUTH = oauth
        self.HEADER_USER_AGENT = user_agent

        self.HEADERS = {
                    'User-Agent': self.HEADER_USER_AGENT,
                    'Authorization': "token %s" % self.OAUTH,
                    }

        # Setup authentication and settings
        self.s = GithubSession(self.OAUTH, self.HEADER_USER_AGENT)

    def initiateAuthCreation(self, auth_manager):
        """
        Interactively create the authentication file and return the
        freshly parsed auth data. Exits the process if creation fails.
        """
        try:
            auth_manager.createAuth()
            auth = auth_manager.getAuthData()
            print "Authentication process done. Continuing..."

        except OAuthCreationException:
            # OAuth error. Maybe the OAuth token could not be created, because
            # it already exists.
            print (
                "OAuth error. Maybe authentication file could not be written "
                "because of missing write-privilege."
                )
            sys.exit()

        return auth

    def crawlReposWUpdate(self, data_filename):
        # Convenience wrapper: re-request (update) already crawled data.
        self.crawlRepos(data_filename, skip=False)

    def crawlRepos(self, file_links, skip=True, _filter=None):
        """
        Crawl repositories into 'file_links'.

        If the file already exists it is renamed to a backup first and
        either copied verbatim (skip=True, then crawling resumes from
        the last next-URL in the old data) or re-requested block by
        block (skip=False). SIGTERM/SIGINT restore the backup while it
        is being processed. Stops when the API ratelimit is exhausted.
        """
        current_ratelimit = self.getRateLimit()["core"]["remaining"]
        if current_ratelimit == 0:
            self.endExecution()

        url = None
        copy_only = False

        file_links_backup = ""

        # Filehandle for writing.
        fw = None
        f_links = None


        TEXT_PROCESSING = "Processing contents of file: "
        # If a links file already exists from earlier crawls, then parse it.
        if os.path.isfile(file_links):
            print "File '%s' exists already. Will be appending to it." % (file_links)

            file_links_backup = file_links + "_backup"

            def restoreBackup(signum, frame):
                """
                Inner function: Restore original file from backup upon
                termination in backup process.
                """
                msg = "Got exit signal. Restoring original file from backup..."
                print "\n%s\r" % (msg),

                if fw:
                    fw.close()

                if f_links:
                    f_links.close()

                # Copy backup file back.
                shutil.copyfile(file_links_backup, file_links)

                print "%s Done." % (msg)

                sys.exit()

            # Catch process-kill signal.
            signal.signal(signal.SIGTERM, restoreBackup)

            # Also catch Ctrl-C/D.
            signal.signal(signal.SIGINT, restoreBackup)

            os.rename(file_links, file_links_backup)

            f_links = open(file_links_backup, 'r')

            if skip:
                # We do not want to recrawl old data, so
                # just copy-paste it.
                shutil.copyfile(file_links_backup, file_links)

            # Open fh for writing.
            fw = open(file_links, 'a')

            print TEXT_PROCESSING + str(file_links) + "..."
            sys.stdout.flush()

            if skip:
                # We do not want to recrawl old data.
                # Therefore, get the last next-link from the old data,
                # so that we can continue crawling from there.
                data = self.datamanager.getDataLikeTail(file_links,
                                                        1, stepsize=65)

                url = self.datamanager.extractNextURL(data)
            else:
                old_data = f_links

                etag = None
                repos = None
                next_url = None

                file_pos = None
                # Parse old data if skip was not specified.
                while 1 and not skip:
                    try:
                        file_pos = old_data.tell()
                        parsed_data = self.datamanager.parseNextBlock(old_data)

                        if parsed_data:
                            _repos, url, etag, next_url = parsed_data

                            repos = RepositoryList(
                                        url, etag, repos=_repos,
                                        next_url=next_url
                                        )

                            if not skip:
                                try:
                                    # Update data, by requesting Github API.
                                    self.nextBackupCrawl(fw, repos,
                                                         copy_only=copy_only,
                                                         _filter=_filter)

                                except RatelimitExceededException:
                                    # No ratelimit remaining, continue
                                    # to only copy the old data and finish.
                                    copy_only = True

                        # We finished parsing the old data.
                        else:
                            break

                    # Encountered malformatted block, probably because
                    # the original data file was cut/edited.
                    # Rewind the file position and skip one line.
                    except IOError as err:
                        old_data.seek(file_pos, os.SEEK_SET)
                        old_data.readline()
                        print err, " Skipping this line!"

                if repos:
                    url = repos.getNextURL()

            # Remove backup signal handlers.
            # SIG_DFL is the standard signal handle for any signal.
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            signal.signal(signal.SIGINT, signal.SIG_DFL)
            print "Done parsing old data."

        if copy_only:
            self.endExecution()

        repos = None

        try:
            # Parsing finished or no backup file found. Start crawling new data.
            if not fw:
                # There was no backup file
                fw = open(file_links, 'a')

            if not url:
                # We do not have a URL to start form yet.
                # Start crawling from the beginning.
                repos = self.nextCrawl(fw, _filter=_filter)
                url = repos.getNextURL()

            # Parse until ratelimit is reached.
            while url:
                # Crawl next page
                repos = self.nextCrawl(fw, url=url, _filter=_filter)
                url = repos.getNextURL()

            fw.close()

        except RatelimitExceededException:
            self.endExecution()

    def nextBackupCrawl(self, fh, repository_list,
                        copy_only=False, _filter=None):
        """
        Get up-to-date data for already crawled repositories.
        If 'copy_only' is specified, we only copy old data from
        the backup file to not lose any already crawled data.
        """
        result = None

        if not copy_only:
            # We do not want to simply copy the old data -
            # check for an update.
            print "Updating from: %s" % repository_list.getURL()

            result = self.s.update(repository_list)

            if result:
                print "Found update!"

            if _filter:
                # Filter results
                # NOTE(review): self.DEFAULT_REPO_FILTER is not defined
                # anywhere in this class - most likely the '_filter'
                # argument was intended (cf. nextCrawl). As written this
                # raises AttributeError; confirm.
                repository_list.filter(self.s, self.DEFAULT_REPO_FILTER)

        self.datamanager.writeRepositoryList(fh, repository_list)

        return result

    def nextCrawl(self, fh, url=None, _filter=None):
        """
        Crawl repositories from GitHub.
        'url' is used to specify the next parse-URL.
        """
        result = None

        _format = "Crawling: %s"

        # Setup visual feedback thread.
        visual_feedback = visualCrawlingFeedback()

        if url:
            _format = _format % url
            sys.stdout.write(_format + "\r")
            sys.stdout.flush()

            visual_feedback.setMsg(_format)
            visual_feedback.start()
            result = self.s.getRepos(url=url)

        else:
            _format = _format % "From beginning."
            sys.stdout.write(_format + "\r")
            sys.stdout.flush()

            visual_feedback.setMsg(_format)
            visual_feedback.start()
            result = self.s.getRepos()

        if _filter:
            # Filter results
            result.filter(self.s, _filter)

        # Write new results from Github.
        self.datamanager.writeRepositoryList(fh, result)

        visual_feedback.stopFeedback()

        print visual_feedback.getMsg() + "Saved to file."

        return result

    @staticmethod
    def getKeyFromCrawlData(input_file, output_file,
                            keys=KEY_CLONE_URL):
        """
        Extract the value for 'key' from every crawled repository in file
        'input_file'.
        Output is redirected into 'output_file'.
        """
        DataManager.getKeysFromCrawlData(input_file, output_file, keys)

    @staticmethod
    def extractReposFiltered(input_file, output_file,
                             _filter=None):
        """
        Extract any repository from 'input_file' that matches 'filter',
        into 'output_file'.
        """
        DataManager.extractReposFiltered(input_file, output_file, _filter)

    def endExecution(self):
        # Terminates the whole process once the API ratelimit is used up.
        print "Ratelimit reached. Quitting..."
        sys.exit()

    def getNextURL(self, _dict, next_link=None):
        """
        Find the URL in _dict and return it.
        Empty string if it does not exist.
        'next_link' can be used to specify an alternative if there is no
        link in _dict.
        """
        # NOTE(review): Crawler defines KEY_NEXT, not KEY_NEXT_URL - as
        # written this raises AttributeError if ever called. Confirm the
        # intended key ("next" vs DataManager's "next_url").
        if self.KEY_NEXT_URL in _dict:
            return _dict[self.KEY_NEXT_URL]
        else:
            if next_link:
                return next_link
            else:
                return ""

    def search(self, q="language:PHP", sort=None, order=None):
        """
        Search GitHub for 'q'.
        Any search is limited to 1000 results.
        """
        # Could yield problems, because no deep copy is done.
        # TODO: (maybe)
        # NOTE(review): neither 'r' (presumably the requests module) nor
        # 'json' is imported in this module - this method raises
        # NameError as-is. Confirm before use.
        resp = r.get(self.addOAuth(self.LINK_SEARCH_API + "?q=" + q),
                     headers=self.HEADERS)

        decoded = json.loads(resp.text)

        for _dict in decoded["items"]:
            print _dict["clone_url"]

        return decoded

    def getRateLimit(self):
        # Delegates to the GitHub session; returns its ratelimit dict.
        return self.s.getRatelimit()

    def addOAuth(self, url):
        """
        Add the OAuth get-parameter to the specified 'url'.
        """
        token_query = "access_token=" + self.OAUTH
        if url.find('?') != -1:
            url += "&" + token_query
        else:
            url += "?" + token_query

        return url

    ### LEGACY CODE
    ### ~~~~~~~~~~~
    def crawlSearchDays(self, start, end, q="langauge:PHP", sort=None, order=None):
        """
        Crawl the clone urls for the search query 'q'.
        However, the query will be modified to only show results of
        a certain day.
        This will be repeated until each day in [start, end] was queried.
        Therefore, 'start' and 'end' have to be dates of format YYYY-MM-DD.

        Some days may be skipped due to different length of months.
        """
        # NOTE(review): the default query "langauge:PHP" misspells
        # "language"; GitHub would treat it as a plain search term.
        # (Unfinished legacy code: validates its inputs, then does nothing.)
        # Check start and end format first.
        r = re.compile('^[0-9]{4}-[0-9]{2}-[0-9]{2}$')
        if not r.match(start) or not r.match(end):
            # 'start' or 'end' have a wrong format.
            print (
                "'start' and 'end' are expected to be of format YYYY-MM-DD."
                "'%s' and '%s' were given." % (start, end)
                )
            return -1

        else:
            # Parameters are ok, continue
            pass

    def crawlSearching(self, q="language:PHP", sort=None, order=None):
        """
        Crawl the clone urls for the search query 'q'.
        The response is split into 10 URLs with 100 repositories each.
        """
        per_page = 100
        page = 0

        for page in range(1, 11):
            resp = self.search(q + "&per_page=" + str(per_page) +
                               "&page=" + str(page))

            # Check if the response was empty, so that we can reduce
            # the load on the GitHub API servers.
            if not resp["items"]:
                break
444 |
class visualCrawlingFeedback(Thread):
    """Background thread animating trailing dots (".", "..", "...")
    behind a status message on stdout while a crawl request runs.

    Call setMsg() before start(); stopFeedback() ends the animation.
    """

    def __init__(self):
        super(visualCrawlingFeedback, self).__init__()
        # Polled by run(); flipped to True by stopFeedback().
        self.done = False

        # Set every new thread to a 'daemon'-thread, so that it is killed
        # upon exiting parent, i.e. in case of CTRL-C.
        self.daemon = True

    def run(self):
        """Animate up to three dots behind the message once per second."""
        dot_count = 0
        self.msg = self.msg + "."
        sys.stdout.write(self.msg + "\r")
        sys.stdout.flush()
        sleep(1)

        while not self.done:
            if dot_count >= 3:
                # Three dots shown: overwrite them with spaces, restart.
                self.msg = self.msg[:-3] + "   "
                dot_count = 0
            else:
                self.msg = self.msg + "."
                dot_count = dot_count + 1

            sys.stdout.write(self.msg + "\r")
            sys.stdout.flush()

            if not dot_count:
                # Drop the three padding spaces before the next round.
                self.msg = self.msg[:-3]

            sleep(1)

    def setMsg(self, msg):
        """Set the base message shown in front of the animated dots."""
        self.msg = msg

    def stopFeedback(self):
        """Ask the animation loop to terminate after its current tick."""
        self.done = True

    def getMsg(self):
        """Return the message in its current animation state."""
        return self.msg
--------------------------------------------------------------------------------
/dependencies.txt:
--------------------------------------------------------------------------------
Required third-party Python modules: requests, pexpect
2 | pip install requests
3 | pip install pexpect
4 | (equivalent: easy_install)
5 |
6 | for the git cloning feature, git has to be installed and accessible in $PATH.
7 |
--------------------------------------------------------------------------------
/github/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tommiu/GithubSpider/72ec160b24c416411ad752c92a6489b5cbcdabe2/github/__init__.py
--------------------------------------------------------------------------------
/github/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tommiu/GithubSpider/72ec160b24c416411ad752c92a6489b5cbcdabe2/github/__init__.pyc
--------------------------------------------------------------------------------
/github/data_manager.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Aug 29, 2015
3 |
4 | @author: tommi
5 | '''
6 | import os
7 | import errno
8 | import sys
9 | from github.repository_list import RepositoryList
10 |
class DataManager(object):
    '''
    Manages the saving and loading of data.
    '''
    # Lines starting with this character are treated as metadata comments.
    COMMENT_CHAR = "#"

    # Metadata keys used in the crawl files written by writeRepositoryList().
    KEY_ETAG = "ETag"
    KEY_THIS_URL = "url"
    KEY_NEXT_URL = "next_url"

    # Filter keywords recognised by parseFilter()/extractReposFiltered().
    FILTERKEY_SIZE = "size"
    FILTERKEY_STARS = "stars"
    FILTERKEY_EMPTY = "nofilter"

    def __init__(self):
        '''
        Constructor
        '''
30 | def parseNextBlock(self, fh):
31 | """
32 | Parse next block of data. Expect:
33 | 1. List of dictionaries.
34 | 2. # url: https://api.github.com/repositories?since=XXX
35 | 3. # ETag: W/"unique_string"
36 | 4. # next_url: https://api.github.com/repositories?since=XXX
37 | """
38 | url = None
39 | etag = None
40 | repos = None
41 | url_link = None
42 |
43 | # 'counter' determines the correct sequence/file-format of
44 | # the given links-file.
45 | counter = 0
46 | # Parse four lines of data.
47 | for l in fh:
48 | counter += 1
49 |
50 | # Does the line start with '#', indicating a comment?
51 | if self.isComment(l):
52 |
53 | # IMPORTANT: By specifying counter < 4, any order of
54 | # url, next_url and etag is allowed.
55 | # The speedloss of having to do extra checks of
56 | # isURL() and isNext() is negligible.
57 | if self.isURL(l) and counter == 2:
58 | url = self.getVal(l, sep=' ', index=2)
59 |
60 | elif self.isEtag(l) and counter == 3:
61 | etag = self.getVal(l)
62 |
63 | elif self.isNext(l) and counter == 4:
64 | next_url = self.getVal(l, sep=' ', index=2)
65 |
66 | else:
67 | raise IOError("File is malformatted, stopping at line: "
68 | "%s" % l)
69 |
70 | else:
71 | if l != "" and counter == 1:
72 | repos = l.strip()
73 |
74 | # We are done with parsing a single block of data.
75 | if counter == 4:
76 | if url and etag and repos and next_url:
77 | return (
78 | repos.strip(), url.strip(),
79 | etag.strip(), next_url.strip()
80 | )
81 |
82 | else:
83 | raise IOError("Encountered an error: "
84 | "Data in file is malformatted.\n"
85 | "found repos? %s\n"
86 | "url: %s\n"
87 | "etag: %s\n"
88 | "next url: %s" % (
89 | "Yes" if repos else "No",
90 | str(url),
91 | str(etag),
92 | str(next_url)
93 | ))
94 |
95 | # For loop exited before returning, indicating the end 'fh'.
96 | return None
97 |
    def getDataLikeTail(self, filename, count, stepsize=2048):
        """
        Efficient way to read the last lines of a huge file.

        Generator: yields the last 'count' lines of 'filename' one by one,
        first scanning backwards in chunks of 'stepsize' bytes to find a
        start offset, then reading forward from there.
        NOTE(review): the counting pass compares the bytes read in 'rb'
        mode against a str separator -- this assumes Python 2, where both
        are the same type.
        """
        sep = "\n"

        with open(filename, 'rb') as fh:
            # Go to end of file.
            pos = 0
            linecount = 0
            fh.seek(0, os.SEEK_END)

            while linecount <= count:
                try:
                    # Go backwards in file.
                    fh.seek(-stepsize, os.SEEK_CUR)

                    # Count found newlines.
                    linecount += fh.read(stepsize).count(sep)

                    # We just went forwards, so go back again.
                    fh.seek(-stepsize, os.SEEK_CUR)

                except IOError as e:
                    if e.errno == errno.EINVAL:
                        # Attempted to seek past the start while stepping back.
                        stepsize = fh.tell()
                        fh.seek(0, os.SEEK_SET)

                        # Read from beginning.
                        linecount += fh.read(stepsize).count(sep)

                        pos = 0
                        break

                pos = fh.tell()

        # Now read data.
        with open(filename, 'r') as fh:
            fh.seek(pos, os.SEEK_SET)

            for line in fh:
                # We found n (or even more) lines,
                # so we could need to skip some lines.
                if linecount > count:
                    linecount -= 1
                    continue

                # Otherwise return data.
                yield line
148 |
149 | def writeRepositoryList(self, fh, repository_list):
150 | """
151 | Write crawled repository_list to filehandler 'fh'.
152 | """
153 | fh.write(str(repository_list) + "\n")
154 | fh.write(self.COMMENT_CHAR + " " + self.KEY_THIS_URL + ": %s\n" %
155 | repository_list.getURL())
156 | fh.write(self.COMMENT_CHAR + " " + self.KEY_ETAG + ": %s\n" %
157 | repository_list.getEtag())
158 | fh.write(self.COMMENT_CHAR + " " + self.KEY_NEXT_URL + ": %s\n" %
159 | repository_list.getNextURL())
160 |
161 | fh.flush()
162 |
163 | @staticmethod
164 | def isComment(_str):
165 | return _str.startswith(DataManager.COMMENT_CHAR)
166 |
167 | @staticmethod
168 | def getKeysFromCrawlData(input_file, output_file, keys):
169 | """
170 | Extract the value for 'key' from every crawled repository in file
171 | 'input_file'.
172 | Output is redirected into 'output_file'.
173 | """
174 | # Parse "keys". Can be a single key or
175 | # multiple keys seperated by commas.
176 | filter_keys = []
177 |
178 | if "," in keys:
179 | filter_keys = keys.split(",")
180 | filter_keys = [key.strip() for key in filter_keys]
181 | else:
182 | filter_keys.append(keys)
183 |
184 | header = "# "
185 | header += " ".join(filter_keys)
186 | header += "\n"
187 | header += "#-----------------------------"
188 |
189 | # Extract values
190 | with open(input_file, 'r') as fr:
191 | with open(output_file, 'w') as fw:
192 | # Write "header" line first.
193 | fw.write(header + "\n")
194 |
195 | for l in fr:
196 | if not DataManager.isComment(l):
197 | if l != "":
198 | repos = RepositoryList(repos=l)
199 |
200 | if not repos.isEmpty():
201 | # Found a list of repo dictionaries.
202 | # Read it and get its value for 'key'.
203 | for repo in repos:
204 | _output = ""
205 | for key in filter_keys:
206 | _output += str(repo[key]) + " "
207 | fw.write(_output.strip() + "\n")
208 |
    @staticmethod
    def extractReposFiltered(input_file, output_file,
                             _filter=None):
        """
        Extract any repository from 'input_file' that matches 'filter',
        into 'output_file'.

        'flow' layout (see parseFilter()):
          stars: flow[1]=exact, flow[2]=lower bound, flow[3]=upper bound
          size:  flow[1]=lower bound, flow[2]=upper bound
        (-1 means "not set"). Exits the process on an unparsable filter.
        """
        flow = []
        try:
            flow = DataManager.parseFilter(_filter)

        except Exception as err:
            print err
            sys.exit()

        if flow[0] == -1:
            print "Could not parse filter correctly. Quitting..."
            sys.exit()

        elif flow[0] == DataManager.FILTERKEY_EMPTY:
            print "Empty filter specified, copying all repositories."

        # NOTE(review): fr/fw are not closed if an exception escapes the
        # loop below; consider 'with' blocks. Also note 'output_file' is
        # truncated even when nothing matches.
        fr = open(input_file, 'r')
        fw = open(output_file, 'w')

        filtered_repos = RepositoryList()
        for l in fr.readlines():
            if not DataManager.isComment(l):
                if l != "" and l != "[]\n":
                    # Found a list of repo dictionaries. Read it.
                    repos = RepositoryList(repos=l)

                    for repo in repos:
                        is_suitable = True

                        # Apply filter and append
                        # suitable repos to the result.
                        if flow[0] == DataManager.FILTERKEY_STARS:
                            # Extract stars value
                            stars = repo.getStars()

                            if flow[1] != -1:
                                if stars != flow[1]:
                                    is_suitable = False
                            else:
                                if flow[2] != -1:
                                    # specified filter: stars > flow[2]
                                    if stars <= flow[2]:
                                        is_suitable = False
                                if flow[3] != -1:
                                    # specified filter: stars < flow[3]
                                    if stars >= flow[3]:
                                        is_suitable = False

                        elif flow[0] == DataManager.FILTERKEY_SIZE:
                            # Extract size value
                            size = repo.getSize()

                            if flow[1] != -1:
                                # specified filter: size > flow[1]
                                if size <= flow[1]:
                                    is_suitable = False
                            else:
                                if flow[2] != -1:
                                    # specified filter: size < flow[2]
                                    if size >= flow[2]:
                                        is_suitable = False

                        elif flow[0] == DataManager.FILTERKEY_EMPTY:
                            pass

                        if is_suitable:
                            filtered_repos += repo

        # Print out the number of matched repositories.
        _len = len(filtered_repos)
        _str = "repository" if _len == 1 else "repositories"
        print "%d %s matched and written to file." % (_len, _str)
        fw.write(str(filtered_repos))

        fr.close()
        fw.close()
291 |
    @staticmethod
    def parseFilter(_filter):
        """
        Parse a given filter and extract interesting values.

        Returns 'flow' = [keyword, a, b, c] with -1 for unset slots.
        NOTE(review): this copy of the function is corrupted -- several
        source lines (317, 342-350, 368, 382-388 of the original file)
        were swallowed, apparently by an HTML renderer eating everything
        between a '<' and the next '>'. The surviving fragments below do
        not parse; restore this function from upstream before editing.
        """
        flow = [-1, -1, -1, -1]

        if _filter:
            # Expecting filter of type 'keyword="values"'. A value can be
            # "=5", so do not just .split("=").
            index = _filter.find(":")

            if index > 0:
                key = _filter[0:index].strip()
                val = _filter[index+1:].strip()
            else:
                raise ValueError("Filter format is wrong. You gave: %s. "
                                 "However, expected is '%s'!" % (
                                     _filter, "key:\"values\""
                                 ))

            if key == DataManager.FILTERKEY_STARS and val:
                flow[0] = key

                # Expecting "=int", ">int", "int int" or ""
                for _val in val.split(" "):
                    # Ignore empty values
                    if _val:
                        # Check for "=int"
                        index = _val.find("=")
                        if index != -1:
                            # Found "="

                            # Ignore values found earlier.
                            flow[1] = int(_val[index+1:].strip())

                            # Break and ignore rest.
                            break

                        # Check for ">int"
                        index = _val.find(">")
                        if index != -1:
                            # Found ">"

                            flow[2] = int(_val[index+1:].strip())

                            continue

                        # NOTE(review): lines missing here ("<int" handling
                        # and the start of the sanity check); the next two
                        # lines are surviving fragments.
                        # Check for "= flow[3]
                ):
                    raise ValueError("Filter will not yield "
                                     "any results: >%d <%d." % (
                                         flow[2], flow[3]
                                     ))
                elif (
                    flow[1] == -1 and flow[2] == -1 and flow[3] == -1
                ):
                    raise ValueError(
                        "Filter could not be parsed. \nExample filters: "
                        "stars:\"=2\", stars:\">2 <5\", stars:\"<10\""
                    )

            elif key == DataManager.FILTERKEY_SIZE and val:
                flow[0] = key

                # Expecting ">int", "int int" or ""
                for _val in val.split(" "):
                    # Ignore empty values
                    if _val:
                        # Check for ">int"
                        index = _val.find(">")
                        if index != -1:
                            # Found ">"

                            flow[1] = int(_val[index+1:].strip())

                            continue

                        # NOTE(review): lines missing here as well ("<int"
                        # handling and the start of the sanity check).
                        # Check for "= flow[2] - 1:
                    raise ValueError(
                        "Filter will not yield any results: >%d <%d." % (
                            flow[1], flow[2]
                        )
                    )

                elif flow[1] == -1 and flow[2] == -1:
                    raise ValueError(
                        "Filter could not be parsed. \nExample filters: "
                        "size:\">50 <1000\", size=\"<500\", size:\">1000\""
                    )

            elif key == DataManager.FILTERKEY_EMPTY:
                flow[0] = key

            else:
                raise ValueError("Filter not known: %s" % (key))

        return flow
408 |
409 | def isEtag(self, _str):
410 | try:
411 | key, _ = _str.split(":")
412 | if key[2:] == self.KEY_ETAG:
413 | return True
414 |
415 | except ValueError:
416 | pass
417 |
418 | return False
419 |
420 | def isURL(self, _str):
421 | try:
422 | _, key, _ = _str.split(" ")
423 | if key.startswith(self.KEY_THIS_URL):
424 | return True
425 |
426 | except ValueError:
427 | pass
428 |
429 | return False
430 |
431 | def isNext(self, _str):
432 | try:
433 | _, key, _ = _str.split(" ")
434 | if key.startswith(self.KEY_NEXT_URL):
435 | return True
436 |
437 | except ValueError:
438 | pass
439 |
440 | return False
441 |
442 | def extractNextURL(self, generator):
443 | for l in generator:
444 | if self.isNext(l):
445 | return self.getVal(l, sep=' ', index=2)
446 |
447 | # No next URL found.
448 | raise IOError("next_url not found.")
449 |
450 | def getVal(self, _str, sep=':', index=1):
451 | """
452 | Return the val if _str includes one.
453 | Otherwise return False.
454 | """
455 | # "# " + self.KEY_SINCE + ": %d\n" % result[self.KEY_SINCE])
456 | # "# " + self.KEY_ETAG + ": %s\n" % result[self.KEY_ETAG])
457 | try:
458 | _arr = _str.split(sep)
459 | return _arr[index].strip()
460 | except ValueError:
461 | return False
462 |
--------------------------------------------------------------------------------
/github/exceptions.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 18, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
class RatelimitExceededException(BaseException):
    """Raised when the Github API rate limit has been used up."""

    def __str__(self):
        return "Your ratelimit is exceeded!"
9 |
class UnavailableRepoException(BaseException):
    """Raised when a repository cannot be fetched from Github."""

    def __str__(self):
        return "Repository is unavailable."
13 |
class DidNotCrawlRepoDetailsException(BaseException):
    """
    Raised when a detail value (e.g. stars, size) is requested from a
    repository whose detailed representation was never fetched.
    """
    def __init__(self, _key=None):
        self._key = _key

    def __str__(self):
        subject = "key '%s'" % self._key if self._key else "key"
        return (
            "This repository object does not contain the specified %s, "
            "because its detailed representation was not requested "
            "beforehand." % subject
        )
31 |
class KeyNotFoundException(BaseException):
    """Raised when a repository dictionary lacks a requested key."""

    def __init__(self, _key=None):
        self._key = _key

    def __str__(self):
        detail = ": %s" % self._key if self._key else "."
        return "This repository object does not contain the specified key" + detail
--------------------------------------------------------------------------------
/github/git_downloader.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Aug 13, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import subprocess
8 | import os
9 | from time import sleep
10 | import sys
11 | import signal
12 | import imp
13 |
14 | import shutil
15 |
16 | import pexpect
17 |
class GitDownloader(object):
    """
    Manages the download of git repositories.
    """
    def __init__(self, dir_path):
        """
        'dir_path': directory the repositories are cloned into. A trailing
        slash is appended when missing.
        """
        self.OUT_DIR = dir_path

        # endswith() also copes with an empty path, which crashed the old
        # OUT_DIR[-1] check with an IndexError.
        if not self.OUT_DIR.endswith("/"):
            self.OUT_DIR += "/"

        # Maps plugin path -> loaded module, see setSuccessHandler().
        self.plugins = {}
29 |
    def cloneAllFromFile(self, filename, linenumber=0, delete=False):
        """
        Clone repositories from links, that are read from 'filename', starting
        at linenumber 'linenumber'.

        If 'delete' is set, each repository is removed again after its
        success handlers ran, to free disk space. SIGTERM/SIGINT handlers
        are installed for the duration of the run so an interrupted crawl
        writes its resume position to OUT_DIR/cloning_interrupted.
        """
        clone_count = 0
        linenumber = int(linenumber)
        # Set by catchInterrupt() below to stop the main loop.
        self.interrupt = False

        if delete:
            print (
                "Cloning was called with 'delete' specified. After cloning "
                "and processing a repository, it will be deleted again to "
                "free space."
            )

        def catchInterrupt(signum, frame):
            """
            Catch CTRL-C/D and exit in a safe manner.
            """
            file_path = self.OUT_DIR + "cloning_interrupted"

            # Write linenumber to file, so that the user can continue there
            # next time. ('filename'/'linenumber'/'clone_count' are read
            # from the enclosing scope.)
            with open(file_path, 'w') as fh:
                fh.write(str(filename) + "\n")
                fh.write(str(linenumber) + "\n")

            print (
                "Stopped at line '%d'. Cloned %d repositories.\n"
                "Also wrote path of the link file "
                " and the linenumber to file '%s'."
            ) % (linenumber, clone_count, file_path)

            self.interrupt = True

        with open(filename, 'r') as fh:
            # If specified skip lines in links-file.
            if linenumber > 1:
                self.goToLine(fh, linenumber)

            # Catch process-kill signal.
            signal.signal(signal.SIGTERM, catchInterrupt)

            # Also catch Ctrl-C/D.
            signal.signal(signal.SIGINT, catchInterrupt)

            l = fh.readline()

            while l and not self.interrupt:
                out_dir = None
                try:
                    print "Trying link on line %d in file '%s'" % (linenumber,
                                                                   filename)
                    out_dir = self.cloneRepoLink(l.strip(), linenumber)
                    clone_count += 1
                    # If any success handler was specified by the user,
                    # execute it using the path of the
                    # downloaded repository as an argument.
                    try:
                        if not self.interrupt:
                            # If a plugin was specified to process
                            # the repository, it will be run.
                            self.runSuccessHandler(out_dir)

                    except OSError as err:
                        print err

                except pexpect.TIMEOUT:
                    print "Timed out."
                    print "Skipping..."

                # EOF = process finished in unhandled way.
                except pexpect.EOF:
                    clone_count += 1

                except (
                    RepositoryExistsException,
                    RepositoryDoesNotExistException,
                    CredentialsExpectedException
                ) as err:
                    print err.message
                    print "Skipping..."

                    try:
                        out_dir = err.out_dir

                    # CredentialsExpectedException does not
                    # have a 'out_dir' variable.
                    except:
                        pass

                finally:
                    linenumber += 1
                    l = fh.readline()

                    if delete and out_dir:
                        # Delete repository.
                        print "Deleting directory '%s'." % out_dir
                        shutil.rmtree(out_dir)

        # Remove backup signal handlers.
        # SIG_DFL is the standard signal handle for any signal.
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        signal.signal(signal.SIGINT, signal.SIG_DFL)

        if not self.interrupt:
            print "End of file reached, my work is done!"
138 |
    def cloneRepoLink(self, link, int_test):
        """
        Clone one repository from clone-URL 'link' into
        OUT_DIR/<author>_<repo> and return that directory.

        Raises CredentialsExpectedException, RepositoryExistsException or
        RepositoryDoesNotExistException for the matching git output. A
        clone that finishes without matching any pattern raises
        pexpect.EOF, which the caller (cloneAllFromFile) counts as
        success.
        ('int_test' is unused here; callers pass the current linenumber.)
        """
        msg = "Cloning repository: %s..." % link

        last_slash_index = link.rfind("/")
        second_last_index = link.rfind("/", 0, last_slash_index)

        # Assumes 'link' ends in ".git" (the last four chars are cut off).
        repo_name = link[last_slash_index + 1 : -4]
        author_name = link[second_last_index + 1 : last_slash_index]

        # reponame_authorname-format enables us to clone repositories of
        # the same name, but of different authors.
        out_dir = self.OUT_DIR + author_name + "_" + repo_name

        print "%s" % msg
        sys.stdout.flush()

        # Start cloning the repository from 'link' simply using 'git' from
        # the user's system PATH variable.
        # 1 hour max. per repository until timeout.
        process = pexpect.spawn("git", ["clone", link, out_dir], 3600)
        expectation = process.expect([
            'Username',
            'already exists and is not an empty directory',
            'does not exist'
        ])

        if expectation == 0:
            # git asked for credentials: private or invalid repository.
            raise CredentialsExpectedException()

        elif expectation == 1:
            raise RepositoryExistsException(
                process.before + process.after,
                out_dir
            )

        elif expectation == 2:
            raise RepositoryDoesNotExistException(
                process.before + process.after,
                out_dir
            )

        return out_dir
181 |
182 | def goToLine(self, fh, linenumber):
183 | """
184 | Go to 'linenumber' of a huge text file in an (memory-)efficient way.
185 | """
186 | if linenumber < 1:
187 | raise IOError(
188 | "Specified linenumber '%d' is smaller than 1." % linenumber
189 | )
190 |
191 | fh.seek(0, os.SEEK_SET)
192 |
193 | # Skip lines until desired line is reached.
194 | for _ in range(0, linenumber - 1):
195 | read = fh.readline()
196 | if read == "":
197 | # Empty string represents EOF.
198 | raise OutOfScopeException(msg="goToLine error: ",
199 | line=linenumber)
200 |
    def setSuccessHandler(self, package_path):
        """
        Load a python package, that will be executed each time a repository
        was successfully downloaded.

        The loaded module is stored in self.plugins and must expose a
        run(files) callable (see runSuccessHandler()). Raises OSError when
        loading fails.
        """
        # Get module infos from module in 'package_path'.
        # For that, we need to split the path into its package and the module.
        # Example: example/dir/module.py
        # -> Name: module
        # -> [Path: example/dir]
        # NOTE(review): assumes 'package_path' contains a "/" and ends in
        # ".py" -- other inputs yield a wrong name/dir split.
        try:
            plugin_name = package_path[package_path.rfind("/")+1:-3]
            plugin_dir = package_path[:package_path.rfind("/")]

            info = imp.find_module(plugin_name, [plugin_dir])

            self.plugins[package_path] = imp.load_module(plugin_name, *info)

        except Exception as err:
            # Re-raise uniformly as OSError; the original traceback is
            # lost (Python 2 has no exception chaining).
            raise OSError(err)
221 |
222 | def runSuccessHandler(self, dir_path):
223 | """
224 | Execute each specified success handler.
225 | """
226 | if self.plugins:
227 | _files = os.listdir(dir_path)
228 | for key in self.plugins:
229 | self.plugins[key].run(_files)
230 |
231 |
class CredentialsExpectedException(BaseException):
    """Raised when git prompts for a username/password during cloning."""

    def __init__(self, msg=None):
        self.message = msg if msg else "Login credentials were requested."
241 |
class RepositoryExistsException(BaseException):
    """
    Raised when the clone target directory already exists.

    'message' holds the git output (or a default text), 'out_dir' the
    already-existing checkout directory, or None if unknown.
    """
    def __init__(self, msg=None, out_dir=None):
        if msg:
            self.message = msg

        else:
            self.message = (
                "Repository does exist already."
            )

        # Robustness fix: always define the attribute, so reading
        # err.out_dir can no longer raise AttributeError.
        self.out_dir = out_dir
254 |
class RepositoryDoesNotExistException(BaseException):
    """
    Raised when git reports that the remote repository does not exist.

    'message' holds the git output (or a default text), 'out_dir' the
    intended checkout directory, or None if unknown.
    """
    def __init__(self, msg=None, out_dir=None):
        if msg:
            self.message = msg

        else:
            self.message = (
                "Repository is not accessible on GitHub.com."
            )

        # Robustness fix: always define the attribute, so reading
        # err.out_dir can no longer raise AttributeError.
        self.out_dir = out_dir
267 |
268 |
269 |
class OutOfScopeException(BaseException):
    """Raised by goToLine() when the requested line lies past EOF."""

    def __init__(self, msg=None, line=None):
        if msg:
            self.message = msg
            if line:
                self.message += "Line %d is out of scope." % line
        else:
            self.message = (
                "goToLine() was called with a linenumber, "
                "which was out of scope."
            )

    def __str__(self):
        return self.message
286 |
--------------------------------------------------------------------------------
/github/oauthManager.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 21, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import requests as r
8 | import getpass
9 | import json
10 | import os
11 |
class OAuthManager(object):
    """
    Manages creation and loading/parsing of authorization data for Github.com.
    """

    # Keys of the dictionary handed out by getAuthData().
    KEY_OAUTH = "OAuth"
    KEY_USER_AGENT = "user_agent"

    def __init__(self, filename=None):
        """
        'filename': path of the file holding the OAuth token and the
        user agent, one per line.
        """
        self.FILE = filename
        # Filled lazily, see getAuthData().
        self.AUTH = None
26 |
27 | def getAuthData(self):
28 | if not self.AUTH:
29 | # OAuth not found, try to parse it from file.
30 | self.parseAuthentication(self.FILE)
31 |
32 | return self.AUTH
33 |
34 | def parseAuthentication(self, filename):
35 | try:
36 | with open (filename, 'r') as fh:
37 | # Parse first line, should be OAuth token.
38 | oauth = fh.readline().strip()
39 | # Parse second line, should be user agent.
40 | user_agent = fh.readline().strip()
41 |
42 | if oauth == "" or user_agent == "":
43 | raise AuthException()
44 |
45 | self.setAuth(oauth, user_agent)
46 |
47 | except IOError:
48 | raise AuthFileNotFoundException()
49 |
    def createAuth(self):
        """
        Interactively create and persist the authentication file
        (self.FILE): the user either pastes an existing OAuth token or one
        is created via the Github API; token and username are then written
        to disk and installed via setAuth().
        """
        print (
            "Authentication file not found! This is probably your first use.\n"
            "We need to install an OAuth token for this crawler to work.\n"
            "This token does not need ANY access to your Github account.\n"
            "You can create one manually on https://github.com/settings/tokens\n"
            "or let me create one for you. However, you will need to specify\n"
            "your github username and password once. It will not be remembered "
            "or transfered somewhere else than github."
        )

        manual_oauth = False
        user_input = self.getValidUserInput(
            "Do you want to enter one manually? [y/N]",
            ["y", "Y", "N", "n"],
            default="N"
        )

        if user_input.lower() == "y" :
            manual_oauth = True

        oauth = None
        username = None

        if manual_oauth:
            oauth = raw_input("Please enter your OAuth token: ").strip()
            username = raw_input("Please enter your Github email: ").strip()
        else:
            print (
                "Alright, let's create an OAuth token for your "
                "Github account and this application!"
            )

            oauth, username = self.createOAuthUntilSuccess()

        # Persist token and username, one per line (the format
        # parseAuthentication() expects).
        with open(self.FILE, 'w') as fh:
            fh.write(oauth.strip() + "\n")
            fh.write(username.strip() + "\n")

        print (
            "OAuth file \"authentication\" successfully written!\n"
            "Future executions will automatically read your authentication data"
            " from that file."
        )

        self.setAuth(oauth, username)
96 |
    def createOAuthUntilSuccess(self):
        """
        Repeat asking the user for username/password, until a valid
        combination is specified. This data will be used to create an OAuth
        token for the 'username' account.

        Returns an (oauth, username) tuple.
        NOTE(review): despite the name there is no loop here -- retrying
        on bad credentials happens inside processOAuthResponse().
        """
        username = raw_input("Please enter your Github email: ")
        password = getpass.getpass("Please enter your Github password: ")

        oauth = self.createOAuthToken(username, password)

        return (oauth, username)
109 |
    def createOAuthToken(self, username, password, header=None):
        """
        Request Github API for OAuth token creation.
        'header' can be used to pass extra headers, which are necessary for
        two-factor authentication.

        Returns the new token string; error responses are handled by
        processOAuthResponse(), which may raise OAuthCreationException.
        """
        url = "https://api.github.com/authorizations"

        # Token without any scopes.
        payload = {
            "scopes": [],
            "note": "githubSpider token."
        }

        resp = r.post(url,
                      auth=(username, password),
                      data=json.dumps(payload),
                      headers=header)

        oauth = self.processOAuthResponse(resp, username, password)

        return oauth
131 |
132 | def processOAuthResponse(self, resp, username, password):
133 | decoded = json.loads(resp.text)
134 | oauth = None
135 |
136 | if resp.status_code == 201:
137 | # Success.
138 | print (
139 | "OAuth successfully created in file 'authentication'.\n"
140 | "Remember: Do not transfer your OAuth token to anybody!"
141 | )
142 | oauth = decoded["token"]
143 |
144 | elif resp.status_code == 422:
145 | # OAuth already exists.
146 | print (
147 | "Error: OAuth already exists for this application.\n"
148 | "Visit https://github.com/settings/tokens and delete\n"
149 | "the githubSpider token. Then, please try again."
150 | )
151 |
152 | elif resp.status_code == 401:
153 | # Bad credentials or two-factor authentication.
154 | # Check for two-factor authentication header:
155 | # "X-GitHub-OTP: required; :2fa-type",
156 | # where 2fa-type = "sms" or other case
157 |
158 | KEY_TWO_FACTOR = "X-GitHub-OTP"
159 | if KEY_TWO_FACTOR in resp.headers:
160 | two_factor_header = resp.headers[KEY_TWO_FACTOR]
161 |
162 | # Check if two-factor-authentication is done via SMS or App.
163 | method = None
164 | if two_factor_header.find("sms") != -1:
165 | method = "via SMS"
166 | else:
167 | method = "via your Github application"
168 |
169 | print (
170 | "You setup two-factor authentication. You should get "
171 | "the one-time password %s shortly." % method
172 | )
173 |
174 | two_factor_pw = raw_input(
175 | "Please enter your one-time password: "
176 | )
177 |
178 | header = {KEY_TWO_FACTOR: two_factor_pw}
179 |
180 | # Query OAuth creation again, this time send username, password
181 | # and one-time password.
182 | oauth = self.createOAuthToken(username, password, header)
183 |
184 | else:
185 | # Bad credentials.
186 | print (
187 | "Error: Bad credentials, try again."
188 | )
189 |
190 | self.createOAuthUntilSuccess()
191 |
192 | elif resp.status_code == 403:
193 | # API rate limit exceeded.
194 | print (
195 | "Your Github API rate limit is already exceeded. "
196 | "Cannot query API for OAuth creation until rate limit is reset."
197 | )
198 |
199 | if not oauth:
200 | raise OAuthCreationException()
201 |
202 | return oauth
203 |
204 | def getValidUserInput(self, msg, valid_answers, default=None):
205 | """
206 | Ask user to input data until he entered a valid input.
207 | If 'default' is given, it will be returned on no user input (=user
208 | just input "\n").
209 | """
210 | if default:
211 | valid_answers.append("")
212 |
213 | user_input = raw_input(msg)
214 | while not self.isValidUserInput(user_input, valid_answers):
215 | user_input = raw_input(msg)
216 |
217 | if user_input == "" and default:
218 | user_input = default
219 |
220 | return user_input
221 |
222 | def isValidUserInput(self, user_input, valid_answers):
223 | for answer in valid_answers:
224 | if user_input == answer:
225 | return True
226 |
227 | return False
228 |
229 | def setAuth(self, oauth, user_agent):
230 | self.testAuth(oauth)
231 |
232 | self.AUTH = {
233 | self.KEY_OAUTH: oauth,
234 | self.KEY_USER_AGENT: user_agent
235 | }
236 |
    def testAuth(self, oauth_token):
        """
        Check 'oauth_token' by querying the Github rate_limit endpoint.
        On a non-200 response, offer to delete the authentication file,
        then raise AuthException (file deleted) or NoCredentialsException
        (file kept). Returns normally on a 200 response.
        """
        url = "https://api.github.com/rate_limit"
        header = {
            "Authorization": "token %s" % (oauth_token)
        }

        resp = r.get(url, headers=header)

        if resp.status_code != 200:
            print (
                "Found bad credentials in authentication "
                "file 'authentication'."
            )

            user_input = self.getValidUserInput(
                "Do you want to delete it? [Y/n]",
                ["y", "Y", "N", "n"],
                default="Y"
            )

            if user_input.lower() == "y":
                msg = "Deleting authentication file..."
                print "%s\r" % (msg),

                os.remove(self.FILE)

                print "%s Done." % (msg)

                raise AuthException()

            else:
                print "You chose to not delete the authentication data."

                raise NoCredentialsException()
271 |
272 |
273 | ### Exceptions
class AuthException(BaseException):
    """Raised when the authentication file holds no usable credentials."""

    def __str__(self):
        return "No allowed authentication found in file 'authentication'."
277 |
class AuthFileNotFoundException(BaseException):
    """Raised when the 'authentication' file cannot be opened."""

    def __str__(self):
        return "Authentication file not found. Expecting file 'authentication'."
281 |
class OAuthCreationException(BaseException):
    """Raised when the Github API did not hand out an OAuth token."""

    def __str__(self):
        return "Failed to create OAuth token."
285 |
class NoAuthException(BaseException):
    """Raised when auth data is requested before it was parsed or set."""

    def __str__(self):
        return (
            "No OAuth or user agent available. "
            "Did you specify or parse them before?"
        )
292 |
class NoCredentialsException(BaseException):
    """Raised when the user declined to provide or replace credentials."""

    def __str__(self):
        return "No credentials given."
--------------------------------------------------------------------------------
/github/repository.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 17, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import json
8 | from github.exceptions import *
9 |
class Repository(object):
    """
    Class representing a repository from Github.
    """

    def __init__(self, _dict):
        """
        '_dict': the repository data, either as a JSON string or as an
        already-decoded dict. Raises Exception for any other type.
        """
        if isinstance(_dict, basestring):
            # '_dict' is given as a string.
            self._dict = json.loads(_dict)

        elif isinstance(_dict, dict):
            # '_dict' is given as a dict (=already json-decoded).
            self._dict = _dict

        else:
            raise Exception("Given value for '_dict' is not valid: '%s'." % (
                _dict
            ))

    def filter(self, _filter):
        """
        If all key,values match, return True. False otherwise.
        Comparison is case-insensitive on the string forms; keys missing
        from this repository never match.
        """
        for key in _filter:
            if key not in self._dict:
                return False

            if str(self._dict[key]).lower() != str(_filter[key]).lower():
                return False

        return True

    def __str__(self):
        return json.dumps(self._dict)

    def __getitem__(self, _key):
        return self.getValue(_key)

    def getValue(self, _key):
        """
        General method to acquire values associated with '_key'.
        Raises KeyNotFoundException when the key is absent.
        """
        if _key in self._dict:
            return self._dict[_key]
        else:
            raise KeyNotFoundException(_key)

    def _getDetail(self, _key):
        """
        Fetch a key that only exists in the detailed repository
        representation; a missing key is translated into
        DidNotCrawlRepoDetailsException.
        (Consolidates the previously duplicated try/except blocks of
        getStars()/getSize().)
        """
        try:
            return self.getValue(_key)
        except KeyNotFoundException:
            raise DidNotCrawlRepoDetailsException(_key)

    def getStars(self):
        return self._getDetail("stargazers_count")

    def getSize(self):
        return self._getDetail("size")

    def getURL(self):
        return self.getValue("url")

    def getDict(self):
        return self._dict
--------------------------------------------------------------------------------
/github/repository_list.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 17, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 | import json
7 | from github.repository import Repository
8 | from github.exceptions import UnavailableRepoException
9 |
class RepositoryList(object):
    """
    Class representing a list of repositories from Github.
    """

    def __init__(self, url=None, etag=None, repos="[]", next_url=None):
        '''
        Constructor

        'repos' may be a JSON string or an already-decoded list, it is
        normalised by setRepos(). 'url', 'etag' and 'next_url' are the
        crawl metadata belonging to this page of repositories.
        '''
        self.url = url
        self.etag = etag
        self.next_url = next_url

        self.setRepos(repos)
24 |
25 | def filter(self, session, _filter):
26 | """
27 | Remove repositories from list, that do not match filter.
28 | '_filter' should be a dictionary stating a value for each defined key.
29 | e.g. {"language": "PHP", "stargazers_count": 5}.
30 |
31 | Additionally, we get more details for each repository, because
32 | we query each repository individually.
33 | """
34 | filtered_repos = []
35 | for repo in self.repos:
36 | # Query repo and check filter.
37 | try:
38 | full_repo = session.getRepo(repo.getURL())
39 | if full_repo.filter(_filter):
40 | filtered_repos.append(full_repo)
41 |
42 | except UnavailableRepoException:
43 | # Skip repository
44 | pass
45 |
46 | self.repos = filtered_repos
47 |
48 | def __iadd__(self, other):
49 | self.repos.append(other)
50 | return self
51 |
52 | def __str__(self):
53 | """
54 | Get textual representation of list of repositories.
55 | """
56 | repos_decoded = []
57 |
58 | for repo in self.repos:
59 | repos_decoded.append(repo.getDict())
60 |
61 | return json.dumps(repos_decoded)
62 |
63 | def __iter__(self):
64 | """
65 | Iterate over the list of repositories in self.repos.
66 | """
67 | ctr = 0
68 | while ctr < len(self.repos):
69 | yield self.repos[ctr]
70 | ctr += 1
71 |
72 | def __len__(self):
73 | return len(self.repos)
74 |
75 | def isEmpty(self):
76 | return True if not self.repos else False
77 |
78 | def getURL(self):
79 | return self.url
80 |
81 | def setURL(self, url):
82 | self.url = url
83 |
84 | def getEtag(self):
85 | return self.etag
86 |
87 | def setETag(self, etag):
88 | self.etag = etag
89 |
90 | def getNextURL(self):
91 | return self.next_url
92 |
93 | def setNextURL(self, next_url):
94 | self.next_url = next_url
95 |
96 | def setRepos(self, repos):
97 | self.repos = []
98 |
99 | if isinstance(repos, basestring):
100 | # 'repos' is given as a string.
101 | repos = json.loads(repos)
102 |
103 | self.repos = []
104 | for _dict in repos:
105 | # Transform each dictionary into a Repository object.
106 | self.repos.append(Repository(_dict))
107 |
108 | return True
109 |
110 | elif isinstance(repos, list):
111 | # 'repos' is given as a list (=already json-decoded).
112 | # Check if the list is populated with dictionaries or
113 | # Repository objects.
114 | for _obj in repos:
115 | if isinstance(_obj, dict):
116 | # _obj is dict, transform it to Repository.
117 | self.repos.append(Repository(_obj))
118 |
119 | elif isinstance(_obj, Repository):
120 | # _obj already is Repository, just append it.
121 | self.repos.append(_obj)
122 |
123 | return True
124 |
125 | raise Exception("Given value for 'repos' is not valid: '%s'." % (
126 | repos
127 | ))
--------------------------------------------------------------------------------
/github/session.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 17, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | import requests
8 | import json
9 |
10 | from exceptions import *
11 | from github.repository_list import RepositoryList
12 | from github.repository import Repository
13 | from time import sleep
14 |
class Session(object):
    """
    This class saves the user's authorization infos and is able to do requests
    to the Github API on behalf of the authorized user.
    """

    # Seconds to wait before retrying after a connection error.
    SLEEP = 0.5

    URL_API = "https://api.github.com"
    URL_REPOS = URL_API + "/repositories"
    URL_SEARCH = URL_API + "/search/repositories"
    URL_RATE_LIMIT = URL_API + "/rate_limit"

    KEY_ETAG = "ETag"
    KEY_RL_REMAIN = "X-RateLimit-Remaining"

    # Github answers 403 both when rate-limited and for blocked repos.
    STATUS_UNAVAILABLE = 403

    def __init__(self, OAuth=None, user_agent=None):
        """
        Setup session.

        'OAuth' is the personal access token, 'user_agent' the value
        for the User-Agent header. Without a token the session falls
        back to unauthenticated requests (much lower rate limit).
        """
        self.HEADERS = {}

        if OAuth and user_agent:
            self.setOAuth(OAuth)
            self.setUserAgent(user_agent)

            self.HEADERS = {
                'User-Agent': user_agent,
                'Authorization': "token %s" % OAuth
            }

        elif not OAuth:
            print (
                "No authorization token given, continuing unauthenticated.\n"
                "Unauthenticated requests are limited to 60 per hour, while\n"
                "authenticated requests are limited to 5000 per hour."
            )

    def getRatelimit(self):
        """
        Request Github API for ratelimit info for this session.

        Returns the "resources" dictionary of the rate-limit response.
        Raises Exception when Github does not answer with status 200.
        """
        resp = self.sessionRequestGet(self.URL_RATE_LIMIT)
        _dict = json.loads(resp.text)

        if resp.status_code == 200:
            return _dict["resources"]
        else:
            # BUGFIX: removed the unreachable 'return _dict' that
            # followed this if/else.
            raise Exception("Encountered a problem. Github answered with"
                            ":\n%s" % _dict)

    def getRepos(self, since=0, url=None):
        """
        Get a list of repositories.

        Either continues at 'url' (a pagination link) or starts at
        repository id 'since'. Returns a RepositoryList.
        """
        if not url:
            url = self.URL_REPOS + "?since=" + str(since)

        response = self.sessionRequestGet(url)

        etag = response.headers[self.KEY_ETAG]
        repos = json.loads(response.text)
        # The last page carries no "next" link - yield None instead of
        # crashing with a KeyError.
        next_url = response.links.get("next", {}).get("url")

        return RepositoryList(url, etag, repos, next_url)

    def getRepo(self, url):
        """
        Query a single repository, returned as a Repository object.
        """
        response = self.sessionRequestGet(url)

        return Repository(response.text)

    def update(self, repository_list):
        """
        Query API for an updated list of 'repository_list'.

        Uses the saved ETag, so Github only answers with data (status
        200) when something changed. Returns True when the list was
        updated, False otherwise.
        """
        header = {"If-None-Match": repository_list.getEtag()}
        response = self.sessionRequestGet(repository_list.getURL(), header)

        if response.status_code == 200:
            # Found update.
            repository_list.setETag(response.headers[self.KEY_ETAG])
            repository_list.setRepos(json.loads(response.text))
            repository_list.setNextURL(
                response.links.get("next", {}).get("url")
            )

            return True

        return False

    def sessionRequestGet(self, url, headers=None):
        """
        Send a get-request with all session-headers.

        Extra 'headers' are merged on top of the session headers.
        Retries forever on connection errors (sleeping self.SLEEP
        seconds in between). Raises RatelimitExceededException when
        the rate limit is exhausted, UnavailableRepoException for any
        other 403 answer.
        """
        request_headers = self.HEADERS
        if headers:
            request_headers = self.HEADERS.copy()
            request_headers.update(headers)

        # Retry loop instead of recursion: long outages no longer grow
        # the call stack.
        while True:
            try:
                response = requests.get(url, headers=request_headers)

            except requests.exceptions.ConnectionError as err:
                print(err)
                # BUGFIX: '%d' rendered SLEEP=0.5 as "0"; also sleep
                # for self.SLEEP instead of a hardcoded 0.5.
                print("Sleeping %s seconds and retrying with same URL."
                      % self.SLEEP)
                sleep(self.SLEEP)
                continue

            if response.status_code == self.STATUS_UNAVAILABLE:
                # BUGFIX: header values are strings, so the original
                # comparison '== 0' was never true and rate-limiting
                # raised the wrong exception. Convert before comparing.
                remaining = response.headers.get(self.KEY_RL_REMAIN)
                if remaining is not None and int(remaining) == 0:
                    # Ratelimit 0 reached.
                    raise RatelimitExceededException()
                else:
                    # Unavailable resource.
                    raise UnavailableRepoException()

            return response

    def addOAuth(self, url):
        """
        Add the OAuth get-parameter to the specified 'url'.
        """
        # BUGFIX: the token is stored as the plain string self.OAuth
        # (see setOAuth()), not as a dict self.OAUTH["token"].
        token_query = "access_token=" + self.OAuth
        if url.find('?') != -1:
            url += "&" + token_query
        else:
            url += "?" + token_query

        return url

    def setOAuth(self, OAuth):
        # Personal access token used for authenticated requests.
        self.OAuth = OAuth

    def setUserAgent(self, user_agent):
        self.user_agent = user_agent

    def setPerPage(self, per_page):
        """
        Set the requested page size. Raises ValueError for 0/invalid.
        """
        per_page = int(per_page)

        if per_page:
            self.per_page = per_page
        else:
            raise ValueError("'per_page' parameter could not be set.")
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 4, 2015
3 |
4 | @author: Tommi Unruh
5 | '''
6 |
7 | from crawler import Crawler
8 | import sys
9 | from args_parser import ModeArgsParser
10 | from github.git_downloader import GitDownloader, OutOfScopeException
11 | import json
12 |
# Command line mode names (the first positional argument).
ARGS_HELP = "help"
ARGS_RATELIMIT = "ratelimit"
ARGS_CRAWL_REPOS = "crawl"
ARGS_CLONE_REPOS = "clone"
ARGS_EXTRACT_KEYDATA = "extract"
ARGS_EXTRACTREPOS_FILTERED = "filter"

# Default crawl filter: only keep PHP repositories.
REPO_KEY_LANGUAGE = "language"
DEFAULT_REPO_FILTER = {REPO_KEY_LANGUAGE: "PHP"}

# Keys accepted in a user-supplied filter dictionary - the fields of
# a repository object as returned by the Github API.
REPO_ALLOWED_KEYS = [
    'issues_url', 'stargazers_count', 'forks_url', 'mirror_url',
    'subscription_url', 'notifications_url', 'collaborators_url',
    'updated_at', 'private', 'pulls_url', 'issue_comment_url',
    'labels_url', 'has_wiki', 'full_name', 'owner', 'statuses_url',
    'id', 'keys_url', 'description', 'subscribers_count',
    'tags_url', 'network_count', 'downloads_url', 'assignees_url',
    'contents_url', 'has_pages', 'git_refs_url',
    'open_issues_count', 'clone_url', 'watchers_count',
    'git_tags_url', 'milestones_url', 'languages_url', 'size',
    'homepage', 'fork', 'commits_url', 'releases_url',
    'issue_events_url', 'archive_url', 'comments_url',
    'events_url', 'contributors_url', 'html_url', 'forks',
    'compare_url', 'open_issues', 'git_url', 'svn_url',
    'merges_url', 'has_issues', 'ssh_url', 'blobs_url',
    'git_commits_url', 'hooks_url', 'has_downloads', 'watchers',
    'name', 'language', 'url', 'created_at', 'pushed_at',
    'forks_count', 'default_branch', 'teams_url', 'trees_url',
    'branches_url', 'subscribers_url', 'stargazers_url']
42 |
def main(argv):
    """
    Entry point of execution. Handles program arguments and
    acts accordingly.

    'argv' is the sys.argv-style argument vector: argv[1] is the mode,
    argv[2:] its options.
    """
    auth_file = "authentication"

    # Setup command line arguments.
    parser = ModeArgsParser()
    setupArgs(parser)

    flow = None
    crawler = None

    try:
        flow = parser.parseArgs(argv[1], argv[2:])

        # Check if authentication file was specified.
        if "a" in flow:
            auth_file = flow["a"]
        elif "auth" in flow:
            auth_file = flow["auth"]

    except:
        # Any parsing problem (including a missing mode argument):
        # show usage and stop.
        parser.printHelp(argv[0])
        sys.exit()

    # Evaluate program arguments and start program.
    if flow[parser.KEY_MODE] == ARGS_HELP:
        parser.printHelp(argv[0])

    if flow[parser.KEY_MODE] == ARGS_RATELIMIT:
        crawler = Crawler(auth_file)
        _dict = crawler.getRateLimit()
        # Single-argument print(...) keeps identical output on
        # Python 2 and 3.
        print("Rate Limits:")
        print("core: %s" % (_dict["core"],))
        print("search: %s" % (_dict["search"],))

    elif flow[parser.KEY_MODE] == ARGS_CRAWL_REPOS:
        crawler = Crawler(auth_file)

        # -ds/--dontskip disables skipping of already crawled repos.
        skip = not ("ds" in flow or "dontskip" in flow)

        # Determine the repository filter; fall back to the default if
        # none was given or the given one could not be parsed.
        # BUGFIX: previously a parse error left '_filter' unbound while
        # a 'finally' block still started the crawl, masking the real
        # error with a NameError.
        _filter = DEFAULT_REPO_FILTER
        try:
            if "f" in flow:
                _filter = convertIntoDict(flow["f"])
            elif "filter" in flow:
                _filter = convertIntoDict(flow["filter"])

        except Exception as err:
            print(err)

        crawler.crawlRepos(flow["in"], skip, _filter=_filter)

    elif flow[parser.KEY_MODE] == ARGS_EXTRACT_KEYDATA:
        if "k" in flow or "key" in flow:
            # Accept both the short and the long option name.
            key = flow["k"] if "k" in flow else flow["key"]
            Crawler.getKeyFromCrawlData(flow["in"], flow["out"], key)
        else:
            Crawler.getKeyFromCrawlData(flow["in"], flow["out"])

    elif flow[parser.KEY_MODE] == ARGS_EXTRACTREPOS_FILTERED:
        # Accept both the short and the long option name.
        _filter = flow["f"] if "f" in flow else flow["filter"]
        Crawler.extractReposFiltered(flow["in"], flow["out"], _filter)

    # cloning repos
    elif flow[parser.KEY_MODE] == ARGS_CLONE_REPOS:
        downloader = GitDownloader(flow["out"])

        # Line number to resume cloning from (0 = start of file).
        try:
            _line = flow["l"]
        except:
            try:
                # NOTE(review): "_line" looks odd - should this be
                # "line" (the long option)? Kept as-is to preserve
                # behavior; confirm against setupArgs().
                _line = flow["_line"]
            except:
                _line = 0

        delete = "d" in flow or "delete" in flow

        # Optional success-handler plugin (-p/--plugin).
        plugin = False
        try:
            downloader.setSuccessHandler(flow["p"])
            plugin = True

        except Exception:
            try:
                downloader.setSuccessHandler(flow["plugin"])
                plugin = True
            except Exception:
                pass

        if delete and not plugin:
            print (
                "A combination of -d/--delete without -p/--plugin is "
                "not allowed."
            )
            sys.exit()

        try:
            downloader.cloneAllFromFile(
                flow["in"],
                linenumber=_line,
                delete=delete
            )

        except OutOfScopeException:
            print (
                "The specified line number '%s' in parameter '-l/--line' is "
                "out of scope for file '%s'." % (_line, flow["in"])
            )
174 |
def convertIntoDict(_str, allowed_keys=None):
    """
    Parse '_str' as a JSON dictionary and validate its keys.

    'allowed_keys' is the sequence of permitted dictionary keys; it
    defaults to REPO_ALLOWED_KEYS (the fields of a Github repository).

    Returns the parsed dictionary.
    Raises ValueError when '_str' is not a JSON-encoded dictionary or
    when it contains a key that is not allowed.
    """
    if allowed_keys is None:
        allowed_keys = REPO_ALLOWED_KEYS

    try:
        _dict = json.loads(_str)

    except Exception:
        # Narrowed from a bare 'except:'; any json.loads failure just
        # means "not a valid filter".
        _dict = None

    if isinstance(_dict, dict):
        for key in _dict:
            if key not in allowed_keys:
                raise ValueError("Dictionary key '%s' is not a valid "
                                 "key of a repository" % key)

        return _dict

    raise ValueError("Filter should be specified as a "
                     "JSON-decoded python dictionary.")
199 |
200 | def setupArgs(parser):
201 | """
202 | Setup command line arguments combinations.
203 | """
204 | # Ratelimit: ratelimit
205 | explanation = "Check your ratelimit."
206 | parser.addArgumentsCombination(ARGS_RATELIMIT,
207 | optional_args=[["a=", "auth"]],
208 | explanation=explanation)
209 |
210 | # Help: help
211 | explanation = "Print this help."
212 | parser.addArgumentsCombination(ARGS_HELP, explanation=explanation)
213 |
214 | # Crawl repos: crawl -in file -out file (-s/--skip, -a/--auth, -f/--filter)
215 | explanation = (
216 | "Crawl repositories from Github.com "
217 | "to file specified with \"-in\". "
218 | "-ds/--dontskip can be used to first check for updates "
219 | "for already crawled repositories in file. "
220 | "The input file will be renamed to input_file_backup. "
221 | "Use -f/--filter followed by a python dictionary to "
222 | "specify a filter to only save information of repositories "
223 | "which apply to that filter. "
224 | "The default filter is {\"language\": \"PHP\"}, but any "
225 | "python dictionary is allowed."
226 | )
227 | parser.addArgumentsCombination(
228 | ARGS_CRAWL_REPOS,
229 | [["in=", None]],
230 | [
231 | ["ds", "dontskip"],
232 | ["a=", "auth"],
233 | ["f=", "filter"]
234 | ],
235 | explanation=explanation
236 | )
237 |
238 | explanation = (
239 | "Extract the value associated with '-k/--key' from "
240 | "crawled repositories in '-in' and write it to '-out'."
241 | "Default for 'k/--key' is 'clone_url', which "
242 | "specifies the URL for cloning a repository. "
243 | "However, '-k/--key' can take a list of keys to extract, "
244 | "separated by commas. Example: -k \"id, clone_url\""
245 | )
246 | # Extract key data: extract -in file -out file (-k/--key)
247 | parser.addArgumentsCombination(ARGS_EXTRACT_KEYDATA,
248 | [["in=", None], ["out=", None]],
249 | [["k=", "key"]],
250 | explanation=explanation
251 | )
252 |
253 | explanation = (
254 | "Filter the repositories from file '-in' and write "
255 | "filtered repositories to '-out'. '-f/--filter' specifies "
256 | "the filter criterion. Currently supported: stars:=x, stars:>x "
257 | "stars:x x, size:x