├── .gitignore
├── README.md
├── ghrabber.py
└── test
├── data
├── lastpage.html
└── search.html
├── test_ghgrab.py
└── tutils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python object files
2 | *.py[cd]
3 | *.swp
4 | /data
5 | .coverage
6 |
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Grab all files matching a search specification from Github.
2 |
3 | Downloaded files are written to files named user.repository. Existing files
4 | with the same name are skipped, so you can stop and resume a grab reasonably
5 | efficiently.
6 |
7 | Note that this is a Quick Hack that may break whenever Github changes even
8 | minor features on the site.
9 |
10 |
11 | ### Usage
12 |
13 | Grab all .bash_history files:
14 |
15 | ./ghrabber.py "path:.bash_history"
16 |
17 | Grab all files with extension of .key:
18 |
19 | ./ghrabber.py "extension:key"
20 |
21 |
22 | ### Installation
23 |
24 | Check out this code and install the dependencies:
25 |
26 | git clone git@github.com:cortesi/ghrabber.git
27 | cd ghrabber
28 | pip install beautifulsoup requests
29 |
30 | If pip is not installed, try to install it with `easy_install pip` first.
31 |
--------------------------------------------------------------------------------
/ghrabber.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import os.path
4 | import urllib
5 |
6 | import BeautifulSoup
7 | import requests
8 |
9 | # Offset of path component to delete when converting to raw
10 | REM_OFFSET = 2
11 | RAWBASE = "https://raw.github.com/"
12 | SEARCH = "https://github.com/search"
13 |
14 |
def extract(data):
    """
    Parse a Github code-search results page and yield the repository
    blob path ("/user/repo/blob/sha/file") for each search hit.
    """
    soup = BeautifulSoup.BeautifulSoup(data)
    for title in soup.findAll("p", {"class": "title"}):
        links = title.findAll("a")
        # Each result title holds two links; the second one points at
        # the matched file itself.
        yield links[1].get("href")
21 |
22 |
def is_last_page(data):
    """
    Return True if a search results page contains no result entries,
    which means we have paged past the final page of results.
    """
    s = BeautifulSoup.BeautifulSoup(data)
    # Result entries are marked by <p class="title"> elements; "not"
    # preserves the original truthiness test on the found tag.
    return not s.find("p", {"class": "title"})
29 |
30 |
def raw_url(p):
    """
    Convert a Github blob path ("/user/repo/blob/sha/file") into the
    corresponding raw-download URL.
    """
    parts = p.strip("/").split("/")
    # Drop the path component at REM_OFFSET (the "blob" marker); the
    # raw host's URLs do not include it.
    kept = parts[:REM_OFFSET] + parts[REM_OFFSET + 1:]
    return RAWBASE + "/".join(kept)
36 |
37 |
def make_fname(p):
    """
    Derive the local output filename ("user.repository") from a Github
    blob path.
    """
    user, repo = p.strip("/").split("/")[:2]
    return "%s.%s" % (user, repo)
42 |
43 |
def get(query, relative, outdir, listonly=False):
    """
    Run a Github code search and fetch every matching file.

    query    - Github code search query string.
    relative - if not None, a path joined onto each match's raw URL so
               that a file relative to the match is fetched instead.
    outdir   - directory downloaded files are written to.
    listonly - if True, print the raw URLs instead of downloading.

    Pages through results until an empty results page is seen. Files
    are written to outdir as "user.repository"; existing files are
    skipped, so an interrupted run can be resumed.
    """
    page = 1
    while 1:
        params = dict(
            q = query,
            type = "Code",
            p = page
        )
        r = requests.get(SEARCH, params=params)
        # A page with no result entries means we've run off the end.
        if is_last_page(r.content):
            print "** No more results"
            break
        for u in extract(r.content):
            ru = raw_url(u)
            if relative:
                # Resolve the user-supplied relative path against the
                # match's raw URL.
                ru = urllib.basejoin(ru, relative)
            if listonly:
                print ru
            else:
                fn = make_fname(u)
                outpath = os.path.join(outdir, fn)
                if os.path.exists(outpath):
                    # Already fetched on a previous run - skip it.
                    print "Skipping ", fn
                else:
                    ret = requests.get(ru)
                    if ret.status_code == 200:
                        print "Fetching ", ru
                        f = open(outpath, "w")
                        f.write(ret.content)
                        f.close()
                    else:
                        # Non-200: report and move on; don't abort the
                        # whole grab for one failed file.
                        print "Error", fn, ret.status_code
        page += 1
77 |
78 |
# Command-line entry point.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-l", action="store_true",
        help="Just list results"
    )
    parser.add_argument(
        "-o", type=str, default=".",
        help="Output directory. Created if it doesn't exist."
    )
    parser.add_argument(
        "-r", type=str, default=None,
        help="Grab a path relative to the match"
    )
    parser.add_argument("query", type=str, help="Github Code Search query")
    args = parser.parse_args()
    # Make sure the output directory exists before fetching anything.
    if not os.path.exists(args.o):
        os.makedirs(args.o)
    try:
        get(args.query, args.r, args.o, listonly=args.l)
    except KeyboardInterrupt:
        # Ctrl-C exits quietly; an interrupted grab can be resumed
        # because existing output files are skipped.
        pass
101 |
--------------------------------------------------------------------------------
/test/data/lastpage.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Search · path:viminfo · GitHub
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
97 |
98 |
99 |
131 |
132 |
133 |
134 |
146 |
We've found 53 code results
147 |
148 |
149 |
150 |
151 |
152 |
153 |
156 |
157 |
158 |
159 |
160 |
Loading…
161 |
162 |
163 |
164 |
165 |
166 |
167 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 | Something went wrong with that request. Please try again.
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
--------------------------------------------------------------------------------
/test/data/search.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Search · path:.bash_history · GitHub
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
97 |
98 |
99 |
124 |
125 |
126 |
127 |
128 |
129 |
138 |
139 |
140 |
149 |
150 |
151 |
160 |
161 |
162 |
171 |
172 |
173 |
182 |
183 |
184 |
193 |
194 |
195 |
204 |
205 |
206 |
215 |
216 |
217 |
226 |
227 |
228 |
237 |
238 |
239 |
240 |
243 |
244 |
245 |
246 |
247 |
Loading…
248 |
249 |
250 |
251 |
252 |
253 |
254 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
765 |
766 |
858 |
859 |
860 |
861 |
862 |
863 | Something went wrong with that request. Please try again.
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 |
872 |
873 |
--------------------------------------------------------------------------------
/test/test_ghgrab.py:
--------------------------------------------------------------------------------
1 | import tutils
2 | import ghrabber
3 |
def test_extract():
    # open() rather than the Python2-only file() builtin.
    f = open(tutils.test_data.path("data/search.html")).read()
    ret = list(ghrabber.extract(f))
    # Guard against a vacuous pass: the fixture must yield results.
    assert ret
    for i in ret:
        assert i.endswith(".bash_history")
9 |
10 |
def test_is_last_page():
    # open() rather than the Python2-only file() builtin.
    f = open(tutils.test_data.path("data/search.html")).read()
    assert not ghrabber.is_last_page(f)
    f = open(tutils.test_data.path("data/lastpage.html")).read()
    assert ghrabber.is_last_page(f)
16 |
17 |
def test_to_raw():
    p = "/nonexistent/archlinux/blob/a4f339b71ed6bb703f5f77888272d886f553f99a/.bash_history"
    # Pin the actual conversion rather than mere truthiness: the "blob"
    # path component is dropped and the raw base URL is prepended.
    expected = "https://raw.github.com/nonexistent/archlinux/a4f339b71ed6bb703f5f77888272d886f553f99a/.bash_history"
    assert ghrabber.raw_url(p) == expected
21 |
22 |
def test_make_fname():
    p = "/nonexistent/archlinux/blob/a4f339b71ed6bb703f5f77888272d886f553f99a/.bash_history"
    # Pin the expected "user.repository" form, not mere truthiness.
    assert ghrabber.make_fname(p) == "nonexistent.archlinux"
26 |
27 |
--------------------------------------------------------------------------------
/test/tutils.py:
--------------------------------------------------------------------------------
1 | import tempfile, os, shutil
2 | from contextlib import contextmanager
3 |
class Data:
    # Locates test data files relative to a named module's directory.
    def __init__(self, name):
        """
        name: the name of an importable module; data paths are resolved
        relative to the directory containing that module's file.
        """
        m = __import__(name)
        dirname, _ = os.path.split(m.__file__)
        # Absolute directory all data paths are resolved against.
        self.dirname = os.path.abspath(dirname)

    def path(self, path):
        """
        Returns a path to the package data housed at 'path' under this
        module. Path can be a path to a file, or to a directory.

        This function will raise ValueError if the path does not exist.
        """
        fullpath = os.path.join(self.dirname, path)
        if not os.path.exists(fullpath):
            raise ValueError, "dataPath: %s does not exist."%fullpath
        return fullpath
21 |
22 |
@contextmanager
def tmpdir(*args, **kwargs):
    """
    Context manager: create a temporary directory, chdir into it, and
    yield its path. On exit the original working directory is restored
    and the temporary directory removed.

    *args and **kwargs are passed through to tempfile.mkdtemp.
    """
    orig_workdir = os.getcwd()
    temp_workdir = tempfile.mkdtemp(*args, **kwargs)
    os.chdir(temp_workdir)
    try:
        yield temp_workdir
    finally:
        # Cleanup must run even when the managed block raises;
        # previously an exception leaked the temp dir and left the
        # process chdir'd into it.
        os.chdir(orig_workdir)
        shutil.rmtree(temp_workdir)
33 |
34 |
def raises(exc, obj, *args, **kwargs):
    """
    Assert that a callable raises a specified exception.

    :exc An exception class or a string. If a class, assert that an
    exception of this type is raised. If a string, assert that the string
    occurs in the string representation of the exception, based on a
    case-insensitive match.

    :obj A callable object.

    :args Arguments to be passed to the callable.

    :kwargs Arguments to be passed to the callable.

    Raises AssertionError if no exception is raised, or if the raised
    exception does not match the specification.
    """
    try:
        apply(obj, args, kwargs)
    except Exception, v:
        # String spec: substring-match against the exception's message.
        if isinstance(exc, basestring):
            if exc.lower() in str(v).lower():
                return
            else:
                raise AssertionError(
                    "Expected %s, but caught %s"%(
                        repr(str(exc)), v
                    )
                )
        else:
            # Class spec: the raised exception must be an instance of it.
            if isinstance(v, exc):
                return
            else:
                raise AssertionError(
                    "Expected %s, but caught %s %s"%(
                        exc.__name__, v.__class__.__name__, str(v)
                    )
                )
    # The callable returned without raising at all.
    raise AssertionError("No exception raised.")
72 |
# Shared Data instance rooted at this module's directory; test modules
# use it to locate fixture files under test/data.
test_data = Data(__name__)
74 |
--------------------------------------------------------------------------------