#!/usr/bin/env python
"""
Grab all files matching a search specification from Github.

Downloaded files are written to files named user.repository. Existing files
with the same name are skipped, which means that you can reasonably
efficiently stop and resume a ghrab.

Note that this is a Quick Hack that may break whenever Github changes even
minor features on the site.

Usage
-----
Grab all .bash_history files:

    ./ghrabber.py "path:.bash_history"

Grab all files with extension of .key:

    ./ghrabber.py "extension:key"

Installation
------------
Check out this code and install the dependencies:

    git clone git@github.com:cortesi/ghrabber.git
    cd ghrabber
    pip install beautifulsoup requests

If pip is not installed, try to install it with `easy_install pip` first.
"""
import argparse
import os.path

try:  # Python 3 location
    from urllib.parse import urljoin
except ImportError:  # Python 2 location (what urllib.basejoin aliases)
    from urlparse import urljoin

# Offset of path component to delete when converting to raw
REM_OFFSET = 2
RAWBASE = "https://raw.github.com/"
SEARCH = "https://github.com/search"


def _parse(data):
    """Parse the HTML document *data* and return the BeautifulSoup tree."""
    # Imported on first use so that the pure helpers (raw_url, make_fname)
    # and --help stay usable when the scraping dependency is not installed.
    import BeautifulSoup
    return BeautifulSoup.BeautifulSoup(data)


def extract(data):
    """
    Yield the href of every search hit in a result page.

    :data HTML of a search result page.

    Each hit is a <p class="title"> element; the href of its second <a>
    is yielded. Raises IndexError if such an element holds fewer than two
    links.
    """
    for title in _parse(data).findAll("p", {"class": "title"}):
        links = title.findAll("a")
        # The second link is the reference...
        yield links[1].get("href")


def is_last_page(data):
    """
    Return True if the page *data* contains no <p class="title"> element,
    i.e. it lists no search hits.
    """
    return not _parse(data).find("p", {"class": "title"})


def raw_url(p):
    """
    Convert the site-relative path of a hit into a URL under RAWBASE.

    Leading and trailing slashes are stripped and the path component at
    index REM_OFFSET is removed. Raises IndexError if the path has too few
    components.
    """
    parts = p.strip("/").split("/")
    del parts[REM_OFFSET]
    return RAWBASE + "/".join(parts)


def make_fname(p):
    """
    Return the local file name for the hit path *p*: its first two path
    components joined with a dot.
    """
    parts = p.strip("/").split("/")
    return parts[0] + "." + parts[1]


def get(query, relative, outdir, listonly=False):
    """
    Page through the search results for *query* and list or download them.

    :query Search query string, sent as the "q" parameter.

    :relative If truthy, a path joined onto each raw URL; the joined URL is
    listed or fetched instead of the match itself.

    :outdir Directory that downloaded files are written to.

    :listonly If true, only print the URLs; nothing is downloaded.

    Files that already exist in *outdir* are skipped. Paging stops at the
    first page without hits, or at the first search request that does not
    return status 200.
    """
    import requests  # imported on first use, see _parse()

    page = 1
    while True:
        params = dict(q=query, type="Code", p=page)
        r = requests.get(SEARCH, params=params)
        if r.status_code != 200:
            # An error page has no hits either; report it as a failure
            # instead of claiming the results are exhausted.
            print("** Search failed with status %s" % r.status_code)
            break
        if is_last_page(r.content):
            print("** No more results")
            break
        for u in extract(r.content):
            ru = raw_url(u)
            if relative:
                ru = urljoin(ru, relative)
            if listonly:
                print(ru)
                continue
            fn = make_fname(u)
            outpath = os.path.join(outdir, fn)
            if os.path.exists(outpath):
                print("Skipping  %s" % fn)
                continue
            ret = requests.get(ru)
            if ret.status_code != 200:
                print("Error %s %s" % (fn, ret.status_code))
                continue
            print("Fetching  %s" % ru)
            # The response body is written unmodified: binary mode avoids
            # newline translation, and the handle is closed even if the
            # write fails.
            with open(outpath, "wb") as f:
                f.write(ret.content)
        page += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-l", action="store_true",
        help="Just list results"
    )
    parser.add_argument(
        "-o", type=str, default=".",
        help="Output directory. Created if it doesn't exist."
    )
    parser.add_argument(
        "-r", type=str, default=None,
        help="Grab a path relative to the match"
    )
    parser.add_argument("query", type=str, help="Github Code Search query")
    args = parser.parse_args()
    if not os.path.exists(args.o):
        os.makedirs(args.o)
    try:
        get(args.query, args.r, args.o, listonly=args.l)
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop a long grab; exit quietly.
        pass
39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
48 |
49 | 50 | 51 | GitHub 52 | GitHub 53 | 54 | 55 | 56 | 62 | 63 | 64 |
65 | Sign up for free 66 | Sign in 67 |
68 | 69 |
70 |
71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 |
79 |
80 | 81 | 82 |
83 |
84 |
85 |

Search

86 | 95 |
96 |
97 | 98 |
99 |
100 |
101 | 108 |
109 | 110 |

Languages

111 | 127 | 128 | 130 |
131 |
132 | 133 |
134 |
135 | 136 | 137 | 138 | Sort: 139 | 143 | 144 | 145 |
146 |

We've found 53 code results

147 |
148 | 149 |
150 |
151 |
152 | How are these search results? Tell us! 153 | 156 |
157 | 158 |
159 |

Octocat-spinner-128

160 |

Loading…

161 |
162 |
163 | 164 |
165 |
166 | 167 | 317 | 318 | 319 |
320 |
321 |
322 | 323 |
324 | 325 | 326 | 387 | 388 | 389 | 390 | 391 | 392 |
393 | 394 | Something went wrong with that request. Please try again. 395 | 396 |
397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | -------------------------------------------------------------------------------- /test/data/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Search · path:.bash_history · GitHub 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
48 |
49 | 50 | 51 | GitHub 52 | GitHub 53 | 54 | 55 | 56 | 62 | 63 | 64 |
65 | Sign up for free 66 | Sign in 67 |
68 | 69 |
70 |
71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 |
79 |
80 | 81 | 82 |
83 |
84 |
85 |

Search

86 | 95 |
96 |
97 | 98 |
99 |
100 |
101 | 108 |
109 | 110 |

Languages

111 | 120 | 121 | 123 |
124 |
125 | 126 |
127 |
128 | 129 |
130 | 131 | 132 |

133 | gunman808/archlinux – 134 | .bash_history
135 | Last indexed 136 |

137 |
138 | 139 | 140 |
141 | 142 | 143 |

144 | mcgournj/AD-Cloud-Computing – 145 | .bash_history
146 | Last indexed 147 |

148 |
149 | 150 | 151 |
152 | 153 | 154 |

155 | darealcaffeine/first_app – 156 | .bash_history
157 | Last indexed 158 |

159 |
160 | 161 | 162 |
163 | 164 | 165 |

166 | hedning/dotfiles – 167 | .bash_history
168 | Last indexed 169 |

170 |
171 | 172 | 173 |
174 | 175 | 176 |

177 | ICEFLOW01/Telephone – 178 | .bash_history
179 | Last indexed 180 |

181 |
182 | 183 | 184 |
185 | 186 | Shell 187 |

188 | MrMaksimize/DC2011 – 189 | .bash_history
190 | Last indexed 191 |

192 |
193 | 194 | 195 |
196 | 197 | 198 |

199 | syrup/dotfiles – 200 | .bash_history
201 | Last indexed 202 |

203 |
204 | 205 | 206 |
207 | 208 | 209 |

210 | alycolas/alycolas – 211 | .bash_history
212 | Last indexed 213 |

214 |
215 | 216 | 217 |
218 | 219 | 220 |

221 | rrutia/UW-Assignments – 222 | .bash_history
223 | Last indexed 224 |

225 |
226 | 227 | 228 |
229 | 230 | 231 |

232 | rudycazabon/jeniferbacon – 233 | .bash_history
234 | Last indexed 235 |

236 |
237 | 238 |
239 | How are these search results? Tell us! 240 | 243 |
244 | 245 |
246 |

Octocat-spinner-128

247 |

Loading…

248 |
249 |
250 | 251 |
252 |
253 | 254 | 404 | 405 | 406 |
407 |
408 |
409 | 410 |
411 | 412 | 413 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 765 | 766 |
767 |

Markdown Cheat Sheet

768 | 769 |
770 | 771 |
772 |
773 |

Format Text

774 |

Headers

775 |
776 | # This is an <h1> tag
777 | ## This is an <h2> tag
778 | ###### This is an <h6> tag
779 |

Text styles

780 |
781 | *This text will be italic*
782 | _This will also be italic_
783 | **This text will be bold**
784 | __This will also be bold__
785 | 
786 | *You **can** combine them*
787 | 
788 |
789 |
790 |

Lists

791 |

Unordered

792 |
793 | * Item 1
794 | * Item 2
795 |   * Item 2a
796 |   * Item 2b
797 |

Ordered

798 |
799 | 1. Item 1
800 | 2. Item 2
801 | 3. Item 3
802 |    * Item 3a
803 |    * Item 3b
804 |
805 |
806 |

Miscellaneous

807 |

Images

808 |
809 | ![GitHub Logo](/images/logo.png)
810 | Format: ![Alt Text](url)
811 | 
812 |

Links

813 |
814 | http://github.com - automatic!
815 | [GitHub](http://github.com)
816 |

Blockquotes

817 |
818 | As Kanye West said:
819 | 
820 | > We're living the future so
821 | > the present is our past.
822 | 
823 |
824 |
825 |
826 | 827 |

Code Examples in Markdown

828 |
829 |

Syntax highlighting with GFM

830 |
831 | ```javascript
832 | function fancyAlert(arg) {
833 |   if(arg) {
834 |     $.facebox({div:'#foo'})
835 |   }
836 | }
837 | ```
838 |
839 |
840 |

Or, indent your code 4 spaces

841 |
842 | Here is a Python code example
843 | without syntax highlighting:
844 | 
845 |     def foo:
846 |       if not bar:
847 |         return true
848 |
849 |
850 |

Inline code for comments

851 |
852 | I think you should use an
853 | `<addr>` element here instead.
854 |
855 |
856 | 857 |
858 | 859 | 860 | 861 |
862 | 863 | Something went wrong with that request. Please try again. 864 | 865 |
866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | -------------------------------------------------------------------------------- /test/test_ghgrab.py: -------------------------------------------------------------------------------- 1 | import tutils 2 | import ghrabber 3 | 4 | def test_extract(): 5 | f = file(tutils.test_data.path("data/search.html")).read() 6 | ret = list(ghrabber.extract(f)) 7 | for i in ret: 8 | assert i.endswith(".bash_history") 9 | 10 | 11 | def test_is_last_page(): 12 | f = file(tutils.test_data.path("data/search.html")).read() 13 | assert not ghrabber.is_last_page(f) 14 | f = file(tutils.test_data.path("data/lastpage.html")).read() 15 | assert ghrabber.is_last_page(f) 16 | 17 | 18 | def test_to_raw(): 19 | p = "/nonexistent/archlinux/blob/a4f339b71ed6bb703f5f77888272d886f553f99a/.bash_history" 20 | assert ghrabber.raw_url(p) 21 | 22 | 23 | def test_make_fname(): 24 | p = "/nonexistent/archlinux/blob/a4f339b71ed6bb703f5f77888272d886f553f99a/.bash_history" 25 | assert ghrabber.make_fname(p) 26 | 27 | -------------------------------------------------------------------------------- /test/tutils.py: -------------------------------------------------------------------------------- 1 | import tempfile, os, shutil 2 | from contextlib import contextmanager 3 | 4 | class Data: 5 | def __init__(self, name): 6 | m = __import__(name) 7 | dirname, _ = os.path.split(m.__file__) 8 | self.dirname = os.path.abspath(dirname) 9 | 10 | def path(self, path): 11 | """ 12 | Returns a path to the package data housed at 'path' under this 13 | module. Path can be a path to a file, or to a directory. 14 | 15 | This function will raise ValueError if the path does not exist. 
16 | """ 17 | fullpath = os.path.join(self.dirname, path) 18 | if not os.path.exists(fullpath): 19 | raise ValueError, "dataPath: %s does not exist."%fullpath 20 | return fullpath 21 | 22 | 23 | @contextmanager 24 | def tmpdir(*args, **kwargs): 25 | orig_workdir = os.getcwd() 26 | temp_workdir = tempfile.mkdtemp(*args, **kwargs) 27 | os.chdir(temp_workdir) 28 | 29 | yield temp_workdir 30 | 31 | os.chdir(orig_workdir) 32 | shutil.rmtree(temp_workdir) 33 | 34 | 35 | def raises(exc, obj, *args, **kwargs): 36 | """ 37 | Assert that a callable raises a specified exception. 38 | 39 | :exc An exception class or a string. If a class, assert that an 40 | exception of this type is raised. If a string, assert that the string 41 | occurs in the string representation of the exception, based on a 42 | case-insensitive match. 43 | 44 | :obj A callable object. 45 | 46 | :args Positional arguments to be passed to the callable. 47 | 48 | :kwargs Keyword arguments to be passed to the callable. 49 | """ 50 | try: 51 | apply(obj, args, kwargs) 52 | except Exception, v: 53 | if isinstance(exc, basestring): 54 | if exc.lower() in str(v).lower(): 55 | return 56 | else: 57 | raise AssertionError( 58 | "Expected %s, but caught %s"%( 59 | repr(str(exc)), v 60 | ) 61 | ) 62 | else: 63 | if isinstance(v, exc): 64 | return 65 | else: 66 | raise AssertionError( 67 | "Expected %s, but caught %s %s"%( 68 | exc.__name__, v.__class__.__name__, str(v) 69 | ) 70 | ) 71 | raise AssertionError("No exception raised.") 72 | 73 | test_data = Data(__name__) 74 | --------------------------------------------------------------------------------