├── Data ├── orgs │ ├── 2005.json │ ├── 2006.json │ ├── 2007.json │ ├── 2008.json │ ├── 2009-2013.json │ ├── 2014-2015.json │ ├── 2016-2017.json │ ├── 2018.json │ └── orgs_2019.json └── projects │ ├── 2005.json │ ├── 2006.json │ ├── 2007.json │ ├── 2008.json │ ├── 2009-2013.json │ ├── 2014-2015.json │ ├── 2016-2017.json │ └── 2018.json ├── README.md └── Scrapers ├── common.py ├── developer-scraper.py ├── melange-scraper.py └── summerofcode-scraper.py /Data/orgs/2005.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "link": "http://www.apache.org/", 4 | "name": "Apache Software Foundation" 5 | }, 6 | { 7 | "link": "http://www.blender3d.org", 8 | "name": "Blender" 9 | }, 10 | { 11 | "link": "http://www.codehaus.org/", 12 | "name": "Codehaus" 13 | }, 14 | { 15 | "link": "http://drupal.org/", 16 | "name": "Drupal" 17 | }, 18 | { 19 | "link": "http://getfedora.org/", 20 | "name": "Fedora Core" 21 | }, 22 | { 23 | "link": "http://freebsd.org", 24 | "name": "FreeBSD" 25 | }, 26 | { 27 | "link": "http://gaim.sourceforge.net", 28 | "name": "Gaim" 29 | }, 30 | { 31 | "link": "http://gallery.menalto.com/modules.php?op=modload&name=News&file=index", 32 | "name": "Gallery" 33 | }, 34 | { 35 | "link": "http://www.gnome.org/", 36 | "name": "The GNOME Foundation" 37 | }, 38 | { 39 | "link": "http://code.google.com/", 40 | "name": "Google" 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /Data/orgs/2006.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "link": "http://www.abisource.com/", 4 | "name": "AbiSource" 5 | }, 6 | { 7 | "link": "https://adium.im/", 8 | "name": "Adium" 9 | }, 10 | { 11 | "link": "http://ardour.org/", 12 | "name": "Ardour" 13 | }, 14 | { 15 | "link": "http://argouml.tigris.org/", 16 | "name": "ArgoUML" 17 | }, 18 | { 19 | "link": "http://www.apache.org", 20 | "name": "The Apache Software Foundation" 21 | }, 22 | { 23 | "link": "http://www.bbc.co.uk/rd/", 24 | "name": "BBC Research" 25 | } 26 | ] 27 | -------------------------------------------------------------------------------- /Data/orgs/2007.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "link": "http://www.abisource.com/", 4 | "name": "AbiSource" 5 | }, 6 | { 7 | "link": "https://adium.im/", 8 | "name": "Adium" 9 | }, 10 | { 11 | "link": "http://audacious-media-player.org", 12 | "name": "Audacious Media Player" 13 | }, 14 | { 15 | "link": "http://www.aqsis.org", 16 | "name": "Aqsis Team" 17 | }, 18 | { 19 | "link": "http://ardour.org/", 20 | "name": "Ardour" 21 | }, 22 | { 23 | "link": "http://argouml.tigris.org", 24 | "name": "ArgoUML" 25 | }, 26 | { 27 | "link": "http://www.apache.org", 28 | "name": "The Apache Software Foundation" 29 | }, 30 | { 31 | "link": "http://www.bbc.co.uk/rd/", 32 | "name": "BBC Research" 33 | } 34 | ] 35 | -------------------------------------------------------------------------------- /Data/orgs/2008.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "link": "http://www.abisource.com/", 4 | "name": "AbiSource" 5 | }, 6 | { 7 | "link": "https://adium.im/", 8 | "name": "Adium" 9 | }, 10 | { 11 | "link": "http://argouml.tigris.org", 12 | "name": "ArgoUML" 13 | }, 14 | { 15 | "link": "http://www.apache.org", 16 | "name": "The Apache Software Foundation" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /Data/projects/2005.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mentor": "Tim Funk", 4 | "project": "J2EP (tomcat-reverse-proxy)", 5 | "student": "Anders Nyman" 6 | }, 7 | { 8 | "mentor": "Julie MacNaught", 9 | "project": "A partial refactoring of WSRP4J, as well as a set of new features.", 10 | "student": "Diego Louzan Martinez" 11 | }, 12 | { 13 | "mentor": "Ian Holsman", 14 | "project": "Bandwidth Mod", 15 | "student": "Ivan Barrera" 16 | }, 17 | { 18 | "mentor": "Nick Kew", 19 | "project": "mod_smtpd adds SMTP support to Apache httpd 2.x.", 20 | "student": "Jem Berkes" 21 | }, 22 | { 23 | "mentor": "Santiago Gala", 24 | "project": "AJAX skin for jetspeed2", 25 | "student": "Juan Núñez Jaramillo" 26 | }, 27 | { 28 | "mentor": "Leo Simons", 29 | "project": "Bootstrapping Maven w/ Gump3", 30 | "student": "Justin Merz" 31 | }, 32 | { 33 | "mentor": "Reinhard Poetz", 34 | "project": "Cocoon Forms Library", 35 | "student": "Max Pfingsthorn" 36 | }, 37 | { 38 | "mentor": "Ian Holsman", 39 | "project": "httpd-mbox-if", 40 | "student": "Maxime Petazzoni" 41 | }, 42 | { 43 | "mentor": "Ian Holsman", 44 | "project": "mod-cache-requester - Apache", 45 | "student": "Parinkumar Shah" 46 | }, 47 | { 48 | "mentor": "Nick Kew", 49 | "project": "mod_smtpd", 50 | "student": "Rian Hunter" 51 | }, 52 | { 53 | "mentor": "Danny Angus", 54 | "project": "James Admin Console", 55 | "student": "Srinivas Anne" 56 | }, 57 | { 58 | "mentor": "Martin Marinschek", 59 | "project": "LeboN - An example web application with MyFaces", 60 | "student": "Tessy Kizhakkekara" 61 | }, 62 | { 63 | "mentor": "Sanjiva Weerawarana", 64 | "project": "Apache Kandula", 65 | "student": "Thilina Gunarathne" 66 | }, 67 | { 68 | "mentor": "Ajith Ranabahu", 69 | "project": "StaX parser for Axis C++", 70 | "student": "Weerapurage Dinesh Premalal" 71 | }, 72 | { 73 | "mentor": "Phil Steitz", 74 | "project": "Apache Commons Math Numerical Enhancements", 75 | "student": "Xiaogang Zhang" 76 | }, 77 | { 78 | "mentor": "Yoav Shapira", 79 | "project": "Tune Apache Tomcat Jasper JSP Compilation Performance", 80 | "student": "Xingbo Gao" 81 | }, 82 | { 83 | "mentor": "Ton Roosendaal", 84 | "project": "Extending Inverse Kinematics For Blender", 85 | "student": "Brecht Van Lommel" 86 | }, 87 | { 88 | "mentor": "Martin Poirier", 89 | "project": "curves/surfaces, Nurbana integration", 90 | "student": "Emmanuel Stone" 91 | }, 92 | { 93 | "mentor": "Alexander Ewering", 94 | "project": "Blender/FFMPEG", 95 | "student": "Ian Gowen" 96 | }, 97 | { 98 | "mentor": "Alexander Ewering", 99 | "project": "Boolean Development", 100 | "student": "Marc Freixas" 101 | }, 102 | { 103 | "mentor": "Jonathan Merritt", 104 | "project": "Fluid Simulation with Blender", 105 | "student": "Nils Thuerey" 106 | }, 107 | { 108 | "mentor": "Campbell Barton", 109 | "project": "PyTexture", 110 | "student": "Timothy Wakeham" 111 | }, 112 | { 113 | "mentor": "Vincent Massol", 114 | "project": "Oxyd", 115 | "student": "Jérémi Joslin" 116 | }, 117 | { 118 | "mentor": "Vincent Massol", 119 | "project": "Implementation of JSR-88 (J2EE Application Deployment) support classes for Cargo.", 120 | "student": "Lev Olkhovich" 121 | }, 122 | { 123 | "mentor": "Vincent Massol", 124 | "project": "Implementing Support for JBoss Application Server on Cargo", 125 | "student": "Nyoman Winardi" 126 | }, 127 | { 128 | "mentor": "Daniel Diephouse", 129 | "project": "Implemented the WS-RM specification into XFire to allow SOAP messages to be sent reliably between two endpoints", 130 | "student": "Ti Khoi Anh Phan" 131 | }, 132 | { 133 | "mentor": "James Walker", 134 | "project": "WebDAV API for Drupal", 135 | "student": "Fabiano Sant'Ana" 136 | }, 137 | { 138 | "mentor": "Károly Négyesi", 139 | "project": "Drupal Quiz Module", 140 | "student": "Angela Byron" 141 | }, 142 | { 143 | "mentor": "Moshe Weitzman", 144 | "project": "Drupal Automated Test Suite", 145 | "student": "Jakub Zygmunt" 146 | }, 147 | { 148 | "mentor": "Károly Négyesi", 149 | "project": "netnews (NNTP) integration for Drupal", 150 | "student": "Jan Blom" 151 | }, 152 | { 153 | "mentor": "Károly Négyesi", 154 | "project": "drupal subscription module", 155 | "student": "Márton Elek" 156 | }, 157 | { 158 | "mentor": "Gerhard Killesreiter", 159 | "project": "Drupal - Google Sitemap", 160 | "student": "Matthew Loar" 161 | }, 162 | { 163 | "mentor": "James Walker", 164 | "project": "upcoming.org REST API for Drupal", 165 | "student": "Stephan Jaensch" 166 | }, 167 | { 168 | "mentor": "Robert Douglass", 169 | "project": "Ajax Functionality (Drupal)", 170 | "student": "Steven Wittens" 171 | }, 172 | { 173 | "mentor": "Moshe Weitzman", 174 | "project": "Drupal automated test suite", 175 | "student": "Thomas Ilsche" 176 | }, 177 | { 178 | "mentor": "Elliot Lee", 179 | "project": "preload - an adaptive readahead daemon", 180 | "student": "Behdad Esfahbod" 181 | }, 182 | { 183 | "mentor": "Elliott Lee", 184 | "project": "An application for Fedora-based live CD generation", 185 | "student": "Darko Ilic" 186 | }, 187 | { 188 | "mentor": "Elliot Lee", 189 | "project": "pyBackPack", 190 | "student": "Dave Arter" 191 | }, 192 | { 193 | "mentor": "Elliot Lee", 194 | "project": "Global Command History for Bash", 195 | "student": "Praveenkumar Ponnusamy" 196 | }, 197 | { 198 | "mentor": "Elliot Lee", 199 | "project": "Python bindings for libparted", 200 | "student": "Ulisses Furquim Freire da Silva" 201 | }, 202 | { 203 | "mentor": "Elliot Lee", 204 | "project": "setting process rlimits", 205 | "student": "Wieland Gmeiner" 206 | }, 207 | { 208 | "mentor": "Lukas Ertl", 209 | "project": "FreeBSD gvinum 'move', 'rename', documentation", 210 | "student": "Chris Jones" 211 | }, 212 | { 213 | "mentor": "Maxime Henrion", 214 | "project": "csup", 215 | "student": "Christoph Mathys" 216 | }, 217 | { 218 | "mentor": "Scott Long", 219 | "project": "ssh based networked filesystem for FreeBSD", 220 | "student": "Csaba Henk" 221 | }, 222 | { 223 | "mentor": "Scott Long", 224 | "project": "FreeSBIE integration in FreeBSD", 225 | "student": "Dario Freni" 226 | }, 227 | { 228 | "mentor": "Murray Stokely", 229 | "project": "FreeBSD website redesign and development", 230 | "student": "Emily Boyd" 231 | }, 232 | { 233 | "mentor": "Pawel Dawidek", 234 | "project": "gjournal", 235 | "student": "Ivan Voras" 236 | }, 237 | { 238 | "mentor": "Luigi Rizzo", 239 | "project": "Improve libalias", 240 | "student": "Paolo Pisati" 241 | }, 242 | { 243 | "mentor": "Philip Paeps", 244 | "project": "SNMP client tools using FreeBSD's libbsnmp", 245 | "student": "Shteryana Shopova" 246 | }, 247 | { 248 | "mentor": "Murray Stokely", 249 | "project": "Porting Mac OS 10.4's launchd(8) tools to FreeBSD", 250 | "student": "Tyler Ballance" 251 | }, 252 | { 253 | "mentor": "Hartmut Brandt", 254 | "project": "Fully Integrated SNMP Monitoring", 255 | "student": "Victor Cruceru" 256 | }, 257 | { 258 | "mentor": "Sean Egan", 259 | "project": "Implement UPnP NAT Traversal into the Gaim project to allow for seamless file transfers and direct connections.", 260 | "student": "Adam Warrington" 261 | }, 262 | { 263 | "mentor": "Ethan Blanton", 264 | "project": "Gadu-Gadu Support (in Gaim)", 265 | "student": "Bartosz Oler" 266 | }, 267 | { 268 | "mentor": "Gary Kramlich", 269 | "project": "Music Messaging", 270 | "student": "Christian Muise" 271 | }, 272 | { 273 | "mentor": "Mark Doliner", 274 | "project": "File Transfers via the OSCAR (ICQ/AIM) Protocol", 275 | "student": "Jonathan Clark" 276 | }, 277 | { 278 | "mentor": "Mark Doliner", 279 | "project": "Rendezvous Plugin for Gaim", 280 | "student": "Juanjo Molinero Horno" 281 | }, 282 | { 283 | "mentor": "Sean Egan", 284 | "project": "D-Busified gaim-remote", 285 | "student": "Piotr Zielinski" 286 | }, 287 | { 288 | "mentor": "Ethan Blanton", 289 | "project": "SIP/SIMPLE/STUN for gaim", 290 | "student": "Thomas Butter" 291 | }, 292 | { 293 | "mentor": "Jay Rossiter", 294 | "project": "Gallery2 - DupeDetect", 295 | "student": "Aviad Tsherniak" 296 | }, 297 | { 298 | "mentor": "Chris Kelly", 299 | "project": "G2 XML-RPC Modularization/CoreAPI", 300 | "student": "Christopher Schwerdt" 301 | }, 302 | { 303 | "mentor": "Jay Rossiter", 304 | "project": "Hidden and Password-Protected Items", 305 | "student": "Jess Martin" 306 | }, 307 | { 308 | "mentor": "Bharat Mediratta", 309 | "project": "Downloadable Plugins", 310 | "student": "Jozef Selesi" 311 | }, 312 | { 313 | "mentor": "Andy Staudacher", 314 | "project": "G2 / ZenCart Integration", 315 | "student": "Michael Rodriguez-Torrent" 316 | }, 317 | { 318 | "mentor": "Jesse Mullan", 319 | "project": "Exploit aspects of DHTML to operate a smooth slideshow.", 320 | "student": "Ross Shannon" 321 | }, 322 | { 323 | "mentor": "Colin Walters", 324 | "project": "iTunes Music Sharing for Rhythmbox", 325 | "student": "Charles Schmidt" 326 | }, 327 | { 328 | "mentor": "Federico Mena-Quintero", 329 | "project": "Live Documentation Editor 'Sarma'", 330 | "student": "Danilo Šegan" 331 | }, 332 | { 333 | "mentor": "Owen Taylor", 334 | "project": "Improve GNOME startup time", 335 | "student": "Lorenzo Colitti" 336 | }, 337 | { 338 | "mentor": "Dave Camp", 339 | "project": "Track changes in a filesystem directory via subversion and nautilus", 340 | "student": "Matt Jones" 341 | }, 342 | { 343 | "mentor": "Johan Dahlin", 344 | "project": "gshrooms", 345 | "student": "Raphael Slinckx" 346 | }, 347 | { 348 | "mentor": "Seth Nickell", 349 | "project": "Search Party", 350 | "student": "Sanford Armstrong" 351 | }, 352 | { 353 | "mentor": "Havoc Pennington", 354 | "project": "GNOME Panel Extensions", 355 | "student": "Travis Vachon" 356 | }, 357 | { 358 | "mentor": "Miguel de Icaza", 359 | "project": "ASP.NET Editor-Mozilla", 360 | "student": "Blagovest Dachev" 361 | }, 362 | { 363 | "mentor": "Greg Wilson", 364 | "project": "Bitten", 365 | "student": "Christopher Lenz" 366 | }, 367 | { 368 | "mentor": "Chris DiBona", 369 | "project": "usb8x", 370 | "student": "Daniel Englender" 371 | }, 372 | { 373 | "mentor": "Evan Martin", 374 | "project": "osmo", 375 | "student": "David Wilson" 376 | }, 377 | { 378 | "mentor": "Michael Moss", 379 | "project": "Tsync", 380 | "student": "James Anderson" 381 | }, 382 | { 383 | "mentor": "Peter Murray", 384 | "project": "Fedora-ShibFilter", 385 | "student": "Joshua Kent" 386 | }, 387 | { 388 | "mentor": "Chris Lavoie", 389 | "project": "Query By Example", 390 | "student": "Meredith Patterson" 391 | }, 392 | { 393 | "mentor": "Chris Lavoie", 394 | "project": "A VM in Java with tail-calls and continuations", 395 | "student": "Nuno Cruces" 396 | }, 397 | { 398 | "mentor": "Yoshiki Hayashi", 399 | "project": "XSieve", 400 | "student": "Oleg Paraschenko" 401 | }, 402 | { 403 | "mentor": "Evan Martin", 404 | "project": "Google / Cairo integration into GTK Haskell bindings", 405 | "student": "Paolo Martini" 406 | } 407 | ] 408 | -------------------------------------------------------------------------------- /Data/projects/2006.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mentor": "Dominic Lachowicz", 4 | "project": "PDF Import plugin (with style)", 5 | "student": "Jauco Noordzij" 6 | }, 7 | { 8 | "mentor": "Robert Staudinger", 9 | "project": "The AbiWord OLPC User Interface", 10 | "student": "Erik Pukinskis" 11 | }, 12 | { 13 | "mentor": "J.M. Maurer", 14 | "project": "Improving AbiWord's import/export plug-ins", 15 | "student": "Kamran Khan" 16 | }, 17 | { 18 | "mentor": "Robert August Fackler", 19 | "project": "Making XMPP a first-class citizen by using the Smack library", 20 | "student": "Andreas Monitzer" 21 | }, 22 | { 23 | "mentor": "Peter Hosey", 24 | "project": "Improving disability support for Adium", 25 | "student": "Chirag Shah" 26 | }, 27 | { 28 | "mentor": "Colin Harold Barrett", 29 | "project": "EMD & Presence Publishing over DO (leading to: Psyduck - A Desktop Presence Framework)", 30 | "student": "Joshua Lock" 31 | }, 32 | { 33 | "mentor": "David Smith", 34 | "project": "Improving and integrating PSMTabBarControl with Adium", 35 | "student": "Kent Sutherland" 36 | }, 37 | { 38 | "mentor": "Evan Schoenberg", 39 | "project": "Jingle in Adium X", 40 | "student": "Alvaro Saurin Parra" 41 | }, 42 | { 43 | "mentor": "Jesse Chappell", 44 | "project": "MIDI Track support", 45 | "student": "David Edward Robillard" 46 | }, 47 | { 48 | "mentor": "Paul Davis", 49 | "project": "Region Plugins and Undo Serialization", 50 | "student": "Hans Fugal" 51 | }, 52 | { 53 | "mentor": "Taybin Rutkin", 54 | "project": "Port Ardour to Windows", 55 | "student": "Tim Mayberry" 56 | }, 57 | { 58 | "mentor": "Linus Tolke", 59 | "project": "ArgoPrint Improvements", 60 | "student": "Ion Savin" 61 | }, 62 | { 63 | "mentor": "Tom Morris", 64 | "project": "Eclipse Integration", 65 | "student": "Pistol Constandache Bogdan Ciprian" 66 | }, 67 | { 68 | "mentor": "Robert James Tarling", 69 | "project": "A Graph Layout Algorithm for GEF", 70 | "student": "Martin Harrigan" 71 | }, 72 | { 73 | "mentor": "Michiel van der Wulp", 74 | "project": "ArgoUML Critics improvements / italian localization / various bug fixes", 75 | "student": "Andrea Nironi" 76 | }, 77 | { 78 | "mentor": "Ranabahu Mudiyanselage Ajith Harshana Ranabahu", 79 | "project": "C Implementation of WSDL2C Code generator for Axis2/C", 80 | "student": "Dimuthu Chathuranga Gamage" 81 | }, 82 | { 83 | "mentor": "Øystein Grøvlen", 84 | "project": "Derby LRU Cache Manager", 85 | "student": "Gokul Soundararajan" 86 | }, 87 | { 88 | "mentor": "Michael Parker", 89 | "project": "Thunderbird/Outlook/etc Plugins for Learning Messages via Spamd", 90 | "student": "William Duff" 91 | }, 92 | { 93 | "mentor": "Martin Marinschek", 94 | "project": "Implementing Partial State Saving in Apache MyFaces", 95 | "student": "Martin Haimberger" 96 | }, 97 | { 98 | "mentor": "Andrus Adamchik", 99 | "project": "cayenne-rop", 100 | "student": "Marcel Gordon" 101 | }, 102 | { 103 | "mentor": "Sanjiva Weerawarana", 104 | "project": "Axis2/Java-C# (http://wiki.apache.org/general/SummerOfCode2006#axis2/java-c#\")", 105 | "student": "Nandana Sampath Mihindulasooriya" 106 | }, 107 | { 108 | "mentor": "Kevin Joseph Menard", 109 | "project": "cayenne-ropwsdl", 110 | "student": "Mikhail Viktorov" 111 | }, 112 | { 113 | "mentor": "Michael Glavassevich", 114 | "project": "xerces-stax-api-converters", 115 | "student": "hua lei" 116 | }, 117 | { 118 | "mentor": "Changshin Lee", 119 | "project": "Add MMS transport support for Apache Mirae and Apache Axis2", 120 | "student": "Lilan Anjana Fernando" 121 | }, 122 | { 123 | "mentor": "Davanum Srinivas", 124 | "project": "Complete databinding support for ADB (Axis Databinding)", 125 | "student": "Maryam Moazeni" 126 | }, 127 | { 128 | "mentor": "Jeremias Märki", 129 | "project": "fop-auto-table-layout", 130 | "student": "Patrick Paul" 131 | }, 132 | { 133 | "mentor": "Jukka Zitting", 134 | "project": "JackRabbit Backup Tool", 135 | "student": "Nicolas Toper" 136 | }, 137 | { 138 | "mentor": "Matthias Weßendorf", 139 | "project": "Apache MyFaces skinning solution proposal", 140 | "student": "Catalin Adrian Kormos" 141 | }, 142 | { 143 | "mentor": "Werner Punz", 144 | "project": "Implement AJAX Partial Page Rendering Support in Apache MyFaces", 145 | "student": "Ernst Fastl" 146 | }, 147 | { 148 | "mentor": "Kevin Joseph Menard", 149 | "project": "Cayenne Eclipse Plugin", 150 | "student": "Bruno José de Moraes Melo" 151 | }, 152 | { 153 | "mentor": "David Van Couvering", 154 | "project": "Application for - Add JMX to Apache Derby", 155 | "student": "Sanket Sharma" 156 | }, 157 | { 158 | "mentor": "Deepal Jayasinghe", 159 | "project": "Axis2 WSDL 2.0 support", 160 | "student": "Oshani Wasana Seneviratne" 161 | }, 162 | { 163 | "mentor": "Mario Ivankovits", 164 | "project": "Apache MyFaces. JSF view flexible templating language or JSF on Rails.", 165 | "student": "Aliaksandr Kazachonak" 166 | }, 167 | { 168 | "mentor": "Jeremias Märki", 169 | "project": "Implementing before- and side-floats in Apache Fop", 170 | "student": "Vincent Hennebert" 171 | }, 172 | { 173 | "mentor": "Satheesh E Bandaram", 174 | "project": "Provide migration tool from MySQL to Derby", 175 | "student": "Ramin Moazeni" 176 | }, 177 | { 178 | "mentor": "Ian Holsman", 179 | "project": "GData Server", 180 | "student": "Simon Willnauer" 181 | }, 182 | { 183 | "mentor": "Justin Mason", 184 | "project": "spamassassin-httpd-spamd", 185 | "student": "Radosław Zieliński" 186 | }, 187 | { 188 | "mentor": "Michael Philip Sparks", 189 | "project": "Key Predistribution Infrastructure (KPI) base trusted communication framework for Kamaelia", 190 | "student": "Anagha Mudigonda" 191 | }, 192 | { 193 | "mentor": "Thomas Davies", 194 | "project": "Dirac decoder in Java", 195 | "student": "Luis Felipe Strano Moraes" 196 | }, 197 | { 198 | "mentor": "Michael Philip Sparks", 199 | "project": "3D widget framework", 200 | "student": "Thomas Flanitzer" 201 | }, 202 | { 203 | "mentor": "Michael Philip Sparks", 204 | "project": "Creation of an integrated BitTorrent component for Kamaelia", 205 | "student": "Ryan James Lothian" 206 | }, 207 | { 208 | "mentor": "Thomas Davies", 209 | "project": "SOC Dirac: Java implementation of a Dirac decoder", 210 | "student": "Adam Davison" 211 | } 212 | ] 213 | -------------------------------------------------------------------------------- /Data/projects/2007.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mentor": "J.M. Maurer", 4 | "project": "Systematically Breaking and Fixing AbiCollab", 5 | "student": "Ryan Pavlik" 6 | }, 7 | { 8 | "mentor": "Dominic Lachowicz", 9 | "project": "'Putting libabiword on a Diet'", 10 | "student": "Robert Staudinger" 11 | }, 12 | { 13 | "mentor": "Kamran Khan", 14 | "project": "Abiword: OpenXML importer", 15 | "student": "Philippe Milot" 16 | }, 17 | { 18 | "mentor": "Martin Edmund Sevior", 19 | "project": "4. Implement annotations", 20 | "student": "Ernesto Rivera" 21 | }, 22 | { 23 | "mentor": "Dominic Lachowicz", 24 | "project": "Interface service for the use of external grammar checkers in Abiword", 25 | "student": "Gabriel Bakiewicz" 26 | }, 27 | { 28 | "mentor": "David Smith", 29 | "project": "Improving multi-user chat", 30 | "student": "Erik E. Beerepoot" 31 | }, 32 | { 33 | "mentor": "Brian Eric Ganninger", 34 | "project": "(Un)Lock groups", 35 | "student": "Andre Cohen" 36 | }, 37 | { 38 | "mentor": "Peter Hosey", 39 | "project": "Fix & expand AppleScript", 40 | "student": "Matthew Handley" 41 | }, 42 | { 43 | "mentor": "Robert August Fackler", 44 | "project": "Improving Adium's XMPP Support", 45 | "student": "Andreas Monitzer" 46 | }, 47 | { 48 | "mentor": "Andrew Wellington", 49 | "project": "Improved Bonjour Support for Adium", 50 | "student": "Erich Kreutzer" 51 | }, 52 | { 53 | "mentor": "William Pitcock", 54 | "project": "D-Bus Support", 55 | "student": "Ben Tucker" 56 | }, 57 | { 58 | "mentor": "Tony Vroon", 59 | "project": "CD-Text support for the CDAudio plugin", 60 | "student": "Calin Crisan" 61 | }, 62 | { 63 | "mentor": "William Pitcock", 64 | "project": "Rewrite widgetcore as real GTK2 widgets", 65 | "student": "Tomasz Mon" 66 | }, 67 | { 68 | "mentor": "William Pitcock", 69 | "project": "Additional NewVFS transports", 70 | "student": "Cristian Măgherușan" 71 | }, 72 | { 73 | "mentor": "Tristan Colgate", 74 | "project": "Multi-threaded Implementation", 75 | "student": "Manuel Antonio Fernandez Montecelo" 76 | }, 77 | { 78 | "mentor": "Christopher James Foster", 79 | "project": "Deep Shadow Maps implementation for the rendering of coloured shadows and hair/fur shadows", 80 | "student": "Zachary Carter" 81 | }, 82 | { 83 | "mentor": "Paul Davis", 84 | "project": "Piano-Roll MIDI Editing Interface", 85 | "student": "David Edward Robillard" 86 | }, 87 | { 88 | "mentor": "Jesse Chappell", 89 | "project": "n.m panning", 90 | "student": "Christian James Muise" 91 | }, 92 | { 93 | "mentor": "Tom Morris", 94 | "project": "UML2 for ArgoUML", 95 | "student": "Pistol Constandache Bogdan Ciprian" 96 | }, 97 | { 98 | "mentor": "Linus Tolke", 99 | "project": "Extending ArgoUML to support UML Profiles", 100 | "student": "Marcos Aurelio Almeida da Silva" 101 | }, 102 | { 103 | "mentor": "Robert James Tarling", 104 | "project": "Sequence Diagrams improvements", 105 | "student": "Christian Lopez Espinola" 106 | }, 107 | { 108 | "mentor": "Ion Savin", 109 | "project": "ArgoPDF – The PDF report generation tool.", 110 | "student": "Dzmitry Churbanau" 111 | }, 112 | { 113 | "mentor": "Robert James Tarling", 114 | "project": "Alternative GUI layer in GEF library for ArgoUML project", 115 | "student": "Jian Zheng" 116 | }, 117 | { 118 | "mentor": "William Glass-Husain", 119 | "project": "Improve Velocity Macro capability", 120 | "student": "Supun Madhushanka Kamburugamuva" 121 | }, 122 | { 123 | "mentor": "Ruchith Fernando", 124 | "project": "Canonical XML Implementation on Apache AXIOM", 125 | "student": "Saliya Ekanayake" 126 | }, 127 | { 128 | "mentor": "Saminda Wishwajith Abeyruwan", 129 | "project": "Implementing Mail Transport (SMTP/POP3) support for SOAP 1.2 for Axis2/C", 130 | "student": "Rajika Kumarasiri" 131 | }, 132 | { 133 | "mentor": "Daniel Fagerstrom", 134 | "project": "Unified expression handling and unified object model in Apache Cocoon", 135 | "student": "Grzegorz Kossakowski" 136 | }, 137 | { 138 | "mentor": "Patrick Linskey", 139 | "project": "Streaming LOB support (for OpenJPA)", 140 | "student": "Ignacio Andreu Dolset" 141 | }, 142 | { 143 | "mentor": "Jason van Zyl", 144 | "project": "Maven Diagram-Maker", 145 | "student": "Piotr Tabor" 146 | }, 147 | { 148 | "mentor": "Michael Parker", 149 | "project": "Implementing “Dobly” Noise Reduction for SpamAssassin", 150 | "student": "Jianyong Dai" 151 | }, 152 | { 153 | "mentor": "Werner Punz", 154 | "project": "MyFaces Renderer Based on Templates", 155 | "student": "Sorin Silaghi" 156 | }, 157 | { 158 | "mentor": "Cameron McCormack", 159 | "project": "Enhancing Batik's document viewer", 160 | "student": "Ivan Andjelkovic" 161 | }, 162 | { 163 | "mentor": "Michael Parker", 164 | "project": "The Persistent Database Connection Plugin", 165 | "student": "Zhang Shunchang" 166 | }, 167 | { 168 | "mentor": "Paul Smith", 169 | "project": "Adding Functionality and Usability Improvements to Chainsaw", 170 | "student": "Isuru Eranga Suriarachchi" 171 | }, 172 | { 173 | "mentor": "Nandika Jayawardana", 174 | "project": "Proposal for JSON Support for Apache Axis2/C", 175 | "student": "T.G. Kasun Indrasiri" 176 | }, 177 | { 178 | "mentor": "Martin Marinschek", 179 | "project": "MyFaces Component Set Integration", 180 | "student": "Leonardo Alfredo Uribe Panesso" 181 | }, 182 | { 183 | "mentor": "Amila Chinthaka Suriarachchi", 184 | "project": "JAX-WS 2.0 support for Apache Axis2 WSDL2Java Code Generator", 185 | "student": "Sameera Madushan Jayasoma" 186 | }, 187 | { 188 | "mentor": "Werner Punz", 189 | "project": "MyFaces Component Generator", 190 | "student": "Bernhard Huemer" 191 | }, 192 | { 193 | "mentor": "Jukka Zitting", 194 | "project": "JCR Demo Application based on Jackrabbit", 195 | "student": "Nandana Sampath Mihindukulasooriya" 196 | }, 197 | { 198 | "mentor": "Michael Glavassevich", 199 | "project": "Add support for the StAX (JSR-173) cursor API to Xerces-J", 200 | "student": "Wei Duan" 201 | }, 202 | { 203 | "mentor": "Cameron McCormack", 204 | "project": "Design and implementation a better document inspector", 205 | "student": "Jasleen Singh" 206 | }, 207 | { 208 | "mentor": "Katherine Marsden", 209 | "project": "Convert Derby tests to JUnit and fix Derby bugs", 210 | "student": "Ramin Moazeni" 211 | }, 212 | { 213 | "mentor": "Samuel Andrew McIntyre", 214 | "project": "Convert Derby tests to JUnit and fix Derby bugs", 215 | "student": "Ravinder Reddy Pandiri" 216 | }, 217 | { 218 | "mentor": "Jukka Zitting", 219 | "project": "jackrabbit-jcr-demo: μAssessment (muAssessment) Testing System", 220 | "student": "Pavel Konnikov" 221 | }, 222 | { 223 | "mentor": "Andrea Gabriellini", 224 | "project": "Motion estimation and mode decision in Dirac", 225 | "student": "Andrew Lewis" 226 | }, 227 | { 228 | "mentor": "Michael Philip Sparks", 229 | "project": "A file handle like interface to backgrounded Kamaelia components", 230 | "student": "Patrick Thomson" 231 | }, 232 | { 233 | "mentor": "Matt Hammond", 234 | "project": "AIM/IRC client for Kamaelia", 235 | "student": "Jinna Lei" 236 | }, 237 | { 238 | "mentor": "Michael Philip Sparks", 239 | "project": "Visual Editor for Creation & Composition of Shard Components", 240 | "student": "tara gilliam" 241 | } 242 | ] 243 | -------------------------------------------------------------------------------- /Data/projects/2008.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mentor": "Dominic Lachowicz", 4 | "project": "Getting rid of non-portable library use in AbiWord/Gtk+", 5 | "student": "Robert Staudinger" 6 | }, 7 | { 8 | "mentor": "J.M. Maurer", 9 | "project": "Improve LaTeX Support", 10 | "student": "Xun Sun" 11 | }, 12 | { 13 | "mentor": "Martin Edmund Sevior", 14 | "project": "AbiWord Multiple Page View", 15 | "student": "James Denton" 16 | }, 17 | { 18 | "mentor": "Hubert Figuiere", 19 | "project": "Rethinking Styles", 20 | "student": "Ryan Pavlik" 21 | }, 22 | { 23 | "mentor": "Kamran Khan", 24 | "project": "Proposal for developing OOXML Export plugin for AbiWord", 25 | "student": "Firat Kiyak" 26 | }, 27 | { 28 | "mentor": "Peter Hosey", 29 | "project": "Unit Tests and Documentation for Adium", 30 | "student": "Branton John Homer" 31 | }, 32 | { 33 | "mentor": "David Smith", 34 | "project": "Data-detectors for Adium", 35 | "student": "Geoffrey Foster" 36 | }, 37 | { 38 | "mentor": "Robert James Tarling", 39 | "project": "UI Code generation for ArgoUML property panels", 40 | "student": "Christian Lopez Espinola" 41 | }, 42 | { 43 | "mentor": "Luís Sérgio Valente de Oliveira", 44 | "project": "Usability issues of diagrams", 45 | "student": "Bogdan SZANTO" 46 | }, 47 | { 48 | "mentor": "Jan Magne Andersen", 49 | "project": "C# Gene /Generating UML diagrams from C# source files", 50 | "student": "Thilina Hasantha" 51 | }, 52 | { 53 | "mentor": "Pistol Constandache Bogdan Ciprian", 54 | "project": "ArgoEclipse", 55 | "student": "Brian Hudson" 56 | }, 57 | { 58 | "mentor": "Michiel van der Wulp", 59 | "project": "Improving the Critics Subsystem", 60 | "student": "Marcos Aurelio Almeida da Silva" 61 | }, 62 | { 63 | "mentor": "Ant Elder", 64 | "project": "Tuscany SCA Support in the Geronimo Admin Console", 65 | "student": "Thilina Mahesh Buddhika" 66 | }, 67 | { 68 | "mentor": "Adriano Crestani Campos", 69 | "project": "Allow Google Android applications to easily consume business services (version 2.0 - 6Apr2008 @17.50)", 70 | "student": "Oscar Castaneda" 71 | }, 72 | { 73 | "mentor": "Alexei Y. Zakharov", 74 | "project": "harmony-demo-1", 75 | "student": "C. D. Tharindu Mathew" 76 | }, 77 | { 78 | "mentor": "Marnie McCormack", 79 | "project": "CLI for extract information from Apache Qpid Java broker", 80 | "student": "Lahiru Mananada Gunathilake" 81 | }, 82 | { 83 | "mentor": "Tammo van Lessen", 84 | "project": "Ajax-based Monitoring Console for Apache ODE", 85 | "student": "Milinda Lakmal Pathirage" 86 | }, 87 | { 88 | "mentor": "Luciano Resende", 89 | "project": "Integrate Google Services in SCA Compositions", 90 | "student": "Douglas Siqueira Leite" 91 | }, 92 | { 93 | "mentor": "Myrna van Lunteren", 94 | "project": "derby-testandfix - Convert Derby tests to JUnit and fix Derby bugs", 95 | "student": "Erlend Birkenes" 96 | }, 97 | { 98 | "mentor": "Khaled Noaman", 99 | "project": "Implementing Conditional Type Assignment for Apache Xerces2-J", 100 | "student": "Hiranya Jayathilaka" 101 | }, 102 | { 103 | "mentor": "Katherine Marsden", 104 | "project": "Convert Apache Derby tests to JUnit and fix Derby bugs", 105 | "student": "Umayanga Suran Jayathilaka" 106 | }, 107 | { 108 | "mentor": "Zhaohui Feng", 109 | "project": "CORBA support for Apache Tuscany", 110 | "student": "Wojciech Janiszewski" 111 | }, 112 | { 113 | "mentor": "Grant Ingersoll", 114 | "project": "Codename Mahout.GA for mahout-machine-learning", 115 | "student": "Abdel Hakim Deneche" 116 | }, 117 | { 118 | "mentor": "Luciano Resende", 119 | "project": "Integrate Google services in SCA compositions(Apache Tuscany)", 120 | "student": "Haibo Zhao" 121 | }, 122 | { 123 | "mentor": "Nandana Sampath Mihindukulasooriya", 124 | "project": "Basic Security Profile (BSP) 1.0 Validation for Apache Rampart", 125 | "student": "Heshan Suriyaarachchi" 126 | }, 127 | { 128 | "mentor": "Sian January", 129 | "project": "Policytool: command line tool for Harmony", 130 | "student": "András Belicza" 131 | }, 132 | { 133 | "mentor": "Bertrand Delacretaz", 134 | "project": "Make the Scala Language usable in Sling.", 135 | "student": "Janandith Uditha Jayawardena" 136 | }, 137 | { 138 | "mentor": "Reinhard Poetz", 139 | "project": "Cocoon block migration and development of new examples", 140 | "student": "Lukas Fridolin Lang" 141 | }, 142 | { 143 | "mentor": "Nandika Jayawardana", 144 | "project": "Axis2/C CGI application", 145 | "student": "Nikola Tankovic" 146 | }, 147 | { 148 | "mentor": "Myrna van Lunteren", 149 | "project": "derby-testandfix/Convert Derby tests to JUnit and fix Derby bugs", 150 | "student": "Junjie Peng" 151 | }, 152 | { 153 | "mentor": "Pallewela Mohottige Dumindu Perera", 154 | "project": "XPath implementation for Axis2/C", 155 | "student": "Varuna Parinda Jayasiri" 156 | }, 157 | { 158 | "mentor": "Ian Holsman", 159 | "project": "To implement Complementary Naïve Bayes algorithm using Map Reduce for Multicore Systems", 160 | "student": "Robin Anil" 161 | }, 162 | { 163 | "mentor": "Dave Johnson", 164 | "project": "Open ID support for Roller blog server", 165 | "student": "Tatyana Tokareva" 166 | }, 167 | { 168 | "mentor": "Ryan McKinley", 169 | "project": "SOLR: Create a javascript client library for Apache Solr", 170 | "student": "Matthias Epheser" 171 | }, 172 | { 173 | "mentor": "Kevin Joseph Menard", 174 | "project": "Cayenne - Cayenne Modeler Improvements", 175 | "student": "Andrey Razumovsky" 176 | } 177 | ] 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GSoC Data 2 | 3 | All the data from [GSoC-archive](https://developers.google.com/open-source/gsoc/past-summers) in JSON format. 4 | 5 | 6 | __NOTE__ 7 | For running the scrapers you must install the following dependencies 8 | * asyncio 9 | * aiohttp 10 | 11 | You can do that by running: `pip install ayncio aiohttp` 12 | 13 | # Directories 14 | 15 | * `Data/` 16 | + `orgs/` - all orgs that have been a part of GSoC from 2005 to 2017 17 | 18 | + `projects/` - all projects that are completed under GSoC program from year 2005-2017 19 | 20 | * `Scrapers/` 21 | - Contains all the scrapers used for scraping the data 22 | 23 | # Data 24 | 25 | ### `orgs/` 26 | 27 | * `2005.json` - `2008.json` 28 | - `link`: URL of the org 29 | - `name`: Name of the org 30 | 31 | * `2009-2013.json` 32 | - `about`: Work that org do 33 | - `link`: URL of the org 34 | - `mail`: Mailing list of the org 35 | - `name`: Name of the org 36 | - `page`: Idea page of the org 37 | 38 | * `2014-2015.json` 39 | - `link`: URL of the org 40 | - `mail`: Mailing list of the org 41 | - `page`: Idea page of the org 42 | - `name`: Name of the org selected 43 | 44 | * `2016-2017.json` 45 | - `about`: Info about the organization 46 | - `link`: URL of the org 47 | - `name`: Name of the org 48 | 49 | ### `projects/` 50 | 51 | * `2005.json` - `2008.json` 52 | - `Mentor`: Name of the mentor of the project 53 | - `project`: Name of the project 54 | - `student`: Name of the student 55 | 56 | * `2009-2013.json` & `2014-2015.json` 57 | - `Organization`: Name of the organization 58 | - `detail`: Detail about the project 59 | - `link`: Link to the project 60 | - `student`: Name of the student selected 61 | - `title`: Name of the project 62 | 63 | * `2016-2017.json` 64 | - `Organization`: Name of the organization 65 | - `link`: Link to the project 66 | - `mentors`: Name of the mentors 67 | - `student`: Name of the student 68 | - `title`: Name of the project 69 | 70 | 71 | # What can be done with the data? 72 | 73 | This data will be used for improving the functionality of [Soccer](http://github.com/dufferzafar/Soccer/). 74 | 75 | It can also be used to generate various stats, plots or answer data-related questions like: 76 | 77 | - Who did the most number of GSoCs? under which org? 78 | - Which org has the highest sutdent-to-mentor conversion rate? (students who first did GSoC under the org, and then became mentors) 79 | - Run some magic on the descriptions of projects over the years to find out if there is a trend of ML related projects. 80 | 81 | etc. etc. 82 | 83 | --- 84 | 85 | Feel free to open issues to discuss any more ideas! 86 | -------------------------------------------------------------------------------- /Scrapers/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | import aiohttp 3 | from bs4 import BeautifulSoup 4 | 5 | 6 | async def get_page(url): 7 | async with aiohttp.ClientSession() as session: 8 | async with session.get(url) as response: 9 | soup = await response.read() 10 | 11 | return BeautifulSoup(soup.decode('utf-8'), "lxml") 12 | 13 | 14 | def dumper(json_data, json_file): 15 | """Makes json file from the given data""" 16 | with open(json_file, 'w') as f: 17 | json.dump(json_data, f) 18 | -------------------------------------------------------------------------------- /Scrapers/developer-scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scraper for developer-google GSoC archive 3 | Get organizations and projects information from 2005-2008 4 | """ 5 | 6 | import asyncio 7 | from os.path import join, basename 8 | from aiocache import SimpleMemoryCache 9 | from common import get_page, dumper 10 | 11 | developer = "https://developers.google.com" 12 | cache = SimpleMemoryCache() 13 | 14 | def get_info(soup): 15 | """Gets information about orgs and projects from year 2005-2008 16 | """ 17 | organizations, proj = [], [] 18 | 19 | org_sections = soup.find( 20 | 'div', {'itemprop': 'articleBody'}).find_all('section')[1:] 21 | 22 | try: 23 | for orgs in org_sections: 24 | org_name = orgs.find('h2').text 25 | link = orgs.find('a').text 26 | projects = orgs.find('ul').find_all('li') 27 | for p in projects: 28 | project = p.find('h4') 29 | details = project.nextSibling.strip().split(',') 30 | student = details[0].replace('by ', '') 31 | mentor = details[1].replace(' mentored by ', '') 32 | proj.append({'project': project.text, 'student': student, 33 | 'mentor': mentor}) 34 | organizations.append({'name': org_name, 'link': link}) 35 | 36 | except AttributeError: 37 | print(link) 38 | 39 | return organizations, proj 40 | 41 | 42 | def main(): 43 | orgs_data = {} 44 | projects_data = {} 45 | for year in range(2005, 2009): 46 | url = developer + '/open-source/gsoc/{yr}/'.format(yr=year) 47 | 48 | loop = asyncio.get_event_loop() 49 | soup = loop.run_until_complete(get_page(url)) 50 | orgs, projects = get_info(soup) 51 | 52 | orgs_data[year] = orgs 53 | projects_data[year] = projects 54 | 55 | dumper(orgs_data, "2005-2008.json") 56 | dumper(projects_data, "2005-2008.json") 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /Scrapers/melange-scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a scarper for google-melange GSoC archive 3 | It only works for the year 2009-2015. 4 | """ 5 | import re 6 | import asyncio 7 | from os.path import join, basename 8 | from common import get_page, dumper 9 | 10 | melange = "https://www.google-melange.com" 11 | 12 | 13 | def grab_project_links(soup): 14 | """Gets links of particular projects 15 | """ 16 | project_urls = [] 17 | valid_project_url = "/?archive/?gsoc/\d+[0-9]/orgs/[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+.html" 18 | try: 19 | # Grab links to all the projects 20 | all_link = soup.find_all("a") 21 | for link in all_link: 22 | if re.match(valid_project_url, link.get("href")): 23 | project_urls.append(join(melange, link.get("href")[1:])) 24 | except TypeError: 25 | print(link) 26 | 27 | return project_urls 28 | 29 | 30 | async def org_info_above_14(orgs_urls14): 31 | """Scarpe information about orgs of year 2014 and 2015 32 | :orgs_urls14: list of urls of year 2014 and 2015 33 | """ 34 | org_info_14 = [] 35 | project_urls_from14 = [] 36 | for url in orgs_urls14: 37 | try: 38 | soup = await get_page(url) 39 | org_name = basename(url) 40 | org_info = soup.find_all('p') 41 | web_page = org_info[1].text.splitlines()[-1].strip() 42 | mailing_list = org_info[2].text.split(":")[-1].strip() 43 | description = soup.find('div', {'class': 'main mdl-cell mdl-cell--8-col\ 44 | mdl-card mdl-shadow--4dp'}) 45 | detail = description.find_all('p')[2].nextSibling 46 | org_info_14.append({'name': org_name, 'page': web_page, 47 | 'about': detail, 'mail': mailing_list, 48 | 'link': url}) 49 | project_urls_from14.extend(grab_project_links(soup)) 50 | except IndexError: 51 | print(url) 52 | 53 | return org_info_14, get_project_info(project_urls_from14) 54 | 55 | 56 | async def org_info_below_13(org_urls13): 57 | """Scrape information about the orgs from 2009-2013 58 | :org_urls13: list of urls for all the orgs 59 | """ 60 | org_info_till13 = [] 61 | project_urls_till13 = [] 62 | for url in org_urls13: 63 | # General information about the org 64 | try: 65 | soup = await get_page(url) 66 | org_name = basename(url) 67 | org_info = soup.find_all('p') 68 | web_page = org_info[0].text.splitlines()[-1].strip() 69 | mailing_list = org_info[1].text.split(":")[-1].strip() 70 | detail = org_info[2].text 71 | org_info_till13.append({'name': org_name, 'about': detail, 72 | 'page': web_page, 'mail': mailing_list, 73 | 'link': url}) 74 | project_urls_till13.extend(grab_project_links(soup)) 75 | 76 | except IndexError: 77 | print(url) 78 | 79 | return org_info_till13, get_project_info(project_urls_till13) 80 | 81 | 82 | async def get_project_info(project_urls): 83 | """Get detail information of projects from given links 84 | :project_urls: list of all the project urls 85 | """ 86 | project_info = [] 87 | for url in project_urls: 88 | soup = await get_page(url) 89 | about = soup.find_all("p") 90 | title = soup.find("h3").text 91 | student = about[0].text.splitlines()[2].strip() 92 | details = about[1].text 93 | name = about[0].find("a").text 94 | project_info.append({'Organization': name, 'title': title, 95 | 'student': student, 'details': details, 96 | 'link': url}) 97 | 98 | return project_info 99 | 100 | 101 | async def All_orgs(): 102 | """Get links of all orgs from 2009 to 2015 103 | 104 | Makes two separate list: 105 | links_13 - links of all the Organization from 2009-2013 106 | links_14 - links of all the Organization 2014 and 2015 107 | """ 108 | 109 | links_13 = [] 110 | links_14 = [] 111 | valid_url = "/?archive/?gsoc/\d+[0-9]/orgs/[a-zA-Z]+" 112 | for year in range(2009, 2016): 113 | year_url = melange + "/archive/gsoc/{}".format(year) 114 | soup = await get_page(year_url) 115 | 116 | for url in soup.find_all('a'): 117 | if re.match(valid_url, url.get("href")): 118 | if year <= 2013: 119 | links_13.append(join(melange, url.get("href")[1:])) 120 | else: 121 | links_14.append(join(melange, url.get("href")[1:])) 122 | return links_13, links_14 123 | 124 | 125 | def main(): 126 | loop = asyncio.get_event_loop() 127 | orgs_13, orgs_14 = loop.run_until_complete(All_orgs()) 128 | org13, project13 = loop.run_until_complete(org_info_below_13(orgs_13)) 129 | org14, projects14 = loop.run_until_complete(org_info_above_14(orgs_14)) 130 | 131 | dumper(org13, "2009-2013.json") 132 | dumper(project13, "2009-2013.json") 133 | dumper(org14, "2014-2015.json") 134 | dumper(projects14, "2014-2015.json") 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /Scrapers/summerofcode-scraper.py: -------------------------------------------------------------------------------- 1 | """Scraper for new summerofcode archive 2 | Gets Get organizations and projects information of year 2016-2017 3 | """ 4 | import re 5 | import sys 6 | import asyncio 7 | from os.path import join 8 | from common import get_page, dumper 9 | 10 | 11 | URL = 'https://summerofcode.withgoogle.com' 12 | 13 | 14 | async def project_details(project_links: list): 15 | """Get all the information for specific project 16 | 17 | :project_links: List of all the Organization on one page 18 | """ 19 | project = [] 20 | for projects in project_links: 21 | link = join(URL, projects[1:-1]) 22 | soup = await get_page(link) 23 | title = soup.find('h3', {'class': 'banner__title'}).text 24 | org_data = soup.find( 25 | 'main', {'class': 'app-body'}).find('div', {'class': 'org__meta'}) 26 | try: 27 | org = org_data.find_all('div')[1].find('div').text 28 | student_name = org_data.find_all('div')[3].find('div').text 29 | mentors = org_data.find_all('div')[5].find('ul').find_all('li') 30 | mentors = [mentor.text for mentor in mentors] 31 | project.append({"Organization": org, "title": title, 32 | "student": student_name, "mentors": mentors, 33 | "link": projects}) 34 | except AttributeError: 35 | print(title) 36 | 37 | return project 38 | 39 | 40 | async def orgs_information(orgs_list: list): 41 | """Get all the information about an organizations 42 | Also grabs links for the project under each org. 43 | 44 | :orgs_list: List of urls for each org 45 | """ 46 | 47 | orgs_info = [] 48 | project_links = [] 49 | project_valid_url = '/?archive/\d+/projects/\d+[0-9]/' 50 | 51 | for org in orgs_list: 52 | topics = [] 53 | techs = [] 54 | 55 | url = join(URL, org[1:-1]) 56 | soup = await get_page(url) 57 | 58 | name = soup.find('h2', {'class': 'md-display-1'}).text 59 | about = soup.find('div', {'class': 'org__long-description'}).text 60 | idea = soup.find('md-button', {'target': '_blank'}).get('href') 61 | for topic in soup.find_all('li', {'class': 'organization__tag organization__tag--topic'}): 62 | topics.append(topic.text) 63 | 64 | for tech in soup.find_all('li', {'class': 'organization__tag organization__tag--technology'}): 65 | techs.append(tech.text) 66 | 67 | for channel in soup.find_all('md-button'): 68 | info = channel.get('href') 69 | 70 | if info: 71 | ch = channel.text.split(" ") 72 | if 'IRC' in ch: 73 | irc = channel.get("href") 74 | else: 75 | irc = "" 76 | if 'list' in info: 77 | mailing_list = info 78 | elif info.startswith('mailto'): 79 | contact = info 80 | else: 81 | mailing_list = "" 82 | 83 | # Get projects links of an orgs 84 | for links in soup.find_all('a'): 85 | if re.match(project_valid_url, links.get('href')): 86 | project_links.append(links.get('href')) 87 | 88 | orgs_info.append({'Organization': name, 'About': about, 'URL': url, 89 | 'Technologies': techs, 'Topics': topics, 'Mailing-list': mailing_list, 90 | 'IRC': irc, 'contact': contact, 'Idea-page': idea}) 91 | 92 | return orgs_info, project_links 93 | 94 | 95 | async def orgs_links(year): 96 | """Get links of all the organizations from 2016-2017""" 97 | 98 | orgs_list = [] 99 | valid_urls = "/?[a-z]+/?\\d+[0-9]/[a-z]+/?\\d+[0-9]/" 100 | 101 | orgs_url = join(URL, "archive/{yr}/organizations/".format(yr=year)) 102 | soup = await get_page(orgs_url) 103 | for link in soup.find_all('a'): 104 | if re.match(valid_urls, link.get('href')): 105 | orgs_list.append(link.get('href')) 106 | 107 | return orgs_list 108 | 109 | 110 | def main(): 111 | """Maintains all the other functions and generates JSON file""" 112 | 113 | if len(sys.argv) > 1: 114 | year = sys.argv[1] 115 | print("Scraping data for:", year) 116 | else: 117 | print("\nUSAGE: python summerofcode-scraper.py ") 118 | exit(1) 119 | 120 | loop = asyncio.get_event_loop() 121 | all_orgs = loop.run_until_complete(orgs_links(year)) 122 | organizations, project_links = loop.run_until_complete(orgs_information(all_orgs)) 123 | projects = loop.run_until_complete(project_details(project_links)) 124 | 125 | dumper(organizations, 'orgs_' + year + '.json') 126 | dumper(projects, '2018.json') 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | --------------------------------------------------------------------------------