├── 404.html
├── local.css
├── _plugins
    ├── ext.rb
    ├── mdhtml.rb
    └── debug.rb
├── files
    ├── mslogo.png
    ├── rulogo.gif
    ├── schema.pdf
    ├── schema.png
    ├── tudelftlogo.png
    └── ghtorrent-data.pdf
├── vm.md
├── README.md
├── Gemfile
├── .gitignore
├── assets
    └── themes
    │   └── twitter
    │       ├── bootstrap
    │           ├── img
    │           │   ├── glyphicons-halflings.png
    │           │   └── glyphicons-halflings-white.png
    │           └── css
    │           │   └── bootstrap-responsive.min.css
    │       └── css
    │           └── style.css
├── sitemap.txt
├── pages.html
├── _layouts
    ├── page.html
    └── default.html
├── dumps
    ├── update-downloads.sh
    ├── index.erb
    ├── run-all.sh
    ├── index.rb
    └── ght-periodic-dump
├── stats
    ├── extract-events.sh
    ├── extract-stats.sh
    ├── genstats.sh
    ├── index.md
    └── api-stats.R
├── docs.md
├── _includes
    └── comments.html
├── atom.xml
├── _config.yml
├── contrib.md
├── downloads.md
├── basedupon.md
├── lean.html
├── mysql.md
├── services.md
├── raw.md
├── pullreq-perf
    ├── openess-report.R
    ├── index.Rmd
    └── report.Rmd
├── vissoft14.md
├── syntax.css
├── geninst.md
├── mongo.md
├── cookbook.md
├── leanprogress.html
├── index.md
├── gcloud.md
├── streaming.md
├── pers-data.md
├── halloffame.md
├── ght-ubuntu.md
├── _bibliography
    └── references.bib
├── faq.md
├── msr14.md
├── Rakefile
└── relational.md


/404.html:
--------------------------------------------------------------------------------
1 | Sorry this page does not exist =(
2 | 


--------------------------------------------------------------------------------
/local.css:
--------------------------------------------------------------------------------
1 | h4 {
2 |   font-weight: bold;
3 | }
4 | 


--------------------------------------------------------------------------------
/_plugins/ext.rb:
--------------------------------------------------------------------------------
1 | require 'jekyll/scholar'
2 | require 'pp'
3 | puts 'Loaded scholar'
4 | 
5 | 


--------------------------------------------------------------------------------
/files/mslogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/mslogo.png


--------------------------------------------------------------------------------
/files/rulogo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/rulogo.gif


--------------------------------------------------------------------------------
/files/schema.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.pdf


--------------------------------------------------------------------------------
/files/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.png


--------------------------------------------------------------------------------
/vm.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Client VM 
4 | tagline: 
5 | ---
6 | 
7 | Under Construction!
8 | 


--------------------------------------------------------------------------------
/files/tudelftlogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/tudelftlogo.png


--------------------------------------------------------------------------------
/files/ghtorrent-data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/ghtorrent-data.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Installing
2 | 
3 | ````
4 | su
5 | apt-get install ruby
6 | gem install jekyll jekyll-scholar
7 | ````
8 | 
9 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | gem 'rouge'
4 | gem 'rdiscount'
5 | gem 'jekyll-watch'
6 | gem 'jekyll-scholar'
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | _site/*
 2 | _theme_packages/*
 3 | pullreq-perf/*/
 4 | *~
 5 | Thumbs.db
 6 | .DS_Store
 7 | 
 8 | !.gitkeep
 9 | 
10 | .rbenv-version
11 | .rvmrc
12 | 


--------------------------------------------------------------------------------
/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png


--------------------------------------------------------------------------------
/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png


--------------------------------------------------------------------------------
/sitemap.txt:
--------------------------------------------------------------------------------
1 | ---
2 | # Remember to set production_url in your _config.yml file!
3 | title : Sitemap
4 | ---
5 | {% for page in site.pages %}
6 | {{site.production_url}}{{ page.url }}{% endfor %}
7 | {% for post in site.posts %}
8 | {{site.production_url}}{{ post.url }}{% endfor %}


--------------------------------------------------------------------------------
/pages.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Pages
 4 | header: Pages
 5 | group: navigation
 6 | ---
 7 | 
 8 | <h2>All Pages</h2>
 9 | <ul>
10 | {% for page in site.pages %}
11 |   <li><a href="{{ page.url }}">{{ page.title }}</a></li>
12 | {% endfor %}  <!-- page -->
13 | </ul>
14 | 


--------------------------------------------------------------------------------
/_layouts/page.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | ---
 4 | 
 5 | <div class="page-header">
 6 |   <h1>{{ page.title }} {% if page.tagline %} <small>{{ page.tagline }}</small>{% endif %}</h1>
 7 | </div>
 8 | 
 9 | <div class="row">
10 |   <div class="span12">
11 |     {{ content }}
12 |   </div>
13 | </div>
14 | 


--------------------------------------------------------------------------------
/_plugins/mdhtml.rb:
--------------------------------------------------------------------------------
 1 | module Jekyll
 2 |   class MarkdownBlock < Liquid::Block
 3 |     def initialize(tag_name, text, tokens)
 4 |       super
 5 |     end
 6 |     require "kramdown"
 7 |     def render(context)
 8 |       content = super
 9 |       "#{Kramdown::Document.new(content).to_html}"
10 |     end
11 |   end
12 | end
13 | Liquid::Template.register_tag('markdown', Jekyll::MarkdownBlock)
14 | 


--------------------------------------------------------------------------------
/dumps/update-downloads.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Rebuild the "Available Downloads" section of downloads.md from the
 3 | # listing printed by index.rb, then commit and push the result.
 4 | 
 5 | # Stop early if the listing cannot be generated or the checkout is missing;
 6 | # otherwise the commands below would publish an empty table or run git and
 7 | # rm in whatever directory the script was started from.
 8 | ./index.rb downloads/ > torrents || exit 1
 9 | cd ghtorrent.org || exit 1
10 | # Keep everything above the existing "### Available" heading.
11 | sed -n '/### Available/q;p' downloads.md > dl.tmp
12 | echo "### Available Downloads" >> dl.tmp
13 | cat dl.tmp ../torrents > downloads.md
14 | git stash
15 | git pull
16 | git stash pop
17 | git commit -a -m "Dump $(date +'%Y-%m-%d')"
18 | git push
19 | rm dl.tmp
20 | cd -
21 | rm torrents
22 | 


--------------------------------------------------------------------------------
/stats/extract-events.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # doit LOGFILE
 4 | # Reduce the lines of LOGFILE that contain "Processed" to the fields
 5 | # captured by the perl substitution; the final ruby step prints its first
 6 | # field as epoch seconds, followed by the next two fields.
 7 | doit() {
 8 |   # Quoted so that a path containing spaces reaches grep as one argument.
 9 |   grep Processed "$1"|
10 |   perl -lape 's/\[([T0-9-:.]*)\ .*\].* event:\ ([^-]*)/$1 $2/'|
11 |   cut -f2,3 -d' '|
12 |   cut -f1,2,3 -d'-'|
13 |   ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t).    to_i," ", i, " ", d;'
14 | }
15 | 
16 | # Exported so the shells started by parallel can call the function.
17 | export -f doit
18 | 
19 | find mirror -type f|grep log.txt| parallel -j10 doit {}
20 | 


--------------------------------------------------------------------------------
/stats/extract-stats.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # doit LOGFILE
 4 | # Reduce the "APIClient" lines of LOGFILE (WARN lines excluded) to the
 5 | # fields captured by the perl substitution; the ruby step prints its first
 6 | # field as epoch seconds, followed by the next two fields. Lines that
 7 | # contain "#" are dropped at the end.
 8 | doit() {
 9 |   # Quoted so that a path containing spaces reaches grep as one argument.
10 |   grep APIClient "$1"|
11 |   grep -v WARN  |
12 |   perl -lape 's/\[([T0-9-:.]*).*\] DEBUG.*\[([0-9.]*)\].*Total: ([0-9]*) ms/$1 $2 $3/'|
13 |   cut -f2,3,4 -d' '|
14 |   ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t).to_i," ", i, " ", d;'|
15 |   grep -v "#"
16 | }
17 | 
18 | # Exported so the shells started by parallel can call the function.
19 | export -f doit
20 | 
21 | find mirror -type f|grep log.txt| parallel -j10 doit {}
22 | 


--------------------------------------------------------------------------------
/stats/genstats.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Run the two extract scripts on host dutihr, plot the results with R and
 3 | # rebuild the site.
 4 | 
 5 | # API statistics: header line followed by the remote script's output.
 6 | scp ./extract-stats.sh dutihr:~
 7 | ssh dutihr /home/gousiosg/extract-stats.sh > foo
 8 | { echo "ts ip ms"; cat foo; } > data.txt
 9 | 
10 | # Events: sed strips one leading space from each line of remote output.
11 | scp ./extract-events.sh dutihr:~
12 | ssh dutihr /home/gousiosg/extract-events.sh |sed -e 's/^ \(.*\)$/\1/' > foo
13 | { echo "ts event"; cat foo; } >events.txt
14 | 
15 | R --no-save < api-stats.R
16 | 
17 | cd ..
18 | jekyll build
19 | 
20 | 


--------------------------------------------------------------------------------
/stats/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Statistics 
 4 | tagline: 
 5 | ---
 6 | 
 7 | <table class="table table-hover table-condensed">
 8 |   <thead>
 9 |     <tr>
10 |       <th></th>
11 |       <th></th>
12 |     </tr>
13 |   </thead>
14 |   <tbody>
15 |     <tr>
16 |       <td><img src="api-resp.png" alt="API response time timeseries plot"></td>
17 |       <td><img src="num-reqs.png" alt="Number of requests per timeslot
18 |       timeseries plot"></td>
19 |       </tr>
20 |       <tr>
21 |       <td><img src="events-per-day.png" alt="Events per day plot"></td>
22 |       <td><img src="resp-ip-boxplot.png" alt="API response time per IP boxplot"></td>
23 |       </tr>
24 |   </tbody>
25 | </table>
26 | 


--------------------------------------------------------------------------------
/docs.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: The GHTorrent documentation
 4 | tagline:
 5 | ---
 6 | 
 7 | ## Data formats
 8 | 
 9 | * [The relational data schema](relational.html)
10 | * [Collections in the MongoDB database](mongo.html)
11 | 
12 | ## Installing and running
13 | 
14 | <button type="button" class="btn btn-success">New!</button> You can now use the
15 | [GHTorrent Vagrant](https://github.com/ghtorrent/ghtorrent-vagrant) box to setup
16 | a testing/development environment for GHTorrent! The GHTorrent Vagrant box
17 | completely automates the process below.
18 | 
19 | * [Generic installation instructions](geninst.html)
20 | * [Installing on Ubuntu 10.10](ght-ubuntu.html)
21 | * [The GHTorrent cookbook](cookbook.html)
22 | 
23 | 


--------------------------------------------------------------------------------
/dumps/index.erb:
--------------------------------------------------------------------------------
 1 | 
 2 | List of available torrents (Last dump date: <%= @last_update %>)
 3 | 
 4 | <table class="table table-hover table-condensed">
 5 |   <thead>
 6 |   <tr>
 7 |    <th>Dump date</th>
 8 |    <% @collections.sort.each do |c| %>
 9 |    <th><%=c.gsub("_", " ") %></th>
10 |      <% end %>
11 |   </tr>
12 |   </thead>
13 |   <tbody>
14 |    <% for @d in @dumps.sort{ |a,b| a.date <=> b.date} %>
15 |    <tr>
16 |     <td><%= @d.date %></td>
17 |     <% for @col in @collections.sort %>
18 |     <% @t = @d.torrents[@col] %>
19 |       <td><% unless @t.nil? %>
20 |           <a href="<%= @t.url %>"><%= @t.size%> MB</a>
21 |         <% end %>
22 |       </td>
23 |     <% end %>
24 |   </tr>
25 |   <% end %>
26 |   </tbody>
27 | </table>
28 | 
29 | 


--------------------------------------------------------------------------------
/_includes/comments.html:
--------------------------------------------------------------------------------
 1 | <div id="disqus_thread"></div>
 2 | <script type="text/javascript">
 3 | var disqus_shortname = 'ghtorrent'; 
 4 | 
 5 | /* * * DON'T EDIT BELOW THIS LINE * * */
 6 | (function() {
 7 |  var dsq = document.createElement('script');
 8 |  dsq.type = 'text/javascript'; dsq.async = true;
 9 |  dsq.src = '//' + disqus_shortname +
10 |  '.disqus.com/embed.js';
11 |  (document.getElementsByTagName('head')[0]
12 |   ||
13 |   document.getElementsByTagName('body')[0]).appendChild(dsq);
14 |  })();
15 | </script>
16 | <noscript>Please enable JavaScript to view the 
17 |   <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a>
18 | </noscript>
19 | <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
20 | 


--------------------------------------------------------------------------------
/atom.xml:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: nil
 3 | title : Atom Feed
 4 | ---
 5 | <?xml version="1.0" encoding="utf-8"?>
 6 | <feed xmlns="http://www.w3.org/2005/Atom">
 7 |  
 8 |  <title>{{ site.title }}</title>
 9 |  <link href="{{ site.production_url }}/atom.xml" rel="self"/>
10 |  <link href="{{ site.production_url }}"/>
11 |  <updated>{{ site.time | date_to_xmlschema }}</updated>
12 |  <id>{{ site.production_url }}</id>
13 |  <author>
14 |    <name>{{ site.author.name }}</name>
15 |    <email>{{ site.author.email }}</email>
16 |  </author>
17 | 
18 |  {% for post in site.posts %}
19 |  <entry>
20 |    <title>{{ post.title }}</title>
21 |    <link href="{{ site.production_url }}{{ post.url }}"/>
22 |    <updated>{{ post.date | date_to_xmlschema }}</updated>
23 |    <id>{{ site.production_url }}{{ post.id }}</id>
24 |    <content type="html">{{ post.content | xml_escape }}</content>
25 |  </entry>
26 |  {% endfor %}
27 |  
28 | </feed>


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
 1 | permalink: /:categories/:year/:month/:day/:title
 2 | 
 3 | exclude: [".rvmrc", ".rbenv-version", "README.md", "Rakefile", "changelog.md"]
 4 | highlighter: "rouge"
 5 | #markdown: kramdown
 6 | markdown: rdiscount
 7 | 
 8 | # Themes are encouraged to use these universal variables
 9 | # so be sure to set them if your theme uses them.
10 | title : GHTorrent
11 | tagline: Query Github data!
12 | author :
13 |   name : Georgios Gousios
14 |   email : gousiosg@gmail.com
15 |   github : gousiosg
16 |   twitter : gousiosg
17 |   feedburner :
18 | 
19 | production_url : http://www.ghtorrent.org
20 | 
21 | scholar:
22 |   style: apa
23 |   locale: en
24 | 
25 |   sort_by: none
26 |   order: ascending
27 | 
28 |   source: ./_bibliography
29 |   bibliography: references
30 |   bibliography_template: "{{reference}}"
31 | 
32 |   replace_strings: true
33 | 
34 |   details_dir:    bibliography
35 |   details_layout: bib_details.html
36 |   details_link:   more...
37 | 
38 |   query: "@*"
39 | 


--------------------------------------------------------------------------------
/_plugins/debug.rb:
--------------------------------------------------------------------------------
 1 | # A simple way to inspect liquid template variables.
 2 | # Usage:
 3 | #  Can be used anywhere liquid syntax is parsed (templates, includes, posts/pages)
 4 | #  {{ site | debug }}
 5 | #  {{ site.posts | debug }}
 6 | #
 7 | require 'pp'
 8 | module Jekyll
 9 |   # Need to overwrite the inspect method here because the original
10 |   # uses < > to encapsulate the psuedo post/page objects in which case
11 |   # the output is taken for HTML tags and hidden from view.
12 |   #
13 |   class Post
14 |     def inspect
15 |       "#Jekyll:Post @id=#{self.id.inspect}"
16 |     end
17 |   end
18 |   
19 |   class Page
20 |     def inspect
21 |       "#Jekyll:Page @name=#{self.name.inspect}"
22 |     end
23 |   end
24 |   
25 | end # Jekyll
26 |   
27 | module Jekyll
28 |   module DebugFilter
29 |     
30 |     def debug(obj, stdout=false)
31 |       puts obj.pretty_inspect if stdout
32 |       "<pre>#{obj.class}\n#{obj.pretty_inspect}</pre>"
33 |     end
34 | 
35 |   end # DebugFilter
36 | end # Jekyll
37 | 
38 | Liquid::Template.register_filter(Jekyll::DebugFilter)


--------------------------------------------------------------------------------
/dumps/run-all.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs ght-periodic-dump over consecutive -f/-t date windows; earlier windows and every update-downloads.sh call are commented out.
 3 | #./ght-periodic-dump -f '2012-01-31 00:00' -t '2012-03-31 00:00'
 4 | #./update-downloads.sh
 5 | 
 6 | #./ght-periodic-dump -f '2012-03-31 00:00' -t '2012-05-31 00:00'
 7 | #./update-downloads.sh
 8 | 
 9 | #./ght-periodic-dump -f '2012-05-31 00:00' -t '2012-07-31 00:00'
10 | #./update-downloads.sh
11 | 
12 | #./ght-periodic-dump -f '2012-07-31 00:00' -t '2012-09-30 00:00'
13 | #./update-downloads.sh
14 | 
15 | #./ght-periodic-dump -f '2012-09-30 00:00' -t '2012-11-30 00:00'
16 | #./update-downloads.sh
17 | 
18 | #./ght-periodic-dump -f '2012-11-30 00:00' -t '2013-01-30 00:00'
19 | #./update-downloads.sh
20 | 
21 | #./ght-periodic-dump -f '2013-01-30 00:00' -t '2013-03-30 00:00'
22 | #./update-downloads.sh
23 | 
24 | #./ght-periodic-dump -f '2013-03-30 00:00' -t '2013-05-30 00:00'
25 | #./update-downloads.sh
26 | 
27 | #./ght-periodic-dump -f '2013-05-30 00:00' -t '2013-07-30 00:00'
28 | #./update-downloads.sh
29 | 
30 | #./ght-periodic-dump -f '2013-07-30 00:00' -t '2013-09-30 00:00'
31 | #./update-downloads.sh
32 | 
33 | ./ght-periodic-dump -f '2013-09-30 00:00' -t '2013-11-30 00:00'
34 | #./update-downloads.sh
35 | 
36 | ./ght-periodic-dump -f '2013-11-30 00:00' -t '2014-01-30 00:00'
37 | #./update-downloads.sh
38 | 
39 | 


--------------------------------------------------------------------------------
/contrib.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Contributing to GHTorrent
 4 | tagline:
 5 | ---
 6 | 
 7 | Are you considering contributing to GHTorrent? That's great! We value any
 8 | contribution, no matter how small, big, simple or sophisticated it is. If you
 9 | decide to send a pull request, we will actively help you to get your pull
10 | request integrated.
11 | 
12 | ## Setting up GHTorrent
13 | 
14 | The first thing to do is to ensure that you have a working GHTorrent
15 | environment. To do so, please consult the top level
16 | [README.md](https://github.com/gousiosg/github-mirror/blob/master/README.md) file with
17 | instructions on doing so.
18 | 
19 | ## TODO list
20 | 
21 | The TODO list is maintained as a collection of open [GitHub
22 | issues](https://github.com/gousiosg/github-mirror/issues). Please feel free to adopt
23 | any of those by @mentioning the @ghtorrent user.
24 | 
25 | ## New features
26 | 
27 | Do you have a cool idea that will make GHTorrent 100x (or 0.01x) better? That's
28 | great!  We look forward to reviewing your pull requests! We however advise you
29 | to:
30 | 
31 | 1. Read the [open issue list](https://github.com/gousiosg/github-mirror/issues)
32 | 2. Contact the [GHTorrent mailing list](). The maintainers will help you
33 | implement your proposed feature as efficiently as possible and make sure
34 | that it does not conflict with any change currently planned.
35 | 


--------------------------------------------------------------------------------
/downloads.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Downloads
 4 | tagline:
 5 | ---
 6 | 
 7 | ### What am I downloading?
 8 | 
 9 | * The MySQL dump is a full, up to date database dump. You can use it for querying the latest available data.
10 | * The MongoDB dumps are incremental. They are provided mostly for reference and backup purposes, as they may contain duplicates. The reasons for this are the following:
11 |   * When refreshing project data, old records are deleted and new are added. This cannot be reflected in the dumps (it is not practical to regenerate all dumps every time).
12 |   * The dumps have already been restored once, hence the dump dates do not
13 | represent the actual data generation dates.
14 | 
15 | For those reasons, we recommend using the MongoDB data through our [query
16 | service](http://ghtorrent.org/raw.html).
17 | 
18 | ### Which is the applicable license?
19 | 
20 | See [here](faq.html)
21 | 
22 | ### MySQL database dumps
23 | <button type="button" class="btn btn-success">New!</button> As of MySQL dump
24 | `mysql-2015-09-25`, we are distributing CSV files (one file per table) instead
25 | of `mysqldump` based backups. The provided archive expands to a directory
26 | including a restore script and instructions on how to do the restore. See more
27 | information [here](https://github.com/gousiosg/github-mirror/tree/master/sql).
28 | 
29 | You can also [query MySQL](/dblite). It is always loaded with the latest
30 | dump.
31 | 


--------------------------------------------------------------------------------
/assets/themes/twitter/css/style.css:
--------------------------------------------------------------------------------
 1 | /* Override some defaults */
 2 | html, body {
 3 |   background-color: #eee;
 4 | }
 5 | .navbar {
 6 |   margin-bottom: 0;
 7 | }
 8 | .container > footer {
 9 |   margin-top: 20px;
10 | }
11 | .container > footer p {
12 |   text-align: center; /* center align it with the container */
13 | }
14 | 
15 | /* The white background content wrapper */
16 | .content {
17 |   background-color: #fff;
18 |   padding: 20px;
19 |   margin: 0 -20px; /* negative indent the amount of the padding to maintain the grid system */
20 |   -webkit-border-radius: 0 0 6px 6px;
21 |      -moz-border-radius: 0 0 6px 6px;
22 |           border-radius: 0 0 6px 6px;
23 |   -webkit-box-shadow: 0 1px 2px rgba(0,0,0,.15);
24 |      -moz-box-shadow: 0 1px 2px rgba(0,0,0,.15);
25 |           box-shadow: 0 1px 2px rgba(0,0,0,.15);
26 | }
27 | 
28 | /* Page header tweaks */
29 | .page-header {
30 |   background-color: #f5f5f5;
31 |   padding: 20px 20px 10px;
32 |   margin: -20px -20px 20px;
33 | }
34 | 
35 | .topbar .btn {
36 |   border: 0;
37 | }
38 | 
39 | 
40 | /* tag_box ======================================================== */
41 | 
42 | .tag_box {
43 | 	list-style:none;
44 | 	margin:0;
45 | 	padding:5px 0 ;
46 | 	overflow:hidden;
47 | }
48 | .tag_box li {
49 | 	line-height:28px;
50 | }
51 | .tag_box.inline li {
52 | 	float:left;
53 | }
54 | .tag_box a {
55 | 	padding: 3px 6px;
56 | 	margin: 2px;
57 | 	background: #eee;
58 | 	color:#005F6B;
59 | 	border-radius: 3px;
60 | 	text-decoration:none;
61 | }
62 | .tag_box a span{
63 | 	vertical-align:super;
64 | 	font-size:0.8em;
65 | }
66 | .tag_box a.active {
67 | 	background:#57A957;
68 | 	border:1px solid #4C964D;
69 | 	color:#FFF;
70 | }
71 | 	


--------------------------------------------------------------------------------
/dumps/index.rb:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env ruby
  2 | 
  3 | require 'rubygems'
  4 | require 'erb'
  5 | require 'set'
  6 | require 'date'
  7 | 
  8 | class GHTorrent
  9 |   attr_reader :collections
 10 |   attr_reader :dumps
 11 |   
 12 |   def initialize(last_update)
 13 |     @last_update = last_update
 14 |     @dumps = Set.new
 15 |     @collections = Set.new
 16 |   end
 17 | 
 18 |   def add_dump(dump)
 19 |     @dumps << dump
 20 |   end
 21 | 
 22 |   def add_collection(col)
 23 |     @collections << col
 24 |   end
 25 | 
 26 |   # Expose private binding() method.
 27 |   def get_binding
 28 |     binding()
 29 |   end
 30 | 
 31 | end
 32 | 
 33 | class Dump
 34 |   attr_reader :torrents
 35 |   attr_reader :date
 36 |   
 37 |   def initialize(torrents, date)
 38 |     @torrents = torrents
 39 |     @date = date
 40 |   end
 41 | end
 42 | 
 43 | class Torrent
 44 |   attr_reader :url
 45 |   attr_reader :name
 46 |   attr_reader :size
 47 |   attr_reader :date
 48 |   def initialize(url, name, size, date)
 49 |     @url = url
 50 |     @name = name
 51 |     @size = size
 52 |     @date = date
 53 |   end
 54 | end
 55 | 
 56 | url_prefix="http://ghtorrent.org/downloads"
 57 | 
 58 | # Load the template
 59 | file = File.open("index.erb").read
 60 | rhtml = ERB.new(file)
 61 | 
 62 | # Open the dir to read entries from
 63 | dir = ARGV.shift
 64 | 
 65 | if dir.nil?
 66 |   dir = "."
 67 | end
 68 | 
 69 | torrents = Dir.entries("#{dir}").map do |f|
 70 | 
 71 |   # Go through all torrent files and extract name of
 72 |   # dumped collection and dump date
 73 |   matches = /([a-z0-9_]+)-[a-z]+\.(.*)\.torrent/.match(f)
 74 |   next if matches.nil?
 75 | 
 76 |   # Calculate original file size
 77 |   dump = f.gsub(/.torrent/, ".tar.gz")
 78 |   size = File.stat(File.join(dir, dump)).size / 1024 / 1024
 79 |   
 80 |   date = Date.parse(matches[2])  
 81 |   
 82 |   if size > 0
 83 |     Torrent.new(url_prefix + "/" + dump, matches[1], size, date)
 84 |   end
 85 | end.select{|x| !x.nil?}
 86 | 
 87 | all_dates = torrents.inject(Set.new){|acc, t| acc << t.date}
 88 | 
 89 | all_dumps = all_dates.map{ |d|
 90 |   date_torrents = torrents.select{|t| t.date == d}
 91 |   name_torrents = date_torrents.inject(Hash.new){|acc, a| acc.store(a.name, a); acc}
 92 |   Dump.new(name_torrents, d)
 93 | }
 94 | 
 95 | max_date = all_dates.max{ |a,b| a <=> b}
 96 | 
 97 | ghtorrent = GHTorrent.new(max_date)
 98 | all_dumps.each { |x|
 99 |   ghtorrent.add_dump x
100 |   x.torrents.values.each { |t|
101 |     ghtorrent.add_collection t.name
102 |   }
103 | }
104 | 
105 | puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
106 | # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
107 | 


--------------------------------------------------------------------------------
/basedupon.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Based upon...
 4 | tagline:
 5 | ---
 6 | 
 7 | Here is a collection of projects that are based upon GHTorrent or its
 8 | derivatives, collected by searching GitHub for "ghtorrent". Is your
 9 | project missing/wrongly listed/not hosted on GitHub? Please add it by
10 | [directly editing](https://github.com/ghtorrent/ghtorrent.org/edit/master/basedupon.md) this file on GitHub.
11 | 
12 | * [OSSHealth/ghdata](https://github.com/OSSHealth/ghdata) A Python library and web service for GitHub Health and Sustainability metrics
13 | * [TestRoots/travistorrent-tools](https://github.com/TestRoots/travistorrent-tools) Tools used to create the data for TravisTorrent. TravisTorrent provides free and easy-to-use Travis CI build analyses to the masses through its open database. [More](http://travistorrent.testroots.org)
14 | * [RepoReapers/reaper](https://github.com/RepoReapers/reaper) Calculate the score of a repository based on best engineering practices. [More here](https://reporeapers.github.io/)
15 | * [SOM-Research/Gitana](https://github.com/SOM-Research/Gitana) a SQL-based Project Activity Inspector
16 | * [Microsoft/ghinsights](https://github.com/Microsoft/ghinsights) GHInsights is a data processing pipeline using Azure Data Factory and Azure Data Lake. It processes GitHub data from the ghtorrent project. The resulting processed data is available in Azure Data Lake for users to query, generate reports, and analyze GitHub projects.
17 | * [iandennismiller/gh-impact](https://github.com/iandennismiller/gh-impact) gh-impact is a measure of influence on GitHub. See more [here](http://www.gh-impact.com)
18 | * [valeriocos/selective-importer-4-ghtorrent](https://github.com/valeriocos/selective-importer-4-ghtorrent) Import a MySQL dump of GHTorrent, selecting only the tables and indexes you need
19 | * [cbogart/giterator](https://github.com/cbogart/giterator) Tools for importing and analyzing ghtorrent and githubarchive data
20 | * [DevMine/ght2dm](https://github.com/DevMine/ght2dm) CLI tool to import GHTorrent dumps into the DevMine database.
21 | * [SOM-Research/gila](https://github.com/SOM-Research/gila) Label analysis work. [More](http://som-research.uoc.edu/tools/gila/)
22 | * [BonnyCI/shuffleboard](https://github.com/BonnyCI/shuffleboard) Truffle-shuffling data for the [ci-plunder project](https://github.com/BonnyCI/ci-plunder)
23 | * [jakeharding/repo-health](https://github.com/jakeharding/repo-health) This repository holds the proof of concept for the repository health and sustainability project
24 | * [PRioritizer/PRioritizer-analyzer](https://github.com/PRioritizer/PRioritizer-analyzer) Prioritize your pull requests
25 | * [acs/ghtorrent](https://github.com/acs/ghtorrent) Analyze GHTorrent data using Elasticsearch + Kibana
26 | 


--------------------------------------------------------------------------------
/lean.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Lean GHTorrent
 4 | header: Lean GHTorrent
 5 | group: navigation
 6 | ---
 7 | 
 8 | Lean GHTorrent allows researchers to get a slice of the full GHTorrent
 9 | dataset
10 | 
11 | <script>
12 | 
13 | function checkRepos(repos) {
14 |   var repoArr = repos.split(/\r?\n/).map(function(x){return x.trim();});
15 | 
16 |   return repoArr.reduce(function(acc, project) {
17 |     var owner = project.split("/")[0];
18 |     var repo = project.split("/")[1];
19 | 
20 |     // Missing / in repo name
21 |     if (owner == project) {
22 |       return acc.concat([project]);
23 |     }
24 | 
25 |     // Missing repo name
26 |     if (repo === undefined) {
27 |       return acc.concat([project]);
28 |     }
29 | 
30 |     if (!/^[0-9a-zA-Z\-_.]+$/.test(repo) ||
31 |         !/^[0-9a-zA-Z\-_.]+$/.test(owner)) {
32 |       return acc.concat([project]);
33 |     }
34 |     return acc;
35 |   }, []);
36 | }
37 | 
38 | $(document).ready(function() {
39 | 
40 |     function resetErrors() {
41 |       $("#errors").hide();
42 |       $("#missname").hide();
43 |       $("#missemail").hide();
44 |       $("#missrepo").hide();
45 |     }
46 | 
47 |     function displayErrors(errors) {
48 |       resetErrors();
49 |       $("#leansubmit").hide();
50 |       if (errors.length > 0) {
51 |         errors.forEach(function(x) {
52 |           $("#errorrepos").append(x + "<br/>");
53 |         });
54 |         $("#errors").show("fast");
55 |       } else {
56 |         $("#leansubmit").show("fast");
57 |       }
58 |     }
59 | 
60 |     function checkForm() {
61 |       //if ()
62 |     }
63 | 
64 |     $("#leanform").load("http://ghtorrent.org/lean/", function() {
65 |       $("#leansubmit").click(function() {
66 |         displayErrors(checkRepos($("#repos").val()));
67 |         });
68 | 
69 |       $("#leancheck").click(function() {
70 |           displayErrors(checkRepos($("#repos").val()));
71 |         });
72 |       });
73 | });
74 | 
75 | </script>
76 | 
77 | <div id="leanform"></div>
78 | 
79 | <div id="missname" style="display:none; color: rgb(173, 63, 63);">Missing name</div>
80 | <div id="missemail" style="display:none; color: rgb(173, 63, 63);">Missing email</div>
81 | <div id="missrepo" style="display:none; color: rgb(173, 63, 63);">No repositories were specified</div>
82 | 
83 | <div id="errors" style="display:none">
84 |   The following are not valid repository names:
85 |   <div id="errorrepos" style="margin: 20px; color: rgb(173, 63, 63);"></div>
86 | 
87 |   Valid repository names have the form owner/repository, where both parts
88 |   contain only letters (small or capital), digits or the following characters: _-.
89 |   <br/>
90 |   Examples of valid repository names are the following:
91 |   <ul>
92 |     <li>mojombo/jekyll</li>
93 |     <li>gousiosg/github-mirror</li>
94 |     <li>foo/BAR.org</li>
95 |   </ul>
96 | 
97 |   Please fix the repository list as necessary.
98 | </div>
99 | 


--------------------------------------------------------------------------------
/stats/api-stats.R:
--------------------------------------------------------------------------------
 1 | # Run this to create the data file
 2 | # cat */log.txt |grep APIClient|grep -v WARN |perl -lape 's/\[([T0-9-:.]*).*\] DEBUG.*\[([0-9.]*)\].*Total: ([0-9]*) ms/$1 $2 $3/'|cut -f2,3,4 -d' '|ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t).to_i," ", i, " ", d;' |egrep -v "#" >data.txt  
 3 | library(ggplot2)
 4 | library(sqldf)
 5 | require(scales)
 6 | 
 7 | data <- read.csv("data.txt", sep=" ", colClasses = c("integer", "factor", "integer"))
 8 | # Filter out data older than 3 days
 9 | data <- subset(data, ts > (as.numeric(Sys.time()) - 3 * 86400))
10 | data$ts <- as.POSIXct(data$ts, origin = "1970-01-01")
11 | summary(data$ms)
12 | # Box plot of the ms values (log scale) for each IP, written to resp-ip-boxplot.png
13 | p <- ggplot(data) + aes(x = ip, y = ms) + scale_y_log10() + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
14 | png("resp-ip-boxplot.png")
15 | print(p)
16 | dev.off()
17 | 
18 | # Total num requests per IP
19 | aggregate(ms ~ ip, data = data, length)
20 | 
21 | # Mean time per IP
22 | aggregate(ms ~ ip, data = data, mean)
23 | 
24 | data$timebin <- cut.POSIXt(data$ts, breaks = "10 mins")
25 | 
26 | mean.interval <- aggregate(ms ~ timebin, data = data, mean)
27 | mean.interval$timebin <- as.POSIXct(mean.interval$timebin, origin = "1970-01-01")
28 | 
29 | p <- ggplot(mean.interval) + aes(x = timebin, y = ms) + geom_line() + scale_x_datetime() +
30 |   xlab('time') + ylab('Mean API resp in ms') + ggtitle('Mean API response time timeseries (10 min intervals)')
31 | 
32 | png("api-resp.png")
33 | print(p)
34 | dev.off()
35 | 
36 | data$timebin <- cut.POSIXt(data$ts, breaks = "30 mins")
37 | count.interval <- aggregate(ms ~ timebin, data = data, length)
38 | count.interval$timebin <- as.POSIXct(count.interval$timebin, origin = "1970-01-01")
39 | p <- ggplot(count.interval) + aes(x = timebin, y = ms) + geom_line() + scale_x_datetime() + scale_y_continuous(labels = comma) +  
40 |   stat_smooth(method = "loess", formula = y ~ x^2, size = 2, alpha = 0)+xlab('time') + ylab('Num API calls') + ggtitle('Num API calls per timeslot (30 mins interval)')
41 | 
42 | png("num-reqs.png")
43 | print(p)
44 | dev.off()
45 | 
46 | events <- read.csv("events.txt", sep=" ", colClasses = c("integer", "factor"))
47 | # Filter out data older than 3 days
48 | events <- subset(events, ts > (as.numeric(Sys.time()) - 3 * 86400))
49 | events$ts <- as.POSIXct(events$ts, origin = "1970-01-01")
50 | summary(events$ts)
51 | 
52 | events$timebin <- cut.POSIXt(events$ts, breaks = "1 day")
53 | 
54 | groupped <- sqldf("select timebin,event,count(*) as number from events group by timebin,event")
55 | 
56 | p <- ggplot(groupped) + aes(x = timebin, y = number, fill = event) +
57 |      scale_y_continuous(labels = comma) +
58 |      geom_bar(stat = "identity", position="dodge") +
59 |      xlab('day') + ylab('Num events') + 
60 |      ggtitle('Number of events processed per day')
61 | 
62 | png("events-per-day.png")
63 | print(p)
64 | dev.off()
65 | 
66 | 


--------------------------------------------------------------------------------
/mysql.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Querying MySQL programmatically
 4 | tagline:
 5 | ---
 6 | 
 7 | To connect to the MySQL programmatic endpoint, you need a MySQL client (command
 8 | line, graphical or program library) and SSH installed on your machine.
 9 | 
10 | ## Connection details
11 | 
12 | To obtain access, please send us your public key [as described here](services.html).
13 | 
14 | 1. When we contact you back, you will be able to set up an SSH tunnel with the
15 | following command: `ssh -L 3306:web.ghtorrent.org:3306 ghtorrent@web.ghtorrent.org`. Keep in mind that no shell will be allocated in the open SSH session.
16 | 
17 | 2. You will then be able to connect to our server using the command: `mysql -u ght -h 127.0.0.1 ghtorrent` (user name: ght, no password, database: ghtorrent).
18 | 
19 | Here is an example session:
20 | 
21 | {% highlight bash%}
22 | ####
23 | # on terminal session 1
24 | $ ssh -L 3306:web.ghtorrent.org:3306 ghtorrent@web.ghtorrent.org
25 | PTY allocation request failed on channel 2
26 | #####
27 | # on a different terminal
28 | $ mysql -u ght -h 127.0.0.1 ghtorrent
29 | Reading table information for completion of table and column names
30 | You can turn off this feature to get a quicker startup with -A
31 | 
32 | Welcome to the MySQL monitor.  Commands end with ; or \g.
33 | Your MySQL connection id is 1004
34 | Server version: 5.5.5-10.1.11-MariaDB-1~wily mariadb.org binary distribution
35 | 
36 | Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
37 | 
38 | Oracle is a registered trademark of Oracle Corporation and/or its
39 | affiliates. Other names may be trademarks of their respective
40 | owners.
41 | 
42 | Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
43 | mysql> show tables;
44 | +-----------------------+
45 | | Tables_in_ghtorrent   |
46 | +-----------------------+
47 | | commit_comments       |
48 | [...]
49 | | users                 |
50 | | watchers              |
51 | +-----------------------+
52 | 21 rows in set (0.20 sec)
53 | 
54 | {% endhighlight %}
55 | 
56 | ## Database schema
57 | 
58 | Have a look [here](relational.html).
59 | 
60 | ## Things to keep in mind
61 | 
62 | 1. The hosting machine, while powerful, is not capable of processing the data
63 | very quickly.
64 | 
65 | 2. Other people may be using the machine as well. Make sure that you do not
66 | run very heavy queries. It is better to run many small queries (e.g. in
67 | a loop) than aggregation queries. Make sure you only query on indexed fields.
68 | 
69 | 3. Queries running in excess of 100 seconds are killed without any warning.
70 | 
71 | 4. At any time the machine may become unavailable.
72 | 
73 | 5. The data is provided in kind to help other people do research with
74 | it. Please do not abuse the service.
75 | 
76 | 6. The data is offered as is without any explicit or implicit quality or
77 | service guarantee from our part.
78 | 
79 | 7. All operations are logged for security purposes.
80 | 


--------------------------------------------------------------------------------
/dumps/ght-periodic-dump:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | #
  3 | # Create the periodic database dump files
  4 | #
  5 | 
  6 | # Directory to place compressed files and torrents
  7 | OUTDIR=/data/downloads
  8 | 
  9 | # Base URL for HTTP dir containing torrents and data
 10 | WEBSEED=http://www.ghtorrent.org/downloads/
 11 | 
 12 | usage()
 13 | {
 14 | 	echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']"
 15 | 	echo "          [-c collection_to_dump]"
 16 | 	echo "Dump the database. -f earliest record timestamp"
 17 | 	echo "                   -t latest record timestamp"
 18 | 	echo "                   -c collection to dump (default: all)"
 19 | }
 20 | 
 21 | if [ -z $1 ]
 22 | then
 23 | 	usage
 24 | 	exit 1
 25 | fi
 26 | 
 27 | while getopts "f:t:c:" o
 28 | do
 29 | 	case $o in
 30 | 	f) 	timeStart=`date -d "$OPTARG" +%s` ;;
 31 | 	t) 	timeEnd=`date -d "$OPTARG" +%s` ;;
 32 | 	c) 	collection=$OPTARG ;;
 33 | 	\?)     echo "Invalid option: -$OPTARG" >&2
 34 | 		usage
 35 | 	        exit 1
 36 | 		;;
 37 | 	esac
 38 | done
 39 | 
 40 | 
 41 | # Time to start dumping from
 42 | if [ -z $timeStart ]
 43 | then
 44 | 	if [ -r lastrun ]
 45 | 	then
 46 | 		timeStart=`cat lastrun`
 47 | 	else
 48 | 		timeStart=0
 49 | 	fi
 50 | fi
 51 | 
 52 | # Time to end dumping
 53 | if [ -z $timeEnd ]
 54 | then
 55 | 	timeEnd=`date +%s`
 56 | fi
 57 | 
 58 | # Name used for the files
 59 | dateName=`date -d @$timeEnd -u +'%Y-%m-%d'`
 60 | 
 61 | # _id example:
 62 | # 4f208c3e08d69a1835000077
 63 | # 000102030405060708091011
 64 | # |      ||    ||  ||    |
 65 | # time    mach  pid count
 66 | 
 67 | endId=`printf '%08x0000000000000000' $timeEnd`
 68 | startId=`printf '%08x0000000000000000' $timeStart`
 69 | 
 70 | 
 71 | if [ -z $collection ]
 72 | then
 73 | 	collections=`echo "show collections"|mongo --quiet rs0/github|egrep -v "system|bye"`
 74 | else
 75 | 	collections=$collection
 76 | fi
 77 | 
 78 | echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`"
 79 | 
 80 | rm -rf dump
 81 | mkdir -p dump/github
 82 | 
 83 | for col in $collections; do
 84 | 
 85 | 	echo "Dumping $col"
 86 | 	mongodump --host rs0 --db github --collection $col -q '{"_id" : {"$gte" : ObjectId("'$startId'"), "$lt"  : ObjectId("'$endId'")} }' || exit 1
 87 | done
 88 | 
 89 | # Archive collections (torrent creation with mktorrent is currently disabled)
 90 | for col in $collections; do
 91 | 	echo "Archiving $col.bson"
 92 | 	if [ ! -s "dump/github/$col.bson" ]; then
 93 | 		echo "Collection empty, skipping"
 94 | 		continue
 95 | 	fi
 96 | 
 97 | 	if ! tar zcf "$OUTDIR/$col-dump.$dateName.tar.gz" "dump/github/$col.bson"
 98 | 	then
 99 | 		# Do not leave a partial archive behind
100 | 		rm -f "$OUTDIR/$col-dump.$dateName.tar.gz"
101 | 		exit 1
102 | 	fi
103 | #	mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w $WEBSEED/$col-dump.$dateName.tar.gz -o $OUTDIR/$col-dump.$dateName.torrent $OUTDIR/$col-dump.$dateName.tar.gz
104 | done
105 | #
106 | # Update last run info
107 | echo $timeEnd >lastrun || exit 1
108 | 
109 | # Clean up
110 | rm -rf dump
111 | 
112 | 


--------------------------------------------------------------------------------
/services.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: GHTorrent services
 4 | tagline:
 5 | ---
 6 | 
 7 | The GHTorrent project provides the following services to interested researchers
 8 | and third parties:
 9 | 
10 | * [Querying MongoDB](raw.html) programmatically
11 | * [Querying MySQL](/dblite/) through a web interface
12 | * [Querying MySQL](mysql.html) programmatically
13 | * [Streaming](streaming.html) of entries in MongoDB and MySQL
14 | 
15 | _The services are provided in kind and do not entail any quality or availability
16 | guarantee._
17 | 
18 | ## Obtaining access
19 | 
20 | All services are offered over SSH tunnelling. See the page of each
21 | individual service for connection details.
22 | 
23 | To obtain access to any of the services, add your public SSH key
24 | (usually in `~/.ssh/id_rsa.pub`), using a pull request, to
25 | [this file](https://github.com/ghtorrent/ghtorrent.org/blob/master/keys.txt).
26 | All pull requests are merged on Friday afternoon, so please wait a bit
27 | before firing a reminder email.
28 | 
29 | To create a public/private key pair, use `ssh-keygen`. Here are some hints on
30 | how to generate GHTorrent compatible SSH keys:
31 | 
32 | * **On Mac or Linux**: You can use the distribution provided `ssh-keygen`
33 |   command and it should work fine.
34 | 
35 | * **On Windows:** Keys generated with the popular Putty program cannot be used
36 |   by GHTorrent. Please use [CygWin](https://www.cygwin.com) or an equivalent
37 |   environment to install OpenSSH and use the `ssh-keygen` command as provided by
38 |   OpenSSH to generate a GHTorrent compatible key.
39 | 
40 | ## Fair use
41 | 
42 | To address GitHub's growth and GHTorrent's growing demands in API calls and the
43 | community's demand for better, more rich data, we need more GitHub API keys. We
44 | therefore kindly ask you to [send us](mailto:gousiosg@gmail.com)
45 | a GitHub API key (a “personal access token” as GitHub describes it).
46 | 
47 | The process to create a key is simple: First, go to the following URL (while
48 | logged in):
49 | 
50 | [https://github.com/settings/tokens/new](https://github.com/settings/tokens/new)
51 | 
52 | Then deselect *all* checkboxes *except* `public_repo`, set a token name and
53 | click on "Generate Token".
54 | 
55 | Please note that it is possible to specify the maximum number of requests per
56 | hour that you would like to donate to GHTorrent. By default, GHTorrent uses the
57 | maximum allowed by GitHub (5k/hour), but if you are using the GitHub API for
58 | other projects/services, you might want to restrict this. A typical service like
59 | Travis only uses a few requests per hour, even on busy projects.
60 | 
61 | **If you do not want us to use your key any more, do let us know.** Do not
62 | just delete your key from GitHub as this will create holes in the data
63 | collection until we notice and remove your key. If this happens, we will also
64 | ban you indefinitely from the services (2 users have already been banned).
65 | 
66 | At the moment, this is a request in kind. If demand continues to grow and supply
67 | of keys is not enough to keep up, we might turn this into an obligatory step.
68 | 


--------------------------------------------------------------------------------
/raw.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: page
 3 | title: Querying MongoDB programmatically
 4 | tagline:
 5 | ---
 6 | 
 7 | While the GHTorrent project offers downloadable versions of the MongoDB raw
 8 | dataset, downloading and restoring them to MongoDB can be very time consuming.
 9 | For this reason, we have created a publicly available version of the data as
10 | they are collected by our main MongoDB server.
11 | The only prerequisite is to have a MongoDB client (command line, graphical
12 | or program library) and SSH installed on your machine.
13 | 
14 | ## Connection details
15 | 
16 | To obtain access, please send us your public key [as described here](services.html).
17 | 
18 | 1. When we contact you back, you will be able to set up an SSH tunnel with the
19 | following command: `ssh -L 27017:dutihr.st.ewi.tudelft.nl:27017
20 | ghtorrent@dutihr.st.ewi.tudelft.nl`. Keep in mind that no shell will be
21 | allocated in the open SSH session.
22 | 
23 | 2. You will then be able to connect to our server using the command: `mongo
24 | -u ghtorrentro -p ghtorrentro github`.
25 | 
26 | Here is an example session:
27 | 
28 | {% highlight bash%}
29 | ####
30 | # on terminal session 1
31 | $ ssh -L 27017:dutihr.st.ewi.tudelft.nl:27017 ghtorrent@dutihr.st.ewi.tudelft.nl
32 | PTY allocation request failed on channel 2
33 | #####
34 | # on a different terminal
35 | $ mongo -u ghtorrentro -p ghtorrentro github
36 | MongoDB shell version: 3.0.3
37 | connecting to: github
38 | >
39 | > db.events.count()
40 | 401209493
41 | > db.commits.count()
42 | 311041915
43 | >
44 | {% endhighlight %}
45 | 
46 | 
47 | ## Collections available in MongoDB
48 | 
49 | Have a look [here](mongo.html).
50 | 
51 | Due to its heavy load, the MongoDB server cannot process non-indexed field
52 | searches within the 100 sec time limit. To address this situation, we
53 | recommend querying MySQL first to get references to the data you want and
54 | then use MongoDB to get the raw data.
55 | 
56 | Below are the fields that MongoDB uses as indexes. Make sure your query hits
57 | those, otherwise querying is going to be extremely slow (and will overload our
58 | server as well).
59 | 
60 | <script src="http://gist-it.appspot.com/https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/adapters/mongo_persister.rb?slice=21:41">
61 | </script>
62 | 
63 | ## Things to keep in mind
64 | 
65 | 1. The hosting machine, while powerful, is not capable of processing the data
66 | very quickly. At the time of this writing, the data is more than 10TB.
67 | 
68 | 2. Other people may be using the machine as well. Make sure that you do not
69 | run very heavy queries. It is better to run many small queries (e.g. in
70 | a loop) than aggregation queries. Make sure you only query on indexed fields.
71 | 
72 | 3. Queries running in excess of 100 seconds are killed without any warning.
73 | 
74 | 4. At any time the machine may become unavailable.
75 | 
76 | 5. Some data may be missing; if you are willing to provide workers to collect
77 | them, please [contact us](mailto:gousiosg@gmail.com).
78 | 
79 | 6. The data is provided in kind to help other people do research with
80 | it. Please do not abuse the service.
81 | 
82 | 7. The data is offered as is without any explicit or implicit quality or
83 | service guarantee from our part.
84 | 
85 | 8. All operations are logged for security purposes.
86 | 
87 | 


--------------------------------------------------------------------------------
/pullreq-perf/openess-report.R:
--------------------------------------------------------------------------------
  1 | #
  2 | # (c) 2012 -- 2014 Georgios Gousios <gousiosg@gmail.com>
  3 | #
  4 | # BSD licensed, see LICENSE in top level dir
  5 | #
  6 | 
  7 | 
  8 | rm(list = ls(all = TRUE))
  9 | 
 10 | if (! "knitr" %in% installed.packages()) install.packages("knitr")
 11 | if (! "RMySQL" %in% installed.packages()) install.packages("RMySQL")
 12 | if (! "ggplot2" %in% installed.packages()) install.packages("ggplot2")
 13 | if (! "reshape" %in% installed.packages()) install.packages("reshape")
 14 | if (! "sqldf" %in% installed.packages()) install.packages("sqldf")
 15 | if (! "optparse" %in% installed.packages()) install.packages("optparse")
 16 | if (! "foreach" %in% installed.packages()) install.packages("foreach")
 17 | if (! "doMC" %in% installed.packages()) install.packages("doMC")
 18 | 
 19 | library(optparse)
 20 | 
 21 | mysql.user   =  "foo"
 22 | mysql.passwd = "bar"
 23 | mysql.db     = "ghtorrent"
 24 | mysql.host   = "127.0.0.1"
 25 | paralll      = 4
 26 | 
 27 | option_list <- list(
 28 |   make_option(c("-s", "--mysql-host"), default=mysql.host, dest = 'mysql.host',
 29 |               help = "MySQL host [\"%default\"]"),
 30 |   make_option(c("-d", "--mysql-db"), default=mysql.db, dest = 'mysql.db',
 31 |               help = "MySQL database [\"%default\"]"),
 32 |   make_option(c("-u", "--mysql-user"), default=mysql.user, dest = 'mysql.user',
 33 |               help = "MySQL user [\"%default\"]"),
 34 |   make_option(c("-p", "--mysql-passwd"), default=mysql.passwd, dest = 'mysql.passwd', help = "MySQL password [\"%default\"]"),
 35 |   make_option(c("-a", "--parallel"), default=paralll, dest = 'paralll', help = "Number of processes [\"%default\"]")
 36 | 
 37 | )
 38 | 
 39 | args <- parse_args(OptionParser(option_list = option_list),
 40 |                                 print_help_and_exit = FALSE,
 41 |                                 positional_arguments = TRUE)
 42 | 
 43 | if (args$options$help == TRUE) {
 44 |     parse_args(OptionParser(option_list = option_list))
 45 | }
 46 | 
 47 | mysql.user    = args$options$mysql.user
 48 | mysql.passwd  = args$options$mysql.passwd
 49 | mysql.db      = args$options$mysql.db
 50 | mysql.host    = args$options$mysql.host
 51 | paralll       = args$options$paralll
 52 | 
 53 | # Generate stats
 54 | library(RMySQL)
 55 | library(knitr)
 56 | # Knit report.Rmd for one repository in a directory named <owner>-<repo>; the directory is deleted if knitting fails.
 57 | stats <- function(owner, repo) {
 58 |   # Connection opened here is closed in the `finally` clause below
 59 |   db <- dbConnect(dbDriver("MySQL"),
 60 |                   user = mysql.user,
 61 |                   password = mysql.passwd,
 62 |                   dbname = mysql.db,
 63 |                   host = mysql.host)
 64 | 
 65 |   dirname = sprintf("%s-%s", owner,repo)
 66 |   print(sprintf("Running in %s", dirname))
 67 |   cwd <- getwd()
 68 |   dir.create(dirname)
 69 |   file.copy("report.Rmd", sprintf("%s/%s", dirname, "index.Rmd"))
 70 |   setwd(dirname)
 71 | 
 72 |   tryCatch({
 73 |     knit("index.Rmd")
 74 |     file.remove("index.Rmd")
 75 |   }, error = function(e) {
 76 |     print(e)
 77 |     setwd(cwd)
 78 |     unlink(dirname, TRUE, TRUE)
 79 |   }, finally = {
 80 |     dbDisconnect(db)
 81 |     setwd(cwd)
 82 |   })
 83 | }
 84 | # Without positional arguments analyze every row of projects.txt in parallel, else only the given project
 85 | if (length(args$args) == 0) {
 86 |   library(doMC)
 87 |   registerDoMC(paralll)
 88 | 
 89 |   projects <- read.csv('projects.txt', sep = ' ')
 90 | 
 91 |   print(sprintf("%s projects to analyze", nrow(projects)))
 92 |   print(sprintf("Running %d parallel processes", paralll))
 93 |   knit("index.Rmd")
 94 | 
 95 |   result <- foreach(n=1:nrow(projects), .combine='+') %dopar% {
 96 |     project <- projects[n, ]
 97 |     stats(project[,1], project[,2])
 98 |     1
 99 |   }
100 |   print(sprintf("processed %d projects", result))
101 | } else {
102 |   # strsplit() yields a list; flatten it so both `owner repo` and "owner repo" work
103 |   stats(unlist(strsplit(args$args, " "))[1], unlist(strsplit(args$args, " "))[2])
104 | }
105 | 


--------------------------------------------------------------------------------
/vissoft14.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: VISSOFT 2014 Challenge Dataset
  4 | tagline:
  5 | ---
  6 | ### Versions
  7 | 
  8 | *You are advised to always run queries against the newest version.*
  9 | 
 10 | <table class="table table-hover table-condensed">
 11 |   <thead>
 12 |   <tr>
 13 |       <th>Version</th>
 14 |       <th>Release date</th>
 15 |       <th>Fixed error</th>
 16 |   </tr>
 17 |   </thead>
 18 |   <tbody>
 19 |     <tr>
 20 |       <td>1.0</td>
 21 |       <td>1 Mar 2014</td>
 22 |       <td></td>
 23 |   </tr>
 24 |   </tbody>
 25 | </table>
 26 | 
 27 | ### Dataset description
 28 | 
 29 | The VISSOFT 2014 challenge dataset is a (very) trimmed down version of the
 30 | original GHTorrent dataset. It includes data from the
 31 | [netty/netty](https://github.com/netty/netty) repository (commits, pull
 32 | requests, collaborators, issues etc) along with all its forks (including the
 33 | forks' own commits, pull requests etc if any).
 34 | 
 35 | Similarly to GHTorrent itself, the VISSOFT challenge dataset comes in two flavours:
 36 | 
 37 | * A MongoDB database dump containing the results of querying the Github API. See [format here](mongo.html).
 38 | * A MySQL database dump containing a queriable version of important fields extracted from the raw data. See [schema here](relational.html).
 39 | 
 40 | ### Importing and using
 41 | 
 42 | The following instructions assume an OSX or Linux based host, on which
 43 | you have a running MongoDB or/and MySQL instance.
 44 | 
 45 | {%highlight bash%}
 46 | # Download and extract
 47 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/vissoft14-01032014.tar.gz
 48 | $ du -b vissoft14-01032014.tar.gz
 49 | 49178639  vissoft14-01032014.tar.gz
 50 | $ md5sum vissoft14-01032014.tar.gz
 51 | 4928efb679a0dc8254924d56760d65ec  vissoft14-01032014.tar.gz
 52 | $ tar zxvf vissoft14-01032014.tar.gz
 53 | $ cd vissoft14
 54 | $ du -b
 55 | 351024865
 56 | 
 57 | # MongoDB import
 58 | $ ls *.bson|while read dump; do mongorestore -d vissoft14 $dump; done
 59 | $ mongo vissoft14
 60 | mongo> db.commits.count()
 61 | 9118
 62 | mongo> db.issue_comments.count()
 63 | 10876
 64 | 
 65 | # MySQL import
 66 | $ mysql -u root -p
 67 | mysql> create user 'vissoft14'@'localhost' identified by 'vissoft14';
 68 | mysql> create database vissoft14;
 69 | mysql> grant all privileges on vissoft14.* to vissoft14@'localhost';
 70 | mysql> flush privileges;
 71 | mysql> ^D
 72 | $ cat mysql.sql |mysql -u vissoft14 -p vissoft14
 73 | $ mysql -u vissoft14 -p vissoft14
 74 | mysql> select count(*) from commits;
 75 | +----------+
 76 | | count(*) |
 77 | +----------+
 78 | |     9118 |
 79 | +----------+
 80 | {%endhighlight %}
 81 | 
 82 | ### FAQ
 83 | 
 84 | Answers to frequently asked questions
 85 | 
 86 | #### Why a new dataset?
 87 | 
 88 | For practical reasons. The dataset is small enough to be used on a laptop,
 89 | yet rich enough to do really interesting visualizations with it.
 90 | 
 91 | #### What are the hardware requirements?
 92 | 
 93 | We have successfully imported and used both dumps on a 2011 MacBook Air with 4GB
 94 | of RAM. Your mileage may vary, but relatively new systems with more than 4GB RAM should have no trouble with both databases. If you only need to use the MySQL data dump, the hardware requirements are even lower.
 95 | 
 96 | #### Why two databases? Do I need both?
 97 | 
 98 | Not necessarily. The MySQL database can readily cover many aspects of activity
 99 | on Github. Perhaps the only reason to use the MongoDB dump is to analyse commit contents, branches affected by pull requests or milestones, which are not included in MySQL.
100 | 
101 | #### How can I ask a question about the dataset?
102 | 
103 | Your question and the potential answer might be useful for other people as
104 | well, so please use the form below. *Please note that I (Georgios Gousios) will
105 | not answer questions sent to my email.*
106 | 
107 | {% include comments.html%}
108 | 


--------------------------------------------------------------------------------
/syntax.css:
--------------------------------------------------------------------------------
 1 | .highlight  { background: #ffffff; }
 2 | .highlight .c { color: #999988; font-style: italic } /* Comment */
 3 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
 4 | .highlight .k { font-weight: bold } /* Keyword */
 5 | .highlight .o { font-weight: bold } /* Operator */
 6 | .highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */
 7 | .highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */
 8 | .highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */
 9 | .highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */
10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
11 | .highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */
12 | .highlight .ge { font-style: italic } /* Generic.Emph */
13 | .highlight .gr { color: #aa0000 } /* Generic.Error */
14 | .highlight .gh { color: #999999 } /* Generic.Heading */
15 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
16 | .highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */
17 | .highlight .go { color: #888888 } /* Generic.Output */
18 | .highlight .gp { color: #555555 } /* Generic.Prompt */
19 | .highlight .gs { font-weight: bold } /* Generic.Strong */
20 | .highlight .gu { color: #aaaaaa } /* Generic.Subheading */
21 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */
22 | .highlight .kc { font-weight: bold } /* Keyword.Constant */
23 | .highlight .kd { font-weight: bold } /* Keyword.Declaration */
24 | .highlight .kp { font-weight: bold } /* Keyword.Pseudo */
25 | .highlight .kr { font-weight: bold } /* Keyword.Reserved */
26 | .highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */
27 | .highlight .m { color: #009999 } /* Literal.Number */
28 | .highlight .s { color: #d14 } /* Literal.String */
29 | .highlight .na { color: #008080 } /* Name.Attribute */
30 | .highlight .nb { color: #0086B3 } /* Name.Builtin */
31 | .highlight .nc { color: #445588; font-weight: bold } /* Name.Class */
32 | .highlight .no { color: #008080 } /* Name.Constant */
33 | .highlight .ni { color: #800080 } /* Name.Entity */
34 | .highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */
35 | .highlight .nf { color: #990000; font-weight: bold } /* Name.Function */
36 | .highlight .nn { color: #555555 } /* Name.Namespace */
37 | .highlight .nt { color: #000080 } /* Name.Tag */
38 | .highlight .nv { color: #008080 } /* Name.Variable */
39 | .highlight .ow { font-weight: bold } /* Operator.Word */
40 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
41 | .highlight .mf { color: #009999 } /* Literal.Number.Float */
42 | .highlight .mh { color: #009999 } /* Literal.Number.Hex */
43 | .highlight .mi { color: #009999 } /* Literal.Number.Integer */
44 | .highlight .mo { color: #009999 } /* Literal.Number.Oct */
45 | .highlight .sb { color: #d14 } /* Literal.String.Backtick */
46 | .highlight .sc { color: #d14 } /* Literal.String.Char */
47 | .highlight .sd { color: #d14 } /* Literal.String.Doc */
48 | .highlight .s2 { color: #d14 } /* Literal.String.Double */
49 | .highlight .se { color: #d14 } /* Literal.String.Escape */
50 | .highlight .sh { color: #d14 } /* Literal.String.Heredoc */
51 | .highlight .si { color: #d14 } /* Literal.String.Interpol */
52 | .highlight .sx { color: #d14 } /* Literal.String.Other */
53 | .highlight .sr { color: #009926 } /* Literal.String.Regex */
54 | .highlight .s1 { color: #d14 } /* Literal.String.Single */
55 | .highlight .ss { color: #990073 } /* Literal.String.Symbol */
56 | .highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */
57 | .highlight .vc { color: #008080 } /* Name.Variable.Class */
58 | .highlight .vg { color: #008080 } /* Name.Variable.Global */
59 | .highlight .vi { color: #008080 } /* Name.Variable.Instance */
60 | .highlight .il { color: #009999 } /* Literal.Number.Integer.Long */
61 | 


--------------------------------------------------------------------------------
/geninst.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Generic installation instructions
  4 | tagline:
  5 | ---
  6 | 
  7 | ## How to run GHTorrent locally
  8 | 
  9 | Depending on the size of the local mirror you have the following configuration
 10 | simplification options:
 11 | 
 12 | * You can skip using MongoDB if you only need to query the relational
 13 | database and/or you just need to use GHTorrent once.
 14 | 
 15 | * You can use SQLite3 instead of MySQL if your setup only contains a few
 16 | (say, less than 1000) small projects.
 17 | 
 18 | ### Install Ruby and dependencies
 19 | 
 20 | Make sure you run the latest release of Ruby. On the main server, GHTorrent runs
 21 | on Ruby 2. If you are on Mac or Linux, you can use [RVM](https://rvm.io/)  to
 22 | manage Ruby versions.
 23 | 
 24 | Install the necessary dependencies:
 25 | 
 26 | {% highlight bash %}
 27 | sudo apt-get install build-essential curl libmysqlclient-dev
 28 | # Install RVM and Ruby 2.2
 29 | gpg --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3
 30 | curl -L https://get.rvm.io | bash -s stable --ruby=2.2
 31 | rvm use 2.2
 32 | gem install bundler sqlite3 #or mysql2
 33 | {% endhighlight %}
 34 | 
 35 | ### Install the source code
 36 | 
 37 | Checkout the latest version of the
 38 | [ghtorrent](https://github.com/gousiosg/github-mirror.git) Gem through Github.
 39 | By default, it will be checked out in the directory `github-mirror`. The
 40 | released versions of the Gem represent good states in the project's lifetime;
 41 | the main mirror always works on the bleeding edge, which contains error fixes
 42 | and updates to comply with changes to Github's API. You then need to install
 43 | the dependencies:
 44 | 
 45 | {% highlight bash %}
 46 | cd github-mirror
 47 | bundle install
 48 | {% endhighlight %}
 49 | 
 50 | Alternatively, you can just install the latest version of the GHTorrent gem:
 51 | 
 52 | {% highlight bash %}
 53 | gem install ghtorrent
 54 | {% endhighlight %}
 55 | 
 56 | #### Configure
 57 | 
 58 | **If you are using MySQL**, you need to create a user and a database, like so
 59 | 
 60 | {% highlight mysql %}
 61 | # Login as MySQL root user
 62 | mysql> create user ghtorrentuser@'localhost' identified by 'ghtorrentpassword';
 63 | mysql> create user ghtorrentuser@'%' identified by 'ghtorrentpassword';
 64 | mysql> grant all privileges on *.* to 'ghtorrentuser'@'localhost';
 65 | mysql> grant all privileges on *.* to 'ghtorrentuser'@'%';
 66 | 
 67 | # Login as the ghtorrent user
 68 | mysql> CREATE SCHEMA IF NOT EXISTS `ghtorrent` DEFAULT CHARACTER SET utf8 ;
 69 | {% endhighlight %}
 70 | 
 71 | **If you are using MongoDB**, you can just disable
 72 | authentication (run `mongod` with `--noauth`). If you do want to create a user,
 73 | it can be a bit more involved, see below:
 74 | 
 75 | {% highlight javascript %}
 76 | > db.createUser(
 77 |   {
 78 |     user: "root",
 79 |     pwd: "admin",
 80 |     roles: [ { role: "userAdminAnyDatabase", db: "admin" } ]
 81 |   }
 82 | )
 83 | 
 84 | > use ghtorrent
 85 | > db.createUser(
 86 |     {
 87 |       user: "ghtorrent",
 88 |       pwd: "ghtorrent",
 89 |       roles: [
 90 |          { role: "dbOwner", db: "ghtorrent" }
 91 |       ]
 92 |     }
 93 | )
 94 | {% endhighlight %}
 95 | 
 96 | **Download the
 97 | [sample configuration file](https://raw.githubusercontent.com/gousiosg/github-mirror/master/config.yaml.tmpl)**,
 98 | save it as `config.yaml` and change options as necessary. Important things to
 99 | configure are:
100 | 
101 | * The database connection string
102 | * The MongoDB connection details (if you are using it)
103 | * Your GitHub username/password or an API token. See
104 |   [instructions here](raw.html) on how to obtain an API key
105 | 
106 | ### Run and profit
107 | 
108 | To download the data for your first project, run:
109 | 
110 | {% highlight bash %}
111 | # Retrieve one repo
112 | ruby -Ilib bin/ght-retrieve-repo -c config.yaml gousiosg github-mirror
113 | {% endhighlight %}
114 | 
115 | You should see lots of output. After a while, you will have 1/2 databases
116 | full of data!
117 | 


--------------------------------------------------------------------------------
/mongo.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Collections in MongoDB
  4 | tagline: 
  5 | ---
  6 | 
  7 | Here is a list of collections along with the Github API URL they cache data
  8 | from. All URLs need to be prefixed with `https://api.github.com/`. In MongoDB,
  9 | each entity is by default indexed by the parameter fields in each corresponding
 10 | URL (see also the actual [default indexes](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/adapters/mongo_persister.rb#L23)).
 11 | 
 12 | <table class="table table-hover table-condensed">
 13 |   <thead>
 14 |     <tr>
 15 |       <th>Collection name</th>
 16 |       <th>Github API URL</th>
 17 |       <th>Documentation URL</th>
 18 |     </tr>
 19 |   </thead>
 20 |   <tbody>
 21 |     <tr>
 22 |       <td>commit_comments</td>
 23 |       <td><tt>repos/#{user}/#{repo}/commits/#{sha}/comments</tt></td>
 24 |       <td><a href="http://developer.github.com/v3/repos/comments/#list-comments-for-a-single-commit">commit comments</a></td>
 25 |     </tr>
 26 |     <tr>
 27 |       <td>commits</td>
 28 |       <td><tt>repos/#{user}/#{repo}/commits</tt></td>
 29 |       <td><a href="http://developer.github.com/v3/repos/commits/#list-commits-on-a-repository">commits</a></td>
 30 |     </tr>
 31 |     <tr>
 32 |       <td>events</td>
 33 |       <td><tt>events</tt></td>
 34 |       <td><a href="http://developer.github.com/v3/activity/events/">events</a></td>
 35 |     </tr>
 36 |     <tr>
 37 |       <td>followers</td>
 38 |       <td><tt>users/#{user}/followers</tt></td>
 39 |       <td><a href="http://developer.github.com/v3/users/followers/#list-followers-of-a-user">followers list</a></td>
 40 |     </tr>
 41 |     <tr>
 42 |       <td>forks</td>
 43 |       <td><tt>repos/#{user}/#{repo}/forks</tt></td>
 44 |       <td><a href="http://developer.github.com/v3/repos/forks/#list-forks">forks list</a></td>
 45 |     </tr>
 46 |     <tr>
 47 |       <td>issues</td>
 48 |       <td><tt>repos/#{owner}/#{repo}/issues</tt></td>
 49 |       <td><a href="http://developer.github.com/v3/issues/#list-issues-for-a-repository">issues for a repo</a></td>
 50 |     </tr>
 51 |     <tr>
 52 |       <td>issue_comments</td>
 53 |       <td><tt>repos/#{owner}/#{repo}/issues/comments/#{comment_id}</tt></td>
 54 |       <td><a href="http://developer.github.com/v3/issues/comments/#list-comments-on-an-issue">issue comments</a></td>
 55 |     </tr>
 56 |     <tr>
 57 |       <td>issue_events</td>
 58 |       <td><tt>repos/#{owner}/#{repo}/issues/events/#{event_id}</tt></td>
 59 |       <td><a href="http://developer.github.com/v3/issues/events/">issue events</a></td>
 60 |     </tr>
 61 |     <tr>
 62 |       <td>org_members</td>
 63 |       <td><tt>orgs/#{org}/members</tt></td>
 64 |       <td><a href="http://developer.github.com/v3/orgs/members/">organization members</a></td>
 65 |     </tr>
 66 |     <tr>
 67 |       <td>pull_request_comments</td>
 68 |       <td><tt>repos/#{owner}/#{repo}/pulls/#{pullreq_id}/comments</tt></td>
 69 |       <td><a href="http://developer.github.com/v3/pulls/comments/">pull request review comments</a></td>
 70 |     </tr>
 71 |     <tr>
 72 |       <td>pull_requests</td>
 73 |       <td><tt>repos/#{user}/#{repo}/pulls</tt></td>
 74 |       <td><a href="http://developer.github.com/v3/pulls/">pull requests</a></td>
 75 |     </tr>
 76 |     <tr>
 77 |       <td>repo_collaborators</td>
 78 |       <td><tt>repos/#{user}/#{repo}/collaborators</tt></td>
 79 |       <td><a href="http://developer.github.com/v3/repos/collaborators/">repo collaborators</a></td>
 80 |     </tr>
 81 |     <tr>
 82 |       <td>repo_labels</td>
 83 |       <td><tt>repos/#{owner}/#{repo}/issues/#{issue_id}/labels</tt></td>
 84 |       <td><a href="http://developer.github.com/v3/issues/labels/#list-all-labels-for-this-repository">issue labels</a></td>
 85 |     </tr>
 86 |     <tr>
 87 |       <td>repos</td>
 88 |       <td><tt>repos/#{user}/#{repo}</tt></td>
 89 |       <td><a href="http://developer.github.com/v3/repos/#list-all-public-repositories">repositories</a></td>
 90 |     </tr>
 91 |     <tr>
 92 |       <td>users</td>
 93 |       <td><tt>users/#{user}</tt></td>
 94 |       <td><a href="http://developer.github.com/v3/users/#get-a-single-user">users</a></td>
 95 |     </tr>
 96 |     <tr>
 97 |       <td>watchers</td>
 98 |       <td><tt>repos/#{user}/#{repo}/stargazers</tt></td>
 99 |       <td><a href="http://developer.github.com/v3/activity/starring/#list-stargazers">stargazers</a></td>
100 |     </tr>
101 |   </tbody>
102 | </table>
103 | 


--------------------------------------------------------------------------------
/cookbook.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: The GHTorrent cookbook
  4 | tagline:
  5 | ---
  6 | 
  7 | The following recipes assume that you have GHTorrent checked out and its
  8 | dependencies configured as appropriate. The first step to all
  9 | the items below is to change the working directory to the
 10 | GHTorrent checkout, i.e. `cd github-mirror`.
 11 | 
 12 | ## General information
 13 | 
 14 | All GHTorrent commands accept the following important arguments:
 15 | 
 16 | * `-c`: Specifies the location of the configuration file. The config file is
 17 | auto discovered if it is in the same directory and named `config.yaml`
 18 | * `-t`: The GitHub token to use to do the API calls.
 19 | * `-l`: A limit to reqs/hr. This can be used to restrict the requests
 20 | * `-u`: A unique name for the running program, to differentiate
 21 | * `-v`: Verbose mode, means different things to different commands.
 22 | 
 23 | ### Getting all info about a single repo
 24 | 
 25 | The following will retrieve all information from the beginning of time
 26 | for a single repo.
 27 | 
 28 | {% highlight bash%}
 29 | ruby -Ilib bin/ght-retrieve-repo gousiosg github-mirror
 30 | {% endhighlight %}
 31 | 
 32 | There are a few arguments for this command to make it faster for specific
 33 | cases:
 34 | 
 35 | * `-n`: Do not retrieve events
 36 | * `-o`: Do not retrieve entities
 37 | * `-y`: Only retrieve one entity type. For example `-y ensure_commits` will
 38 | retrieve just the commits and finish.
 39 | * `-r` and `-b`: Process all events before or after a specific date.
 40 | 
 41 | ### Getting all info about a user
 42 | 
 43 | This will retrieve all data (followers, organizations etc) for a single
 44 | user/organization
 45 | 
 46 | {% highlight bash%}
 47 | ruby -Ilib bin/ght-retrieve-user gousiosg
 48 | {% endhighlight %}
 49 | 
 50 | 
 51 | The following will retrieve all users in the Microsoft organization
 52 | 
 53 | {% highlight bash%}
 54 | ruby -Ilib bin/ght-retrieve-user Microsoft
 55 | {% endhighlight %}
 56 | 
 57 | ### Getting many users/repos in parallel
 58 | 
 59 | The quick and dirty solution is to use
 60 | [GNU Parallel](http://www.gnu.org/software/parallel/). To do that you need two files, one listing API keys (`keys.txt`) and one listing repository names (`projects.txt`), see for example:
 61 | 
 62 | {% highlight bash%}
 63 | $ head -n 5 projects.txt
 64 | eed3si9n scalaxb
 65 | pocorall scaloid
 66 | socrata-platform soql-bigquery-adapter
 67 | ReactiveMongo Play-ReactiveMongo
 68 | chrisdinn brando
 69 | 
 70 | $ head -n 5 keys.txt
 71 | # Not real keys
 72 | d15d119551fd194745cb81df4f4c68c55460bf37
 73 | c3a1a550bcfc39ea374452f95a1dbe3002a3b8ab
 74 | ea9e186f882c853fe0eb3e387b8c01aafdca8645
 75 | bd3d11ae101cf931aed92f76ffc2f6ba40e3c9fa
 76 | c6e15a389537675539ddd4bf1ef7e0f96520ec3e
 77 | {% endhighlight %}
 78 | 
 79 | Then you can use GNU `parallel` like so:
 80 | 
 81 | {% highlight bash%}
 82 | parallel --no-notice --gnu --progress --joblog parjobs --xapply -P 4 \
 83 |  ruby -Ilib bin/ght-retrieve-repo -c config.yaml -t {3} {1} {2} \
 84 |  ::: `cat projects.txt | cut -f1 -d' '` \
 85 |  ::: `cat projects.txt|cut -f2 -d' '` \
 86 |  ::: `cat keys.txt`
 87 | {% endhighlight %}
 88 | 
 89 | The important parameter to tune here is `-P`, the number of parallel processes.
 90 | To retrieve users in parallel, you need to replace `ght-retrieve-repo` with
 91 | `ght-retrieve-user`.
 92 | 
 93 | ### Loading items to the queue
 94 | 
 95 | In some cases (e.g. bugs, network glitches etc), some events might
 96 | remain unprocessed. In other cases, you might want some events on
 97 | the queue, e.g. to test new functionality. The tool to use in this case
 98 | is `ght-load`. Below are some use cases:
 99 | 
100 | {% highlight bash%}
101 | # Load all PushEvents since yesterday
102 | ruby -Ilib bin/ght-load -v -e `gdate +%s --date '1 day ago'` -f PushEvent
103 | 
104 | # Load all events of previous day at a rate of 10 events/sec
105 | ruby -Ilib bin/ght-load -v -r 10 -e `gdate +%s --date '1 day ago'` -t `gdate +%s --date '2 days ago'`
106 | 
107 | # Load all events of previous day at a rate of 10 events/sec
108 | ruby -Ilib bin/ght-load -v -r 10 -e `gdate +%s --date '1 day ago'` -t `gdate +%s --date '2 days ago'`
109 | {% endhighlight %}
110 | 
111 | 
112 | `ght-load` can also be used to load arbitrary items read from files
113 |  rather than MongoDB. In this case, a routing key can be attached
114 |  in order for the loaded items to go to the appropriate queue.
115 | 
116 | {% highlight bash%}
117 | # Load items from file with a routing key
118 | ruby -Ilib bin/ght-load -i users.txt -o 'evt.users'
119 | {% endhighlight %}
120 | 
121 | 


--------------------------------------------------------------------------------
/leanprogress.html:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Lean Request Results
  4 | group: navigation
  5 | ---
  6 | 
  7 | <script>
  8 |   // Polyfill: define Array.prototype.last (returns the final element, undefined for an empty array) unless it already exists.
  9 |   if (!Array.prototype.last){
 10 |     Array.prototype.last = function(){
 11 |       return this[this.length - 1];
 12 |     };
 13 |   };
 14 |   // Returns the text of the /*! ... */ comment embedded in this function's source, so a function body can carry a multi-line string.
 15 |   Function.prototype.extractComment = function() {
 16 |     var startComment = "/*!";
 17 |     var endComment = "*/";
 18 |     var str = this.toString();
 19 |     // Locate the first "/*!" and the last "*/" in the function's source text.
 20 |     var start = str.indexOf(startComment);
 21 |     var end = str.lastIndexOf(endComment);
 22 |     // The negative end index -(str.length - end) counts back from the end of str to position `end`.
 23 |     return str.slice(start + startComment.length, -(str.length - end));
 24 |   };
 25 | 
 26 |   String.prototype.format = function() {
 27 |     var newStr = this, i = 0;
 28 |     while (/%s/.test(newStr))
 29 |         newStr = newStr.replace("%s", arguments[i++])
 30 | 
 31 |     return newStr;
 32 |   }
 33 |   // Accordion panel template, stored as a /*! ... */ comment and read back with extractComment(); its six %s slots are: tooltip, icon class, panel id, heading text, panel id, body HTML.
 34 |   var accTmpl = function() { /*!
 35 |   <div class="panel panel-default">
 36 |     <div class="panel-heading">
 37 |       <h4 class="panel-title">
 38 |       <i data-toggle="tooltip" title="%s" data-placement="top" class="fa fa-%s"></i>
 39 |       <a data-toggle="collapse" data-parent="#accordion" href="#%s">
 40 |       %s
 41 |       </a>
 42 |       </h4>
 43 |     </div>
 44 |     <div id="%s" class="panel-collapse collapse">
 45 |       <div class="panel-body">
 46 |       %s
 47 |       </div>
 48 |     </div>
 49 |   </div>*/}.extractComment();
 50 | 
 51 |   function param(name){
 52 |    if(name=(new RegExp('[?&]'+encodeURIComponent(name)+'=([^&]*)')).exec(location.search))
 53 |       return decodeURIComponent(name[1]);
 54 |   }
 55 | 
 56 | $(document).ready(function() {
 57 | 
 58 |   var req_id = param('id');
 59 | 
 60 |   if (req_id == null || req_id.length != 40) {
 61 |     $("#progress").text("No request specified or invalid request id");
 62 |     return;
 63 |   }
 64 | 
 65 |   $.ajax({
 66 |     url: "http://ghtorrent.org/lean/requests/" + req_id,
 67 |     success: results,
 68 |     error: function(xhr, status, exception) {
 69 |        $("#progress").text("Invalid request id");
 70 |     }
 71 |   });
 72 | 
 73 |   function results(data, status, xhr) {
 74 | 
 75 |     function idGen() {
 76 |       return (Math.random() + 1).toString(36).substr(2, 5);
 77 |     }
 78 | 
 79 |     var toAppend = Object.keys(data).reduce(function(acc, project, x, y) {
 80 | 
 81 |       var id = idGen();
 82 | 
 83 |       var content = data[project].items.reduce(function(acc, item, x, y) {
 84 |         var itemFmt = '<li><i class="fa-li fa fa-%s"></i>%s: %s</li>';
 85 | 
 86 |         var d = new Date(item.created_at * 1000);
 87 |         var ts = "%s/%s/%s %s:%s:%s".format(d.getFullYear(), d.getMonth() + 1,
 88 |           d.getDay(), d.getHours(), d.getMinutes(), d.getSeconds());
 89 | 
 90 |         var icon = "";
 91 |         if (item.status == "ok"){
 92 |           icon = "check-square";
 93 |         } else {
 94 |           icon = "frown-o warncolor";
 95 |         }
 96 | 
 97 |         return (acc + itemFmt.format(icon, ts, item.item));
 98 |       }, '<ul class="fa-ul">');
 99 |       content = content + "</ul>";
100 | 
101 |       if (data[project].req_status.status == "finished") {
102 |         statusIcon = "check-circle";
103 |       } else if (data[project].req_status.status == "working") {
104 |         statusIcon = "cogs";
105 |       } else if (data[project].req_status.status == "stopped") {
106 |         statusIcon = "stop";
107 |       } else if (data[project].req_status.status == "error") {
108 |         statusIcon = "frown-o warncolor";
109 |       } else  {
110 |         statusIcon = "question-circle";
111 |       }
112 | 
113 |       var tooltip = "status: %s, last update: %s";
114 |       var updts = new Date(data[project].req_status.created_at * 1000);
115 |       var upd = "%s/%s/%s %s:%s:%s".format(updts.getFullYear(),
116 |             updts.getMonth() + 1, updts.getDay(), updts.getHours(),
117 |             updts.getMinutes(), updts.getSeconds());
118 |       tooltip = tooltip.format(data[project].req_status.status, upd);
119 | 
120 |       return (acc + accTmpl.format(tooltip, statusIcon, id, project, id, content));
121 |     }, "");
122 | 
123 |     $("#accordion").append(toAppend);
124 |     $("#expander").show();
125 |     $('.fa').tooltip();
126 |   }
127 | 
128 |   $("#expander").click(function() {
129 |      $('.panel-collapse').collapse({'toggle': true, 'parent': '#accordion'});
130 |   });
131 | 
132 | });
133 | </script>
134 | 
135 | <a class="btn btn-primary" id="expander" style="display:none"><i class="fa fa-expand fa-fw"></i>Expand all</a>
136 | 
137 | <div class="panel-group" id="accordion">
138 |   <style type = "text/css" scoped>
139 |     .warncolor {color:red;}
140 |   </style>
141 | </div>
142 | 
143 | <div id="progress"></div>
144 | 


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: The GHTorrent project
  4 | tagline:
  5 | ---
  6 | 
  7 | Welcome to the GHTorrent project, an effort to create a scalable, queriable,
  8 | offline mirror of data offered through the [Github REST API](http://developer.github.com).
  9 | 
 10 | Follow [@ghtorrent](https://twitter.com/ghtorrent) on Twitter for project
 11 | updates and [exciting research](halloffame.html) done with GHTorrent.
 12 | 
 13 | ## What does GHTorrent do?
 14 | 
 15 | GHTorrent monitors the [Github public event time
 16 | line](https://api.github.com/events). For each event, it retrieves its contents
 17 | and their dependencies, exhaustively. It then stores the raw JSON responses to a
 18 | [MongoDB database](raw.html), while also extracting their structure in a [MySQL
 19 | database](relational.html).
 20 | 
 21 | GHTorrent works in a distributed manner. A [RabbitMQ](http://www.rabbitmq.com/)
 22 | message queue sits between the event mirroring and data retrieval phases, so
 23 | that both can be run on a cluster of machines. Have a look at this
 24 | [presentation](https://speakerdeck.com/gousiosg/mining-github-for-fun-and-profit)
 25 | and read [this paper](http://gousios.gr/bibliography/GS12.html) if you want to
 26 | know more. Here is the [source code](https://github.com/gousiosg/github-mirror).
 27 | 
 28 | The project releases the data collected during that period as
 29 | [downloadable archives](downloads.html).
 30 | 
 31 | ### How much data do you have?
 32 | 
 33 | Currently (Jan 2015), MongoDB stores around 4TB of JSON data (compressed), while
 34 | MySQL more than 1.5 billion rows of extracted metadata.  A large part of the
 35 | activity of 2012, 2013, 2014 and 2015 has been retrieved, while we are also
 36 | going backwards to retrieve the full recorded history of important projects.
 37 | 
 38 | ### How can I help?
 39 | 
 40 | GHTorrent needs contributions on the following fronts:
 41 | 
 42 | * **API keys:** We can run multiple GHTorrent worker instances concurrently. To
 43 | go over Github's API rate limit, we need multiple Github API keys provided by
 44 | users.  If you use GHTorrent for your research, please consider [donating a
 45 | key](services.html).
 46 | 
 47 | * **Linking and analysis:** GHTorrent currently only does limited analysis and
 48 | linking within the dataset (user geolocation). There are many possibilities
 49 | for expansion. One could for example think of linking commits to issues.
 50 | 
 51 | * **Reporting bugs:** Please use Github's [issue tracker here](https://github.com/gousiosg/ghtorrent.org/issues) to report any data consistency issues you have found.
 52 | 
 53 | * **Donating:** We are trying to make GHTorrent a self-sustainable operation.
 54 | If you are using GHTorrent, please consider donating (you can find a donation
 55 | button on the left). All individuals/companies that have donated will be listed
 56 | in the Hall of Fame page.
 57 | 
 58 | ### Why did you do it?
 59 | 
 60 | We are doing research on [software repositories](http://www.msrconf.org/).
 61 | Github is an exciting new data source for us, one that has several of the
 62 | problems we are facing as data miners solved. The uniformity of data
 63 | will allow scaling of research to hundreds or thousands of repositories
 64 | spanning across multiple languages and application domains.
 65 | 
 66 | ### Why the name?
 67 | 
 68 | Initially the project offered the data through the Bittorrent network (gh: from
 69 | GitHub, torrent: from Bittorrent). As currently the data is only offered through
 70 | HTTP, the name signifies a [torrent](https://en.wiktionary.org/wiki/torrent) of
 71 | data coming from GitHub.
 72 | 
 73 | ### Can I know more?
 74 | 
 75 | Have a look at the following presentation for a short introduction.
 76 | 
 77 | <div style="width: 50%;margin-left:auto;margin-right:auto;">
 78 | <script class="speakerdeck-embed" data-id="75bea5909fbb0130f0eb364613f6f036" data-ratio="1.33333" src="//speakerdeck.com/assets/embed.js"></script>
 79 | </div>
 80 | 
 81 | ### How can I cite this work?
 82 | 
 83 | If you find this dataset useful and want to use it in your work, please cite the
 84 | following paper:
 85 | 
 86 | Georgios Gousios: [The GHTorrent dataset and tool
 87 | suite](http://www.gousios.gr/bibliography/G13.html). MSR 2013: 233-236
 88 | 
 89 | {%highlight text%}
 90 | @inproceedings{Gousi13,
 91 |   author = {Gousios, Georgios},
 92 |   title = {The GHTorrent dataset and tool suite},
 93 |   booktitle = {Proceedings of the 10th Working Conference on Mining Software
 94 |     Repositories},
 95 |   series = {MSR '13},
 96 |   year = {2013},
 97 |   isbn = {978-1-4673-2936-1},
 98 |   location = {San Francisco, CA, USA},
 99 |   pages = {233--236},
100 |   numpages = {4},
101 |   url = {http://dl.acm.org/citation.cfm?id=2487085.2487132},
102 |   acmid = {2487132},
103 |   publisher = {IEEE Press},
104 |   address = {Piscataway, NJ, USA},
105 | }
106 | {%endhighlight%}
107 | 
108 | ### Latest news
109 | 
110 | <a class="twitter-timeline"
111 |   data-widget-id="608916912693751808"
112 |   href="https://twitter.com/ghtorrent"
113 |   data-screen-name="ghtorrent">
114 | Latest news
115 | </a>
116 | 


--------------------------------------------------------------------------------
/gcloud.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: GHTorrent on the Google cloud
  4 | tagline:
  5 | ---
  6 | 
  7 | GHTorrent can be accessed over Google Cloud services. To access the data
  8 | requires you to have a Google Cloud account. Reasonable use is free of charge
  9 | and, in the case of BigQuery, it [should no longer require a credit
 10 | card](https://cloud.google.com/blog/big-data/2017/01/how-to-run-a-terabyte-of-google-bigquery-queries-each-month-without-a-credit-card). (Pub/Sub still requires a credit card). You can check what Google considers reasonable at any given moment
 11 | [here](https://cloud.google.com/pricing/free).
 12 | 
 13 | * [Google BigQuery](https://bigquery.cloud.google.com/dataset/ghtorrent-bq:ght)
 14 |   contains an up to date import of the latest GHTorrent MySQL dump.
 15 | 
 16 | * [Google Pub/Sub](https://console.cloud.google.com/cloudpubsub/topicList?project=ghtorrent-bq) exposes real-time streams of GitHub activity.
 17 | 
 18 | Both services can be accessed through the Web, the command line (after
 19 | installing the Google Cloud [command line utils](https://cloud.google.com/sdk/)) or through various programming languages.
 20 | 
 21 | ### BigQuery
 22 | 
 23 | With BigQuery, you can query GHTorrent's MySQL dataset using an SQL-like
 24 | language (lately, BigQuery also supports vanilla SQL); more importantly, you can 
 25 | join the dataset with other open datasets (e.g. GitHub's own project data, Reddit, 
 26 | [TravisTorrent](https://travistorrent.testroots.org/page_access/) etc) hosted on BigQuery.
 27 | 
 28 | To get the most popular programming languages by number of bytes written,
 29 | run the following:
 30 | 
 31 | {% highlight sql %}
 32 | select pl3.lang, sum(pl3.size) as total_bytes
 33 | from (
 34 |   select pl2.bytes as size, pl2.language as lang
 35 |   from (
 36 |     select pl.language as lang, max(pl.created_at) as latest, pl.project_id as project_id
 37 |     from [ght.project_languages] pl
 38 |       join [ght.projects] p on p.id = pl.project_id
 39 |     where p.deleted is false
 40 |       and p.forked_from is null
 41 |     group by lang, project_id
 42 |   ) pl1 join [ght.project_languages] pl2 on pl1.project_id = pl2.project_id
 43 |                                         and pl1.latest = pl2.created_at
 44 |                                         and pl1.lang = pl2.language
 45 | ) pl3
 46 | group by pl3.lang
 47 | order by total_bytes desc
 48 | {% endhighlight %}
 49 | 
 50 | To get the user with the most Java commits in the Netherlands in June 2016,
 51 | do the following:
 52 | 
 53 | {% highlight sql %}
 54 | select u.login as login, u.location as location, count(c.id) as num_commits
 55 | from [ghtorrent-bq.ght.project_commits] pc join
 56 |      (SELECT id, author_id FROM [ghtorrent-bq.ght.commits] WHERE
 57 |      date(created_at) between date('2016-06-01')
 58 |                           and date('2016-07-01') )c on pc.commit_id = c.id join
 59 |      (SELECT id
 60 |      FROM [ghtorrent-bq.ght.projects] WHERE language = 'Java') p on p.id = pc.project_id join
 61 |      (SELECT login, location, id
 62 |      FROM [ghtorrent-bq.ght.users]
 63 |      WHERE country_code = 'nl') u on c.author_id = u.id
 64 | group by login, location
 65 | order by num_commits desc;
 66 | {% endhighlight %}
 67 | 
 68 | See also some queries by [Felipe Hoffa](https://medium.com/@hoffa/github-top-countries-201608-13f642493773).
 69 | 
 70 | ### Pub/Sub
 71 | 
 72 | Pub/Sub allows subscribers to get events of what is happening on GitHub (or at
 73 | least GHTorrent's interpretation of what is happening on GitHub) in almost real time. 
 74 | To do so, one needs to *subscribe* to one of the available *topics* with
 75 | a client in order to start receiving *events*.
 76 | 
 77 | The service is complementary, even though less fine-grained, to GHTorrent's own
 78 | [streaming interface](streaming.html). As is also the case with GHTorrent
 79 | streaming, the contents of the streams are generated by following the live
 80 | MongoDB server replication stream. See the code [here](https://github.com/ghtorrent/ghtorrent-streaming).
 81 | 
 82 | To subscribe to a topic, e.g. `commits`, run the following:
 83 | 
 84 | ```
 85 | gcloud beta pubsub subscriptions create my_commits_subscription --topic projects/ghtorrent-bq/topics/commits
 86 | ```
 87 | 
 88 | To start receiving events, you can try the command line
 89 | 
 90 | ```
 91 | gcloud beta pubsub subscriptions pull --auto-ack --max-messages 5 -- my_commits_subscription
 92 | ```
 93 | 
 94 | The available topics are the following:
 95 | 
 96 | {% highlight txt%}
 97 | projects/ghtorrent-bq/topics/commits
 98 | projects/ghtorrent-bq/topics/events
 99 | projects/ghtorrent-bq/topics/followers
100 | projects/ghtorrent-bq/topics/forks
101 | projects/ghtorrent-bq/topics/issue_comments
102 | projects/ghtorrent-bq/topics/issue_events
103 | projects/ghtorrent-bq/topics/issues
104 | projects/ghtorrent-bq/topics/org_members
105 | projects/ghtorrent-bq/topics/pull_request_comments
106 | projects/ghtorrent-bq/topics/pull_requests
107 | projects/ghtorrent-bq/topics/repo_collaborators
108 | projects/ghtorrent-bq/topics/repo_labels
109 | projects/ghtorrent-bq/topics/repos
110 | projects/ghtorrent-bq/topics/users
111 | projects/ghtorrent-bq/topics/watchers
112 | {% endhighlight %}
113 | 
114 | 


--------------------------------------------------------------------------------
/pullreq-perf/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Pull Request Performance reports
  4 | tagline: 
  5 | ---
  6 | 
  7 | ```{r preample, include=FALSE}
  8 | 
  9 | #
 10 | # (c) 2012 -- 2016 Georgios Gousios <gousiosg@gmail.com>
 11 | #
 12 | # BSD licensed, see LICENSE in top level dir
 13 | #
 14 | # projectsjs: comma-separated quoted 'a-b' names for the search box; projectsmd: one markdown link per row. `projects` is not defined in this file - presumably supplied by the knitting script with owner and repo as its first two columns (TODO confirm).
 15 | projectsjs = paste(apply(projects, 1, function(x){sprintf("'%s-%s'", x[1], x[2])}), collapse = ",")
 16 | projectsmd = paste(apply(projects, 1, function(x){sprintf("[%s/%s](/pullreq-perf/%s-%s/)", x[1], x[2], x[1], x[2])}), collapse="\n")
 17 | ```
 18 | 
 19 | See here some reports for popular repositories:
 20 | 
 21 | <div class="btn-toolbar">
 22 | <a class="btn" href="/pullreq-perf/rails-rails/">Ruby on Rails</a>
 23 | <a class="btn"  href="/pullreq-perf/ipython-ipython/">IPython</a>
 24 | <a  class="btn" href="/pullreq-perf/angular-angular.js/">Angular.js</a>
 25 | <a  class="btn" href="/pullreq-perf/nodejs-node/">Node.js</a>
 26 | <a  class="btn" href="/pullreq-perf/jquery-jquery/">JQuery</a>
 27 | <a  class="btn" href="/pullreq-perf/Homebrew-homebrew/">Homebrew</a>
 28 | <a  class="btn" href="/pullreq-perf/akka-akka/">Akka</a>
 29 | <a  class="btn" href="/pullreq-perf/rstudio-shiny/">Shiny</a>
 30 | </div>
 31 | <div class="btn-toolbar">
 32 | <a  class="btn" href="/pullreq-perf/mbostock-d3/">D3.js</a>
 33 | <a  class="btn" href="/pullreq-perf/bartaz-impress.js/">impress.js</a>
 34 | <a  class="btn" href="/pullreq-perf/jekyll-jekyll/">Jekyll</a>
 35 | <a  class="btn" href="/pullreq-perf/django-django/">Django</a>
 36 | <a  class="btn" href="/pullreq-perf/antirez-redis/">Redis</a>
 37 | <a  class="btn" href="/pullreq-perf/bitcoin-bitcoin/">Bitcoin</a>
 38 | <a  class="btn" href="/pullreq-perf/ReactiveX-RxJava/">RxJava</a>
 39 | <a  class="btn" href="/pullreq-perf/tensorflow-tensorflow/">Tensorflow</a>
 40 | 
 41 | </div>
 42 | 
 43 | <label for="projectSearch">Or search for project repository (e.g. akka-akka): </label>
 44 | <span>
 45 | <input id="projectSearch" type="search" data-provide="typeahead" />
 46 | <button id="go" type="button" class="btn btn-default">Go</button>
 47 | </span>
 48 | <script type="text/javascript">
 49 |   projects = [`r projectsjs`];
 50 |   // Offer the knitted project names as typeahead suggestions for the search box.
 51 |   $("#projectSearch").typeahead({
 52 |     source: projects
 53 |   });
 54 |   // "Go" navigates to the report page of the entered project; an empty search box is ignored.
 55 |   $("#go").click(function(){
 56 |     if ($("#projectSearch").val() != "") {
 57 |       window.location = "http://ghtorrent.org/pullreq-perf/" + $("#projectSearch").val();
 58 |     }
 59 |   });
 60 | 
 61 | </script>
 62 | 
 63 | ### What is this report about?
 64 | 
 65 | The report presents data on various aspects of pull request related activity 
 66 | within a project repository, with a special focus on how open the project
 67 | is to external contributions.
 68 | 
 69 | ### How did you choose the projects to analyze?
 70 | 
 71 | As a starting point, I used the 
 72 | [repository set](https://github.com/gousiosg/pullreqs/blob/master/projects.txt) 
 73 | that I also use for my
 74 | [research](http://www.gousios.gr/bibliography/GPD14.html) 
 75 | [work](http://www.gousios.gr/bibliography/GZ14.html).
 76 | Then, I added to this selection, the top 1000 repositories by number of stars
 77 | as reported by the GHTorrent database. I also added the top 50 projects in
 78 | terms of 
 79 | [total contributions received](http://www.gousios.gr/blog/The-triumph-of-online-collaboration/).
 80 | The actual list of projects can be found
 81 | [here](https://github.com/gousiosg/ghtorrent.org/blob/master/pullreq-perf/projects.txt) 
 82 | or by clicking on the "Show all repos" button below.
 83 | 
 84 | ### How did you build it?
 85 | 
 86 | I used R to query GHTorrent's main MySQL database, then piped the results
 87 | through [knitr](http://yihui.name/knitr/) templates which use
 88 | [ggplot2](http://ggplot2.org/) for generating the nice plots.
 89 | You can find the code [here](https://github.com/ghtorrent/ghtorrent.org/tree/master/pullreq-perf).
 90 | 
 91 | ### My repository is not included!
 92 | 
 93 | You can [send me an email](mailto:gousiosg@gmail.com) and I will make sure your
 94 | repository will be included in the next round of report generation. You can
 95 | also [edit this file](https://github.com/ghtorrent/ghtorrent.org/blob/master/pullreq-perf/projects.txt) and send a pull request.
 96 | 
 97 | ### The report for my project is just plain wrong!
 98 | 
 99 | If you have time to explain to me what is wrong, I would appreciate it if you
100 | [send me an email](mailto:gousiosg@gmail.com) and I will fix
101 | the issue.
102 | 
103 | <button type="button" class="btn btn-info" data-toggle="collapse" data-target="#allrepos">
104 |   Show all repos
105 | </button>
106 | 
107 | <div id="allrepos" class="collapse">
108 | <small>
109 | {% markdown %}
110 | `r projectsmd`
111 | {% endmarkdown %}
112 | </small>
113 | </div>
114 | 
115 | 
116 | <div id="disqus_thread"></div>
117 | 
118 | 
119 | <script type="text/javascript">
120 | var disqus_shortname = 'ghtorrent'; 
121 | 
122 | /* * * DON'T EDIT BELOW THIS LINE * * */
123 | (function() {
124 |  var dsq = document.createElement('script');
125 |  dsq.type = 'text/javascript'; dsq.async = true;
126 |  dsq.src = '//' + disqus_shortname +
127 |  '.disqus.com/embed.js';
128 |  (document.getElementsByTagName('head')[0]
129 |   ||
130 |   document.getElementsByTagName('body')[0]).appendChild(dsq);
131 |  })();
132 | </script>
133 | 


--------------------------------------------------------------------------------
/streaming.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Streaming updates from GHTorrent
  4 | tagline:
  5 | ---
  6 | 
  7 | ## Connection details
  8 | 
  9 | To obtain access, please send us your public key [as described here](services.html).
 10 | 
 11 | ```
 12 | ssh -L 5672:streamer.ghtorrent.org:5672 ghtorrent@streamer.ghtorrent.org
 13 | ```
 14 | 
 15 | This will create a local port 5672 to which you can connect your AMQP client.
 16 | No shell is allocated for security reasons.
 17 | 
 18 | ## Declaring queues
 19 | 
 20 | Our queue server, [RabbitMQ](https://www.rabbitmq.com) implements the
 21 | [AMQP protocol](https://en.wikipedia.org/wiki/Advanced_Message_Queuing_Protocol). Some familiarity with it is necessary
 22 | before using the streaming service. The [RabbitMQ getting started page](https://www.rabbitmq.com/getstarted.html) is
 23 | a very good starting point with lots of examples in many languages.
 24 | 
 25 | The streaming service uses topic exchanges and consequently message-based
 26 | routing (see [here](https://www.rabbitmq.com/tutorials/tutorial-five-python.html) for details). To start receiving messages, a client needs to:
 27 | 
 28 | 1. connect to the server
 29 | 2. declare a queue
 30 | 3. bind the declared queue to the `ght-streams` exchange with a routing key
 31 | 
 32 | The following examples are in Ruby.
 33 | 
 34 | ### Connecting to the server
 35 | 
 36 | Assuming your connection works as described above, you should have port
 37 | 5672 listening on localhost. You should connect and define the `ght-streams`
 38 | exchange (if you define other exchanges, you will receive no messages
 39 | as there is no script posting to them).
 40 | 
 41 | {% highlight ruby%}
 42 | #!/usr/bin/env ruby
 43 | 
 44 | require 'bunny'
 45 | conn = Bunny.new(:host => '127.0.0.1', :port => 5672,
 46 |                  :username => 'streamer', :password => 'streamer')
 47 | conn.start
 48 | ch  = conn.create_channel
 49 | exchange = ch.topic('ght-streams', :durable => true)
 50 | {% endhighlight%}
 51 | 
 52 | ### Declaring a queue
 53 | 
 54 | You can declare as many queues as you want (within reasonable limits). To
 55 | make the queue unique, we ask you to prefix your queue name with your
 56 | username (e.g. `gousiosg_queue`). You should also make your queue
 57 | non persistent, to avoid consuming server resources when your program
 58 | finishes.
 59 | 
 60 | {% highlight ruby%}
 61 | q = ch.queue("gousiosg_queue", :auto_delete => true)
 62 | {% endhighlight%}
 63 | 
 64 | ### Binding queues to routing keys
 65 | 
 66 | All messages posted to `ght-streams` exchange have an attached routing key.
 67 | This allows clients to declare queues that selectively receive only
 68 | the messages they are interested in. The routing key is structured as
 69 | follows:
 70 | 
 71 | {% highlight%}
 72 | prefix.{entity|event}.action
 73 | {% endhighlight%}
 74 | 
 75 | The `prefix` denotes the type of the updated item
 76 | 
 77 | * `evt`: Denotes a GitHub event, as received by GHTorrent
 78 | * `ent`: Denotes an update in a MongoDB collection
 79 | 
 80 | The second part of the key denotes the updated item; its value depends on
 81 | the `prefix`. The permitted values are the following:
 82 | 
 83 | * For `evt` prefixes, it is the name of a [public GitHub event](https://developer.github.com/v3/activity/events/types/) shortened and lower-cased:
 84 | `commitcomment`,
 85 | `create`,
 86 | `delete`,
 87 | `deployment`,
 88 | `deploymentstatus`,
 89 | `download`,
 90 | `follow`,
 91 | `fork`,
 92 | `forkapply`,
 93 | `gist`,
 94 | `gollum`,
 95 | `issuecomment`,
 96 | `issues`,
 97 | `member`,
 98 | `membership`,
 99 | `pagebuild`,
100 | `public`,
101 | `pullrequest`,
102 | `pullrequestreviewcomment`,
103 | `push`,
104 | `release`,
105 | `repository`,
106 | `status`,
107 | `teamadd`,
108 | `watch`
109 | 
110 | * For `ent` prefixes, it is the name of the MongoDB collection that was updated. One of:
111 | `commit_comments`,
112 | `commits`,
113 | `followers`,
114 | `forks`,
115 | `geo_cache`,
116 | `issue_comments`,
117 | `issue_events`,
118 | `issues`,
119 | `org_members`,
120 | `pull_request_comments`,
121 | `pull_requests`,
122 | `repo_collaborators`,
123 | `repo_labels`,
124 | `repos`,
125 | `users`,
126 | `watchers`
127 | 
128 | The third part of the routing key denotes the update action. The allowed
129 | values are (this only applies to `ent` type messages; `evt` type messages
130 | are only marked as `insert`):
131 | 
132 | * `insert`: An insertion of a record to a MongoDB collection
133 | * `delete`: A deletion from a MongoDB record
134 | * `update`: An update to a MongoDB record
135 | 
136 | Let's see some example routing keys:
137 | 
138 | * `ent.repos.insert`: This will retrieve all new inserts to the `repos`
139 | collection
140 | * `evt.fork.*`: This will retrieve all fork events
141 | * `ent.*.update`: This will retrieve all updates on MongoDB collections
142 | * `*.*.insert`: This will retrieve all new events and all MongoDB inserts
143 | 
144 | {% highlight ruby%}
145 | q.bind(exchange, :routing_key => "evt.fork.*")
146 | q.subscribe do |delivery_info, metadata, payload|
147 |   puts "#{delivery_info.routing_key}: #{payload}"
148 | end
149 | {% endhighlight %}
150 | 
151 | ## Things to consider
152 | 
153 | * Queues are configured to be garbage collected when the client that declared them has been disconnected.
154 | * Messages have a pre-configured Time-To-Live equal to 1 minute. If your client
155 | is not fast enough, they will be discarded. For this reason, we recommend
156 | client-side buffering of unprocessed messages.
157 | * All exchanges not named `ght-streams` are deleted every 5 minutes.
158 | * All queues not prefixed with `username_` are deleted every 5 minutes.
159 | 


--------------------------------------------------------------------------------
/pers-data.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Access to personal data
  4 | tagline:
  5 | ---
  6 | 
  7 | ## Accessing personal data
  8 | 
  9 | **Update Jun 2018: GDPR is in effect, which means that until GHTorrent's legal status
 10 | is cleared, we cannot distribute this data anymore.**
 11 | 
 12 | As of Mar 2016, the GHTorrent project does not offer personal data
 13 | (namely, emails and real names) for download. For research purposes,
 14 | you can request access to a file containing a mapping between
 15 | logins and personal data.
 16 | 
 17 | To access the file containing personal data, you will need to [edit this page](https://github.com/ghtorrent/ghtorrent.org/blob/master/pers-data.md) to include the following details.
 18 | When your pull request has been accepted, we will mail you the link
 19 | to the data.
 20 | 
 21 | {%highlight html%}
 22 | <dl>
 23 |   <dt></dt>
 24 |   <dd> Job Title
 25 |   <a href="email">email</a></dd>
 26 | 
 27 |   <dt>Date of request</dt>
 28 |   <dd>The actual date</dd>
 29 | 
 30 |   <dt>Why do you need the personal data?</dt>
 31 |   <dd> Provide an explanation </dd>
 32 | 
 33 | </dl>
 34 | {%endhighlight%}
 35 | 
 36 | ## People with access to personal data
 37 | 
 38 | #### Georgios Gousios
 39 | <dl>
 40 |   <dt>Researcher</dt>
 41 |   <dd>Georgios Gousios, Assistant Prof. Radboud University Nijmegen,
 42 |   <a href="mailto:g.gousios@cs.ru.nl">g.gousios@cs.ru.nl</a> </dd>
 43 | 
 44 |   <dt>Date of request</dt>
 45 |   <dd>Mar 14, 2016</dd>
 46 | 
 47 |   <dt>Intended use</dt>
 48 |   <dd>Maintenance of the GHTorrent internal databases.</dd>
 49 | 
 50 | </dl>
 51 | 
 52 | <dl>
 53 |   <dt>Researcher</dt>
 54 |   <dd>Diomidis Spinellis, Professor, Athens University of Economics and Business, Greece,
 55 |   <a href="mailto:dds@aueb.gr">dds@aueb.gr</a> </dd>
 56 | 
 57 |   <dt>Date of request</dt>
 58 |   <dd>July 1, 2016</dd>
 59 | 
 60 |   <dt>Intended use</dt>
 61 |   <dd>Research regarding commit practices of company employees.  Correlate projects with commits through git blame.</dd>
 62 | 
 63 | </dl>
 64 | 
 65 | <dl>
 66 |   <dt>Researcher</dt>
 67 |   <dd>Tong WANG, Lecturer, University of Edinburgh
 68 |   <a href="mailto:tong.wang@ed.ac.uk">tong.wang@ed.ac.uk</a> </dd>
 69 | 
 70 |   <dt>Date of request</dt>
 71 |   <dd>Aug. 30, 2016</dd>
 72 | 
 73 |   <dt>Intended use</dt>
 74 |   <dd>Research regarding Open Source software network, especially focus on the interaction between programming habitants and company employees</dd>
 75 | 
 76 | </dl>
 77 | 
 78 | <dl>
 79 |   <dt>Researcher</dt>
 80 |   <dd>Chris Chabot, Semmle.com
 81 |   <a href="mailto:chabotc@semmle.com">chabotc@semmle.com</a> </dd>
 82 | 
 83 |   <dt>Date of request</dt>
 84 |   <dd>Dec. 11, 2016</dd>
 85 | 
 86 |   <dt>Intended use</dt>
 87 |   <dd>Normalizing and de-duplicating of author contribution data on our free for open source lgtm.com project, which provides source code analysis and fault detection, as well as showing coding velocity and quality per author and organization</dd>
 88 | 
 89 | </dl>
 90 | 
 91 | <dl>
 92 |   <dt>Undergraduate</dt>
 93 |   <dd>Davide Primiceri, Student Computer Science, University of Bari, Italy.
 94 |   <a href="mailto:d.primiceri@studenti.uniba.it">d.primiceri@studenti.uniba.it</a></dd>
 95 | 
 96 |   <dt>21 April, 2017</dt>
 97 |   <dd></dd>
 98 | 
 99 |   <dt>Needed for Degree Thesis</dt>
100 |   <dd>I am doing my degree thesis on the topic 'Evaluating the effects of multitasking among the open source projects of GitHub'. In order to do my analysis work, i need to combine GitHub data with Travis data. Thus i require the name and other login details of all users. Kindly share the personal data with me.
101 |   </dd>
102 | 
103 | </dl>
104 | 
105 | <dl>
106 |   <dt>Researcher</dt>
107 |   <dd>Bogdan Vasilescu, Assistant Professor, School of Computer Science, Carnegie Mellon University
108 |   <a href="mailto:vasilescu@cmu.edu">vasilescu@cmu.edu</a> </dd>
109 | 
110 |   <dt>Date of request</dt>
111 |   <dd>June 1, 2017</dd>
112 | 
113 |   <dt>Intended use</dt>
114 |   <dd>Research regarding gender diversity in GitHub teams.</dd>
115 | 
116 | </dl>
117 | 
118 | <dl>
119 |   <dt>Graduate Student</dt>
120 |   <dd>Farhana Sarker, Computer Science Graduate Student, College of Engineering, University of California Davis
121 |   <a href="mailto:fasarker@ucdavis.edu">fasarker@ucdavis.edu</a> </dd>
122 | 
123 |   <dt>Date of request</dt>
124 |   <dd>September 16, 2017</dd>
125 | 
126 |   <dt>Intended use</dt>
127 |   <dd>Research regarding multitasking in GitHub teams.</dd>
128 | 
129 | </dl>
130 | 
131 | <dl>
132 |   <dt>Researcher</dt>
133 |   <dd>Guanliang Chen, PhD candidate, Web Information Systems group, EEMCS, TU Delft <a href="mailto:guanliang.chen@tudelft.nl">guanliang.chen@tudelft.nl</a> </dd>
134 | 
135 |   <dt>Date of request</dt>
136 |   <dd>Oct 30, 2017</dd>
137 | 
138 |   <dt>Intended use</dt>
139 |   <dd>To match learners in edX and investigate to what extend learners from programming MOOCs applied the knowledge into practice.</dd>
140 | </dl>
141 | 
142 | <dl>
143 |   <dt>Postdoctoral Researcher</dt>
144 |   <dd>Ayushi Rastogi, UC Irvine
145 |   <a href="mailto:ayushir@ics.uci.edu">ayushir@ics.uci.edu</a></dd>
146 | 
147 |   <dt>Date of request</dt>
148 |   <dd>November 30, 2017</dd>
149 | 
150 |   <dt>Why do you need the personal data?</dt>
151 |   <dd> My research focus is empirical software engineering, with a particular interest in human traits, team performance, and collaboration patterns. </dd>
152 | 
153 | </dl>
154 | 
155 | <dl>
156 |   <dt>PhD Student</dt>
157 |   <dd>Harsh Ketkar, University of Michigan
158 |   <a href="mailto:hketkar@umich.edu">hketkar@umich.edu</a></dd>
159 | 
160 |   <dt>Date of request</dt>
161 |   <dd>January 18, 2018</dd>
162 | 
163 |   <dt>Why do you need the personal data?</dt>
164 |   <dd> I am researching how contribution patterns of individual developers change over time and across platforms.</dd>
165 | 
166 | </dl>
167 | 
168 | <dl>
169 |   <dt>Researcher</dt>
170 |   <dd>Emerson Murphy-Hill, North Carolina State University, 
171 |   <a href="mailto:emerson@csc.ncsu.edu">emerson@csc.ncsu.edu</a></dd>
172 | 
173 |   <dt>Date of request</dt>
174 |   <dd>April 24, 2018</dd>
175 | 
176 |   <dt>Why do you need the personal data?</dt>
177 |   <dd>I will use email addresses to cross-reference GitHub accounts with social media accounts.</dd>
178 | 
179 | </dl>
180 | 
181 | ## Disclaimer
182 | 
183 | The data is provided as is with no further guarantees of data quality or law
184 | compliance. Redistribution is *strictly not* allowed! The GHTorrent project is
185 | not responsible for any illegal uses of the provided data.
186 | 


--------------------------------------------------------------------------------
/halloffame.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Hall of Fame
  4 | tagline:
  5 | ---
  6 | 
  7 | ### Donations
  8 | 
  9 | * [17 Nov 2015] Microsoft donated $98,000 in Azure credits
 10 | * [30 Oct 2016] Google donated $1000 in Google Cloud credits
 11 | 
 12 | The project would also like to thank the anonymous donors for their
 13 | generosity. GHTorrent will become a better project thanks to you!
 14 | 
 15 | ### Papers using GHTorrent
 16 | 
 17 | This list is a subset of researchers who have used GHTorrent for research or
 18 | teaching. If you are a user of the dataset, please consider adding your details.
 19 | You can do it using the following simple steps:
 20 | 
 21 | * Add information about your organization and yourself to [this file on
 22 | Github](https://github.com/gousiosg/ghtorrent.org/blob/master/halloffame.md). You should describe how you used GHTorrent in a few lines. It is OK to include links. Please ensure that institution names are listed in alphabetic order.
 23 | 
 24 | * If you are interested to link your publications referencing GHTorrent, you should include a Bibtex record in [this file](https://github.com/gousiosg/ghtorrent.org/blob/master/_bibliography/references.bib) on Github. You can then reference them in [this file](https://github.com/gousiosg/ghtorrent.org/blob/master/halloffame.md).
 25 | 
 26 | #### [Inria/Mines Nantes/LINA/AtlanMod](http://www.emn.fr/z-info/atlanmod/index.php/Main_Page)
 27 | * [Jordi Cabot](http://modeling-languages.com): Research on usage of issue labels in GitHub.
 28 |   1. {% reference cabotSaner2015 %}
 29 |   2. {% reference canovasSaner2015 %}
 30 | 
 31 | #### [NUDT/Trustie](http://www.trustie.com/)
 32 | * [Yue Yu](http://yuyue.github.io): Research on reviewer recommendation, and latency of pull requests. Used GHTorrent to extract our dataset.
 33 |   1. {% reference YuRR14 %}
 34 |   2. {% reference yue2015wait %}
 35 | 
 36 | #### [Radboud University Nijmegen/DS](http://www.ru.nl/ds/)
 37 | * [Georgios Gousios](http://www.gousios.gr): Maintenance, qualitative research on pull requests, [pull request prioritization](http://ghtorrent.org/prioritizer), developer profiles
 38 |   1. {% reference GZSD15 %}
 39 |   2. {% reference HG15 %}
 40 |   3. {% reference VGZ15 %}
 41 | 
 42 | #### [TU Delft/SERG](http://swerl.tudelft.nl/bin/view/Main/WebHome)
 43 | * [Georgios Gousios](http://www.gousios.gr): Initial design and implementation. Project hosting. Lean GHTorrent. Research on pull requests. Project openness reports.
 44 |   1. {% reference GS12 %}
 45 |   2. {% reference G13 %}
 46 |   3. {% reference GPD14 %}
 47 |   4. {% reference GZ14 %}
 48 |   5. {% reference GVSZ14 %}
 49 | 
 50 | #### [TU Eindhoven/SET](http://www.tue.nl/en/university/departments/mathematics-and-computer-science/research/research-programs-computer-science/section-model-driven-software-engineering-mdse/set/)
 51 | * [Bogdan Vasilescu](http://bvasiles.github.io/): Integration of GitHub and Stack Overflow data. Research on productivity of GitHub developers. Sentiment analysis of GitHub discussions. Lean GHTorrent. Continuous integration in GitHub.
 52 | * [Alexander Serebrenik](http://www.win.tue.nl/~aserebre/): Research on productivity of GitHub developers. Sentiment analysis of GitHub discussions. Research on continuous integration in GitHub.
 53 |   1. {% reference VSF12 %}
 54 |   2. {% reference GVSZ14 %}
 55 |   3. {% reference PVS14 %}
 56 |   4. {% reference vasilescu2014ci %}
 57 | 
 58 | #### [University of California, Davis/DECAL](http://decallab.cs.ucdavis.edu)
 59 | * [Bogdan Vasilescu](http://bvasiles.github.io/): Research on effects of diversity in GitHub teams.
 60 |   1. {% reference vasilescu2015gender %}
 61 |   2. {% reference vasilescu2015chase %}
 62 | 
 63 | #### [University of Victoria/SEGAL](http://thesegalgroup.org)
 64 | * [Kelly Blincoe](http://thesegalgroup.org/people/kelly-blincoe): Research on Implicit Coordination and its impact on productivity.
 65 | * [Eirini Kalliamvakou](http://thesegalgroup.org/people/eirini-kalliamvakou): Research on collaborative development using decentralized workflows and GitHub. Used GHTorrent to extract information about pull requests for potential mining perils.
 66 |   1. {% reference KGBSGD14 %}
 67 | 
 68 | #### [University of Trier/SE](http://st.uni-trier.de/)
 69 | * [Sebastian Baltes](http://sbaltes.com/): Research on the usage of Stack Overflow code snippets in GitHub projects, its licensing implications, and developers' awareness.
 70 |   1. {% reference BaltesDiehl2018 %}
 71 | 
 72 | ### API keys contributors
 73 | 
 74 | The following people's contributions of GitHub OAuth API keys have allowed
 75 | the data collection process to keep up with GitHub's 10x growth since the
 76 | GHTorrent project started. If you would like to contribute an API key,
 77 | please follow the process specified [here](http://ghtorrent.org/services.html).
 78 | 
 79 | [Bram Adams](http://mcis.polymtl.ca/bram.html),
 80 | [Maryi Arciniegas Méndez](http://thechiselgroup.org/members/),
 81 | [Syed Arefinul Haque](https://uiu-bd.academia.edu/SyedArefinulHaque),
 82 | [Efthimia Aivaloglou](https://www.linkedin.com/pub/efthimia-aivaloglou/4/244/966),
 83 | [Alberto Bacchelli](http://sback.it),
 84 | [Moritz Beller](http://www.st.ewi.tudelft.nl/~mbeller/),
 85 | [Matthieu Bizien](https://www.linkedin.com/in/matthieubizien/en),
 86 | Erik Bowers,
 87 | [Frederic Gingras](http://fredericgingras.ca),
 88 | [Roberta de Souza Coelho](https://www.dimap.ufrn.br/~roberta/),
 89 | [Victor Costan](http://www.costan.us),
 90 | [Ayushi Dalmia](https://researchweb.iiit.ac.in/~ayushi.dalmia/),
 91 | Jos Demmers,
 92 | [Arie van Deursen](http://www.st.ewi.tudelft.nl/~arie/),
 93 | [Neil Ernst](http://neilernst.net),
 94 | [Joe Fleming](http://joefleming.net),
 95 | [Georgios Gousios](http://gousios.gr),
 96 | [Samarendra M Hedaoo](http://fortyplustwo.net),
 97 | [Mark Hills](http://www.cs.ecu.edu/hillsma/),
 98 | [Arun Kalyanasundaram](http://www.cs.cmu.edu/~arunkaly/),
 99 | [Syafiq Kamarul Azman](https://www.kaggle.com/syaffers),
100 | Lindsey Lanier,
101 | Pablo Loyola,
102 | Yao Lu,
103 | [Mahdi Moqri](http://www.moqri.com),
104 | Graeme Nathan,
105 | Matteo Orrù,
106 | [Gustavo Pinto](http://gustavopinto.org),
107 | [Dominic Safaric](https://github.com/dsafaric),
108 | Jasmine Sandhu,
109 | [Alexander Serebrenik](http://www.win.tue.nl/~aserebre/),
110 | [Diomidis Spinellis](http://www.dmst.aueb.gr/dds/),
111 | Simon Symeonidis,
112 | [Chris Thompson](http://www.cs.berkeley.edu/~cthompson/),
113 | [Peter Tröger](http://www.troeger.eu),
114 | [Bogdan Vasilescu](http://bvasiles.github.io),
115 | Marko Vit,
116 | [Meike Wiemann](https://twitter.com/weidenfreak),
117 | [Yue Yu](http://fisher.trustie.net/),
118 | [Alexey Zagalsky](http://alexeyza.com),
119 | [Andy Zaidman](http://www.st.ewi.tudelft.nl/~zaidman/),
120 | [Nosheen Zaza](http://www.people.usi.ch/zazan/)
121 | 


--------------------------------------------------------------------------------
/_layouts/default.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en" itemscope itemtype="http://schema.org/Article">
  3 |   <head>
  4 |     <meta charset="utf-8">
  5 |     <title>{{ page.title }}</title>
  6 |     <script type="text/javascript" src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
  7 |     <script src="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.0/js/bootstrap.min.js"></script>
  8 |     {% if page.description %}<meta name="description" content="{{ page.description }}">{% endif %}
  9 |     <meta name="author" content="{{ site.author.name }}">
 10 |     <link href="//netdna.bootstrapcdn.com/bootswatch/2.3.0/cosmo/bootstrap.min.css" rel="stylesheet" type="text/css" media="all">
 11 |     <link href="//netdna.bootstrapcdn.com/font-awesome/4.1.0/css/font-awesome.min.css" rel="stylesheet">
 12 | 
 13 |     <link href="syntax.css" type="text/css" rel="stylesheet" media="all">
 14 |     <link href="local.css" type="text/css" rel="stylesheet" media="all">
 15 |     <link rel="shortcut icon" href="https://github.com/favicon.ico">
 16 |   </head>
 17 |   <body>
 18 | 
 19 |     <div class="navbar">
 20 |       <div class="navbar-inner">
 21 |         <div class="container">
 22 |           <a class="brand" href="/">{{ site.title }}</a>
 23 |           <ul class="nav" role="navigation">
 24 |             <li><a href="/docs.html">Docs</a></li>
 25 |             <li class="dropdown">
 26 |             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Downloads<b class="caret"></b></a>
 27 |             <ul class="dropdown-menu" role="menu" aria-labelledby="drop1">
 28 |               <li><a href="/downloads.html">Downloads</a></li>
 29 |               <li><a href="https://github.com/gousiosg/github-mirror">Source code</a></li>
 30 |             </ul>
 31 |             </li>
 32 |             <li class="dropdown">
 33 |             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Live<b class="caret"></b></a>
 34 |             <ul class="dropdown-menu" role="menu" aria-labelledby="drop1">
 35 |               <li><a href="/services.html">Overview</a></li>
 36 |               <li><a href="/dblite">MySQL web</a></li>
 37 |               <li><a href="/mysql.html">MySQL</a></li>
 38 |               <li><a href="/raw.html">MongoDB</a></li>
 39 |               <li><a href="/streaming.html">Streaming</a></li>
 40 |               <li><a href="/gcloud.html">Google Cloud</a></li>
 41 | 
 42 |             </ul></li>
 43 | 
 44 |             <li class="dropdown">
 45 |             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Visualizations<b class="caret"></b></a>
 46 |             <ul class="dropdown-menu" role="menu" aria-labelledby="drop1">
 47 |               <li><a href="http://langpop.corger.nl/">Language Popularity</a></li>
 48 |             </ul></li>
 49 |             <li class="dropdown">
 50 |             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Reports<b class="caret"></b></a>
 51 |             <ul class="dropdown-menu" role="menu" aria-labelledby="drop1">
 52 |               <li><a href="/pullreq-perf/">Pull Request
 53 |                 performance</a></li>
 54 |             </ul></li>
 55 |             <li class="dropdown">
 56 |             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Datasets<b class="caret"></b></a>
 57 |             <ul class="dropdown-menu" role="menu" aria-labelledby="drop1">
 58 |               <li><a href="/msr14.html">MSR 2014</a></li>
 59 |               <li><a href="/vissoft14.html">VISSOFT 2014</a></li>
 60 |               <li><a
 61 |                 href="https://github.com/gousiosg/pullreqs">Pull requests</a></li>
 62 |               <li><a
 63 |                 href="http://travistorrent.testroots.org">TravisTorrent</a></li>
 64 |             </ul></li>
 65 |             <li><a href="/basedupon.html">Based Upon</a></li>
 66 |             <li><a href="/halloffame.html">Hall of Fame</a></li>
 67 |             <li><a href="/faq.html">FAQ</a></li>
 68 |           </ul>
 69 |         </div>
 70 |       </div>
 71 |     </div>
 72 | 
 73 |     <div class="container-fluid">
 74 |       <div class="row-fluid">
 75 |         <div class="span2">
 76 |           <div class="page-header">
 77 |             <iframe src="http://ghbtns.com/github-btn.html?user=gousiosg&repo=github-mirror&type=watch&count=true" allowtransparency="true" frameborder="0" scrolling="0" width="110px" height="20px"></iframe><br/>
 78 |             <a href="https://twitter.com/share" class="twitter-share-button">Tweet</a> <br/>
 79 |             <div class="g-plusone" data-size="medium"></div><br/>
 80 |             <a href="http://news.ycombinator.com/submit" class="hn-share-button">Vote on HN</a><br/>
 81 |             <script type="text/javascript" src="http://en.reddit.com/buttonlite.js?i=1"></script>
 82 |           </div>
 83 |           <div>
 84 |             Sponsors
 85 |             <br/>
 86 |             <br/>
 87 |             <img src="/files/mslogo.png" class="img-responsive"
 88 |             alt="Microsoft logo">
 89 |             <br/>
 90 |             <br/>
 91 |             <img src="/files/rulogo.gif" class="img-responsive"
 92 |             alt="Radboud University logo" width="100">
 93 |             <br/>
 94 |             <br/>
 95 |             <img src="/files/tudelftlogo.png" class="img-responsive"
 96 |             alt="TU Delft logo" width="100">
 97 |           </div>
 98 |           <div>
 99 |             <br/>
100 |             Become a sponsor
101 | <form action="https://www.paypal.com/cgi-bin/webscr" method="post" target="_top">
102 | <input type="hidden" name="cmd" value="_s-xclick" />
103 | <input type="hidden" name="hosted_button_id" value="PNTMMBP9UWUDN" />
104 | <input type="image" src="https://www.paypal.com/en_US/i/btn/btn_donate_LG.gif" border="0" name="submit" title="PayPal - The safer, easier way to pay online!" alt="Donate" />
105 | <img alt="" border="0" src="https://www.paypal.com/en_US/i/scr/pixel.gif" width="1" height="1" />
106 | </form>
107 |           </div>
108 |         </div>
109 |         <div class="span10">
110 |           <div class="content">
111 |             {{ content }}
112 |           </div>
113 |         </div>
114 |       </div>
115 |     </div>
116 |     <script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0];if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src="//platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
117 |     <!-- Place this tag after the last +1 button tag. -->
118 |     <script type="text/javascript">
119 |       (function() {
120 |        var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true;
121 |        po.src = 'https://apis.google.com/js/plusone.js';
122 |        var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s);
123 |        })();
124 |      </script>
125 |      <script>
126 |        (function(d, t) {
127 |          var g = d.createElement(t),
128 |          s = d.getElementsByTagName(t)[0];
129 |          g.src = '//hnbutton.appspot.com/static/hn.min.js';
130 |          s.parentNode.insertBefore(g, s);
131 |        }(document, 'script'));
132 |      </script>
133 | 
134 |      <script type="text/javascript">
135 | 
136 |        var _gaq = _gaq || [];
137 |        _gaq.push(['_setAccount', 'UA-38537159-1']);
138 |        _gaq.push(['_setDomainName', 'ghtorrent.org']);
139 |        _gaq.push(['_trackPageview']);
140 | 
141 |        (function() {
142 |         var ga = document.createElement('script'); ga.type =
143 |         'text/javascript'; ga.async = true;
144 |         ga.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + 'stats.g.doubleclick.net/dc.js';
145 |         var s = document.getElementsByTagName('script')[0];
146 |         s.parentNode.insertBefore(ga, s);
147 |         })();
148 | 
149 |       </script>
150 |     </body>
151 |   </html>
152 | 


--------------------------------------------------------------------------------
/ght-ubuntu.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: GHTorrent optimized on Ubuntu 10.10
  4 | tagline:
  5 | ---
  6 | 
  7 | This is a from scratch installation script for GHTorrent running on Ubuntu
  8 | 10.10. It has been tuned for cloud Ubuntu installations (e.g. Azure). This
  9 | setup will handle installations in the order of tens of thousands of repositories
 10 | on a D3 Azure VM (2 CPUs, 7GB RAM).
 11 | 
 12 | ## Install essentials
 13 | 
 14 | {% highlight bash %}
 15 | sudo apt-get install -y git ntp mdadm lvm2 libssl-dev parallel
 16 | {% endhighlight %}
 17 | 
 18 | ## Configure RAID
 19 | 
 20 | {% highlight bash %}
 21 | sudo bash
 22 | # partition the devices
 23 | (echo -e "o\nn\np\n1\n\n\nt\nfd\nw" | fdisk /dev/sdc ) || exit 1
 24 | (echo -e "o\nn\np\n1\n\n\nt\nfd\nw" | fdisk /dev/sdd ) || exit 1
 25 | 
 26 | # start the array and write out its config
 27 | mdadm --create /dev/md127 --level 0 --raid-devices 2 /dev/sdc1 /dev/sdd1
 28 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf
 29 | {% endhighlight %}
 30 | 
 31 | ## Configure filesystem on RAID
 32 | 
 33 | {% highlight bash %}
 34 | sudo bash
 35 | mkfs -t ext4 /dev/md127
 36 | id=`blkid|grep md127|cut -f2 -d'"'`
 37 | mkdir /data
 38 | echo "UUID=$id /data ext4 defaults,noatime,nobootwait,optional 0 0" >> /etc/fstab
 39 | {% endhighlight %}
 40 | 
 41 | ## Install required Ruby
 42 | 
 43 | {% highlight bash %}
 44 | sudo apt-get install ruby ruby2.2 ruby2.2-dev build-essential
 45 | 
 46 | sudo update-alternatives --install /usr/bin/ruby ruby /usr/bin/ruby2.2 400 \
 47 |  --slave /usr/bin/rake rake /usr/bin/rake2.2 \
 48 |  --slave /usr/bin/ri ri /usr/bin/ri2.2 \
 49 |  --slave /usr/bin/rdoc rdoc /usr/bin/rdoc2.2 \
 50 |  --slave /usr/bin/gem gem /usr/bin/gem2.2 \
 51 |  --slave /usr/bin/irb irb /usr/bin/irb2.2 \
 52 |  --slave /usr/share/man/man1/ruby.1.gz ruby.1.gz /usr/share/man/man1/ruby2.2.1.gz \
 53 |  --slave /usr/share/man/man1/rake.1.gz rake.1.gz /usr/share/man/man1/rake2.2.1.gz \
 54 |  --slave /usr/share/man/man1/ri.1.gz ri.1.gz /usr/share/man/man1/ri2.2.1.gz \
 55 |  --slave /usr/share/man/man1/rdoc.1.gz rdoc.1.gz /usr/share/man/man1/rdoc2.2.1.gz \
 56 |  --slave /usr/share/man/man1/gem.1.gz gem.1.gz /usr/share/man/man1/gem2.2.1.gz \
 57 |  --slave /usr/share/man/man1/irb.1.gz irb.1.gz /usr/share/man/man1/irb2.2.1.gz
 58 | {% endhighlight %}
 59 | 
 60 | ## Install MySQL (its MariaDB variant)
 61 | 
 62 | We are currently using MariaDB as it can handle complex queries better than
 63 | stock MySQL 5.6. If you prefer MySQL, skip the MariaDB installation script
 64 | below.
 65 | 
 66 | You can set any password for the root user in MySQL.
 67 | 
 68 | {% highlight bash %}
 69 | sudo apt-get install -y software-properties-common
 70 | sudo apt-key adv --recv-keys --keyserver hkp://keyserver.ubuntu.com:80 0xcbcb082a1bb943db
 71 | sudo add-apt-repository 'deb http://mariadb.mirror.triple-it.nl//repo/10.1/ubuntu wily main'
 72 | sudo apt-get update
 73 | sudo apt-get install -y mariadb-server percona-toolkit libmariadbclient-dev
 74 | {% endhighlight %}
 75 | 
 76 | Then, move data files to the RAID array.
 77 | 
 78 | {% highlight bash %}
 79 | sudo service mysql stop
 80 | sudo mkdir /data/mysql
 81 | sudo chown mysql:mysql /data/mysql
 82 | sudo mkdir /mnt/mysql
 83 | sudo chown mysql:mysql /mnt/mysql
 84 | sudo rsync -av /var/lib/mysql/ /data/mysql
 85 | sudo service mysql start
 86 | {% endhighlight %}
 87 | 
 88 | ## Configure MySQL/MariaDB
 89 | 
 90 | {% highlight bash %}
 91 | sudo service mysql stop
 92 | sudo vi /etc/mysql/my.cnf
 93 | 
 94 | ### change the following
 95 | datadir = /data/mysql
 96 | tmpdir = /mnt/mysql
 97 | innodb_buffer_pool_size=4G
 98 | ###
 99 | 
100 | sudo service mysql start
101 | {% endhighlight %}
102 | 
103 | ## Install MongoDB
104 | 
105 | We install the latest MongoDB (3.0.x) from MongoDB's central repo and use
106 | WiredTiger as the storage engine due to its huge space savings.
107 | 
108 | {% highlight bash %}
109 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
110 | echo "deb http://repo.mongodb.org/apt/ubuntu trusty/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
111 | sudo apt-get update
112 | sudo apt-get install -y mongodb-org
113 | 
114 | cat << EOF | sudo tee /lib/systemd/system/mongodb.service
115 | [Unit]
116 | Description=High-performance, schema-free document-oriented database
117 | After=network.target
118 | 
119 | [Service]
120 | User=mongodb
121 | ExecStart=/usr/bin/mongod --quiet --config /etc/mongod.conf
122 | 
123 | [Install]
124 | WantedBy=multi-user.target
125 | EOF
126 | 
127 | sudo systemctl daemon-reload
128 | sudo service mongodb start
129 | 
130 | # Stop it to move dirs to the right places
131 | sudo service mongodb stop
132 | sudo bash
133 | 
134 | mkdir /data/mongodb
135 | chown mongodb:mongodb /data/mongodb
136 | 
137 | cat << EOF > /etc/mongod.conf
138 | storage:
139 |   dbPath: "/data/mongodb"
140 |   engine: "wiredTiger"
141 |   wiredTiger:
142 |     collectionConfig:
143 |       blockCompressor: snappy
144 |     engineConfig:
145 |       cacheSizeGB: 4 # Configure this if you have more RAM
146 | systemLog:
147 |   destination: file
148 |   path: "/var/log/mongodb/mongodb.log"
149 |   logAppend: true
150 |   timeStampFormat: iso8601-utc
151 | 
152 | net:
153 |   bindIp: "0.0.0.0"
154 |   port: 27017
155 | EOF
156 | 
157 | service mongodb start
158 | {% endhighlight %}
159 | 
160 | ### Install and configure RabbitMQ
161 | 
162 | {% highlight bash %}
163 | sudo apt-get install rabbitmq-server
164 | sudo rabbitmqctl add_user ghtorrent ghtorrent
165 | sudo rabbitmqctl set_permissions -p / ghtorrent ".*" ".*" ".*"
166 | sudo rabbitmq-plugins enable rabbitmq_management
167 | sudo rabbitmqctl set_user_tags ghtorrent administrator
168 | {% endhighlight %}
169 | 
170 | ### Install and configure GHTorrent
171 | 
172 | {% highlight bash %}
173 | cd $HOME
174 | git clone https://github.com/gousiosg/github-mirror.git
175 | cd github-mirror
176 | 
177 | sudo gem install bundler
178 | sudo bundle install
179 | sudo gem install mysql2
180 | 
181 | cp config.yaml.tmpl config.yaml
182 | vi config.yaml
183 | {% endhighlight %}
184 | 
185 | Use the following contents for the config.yaml file
186 | 
187 | {% highlight yaml %}
188 | amqp:
189 |   host:   127.0.0.1 # Queue's IP address
190 |   port:   5672
191 |   username: ghtorrent # Username to connect to the queue
192 |   password: ghtorrent   # password
193 |   exchange: ghtorrent
194 |   prefetch: 1
195 | 
196 | sql:
197 |   # Configuration URL for the SQL database subsystem.
198 |   # Examples:
199 |   # - MySQL:     mysql2://user:password@host/github
200 |   # - Postgres:  postgres://user:password@host/github
201 |   #
202 |   # On JRuby, you can use the JDBC-mysql driver that comes with JRuby
203 |   #     jdbc:mysql://localhost/github?user=github&password=github
204 |   #
205 |   # see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html
206 |   # for details
207 |   #url: sqlite://github.db
208 |   url: mysql2://ghtorrent:ghtorrent@localhost/ghtorrent
209 | 
210 | mirror:
211 |   urlbase: "https://api.github.com/"
212 |   persister: mongo #or noop
213 |   # How many pages of historical content to retrieve when doing multi-page
214 |   # API calls.
215 |   history_pages_back: 1000
216 |   # On a machine with multiple IP addresses, select the one to send the
217 |   # HTTP requests from.
218 |   # attach_ip: 0.0.0.0
219 |   # Use your token here
220 |   token:
221 |   # Number of reqs/hour to do with the provided key
222 |   req_limit: 4990
223 |   # User agent to use for requests. You must use a unique name per client program
224 |   user_agent: ghtorrent
225 |   # Time to wait between geo location API requests
226 |   geoloc_wait: 2
227 | 
228 | mongo:
229 |   host: 127.0.0.1      # Mongo's IP addr
230 |   port: 27017          # Mongo's port
231 |   db: ghtorrent        # DB name to store commits to
232 |   #username: github     # User name to connect to Mongo
233 |   #password: github     # Password for mongo
234 | 
235 | logging:
236 |   # A unique string to appear in all messages produced by the invoking program.
237 |   uniq: "ghtorrent"
238 |   # debug < info < warn < error, for decreasing log output
239 |   level: "info"
240 |   # stdout or stderr to log to system streams. A file name to log to this file.
241 |   file: "stdout"
242 | {% endhighlight %}
243 | 
244 | GHTorrent is now ready to run. Self-apply to begin with:
245 | 
246 | {% highlight bash %}
247 | cd $HOME/github-mirror
248 | ruby -Ilib bin/ght-retrieve-repo gousiosg github-mirror
249 | {% endhighlight %}
250 | 
251 | 
252 | 


--------------------------------------------------------------------------------
/_bibliography/references.bib:
--------------------------------------------------------------------------------
  1 | @inproceedings{GPD14,
  2 |   author = {Gousios, Georgios and Pinzger, Martin and Deursen, Arie van}, 
  3 |   title = {An Exploratory Study of the Pull-based Software Development Model},
  4 |   booktitle = {Proceedings of the 36th International Conference on Software Engineering},
  5 |   Year = {2014},
  6 |   series = {ICSE},
  7 |   year = {2014},
  8 |   isbn = {978-1-4503-2756-5},
  9 |   location = {Hyderabad, India},
 10 |   pages = {345--355},
 11 |   numpages = {11},
 12 |   doi = {10.1145/2568225.2568260},
 13 |   acmid = {2568260},
 14 |   publisher = {ACM},
 15 |   address = {New York, NY, USA},
 16 |   url = {http://www.gousios.gr/bibliography/GPD14.html},
 17 | }
 18 | 
 19 | @inproceedings{G13,
 20 |   Author = {Georgios Gousios},
 21 |   Title = {The {GHTorrent} dataset and tool suite},
 22 |   Year = 2013,
 23 |   Month = May,
 24 |   Booktitle = {Proceedings of the 10th Working Conference on Mining Software Repositories},
 25 |   series={MSR},
 26 |   pages={233--236},
 27 |   Location = {San Francisco, CA},
 28 |   url = {http://www.gousios.gr/bibliography/G13.html},
 29 |   award = {MSR2013: Best data showcase paper}
 30 | }
 31 | 
 32 | @inproceedings{GS12,
 33 |   Author = {Georgios Gousios and Diomidis Spinellis},
 34 |   Booktitle = {Proceedings of the 9th Working Conference on Mining  Software Repositories},
 35 |   series={MSR},
 36 |   Location = {Zurich, Switzerland},
 37 |   Pages = {12--21},
 38 |   Publisher = {IEEE},
 39 |   Title = { {GHTorrent}: {GitHub}'s Data from a Firehose},
 40 |   Year = 2012,
 41 |   doi = {10.1109/MSR.2012.6224294},
 42 |   ISSN = {2160-1852},
 43 |   url = {http://www.gousios.gr/bibliography/GS12.html}
 44 | }
 45 | 
 46 | 
 47 | @inproceedings{GZ14,
 48 |   author = {Gousios, Georgios and Zaidman, Andy},
 49 |   title = {A Dataset for Pull-based Development Research},
 50 |   booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories},
 51 |   series = {MSR},
 52 |   year = {2014},
 53 |   isbn = {978-1-4503-2863-0},
 54 |   location = {Hyderabad, India},
 55 |   pages = {368--371},
 56 |   numpages = {4},
 57 |   doi = {10.1145/2597073.2597122},
 58 |   acmid = {2597122},
 59 |   publisher = {ACM},
 60 |   address = {New York, NY, USA},
 61 |   url = {http://www.gousios.gr/bibliography/GZ14.html},
 62 |   note = {MSR2014: Best data showcase paper},
 63 | }
 64 | 
 65 | @inproceedings{GVSZ14,
 66 |   author = {Gousios, Georgios and Vasilescu, Bogdan and Serebrenik, Alexander and Zaidman, Andy},
 67 |   title = {Lean GHTorrent: GitHub Data on Demand},
 68 |   booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories},
 69 |   series = {MSR},
 70 |   year = {2014},
 71 |   isbn = {978-1-4503-2863-0},
 72 |   location = {Hyderabad, India},
 73 |   pages = {384--387},
 74 |   numpages = {4},
 75 |   doi = {10.1145/2597073.2597126},
 76 |   acmid = {2597126},
 77 |   publisher = {ACM},
 78 |   address = {New York, NY, USA},
 79 |   keywords = {GitHub, data on demand, dataset},
 80 |   url = {http://www.gousios.gr/bibliography/GVSZ14.html}
 81 | }
 82 | 
 83 | @inproceedings{VSF12,
 84 |   author = {Vasilescu, Bogdan and Filkov, Vladimir and Serebrenik, Alexander},
 85 |   title = {Stack Overflow and GitHub: Associations between software development and crowdsourced knowledge},
 86 |   booktitle = {Proceedings of the 2013 ASE/IEEE International Conference on Social Computing},
 87 |   series = {SocialCom},
 88 |   publisher = {IEEE},
 89 |   year = {2013},
 90 |   pages = {188--195},
 91 |   doi = {10.1109/SocialCom.2013.35}
 92 | }
 93 | 
 94 | @inproceedings{PVS14,
 95 |   title={Security and Emotion: Sentiment Analysis of Security Discussions on {GitHub}},
 96 |   author={Pletea, Daniel and Vasilescu, Bogdan and Serebrenik, Alexander},
 97 |   booktitle={Proceedings of the 11th Working Conference on Mining Software Repositories},
 98 |   series={MSR},
 99 |   year={2014},
100 |   pages={384--387},
101 |   Location={Hyderabad, India},
102 |   organization={ACM}
103 | }
104 | 
105 | @inproceedings{KGBSGD14,
106 |   title={The Promises and Perils of Mining {GitHub}},
107 |   author={Kalliamvakou, Eirini and Gousios, Georgios and Blincoe, Kelly and Singer, Leif and German, Daniel M. and Damian, Daniela},
108 |   booktitle={Proceedings of the 11th Working Conference on Mining Software Repositories},
109 |   series={MSR},
110 |   year={2014},
111 |   pages={92--101},
112 |   Location={Hyderabad, India},
113 |   organization={ACM}
114 | }
115 | 
116 | @inproceedings{YuRR14,
117 |   author={Yue Yu and Huaimin Wang and Gang Yin and Ling, C.X.},
118 |   booktitle={Proceedings of the 2014 IEEE International Conference on Software Maintenance and Evolution},
119 |   series={ICSME},
120 |   title={Reviewer Recommender of Pull-Requests in {GitHub}},
121 |   year={2014},
122 |   pages={609--612},
123 |   doi={10.1109/ICSME.2014.107},
124 |   ISSN={1063-6773},
125 |   publisher = {IEEE},
126 | }
127 | 
128 | @inproceedings{yue2015wait,
129 |   author = {Yu, Yue and Wang, Huaimin and Filkov, Vladimir and Devanbu, Premkumar and Vasilescu, Bogdan},
130 |   title = {Wait For It: Determinants of Pull Request Evaluation Latency on {GitHub}},
131 |   booktitle = {12th Working Conference on Mining Software Repositories},
132 |   year = {2015},
133 |   series = {MSR},
134 |   publisher = {IEEE},
135 |   note={to appear},
136 | }
137 | 
138 | @inproceedings{vasilescu2014ci,
139 |   author = {Vasilescu, Bogdan and van Schuylenburg, Stef and Wulms, Jules and Serebrenik, Alexander and van den Brand, Mark G. J.},
140 |   title = {Continuous integration in a social-coding world: Empirical evidence from {GitHub}},
141 |   booktitle = {Proceedings of the 30th IEEE International Conference on Software Maintenance and Evolution, Early
142 |     Research Achievements},
143 |   year = {2014},
144 |   series = {ICSME},
145 |   pages = {401--405},
146 |   publisher = {IEEE},
147 | }
148 | 
149 | @inproceedings{vasilescu2015gender,
150 |   author = {Vasilescu, Bogdan and Posnett, Daryl and Ray, Baishakhi and van den Brand, Mark G. J. and Serebrenik, Alexander and Devanbu, Premkumar and Filkov, Vladimir}, 
151 |   title = {Gender and tenure diversity in {GitHub} teams},
152 |   booktitle = {Proceedings of the ACM {CHI} Conference on Human Factors in Computing Systems},
153 |   year = {2015},
154 |   series = {CHI},
155 |   publisher = {ACM},
156 |   note={to appear},
157 | }
158 | 
159 | @inproceedings{vasilescu2015chase,
160 |   author = {Vasilescu, Bogdan and Filkov, Vladimir and Serebrenik, Alexander},
161 |   title = {Perceptions of Diversity on {GitHub}: A User Survey},
162 |   booktitle = {Proceedings of the 8th International Workshop on Cooperative and Human Aspects of Software Engineering},
163 |   year = {2015},
164 |   series = {CHASE},
165 |   publisher = {IEEE},
166 |   note={to appear},
167 | }
168 | 
169 | @inproceedings{cabotSaner2015,
170 |   title     = {{Exploring the Use of Labels to Categorize Issues in Open-Source Software Projects}},
171 |   author    = {Cabot, Jordi and C\'anovas Izquierdo, Javier Luis and Cosentino, Valerio and Rolandi, Bel\'en},
172 |   booktitle = {Proceedings of the 22nd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
173 |   pages     = {479--483},
174 |   year      = {2015}
175 | }
176 | 
177 | @inproceedings{canovasSaner2015,
178 |   title     = {{GiLA: GitHub Label Analyzer}},
179 |   author    = {C\'anovas Izquierdo, Javier Luis and Cosentino, Valerio and Rolandi, Bel\'en and Bergel, Alexandre and Cabot, Jordi},
180 |   booktitle = {Proceedings of the 22nd International Conference on Software Analysis, Evolution, and Reengineering (SANER)},
181 |   pages     = {550--554},
182 |   year      = {2015}
183 | }
184 | 
185 | @inproceedings{GZSD15,
186 |   author = {Gousios, Georgios and Zaidman, Andy and Storey, Margaret-Anne and Deursen, Arie van},
187 |   title = {Work Practices and Challenges in Pull-Based Development: The Integrator’s Perspective},
188 |   booktitle = {Proceedings of the 37th International Conference on Software Engineering},
189 |   series = {ICSE 2015},
190 |   year = {2015},
191 |   location = {Florence, Italy}
192 | }
193 | 
194 | @inproceedings{HG15,
195 |   author = {Hauff, Claudia and Gousios, Georgios},
196 |   title = {Matching GitHub developer profiles to job advertisements},
197 |   booktitle = {Proceedings of the 12th International Conference on Mining
198 |     Software Repositories},
199 |   year = {2015},
200 |   location = {Florence, Italy}
201 | }
202 | 
203 | @inproceedings{VGZ15,
204 |   author = {van der Veen, Erik and Gousios, Georgios and Zaidman, Andy},
205 |   title = {Automatically Prioritizing Pull Requests},
206 |   booktitle = {Proceedings of the 12th International Conference on Mining
207 |     Software Repositories},
208 |   year = {2015},
209 |   location = {Florence, Italy}
210 | }
211 | 
212 | @article{BaltesDiehl2018,
213 |  author = {Baltes, Sebastian and Diehl, Stephan},
214 |  title = {{Usage and Attribution of Stack Overflow Code Snippets in GitHub Projects}},
215 |  journal = {{Empirical Software Engineering}},
216 |  year = {2018}
217 | }
218 | 


--------------------------------------------------------------------------------
/faq.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: GHTorrent FAQ
  4 | tagline:
  5 | ---
  6 | 
  7 | This is the GHTorrent FAQ (work in progress). Please ask more questions using
  8 | the form below or by editing [this
  9 | file](https://github.com/ghtorrent/ghtorrent.org/blob/master/faq.md) directly on
 10 | GitHub.
 11 | 
 12 | ## General
 13 | 
 14 | #### _What is GHTorrent?_
 15 | 
 16 | GHTorrent collects all information from the GitHub API and uses it to populate
 17 | two databases: one with [raw data](mongo.html) and one with [linked
 18 | entities](relational.html). Using this data, users can get insights just for
 19 | their repositories or for the full state of OSS development on GitHub.
 20 | 
 21 | GHTorrent has been extensively used by
 22 | [researchers](halloffame.html),
 23 | [companies](https://github.com/Microsoft/ghinsights) and OSS projects
 24 | as a source of software process and product analytics.
 25 | 
 26 | #### _Can I use GHTorrent for my research?_
 27 | 
 28 | Absolutely! [Lots of
 29 | researchers](https://scholar.google.gr/scholar?cites=11132126230347149781) have
 30 | [done so](halloffame.html). You can [download](downloads.html) the database dumps
 31 | or use the [online access services](services.html) to get access to the data.
 32 | 
 33 | When using GHTorrent data for research or large scale repository analysis,
 34 | please consider the perils reported in [this paper](http://gousios.gr/bibliography/KGBSGD15.html).
 35 | 
 36 | #### _Which license is GHTorrent distributed under?_
 37 | 
 38 |  The GHTorrent dataset is distributed under a dual licensing scheme ([Creative Commons +](https://wiki.creativecommons.org/wiki/CCPlus)).
 39 | 
 40 | For non-commercial uses (including, but not limited to, educational, research or personal uses), the dataset is distributed under the [CC-BY-SA](http://creativecommons.org/licenses/by-sa/4.0/) license. <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a>
 41 | 
 42 | For commercial uses, please [contact the maintainer](mailto:gousiosg@gmail.com) for more information. Usually, a sizable donation to the project will be enough
 43 | to grant you full access.
 44 | 
 45 | #### _Who is behind GHTorrent?_
 46 | 
 47 | GHTorrent was initially created and is currently maintained by [Georgios
 48 | Gousios](http://gousios.org), with initial design support and ideas from
 49 | [Diomidis Spinellis](http://spinellis.gr). Several users have contributed code,
 50 | ideas and support over time. Here is a (hopefully not partial) list of them:
 51 | 
 52 | Sebastian Bates, Derek Brown, Arie van Deursen, Daniel German, Jeff McAffer, Bogdan Vasilescu
 53 | 
 54 | Financial support has been provided by the following organizations:
 55 | 
 56 | * TU Delft: purchase and running costs for initial servers (2012 -- late 2015)
 57 | * Microsoft: donation of Azure tokens for running the project infrastructure
 58 |   (late 2015 -- late 2016)
 59 | 
 60 | #### _How is GHTorrent different from Github Archive?_
 61 | 
 62 | [Github Archive](http://githubarchive.org) collects and stores the GitHub event
 63 | stream. In addition to that, GHTorrent applies dependency based retrieval on all
 64 | entities (e.g. commits, pull requests etc) that are linked from the events and
 65 | stores the results in two databases: a raw data one (MongoDB) that stores the
 66 | unprocessed responses from GitHub API and a relational one (MySQL) that stores
 67 | links between the entities (e.g. commits are linked to projects). Using
 68 | GHTorrent, developers can obtain an up-to-date, relational view of their
 69 | project’s GitHub metadata, which can be used for answering questions regarding
 70 | their project’s processes.
 71 | 
 72 | ## How can I...?
 73 | 
 74 | #### _...contribute to GHTorrent?_
 75 | 
 76 | Please read the [contribution guide](contrib.html).
 77 | 
 78 | #### _... cite the GHTorrent data set?_
 79 | 
 80 | Georgios Gousios: [The GHTorrent dataset and tool
 81 | suite](http://www.gousios.gr/bibliography/G13.html). MSR 2013: 233-236
 82 | 
 83 | {%highlight text%}
 84 | @inproceedings{Gousi13,
 85 |   author = {Gousios, Georgios},
 86 |   title = {The GHTorrent dataset and tool suite},
 87 |   booktitle = {Proceedings of the 10th Working Conference on Mining Software
 88 |     Repositories},
 89 |   series = {MSR '13},
 90 |   year = {2013},
 91 |   isbn = {978-1-4673-2936-1},
 92 |   location = {San Francisco, CA, USA},
 93 |   pages = {233--236},
 94 |   numpages = {4},
 95 |   url = {http://dl.acm.org/citation.cfm?id=2487085.2487132},
 96 |   acmid = {2487132},
 97 |   publisher = {IEEE Press},
 98 |   address = {Piscataway, NJ, USA},
 99 | }
100 | {%endhighlight%}
101 | 
102 | #### _...download the data?_
103 | 
104 | You don't need to. GHTorrent offers a multitude of [online
105 | services](services.html) that enable access to almost realtime versions
106 | of the datastores. If you really want to, you can get all the data from
107 | the [downloads](downloads.html) page.
108 | 
109 | #### _...use the data for my private project?_
110 | 
111 | See the licensing information above.
112 | 
113 | ## Data processing
114 | 
115 | #### _What quality guarantees does GHTorrent offer?_
116 | 
117 | The GHTorrent data come as is with no quality guarantees. However, we are
118 | actively seeking to fix systematic (i.e. errors that are repeated across the
119 | whole dataset) data collection errors. Please [open an
120 | issue](https://github.com/gousiosg/github-mirror/issues) if you find one. As
121 | GHTorrent is essentially a data sync operation over unreliable networks,
122 | spurious inconsistencies such as (minor) holes in data collection are
123 | unavoidable.
124 | 
125 | #### _I've seen weird commit timestamps_
126 | 
127 | Git records the commit timestamp on the developer's workstation. If the clock
128 | is misconfigured, timestamps will be weird. We have seen timestamps such
129 | as `0000-01-01 00:00` or `2034-12-31 23:59`. GitHub and GHTorrent do not
130 | process the timestamps in any way.
131 | 
132 | #### _My data is out of date_
133 | 
134 | Github only creates events when an entity is created and not when it is updated or deleted. It is therefore not possible to be completely up-to-date with changes in users (e.g. updated location) and repositories (e.g. renames). GHTorrent tries its best to stay up to date by refreshing all users and all repos every X months. As the DB contains 12M+ users and 30M+ repos, this process may take a while and it can also fail due to spurious reasons.
135 | 
136 | ## Copyright and Privacy
137 | 
138 | #### _Who owns the data that GHTorrent shares?_
139 | 
140 | The copyright situation is very complicated; in essence, GitHub owns copyright
141 | to the data formats for the API responses, users own copyright of the content
142 | they create and the GHTorrent creator has copyright on the GHTorrent database
143 | schemata.
144 | 
145 | #### _What types of privacy guarantees does GHTorrent offer?_
146 | 
147 | GHTorrent collects publicly available data from the GitHub API.
148 | 
149 | #### _How does GHTorrent handle my personal information?_
150 | 
151 | By personal information, we mean data that identify a real person uniquely. In
152 | the context of GHTorrent, these are emails and real names.
153 | 
154 | As of Mar 2016, GHTorrent does not distribute any personal information by
155 | default. Researchers whose research requires access to personal data
156 | can use [this form](pers-data.html) to obtain it.
157 | 
158 | #### _Can I get more information?_
159 | 
160 | Yes. Please read the following Slidedeck. If you are still in doubt,
161 | please contact us.
162 | <div style="width: 50%;margin-left:auto;margin-right:auto;">
163 | <script async class="speakerdeck-embed" data-id="1c64fd1e7dfe4032aff246b2dd1195bf" data-ratio="1.33333" src="//speakerdeck.com/assets/embed.js"></script>
164 | </div>
165 | 
166 | #### _How can I opt out?_
167 | 
168 | We understand that being part of such a big dataset can have consequences for
169 | your online privacy. For this reason (and also to comply with legal data
170 | processing requirements), you can opt out of data collection. If you want to
171 | opt out, please [send us an email](mailto:gousiosg@gmail.com).
172 | 
173 | Opting out means that we will replace your email in the database with
174 | `no-spam@ghtorrent.org` and remove your real name.
175 | 
176 | #### _Contacting users for surveys_
177 | 
178 | (by @slang800)
179 | 
180 | Contacting GitHub users is sometimes necessary for research projects, but
181 | certain people regard this as spam and do not appreciate it. Even people who
182 | would ordinarily be willing to help in surveys can end up becoming hostile to
183 | requests due to the frequency with which they receive them. Due to the number of
184 | researchers who study the free software community, we have to be mindful of how
185 | many emails we are sending. Here are some tips to avoid annoying people:
186 | 
187 | - Do not contact users who have signed up for the
188 |   [do-not-survey-list](https://github.com/slang800/do-not-survey-list). These
189 |   users have explicitly stated that they don't want to be involved in surveys.
190 | 
191 | - Try to limit your data collection to a sample of users. While it may be
192 |   tempting to contact all 12 million developers, it is also likely to leave a
193 |   bad impression of researchers in general, and can make future studies
194 |   difficult.
195 | 
196 | - Don't contact people repeatedly if they fail to reply to your first email. If
197 |   they weren't interested the first time, follow-ups are much more likely to
198 |   bother them.
199 | 
200 | - Don't contact people who have chosen to hide their email address from their
201 |   profile page. People frequently assume that hiding their email from their
202 |   profile will prevent them from being contacted. Sometimes, they don't even
203 |   realize that every commit they make is signed with their email, so we
204 |   shouldn't assume that users are consenting to being contacted, without
205 |   checking the display settings on their profile.
206 | 
207 | - Be especially careful when contacting highly-active developers. Not only do
208 |   these people receive a massive amount of regular email from their
209 |   participation in the free software community, but they are also sent a
210 |   higher-than-average number of requests for surveys too.
211 | 
212 | {% include comments.html%}
213 | 


--------------------------------------------------------------------------------
/msr14.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: MSR 2014 Mining Challenge Dataset
  4 | tagline:
  5 | ---
  6 | ### Versions
  7 | 
  8 | After the initial release of the dataset, the users found errors and missing
  9 | features. The list of versions along with the fixes is presented in the table
 10 | below. Only the latest version is offered for download.
 11 | 
 12 | *You are advised to always run queries against the newest version.* If you have already downloaded an older version and the described fix does not
 13 | affect your experiment, you could skip the update.
 14 | 
 15 | <table class="table table-hover table-condensed">
 16 |   <thead>
 17 |   <tr>
 18 |       <th>Version</th>
 19 |       <th>Release date</th>
 20 |       <th>Fixed error</th>
 21 |   </tr>
 22 |   </thead>
 23 |   <tbody>
 24 |   <tr>
 25 |       <td>1.3</td>
 26 |       <td>13 Dec 2013</td>
 27 |       <td><a href="http://ghtorrent.org/msr14.html#comment-1161196386">Missing project members</a> for some projects is now fixed</td>
 28 |   </tr>
 29 |    <tr>
 30 |       <td>1.2</td>
 31 |       <td>22 Oct 2013</td>
 32 |       <td>user_id in table commit_comments <a href="http://ghtorrent.org/msr14.html#comment-1087775543">not set correctly</a>.</td>
 33 |   </tr>
 34 |     <tr>
 35 |       <td>1.1</td>
 36 |       <td>9 Oct 2013</td>
 37 |       <td>
 38 |       Table commit_comments was missing data. Some commits were missing from
 39 |       some projects.
 40 |       </td>
 41 |   </tr>
 42 |     <tr>
 43 |       <td>1.0</td>
 44 |       <td>28 Sep 2013</td>
 45 |       <td></td>
 46 |   </tr>
 47 |   </tbody>
 48 | </table>
 49 | 
 50 | ### Dataset description
 51 | 
 52 | The MSR 2014 challenge dataset is a (very) trimmed down version of the original
 53 | GHTorrent dataset. It includes data from the top-10 starred software projects
 54 | for the top programming languages on Github, which gives 90 projects and their
 55 | forks. For each project, we retrieved all data including issues, pull requests,
 56 | organizations, followers, stars and labels (milestones and events not
 57 | included). The dataset was constructed from scratch to ensure the latest
 58 | information is in it.
 59 | 
 60 | Similarly to GHTorrent itself, the MSR challenge dataset comes in two flavours:
 61 | 
 62 | * A [MongoDB database dump](http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mongo.tar.gz) containing the results of querying the Github API. See [format here](mongo.html).
 63 | * A [MySQL database dump](http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mysql.gz) containing a queryable version of important fields extracted from the raw data. See [schema here](relational.html).
 64 | 
 65 | The included projects are the following:
 66 | 
 67 | <small>
 68 | [akka/akka](http://github.com/akka/akka)
 69 | [devtools/hadley](http://github.com/hadley/devtools)
 70 | [ProjectTemplate/johnmyleswhite](http://github.com/johnmyleswhite/ProjectTemplate)
 71 | [stat-cookbook/mavam](http://github.com/mavam/stat-cookbook)
 72 | [hiphop-php/facebook](http://github.com/facebook/hiphop-php)
 73 | [knitr/yihui](http://github.com/yihui/knitr)
 74 | [shiny/rstudio](http://github.com/rstudio/shiny)
 75 | [folly/facebook](http://github.com/facebook/folly)
 76 | [mongo/mongodb](http://github.com/mongodb/mongo)
 77 | [doom3.gpl/TTimo](http://github.com/TTimo/doom3.gpl)
 78 | [phantomjs/ariya](http://github.com/ariya/phantomjs)
 79 | [TrinityCore/TrinityCore](http://github.com/TrinityCore/TrinityCore)
 80 | [MaNGOS/mangos](http://github.com/mangos/MaNGOS)
 81 | [bitcoin/bitcoin](http://github.com/bitcoin/bitcoin)
 82 | [mosh/keithw](http://github.com/keithw/mosh)
 83 | [xbmc/xbmc](http://github.com/xbmc/xbmc)
 84 | [http-parser/joyent](http://github.com/joyent/http-parser)
 85 | [beanstalkd/kr](http://github.com/kr/beanstalkd)
 86 | [redis/antirez](http://github.com/antirez/redis)
 87 | [ccv/liuliu](http://github.com/liuliu/ccv)
 88 | [memcached/memcached](http://github.com/memcached/memcached)
 89 | [openFrameworks/openframeworks](http://github.com/openframeworks/openFrameworks)
 90 | [libgit2/libgit2](http://github.com/libgit2/libgit2)
 91 | [redcarpet/vmg](http://github.com/vmg/redcarpet)
 92 | [libuv/joyent](http://github.com/joyent/libuv)
 93 | [SignalR/SignalR](http://github.com/SignalR/SignalR)
 94 | [SparkleShare/hbons](http://github.com/hbons/SparkleShare)
 95 | [plupload/moxiecode](http://github.com/moxiecode/plupload)
 96 | [mono/mono](http://github.com/mono/mono)
 97 | [Nancy/NancyFx](http://github.com/NancyFx/Nancy)
 98 | [ServiceStack/ServiceStack](http://github.com/ServiceStack/ServiceStack)
 99 | [AutoMapper/AutoMapper](http://github.com/AutoMapper/AutoMapper)
100 | [RestSharp/restsharp](http://github.com/restsharp/RestSharp)
101 | [ravendb/ravendb](http://github.com/ravendb/ravendb)
102 | [MiniProfiler/SamSaffron](http://github.com/SamSaffron/MiniProfiler)
103 | [storm/nathanmarz](http://github.com/nathanmarz/storm)
104 | [elasticsearch/elasticsearch](http://github.com/elasticsearch/elasticsearch)
105 | [ActionBarSherlock/JakeWharton](http://github.com/JakeWharton/ActionBarSherlock)
106 | [facebook-android-sdk/facebook](http://github.com/facebook/facebook-android-sdk)
107 | [clojure/clojure](http://github.com/clojure/clojure)
108 | [CraftBukkit/Bukkit](http://github.com/Bukkit/CraftBukkit)
109 | [netty/netty](http://github.com/netty/netty)
110 | [android/github](http://github.com/github/android)
111 | [node/joyent](http://github.com/joyent/node)
112 | [jquery/jquery](http://github.com/jquery/jquery)
113 | [html5-boilerplate/h5bp](http://github.com/h5bp/html5-boilerplate)
114 | [impress.js/bartaz](http://github.com/bartaz/impress.js)
115 | [d3/mbostock](http://github.com/mbostock/d3)
116 | [chosen/harvesthq](http://github.com/harvesthq/chosen)
117 | [Font-Awesome/FortAwesome](http://github.com/FortAwesome/Font-Awesome)
118 | [three.js/mrdoob](http://github.com/mrdoob/three.js)
119 | [foundation/zurb](http://github.com/zurb/foundation)
120 | [symfony/symfony](http://github.com/symfony/symfony)
121 | [CodeIgniter/EllisLab](http://github.com/EllisLab/CodeIgniter)
122 | [php-sdk/facebook](http://github.com/facebook/php-sdk)
123 | [zf2/zendframework](http://github.com/zendframework/zf2)
124 | [cakephp/cakephp](http://github.com/cakephp/cakephp)
125 | [ThinkUp/ginatrapani](http://github.com/ginatrapani/ThinkUp)
126 | [phpunit/sebastianbergmann](http://github.com/sebastianbergmann/phpunit)
127 | [Slim/codeguy](http://github.com/codeguy/Slim)
128 | [django/django](http://github.com/django/django)
129 | [tornado/facebook](http://github.com/facebook/tornado)
130 | [httpie/jkbr](http://github.com/jkbr/httpie)
131 | [flask/mitsuhiko](http://github.com/mitsuhiko/flask)
132 | [requests/kennethreitz](http://github.com/kennethreitz/requests)
133 | [symfony/xphere-forks](http://github.com/xphere-forks/symfony)
134 | [reddit/reddit](http://github.com/reddit/reddit)
135 | [boto/boto](http://github.com/boto/boto)
136 | [django-debug-toolbar/django-debug-toolbar](http://github.com/django-debug-toolbar/django-debug-toolbar)
137 | [Sick-Beard/midgetspy](http://github.com/midgetspy/Sick-Beard)
138 | [django-cms/divio](http://github.com/divio/django-cms)
139 | [rails/rails](http://github.com/rails/rails)
140 | [homebrew/mxcl](http://github.com/mxcl/homebrew)
141 | [jekyll/mojombo](http://github.com/mojombo/jekyll)
142 | [gitlabhq/gitlabhq](http://github.com/gitlabhq/gitlabhq)
143 | [diaspora/diaspora](http://github.com/diaspora/diaspora)
144 | [devise/plataformatec](http://github.com/plataformatec/devise)
145 | [blueprint-css/joshuaclayton](http://github.com/joshuaclayton/blueprint-css)
146 | [octopress/imathis](http://github.com/imathis/octopress)
147 | [vinc.cc/vinc](http://github.com/vinc/vinc.cc)
148 | [paperclip/thoughtbot](http://github.com/thoughtbot/paperclip)
149 | [compass/chriseppstein](http://github.com/chriseppstein/compass)
150 | [finagle/twitter](http://github.com/twitter/finagle)
151 | [kestrel/robey](http://github.com/robey/kestrel)
152 | [flockdb/twitter](http://github.com/twitter/flockdb)
153 | [gizzard/twitter](http://github.com/twitter/gizzard)
154 | [sbt/sbt](http://github.com/sbt/sbt)
155 | [scala/scala](http://github.com/scala/scala)
156 | [scalatra/scalatra](http://github.com/scalatra/scalatra)
157 | [zipkin/twitter](http://github.com/twitter/zipkin)
158 | </small>
159 | ### Importing and using
160 | 
161 | The following instructions assume an OSX or Linux based host.
162 | 
163 | #### MongoDB
164 | 
165 | {%highlight bash%}
166 | 
167 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mongo.tar.gz
168 | $ tar zxvf msr14-mongo.tar.gz
169 | $ mongorestore
170 | $ mongo msr14
171 | mongo> db.commits.count()
172 | 601080
173 | mongo> db.issues.count()
174 | 126308
175 | {%endhighlight %}
176 | 
177 | #### MySQL
178 | 
179 | {%highlight bash%}
180 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mysql.gz
181 | $ mysql -u root -p
182 | mysql> create user 'msr14'@'localhost' identified by 'msr14';
183 | mysql> create database msr14;
184 | mysql> GRANT ALL PRIVILEGES ON msr14.* to msr14@'localhost';
185 | mysql> flush privileges;
186 | # Exit MySQL prompt
187 | $ zcat msr14-mysql.gz |mysql -u msr14 -p msr14
188 | $ mysql -u msr14 -p msr14
189 | mysql> select language,count(*) from projects where forked_from is null group by language;
190 | +------------+----------+
191 | | language   | count(*) |
192 | +------------+----------+
193 | | C          |       10 |
194 | | C#         |        8 |
195 | | C++        |        8 |
196 | | CSS        |        3 |
197 | | Go         |        1 |
198 | | Java       |        8 |
199 | | JavaScript |        9 |
200 | | PHP        |        9 |
201 | | Python     |       10 |
202 | | R          |        4 |
203 | | Ruby       |       10 |
204 | | Scala      |        9 |
205 | | TypeScript |        1 |
206 | +------------+----------+
207 | 13 rows in set (0.01 sec)
208 | {%endhighlight %}
209 | 
210 | ### FAQ
211 | 
212 | Answers to frequently asked questions
213 | 
214 | #### Why a new dataset?
215 | 
216 | For practical reasons. The dataset is small enough to be used on a laptop,
217 | yet rich enough to do really interesting research with it.
218 | 
219 | #### What are the hardware requirements?
220 | 
221 | We have successfully imported and used both dumps on a 2011 MacBook Air with 4GB
222 | of RAM. Your mileage may vary, but relatively new systems with more than 4GB RAM should have no trouble with both databases. If you only need to use the MySQL data dump, the hardware requirements are even lower.
223 | 
224 | #### Why two databases? Do I need both?
225 | 
226 | Not necessarily. The MySQL database can readily cover many aspects of activity
227 | on GitHub. Perhaps the only reason to use the MongoDB dump is to analyse commit contents, branches affected by pull requests, or milestones, none of which are included in MySQL.
228 | 
229 | #### How can I ask a question about the dataset?
230 | 
231 | Your question and the potential answer might be useful for other people as well,
232 | so please use the form below. *Please note that I will not answer
233 | questions sent to my email.*
234 | 
235 | {% include comments.html%}
236 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
  1 | require "rubygems"
  2 | require 'rake'
  3 | require 'yaml'
  4 | require 'time'
  5 | 
  6 | SOURCE = "."
  7 | CONFIG = {
  8 |   'version' => "0.2.13",
  9 |   'themes' => File.join(SOURCE, "_includes", "themes"),
 10 |   'layouts' => File.join(SOURCE, "_layouts"),
 11 |   'posts' => File.join(SOURCE, "_posts"),
 12 |   'post_ext' => "md",
 13 |   'theme_package_version' => "0.1.0"
 14 | }
 15 | 
 16 | # Path configuration helper
 17 | module JB
 18 |   class Path
 19 |     SOURCE = "."
 20 |     Paths = {
 21 |       :layouts => "_layouts",
 22 |       :themes => "_includes/themes",
 23 |       :theme_assets => "assets/themes",
 24 |       :theme_packages => "_theme_packages",
 25 |       :posts => "_posts"
 26 |     }
 27 | 
 28 |     def self.base
 29 |       SOURCE
 30 |     end
 31 | 
 32 |     # build a path relative to configured path settings.
 33 |     def self.build(path, opts = {})
 34 |       opts[:root] ||= SOURCE
 35 |       path = "#{opts[:root]}/#{Paths[path.to_sym]}/#{opts[:node]}".split("/")
 36 |       path.compact!
 37 |       File.__send__ :join, path
 38 |     end
 39 | 
 40 |   end #Path
 41 | end #JB
 42 | 
 43 | # Usage: rake post title="A Title" [date="2012-02-09"]
 44 | desc "Begin a new post in #{CONFIG['posts']}"
 45 | task :post do
 46 |   abort("rake aborted: '#{CONFIG['posts']}' directory not found.") unless FileTest.directory?(CONFIG['posts'])
 47 |   title = ENV["title"] || "new-post"
 48 |   slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
 49 |   begin
 50 |     date = (ENV['date'] ? Time.parse(ENV['date']) : Time.now).strftime('%Y-%m-%d')
 51 |   rescue Exception => e
 52 |     puts "Error - date format must be YYYY-MM-DD, please check you typed it correctly!"
 53 |     exit -1
 54 |   end
 55 |   filename = File.join(CONFIG['posts'], "#{date}-#{slug}.#{CONFIG['post_ext']}")
 56 |   if File.exist?(filename)
 57 |     abort("rake aborted!") if ask("#{filename} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
 58 |   end
 59 | 
 60 |   puts "Creating new post: #{filename}"
 61 |   open(filename, 'w') do |post|
 62 |     post.puts "---"
 63 |     post.puts "layout: post"
 64 |     post.puts "title: \"#{title.gsub(/-/,' ')}\""
 65 |     post.puts 'description: ""'
 66 |     post.puts "category: "
 67 |     post.puts "tags: []"
 68 |     post.puts "---"
 69 |     post.puts "{% include JB/setup %}"
 70 |   end
 71 | end # task :post
 72 | 
 73 | # Usage: rake page name="about.html"
 74 | # You can also specify a sub-directory path.
 75 | # If you don't specify a file extention we create an index.html at the path specified
 76 | desc "Create a new page."
 77 | task :page do
 78 |   name = ENV["name"] || "new-page.md"
 79 |   filename = File.join(SOURCE, "#{name}")
 80 |   filename = File.join(filename, "index.html") if File.extname(filename) == ""
 81 |   title = File.basename(filename, File.extname(filename)).gsub(/[\W\_]/, " ").gsub(/\b\w/){$&.upcase}
 82 |   if File.exist?(filename)
 83 |     abort("rake aborted!") if ask("#{filename} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
 84 |   end
 85 | 
 86 |   mkdir_p File.dirname(filename)
 87 |   puts "Creating new page: #{filename}"
 88 |   open(filename, 'w') do |post|
 89 |     post.puts "---"
 90 |     post.puts "layout: page"
 91 |     post.puts "title: \"#{title}\""
 92 |     post.puts 'description: ""'
 93 |     post.puts "---"
 94 |     post.puts "{% include JB/setup %}"
 95 |   end
 96 | end # task :page
 97 | 
 98 | desc "Launch preview environment"
 99 | task :preview do
100 |   system "jekyll --auto --server"
101 | end # task :preview
102 | 
103 | # Public: Alias - Maintains backwards compatability for theme switching.
104 | task :switch_theme => "theme:switch"
105 | 
106 | namespace :theme do
107 | 
108 |   # Public: Switch from one theme to another for your blog.
109 |   #
110 |   # name - String, Required. name of the theme you want to switch to.
111 |   #        The the theme must be installed into your JB framework.
112 |   #
113 |   # Examples
114 |   #
115 |   #   rake theme:switch name="the-program"
116 |   #
117 |   # Returns Success/failure messages.
118 |   desc "Switch between Jekyll-bootstrap themes."
119 |   task :switch do
120 |     theme_name = ENV["name"].to_s
121 |     theme_path = File.join(CONFIG['themes'], theme_name)
122 |     settings_file = File.join(theme_path, "settings.yml")
123 |     non_layout_files = ["settings.yml"]
124 | 
125 |     abort("rake aborted: name cannot be blank") if theme_name.empty?
126 |     abort("rake aborted: '#{theme_path}' directory not found.") unless FileTest.directory?(theme_path)
127 |     abort("rake aborted: '#{CONFIG['layouts']}' directory not found.") unless FileTest.directory?(CONFIG['layouts'])
128 | 
129 |     Dir.glob("#{theme_path}/*") do |filename|
130 |       next if non_layout_files.include?(File.basename(filename).downcase)
131 |       puts "Generating '#{theme_name}' layout: #{File.basename(filename)}"
132 | 
133 |       open(File.join(CONFIG['layouts'], File.basename(filename)), 'w') do |page|
134 |         if File.basename(filename, ".html").downcase == "default"
135 |           page.puts "---"
136 |           page.puts File.read(settings_file) if File.exist?(settings_file)
137 |           page.puts "---"
138 |         else
139 |           page.puts "---"
140 |           page.puts "layout: default"
141 |           page.puts "---"
142 |         end
143 |         page.puts "{% include JB/setup %}"
144 |         page.puts "{% include themes/#{theme_name}/#{File.basename(filename)} %}"
145 |       end
146 |     end
147 | 
148 |     puts "=> Theme successfully switched!"
149 |     puts "=> Reload your web-page to check it out =)"
150 |   end # task :switch
151 | 
152 |   # Public: Install a theme using the theme packager.
153 |   # Version 0.1.0 simple 1:1 file matching.
154 |   #
155 |   # git  - String, Optional path to the git repository of the theme to be installed.
156 |   # name - String, Optional name of the theme you want to install.
157 |   #        Passing name requires that the theme package already exist.
158 |   #
159 |   # Examples
160 |   #
161 |   #   rake theme:install git="https://github.com/jekyllbootstrap/theme-twitter.git"
162 |   #   rake theme:install name="cool-theme"
163 |   #
164 |   # Returns Success/failure messages.
165 |   desc "Install theme"
166 |   task :install do
167 |     if ENV["git"]
168 |       manifest = theme_from_git_url(ENV["git"])
169 |       name = manifest["name"]
170 |     else
171 |       name = ENV["name"].to_s.downcase
172 |     end
173 | 
174 |     packaged_theme_path = JB::Path.build(:theme_packages, :node => name)
175 | 
176 |     abort("rake aborted!
177 |       => ERROR: 'name' cannot be blank") if name.empty?
178 |     abort("rake aborted!
179 |       => ERROR: '#{packaged_theme_path}' directory not found.
180 |       => Installable themes can be added via git. You can find some here: http://github.com/jekyllbootstrap
181 |       => To download+install run: `rake theme:install git='[PUBLIC-CLONE-URL]'`
182 |       => example : rake theme:install git='git@github.com:jekyllbootstrap/theme-the-program.git'
183 |     ") unless FileTest.directory?(packaged_theme_path)
184 | 
185 |     manifest = verify_manifest(packaged_theme_path)
186 | 
187 |     # Get relative paths to packaged theme files
188 |     # Exclude directories as they'll be recursively created. Exclude meta-data files.
189 |     packaged_theme_files = []
190 |     FileUtils.cd(packaged_theme_path) {
191 |       Dir.glob("**/*.*") { |f|
192 |         next if ( FileTest.directory?(f) || f =~ /^(manifest|readme|packager)/i )
193 |         packaged_theme_files << f
194 |       }
195 |     }
196 | 
197 |     # Mirror each file into the framework making sure to prompt if already exists.
198 |     packaged_theme_files.each do |filename|
199 |       file_install_path = File.join(JB::Path.base, filename)
200 |       if File.exist? file_install_path
201 |         next if ask("#{file_install_path} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
202 |       else
203 |         mkdir_p File.dirname(file_install_path)
204 |         cp_r File.join(packaged_theme_path, filename), file_install_path
205 |       end
206 |     end
207 | 
208 |     puts "=> #{name} theme has been installed!"
209 |     puts "=> ---"
210 |     if ask("=> Want to switch themes now?", ['y', 'n']) == 'y'
211 |       system("rake switch_theme name='#{name}'")
212 |     end
213 |   end
214 | 
215 |   # Public: Package a theme using the theme packager.
216 |   # The theme must be structured using valid JB API.
217 |   # In other words packaging is essentially the reverse of installing.
218 |   #
219 |   # name - String, Required name of the theme you want to package.
220 |   #
221 |   # Examples
222 |   #
223 |   #   rake theme:package name="twitter"
224 |   #
225 |   # Returns Success/failure messages.
226 |   desc "Package theme"
227 |   task :package do
228 |     name = ENV["name"].to_s.downcase
229 |     theme_path = JB::Path.build(:themes, :node => name)
230 |     asset_path = JB::Path.build(:theme_assets, :node => name)
231 | 
232 |     abort("rake aborted: name cannot be blank") if name.empty?
233 |     abort("rake aborted: '#{theme_path}' directory not found.") unless FileTest.directory?(theme_path)
234 |     abort("rake aborted: '#{asset_path}' directory not found.") unless FileTest.directory?(asset_path)
235 | 
236 |     ## Mirror theme's template directory (_includes)
237 |     packaged_theme_path = JB::Path.build(:themes, :root => JB::Path.build(:theme_packages, :node => name))
238 |     mkdir_p packaged_theme_path
239 |     cp_r theme_path, packaged_theme_path
240 | 
241 |     ## Mirror theme's asset directory
242 |     packaged_theme_assets_path = JB::Path.build(:theme_assets, :root => JB::Path.build(:theme_packages, :node => name))
243 |     mkdir_p packaged_theme_assets_path
244 |     cp_r asset_path, packaged_theme_assets_path
245 | 
246 |     ## Log packager version
247 |     packager = {"packager" => {"version" => CONFIG["theme_package_version"].to_s } }
248 |     open(JB::Path.build(:theme_packages, :node => "#{name}/packager.yml"), "w") do |page|
249 |       page.puts packager.to_yaml
250 |     end
251 | 
252 |     puts "=> '#{name}' theme is packaged and available at: #{JB::Path.build(:theme_packages, :node => name)}"
253 |   end
254 | 
255 | end # end namespace :theme
256 | 
# Internal: Download and process a theme from a git url.
# Notice we don't know the name of the theme until we look it up in the manifest.
# So we'll have to change the folder name once we get the name.
#
# url - String, Required url to git repository.
#
# Returns theme manifest hash. Aborts when the clone fails, when the manifest
# has no theme name, or when the user declines to replace an existing package.
def theme_from_git_url(url)
  tmp_path = JB::Path.build(:theme_packages, :node => "_tmp")

  # A previous run that aborted mid-way leaves the scratch checkout behind,
  # and git refuses to clone into a non-empty directory.
  remove_dir(tmp_path) if File.exist?(tmp_path)

  # Argument-list form bypasses the shell, so URLs containing spaces or
  # shell metacharacters are passed to git unchanged.
  abort("rake aborted: system call to git clone failed") unless system("git", "clone", url, tmp_path)

  manifest = verify_manifest(tmp_path)
  theme_name = manifest.is_a?(Hash) ? manifest["name"].to_s : ""
  if theme_name.empty?
    # Without a name, new_path would be the theme package root itself and the
    # remove_dir below would wipe every installed package.
    remove_dir(tmp_path)
    abort("rake aborted: manifest.yml must define a theme name")
  end

  new_path = JB::Path.build(:theme_packages, :node => theme_name)
  if File.exist?(new_path) && ask("=> #{new_path} theme package already exists. Override?", ['y', 'n']) == 'n'
    remove_dir(tmp_path)
    abort("rake aborted: '#{theme_name}' already exists as theme package.")
  end

  remove_dir(new_path) if File.exist?(new_path)
  mv(tmp_path, new_path)
  manifest
end
278 | 
# Internal: Process theme package manifest file.
#
# theme_path - String, Required. File path to theme package.
#
# Returns theme manifest hash. Aborts with an error message when manifest.yml
# is missing, is not parseable YAML, or does not contain a mapping.
def verify_manifest(theme_path)
  manifest_path = File.join(theme_path, "manifest.yml")
  invalid_manifest = "rake aborted: repo must contain valid manifest.yml"

  # Check before opening: opening a missing file raises Errno::ENOENT and the
  # friendly abort message would never be shown.
  abort(invalid_manifest) unless File.file?(manifest_path)

  begin
    # Block form closes the handle even when parsing raises.
    # NOTE(review): the manifest comes from a third-party theme repository;
    # confirm the installed YAML library restricts object deserialisation.
    manifest = File.open(manifest_path) { |manifest_file| YAML.load(manifest_file) }
  rescue Psych::SyntaxError
    abort(invalid_manifest)
  end

  # An empty or scalar-only file parses to nil/false/String, which callers
  # would then index like a Hash.
  abort(invalid_manifest) unless manifest.is_a?(Hash)
  manifest
end
292 | 
# Internal: Prompt the user on standard input.
#
# message       - String prompt to display.
# valid_options - Array of accepted answers, or nil/false to accept anything.
#
# With valid_options the prompt is suffixed with the choices (e.g. "[y/n] ")
# and repeated until one of them is entered. Returns the answer String.
def ask(message, valid_options)
  return get_stdin(message) unless valid_options

  choices = valid_options.to_s.gsub(/"/, '').gsub(/, /,'/')
  reply = nil
  until valid_options.include?(reply)
    reply = get_stdin("#{message} #{choices} ")
  end
  reply
end
301 | 
# Internal: Print a prompt and read one line from standard input.
#
# message - String prompt, printed without a trailing newline.
#
# Returns the entered line without its line terminator. Aborts when standard
# input is at end-of-file, since no answer can ever arrive.
def get_stdin(message)
  print message
  # The prompt has no newline, so flush to make sure it is visible before
  # blocking on input.
  $stdout.flush
  line = STDIN.gets
  # gets returns nil at EOF (closed or redirected-from-empty STDIN).
  abort("rake aborted: no input available on STDIN") if line.nil?
  line.chomp
end
306 | 
# Load custom rake scripts from the _rake directory
Dir.glob('_rake/*.rake').each do |script|
  load script
end
309 | 


--------------------------------------------------------------------------------
/relational.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: The relational DB schema
  4 | tagline:
  5 | ---
  6 | 
  7 | <img width="20%" src="files/schema.png"/>
  8 | 
  9 | [Download](files/schema.png) [Download PDF](files/schema.pdf)
 10 | 
 11 | ## Entities and their relationships
 12 | 
 13 | #### users
 14 | Github users.
 15 | 
 16 | * A user has a unique user name or email. May contain artificially generated user names, see [commits](relational.html#commits) below.
 17 | * There are two `type`s of users, `USER`s and `ORG`anizations.
 18 |   * Users can be *real* or *fake*. Real users can own projects and perform
 19 |  actions such as open issues, create pull requests and push commits. Fake
 20 |  users only appear as authors or committers of commits. Fake users are marked
 21 |  by the `fake` field.
 22 |   * Organizations are meta users that point to a collection of users. The members of organizations can be found in `organization_members`. Organization users can only own projects and they do not perform any other actions.
 23 | * Users may be marked as `deleted`. This means that the user was once active on
 24 | GitHub but GHTorrent can no longer get his/her details.
 25 | 
 26 | *Update Nov 2015:* User entries are now geocoded. The location field remains
 27 | intact, while 5 fields have been added with information about the
 28 | geographic location of the user. The Open Street Maps API has been used
 29 | to do the mapping of the location field to the user's geocode. As a result,
 30 | the state and city fields are stored in the local language of the geocoded
 31 | area. Also, many users do not report their location or their location
 32 | is filled in with random information; in those cases, no geocoding information
 33 | is available.
 34 | 
 35 | {% highlight sql %}
 36 | -- See where most commits come from today
 37 | select u.country_code, count(*)
 38 | from commits c, users u
 39 | where c.author_id = u.id
 40 | and date(c.created_at) = date(now())
 41 | group by u.country_code
 42 | {% endhighlight %}
 43 | 
 44 | 
 45 | *Update Mar 2016:* User personal data (emails and real names) are excluded
 46 | from the downloaded dump, while configuration disallows access to those
 47 | fields for the online access services for the MySQL database.
 48 | 
 49 | 
 50 | #### organization\_members
 51 | Users that are members of an organization.
 52 | 
 53 | * The `created_at` field is only filled in accurately for memberships for which
 54 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
 55 | latest date that the corresponding user or organization has been created.
 56 | 
 57 | *Update Nov 2015:* Organizations can now select whether membership information
 58 | is revealed to external parties. This means that the information in this
 59 | table may no longer be accurate.
 60 | 
 61 | #### projects
 62 | Information about repositories. A repository is always owned by a user.
 63 | 
 64 | * The `forked_from` field is empty unless the
 65 | project is a fork in which case it contains the `id` of the project the project
 66 | is forked from.
 67 | 
 68 | * The `deleted` field means that the project has been deleted from Github.
 69 | 
 70 | * The `updated_at` field indicates when the last full update was done for
 71 | this project.
 72 | 
 73 | #### project\_members
 74 | Users that have commit access to the repository.
 75 | 
 76 | The `created_at` field is only filled in accurately for memberships for which
 77 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
 78 | latest date that the corresponding user or project has been created.
 79 | 
 80 | *Update Nov 2014:* GitHub has disabled the API end point used to retrieve
 81 | members to an organization. GHTorrent uses the `MemberEvent` event to
 82 | approximate memberships, but this is not always accurate. You are thus advised
 83 | to use heuristics (e.g. the committers + mergers of pull requests) to calculate membership,
 84 | such as the following:
 85 | 
 86 | {% highlight sql %}
 87 | -- Get active core team participants for the last 3 months
 88 | select distinct(u.login) as login
 89 |     from commits c, users u, project_commits pc, users u1, projects p
 90 |     where u.id = c.committer_id
 91 |       and u.fake is false
 92 |       and pc.commit_id = c.id
 93 |       and pc.project_id = p.id
 94 |       and p.owner_id = u1.id
 95 |       and p.name = 'rails'
 96 |       and u1.login = 'rails'
 97 |       and c.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH)
 98 | union
 99 | select distinct(u.login) as login
100 |   from pull_requests pr, projects p, users u, users u1, pull_request_history prh
101 |   where u.id = prh.actor_id
102 |     and prh.action = 'merged'
103 |     and u1.id = p.owner_id
104 |     and prh.pull_request_id = pr.id
105 |     and pr.base_repo_id = p.id
106 |     and prh.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH)
107 |     and p.name = 'rails'
108 |     and u1.login = 'rails'
109 | {% endhighlight %}
110 | 
111 | 
112 | #### project\_languages
113 | Languages that are used in the repository along with **byte counts** for
114 | all files in those languages.
115 | 
116 | Multiple entries can exist per project. The `created_at` field is filled in with
117 | the latest timestamp the query for a specific `project_id` was done.
118 | 
119 | The table is filled in when the project is first inserted or when
120 | an update round for all projects is made.
121 | 
122 | {% highlight sql %}
123 | -- Get the latest byte count for languages in Ruby on Rails
124 | select *
125 | from project_languages
126 | where project_id = 1334
127 | order by created_at desc
128 | 
129 | {% endhighlight %}
130 | 
131 | #### commits
132 | Unique commits.
133 | 
134 | * Each commit is identified globally through its `sha` field. If the author or
135 | the committer has not configured his [Github email address](https://help.github.com/articles/setting-your-email-in-git), no resolution to
136 | a `user` entry is possible. In that case, GHTorrent generates artificial users using the provided email in the Git commit author or committer fields. If the user
137 | then configures his Github account, GHTorrent will update the artificial user
138 | accordingly. 
139 | 
140 | * The `project_id` field contains a link to the project that this commit has
141 | been first associated with. This might not be the project this commit was
142 | initially pushed to, e.g. in case the fork is processed before the parent.
143 | See [project\_commits](relational.html#project_commits).
144 | 
145 | * The `project_id` field may be null when the repository has been
146 | deleted at the time the commit is processed. This situation might happen when
147 | retrospectively processing pull requests for a repository and the 
148 | repository which the pull request originates from has been deleted.
149 | 
150 | #### commit\_parents
151 | The parent commit(s) for each commit, as specified by Git.
152 | 
153 | #### project\_commits
154 | The commits belonging to the history of a project.
155 | 
156 | More than one project can share the same commits if one is a fork of the other.
157 | 
158 | #### commit\_comments
159 | Code review comments on commits.
160 | 
161 | These are comments on individual commits. If a commit is associated with a pull
162 | request, then its comments are in the
163 | [pull\_request\_comments](relational.html#pull_request_comments) table.
164 | 
165 | #### followers
166 | A follower to a user.
167 | 
168 | The `created_at` field is only filled in accurately for followships for which
169 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
170 | latest date that the corresponding user or follower has been created.
171 | 
172 | #### watchers
173 | Users that have starred (was [watched](https://github.com/blog/1204-notifications-stars)) a project
174 | 
175 | The `created_at` field is only filled in accurately for starrings for which
176 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
177 | latest date that the corresponding user or project has been created.
178 | 
179 | #### pull\_requests
180 | A pull request initiated from `head_repo_id`:`head_commit_id` to `base_repo_id`:`base_commit_id`
181 | 
182 | * Pull requests can be in various states. The states and their transitions
183 | are recorded in the [pull\_request\_history](relational.html#pull_request_history) table.
184 | * The `pullreq_id` field is Github's pull request unique identifier
185 | * The `intra_branch` field signifies that the head and base repositories are the
186 | same
187 | * If the head repository is NULL, this means that the corresponding project had been deleted when GHTorrent processed the pull request.
188 | 
189 | #### pull\_request\_history
190 | An event in the pull request lifetime
191 | 
192 | The `action` field can take the following values
193 | 
194 | * `opened`: When the pull request has been opened
195 | * `closed`: When the pull request has been closed
196 | * `merged`: When Github detected that the pull request has been merged. No merges
197 | outside Github (i.e. Git based) are reported
198 | * `reopened`: When a pull request is opened after being closed
199 | * `synchronize`: When new commits are added/removed to the head repository
200 | 
201 | #### pull\_request\_commits
202 | A commit associated with a pull request
203 | 
204 | The list is additive. This means if a rebase with commit squashing takes place after the commits of a pull request have been processed, the old commits will not be deleted.
205 | 
206 | #### pull\_request\_comments
207 | A code review comment on a commit associated with a pull request
208 | 
209 | The list is additive. If commits are squashed on the head repo, the comments
210 | remain intact.
211 | 
212 | #### issues
213 | An issue associated with a repository
214 | 
215 | * The `assignee` field is filled in with the user to which the issue was
216 | assigned at the time the issue was processed.
217 | * Issues have history recorded in the [issue\_events](relational.html#issue_events) table.
218 | * For every pull request, GHTorrent creates a corresponding issue. The
219 | `pull_request_id` field points to the associated pull request
220 | * The `issue_id` field is the unique identifier given to the issue by Github.
221 | 
222 | #### issue\_events
223 | An event on an issue
224 | 
225 | * The `action` field can have the following values: 
226 |   * `subscribed`: When a user subscribes to receive notifications about the issue.
227 |   * `mentioned`: When a user is mentioned by another user (@user notation)
228 |   * `closed`: When the issue has been closed
229 |   * `referenced`: The issue was referenced in a commit (using the 
230 |    [fixes: conventions](https://github.com/blog/831-issues-2-0-the-next-generation))
231 |   * `assigned`: When the issue has been assigned to an actor.
232 |   * `reopened`: When a closed issue is reopened
233 |   * `unsubscribed`: When a user unsubscribed from issue.
234 |   * `merged`: When the pull request pointed by the issue has been merged.
235 |   * `head_ref_cleaned`:  (Not documented) ?
236 |   * `head_ref_deleted`: (Not documented) When the branch of the head repository has been deleted
237 |   * `head_ref_restored`: (Not documented) When the head repository of a pull
238 |   request has been restored (using the restore branch functionality).
239 | 
240 | * The `action_specific` field gets filled in with the `commit_id` of the last
241 | commit when a pull request has been closed, merged or referenced.
242 | 
243 | #### issue\_comments
244 | An entry to the issue discussion. This table is always filled in with pull
245 | request (or issue) discussion comments, irrespective of whether the repository
246 | has issues enabled or not.
247 | 
248 | #### repo\_labels
249 | A label to be assigned to an issue affecting this repository.
250 | 
251 | #### issue\_labels
252 | A label that has been assigned to an issue
253 | 
254 | ## Example queries
255 | 
256 | #### List commits for a repository
257 | 
258 | {%highlight sql%}
259 | select c.*
260 | from commits c, project_commits pc, projects p, users u
261 | where u.login = 'rails'
262 |   and p.owner_id = u.id and p.name = 'rails'
263 |   and p.id = pc.project_id
264 |   and c.id = pc.commit_id
265 | order by c.created_at desc
266 | {%endhighlight%}
267 | 
268 | #### Get all actions for a pull request
269 | 
270 | {%highlight sql%}
271 | select user, action, created_at from  -- one row per (actor login, action, timestamp) across the four sources below
272 | (
273 |   select prh.action as action, prh.created_at as created_at, u.login as user  -- actions recorded in pull_request_history
274 |   from pull_request_history prh, users u
275 |   where prh.pull_request_id = ?
276 |     and prh.actor_id = u.id
277 |   union
278 |   select ie.action as action, ie.created_at as created_at, u.login as user  -- events on the issue linked to the pull request
279 |   from issues i, issue_events ie, users u
280 |   where ie.issue_id = i.id
281 |     and i.pull_request_id = ?
282 |     and ie.actor_id = u.id
283 |   union
284 |   select 'discussed' as action, ic.created_at as created_at, u.login as user  -- issue comments, labelled 'discussed'
285 |   from issues i, issue_comments ic, users u
286 |   where ic.issue_id = i.id
287 |     and u.id = ic.user_id
288 |     and i.pull_request_id = ?
289 |   union
290 |   select 'reviewed' as action, prc.created_at as created_at, u.login as user  -- pull request comments, labelled 'reviewed'
291 |   from pull_request_comments prc, users u
292 |   where prc.user_id = u.id
293 |     and prc.pull_request_id = ?
294 | ) as actions
295 | order by created_at;
296 | {%endhighlight%}
297 | 
298 | #### Get participants in an issue or pull request
299 | 
300 | {%highlight sql%}
301 | select distinct participants.user_id from
302 | (
303 |   select prc.user_id
304 |   from pull_request_comments prc
305 |   where prc.pull_request_id = ?
306 |   union
307 |   select ic.user_id
308 |   from issue_comments ic
309 |     join issues i on i.id = ic.issue_id and i.pull_request_id = ?
310 | ) as participants
311 | {%endhighlight%}
312 | 
313 | #### Get all users in NL that committed to a Java project today
314 | 
315 | {%highlight sql%}
316 | select distinct u.login  -- one row per user, not one per commit
317 | from users u, commits c, projects p, project_commits pc
318 | where date(c.created_at) = date(now())
319 | and pc.commit_id = c.id and pc.project_id = p.id  -- link each commit to its project; without this p ranges over all projects
320 | and c.author_id = u.id
321 | and u.country_code = 'nl'
322 | and 'java' = (select pl.language
323 |               from project_languages pl
324 |               where pl.project_id = p.id
325 |               order by pl.created_at desc, pl.bytes desc
326 |               limit 1)
327 | {%endhighlight%}
328 | 
329 | 


--------------------------------------------------------------------------------
/pullreq-perf/report.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: page
  3 | title: Performance report for `r owner`/`r repo`
  4 | ---
  5 | 
  6 | ```{r preample, include=FALSE}
  7 | 
  8 | #
  9 | # (c) 2012 -- 2014 Georgios Gousios <gousiosg@gmail.com>
 10 | #
 11 | # BSD licensed, see LICENSE in top level dir
 12 | #
 13 | 
 14 | library(ggplot2)
 15 | library(reshape)
 16 | library(plyr)
 17 | library(sqldf)
 18 | 
 19 | unwrap <- function(str) {  # Re-flow `str` with strwrap() at a 10000-column width; used on the multi-line query literals before sprintf()
 20 |     strwrap(str, width=10000, simplify=TRUE)
 21 | }
 22 | 
 23 | # Get the project id
 24 |   q <- "
 25 |     select p.id 
 26 |     from projects p, users u 
 27 |     where u.id = p.owner_id 
 28 |       and u.login='%s' 
 29 |       and p.name = '%s' 
 30 |       and p.forked_from is null
 31 |   "
 32 | 
 33 |   res <- dbSendQuery(db, sprintf(unwrap(q), owner, repo))
 34 |   df <- fetch(res, n = -1)
 35 |   pid <- df$id[[1]]
 36 | ```
 37 | 
 38 | ### Pull request backlog
 39 | ```{r plot6, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE}
 40 | 
 41 |   q <- "
 42 |     select pr.pullreq_id, prh1.created_at as opened, ifnull(prh2.created_at, now()) as closed
 43 |     from pull_request_history prh1,
 44 |       pull_requests pr left outer join pull_request_history prh2
 45 |         on pr.id = prh2.pull_request_id
 46 |         and prh2.action = 'closed'
 47 |     where pr.id = prh1.pull_request_id
 48 |       and prh1.action = 'opened'
 49 |       and pr.base_repo_id = %d
 50 |     group by pr.id
 51 |     order by pr.pullreq_id desc
 52 |   "
 53 | 
 54 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid))
 55 | 
 56 |   pullreq.open.close <- fetch(res, n = -1)
 57 |   pullreq.open.close$opened <- as.POSIXct(pullreq.open.close$opened)
 58 |   pullreq.open.close$closed <- as.POSIXct(pullreq.open.close$closed)
 59 | 
 60 |   pullreq.open.close$mopen <- strftime(pullreq.open.close$opened, format="%Y-%m")
 61 |   pullreq.open.close$mclose <- strftime(pullreq.open.close$closed, format="%Y-%m")
 62 |   
 63 |   backlog.stats <- aggregate(pullreq_id ~ mopen, pullreq.open.close, length)
 64 |   backlog.stats <- rename(backlog.stats, c('mopen' = 'month', 'pullreq_id' = 'New pullreqs'))
 65 | 
 66 |   a <- aggregate(pullreq_id ~ mclose, subset(pullreq.open.close, mopen == mclose), length)
 67 |   backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mclose', sort = FALSE, all = T)
 68 |   backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0
 69 |   backlog.stats <- rename(backlog.stats, c('pullreq_id' = 'New and closed'))
 70 | 
 71 |   a <- aggregate(pullreq_id ~ mopen, subset(pullreq.open.close, mopen != mclose), length)
 72 |   backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mopen', sort = FALSE, all = T)
 73 |   backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0
 74 |   backlog.stats <- rename(backlog.stats, c('pullreq_id' = 'New and left open'))
 75 |   
 76 |   a <- aggregate(pullreq_id ~ mclose, subset(pullreq.open.close, mopen != mclose), length)
 77 |   backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mclose', sort = FALSE, all = T)
 78 |   backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0
 79 |   backlog.stats <- rename(backlog.stats, c('pullreq_id' = 'Old and closed'))
 80 | 
 81 |   backlog.stats$month <- sprintf("%s-01", backlog.stats$month)
 82 |   backlog.stats$month <- strptime(backlog.stats$month, "%Y-%m-%d")
 83 |   backlog.stats$month <- as.POSIXct(backlog.stats$month)
 84 |   backlog.stats <- backlog.stats[!names(backlog.stats) %in% c("New pullreqs")]
 85 |   backlog.stats <- backlog.stats[order(backlog.stats[,1]),]
 86 |   backlog.stats <- backlog.stats[-nrow(backlog.stats),]
 87 | 
 88 |   backlog.stats <- melt(backlog.stats, id=c("month"))
 89 | 
 90 |   ggplot(backlog.stats) + 
 91 |     aes(x = month, y = value, fill = variable) + 
 92 |     scale_fill_discrete(name = "Per month") +
 93 |     geom_bar(stat = "identity") + 
 94 |     scale_x_datetime("Date")
 95 | 
 96 | 
 97 | ```
 98 | 
 99 | The pull request backlog presents the number of pull requests processed
100 | per month.
101 | Even though a month is a relatively coarse-grained period for pull requests
102 | (where review and acceptance/rejection 
103 | [happen very fast](http://www.gousios.gr/bibliography/GPD14.html)), the 
104 | backlog view can be helpful to get an idea of the overall activity within the
105 | project.
106 | 
107 | ### Slow Pull Request lifelines
108 | ```{r plot5, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE}
109 | 
110 |   lifetime.secs <- as.numeric(difftime(pullreq.open.close$closed, pullreq.open.close$opened, units = "secs"))  # fixed unit: the text below divides by 3600 * 24
111 |   perc.09 <- as.numeric(quantile(lifetime.secs, 0.9))  # 90th percentile of pull request lifetime, in seconds
112 |   slow.10 <- pullreq.open.close[which(lifetime.secs > perc.09), ]
113 |   num.slow10 <- nrow(slow.10)
114 |   num.fast90 <- sum(lifetime.secs <= perc.09, na.rm = TRUE)
115 |   ggplot(slow.10) +
116 |     geom_point(aes(y = pullreq_id, x = closed), colour = "red") +
117 |     geom_point(aes(y = pullreq_id, x = opened), colour = "green") +
118 |     geom_segment(aes(y=pullreq_id, yend = pullreq_id, x = opened, xend = closed), alpha = 0.4) +
119 |     scale_y_discrete("Pull Request Number", breaks = NULL) +
120 |     scale_x_datetime("Time (open/close)")
121 | ```
122 | In this plot, we can see the lifelines of the slowest 10% of pull requests.
123 | For this project, the cutoff is `r perc.09 / 3600 /24 ` days. `r num.slow10` 
124 | pull requests were processed slower than that, while `r num.fast90` were 
125 | faster. The line represents the time between opening and closing the pull request.
126 | Pull requests whose end time aligns at the right edge of the plot are still open
127 | at the time of building this report. Generally, it is considered good practice
128 | to avoid having pull requests open for long.
129 | 
130 | 
131 | ### Source of commits
132 | ```{r plot1, echo=FALSE, fig.align='center', warning=FALSE}
133 | 
134 |   q <- "
135 |     select a.month, a.total_commits - b.commits_from_pull_reqs as direct, b.commits_from_pull_reqs as pullreq 
136 |     from (
137 |       select last_day(c.created_at) as month, p.id as prid, count(c.id) as total_commits 
138 |       from commits c, projects p, project_commits pc 
139 |       where p.id=%d  
140 |         and p.id = pc.project_id 
141 |         and c.id = pc.commit_id 
142 |       group by month(c.created_at),year(c.created_at), p.id
143 |     ) as a, (
144 |       select last_day(c.created_at) as month, p.id as prid, count(prc.commit_id) as commits_from_pull_reqs 
145 |       from projects p, pull_requests pr, pull_request_commits prc, commits c, project_commits pc 
146 |       where p.id = %d 
147 |         and exists(
148 |           select prh.action 
149 |           from pull_request_history prh 
150 |           where prh.pull_request_id = pr.id 
151 |             and last_day(prh.created_at) between last_day(c.created_at) and 
152 |                                                  date_add(last_day(c.created_at), INTERVAL 1 MONTH) 
153 |             and prh.action='merged') 
154 |         and p.id = pr.base_repo_id and prc.commit_id = c.id 
155 |         and pc.project_id = p.id 
156 |         and pc.commit_id = c.id 
157 |         and pr.id = prc.pull_request_id 
158 |       group by month(c.created_at),year(c.created_at), p.id) as b 
159 |     where a.prid = b.prid and a.month = b.month 
160 |     order by a.month desc"
161 | 
162 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid, pid))
163 |   df <- fetch(res, n = -1)
164 |   df$month <- as.POSIXct(df$month)
165 |   # melt() folds the `direct` and `pullreq` columns into variable/value pairs
166 |   df <- melt(df, id=c('month'))
167 |   df <- rename(df, c("variable"="commit_source"))
168 | 
169 |   ggplot(df) +
170 |     aes(x = month, y = value, fill = commit_source) +
171 |     scale_x_datetime() +
172 |     geom_bar(stat="identity") +
173 |     xlab("Date") +
174 |     ylab("Commits") +
175 |     scale_fill_discrete(name = "source")  # the mapped aesthetic is fill, so the legend title belongs on the fill scale
176 | ```
177 | 
178 | This figure presents the source of commits in your project. The more commits 
179 | come from pull requests, the more open the project process is to accepting 
180 | contributions. However, pull requests may be used internally (across project 
181 | branches) so this might not entirely reflect the actual situation. 
182 | 
183 | ### Commits from the project community as percentage of total
184 | ```{r plot2, fig.keep='last', echo=FALSE, fig.align='center', warning=FALSE}
185 |   q <- "
186 |     select a.mon as date, a.intern as intern, b.extern as extern 
187 |     from (
188 |       select last_day(c.created_at) as mon, count(*) as intern 
189 |       from commits c, project_commits pc, project_members pm 
190 |       where c.id = pc.commit_id  
191 |         and pm.repo_id = pc.project_id 
192 |         and c.author_id = pm.user_id 
193 |         and pc.project_id = %d 
194 |       group by mon order by mon) as a, 
195 |     (select last_day(c.created_at) as mon, count(*) as extern 
196 |     from commits c, project_commits pc 
197 |     where c.id = pc.commit_id  
198 |         and not exists (
199 |           select * 
200 |           from project_members pm 
201 |           where c.author_id = pm.user_id 
202 |             and pm.repo_id = pc.project_id) 
203 |         and pc.project_id = %d 
204 |         group by mon 
205 |         order by mon) as b 
206 |     where a.mon = b.mon
207 |       and a.mon > from_unixtime(1312156800)"
208 |   
209 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid, pid))
210 |   df <- fetch(res, n = -1)
211 |   df$date <- as.POSIXct(df$date)
212 |   df$ratio <- (df$extern / (df$intern + df$extern)) * 100  # share of commits by non-members; full column name, no `$` partial matching
213 | 
214 |   ggplot(df) + 
215 |     aes(x = date, y = ratio) + 
216 |     scale_x_datetime() + 
217 |     geom_line(size = 2) + 
218 |     stat_smooth(method = "loess", formula = y ~ x^2, size = 2, alpha = 0) + 
219 |     xlab("Date") +  ylab("Percentage of commits from community")
220 | 
221 | ```
222 | 
223 | Percentage of total commits (and trendline) coming from the community. The more 
224 | commits coming from the community, the more this project is a community effort.
225 | 
226 | ### Comments and commenters from the community
227 | ```{r plot3, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE}
228 |   q <- "
229 |     select last_day(a.mon) as mon, (
230 |       select count(pm.user_id) 
231 |       from project_members pm 
232 |       where pm.user_id = a.user_id and pm.repo_id = a.p_id) as is_member, 
233 |       count(distinct user_id) as num_users, 
234 |       sum(a.cnt) as num_comments  
235 |     from (
236 |       select last_day(ic.created_at) as mon, pr.base_repo_id as p_id, ic.user_id as user_id, count(ic.comment_id) as cnt 
237 |       from projects p 
238 |         join pull_requests pr on p.id = pr.base_repo_id 
239 |         left outer join issues i on pr.pullreq_id = i.issue_id 
240 |         left outer join issue_comments ic on i.id = ic.issue_id 
241 |       where p.forked_from is null 
242 |         and p.id = %d 
243 |         and pr.base_repo_id = i.repo_id 
244 |         group by mon, pr.base_repo_id, ic.user_id) as a, 
245 |       projects p 
246 |     where p.id = a.p_id 
247 |     group by mon, is_member
248 |   "
249 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid))
250 |   df <- fetch(res, n = -1)
251 |   df <- subset(df, !is.na(mon))
252 |   df$is_member <- factor(df$is_member)
253 |   df$mon <- as.POSIXct(df$mon)
254 |   
255 |   q <- "
256 |     select d.mon, (
257 |       select sum(df1.num_comments) 
258 |       from df df1 
259 |       where df1.mon = d.mon 
260 |         and df1.is_member = 0) *100/sum(d.num_comments) as comments, 
261 |       (select sum(df1.num_users) 
262 |       from df df1 
263 |       where df1.mon = d.mon 
264 |         and df1.is_member = 0) * 100/sum(d.num_users) as commenters 
265 |     from df d 
266 |     group by d.mon
267 |   "
268 | 
269 |   df <- sqldf(q, drv="SQLite")
270 |   df <- melt(df, 'mon', na.rm = TRUE)
271 |   df$variable <- as.factor(df$variable)
272 |   df$value <- as.numeric(as.character(df$value))
273 | 
274 |   ggplot(df, aes(x = mon, y = value, fill = variable)) + 
275 |     scale_x_datetime() + 
276 |     geom_bar(position = 'dodge', stat = "identity") + 
277 |     xlab("Date") + ylab("% from community") + 
278 |     facet_grid(. ~ variable) + 
279 |     theme(legend.position="none") +
280 |     scale_y_continuous(limits = c(0, 100))
281 | 
282 | ```
283 | Percentage of comments (left) and people that commented (right) coming from 
284 | outside the project's core development team. The more comments coming from the 
285 | community, the more welcoming the project is to outsiders.
286 | 
287 | ### Project forks: Total and contributing
288 | ```{r plot4, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE}
289 |   q <- "
290 |     select last_day(p.created_at) as month, count(*) as created 
291 |     from projects p 
292 |     where p.forked_from = (
293 |       select p.id 
294 |       from projects p 
295 |       where p.id = %d) 
296 |     group by month"
297 | 
298 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid))
299 |   forks <- fetch(res, n = -1)
300 | 
301 |   q <- "
302 |     select last_day(p.created_at) as month, count(*) as contributing 
303 |     from projects p 
304 |     where p.forked_from = (
305 |       select p.id 
306 |       from projects p 
307 |       where p.id = %d) 
308 |     and exists (
309 |       select * 
310 |       from pull_requests pr 
311 |       where pr.head_repo_id = p.id) 
312 |     group by month
313 |   "
314 | 
315 |   res <- dbSendQuery(db, sprintf(unwrap(q), pid))
316 |   contrib <- fetch(res, n = -1)
317 |   df <- merge(forks, contrib, by = 'month', all.x = TRUE)  # keep months in which forks were created but none contributed back
318 |   df$contributing[is.na(df$contributing)] <- 0
319 |   df$month <- as.POSIXct(df$month)
320 |   df <- melt(df, id=c('month'))
321 |   df <- rename(df, c("variable"="forks"))
322 | 
323 |   ggplot(df) + 
324 |     aes(x = month, y = value, fill = forks) + 
325 |     scale_x_datetime() + 
326 |     geom_freqpoly(aes(group = forks, colour = forks), stat="identity", size = 2) + 
327 |     xlab("Date") + ylab("Number of forks")
328 | ```
329 | 
330 | This is a plot of forks created per month versus forks contributing code back 
331 | (in the form of pull requests) per month. Ideally, all forks should contribute 
332 | back. In a healthy community, the monthly number of forks contributing should be 
333 | increasing, as the total number of forks increases.
334 | 
335 | <br/>
336 | <small>Generated at: `r date()`</small>
337 | 


--------------------------------------------------------------------------------
/assets/themes/twitter/bootstrap/css/bootstrap-responsive.min.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Bootstrap Responsive v2.3.0
 3 |  *
 4 |  * Copyright 2012 Twitter, Inc
 5 |  * Licensed under the Apache License v2.0
 6 |  * http://www.apache.org/licenses/LICENSE-2.0
 7 |  *
 8 |  * Designed and built with all the love in the world @twitter by @mdo and @fat.
 9 |  */.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;line-height:0;content:""}.clearfix:after{clear:both}.hide-text{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.input-block-level{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}@-ms-viewport{width:device-width}.hidden{display:none;visibility:hidden}.visible-phone{display:none!important}.visible-tablet{display:none!important}.hidden-desktop{display:none!important}.visible-desktop{display:inherit!important}@media(min-width:768px) and (max-width:979px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-tablet{display:inherit!important}.hidden-tablet{display:none!important}}@media(max-width:767px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-phone{display:inherit!important}.hidden-phone{display:none!important}}.visible-print{display:none!important}@media print{.visible-print{display:inherit!important}.hidden-print{display:none!important}}@media(min-width:1200px){.row{margin-left:-30px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:30px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom 
.container{width:1170px}.span12{width:1170px}.span11{width:1070px}.span10{width:970px}.span9{width:870px}.span8{width:770px}.span7{width:670px}.span6{width:570px}.span5{width:470px}.span4{width:370px}.span3{width:270px}.span2{width:170px}.span1{width:70px}.offset12{margin-left:1230px}.offset11{margin-left:1130px}.offset10{margin-left:1030px}.offset9{margin-left:930px}.offset8{margin-left:830px}.offset7{margin-left:730px}.offset6{margin-left:630px}.offset5{margin-left:530px}.offset4{margin-left:430px}.offset3{margin-left:330px}.offset2{margin-left:230px}.offset1{margin-left:130px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid [class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.564102564102564%;*margin-left:2.5109110747408616%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.564102564102564%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.45299145299145%;*width:91.39979996362975%}.row-fluid .span10{width:82.90598290598291%;*width:82.8527914166212%}.row-fluid .span9{width:74.35897435897436%;*width:74.30578286961266%}.row-fluid .span8{width:65.81196581196582%;*width:65.75877432260411%}.row-fluid .span7{width:57.26495726495726%;*width:57.21176577559556%}.row-fluid .span6{width:48.717948717948715%;*width:48.664757228587014%}.row-fluid .span5{width:40.17094017094017%;*width:40.11774868157847%}.row-fluid .span4{width:31.623931623931625%;*width:31.570740134569924%}.row-fluid .span3{width:23.076923076923077%;*width:23.023731587561375%}.row-fluid .span2{width:14.52991452991453%;*width:14.476723040552828%}.row-fluid .span1{width:5.982905982905983%;*width:5.929714493544281%}.row-fluid 
.offset12{margin-left:105.12820512820512%;*margin-left:105.02182214948171%}.row-fluid .offset12:first-child{margin-left:102.56410256410257%;*margin-left:102.45771958537915%}.row-fluid .offset11{margin-left:96.58119658119658%;*margin-left:96.47481360247316%}.row-fluid .offset11:first-child{margin-left:94.01709401709402%;*margin-left:93.91071103837061%}.row-fluid .offset10{margin-left:88.03418803418803%;*margin-left:87.92780505546462%}.row-fluid .offset10:first-child{margin-left:85.47008547008548%;*margin-left:85.36370249136206%}.row-fluid .offset9{margin-left:79.48717948717949%;*margin-left:79.38079650845607%}.row-fluid .offset9:first-child{margin-left:76.92307692307693%;*margin-left:76.81669394435352%}.row-fluid .offset8{margin-left:70.94017094017094%;*margin-left:70.83378796144753%}.row-fluid .offset8:first-child{margin-left:68.37606837606839%;*margin-left:68.26968539734497%}.row-fluid .offset7{margin-left:62.393162393162385%;*margin-left:62.28677941443899%}.row-fluid .offset7:first-child{margin-left:59.82905982905982%;*margin-left:59.72267685033642%}.row-fluid .offset6{margin-left:53.84615384615384%;*margin-left:53.739770867430444%}.row-fluid .offset6:first-child{margin-left:51.28205128205128%;*margin-left:51.175668303327875%}.row-fluid .offset5{margin-left:45.299145299145295%;*margin-left:45.1927623204219%}.row-fluid .offset5:first-child{margin-left:42.73504273504273%;*margin-left:42.62865975631933%}.row-fluid .offset4{margin-left:36.75213675213675%;*margin-left:36.645753773413354%}.row-fluid .offset4:first-child{margin-left:34.18803418803419%;*margin-left:34.081651209310785%}.row-fluid .offset3{margin-left:28.205128205128204%;*margin-left:28.0987452264048%}.row-fluid .offset3:first-child{margin-left:25.641025641025642%;*margin-left:25.53464266230224%}.row-fluid .offset2{margin-left:19.65811965811966%;*margin-left:19.551736679396257%}.row-fluid .offset2:first-child{margin-left:17.094017094017094%;*margin-left:16.98763411529369%}.row-fluid 
.offset1{margin-left:11.11111111111111%;*margin-left:11.004728132387708%}.row-fluid .offset1:first-child{margin-left:8.547008547008547%;*margin-left:8.440625568285142%}input,textarea,.uneditable-input{margin-left:0}.controls-row [class*="span"]+[class*="span"]{margin-left:30px}input.span12,textarea.span12,.uneditable-input.span12{width:1156px}input.span11,textarea.span11,.uneditable-input.span11{width:1056px}input.span10,textarea.span10,.uneditable-input.span10{width:956px}input.span9,textarea.span9,.uneditable-input.span9{width:856px}input.span8,textarea.span8,.uneditable-input.span8{width:756px}input.span7,textarea.span7,.uneditable-input.span7{width:656px}input.span6,textarea.span6,.uneditable-input.span6{width:556px}input.span5,textarea.span5,.uneditable-input.span5{width:456px}input.span4,textarea.span4,.uneditable-input.span4{width:356px}input.span3,textarea.span3,.uneditable-input.span3{width:256px}input.span2,textarea.span2,.uneditable-input.span2{width:156px}input.span1,textarea.span1,.uneditable-input.span1{width:56px}.thumbnails{margin-left:-30px}.thumbnails>li{margin-left:30px}.row-fluid .thumbnails{margin-left:0}}@media(min-width:768px) and (max-width:979px){.row{margin-left:-20px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:20px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom 
.container{width:724px}.span12{width:724px}.span11{width:662px}.span10{width:600px}.span9{width:538px}.span8{width:476px}.span7{width:414px}.span6{width:352px}.span5{width:290px}.span4{width:228px}.span3{width:166px}.span2{width:104px}.span1{width:42px}.offset12{margin-left:764px}.offset11{margin-left:702px}.offset10{margin-left:640px}.offset9{margin-left:578px}.offset8{margin-left:516px}.offset7{margin-left:454px}.offset6{margin-left:392px}.offset5{margin-left:330px}.offset4{margin-left:268px}.offset3{margin-left:206px}.offset2{margin-left:144px}.offset1{margin-left:82px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid [class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.7624309392265194%;*margin-left:2.709239449864817%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.7624309392265194%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.43646408839778%;*width:91.38327259903608%}.row-fluid .span10{width:82.87292817679558%;*width:82.81973668743387%}.row-fluid .span9{width:74.30939226519337%;*width:74.25620077583166%}.row-fluid .span8{width:65.74585635359117%;*width:65.69266486422946%}.row-fluid .span7{width:57.18232044198895%;*width:57.12912895262725%}.row-fluid .span6{width:48.61878453038674%;*width:48.56559304102504%}.row-fluid .span5{width:40.05524861878453%;*width:40.00205712942283%}.row-fluid .span4{width:31.491712707182323%;*width:31.43852121782062%}.row-fluid .span3{width:22.92817679558011%;*width:22.87498530621841%}.row-fluid .span2{width:14.3646408839779%;*width:14.311449394616199%}.row-fluid .span1{width:5.801104972375691%;*width:5.747913483013988%}.row-fluid .offset12{margin-left:105.52486187845304%;*margin-left:105.41847889972962%}.row-fluid 
.offset12:first-child{margin-left:102.76243093922652%;*margin-left:102.6560479605031%}.row-fluid .offset11{margin-left:96.96132596685082%;*margin-left:96.8549429881274%}.row-fluid .offset11:first-child{margin-left:94.1988950276243%;*margin-left:94.09251204890089%}.row-fluid .offset10{margin-left:88.39779005524862%;*margin-left:88.2914070765252%}.row-fluid .offset10:first-child{margin-left:85.6353591160221%;*margin-left:85.52897613729868%}.row-fluid .offset9{margin-left:79.8342541436464%;*margin-left:79.72787116492299%}.row-fluid .offset9:first-child{margin-left:77.07182320441989%;*margin-left:76.96544022569647%}.row-fluid .offset8{margin-left:71.2707182320442%;*margin-left:71.16433525332079%}.row-fluid .offset8:first-child{margin-left:68.50828729281768%;*margin-left:68.40190431409427%}.row-fluid .offset7{margin-left:62.70718232044199%;*margin-left:62.600799341718584%}.row-fluid .offset7:first-child{margin-left:59.94475138121547%;*margin-left:59.838368402492065%}.row-fluid .offset6{margin-left:54.14364640883978%;*margin-left:54.037263430116376%}.row-fluid .offset6:first-child{margin-left:51.38121546961326%;*margin-left:51.27483249088986%}.row-fluid .offset5{margin-left:45.58011049723757%;*margin-left:45.47372751851417%}.row-fluid .offset5:first-child{margin-left:42.81767955801105%;*margin-left:42.71129657928765%}.row-fluid .offset4{margin-left:37.01657458563536%;*margin-left:36.91019160691196%}.row-fluid .offset4:first-child{margin-left:34.25414364640884%;*margin-left:34.14776066768544%}.row-fluid .offset3{margin-left:28.45303867403315%;*margin-left:28.346655695309746%}.row-fluid .offset3:first-child{margin-left:25.69060773480663%;*margin-left:25.584224756083227%}.row-fluid .offset2{margin-left:19.88950276243094%;*margin-left:19.783119783707537%}.row-fluid .offset2:first-child{margin-left:17.12707182320442%;*margin-left:17.02068884448102%}.row-fluid .offset1{margin-left:11.32596685082873%;*margin-left:11.219583872105325%}.row-fluid 
.offset1:first-child{margin-left:8.56353591160221%;*margin-left:8.457152932878806%}input,textarea,.uneditable-input{margin-left:0}.controls-row [class*="span"]+[class*="span"]{margin-left:20px}input.span12,textarea.span12,.uneditable-input.span12{width:710px}input.span11,textarea.span11,.uneditable-input.span11{width:648px}input.span10,textarea.span10,.uneditable-input.span10{width:586px}input.span9,textarea.span9,.uneditable-input.span9{width:524px}input.span8,textarea.span8,.uneditable-input.span8{width:462px}input.span7,textarea.span7,.uneditable-input.span7{width:400px}input.span6,textarea.span6,.uneditable-input.span6{width:338px}input.span5,textarea.span5,.uneditable-input.span5{width:276px}input.span4,textarea.span4,.uneditable-input.span4{width:214px}input.span3,textarea.span3,.uneditable-input.span3{width:152px}input.span2,textarea.span2,.uneditable-input.span2{width:90px}input.span1,textarea.span1,.uneditable-input.span1{width:28px}}@media(max-width:767px){body{padding-right:20px;padding-left:20px}.navbar-fixed-top,.navbar-fixed-bottom,.navbar-static-top{margin-right:-20px;margin-left:-20px}.container-fluid{padding:0}.dl-horizontal dt{float:none;width:auto;clear:none;text-align:left}.dl-horizontal dd{margin-left:0}.container{width:auto}.row-fluid{width:100%}.row,.thumbnails{margin-left:0}.thumbnails>li{float:none;margin-left:0}[class*="span"],.uneditable-input[class*="span"],.row-fluid [class*="span"]{display:block;float:none;width:100%;margin-left:0;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.span12,.row-fluid .span12{width:100%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="offset"]:first-child{margin-left:0}.input-large,.input-xlarge,.input-xxlarge,input[class*="span"],select[class*="span"],textarea[class*="span"],.uneditable-input{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.input-prepend 
input,.input-append input,.input-prepend input[class*="span"],.input-append input[class*="span"]{display:inline-block;width:auto}.controls-row [class*="span"]+[class*="span"]{margin-left:0}.modal{position:fixed;top:20px;right:20px;left:20px;width:auto;margin:0}.modal.fade{top:-100px}.modal.fade.in{top:20px}}@media(max-width:480px){.nav-collapse{-webkit-transform:translate3d(0,0,0)}.page-header h1 small{display:block;line-height:20px}input[type="checkbox"],input[type="radio"]{border:1px solid #ccc}.form-horizontal .control-label{float:none;width:auto;padding-top:0;text-align:left}.form-horizontal .controls{margin-left:0}.form-horizontal .control-list{padding-top:0}.form-horizontal .form-actions{padding-right:10px;padding-left:10px}.media .pull-left,.media .pull-right{display:block;float:none;margin-bottom:10px}.media-object{margin-right:0;margin-left:0}.modal{top:10px;right:10px;left:10px}.modal-header .close{padding:10px;margin:-10px}.carousel-caption{position:static}}@media(max-width:979px){body{padding-top:0}.navbar-fixed-top,.navbar-fixed-bottom{position:static}.navbar-fixed-top{margin-bottom:20px}.navbar-fixed-bottom{margin-top:20px}.navbar-fixed-top .navbar-inner,.navbar-fixed-bottom .navbar-inner{padding:5px}.navbar .container{width:auto;padding:0}.navbar .brand{padding-right:10px;padding-left:10px;margin:0 0 0 -5px}.nav-collapse{clear:both}.nav-collapse .nav{float:none;margin:0 0 10px}.nav-collapse .nav>li{float:none}.nav-collapse .nav>li>a{margin-bottom:2px}.nav-collapse .nav>.divider-vertical{display:none}.nav-collapse .nav .nav-header{color:#777;text-shadow:none}.nav-collapse .nav>li>a,.nav-collapse .dropdown-menu a{padding:9px 15px;font-weight:bold;color:#777;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.nav-collapse .btn{padding:4px 10px 4px;font-weight:normal;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.nav-collapse .dropdown-menu li+li a{margin-bottom:2px}.nav-collapse .nav>li>a:hover,.nav-collapse 
.nav>li>a:focus,.nav-collapse .dropdown-menu a:hover,.nav-collapse .dropdown-menu a:focus{background-color:#f2f2f2}.navbar-inverse .nav-collapse .nav>li>a,.navbar-inverse .nav-collapse .dropdown-menu a{color:#999}.navbar-inverse .nav-collapse .nav>li>a:hover,.navbar-inverse .nav-collapse .nav>li>a:focus,.navbar-inverse .nav-collapse .dropdown-menu a:hover,.navbar-inverse .nav-collapse .dropdown-menu a:focus{background-color:#111}.nav-collapse.in .btn-group{padding:0;margin-top:5px}.nav-collapse .dropdown-menu{position:static;top:auto;left:auto;display:none;float:none;max-width:none;padding:0;margin:0 15px;background-color:transparent;border:0;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none}.nav-collapse .open>.dropdown-menu{display:block}.nav-collapse .dropdown-menu:before,.nav-collapse .dropdown-menu:after{display:none}.nav-collapse .dropdown-menu .divider{display:none}.nav-collapse .nav>li>.dropdown-menu:before,.nav-collapse .nav>li>.dropdown-menu:after{display:none}.nav-collapse .navbar-form,.nav-collapse .navbar-search{float:none;padding:10px 15px;margin:10px 0;border-top:1px solid #f2f2f2;border-bottom:1px solid #f2f2f2;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);-moz-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1)}.navbar-inverse .nav-collapse .navbar-form,.navbar-inverse .nav-collapse .navbar-search{border-top-color:#111;border-bottom-color:#111}.navbar .nav-collapse .nav.pull-right{float:none;margin-left:0}.nav-collapse,.nav-collapse.collapse{height:0;overflow:hidden}.navbar .btn-navbar{display:block}.navbar-static .navbar-inner{padding-right:10px;padding-left:10px}}@media(min-width:980px){.nav-collapse.collapse{height:auto!important;overflow:visible!important}}
10 | 


--------------------------------------------------------------------------------