├── 404.html ├── local.css ├── _plugins ├── ext.rb ├── mdhtml.rb └── debug.rb ├── files ├── mslogo.png ├── rulogo.gif ├── schema.pdf ├── schema.png ├── tudelftlogo.png └── ghtorrent-data.pdf ├── vm.md ├── README.md ├── Gemfile ├── .gitignore ├── assets └── themes │ └── twitter │ ├── bootstrap │ ├── img │ │ ├── glyphicons-halflings.png │ │ └── glyphicons-halflings-white.png │ └── css │ │ └── bootstrap-responsive.min.css │ └── css │ └── style.css ├── sitemap.txt ├── pages.html ├── _layouts ├── page.html └── default.html ├── dumps ├── update-downloads.sh ├── index.erb ├── run-all.sh ├── index.rb └── ght-periodic-dump ├── stats ├── extract-events.sh ├── extract-stats.sh ├── genstats.sh ├── index.md └── api-stats.R ├── docs.md ├── _includes └── comments.html ├── atom.xml ├── _config.yml ├── contrib.md ├── downloads.md ├── basedupon.md ├── lean.html ├── mysql.md ├── services.md ├── raw.md ├── pullreq-perf ├── openess-report.R ├── index.Rmd └── report.Rmd ├── vissoft14.md ├── syntax.css ├── geninst.md ├── mongo.md ├── cookbook.md ├── leanprogress.html ├── index.md ├── gcloud.md ├── streaming.md ├── pers-data.md ├── halloffame.md ├── ght-ubuntu.md ├── _bibliography └── references.bib ├── faq.md ├── msr14.md ├── Rakefile └── relational.md /404.html: -------------------------------------------------------------------------------- 1 | Sorry this page does not exist =( 2 | -------------------------------------------------------------------------------- /local.css: -------------------------------------------------------------------------------- 1 | h4 { 2 | font-weight: bold; 3 | } 4 | -------------------------------------------------------------------------------- /_plugins/ext.rb: -------------------------------------------------------------------------------- 1 | require 'jekyll/scholar' 2 | require 'pp' 3 | puts 'Loaded scholar' 4 | 5 | -------------------------------------------------------------------------------- /files/mslogo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/mslogo.png -------------------------------------------------------------------------------- /files/rulogo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/rulogo.gif -------------------------------------------------------------------------------- /files/schema.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.pdf -------------------------------------------------------------------------------- /files/schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.png -------------------------------------------------------------------------------- /vm.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Client VM 4 | tagline: 5 | --- 6 | 7 | Under Construction! 
8 | -------------------------------------------------------------------------------- /files/tudelftlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/tudelftlogo.png -------------------------------------------------------------------------------- /files/ghtorrent-data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/ghtorrent-data.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Installing 2 | 3 | ```` 4 | su 5 | apt-get install ruby 6 | gem install jekyll jekyll-scholar 7 | ```` 8 | 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'rouge' 4 | gem 'rdiscount' 5 | gem 'jekyll-watch' 6 | gem 'jekyll-scholar' 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _site/* 2 | _theme_packages/* 3 | pullreq-perf/*/ 4 | *~ 5 | Thumbs.db 6 | .DS_Store 7 | 8 | !.gitkeep 9 | 10 | .rbenv-version 11 | .rvmrc 12 | -------------------------------------------------------------------------------- /assets/themes/twitter/bootstrap/img/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png -------------------------------------------------------------------------------- /assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png -------------------------------------------------------------------------------- /sitemap.txt: -------------------------------------------------------------------------------- 1 | --- 2 | # Remember to set production_url in your _config.yml file! 3 | title : Sitemap 4 | --- 5 | {% for page in site.pages %} 6 | {{site.production_url}}{{ page.url }}{% endfor %} 7 | {% for post in site.posts %} 8 | {{site.production_url}}{{ post.url }}{% endfor %} -------------------------------------------------------------------------------- /pages.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Pages 4 | header: Pages 5 | group: navigation 6 | --- 7 | 8 |

All Pages

9 | 14 | -------------------------------------------------------------------------------- /_layouts/page.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | 8 | 9 |
10 |
11 | {{ content }} 12 |
13 |
14 | -------------------------------------------------------------------------------- /_plugins/mdhtml.rb: -------------------------------------------------------------------------------- 1 | module Jekyll 2 | class MarkdownBlock < Liquid::Block 3 | def initialize(tag_name, text, tokens) 4 | super 5 | end 6 | require "kramdown" 7 | def render(context) 8 | content = super 9 | "#{Kramdown::Document.new(content).to_html}" 10 | end 11 | end 12 | end 13 | Liquid::Template.register_tag('markdown', Jekyll::MarkdownBlock) 14 | -------------------------------------------------------------------------------- /dumps/update-downloads.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ./index.rb downloads/ > torrents 3 | cd ghtorrent.org 4 | cat downloads.md | sed -n '/### Available/q;p' > dl.tmp 5 | echo "### Available Downloads" >> dl.tmp 6 | cat dl.tmp ../torrents > downloads.md 7 | git stash 8 | git pull 9 | git stash pop 10 | git commit -a -m "Dump `date +'%Y-%m-%d'`" 11 | git push 12 | rm dl.tmp 13 | cd - 14 | rm torrents 15 | -------------------------------------------------------------------------------- /stats/extract-events.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | doit() { 4 | grep Processed $1| 5 | perl -lape 's/\[([T0-9-:.]*)\ .*\].* event:\ ([^-]*)/$1 $2/'| 6 | cut -f2,3 -d' '| 7 | cut -f1,2,3 -d'-'| 8 | ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t). 
to_i," ", i, " ", d;' 9 | } 10 | 11 | export -f doit 12 | 13 | find mirror -type f|grep log.txt| parallel -j10 doit {} 14 | -------------------------------------------------------------------------------- /stats/extract-stats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | doit() { 4 | grep APIClient $1| 5 | grep -v WARN | 6 | perl -lape 's/\[([T0-9-:.]*).*\] DEBUG.*\[([0-9.]*)\].*Total: ([0-9]*) ms/$1 $2 $3/'| 7 | cut -f2,3,4 -d' '| 8 | ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t).to_i," ", i, " ", d;'| 9 | grep -v "#" 10 | } 11 | 12 | export -f doit 13 | 14 | find mirror -type f|grep log.txt| parallel -j10 doit {} 15 | -------------------------------------------------------------------------------- /stats/genstats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | scp ./extract-stats.sh dutihr:~ 4 | ssh dutihr /home/gousiosg/extract-stats.sh > foo 5 | echo "ts ip ms" > data.txt 6 | cat foo >> data.txt 7 | 8 | scp ./extract-events.sh dutihr:~ 9 | ssh dutihr /home/gousiosg/extract-events.sh |sed -e 's/^ \(.*\)$/\1/' > foo 10 | echo "ts event" >events.txt 11 | cat foo >> events.txt 12 | 13 | R --no-save < api-stats.R 14 | 15 | cd .. 16 | jekyll build 17 | 18 | -------------------------------------------------------------------------------- /stats/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Statistics 4 | tagline: 5 | --- 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
API response time timeseries plotNumber of request per timeslot
18 |       timeseries plot
26 | -------------------------------------------------------------------------------- /docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: The GHTorrent documentation 4 | tagline: 5 | --- 6 | 7 | ## Data formats 8 | 9 | * [The relational data schema](relational.html) 10 | * [Collections in the MongoDB database](mongo.html) 11 | 12 | ## Installing and running 13 | 14 | You can now use the 15 | [GHTorrent Vagrant](https://github.com/ghtorrent/ghtorrent-vagrant) box to set up 16 | a testing/development environment for GHTorrent! The GHTorrent Vagrant box 17 | completely automates the process below. 18 | 19 | * [Generic installation instructions](geninst.html) 20 | * [Installing on Ubuntu 10.10](ght-ubuntu.html) 21 | * [The GHTorrent cookbook](cookbook.html) 22 | 23 | -------------------------------------------------------------------------------- /dumps/index.erb: -------------------------------------------------------------------------------- 1 | 2 | List of available torrents (Last dump date: <%= @last_update %>) 3 | 4 | 5 | 6 | 7 | 8 | <% @collections.sort.each do |c| %> 9 | 10 | <% end %> 11 | 12 | 13 | 14 | <% for @d in @dumps.sort{ |a,b| a.date <=> b.date} %> 15 | 16 | 17 | <% for @col in @collections.sort %> 18 | <% @t = @d.torrents[@col] %> 19 | 23 | <% end %> 24 | 25 | <% end %> 26 | 27 |
Dump date<%=c.gsub("_", " ") %>
<%= @d.date %><% unless @t.nil? %> 20 | <%= @t.size%> MB 21 | <% end %> 22 |
28 | 29 | -------------------------------------------------------------------------------- /_includes/comments.html: -------------------------------------------------------------------------------- 1 |
2 | 16 | 19 | comments powered by Disqus 20 | -------------------------------------------------------------------------------- /atom.xml: -------------------------------------------------------------------------------- 1 | --- 2 | layout: nil 3 | title : Atom Feed 4 | --- 5 | 6 | 7 | 8 | {{ site.title }} 9 | 10 | 11 | {{ site.time | date_to_xmlschema }} 12 | {{ site.production_url }} 13 | 14 | {{ site.author.name }} 15 | {{ site.author.email }} 16 | 17 | 18 | {% for post in site.posts %} 19 | 20 | {{ post.title }} 21 | 22 | {{ post.date | date_to_xmlschema }} 23 | {{ site.production_url }}{{ post.id }} 24 | {{ post.content | xml_escape }} 25 | 26 | {% endfor %} 27 | 28 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | permalink: /:categories/:year/:month/:day/:title 2 | 3 | exclude: [".rvmrc", ".rbenv-version", "README.md", "Rakefile", "changelog.md"] 4 | highlighter: "rouge" 5 | #markdown: kramdown 6 | markdown: rdiscount 7 | 8 | # Themes are encouraged to use these universal variable 9 | # so be sure to set them if your theme uses them. 10 | title : GHTorrent 11 | tagline: Query Github data! 12 | author : 13 | name : Georgios Gousios 14 | email : gousiosg@gmail.com 15 | github : gousiosg 16 | twitter : gousiosg 17 | feedburner : 18 | 19 | production_url : http://www.ghtorrent.org 20 | 21 | scholar: 22 | style: apa 23 | locale: en 24 | 25 | sort_by: none 26 | order: ascending 27 | 28 | source: ./_bibliography 29 | bibliography: references 30 | bibliography_template: "{{reference}}" 31 | 32 | replace_strings: true 33 | 34 | details_dir: bibliography 35 | details_layout: bib_details.html 36 | details_link: more... 
37 | 38 | query: "@*" 39 | -------------------------------------------------------------------------------- /_plugins/debug.rb: -------------------------------------------------------------------------------- 1 | # A simple way to inspect liquid template variables. 2 | # Usage: 3 | # Can be used anywhere liquid syntax is parsed (templates, includes, posts/pages) 4 | # {{ site | debug }} 5 | # {{ site.posts | debug }} 6 | # 7 | require 'pp' 8 | module Jekyll 9 | # Need to overwrite the inspect method here because the original 10 | # uses < > to encapsulate the pseudo post/page objects in which case 11 | # the output is taken for HTML tags and hidden from view. 12 | # 13 | class Post 14 | def inspect 15 | "#Jekyll:Post @id=#{self.id.inspect}" 16 | end 17 | end 18 | 19 | class Page 20 | def inspect 21 | "#Jekyll:Page @name=#{self.name.inspect}" 22 | end 23 | end 24 | 25 | end # Jekyll 26 | 27 | module Jekyll 28 | module DebugFilter 29 | 30 | def debug(obj, stdout=false) 31 | puts obj.pretty_inspect if stdout 32 | "
#{obj.class}\n#{obj.pretty_inspect}
" 33 | end 34 | 35 | end # DebugFilter 36 | end # Jekyll 37 | 38 | Liquid::Template.register_filter(Jekyll::DebugFilter) -------------------------------------------------------------------------------- /dumps/run-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #./ght-periodic-dump -f '2012-01-31 00:00' -t '2012-03-31 00:00' 4 | #./update-downloads.sh 5 | 6 | #./ght-periodic-dump -f '2012-03-31 00:00' -t '2012-05-31 00:00' 7 | #./update-downloads.sh 8 | 9 | #./ght-periodic-dump -f '2012-05-31 00:00' -t '2012-07-31 00:00' 10 | #./update-downloads.sh 11 | 12 | #./ght-periodic-dump -f '2012-07-31 00:00' -t '2012-09-30 00:00' 13 | #./update-downloads.sh 14 | 15 | #./ght-periodic-dump -f '2012-09-30 00:00' -t '2012-11-30 00:00' 16 | #./update-downloads.sh 17 | 18 | #./ght-periodic-dump -f '2012-11-30 00:00' -t '2013-01-30 00:00' 19 | #./update-downloads.sh 20 | 21 | #./ght-periodic-dump -f '2013-01-30 00:00' -t '2013-03-30 00:00' 22 | #./update-downloads.sh 23 | 24 | #./ght-periodic-dump -f '2013-03-30 00:00' -t '2013-05-30 00:00' 25 | #./update-downloads.sh 26 | 27 | #./ght-periodic-dump -f '2013-05-30 00:00' -t '2013-07-30 00:00' 28 | #./update-downloads.sh 29 | 30 | #./ght-periodic-dump -f '2013-07-30 00:00' -t '2013-09-30 00:00' 31 | #./update-downloads.sh 32 | 33 | ./ght-periodic-dump -f '2013-09-30 00:00' -t '2013-11-30 00:00' 34 | #./update-downloads.sh 35 | 36 | ./ght-periodic-dump -f '2013-11-30 00:00' -t '2014-01-30 00:00' 37 | #./update-downloads.sh 38 | 39 | -------------------------------------------------------------------------------- /contrib.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Contributing to GHTorrent 4 | tagline: 5 | --- 6 | 7 | Do you consider contributing to GHTorrent? That's great! We value any 8 | contribution, no matter how small, big, simple or sophisticated it is. 
If you 9 | decide to send a pull request, we will actively help you to get your pull 10 | request integrated. 11 | 12 | ## Setting up GHTorrent 13 | 14 | The first thing to do is to ensure that you have a working GHTorrent 15 | environment. To do so, please consult the top level 16 | [README.md](https://github.com/gousiosg/github-mirror/blob/master/README.md) file with 17 | instructions on doing so. 18 | 19 | ## TODO list 20 | 21 | The TODO list is maintained as a collection of open [GitHub 22 | issues](https://github.com/gousiosg/github-mirror). Please feel free to adopt 23 | any of those by @mentioning the @ghtorrent user. 24 | 25 | ## New features 26 | 27 | Do you have a cool idea that will make GHTorrent 100x (or 0.01x) better? That's 28 | great! We look forward to reviewing your pull requests! We however advise you 29 | to: 30 | 31 | 1. Read the [open issue list](https://github.com/gousiosg/github-mirror/issues) 32 | 2. Contact the [GHTorrent mailing list](). The maintainers will help you 33 | implement your proposed feature as efficiently as possible and make sure 34 | that it does not conflict with any change currently planned. 35 | -------------------------------------------------------------------------------- /downloads.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Downloads 4 | tagline: 5 | --- 6 | 7 | ### What am I downloading? 8 | 9 | * The MySQL dump is a full, up to date database dump. You can use it for querying the latest available data. 10 | * The MongoDB dumps are incremental. They are provided mostly for reference and backup purposes, as they may contain duplicates. The reasons for this are the following: 11 | * When refreshing project data, old records are deleted and new are added. This cannot be reflected in the dumps (it is not practical to regenerate all dumps every time). 
12 | * The dumps have already been restored once, hence the dump dates do not 13 | represent the actual data generation dates. 14 | 15 | For those reasons, we recommend using the MongoDB data through our [query 16 | service](http://ghtorrent.org/raw.html). 17 | 18 | ### Which is the applicable license? 19 | 20 | See [here](faq.html) 21 | 22 | ### MySQL database dumps 23 | As of MySQL dump 24 | `mysql-2015-09-25`, we are distributing CSV files (one file per table) instead 25 | of `mysqldump` based backups. The provided archive expands to a directory 26 | including a restore script and instructions on how to do the restore. See more 27 | information [here](https://github.com/gousiosg/github-mirror/tree/master/sql). 28 | 29 | You can also [query MySQL](/dblite). It is always loaded with the latest 30 | dump. 31 | -------------------------------------------------------------------------------- /assets/themes/twitter/css/style.css: -------------------------------------------------------------------------------- 1 | /* Override some defaults */ 2 | html, body { 3 | background-color: #eee; 4 | } 5 | .navbar { 6 | margin-bottom: 0; 7 | } 8 | .container > footer { 9 | margin-top: 20px; 10 | } 11 | .container > footer p { 12 | text-align: center; /* center align it with the container */ 13 | } 14 | 15 | /* The white background content wrapper */ 16 | .content { 17 | background-color: #fff; 18 | padding: 20px; 19 | margin: 0 -20px; /* negative indent the amount of the padding to maintain the grid system */ 20 | -webkit-border-radius: 0 0 6px 6px; 21 | -moz-border-radius: 0 0 6px 6px; 22 | border-radius: 0 0 6px 6px; 23 | -webkit-box-shadow: 0 1px 2px rgba(0,0,0,.15); 24 | -moz-box-shadow: 0 1px 2px rgba(0,0,0,.15); 25 | box-shadow: 0 1px 2px rgba(0,0,0,.15); 26 | } 27 | 28 | /* Page header tweaks */ 29 | .page-header { 30 | background-color: #f5f5f5; 31 | padding: 20px 20px 10px; 32 | margin: -20px -20px 20px; 33 | } 34 | 35 | .topbar .btn { 36 | border: 0; 37 | } 38 | 39 | 
40 | /* tag_box ======================================================== */ 41 | 42 | .tag_box { 43 | list-style:none; 44 | margin:0; 45 | padding:5px 0 ; 46 | overflow:hidden; 47 | } 48 | .tag_box li { 49 | line-height:28px; 50 | } 51 | .tag_box.inline li { 52 | float:left; 53 | } 54 | .tag_box a { 55 | padding: 3px 6px; 56 | margin: 2px; 57 | background: #eee; 58 | color:#005F6B; 59 | border-radius: 3px; 60 | text-decoration:none; 61 | } 62 | .tag_box a span{ 63 | vertical-align:super; 64 | font-size:0.8em; 65 | } 66 | .tag_box a.active { 67 | background:#57A957; 68 | border:1px solid #4C964D; 69 | color:#FFF; 70 | } 71 | -------------------------------------------------------------------------------- /dumps/index.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'erb' 5 | require 'set' 6 | require 'date' 7 | 8 | class GHTorrent 9 | attr_reader :collections 10 | attr_reader :dumps 11 | 12 | def initialize(last_update) 13 | @last_update = last_update 14 | @dumps = Set.new 15 | @collections = Set.new 16 | end 17 | 18 | def add_dump(dump) 19 | @dumps << dump 20 | end 21 | 22 | def add_collection(col) 23 | @collections << col 24 | end 25 | 26 | # Expose private binding() method. 
27 | def get_binding 28 | binding() 29 | end 30 | 31 | end 32 | 33 | class Dump 34 | attr_reader :torrents 35 | attr_reader :date 36 | 37 | def initialize(torrents, date) 38 | @torrents = torrents 39 | @date = date 40 | end 41 | end 42 | 43 | class Torrent 44 | attr_reader :url 45 | attr_reader :name 46 | attr_reader :size 47 | attr_reader :date 48 | def initialize(url, name, size, date) 49 | @url = url 50 | @name = name 51 | @size = size 52 | @date = date 53 | end 54 | end 55 | 56 | url_prefix="http://ghtorrent.org/downloads" 57 | 58 | # Load the template 59 | file = File.open("index.erb").read 60 | rhtml = ERB.new(file) 61 | 62 | # Open the dir to read entries from 63 | dir = ARGV.shift 64 | 65 | if dir.nil? 66 | dir = "." 67 | end 68 | 69 | torrents = Dir.entries("#{dir}").map do |f| 70 | 71 | # Go through all torrent files and extract name of 72 | # dumped collection and dump date 73 | matches = /([a-z0-9_]+)-[a-z]+\.(.*)\.torrent/.match(f) 74 | next if matches.nil? 75 | 76 | # Calculate original file size 77 | dump = f.gsub(/.torrent/, ".tar.gz") 78 | size = File.stat(File.join(dir, dump)).size / 1024 / 1024 79 | 80 | date = Date.parse(matches[2]) 81 | 82 | if size > 0 83 | Torrent.new(url_prefix + "/" + dump, matches[1], size, date) 84 | end 85 | end.select{|x| !x.nil?} 86 | 87 | all_dates = torrents.inject(Set.new){|acc, t| acc << t.date} 88 | 89 | all_dumps = all_dates.map{ |d| 90 | date_torrents = torrents.select{|t| t.date == d} 91 | name_torrents = date_torrents.inject(Hash.new){|acc, a| acc.store(a.name, a); acc} 92 | Dump.new(name_torrents, d) 93 | } 94 | 95 | max_date = all_dates.max{ |a,b| a <=> b} 96 | 97 | ghtorrent = GHTorrent.new(max_date) 98 | all_dumps.each { |x| 99 | ghtorrent.add_dump x 100 | x.torrents.values.each { |t| 101 | ghtorrent.add_collection t.name 102 | } 103 | } 104 | 105 | puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(//, "\n
") 106 | # vim: set sta sts=2 shiftwidth=2 sw=2 et ai : 107 | -------------------------------------------------------------------------------- /basedupon.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Based upon... 4 | tagline: 5 | --- 6 | 7 | Here is a collection of projects that are based upon GHTorrent or its 8 | derivatives, collected by searching GitHub for "ghtorrent". Is your 9 | project missing/wrongly listed/not hosted on GitHub? Please add it by 10 | [directly editing](https://github.com/ghtorrent/ghtorrent.org/edit/master/basedupon.md) this file on GitHub. 11 | 12 | * [OSSHealth/ghdata](https://github.com/OSSHealth/ghdata) A Python library and web service for GitHub Health and Sustainability metrics 13 | * [TestRoots/travistorrent-tools](https://github.com/TestRoots/travistorrent-tools) Tools used to create the data for TravisTorrent. TravisTorrent provides free and easy-to-use Travis CI build analyses to the masses through its open database. [More](http://travistorrent.testroots.org) 14 | * [RepoReapers/reaper](https://github.com/RepoReapers/reaper) Calculate the score of a repository based on best engineering practices. [More here](https://reporeapers.github.io/) 15 | * [SOM-Research/Gitana](https://github.com/SOM-Research/Gitana) a SQL-based Project Activity Inspector 16 | * [Microsoft/ghinsights](https://github.com/Microsoft/ghinsights) GHInsights is a data processing pipeline using Azure Data Factory and Azure Data Lake. It processes GitHub data from the ghtorrent project. The resulting processed data is available in Azure Data Lake for users to query, generate reports, and analyze GitHub projects. 17 | * [iandennismiller/gh-impact](https://github.com/iandennismiller/gh-impact) gh-impact is a measure of influence on GitHub. 
See more [here](http://www.gh-impact.com) 18 | * [valeriocos/selective-importer-4-ghtorrent](https://github.com/valeriocos/selective-importer-4-ghtorrent) Import a MySQL dump of GHTorrent, selecting only the tables and indexes you need 19 | * [cbogart/giterator](https://github.com/cbogart/giterator) Tools for importing and analyzing ghtorrent and githubarchive data 20 | * [DevMine/ght2dm](https://github.com/DevMine/ght2dm) CLI tool to import GHTorrent dumps into the DevMine database. 21 | * [SOM-Research/gila](https://github.com/SOM-Research/gila) Label analysis work. [More](http://som-research.uoc.edu/tools/gila/) 22 | * [BonnyCI/shuffleboard](https://github.com/BonnyCI/shuffleboard) Truffle-shuffling data for the [ci-plunder project](https://github.com/BonnyCI/ci-plunder) 23 | * [jakeharding/repo-health](https://github.com/jakeharding/repo-health) This repository holds the proof of concept for the repository health and sustainability project 24 | * [PRioritizer/PRioritizer-analyzer](https://github.com/PRioritizer/PRioritizer-analyzer) Prioritize your pull requests 25 | * [acs/ghtorrent](https://github.com/acs/ghtorrent) Analyze GHTorrent data using Elasticsearch + Kibana 26 | -------------------------------------------------------------------------------- /lean.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Lean GHTorrent 4 | header: Lean GHTorrent 5 | group: navigation 6 | --- 7 | 8 | Lean GHTorrent allows researchers to get a slice of the full GHTorrent 9 | dataset 10 | 11 | 76 | 77 |
78 | 79 | 80 | 81 | 82 | 83 | 99 | -------------------------------------------------------------------------------- /stats/api-stats.R: -------------------------------------------------------------------------------- 1 | # Run this to create the data file 2 | # cat */log.txt |grep APIClient|grep -v WARN |perl -lape 's/\[([T0-9-:.]*).*\] DEBUG.*\[([0-9.]*)\].*Total: ([0-9]*) ms/$1 $2 $3/'|cut -f2,3,4 -d' '|ruby -ne 'BEGIN{require "time"}; t,i,d=$_.split(/ /); print Time.parse(t).to_i," ", i, " ", d;' |egrep -v "#" >data.txt 3 | library(ggplot2) 4 | library(sqldf) 5 | require(scales) 6 | 7 | data <- read.csv("data.txt", sep=" ", colClasses = c("integer", "factor", "integer")) 8 | # Filter out data older than 3 days 9 | data <- subset(data, ts > (as.numeric(Sys.time()) - 3 * 86400)) 10 | data$ts <- as.POSIXct(data$ts, origin = "1970-01-01") 11 | summary(data$ms) 12 | 13 | p <- ggplot(data) + aes(x = ip, y = ms) + scale_y_log10() + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 14 | png("resp-ip-boxplot.png") 15 | print(p) 16 | dev.off() 17 | 18 | # Total num requests per IP 19 | aggregate(ms ~ ip, data = data, length) 20 | 21 | # Mean time per IP 22 | aggregate(ms ~ ip, data = data, mean) 23 | 24 | data$timebin <- cut.POSIXt(data$ts, breaks = "10 mins") 25 | 26 | mean.interval <- aggregate(ms ~ timebin, data = data, mean) 27 | mean.interval$timebin <- as.POSIXct(mean.interval$timebin, origin = "1970-01-01") 28 | 29 | p <- ggplot(mean.interval) + aes(x = timebin, y = ms) + geom_line() + scale_x_datetime() + 30 | xlab('time') + ylab('Mean API resp in ms') + ggtitle('Mean API response time timeseries (10 min intervals)') 31 | 32 | png("api-resp.png") 33 | print(p) 34 | dev.off() 35 | 36 | data$timebin <- cut.POSIXt(data$ts, breaks = "30 mins") 37 | count.interval <- aggregate(ms ~ timebin, data = data, length) 38 | count.interval$timebin <- as.POSIXct(count.interval$timebin, origin = "1970-01-01") 39 | p <- ggplot(count.interval) + aes(x = 
timebin, y = ms) + geom_line() + scale_x_datetime() + scale_y_continuous(labels = comma) + 40 | stat_smooth(method = "loess", formula = y ~ x^2, size = 2, alpha = 0)+xlab('time') + ylab('Num API calls') + ggtitle('Num API calls per timeslot (30 mins interval)') 41 | 42 | png("num-reqs.png") 43 | print(p) 44 | dev.off() 45 | 46 | events <- read.csv("events.txt", sep=" ", colClasses = c("integer", "factor")) 47 | # Filter out data older than 3 days 48 | events <- subset(events, ts > (as.numeric(Sys.time()) - 3 * 86400)) 49 | events$ts <- as.POSIXct(events$ts, origin = "1970-01-01") 50 | summary(events$ts) 51 | 52 | events$timebin <- cut.POSIXt(events$ts, breaks = "1 day") 53 | 54 | groupped <- sqldf("select timebin,event,count(*) as number from events group by timebin,event") 55 | 56 | p <- ggplot(groupped) + aes(x = timebin, y = number, fill = event) + 57 | scale_y_continuous(labels = comma) + 58 | geom_bar(stat = "identity", position="dodge") + 59 | xlab('day') + ylab('Num events') + 60 | ggtitle('Number of events processed per day') 61 | 62 | png("events-per-day.png") 63 | print(p) 64 | dev.off() 65 | 66 | -------------------------------------------------------------------------------- /mysql.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Querying MySQL programmatically 4 | tagline: 5 | --- 6 | 7 | To connect to the MySQL programmatic endpoint, you need a MySQL client (command 8 | line, graphical or program library) and SSH installed on your machine. 9 | 10 | ## Connection details 11 | 12 | To obtain access, please send us your public key [as described here](services.html). 13 | 14 | 1. When we contact you back, you will be able to setup an SSH tunnel with the 15 | following command: `ssh -L 3306:web.ghtorrent.org:3306 ghtorrent@web.ghtorrent.org`. Keep in mind that no shell will be allocated in the open SSH session. 16 | 17 | 2. 
You will then be able to connect to our server using the command: `mysql -u ght -h 127.0.0.1 ghtorrent` (user name: ght, no password, database: ghtorrent). 18 | 19 | Here is an example session: 20 | 21 | {% highlight bash%} 22 | #### 23 | # on terminal session 1 24 | $ ssh -L 3306:web.ghtorrent.org:3306 ghtorrent@web.ghtorrent.org 25 | PTY allocation request failed on channel 2 26 | ##### 27 | # on a different terminal 28 | $ mysql -u ght -h 127.0.0.1 ghtorrent 29 | Reading table information for completion of table and column names 30 | You can turn off this feature to get a quicker startup with -A 31 | 32 | Welcome to the MySQL monitor. Commands end with ; or \g. 33 | Your MySQL connection id is 1004 34 | Server version: 5.5.5-10.1.11-MariaDB-1~wily mariadb.org binary distribution 35 | 36 | Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. 37 | 38 | Oracle is a registered trademark of Oracle Corporation and/or its 39 | affiliates. Other names may be trademarks of their respective 40 | owners. 41 | 42 | Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. 43 | mysql> show tables; 44 | +-----------------------+ 45 | | Tables_in_ghtorrent | 46 | +-----------------------+ 47 | | commit_comments | 48 | [...] 49 | | users | 50 | | watchers | 51 | +-----------------------+ 52 | 21 rows in set (0.20 sec) 53 | 54 | {% endhighlight %} 55 | 56 | ## Database schema 57 | 58 | Have a look [here](relational.html). 59 | 60 | ## Things to keep in mind 61 | 62 | 1. The hosting machine, while powerful, is not capable of processing the data 63 | very quickly. 64 | 65 | 2. Other people may be using the machine as well. Make sure that you do not 66 | run very heavy queries. It is better to run many small queries (e.g. in 67 | a loop) than aggregation queries. Make sure you only query on indexed fields. 68 | 69 | 3. Queries running in excess of 100 seconds are killed without any warning. 70 | 71 | 4. 
At any time the machine may become unavailable. 72 | 73 | 5. The data is provided in kind to help other people to do research with 74 | Please do not abuse the service. 75 | 76 | 6. The data is offered as is without any explicit or implicit quality or 77 | service guarantee from our part. 78 | 79 | 7. All operations are logged for security purposes. 80 | -------------------------------------------------------------------------------- /dumps/ght-periodic-dump: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Create the periodic database dump files 4 | # 5 | 6 | # Directory to place compressed files and torrents 7 | OUTDIR=/data/downloads 8 | 9 | # Base URL for HTTP dir containing torrents and data 10 | WEBSEED=http://www.ghtorrent.org/downloads/ 11 | 12 | usage() 13 | { 14 | echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']" 15 | echo " [-c collection_to_dump]" 16 | echo "Dump the database. -f earliest record timestamp" 17 | echo " -t latest record timestamp" 18 | echo " -c collection to dump (default: all)" 19 | } 20 | 21 | if [ -z "$1" ] 22 | then 23 | usage 24 | exit 1 25 | fi 26 | 27 | while getopts "f:t:c:" o 28 | do 29 | case $o in 30 | f) timeStart=`date -d "$OPTARG" +%s` ;; 31 | t) timeEnd=`date -d "$OPTARG" +%s` ;; 32 | c) collection=$OPTARG ;; 33 | \?)
echo "Invalid option: -$OPTARG" >&2 34 | usage 35 | exit 1 36 | ;; 37 | esac 38 | done 39 | 40 | 41 | # Time to start dumping from 42 | if [ -z "$timeStart" ] 43 | then 44 | if [ -r lastrun ] 45 | then 46 | timeStart=`cat lastrun` 47 | else 48 | timeStart=0 49 | fi 50 | fi 51 | 52 | # Time to end dumping 53 | if [ -z "$timeEnd" ] 54 | then 55 | timeEnd=`date +%s` 56 | fi 57 | 58 | # Name used for the files 59 | dateName=`date -d @$timeEnd -u +'%Y-%m-%d'` 60 | 61 | # _id example: 62 | # 4f208c3e08d69a1835000077 63 | # 000102030405060708091011 64 | # | || || || | 65 | # time mach pid count 66 | 67 | endId=`printf '%08x0000000000000000' $timeEnd` 68 | startId=`printf '%08x0000000000000000' $timeStart` 69 | 70 | 71 | if [ -z "$collection" ] 72 | then 73 | collections=`echo "show collections"|mongo --quiet rs0/github|egrep -v "system|bye"` 74 | else 75 | collections=$collection 76 | fi 77 | 78 | echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`" 79 | 80 | rm -rf dump 81 | mkdir -p dump/github 82 | 83 | for col in $collections; do 84 | 85 | echo "Dumping $col" 86 | mongodump --host rs0 --db github --collection $col -q '{"_id" : {"$gte" : ObjectId("'$startId'"), "$lt" : ObjectId("'$endId'")} }' || exit 1 87 | done 88 | 89 | # Archive collections and create torrents 90 | for col in $collections; do 91 | echo "Archiving $col.bson" 92 | if [ ! -s dump/github/$col.bson ]; then 93 | echo "Collection empty, skipping" 94 | continue 95 | fi 96 | 97 | if !
tar zcf $OUTDIR/$col-dump.$dateName.tar.gz dump/github/$col.bson 98 | then 99 | rm -f $OUTDIR/$col-dump.$dateName.tar.gz 100 | exit 1 101 | fi 102 | # 103 | # mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w $WEBSEED/$col-dump.$dateName.tar.gz -o $OUTDIR/$col-dump.$dateName.torrent $OUTDIR/$col-dump.$dateName.tar.gz 104 | done # closes the archive loop; torrent creation above stays disabled, but the loop itself must still be terminated 105 | # 106 | # Update last run info 107 | echo $timeEnd >lastrun || exit 1 108 | 109 | # Clean up 110 | rm -rf dump 111 | 112 | -------------------------------------------------------------------------------- /services.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: GHTorrent services 4 | tagline: 5 | --- 6 | 7 | The GHTorrent project provides the following services to interested researchers 8 | and third parties: 9 | 10 | * [Querying MongoDB](raw.html) programmatically 11 | * [Querying MySQL](/dblite/) through a web interface 12 | * [Querying MySQL](mysql.html) programmatically 13 | * [Streaming](streaming.html) of entries in MongoDB and MySQL 14 | 15 | _The services are in kind and do not entail any quality or availability guarantee 16 | ._ 17 | 18 | ## Obtaining access 19 | 20 | All services are offered over SSH tunnelling. See at the page of each 21 | individual service for connection details. 22 | 23 | To obtain access to any of the services, add your public SSH key 24 | (usually in `~/.ssh/id_rsa.pub`), using a pull request, to 25 | [this file](https://github.com/ghtorrent/ghtorrent.org/blob/master/keys.txt). 26 | All pull requests are merged on Friday afternoon, so please wait a bit 27 | before firing a reminder email. 28 | 29 | To create a public/private key pair, use `ssh-keygen`. Here are some hints on 30 | how to generate GHTorrent compatible SSH keys: 31 | 32 | * **On Mac or Linux**: You can use the distribution provided `ssh-keygen` 33 | command and it should work fine.
34 | 35 | * **On Windows:** Keys generated with the popular Putty program cannot be used 36 | by GHTorrent. Please use [CygWin](https://www.cygwin.com) or an equivalent 37 | environment to install OpenSSH and use the `ssh-keygen` command as provided by 38 | OpenSSH to generate a GHTorrent compatible key. 39 | 40 | ## Fair use 41 | 42 | To address GitHub's growth and GHTorrent's growing demands in API calls and the 43 | community's demand for better, more rich data, we need more GitHub API keys. We 44 | therefore kindly ask you to [send us](mailto:gousiosg@gmail.com) 45 | a GitHub API key (a “personal access token” as Github describes it). 46 | 47 | The process to create a key is simple: First, go to the following URL (while 48 | logged in): 49 | 50 | [https://github.com/settings/tokens/new](https://github.com/settings/tokens/new) 51 | 52 | deselect *all* checkboxes *except* from `public_repo`, set a token name and 53 | click on "Generate Token". 54 | 55 | Please note that it is possible to specify the maximum number of requests per 56 | hour that you would like to donate to GHTorrent. By default, GHTorrent uses the 57 | maximum allowed by GitHub (5k/hour), but if you are using the GitHub API for 58 | other projects/services, you might want to restrict this. A typical service like 59 | Travis only uses a few requests per hour, even on busy projects. 60 | 61 | **If you do not want us to use your key any more, do let us know.** Do not 62 | just delete your key from GitHub as this will create holes in the data 63 | collection until we notice and remove your key. If this happens, we will also 64 | ban you indefinitely from the services (2 users have already been banned). 65 | 66 | At the moment, this is a request in kind. If demand continues to grow and supply 67 | of keys is not enough to keep up, we might turn this into an obligatory step.
68 | -------------------------------------------------------------------------------- /raw.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Querying MongoDB programmatically 4 | tagline: 5 | --- 6 | 7 | While the GHTorrent project offers downloadable versions of the MongoDB raw 8 | dataset, downloading and restoring them to MongoDB can be very time consuming. 9 | For this reason, we have created a publicly available version of the data as 10 | they are collected by our main MongoDB server. 11 | The only prerequisite is to have a MongoDB client (command line, graphical 12 | or program library) and SSH installed on your machine. 13 | 14 | ## Connection details 15 | 16 | To obtain access, please send us your public key [as described here](services.html). 17 | 18 | 1. When we contact you back, you will be able to setup an SSH tunnel with the 19 | following command: `ssh -L 27017:dutihr.st.ewi.tudelft.nl:27017 20 | ghtorrent@dutihr.st.ewi.tudelft.nl`. Keep in mind that no shell will be 21 | allocated in the open SSH session. 22 | 23 | 2. You will then be able to connect to our server using the command: `mongo 24 | -u ghtorrentro -p ghtorrentro github`. 25 | 26 | Here is an example session: 27 | 28 | {% highlight bash%} 29 | #### 30 | # on terminal session 1 31 | $ ssh -L 27017:dutihr.st.ewi.tudelft.nl:27017 ghtorrent@dutihr.st.ewi.tudelft.nl 32 | PTY allocation request failed on channel 2 33 | ##### 34 | # on a different terminal 35 | $ mongo -u ghtorrentro -p ghtorrentro github 36 | MongoDB shell version: 3.0.3 37 | connecting to: github 38 | > 39 | > db.events.count() 40 | 401209493 41 | > db.commits.count() 42 | 311041915 43 | > 44 | {% endhighlight %} 45 | 46 | 47 | ## Collections available in MongoDB 48 | 49 | Have a look [here](mongo.html). 50 | 51 | Due to its heavy load, the MongoDB server cannot process non-indexed field 52 | searches within the 100 sec time limit. 
To address this situation, we 53 | recommend querying MySQL first to get references to the data you want and 54 | then use MongoDB to get the raw data. 55 | 56 | Below are the fields that MongoDB uses as indexes. Make sure your query hits 57 | those, otherwise querying is going to be extremely slow (and will overload our 58 | server as well). 59 | 60 | 62 | 63 | ## Things to keep in mind 64 | 65 | 1. The hosting machine, while powerful, is not capable of processing the data 66 | very quickly. At the time of this writing, the data is more than 10TB. 67 | 68 | 2. Other people may be using the machine as well. Make sure that you do not 69 | run very heavy queries. It is better to run many small queries (e.g. in 70 | a loop) than aggregation queries. Make sure you only query on indexed fields. 71 | 72 | 3. Queries running in excess of 100 seconds are killed without any warning. 73 | 74 | 4. At any time the machine may become unavailable. 75 | 76 | 5. Some data may be missing; if you are willing to provide workers to collect 77 | them, please [contact us](mailto:gousiosg@gmail.com). 78 | 79 | 6. The data is provided in kind to help other people to do research with it. 80 | Please do not abuse the service. 81 | 82 | 7. The data is offered as is without any explicit or implicit quality or 83 | service guarantee from our part. 84 | 85 | 8. All operations are logged for security purposes. 86 | 87 | -------------------------------------------------------------------------------- /pullreq-perf/openess-report.R: -------------------------------------------------------------------------------- 1 | # 2 | # (c) 2012 -- 2014 Georgios Gousios 3 | # 4 | # BSD licensed, see LICENSE in top level dir 5 | # 6 | 7 | 8 | rm(list = ls(all = TRUE)) 9 | 10 | if (! "knitr" %in% installed.packages()) install.packages("knitr") 11 | if (! "RMySQL" %in% installed.packages()) install.packages("RMySQL") 12 | if (! "ggplot2" %in% installed.packages()) install.packages("ggplot2") 13 | if (!
"reshape" %in% installed.packages()) install.packages("reshape") 14 | if (! "sqldf" %in% installed.packages()) install.packages("sqldf") 15 | if (! "optparse" %in% installed.packages()) install.packages("optparse") 16 | if (! "foreach" %in% installed.packages()) install.packages("foreach") 17 | if (! "doMC" %in% installed.packages()) install.packages("doMC") 18 | 19 | library(optparse) 20 | 21 | mysql.user = "foo" 22 | mysql.passwd = "bar" 23 | mysql.db = "ghtorrent" 24 | mysql.host = "127.0.0.1" 25 | paralll = 4 26 | 27 | option_list <- list( 28 | make_option(c("-s", "--mysql-host"), default=mysql.host, dest = 'mysql.host', 29 | help = "MySQL host [\"%default\"]"), 30 | make_option(c("-d", "--mysql-db"), default=mysql.db, dest = 'mysql.db', 31 | help = "MySQL database [\"%default\"]"), 32 | make_option(c("-u", "--mysql-user"), default=mysql.user, dest = 'mysql.user', 33 | help = "MySQL user [\"%default\"]"), 34 | make_option(c("-p", "--mysql-passwd"), default=mysql.passwd, dest = 'mysql.passwd', help = "MySQL password [\"%default\"]"), 35 | make_option(c("-a", "--parallel"), default=paralll, dest = 'paralll', help = "Number of processes [\"%default\"]") 36 | 37 | ) 38 | 39 | args <- parse_args(OptionParser(option_list = option_list), 40 | print_help_and_exit = FALSE, 41 | positional_arguments = TRUE) 42 | 43 | if (args$options$help == TRUE) { 44 | parse_args(OptionParser(option_list = option_list)) 45 | } 46 | 47 | mysql.user = args$options$mysql.user 48 | mysql.passwd = args$options$mysql.passwd 49 | mysql.db = args$options$mysql.db 50 | mysql.host = args$options$mysql.host 51 | paralll = args$options$paralll 52 | 53 | # Genearte stats 54 | library(RMySQL) 55 | library(knitr) 56 | 57 | stats <- function(owner, repo) { 58 | 59 | db <- dbConnect(dbDriver("MySQL"), 60 | user = mysql.user, 61 | password = mysql.passwd, 62 | dbname = mysql.db, 63 | host = mysql.host) 64 | 65 | dirname = sprintf("%s-%s", owner,repo) 66 | print(sprintf("Running in %s", dirname)) 67 | 
cwd <- getwd() 68 | dir.create(dirname) 69 | file.copy("report.Rmd", sprintf("%s/%s", dirname, "index.Rmd")) 70 | setwd(dirname) 71 | 72 | tryCatch({ 73 | knit("index.Rmd") 74 | file.remove("index.Rmd") 75 | }, error = function(e) { 76 | print(e) 77 | setwd(cwd) 78 | unlink(dirname, TRUE, TRUE) 79 | }, finally = { 80 | dbDisconnect(db) 81 | setwd(cwd) 82 | }) 83 | } 84 | 85 | if (length(args$args) == 0) { 86 | library(doMC) 87 | registerDoMC(paralll) 88 | 89 | projects <- read.csv('projects.txt', sep = ' ') 90 | 91 | print(sprintf("%s projects to analyze", nrow(projects))) 92 | print(sprintf("Running %d parallel processes", paralll)) 93 | knit("index.Rmd") 94 | 95 | result <- foreach(n=1:nrow(projects), .combine='+') %dopar% { 96 | project <- projects[n, ] 97 | stats(project[,1], project[,2]) 98 | 1 99 | } 100 | print(sprintf("processed %d projects", result)) 101 | 102 | } else { 103 | stats(strsplit(args$args, " ")[1], strsplit(args$args, " ")[2]) 104 | } 105 | -------------------------------------------------------------------------------- /vissoft14.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: VISSOFT 2014 Challenge Dataset 4 | tagline: 5 | --- 6 | ### Versions 7 | 8 | *You are advised to always run queries against the newest version.* 9 | 10 |
11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
VersionRelease dateFixed error
1.01 Mar 2014
26 | 27 | ### Dataset description 28 | 29 | The VISSOFT 2014 challenge dataset is a (very) trimmed down version of the 30 | original GHTorrent dataset. It includes data from the 31 | [netty/netty](http://github.com/repos/netty/netty) repository (commits, pull 32 | requests, collaborators, issues etc) along with all its forks (including the 33 | forks' own commits, pull requests etc if any). 34 | 35 | Similarly to GHTorrent itself, the VISSOFT challenge dataset comes in two flavours: 36 | 37 | * A MongoDB database dump containing the results of querying the Github API. See [format here](mongo.html). 38 | * A MySQL database dump containing a queriable version of important fields extracted from the raw data. See [schema here](relational.html). 39 | 40 | ### Importing and using 41 | 42 | The following instructions assume an OSX or Linux based host, on which 43 | you have a running MongoDB or/and MySQL instance. 44 | 45 | {%highlight bash%} 46 | # Download and extract 47 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/vissoft14-01032014.tar.gz 48 | $ du -b vissoft14-01032014.tar.gz 49 | 49178639 vissoft14-01032014.tar.gz 50 | $ md5sum vissoft14-01032014.tar.gz 51 | 4928efb679a0dc8254924d56760d65ec vissoft14-01032014.tar.gz 52 | $ tar zxvf vissoft14-01032014.tar.gz 53 | $ cd vissoft14 54 | $ du -b 55 | 351024865 56 | 57 | # MongoDB import 58 | $ ls *.bson|while read dump; do mongorestore -d vissoft14 $dump; done 59 | $ mongo vissoft14 60 | mongo> db.commits.count() 61 | 9118 62 | mongo> db.issue_comments.count() 63 | 10876 64 | 65 | # MySQL import 66 | $ mysql -u root -p 67 | mysql > create user 'vissoft14'@'localhost' identified by 'vissoft14'; 68 | mysql> create database vissoft14; 69 | mysql> grant all privileges on vissoft14.* to vissoft14@'localhost'; 70 | mysql> flush privileges; 71 | mysql> ^D 72 | $ cat mysql.sql |mysql -u vissoft14 -p vissoft14 73 | $ mysql -u vissoft14 -p vissoft14 74 | mysql> select count(*) from commits; 75 | +----------+ 76 | | 
count(*) | 77 | +----------+ 78 | | 9118 | 79 | +----------+ 80 | {%endhighlight %} 81 | 82 | ### FAQ 83 | 84 | Answers to frequently asked questions 85 | 86 | #### Why a new dataset? 87 | 88 | For practical reasons. The dataset is small enough to be used on a laptop, 89 | yet rich enough to do really interesting visualizations with it. 90 | 91 | #### What are the hardware requirements? 92 | 93 | We have successfully imported and used both dumps into a 2011 MacBookAir with 4GB 94 | of RAM. Your mileage may vary, but relatively new systems with more than 4GB RAM should have no trouble with both databases. If you only need to use the MySQL data dump, the hardware requirements are even lower. 95 | 96 | #### Why two databases? Do I need both? 97 | 98 | Not necessarily. The MySQL database can readily cover many aspects of activity 99 | on Github. Perhaps the only reason to use the MongoDB dump is to analyse commit contents, branches affected by pull requests or milestones, which are not included in MySQL. 100 | 101 | #### How can I ask a question about the dataset? 102 | 103 | Your question and the potential answer might be useful for other people as 104 | well, so please use the form below.
*Please note that I (Georgios Gousios) will 105 | not answer questions sent to my email.* 106 | 107 | {% include comments.html%} 108 | -------------------------------------------------------------------------------- /syntax.css: -------------------------------------------------------------------------------- 1 | .highlight { background: #ffffff; } 2 | .highlight .c { color: #999988; font-style: italic } /* Comment */ 3 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ 4 | .highlight .k { font-weight: bold } /* Keyword */ 5 | .highlight .o { font-weight: bold } /* Operator */ 6 | .highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */ 7 | .highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */ 8 | .highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */ 9 | .highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */ 10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ 11 | .highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */ 12 | .highlight .ge { font-style: italic } /* Generic.Emph */ 13 | .highlight .gr { color: #aa0000 } /* Generic.Error */ 14 | .highlight .gh { color: #999999 } /* Generic.Heading */ 15 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ 16 | .highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */ 17 | .highlight .go { color: #888888 } /* Generic.Output */ 18 | .highlight .gp { color: #555555 } /* Generic.Prompt */ 19 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 20 | .highlight .gu { color: #aaaaaa } /* Generic.Subheading */ 21 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */ 22 | .highlight .kc { font-weight: bold } /* Keyword.Constant */ 23 | .highlight .kd { font-weight: bold } /* Keyword.Declaration */ 24 | .highlight .kp { font-weight: bold } /* 
Keyword.Pseudo */ 25 | .highlight .kr { font-weight: bold } /* Keyword.Reserved */ 26 | .highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */ 27 | .highlight .m { color: #009999 } /* Literal.Number */ 28 | .highlight .s { color: #d14 } /* Literal.String */ 29 | .highlight .na { color: #008080 } /* Name.Attribute */ 30 | .highlight .nb { color: #0086B3 } /* Name.Builtin */ 31 | .highlight .nc { color: #445588; font-weight: bold } /* Name.Class */ 32 | .highlight .no { color: #008080 } /* Name.Constant */ 33 | .highlight .ni { color: #800080 } /* Name.Entity */ 34 | .highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */ 35 | .highlight .nf { color: #990000; font-weight: bold } /* Name.Function */ 36 | .highlight .nn { color: #555555 } /* Name.Namespace */ 37 | .highlight .nt { color: #000080 } /* Name.Tag */ 38 | .highlight .nv { color: #008080 } /* Name.Variable */ 39 | .highlight .ow { font-weight: bold } /* Operator.Word */ 40 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 41 | .highlight .mf { color: #009999 } /* Literal.Number.Float */ 42 | .highlight .mh { color: #009999 } /* Literal.Number.Hex */ 43 | .highlight .mi { color: #009999 } /* Literal.Number.Integer */ 44 | .highlight .mo { color: #009999 } /* Literal.Number.Oct */ 45 | .highlight .sb { color: #d14 } /* Literal.String.Backtick */ 46 | .highlight .sc { color: #d14 } /* Literal.String.Char */ 47 | .highlight .sd { color: #d14 } /* Literal.String.Doc */ 48 | .highlight .s2 { color: #d14 } /* Literal.String.Double */ 49 | .highlight .se { color: #d14 } /* Literal.String.Escape */ 50 | .highlight .sh { color: #d14 } /* Literal.String.Heredoc */ 51 | .highlight .si { color: #d14 } /* Literal.String.Interpol */ 52 | .highlight .sx { color: #d14 } /* Literal.String.Other */ 53 | .highlight .sr { color: #009926 } /* Literal.String.Regex */ 54 | .highlight .s1 { color: #d14 } /* Literal.String.Single */ 55 | .highlight .ss { color: #990073 } /* 
Literal.String.Symbol */ 56 | .highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */ 57 | .highlight .vc { color: #008080 } /* Name.Variable.Class */ 58 | .highlight .vg { color: #008080 } /* Name.Variable.Global */ 59 | .highlight .vi { color: #008080 } /* Name.Variable.Instance */ 60 | .highlight .il { color: #009999 } /* Literal.Number.Integer.Long */ 61 | -------------------------------------------------------------------------------- /geninst.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Generic installation instructions 4 | tagline: 5 | --- 6 | 7 | ## How to run GHTorrent locally 8 | 9 | Depending on the size of the local mirror you have the following configuration 10 | simplification options: 11 | 12 | * You can skip using MongoDB if you only need to query the relational 13 | database and/or you just need to use GHTorrent once. 14 | 15 | * You can use SQLite3 instead of MySQL if your setup only contains a few 16 | (say, less than 1000) small projects. 17 | 18 | ### Install Ruby and dependencies 19 | 20 | Make sure you run the latest release of Ruby. On the main server, GHTorrent runs 21 | on Ruby 2. If you are on Mac or Linux, you can use [RVM](https://rvm.io/) to 22 | manage Ruby versions. 23 | 24 | Install the necessary dependencies: 25 | 26 | {% highlight bash %} 27 | sudo apt-get install build-essential curl libmysqlclient-dev 28 | # Install RVM and Ruby 2.2 29 | gpg --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3 30 | curl -L https://get.rvm.io | bash -s stable --ruby=2.2 31 | rvm use 2.2 32 | gem install bundler sqlite3 #or mysql2 33 | {% endhighlight %} 34 | 35 | ### Install the source code 36 | 37 | Checkout the latest version of the 38 | [ghtorrent](https://github.com/gousiosg/github-mirror.git) Gem through Github. 39 | By default, it will be checked out in the directory `github-mirror`.
The 40 | released versions of the Gem represent good states in the project's lifetime; 41 | the main mirror always works on the bleeding edge, which contains error fixes 42 | and updates to comply with changes to Github's API. You then need to install 43 | the dependencies: 44 | 45 | {% highlight bash %} 46 | cd github-mirror 47 | bundle install 48 | {% endhighlight %} 49 | 50 | Alternatively, you can just install the latest version of the GHTorrent gem: 51 | 52 | {% highlight bash %} 53 | gem install ghtorrent 54 | {% endhighlight %} 55 | 56 | #### Configure 57 | 58 | **If you are using MySQL**, you need to create a user and a database, like so 59 | 60 | {% highlight mysql %} 61 | # Login as MySQL root user 62 | mysql> create user ghtorrentuser@'localhost' identified by 'ghtorrentpassword'; 63 | mysql> create user ghtorrentuser@'%' identified by 'ghtorrentpassword'; 64 | mysql> grant all privileges on *.* to 'ghtorrentuser'@'localhost'; 65 | mysql> grant all privileges on *.* to 'ghtorrentuser'@'%'; 66 | 67 | # Login as the ghtorrent user 68 | mysql> CREATE SCHEMA IF NOT EXISTS `ghtorrent` DEFAULT CHARACTER SET utf8 ; 69 | {% endhighlight %} 70 | 71 | **If you are using MongoDB**, you can just disable 72 | authentication (run `mongod` with `--noauth`). If you do want to create a user, 73 | it can be a bit more involved, see below: 74 | 75 | {% highlight javascript %} 76 | > db.createUser( 77 | { 78 | user: "root", 79 | pwd: "admin", 80 | roles: [ { role: "userAdminAnyDatabase", db: "admin" } ] 81 | } 82 | ) 83 | 84 | > use ghtorrent 85 | > db.createUser( 86 | { 87 | user: "ghtorrent", 88 | pwd: "ghtorrent", 89 | roles: [ 90 | { role: "dbOwner", db: "ghtorrent" } 91 | ] 92 | } 93 | ) 94 | {% endhighlight %} 95 | 96 | **Download the 97 | [sample configuration file](https://raw.githubusercontent.com/gousiosg/github-mirror/master/config.yaml.tmpl)**, 98 | save it as `config.yaml` and change options as necessary.
Important things to 99 | configure are: 100 | 101 | * The database connection string 102 | * The MongoDB connection details (if you are using it) 103 | * Your GitHub username/password or an API token. See 104 | [instructions here](raw.html) on how to obtain an API key 105 | 106 | ### Run and profit 107 | 108 | To download the data for your first project, run: 109 | 110 | {% highlight bash %} 111 | # Retrieve one repo 112 | ruby -Ilib bin/ght-retrieve-repo -c config.yaml gousiosg github-mirror 113 | {% endhighlight %} 114 | 115 | You should see lots of output. After a while, you will have 1/2 databases 116 | full of data! 117 | -------------------------------------------------------------------------------- /mongo.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Collections in MongoDB 4 | tagline: 5 | --- 6 | 7 | Here is a list of collections along with the Github API URL they cache data 8 | from. All URLs need to be prefixed with `https://api.github.com/`. In MongoDB, 9 | each entity is by default indexed by the parameter fields in each corresponding 10 | URL (see also the actual [default indexes](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/adapters/mongo_persister.rb#L23)). 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
Collection nameGithub API URLDocumentation URL
commit_commentsrepos/#{user}/#{repo}/commits/#{sha}/commentscommit comments
commitsrepos/#{user}/#{repo}/commitscommits
eventseventsevents
followersusers/#{user}/followersfollowers list
forksrepos/#{user}/#{repo}/forksforks list
issuesrepos/#{owner}/#{repo}/issuesissues for a repo
issue_commentsrepos/#{owner}/#{repo}/issues/comments/#{comment_id}issue comments
issue_eventsrepos/#{owner}/#{repo}/issues/events/#{event_id}issue events
org_membersorgs/#{org}/membersorganization members
pull_request_commentsrepos/#{owner}/#{repo}/pulls/#{pullreq_id}/commentspull request review comments
pull_requestsrepos/#{user}/#{repo}/pullspull requests
repo_collaboratorsrepos/#{user}/#{repo}/collaboratorsrepo collaborators
repo_labelsrepos/#{owner}/#{repo}/issues/#{issue_id}/labelsissue labels
reposrepos/#{user}/#{repo}repositories
usersusers/#{user}users
watchersrepos/#{user}/#{repo}/stargazersstargazers
103 | -------------------------------------------------------------------------------- /cookbook.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: The GHTorrent cookbook 4 | tagline: 5 | --- 6 | 7 | The following assume that you have GHTorrent checked out and its 8 | dependencies configured as appropriate. The first step to all 9 | the items below is to change the working directory to the 10 | GHTorrent checkout, i.e. `cd github-mirror`. 11 | 12 | ## General information 13 | 14 | All GHTorrent commands accept the following important arguments: 15 | 16 | * `-c`: Specifies the location of the configuration file. The config file is 17 | auto discovered if it is in the same directory and named `config.yaml` 18 | * `-t`: The GitHub token to use to do the API calls. 19 | * `-l`: A limit to reqs/hr. This can be used to restrict the requests 20 | * `-u`: A unique name for the running program, to differentiate 21 | * `-v`: Verbose mode, means different things to different commands. 22 | 23 | ### Getting all info about a single repo 24 | 25 | The following will retrieve all information from the beginning of time 26 | for a single repo. 27 | 28 | {% highlight bash%} 29 | ruby -Ilib bin/ght-retrieve-repo gousiosg github-mirror 30 | {% endhighlight %} 31 | 32 | There are a few arguments for this command to make it faster for specific 33 | cases: 34 | 35 | * `-n`: Do not retrieve events 36 | * `-o`: Do not retrieve entities 37 | * `-y`: Only retrieve one entity type. For example `-y ensure_commits` will 38 | retrieve just the commits and finish. 39 | * `-r` and `-b`: Process all events before or after a specific date. 
40 | 41 | ### Getting all info about a user 42 | 43 | This will retrieve all data (followers, organizations etc) for a single 44 | user/organization 45 | 46 | {% highlight bash%} 47 | ruby -Ilib bin/ght-retrieve-user gousiosg 48 | {% endhighlight %} 49 | 50 | 51 | The following will retrieve all users in the Microsoft organization 52 | 53 | {% highlight bash%} 54 | ruby -Ilib bin/ght-retrieve-user Microsoft 55 | {% endhighlight %} 56 | 57 | ### Getting many users/repos in parallel 58 | 59 | The quick and dirty solution is to use 60 | [GNU Parallel](http://www.gnu.org/software/parallel/). To do that you need two files, one listing API keys (`keys.txt`) and one listing repository names (`projects.txt`), see for example: 61 | 62 | {% highlight bash%} 63 | $ head -n 5 projects.txt 64 | eed3si9n scalaxb 65 | pocorall scaloid 66 | socrata-platform soql-bigquery-adapter 67 | ReactiveMongo Play-ReactiveMongo 68 | chrisdinn brando 69 | 70 | $ head -n 5 keys.txt 71 | # Not real keys 72 | d15d119551fd194745cb81df4f4c68c55460bf37 73 | c3a1a550bcfc39ea374452f95a1dbe3002a3b8ab 74 | ea9e186f882c853fe0eb3e387b8c01aafdca8645 75 | bd3d11ae101cf931aed92f76ffc2f6ba40e3c9fa 76 | c6e15a389537675539ddd4bf1ef7e0f96520ec3e 77 | {% endhighlight %} 78 | 79 | Then you can use GNU `parallel` like so: 80 | 81 | {% highlight bash%} 82 | parallel --no-notice --gnu --progress --joblog parjobs --xapply -P 4 \ 83 | ruby -Ilib bin/ght-retrieve-repo -c config.yaml -t {3} {1} {2} \ 84 | ::: `cat projects.txt | cut -f1 -d' '` \ 85 | ::: `cat projects.txt|cut -f2 -d' '` \ 86 | ::: `cat keys.txt` 87 | {% endhighlight %} 88 | 89 | The important parameter to tune here is `-P`, the number of parallel processes. 90 | To retrieve users in parallel, you need to replace `ght-retrieve-repo` with 91 | `ght-retrieve-user`. 92 | 93 | ### Loading items to the queue 94 | 95 | In some cases (e.g. bugs, network glitches etc), some events might 96 | remain unprocessed. 
In other cases, you might want some events on 97 | the queue, e.g. to test new functionality. The tool to use in this case 98 | is `ght-load`. Below are some use cases: 99 | 100 | {% highlight bash%} 101 | # Load all PushEvents since yesterday 102 | ruby -Ilib bin/ght-load -v -e `gdate +%s --date '1 day ago'` -f PushEvent 103 | 104 | # Load all events of previous day at a rate of 10 events/sec 105 | ruby -Ilib bin/ght-load -v -r 10 -e `gdate +%s --date '1 day ago'` -t `gdate +%s --date '2 days ago'` 106 | 107 | # Load all events of previous day at a rate of 10 events/sec 108 | ruby -Ilib bin/ght-load -v -r 10 -e `gdate +%s --date '1 day ago'` -t `gdate +%s --date '2 days ago'` 109 | {% endhighlight %} 110 | 111 | 112 | `ght-load` can also be used to load arbitrary items read from files 113 | rather than MongoDB. In this case, a routing key can be attached 114 | in order for the loaded items to go to the appropriate queue. 115 | 116 | {% highlight bash%} 117 | # Load items from file with a routing key 118 | ruby -Ilib bin/ght-load -i users.txt -o 'evt.users' 119 | {% endhighlight %} 120 | 121 | -------------------------------------------------------------------------------- /leanprogress.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Lean Request Results 4 | group: navigation 5 | --- 6 | 7 | 134 | 135 | 136 | 137 |
138 | 141 |
142 | 143 |
144 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: The GHTorrent project 4 | tagline: 5 | --- 6 | 7 | Welcome to the GHTorrent project, an effort to create a scalable, queryable, 8 | offline mirror of data offered through the [Github REST API](http://developer.github.com). 9 | 10 | Follow [@ghtorrent](https://twitter.com/ghtorrent) on Twitter for project 11 | updates and [exciting research](halloffame.html) done with GHTorrent. 12 | 13 | ## What does GHTorrent do? 14 | 15 | GHTorrent monitors the [Github public event time 16 | line](https://api.github.com/events). For each event, it retrieves its contents 17 | and their dependencies, exhaustively. It then stores the raw JSON responses to a 18 | [MongoDB database](raw.html), while also extracting their structure in a [MySQL 19 | database](relational.html). 20 | 21 | GHTorrent works in a distributed manner. A [RabbitMQ](http://www.rabbitmq.com/) 22 | message queue sits between the event mirroring and data retrieval phases, so 23 | that both can be run on a cluster of machines. Have a look at this 24 | [presentation](https://speakerdeck.com/gousiosg/mining-github-for-fun-and-profit) 25 | and read [this paper](http://gousios.gr/bibliography/GS12.html) if you want to 26 | know more. Here is the [source code](https://github.com/gousiosg/github-mirror). 27 | 28 | The project releases the data collected during that period as 29 | [downloadable archives](downloads.html). 30 | 31 | ### How much data do you have? 32 | 33 | Currently (Jan 2015), MongoDB stores around 4TB of JSON data (compressed), while 34 | MySQL stores more than 1.5 billion rows of extracted metadata. A large part of the 35 | activity of 2012, 2013, 2014 and 2015 has been retrieved, while we are also 36 | going backwards to retrieve the full recorded history of important projects. 
37 | 38 | ### How can I help? 39 | 40 | GHTorrent needs contributions on the following fronts: 41 | 42 | * **API keys:** We can run multiple GHTorrent worker instances concurrently. To 43 | go over Github's API rate limit, we need multiple Github API keys provided by 44 | users. If you use GHTorrent for your research, please consider [donating a 45 | key](services.html). 46 | 47 | * **Linking and analysis:** GHTorrent currently only does limited analysis and 48 | linking within the dataset (user geolocation). There are many possibilities 49 | for expansion. One could for example think of linking commits to issues. 50 | 51 | * **Reporting bugs:** Please use Github's [issue tracker here](https://github.com/gousiosg/ghtorrent.org/issues) to report any data consistency issues you have found. 52 | 53 | * **Donating:** We are trying to make GHTorrent a self-sustainable operation. 54 | If you are using GHTorrent, please consider donating (you can find a donation 55 | button on the left). All individuals/companies that have donated will be listed 56 | in the Hall of Fame page. 57 | 58 | ### Why did you do it? 59 | 60 | We are doing research on [software repositories](http://www.msrconf.org/). 61 | Github is an exciting new data source for us, one that has several of the 62 | problems we are facing as data miners solved. The uniformity of data 63 | will allow scaling of research to hundreds or thousands of repositories 64 | spanning across multiple languages and application domains. 65 | 66 | ### Why the name? 67 | 68 | Initially the project offered the data through the Bittorrent network (gh: from 69 | GitHub, torrent: from Bittorrent). As currently the data is only offered through 70 | HTTP, the name signifies a [torrent](https://en.wiktionary.org/wiki/torrent) of 71 | data coming from GitHub. 72 | 73 | ### Can I know more? 74 | 75 | Have a look at the following presentation for a short introduction. 76 | 77 |
78 | 79 |
80 | 81 | ### How can I cite this work? 82 | 83 | If you find this dataset useful and want to use it in your work, please cite the 84 | following paper: 85 | 86 | Georgios Gousios: [The GHTorrent dataset and tool 87 | suite](http://www.gousios.gr/bibliography/G13.html). MSR 2013: 233-236 88 | 89 | {%highlight text%} 90 | @inproceedings{Gousi13, 91 | author = {Gousios, Georgios}, 92 | title = {The GHTorrent dataset and tool suite}, 93 | booktitle = {Proceedings of the 10th Working Conference on Mining Software 94 | Repositories}, 95 | series = {MSR '13}, 96 | year = {2013}, 97 | isbn = {978-1-4673-2936-1}, 98 | location = {San Francisco, CA, USA}, 99 | pages = {233--236}, 100 | numpages = {4}, 101 | url = {http://dl.acm.org/citation.cfm?id=2487085.2487132}, 102 | acmid = {2487132}, 103 | publisher = {IEEE Press}, 104 | address = {Piscataway, NJ, USA}, 105 | } 106 | {%endhighlight%} 107 | 108 | ### Latest news 109 | 110 | 114 | Latest news 115 | 116 | -------------------------------------------------------------------------------- /gcloud.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: GHTorrent on the Google cloud 4 | tagline: 5 | --- 6 | 7 | GHTorrent can be accessed over Google Cloud services. To access the data 8 | requires you to have a Google Cloud account. Reasonable use is free of charge 9 | and, in the case of BigQuery, it [should no longer require a credit 10 | card](https://cloud.google.com/blog/big-data/2017/01/how-to-run-a-terabyte-of-google-bigquery-queries-each-month-without-a-credit-card). (Pub/Sub still requires a credit card). You can check what Google considers reasonable at any given moment 11 | [here](https://cloud.google.com/pricing/free). 12 | 13 | * [Google BigQuery](https://bigquery.cloud.google.com/dataset/ghtorrent-bq:ght) 14 | contains an up to date import of the latest GHTorrent MySQL dump. 
15 | 16 | * [Google Pub/Sub](https://console.cloud.google.com/cloudpubsub/topicList?project=ghtorrent-bq) exposes real-time streams of GitHub activity. 17 | 18 | Both services can be accessed through the Web, the command line (after 19 | installing the Google Cloud [command line utils](https://cloud.google.com/sdk/)) or through various programming languages. 20 | 21 | ### BigQuery 22 | 23 | With BigQuery, you can query GHTorrent's MySQL dataset using an SQL-like 24 | language (lately, BigQuery also supports vanilla SQL); more importantly, you can 25 | join the dataset with other open datasets (e.g. GitHub's own project data, Reddit, 26 | [TravisTorrent](https://travistorrent.testroots.org/page_access/) etc) hosted on BigQuery. 27 | 28 | To get the most popular programming languages by number of bytes written, 29 | run the following: 30 | 31 | {% highlight sql %} 32 | select pl3.lang, sum(pl3.size) as total_bytes 33 | from ( 34 | select pl2.bytes as size, pl2.language as lang 35 | from ( 36 | select pl.language as lang, max(pl.created_at) as latest, pl.project_id as project_id 37 | from [ght.project_languages] pl 38 | join [ght.projects] p on p.id = pl.project_id 39 | where p.deleted is false 40 | and p.forked_from is null 41 | group by lang, project_id 42 | ) pl1 join [ght.project_languages] pl2 on pl1.project_id = pl2.project_id 43 | and pl1.latest = pl2.created_at 44 | and pl1.lang = pl2.language 45 | ) pl3 46 | group by pl3.lang 47 | order by total_bytes desc 48 | {% endhighlight %} 49 | 50 | To get the user with the most Java commits in the Netherlands in June 2016, 51 | do the following: 52 | 53 | {% highlight sql %} 54 | select u.login as login, u.location as location, count(c.id) as num_commits 55 | from [ghtorrent-bq.ght.project_commits] pc join 56 | (SELECT id, author_id FROM [ghtorrent-bq.ght.commits] WHERE 57 | date(created_at) between date('2016-06-01') 58 | and date('2016-07-01') )c on pc.commit_id = c.id join 59 | (SELECT id 60 | FROM
[ghtorrent-bq.ght.projects] WHERE language = 'Java') p on p.id = pc.project_id join 61 | (SELECT login, location, id 62 | FROM [ghtorrent-bq.ght.users] 63 | WHERE country_code = 'nl') u on c.author_id = u.id 64 | group by login, location 65 | order by num_commits desc; 66 | {% endhighlight %} 67 | 68 | See also some queries by [Felipe Hoffa](https://medium.com/@hoffa/github-top-countries-201608-13f642493773). 69 | 70 | ### Pub/Sub 71 | 72 | Pub/Sub allows subscribers to get events of what is happening on GitHub (or at 73 | least GHTorrent's interpretation of what is happening on GitHub) in almost real time. 74 | To do so, one needs to *subscribe* to one of the available *topics* with 75 | a client in order to start receiving *events*. 76 | 77 | The service is complementary, even though less fine-grained, to GHTorrent's own 78 | [streaming interface](streaming.html). As is also the case with GHTorrent 79 | streaming, the contents of the streams are generated by following the live 80 | MongoDB server replication stream. See the code [here](https://github.com/ghtorrent/ghtorrent-streaming). 81 | 82 | To subscribe to a topic, e.g. 
`commits`, run the following: 83 | 84 | ``` 85 | gcloud beta pubsub subscriptions create my_commits_subscription --topic projects/ghtorrent-bq/topics/commits 86 | ``` 87 | 88 | To start receiving events, you can try the command line 89 | 90 | ``` 91 | gcloud beta pubsub subscriptions pull --auto-ack --max-messages 5 -- my_commits_subscription 92 | ``` 93 | 94 | The available topics are the following: 95 | 96 | {% highlight txt%} 97 | projects/ghtorrent-bq/topics/commits 98 | projects/ghtorrent-bq/topics/events 99 | projects/ghtorrent-bq/topics/followers 100 | projects/ghtorrent-bq/topics/forks 101 | projects/ghtorrent-bq/topics/issue_comments 102 | projects/ghtorrent-bq/topics/issue_events 103 | projects/ghtorrent-bq/topics/issues 104 | projects/ghtorrent-bq/topics/org_members 105 | projects/ghtorrent-bq/topics/pull_request_comments 106 | projects/ghtorrent-bq/topics/pull_requests 107 | projects/ghtorrent-bq/topics/repo_collaborators 108 | projects/ghtorrent-bq/topics/repo_labels 109 | projects/ghtorrent-bq/topics/repos 110 | projects/ghtorrent-bq/topics/users 111 | projects/ghtorrent-bq/topics/watchers 112 | {% endhighlight %} 113 | 114 | -------------------------------------------------------------------------------- /pullreq-perf/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Pull Request Performance reports 4 | tagline: 5 | --- 6 | 7 | ```{r preample, include=FALSE} 8 | 9 | # 10 | # (c) 2012 -- 2016 Georgios Gousios 11 | # 12 | # BSD licensed, see LICENSE in top level dir 13 | # 14 | 15 | projectsjs = paste(apply(projects, 1, function(x){sprintf("'%s-%s'", x[1], x[2])}), collapse = ",") 16 | projectsmd = paste(apply(projects, 1, function(x){sprintf("[%s/%s](/pullreq-perf/%s-%s/)", x[1], x[2], x[1], x[2])}), collapse="\n") 17 | ``` 18 | 19 | See here some reports for popular repositories: 20 | 21 |
22 | Ruby on Rails 23 | IPython 24 | Angular.js 25 | Node.js 26 | JQuery 27 | Homebrew 28 | Akka 29 | Shiny 30 |
31 |
32 | D3.js 33 | impress.js 34 | Jekyll 35 | Django 36 | Redis 37 | Bitcoin 38 | RxJava 39 | Tensorflow 40 | 41 |
42 | 43 | 44 | 45 | 46 | 47 | 48 | 62 | 63 | ### What is this report about? 64 | 65 | The report presents data on various aspects of pull request related activity 66 | within a project repository, with a special focus on how open the project 67 | is to external contributions. 68 | 69 | ### How did you choose the projects to analyze? 70 | 71 | As a starting point, I used the 72 | [repository set](https://github.com/gousiosg/pullreqs/blob/master/projects.txt) 73 | that I also use for my 74 | [research](http://www.gousios.gr/bibliography/GPD14.html) 75 | [work](http://www.gousios.gr/bibliography/GZ14.html). 76 | Then, I added to this selection, the top 1000 repositories by number of stars 77 | as reported by the GHTorrent database. I also added the top 50 projects in 78 | terms of 79 | [total contributions received](http://www.gousios.gr/blog/The-triumph-of-online-collaboration/). 80 | The actual list of projects can be found 81 | [here](https://github.com/gousiosg/ghtorrent.org/blob/master/pullreq-perf/projects.txt) 82 | or by clicking on the "Show all repos" button below. 83 | 84 | ### How did you build it? 85 | 86 | I used R to query GHTorrent's main MySQL database, then piped the results 87 | through [knitr](http://yihui.name/knitr/) templates which use 88 | [ggplot2](http://ggplot2.org/) for generating the nice plots. 89 | You can find the code [here](https://github.com/ghtorrent/ghtorrent.org/tree/master/pullreq-perf). 90 | 91 | ### My repository is not included! 92 | 93 | You can [send me an email](mailto:gousiosg@gmail.com) and I will make sure your 94 | repository will be included in the next round of report generation. You can 95 | also [edit this file](https://github.com/ghtorrent/ghtorrent.org/blob/master/pullreq-perf/projects.txt) and send a pull request. 96 | 97 | ### The report for my project is just plain wrong! 
98 | 99 | If you have time to explain to me what is wrong, I would appreciate it if you 100 | [send me an email](mailto:gousiosg@gmail.com) and I will fix 101 | the issue. 102 | 103 | 106 | 107 |
108 | 109 | {% markdown %} 110 | `r projectsmd` 111 | {% endmarkdown %} 112 | 113 |
114 | 115 | 116 |
117 | 118 | 119 | 133 | -------------------------------------------------------------------------------- /streaming.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Streaming updates from GHTorrent 4 | tagline: 5 | --- 6 | 7 | ## Connection details 8 | 9 | To obtain access, please send us your public key [as described here](services.html). 10 | 11 | ``` 12 | ssh -L 5672:streamer.ghtorrent.org:5672 ghtorrent@streamer.ghtorrent.org 13 | ``` 14 | 15 | This will create a local port 5672 to which you can connect your AMQP client. 16 | No shell is allocated for security reasons. 17 | 18 | ## Declaring queues 19 | 20 | Our queue server, [RabbitMQ](https://www.rabbitmq.com), implements the 21 | [AMQP protocol](https://en.wikipedia.org/wiki/Advanced_Message_Queuing_Protocol). Some familiarity with it is necessary 22 | before using the streaming service. The [RabbitMQ getting started page](https://www.rabbitmq.com/getstarted.html) is 23 | a very good starting point with lots of examples in many languages. 24 | 25 | The streaming service uses topic exchanges and consequently message-based 26 | routing (see [here](https://www.rabbitmq.com/tutorials/tutorial-five-python.html) for details). To start receiving messages, a client needs to: 27 | 28 | 1. connect to the server 29 | 2. declare a queue 30 | 3. bind the declared queue to the `ght-streams` exchange with a routing key 31 | 32 | The following examples are in Ruby. 33 | 34 | ### Connecting to the server 35 | 36 | Assuming your connection works as described above, you should have port 37 | 5672 listening on localhost. You should connect and define the `ght-streams` 38 | exchange (if you define other exchanges, you will receive no messages 39 | as there is no script posting to them). 
40 | 41 | {% highlight ruby%} 42 | #!/usr/bin/env ruby 43 | 44 | require 'bunny' 45 | conn = Bunny.new(:host => '127.0.0.1', :port => 5672, 46 | :username => 'streamer', :password => 'streamer') 47 | conn.start 48 | ch = conn.create_channel 49 | exchange = ch.topic('ght-streams', :durable => true) 50 | {% endhighlight%} 51 | 52 | ### Declaring a queue 53 | 54 | You can declare as many queues as you want (within reasonable limits). To 55 | make the queue unique, we ask you to prefix your queue name with your 56 | username (e.g. `gousiosg_queue`). You should also make your queue 57 | non persistent, to avoid consuming server resources when your program 58 | finishes. 59 | 60 | {% highlight ruby%} 61 | q = ch.queue("gousiosg_queue", :auto_delete => true) 62 | {% endhighlight%} 63 | 64 | ### Binding queues to routing keys 65 | 66 | All messages posted to `ght-streams` exchange have an attached routing key. 67 | This allows clients to declare queues that selectively receive only 68 | the messages they are interested in. The routing key is structured as 69 | follows: 70 | 71 | {% highlight text%} 72 | prefix.{entity|event}.action 73 | {% endhighlight%} 74 | 75 | The `prefix` denotes the type of the updated item 76 | 77 | * `evt`: Denotes a GitHub event, as received by GHTorrent 78 | * `ent`: Denotes an update in a MongoDB collection 79 | 80 | The second part of the key denotes the updated item; its value depends on 81 | the `prefix`. 
The permitted values are the following: 82 | 83 | * For `evt` prefixes, it is the name of a [public GitHub event](https://developer.github.com/v3/activity/events/types/) shortened and lower-cased: 84 | `commitcomment`, 85 | `create`, 86 | `delete`, 87 | `deployment`, 88 | `deploymentstatus`, 89 | `download`, 90 | `follow`, 91 | `fork`, 92 | `forkapply`, 93 | `gist`, 94 | `gollum`, 95 | `issuecomment`, 96 | `issues`, 97 | `member`, 98 | `membership`, 99 | `pagebuild`, 100 | `public`, 101 | `pullrequest`, 102 | `pullrequestreviewcomment`, 103 | `push`, 104 | `release`, 105 | `repository`, 106 | `status`, 107 | `teamadd`, 108 | `watch` 109 | 110 | * For `ent` prefixes, it is the name of the MongoDB collection that was updated. One of: 111 | `commit_comments`, 112 | `commits`, 113 | `followers`, 114 | `forks`, 115 | `geo_cache`, 116 | `issue_comments`, 117 | `issue_events`, 118 | `issues`, 119 | `org_members`, 120 | `pull_request_comments`, 121 | `pull_requests`, 122 | `repo_collaborators`, 123 | `repo_labels`, 124 | `repos`, 125 | `users`, 126 | `watchers` 127 | 128 | The third part of the routing key denotes the update action. 
The allowed 129 | values are (this only applies to `ent` type messages; `evt` type messages 130 | are only marked as `insert`): 131 | 132 | * `insert`: An insertion of a record to a MongoDB collection 133 | * `delete`: A deletion of a MongoDB record 134 | * `update`: An update to a MongoDB record 135 | 136 | Let's see some example routing keys: 137 | 138 | * `ent.repos.insert`: This will retrieve all new inserts to the `repos` 139 | collection 140 | * `evt.fork.*`: This will retrieve all fork events 141 | * `ent.*.update`: This will retrieve all updates on MongoDB collections 142 | * `*.*.insert`: This will retrieve all new events and all MongoDB inserts 143 | 144 | {% highlight ruby%} 145 | q.bind(exchange, :routing_key => "evt.fork.*") 146 | q.subscribe do |delivery_info, metadata, payload| 147 | puts "#{delivery_info.routing_key}: #{payload}" 148 | end 149 | {% endhighlight %} 150 | 151 | ## Things to consider 152 | 153 | * Queues are configured to be garbage collected when the client that declared them has been disconnected. 154 | * Messages have a pre-configured Time-To-Live equal to 1 minute. If your client 155 | is not fast enough, they will be discarded. For this reason, we recommend 156 | client-side buffering of unprocessed messages. 157 | * All exchanges not named `ght-streams` are deleted every 5 minutes. 158 | * All queues not prefixed with `username_` are deleted every 5 minutes. 159 | -------------------------------------------------------------------------------- /pers-data.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Access to personal data 4 | tagline: 5 | --- 6 | 7 | ## Accessing personal data 8 | 9 | **Update Jun 2018: GDPR is in effect, which means that until GHTorrent's legal status 10 | is cleared, we cannot distribute this data anymore.** 11 | 12 | As of Mar 2016, the GHTorrent project does not offer personal data 13 | (namely, emails and real names) for download. 
For research purposes, 14 | you can request access to a file containing a mapping between 15 | logins and personal data. 16 | 17 | To access the file containing personal data, you will need to [edit this page](https://github.com/ghtorrent/ghtorrent.org/blob/master/pers-data.md) to include the following details. 18 | When your pull request has been accepted, we will mail you the link 19 | to the data. 20 | 21 | {%highlight html%} 22 |
23 |
24 |
Job Title 25 | email
26 | 27 |
Date of request
28 |
The actual date
29 | 30 |
Why do you need the personal data?
31 |
Provide an explanation
32 | 33 |
34 | {%endhighlight%} 35 | 36 | ## People with access to personal data 37 | 38 | #### Georgios Gousios 39 |
40 |
Researcher
41 |
Georgios Gousios, Assistant Prof. Radboud University Nijmegen, 42 | g.gousios@cs.ru.nl
43 | 44 |
Date of request
45 |
Mar 14, 2016
46 | 47 |
Intended use
48 |
Maintenance of the GHTorrent internal databases.
49 | 50 |
51 | 52 |
53 |
Researcher
54 |
Diomidis Spinellis, Professor, Athens University of Economics and Business, Greece, 55 | dds@aueb.gr
56 | 57 |
Date of request
58 |
July 1, 2016
59 | 60 |
Intended use
61 |
Research regarding commit practices of company employees. Correlate projects with commits through git blame.
62 | 63 |
64 | 65 |
66 |
Researcher
67 |
Tong WANG, Lecturer, University of Edinburgh 68 | tong.wang@ed.ac.uk
69 | 70 |
Date of request
71 |
Aug. 30, 2016
72 | 73 |
Intended use
74 |
Research regarding Open Source software network, especially focus on the interaction between programming habitants and company employees
75 | 76 |
77 | 78 |
79 |
Researcher
80 |
Chris Chabot, Semmle.com 81 | chabotc@semmle.com
82 | 83 |
Date of request
84 |
Dec. 11, 2016
85 | 86 |
Intended use
87 |
Normalizing and de-duplicating of author contribution data on our free for open source lgtm.com project, which provides source code analysis and fault detection, as well as showing coding velocity and quality per author and organization
88 | 89 |
90 | 91 |
92 |
Undergraduate
93 |
Davide Primiceri, Student Computer Science, University of Bari, Italy. 94 | d.primiceri@studenti.uniba.it
95 | 96 |
21 April, 2017
97 |
98 | 99 |
Needed for Degree Thesis
100 |
I am doing my degree thesis on the topic 'Evaluating the effects of multitasking among the open source projects of GitHub'. In order to do my analysis work, i need to combine GitHub data with Travis data. Thus i require the name and other login details of all users. Kindly share the personal data with me. 101 |
102 | 103 |
104 | 105 |
106 |
Researcher
107 |
Bogdan Vasilescu, Assistant Professor, School of Computer Science, Carnegie Mellon University 108 | vasilescu@cmu.edu
109 | 110 |
Date of request
111 |
June 1, 2017
112 | 113 |
Intended use
114 |
Research regarding gender diversity in GitHub teams.
115 | 116 |
117 | 118 |
119 |
Graduate Student
120 |
Farhana Sarker, Computer Science Graduate Student, College of Engineering, University of California Davis 121 | fasarker@ucdavis.edu
122 | 123 |
Date of request
124 |
September 16, 2017
125 | 126 |
Intended use
127 |
Research regarding multitasking in GitHub teams.
128 | 129 |
130 | 131 |
132 |
Researcher
133 |
Guanliang Chen, PhD candidate, Web Information Systems group, EEMCS, TU Delft guanliang.chen@tudelft.nl
134 | 135 |
Date of request
136 |
Oct 30, 2017
137 | 138 |
Intended use
139 |
To match learners in edX and investigate to what extent learners from programming MOOCs applied the knowledge into practice.
140 |
141 | 142 |
143 |
Postdoctoral Researcher
144 |
Ayushi Rastogi, UC Irvine 145 | ayushir@ics.uci.edu
146 | 147 |
Date of request
148 |
November 30, 2017
149 | 150 |
Why do you need the personal data?
151 |
My research focus is empirical software engineering, with a particular interest in human traits, team performance, and collaboration patterns.
152 | 153 |
154 | 155 |
156 |
PhD Student
157 |
Harsh Ketkar, University of Michigan 158 | hketkar@umich.edu
159 | 160 |
Date of request
161 |
January 18, 2018
162 | 163 |
Why do you need the personal data?
164 |
I am researching how contribution patterns of individual developers change over time and across platforms.
165 | 166 |
167 | 168 |
169 |
Researcher
170 |
Emerson Murphy-Hill, North Carolina State University, 171 | emerson@csc.ncsu.edu
172 | 173 |
Date of request
174 |
April 24, 2018
175 | 176 |
Why do you need the personal data?
177 |
I will use email addresses to cross-reference GitHub accounts with social media accounts.
178 | 179 |
180 | 181 | ## Disclaimer 182 | 183 | The data is provided as is with no further guarantees of data quality or law 184 | compliance. Redistribution is *strictly not* allowed! The GHTorrent project is 185 | not responsible for any illegal uses of the provided data. 186 | -------------------------------------------------------------------------------- /halloffame.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Hall of Fame 4 | tagline: 5 | --- 6 | 7 | ### Donations 8 | 9 | * [17 Nov 2015] Microsoft donated $98,000 in Azure credits 10 | * [30 Oct 2016] Google donated $1000 in Google Cloud credits 11 | 12 | The project would also like to thank the anonymous donors for their 13 | generocity. GHTorrent will become a better project thanks to you! 14 | 15 | ### Papers using GHTorrent 16 | 17 | This list is a subset of researchers who have used GHTorrent for research or 18 | teaching. If you are a user of the dataset, please consider adding your details. 19 | You can do it using the following simple steps: 20 | 21 | * Add information about your organization and yourself to [this file on 22 | Github](https://github.com/gousiosg/ghtorrent.org/blob/master/halloffame.md). You should describe how you used GHTorrent in a few lines. It is OK to include links. Please ensure that institution names are listed in alphabetic order. 23 | 24 | * If you are interested to link your publications referencing GHTorrent, you should include a Bibtex record in [this file](https://github.com/gousiosg/ghtorrent.org/blob/master/_bibliography/references.bib) on Github. You can then reference them in [this file](https://github.com/gousiosg/ghtorrent.org/blob/master/halloffame.md). 25 | 26 | #### [Inria/Mines Nantes/LINA/AtlanMod](http://www.emn.fr/z-info/atlanmod/index.php/Main_Page) 27 | * [Jordi Cabot](http://modeling-languages.com): Research on usage of issue labels in GitHub. 28 | 1. {% reference cabotSaner2015 %} 29 | 2. 
{% reference canovasSaner2015 %} 30 | 31 | #### [NUDT/Trustie](http://www.trustie.com/) 32 | * [Yue Yu](http://yuyue.github.io): Research on reviewer recommendation, and latency of pull requests. Used GHTorrent to extract our dataset. 33 | 1. {% reference YuRR14 %} 34 | 2. {% reference yue2015wait %} 35 | 36 | #### [Radboud University Nijmegen/DS](http://www.ru.nl/ds/) 37 | * [Georgios Gousios](http://www.gousios.gr): Maintenance, qualitative research on pull requests, [pull request prioritization](http://ghtorrent.org/prioritizer), developer profiles 38 | 1. {% reference GZSD15 %} 39 | 2. {% reference HG15 %} 40 | 3. {% reference VGZ15 %} 41 | 42 | #### [TU Delft/SERG](http://swerl.tudelft.nl/bin/view/Main/WebHome) 43 | * [Georgios Gousios](http://www.gousios.gr): Initial design and implementation. Project hosting. Lean GHTorrent. Research on pull requests. Project openness reports. 44 | 1. {% reference GS12 %} 45 | 2. {% reference G13 %} 46 | 3. {% reference GPD14 %} 47 | 4. {% reference GZ14 %} 48 | 5. {% reference GVSZ14 %} 49 | 50 | #### [TU Eindhoven/SET](http://www.tue.nl/en/university/departments/mathematics-and-computer-science/research/research-programs-computer-science/section-model-driven-software-engineering-mdse/set/) 51 | * [Bogdan Vasilescu](http://bvasiles.github.io/): Integration of GitHub and Stack Overflow data. Research on productivity of GitHub developers. Sentiment analysis of GitHub discussions. Lean GHTorrent. Continuous integration in GitHub. 52 | * [Alexander Serebrenik](http://www.win.tue.nl/~aserebre/): Research on productivity of GitHub developers. Sentiment analysis of GitHub discussions. Research on continuous integration in GitHub. 53 | 1. {% reference VSF12 %} 54 | 2. {% reference GVSZ14 %} 55 | 3. {% reference PVS14 %} 56 | 4. 
{% reference vasilescu2014ci %} 57 | 58 | #### [University of California, Davis/DECAL](http://decallab.cs.ucdavis.edu) 59 | * [Bogdan Vasilescu](http://bvasiles.github.io/): Research on effects of diversity in GitHub teams. 60 | 1. {% reference vasilescu2015gender %} 61 | 2. {% reference vasilescu2015chase %} 62 | 63 | #### [University of Victoria/SEGAL](http://thesegalgroup.org) 64 | * [Kelly Blincoe](http://thesegalgroup.org/people/kelly-blincoe): Research on Implicit Coordination and its impact on productivity. 65 | * [Eirini Kalliamvakou](http://thesegalgroup.org/people/eirini-kalliamvakou): Research on collaborative development using decentralized workflows and GitHub. Used GHTorrent to extract information about pull requests for potential mining perils. 66 | 1. {% reference KGBSGD14 %} 67 | 68 | #### [University of Trier/SE](http://st.uni-trier.de/) 69 | * [Sebastian Baltes](http://sbaltes.com/): Research on the usage of Stack Overflow code snippets in GitHub projects, its licensing implications, and developers' awareness. 70 | 1. {% reference BaltesDiehl2018 %} 71 | 72 | ### API keys contributors 73 | 74 | The following people's contributions of GitHub OAuth API keys have allowed 75 | the data collection process to catch up with GitHub's 10x growth since the 76 | GHTorrent project started. If you would like to contribute an API key, 77 | please follow the process specified [here](http://ghtorrent.org/services.html). 
78 | 79 | [Bram Adams](http://mcis.polymtl.ca/bram.html), 80 | [Maryi Arciniegas Méndez](http://thechiselgroup.org/members/), 81 | [Syed Arefinul Haque](https://uiu-bd.academia.edu/SyedArefinulHaque), 82 | [Efthimia Aivaloglou](https://www.linkedin.com/pub/efthimia-aivaloglou/4/244/966), 83 | [Alberto Bacchelli](http://sback.it), 84 | [Moritz Beller](http://www.st.ewi.tudelft.nl/~mbeller/), 85 | [Matthieu Bizien](https://www.linkedin.com/in/matthieubizien/en), 86 | Erik Bowers, 87 | [Frederic Gingras](http://fredericgingras.ca), 88 | [Roberta de Souza Coelho](https://www.dimap.ufrn.br/~roberta/), 89 | [Victor Costan](http://www.costan.us), 90 | [Ayushi Dalmia](https://researchweb.iiit.ac.in/~ayushi.dalmia/), 91 | Jos Demmers, 92 | [Arie van Deursen](http://www.st.ewi.tudelft.nl/~arie/), 93 | [Niel Ernst](http://neilernst.net), 94 | [Joe Fleming](http://joefleming.net), 95 | [Georgios Gousios](http://gousios.gr), 96 | [Samarendra M Hedaoo](http://fortyplustwo.net), 97 | [Mark Hills](http://www.cs.ecu.edu/hillsma/), 98 | [Arun Kalyanasundaram](http://www.cs.cmu.edu/~arunkaly/), 99 | [Syafiq Kamarul Azman](https://www.kaggle.com/syaffers), 100 | Lindsey Lanier, 101 | Pablo Loyola, 102 | Yao Lu, 103 | [Mahdi Moqri](http://www.moqri.com), 104 | Graeme Nathan, 105 | [Matteo Orrù](), 106 | [Gustavo Pinto](http://gustavopinto.org), 107 | [Dominic Safaric](https://github.com/dsafaric), 108 | Jasmine Sandhu, 109 | [Alexander Serebrenik](http://www.win.tue.nl/~aserebre/), 110 | [Diomidis Spinellis](http://www.dmst.aueb.gr/dds/), 111 | Simon Symeonidis, 112 | [Chris Thompson](http://www.cs.berkeley.edu/~cthompson/), 113 | [Peter Tröger](http://www.troeger.eu), 114 | [Bogdan Vasilescu](http://bvasiles.github.io), 115 | Marko Vit, 116 | [Meike Wiemann](https://twitter.com/weidenfreak), 117 | [Yue Yu](http://fisher.trustie.net/), 118 | [Alexey Zagalsky](http://alexeyza.com), 119 | [Andy Zaidman](http://www.st.ewi.tudelft.nl/~zaidman/), 120 | [Nosheen 
Zaza](http://www.people.usi.ch/zazan/) 121 | -------------------------------------------------------------------------------- /_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ page.title }} 6 | 7 | 8 | {% if page.description %}{% endif %} 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 72 | 73 |
74 |
75 |
76 | 83 |
84 | Sponsors 85 |
86 |
87 | Microsoft logo 89 |
90 |
91 | Radboud University logo 93 |
94 |
95 | TU Delft logo 97 |
98 |
99 |
100 | Become a sponsor 101 |
102 | 103 | 104 | 105 | 106 |
107 |
108 |
109 |
110 |
111 | {{ content }} 112 |
113 |
114 |
115 |
116 | 117 | 118 | 125 | 133 | 134 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /ght-ubuntu.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: GHTorrent optimized on Ubuntu 10.10 4 | tagline: 5 | --- 6 | 7 | This is a from scratch installation script for GHTorrent running on Ubuntu 8 | 10.10. It has been tuned for cloud Ubuntu installations (e.g. Azure). This 9 | setup will handle installations in the order of 10's of thousands repositories 10 | on a D3 Azure VM (2 CPUs, 7GB RAM). 11 | 12 | ## Install essentials 13 | 14 | {% highlight bash %} 15 | sudo apt-get install -y git ntp mdadm lvm2 libssl-dev parallel 16 | {% endhighlight %} 17 | 18 | ## Configure RAID 19 | 20 | {% highlight bash %} 21 | sudo bash 22 | # partition the devices 23 | (echo -e "o\nn\np\n1\n\n\nt\nfd\nw" | fdisk /dev/sdc ) || exit 1 24 | (echo -e "o\nn\np\n1\n\n\nt\nfd\nw" | fdisk /dev/sdd ) || exit 1 25 | 26 | # start the array and write out its config 27 | mdadm --create /dev/md127 --level 0 --raid-devices 2 /dev/sdc1 /dev/sdd1 28 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf 29 | {% endhighlight %} 30 | 31 | ## Configure filesystem on RAID 32 | 33 | {% highlight bash %} 34 | sudo bash 35 | mkfs -t ext4 /dev/md127 36 | id=`blkid|grep md127|cut -f2 -d'"'` 37 | mkdir /data 38 | echo "UUID=$id /data ext4 defaults,noatime,nobootwait,optional 0 0" >> /etc/fstab 39 | {% endhighlight %} 40 | 41 | ## Install required Ruby 42 | 43 | {% highlight bash %} 44 | sudo apt-get install ruby ruby2.2 ruby2.2-dev build-essential 45 | 46 | sudo update-alternatives --install /usr/bin/ruby ruby /usr/bin/ruby2.2 400 \ 47 | --slave /usr/bin/rake rake /usr/bin/rake2.2 \ 48 | --slave /usr/bin/ri ri /usr/bin/ri2.2 \ 49 | --slave /usr/bin/rdoc rdoc /usr/bin/rdoc2.2 \ 50 | --slave /usr/bin/gem gem /usr/bin/gem2.2 \ 51 | --slave /usr/bin/irb irb /usr/bin/irb2.2 \ 52 | --slave 
/usr/share/man/man1/ruby.1.gz ruby.1.gz /usr/share/man/man1/ruby2.2.1.gz \ 53 | --slave /usr/share/man/man1/rake.1.gz rake.1.gz /usr/share/man/man1/rake2.2.1.gz \ 54 | --slave /usr/share/man/man1/ri.1.gz ri.1.gz /usr/share/man/man1/ri2.2.1.gz \ 55 | --slave /usr/share/man/man1/rdoc.1.gz rdoc.1.gz /usr/share/man/man1/rdoc2.2.1.gz \ 56 | --slave /usr/share/man/man1/gem.1.gz gem.1.gz /usr/share/man/man1/gem2.2.1.gz \ 57 | --slave /usr/share/man/man1/irb.1.gz irb.1.gz /usr/share/man/man1/irb2.2.1.gz 58 | {% endhighlight %} 59 | 60 | ## Install MySQL (its MariaDB variant) 61 | 62 | We are currently using MariaDB as it can handle complex queries better than 63 | stock MySQL 5.6. If you prefer MySQL, skip the MariaDB installation script 64 | below. 65 | 66 | You can set any password for the root user in MySQL. 67 | 68 | {% highlight bash %} 69 | sudo apt-get install -y software-properties-common 70 | sudo apt-key adv --recv-keys --keyserver hkp://keyserver.ubuntu.com:80 0xcbcb082a1bb943db 71 | sudo add-apt-repository 'deb http://mariadb.mirror.triple-it.nl//repo/10.1/ubuntu wily main' 72 | sudo apt-get update 73 | sudo apt-get install -y mariadb-server percona-toolkit libmariadbclient-dev 74 | {% endhighlight %} 75 | 76 | Then, move data files to the RAID array. 
77 | 78 | {% highlight bash %} 79 | sudo service mysql stop 80 | sudo mkdir /data/mysql 81 | sudo chown mysql:mysql /data/mysql 82 | sudo mkdir /mnt/mysql 83 | sudo chown mysql:mysql /mnt/mysql 84 | sudo rsync -av /var/lib/mysql /data/mysql 85 | sudo service mysql start 86 | {% endhighlight %} 87 | 88 | ## Configure MySQL/MariaDB 89 | 90 | {% highlight bash %} 91 | sudo service mysql stop 92 | sudo vi /etc/mysql/my.cnf 93 | 94 | ### change the following 95 | datadir = /data/mysql 96 | tmpdir = /mnt/mysql 97 | innodb_buffer_pool_size=4GB 98 | ### 99 | 100 | sudo service mysql start 101 | {% endhighlight %} 102 | 103 | ## Install MongoDB 104 | 105 | We install the latest MongoDB (3.0.x) from MongoDB's central repo and use 106 | WiredTiger as the storage engine due to huge space savings. 107 | 108 | {% highlight bash %} 109 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10 110 | echo "deb http://repo.mongodb.org/apt/ubuntu trusty/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list 111 | sudo apt-get update 112 | sudo apt-get install -y mongodb-org 113 | 114 | echo << EOF > /lib/systemd/system/mongodb.service 115 | [Unit] 116 | Description=High-performance, schema-free document-oriented database 117 | After=network.target 118 | 119 | [Service] 120 | User=mongodb 121 | ExecStart=/usr/bin/mongod --quiet --config /etc/mongod.conf 122 | 123 | [Install] 124 | WantedBy=multi-user.target 125 | EOF 126 | 127 | sudo systemctl daemon-reload 128 | sudo service mongodb start 129 | 130 | # Stop it to move dirs to the right places 131 | sudo service mongodb stop 132 | sudo bash 133 | 134 | mkdir /data/mongodb 135 | chown mongodb:mongodb /data/mongodb 136 | 137 | echo << EOF > /etc/mongod.conf 138 | storage: 139 | dbPath: "/data/mongodb" 140 | engine: "wiredTiger" 141 | wiredTiger: 142 | collectionConfig: 143 | blockCompressor: snappy 144 | engineConfig: 145 | cacheSizeGB: 4 # Configure this if you have more RAM 146 |
systemLog: 147 | destination: file 148 | path: "/var/log/mongodb/mongodb.log" 149 | logAppend: true 150 | timeStampFormat: iso8601-utc 151 | 152 | net: 153 | bindIp: "0.0.0.0" 154 | port: 27017 155 | EOF 156 | 157 | service mongodb start 158 | {% endhighlight %} 159 | 160 | ### Install and configure RabbitMQ 161 | 162 | {% highlight bash %} 163 | sudo apt-get install rabbitmq-server 164 | sudo rabbitmqctl add_user ghtorrent ghtorrent 165 | sudo rabbitmqctl set_permissions -p / ghtorrent ".*" ".*" ".*" 166 | sudo rabbitmq-plugins enable rabbitmq_management 167 | sudo rabbitmqctl set_user_tags ghtorrent administrator 168 | {% endhighlight %} 169 | 170 | ### Install and configure GHTorrent 171 | 172 | {% highlight bash %} 173 | cd $HOME 174 | git clone https://github.com/gousiosg/github-mirror.git 175 | cd github-mirror 176 | 177 | sudo gem install bundler 178 | sudo bundle install 179 | sudo gem install mysql2 180 | 181 | cp config.yaml.tmpl config.yaml 182 | vi config.yaml 183 | {% endhighlight %} 184 | 185 | Use the following contents for the config.yaml file 186 | 187 | {% highlight yaml %} 188 | amqp: 189 | host: 127.0.0.1 # Queue's IP address 190 | port: 5672 191 | username: ghtorrent # Username to connect to the queue 192 | password: ghtorrent # password 193 | exchange: ghtorrent 194 | prefetch: 1 195 | 196 | sql: 197 | # Configuration URL for the SQL database subsystem. 
198 | # Examples: 199 | # - MySQL: mysql2://user:password@host/github 200 | # - Postgres: postgres://user:password@host/github 201 | # 202 | # On JRuby, you can use the JDBC-mysql driver that comes with JRuby 203 | # jdbc:mysql://localhost/github?user=github&password=github 204 | # 205 | # see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html 206 | # for details 207 | #url: sqlite://github.db 208 | url: mysql2://ghtorrent:ghtorrent@localhost/ghtorrent 209 | 210 | mirror: 211 | urlbase: "https://api.github.com/" 212 | persister: mongo #or noop 213 | # How many pages of historical content to retrieve when doing multi-page 214 | # API calls. 215 | history_pages_back: 1000 216 | # On a machine with multiple IP addresses, select the one to send the 217 | # HTTP requests from. 218 | # attach_ip: 0.0.0.0 219 | # Use your token here 220 | token: 221 | # Number of reqs/hour to do with the provided key 222 | req_limit: 4990 223 | # User agent to use for requests. You must use a unique name per client program 224 | user_agent: ghtorrent 225 | # Time to wait between geo location API requests 226 | geoloc_wait: 2 227 | 228 | mongo: 229 | host: 127.0.0.1 # Mongo's IP addr 230 | port: 27017 # Mongo's port 231 | db: ghtorrent # DB name to store commits to 232 | #username: github # User name to connect to Mongo 233 | #password: github # Password for mongo 234 | 235 | logging: 236 | # A unique string to appear in all messages produced by the invoking program. 237 | uniq: "ghtorrent" 238 | # debug < info < warn < error, for decreasing log output 239 | level: "info" 240 | # stdout or stderr to log to system streams. A file name to log to this file. 241 | file: "stdout" 242 | {% endhighlight %} 243 | 244 | GHTorrent is now ready to run. 
Self-apply to begin with: 245 | 246 | {% highlight bash %} 247 | cd $HOME/github-mirror 248 | ruby -Ilib bin/ght-retrieve-repo gousiosg github-mirror 249 | {% endhighlight %} 250 | 251 | 252 | -------------------------------------------------------------------------------- /_bibliography/references.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{GPD14, 2 | author = {Gousios, Georgios and Pinzger, Martin and Deursen, Arie van}, 3 | title = {An Exploratory Study of the Pull-based Software Development Model}, 4 | booktitle = {Proceedings of the 36th International Conference on Software Engineering}, 5 | Year = {2014}, 6 | series = {ICSE}, 7 | year = {2014}, 8 | isbn = {978-1-4503-2756-5}, 9 | location = {Hyderabad, India}, 10 | pages = {345--355}, 11 | numpages = {11}, 12 | doi = {10.1145/2568225.2568260}, 13 | acmid = {2568260}, 14 | publisher = {ACM}, 15 | address = {New York, NY, USA}, 16 | url = {http://www.gousios.gr/bibliography/GPD14.html}, 17 | } 18 | 19 | @inproceedings{G13, 20 | Author = {Georgios Gousios}, 21 | Title = {The {GHTorrent} dataset and tool suite}, 22 | Year = 2013, 23 | Month = May, 24 | Booktitle = {Proceedings of the 10th Working Conference on Mining Software Repositories}, 25 | series={MSR}, 26 | pages={233--236}, 27 | Location = {San Francisco, CA}, 28 | url = {http://www.gousios.gr/bibliography/G13.html}, 29 | award = {MSR2013: Best data showcase paper} 30 | } 31 | 32 | @inproceedings{GS12, 33 | Author = {Georgios Gousios and Diomidis Spinellis}, 34 | Booktitle = {Proceedings of the 9th Working Conference on Mining Software Repositories}, 35 | series={MSR}, 36 | Location = {Zurich, Switzerland}, 37 | Pages = {12--21}, 38 | Publisher = {IEEE}, 39 | Title = { {GHTorrent}: {GitHub}'s Data from a Firehose}, 40 | Year = 2012, 41 | doi = {10.1109/MSR.2012.6224294}, 42 | ISSN = {2160-1852}, 43 | url = {http://www.gousios.gr/bibliography/GS12.html} 44 | } 45 | 46 | 47 | @inproceedings{GZ14, 
48 | author = {Gousios, Georgios and Zaidman, Andy}, 49 | title = {A Dataset for Pull-based Development Research}, 50 | booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, 51 | series = {MSR}, 52 | year = {2014}, 53 | isbn = {978-1-4503-2863-0}, 54 | location = {Hyderabad, India}, 55 | pages = {368--371}, 56 | numpages = {4}, 57 | doi = {10.1145/2597073.2597122}, 58 | acmid = {2597122}, 59 | publisher = {ACM}, 60 | address = {New York, NY, USA}, 61 | url = {http://www.gousios.gr/bibliography/GZ14.html}, 62 | note = {MSR2014: Best data showcase paper}, 63 | } 64 | 65 | @inproceedings{GVSZ14, 66 | author = {Gousios, Georgios and Vasilescu, Bogdan and Serebrenik, Alexander and Zaidman, Andy}, 67 | title = {Lean GHTorrent: GitHub Data on Demand}, 68 | booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, 69 | series = {MSR}, 70 | year = {2014}, 71 | isbn = {978-1-4503-2863-0}, 72 | location = {Hyderabad, India}, 73 | pages = {384--387}, 74 | numpages = {4}, 75 | doi = {10.1145/2597073.2597126}, 76 | acmid = {2597126}, 77 | publisher = {ACM}, 78 | address = {New York, NY, USA}, 79 | keywords = {GitHub, data on demand, dataset}, 80 | url = {http://www.gousios.gr/bibliography/GVSZ14.html} 81 | } 82 | 83 | @inproceedings{VSF12, 84 | author = {Vasilescu, Bogdan and Filkov, Vladimir and Serebrenik, Alexander}, 85 | title = {Stack Overflow and GitHub: Associations between software development and crowdsourced knowledge}, 86 | booktitle = {Proceedings of the 2013 ASE/IEEE International Conference on Social Computing}, 87 | series = {SocialCom}, 88 | publisher = {IEEE}, 89 | year = {2013}, 90 | pages = {188--195}, 91 | doi = {http://dx.doi.org/10.1109/SocialCom.2013.35} 92 | } 93 | 94 | @inproceedings{PVS14, 95 | title={Security and Emotion: Sentiment Analysis of Security Discussions on {GitHub}}, 96 | author={Pletea, Daniel and Vasilescu, Bogdan and Serebrenik, Alexander}, 97 | booktitle={Proceedings of 
the 11th Working Conference on Mining Software Repositories}, 98 | series={MSR}, 99 | year={2014}, 100 | pages={384--387}, 101 | Location={Hyderabad, India}, 102 | organization={ACM} 103 | } 104 | 105 | @inproceedings{KGBSGD14, 106 | title={The Promises and Perils of Mining {GitHub}}, 107 | author={Kalliamvakou, Eirini and Gousios, Georgios and Blincoe, Kelly and Singer, Leif and German, Daniel M. and Damian, Daniela}, 108 | booktitle={Proceedings of the 11th Working Conference on Mining Software Repositories}, 109 | series={MSR}, 110 | year={2014}, 111 | pages={92--101}, 112 | Location={Hyderabad, India}, 113 | organization={ACM} 114 | } 115 | 116 | @inproceedings{YuRR14, 117 | author={Yue Yu and Huaimin Wang and Gang Yin and Ling, C.X.}, 118 | booktitle={Proceedings of the 2014 IEEE International Conference on Software Maintenance and Evolution}, 119 | series={ICSME}, 120 | title={Reviewer Recommender of Pull-Requests in {GitHub}}, 121 | year={2014}, 122 | pages={609--612}, 123 | doi={10.1109/ICSME.2014.107}, 124 | ISSN={1063-6773}, 125 | publisher = {IEEE}, 126 | } 127 | 128 | @inproceedings{yue2015wait, 129 | author = {Yu, Yue and Wang, Huaimin and Filkov, Vladimir and Devanbu, Premkumar and Vasilescu, Bogdan}, 130 | title = {Wait For It: Determinants of Pull Request Evaluation Latency on {GitHub}}, 131 | booktitle = {12th Working Conference on Mining Software Repositories}, 132 | year = {2015}, 133 | series = {MSR}, 134 | publisher = {IEEE}, 135 | note={to appear}, 136 | } 137 | 138 | @inproceedings{vasilescu2014ci, 139 | author = {Vasilescu, Bogdan and van Schuylenburg, Stef and Wulms, Jules and Serebrenik, Alexander and van den Brand, Mark G. 
J.}, 140 | title = {Continuous integration in a social-coding world: Empirical evidence from {GitHub}}, 141 | booktitle = {Proceedings of the 30th IEEE International Conference on Software Maintenance and Evolution, Early 142 | Research Achievements}, 143 | year = {2014}, 144 | series = {ICSME}, 145 | pages = {401--405}, 146 | publisher = {IEEE}, 147 | } 148 | 149 | @inproceedings{vasilescu2015gender, 150 | author = {Vasilescu, Bogdan and Posnett, Daryl and Ray, Baishakhi and van den Brand, Mark G. J. and Serebrenik, Alexander and Devanbu, Premkumar and Filkov, Vladimir}, 151 | title = {Gender and tenure diversity in {GitHub} teams}, 152 | booktitle = {Proceedings of the ACM {CHI} Conference on Human Factors in Computing Systems}, 153 | year = {2015}, 154 | series = {CHI}, 155 | publisher = {ACM}, 156 | note={to appear}, 157 | } 158 | 159 | @inproceedings{vasilescu2015chase, 160 | author = {Vasilescu, Bogdan and Filkov, Vladimir and Serebrenik, Alexander}, 161 | title = {Perceptions of Diversity on {GitHub}: A User Survey}, 162 | booktitle = {Proceedings of the 8th International Workshop on Cooperative and Human Aspects of Software Engineering}, 163 | year = {2015}, 164 | series = {CHASE}, 165 | publisher = {IEEE}, 166 | note={to appear}, 167 | } 168 | 169 | @inproceedings{cabotSaner2015, 170 | title = {{Exploring the Use of Labels to Categorize Issues in Open-Source Software Projects}}, 171 | author = {Cabot, Jordi and C\'anovas Izquierdo, Javier Luis and Cosentino, Valerio and Rolandi, Bel\'en}, 172 | booktitle = {Proceedings of the 22nd International Conference on Software Analysis, Evolution, and Reengineering (SANER)}, 173 | pages = {479--483}, 174 | year = {2015} 175 | } 176 | 177 | @inproceedings{canovasSaner2015, 178 | title = {{GiLA: GitHub Label Analyzer}}, 179 | author = {C\'anovas Izquierdo, Javier Luis and Cosentino, Valerio and Rolandi, Bel\'en and Bergel, Alexandre and Cabot, Jordi}, 180 | booktitle = {Proceedings of the 22nd International Conference 
on Software Analysis, Evolution, and Reengineering (SANER)}, 181 | pages = {550--554}, 182 | year = {2015} 183 | } 184 | 185 | @inproceedings{GZSD15, 186 | author = {Gousios, Georgios and Zaidman, Andy and Storey, Margaret-Anne and Deursen, Arie van}, 187 | title = {Work Practices and Challenges in Pull-Based Development: The Integrator’s Perspective}, 188 | booktitle = {Proceedings of the 37th International Conference on Software Engineering}, 189 | series = {ICSE 2015}, 190 | year = {2015}, 191 | location = {Florence, Italy} 192 | } 193 | 194 | @inproceedings{HG15, 195 | author = {Hauff, Claudia and Gousios, Georgios}, 196 | title = {Matching GitHub developer profiles to job advertisements}, 197 | booktitle = {Proceedings of the 12th International Conference on Mining 198 | Software Repositories}, 199 | year = {2015}, 200 | location = {Florence, Italy} 201 | } 202 | 203 | @inproceedings{VGZ15, 204 | author = {van der Veen, Erik and Gousios, Georgios and Zaidman, Andy}, 205 | title = {Automatically Prioritizing Pull Requests}, 206 | booktitle = {Proceedings of the 12th International Conference on Mining 207 | Software Repositories}, 208 | year = {2015}, 209 | location = {Florence, Italy} 210 | } 211 | 212 | @article{BaltesDiehl2018, 213 | author = {Baltes, Sebastian and Diehl, Stephan}, 214 | title = {{Usage and Attribution of Stack Overflow Code Snippets in GitHub Projects}}, 215 | journal = {{Empirical Software Engineering}}, 216 | year = {2018} 217 | } 218 | -------------------------------------------------------------------------------- /faq.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: GHTorrent FAQ 4 | tagline: 5 | --- 6 | 7 | This is the GHTorrent FAQ (work in progress). Please ask more questions using 8 | the form below or by editing [this 9 | file](https://github.com/ghtorrent/ghtorrent.org/blob/master/faq.md) directly on 10 | GitHub.
11 | 12 | ## General 13 | 14 | #### _What is GHTorrent?_ 15 | 16 | GHTorrent collects all information from the GitHub API and populates two 17 | databases with it: one with [raw data](mongo.html) and one with [linked 18 | entities](relational.html). Using this data, users can get insights just for 19 | their repositories or for the full state of OSS development on GitHub. 20 | 21 | GHTorrent has been extensively used by 22 | [researchers](halloffame.html), 23 | [companies](https://github.com/Microsoft/ghinsights) and OSS projects 24 | as a source of software process and product analytics. 25 | 26 | #### _Can I use GHTorrent for my research?_ 27 | 28 | Absolutely! [Lots of 29 | researchers](https://scholar.google.gr/scholar?cites=11132126230347149781) have 30 | [done so](halloffame.html). You can [download](downloads.html) the database dumps 31 | or use the [online access services](services.html) to get access to the data. 32 | 33 | When using GHTorrent data for research or large scale repository analysis, 34 | please consider the perils reported in [this paper](http://gousios.gr/bibliography/KGBSGD15.html). 35 | 36 | #### _Which license is GHTorrent distributed under?_ 37 | 38 | The GHTorrent dataset is distributed under a dual licensing scheme ([Creative Commons +](https://wiki.creativecommons.org/wiki/CCPlus)). 39 | 40 | For non-commercial uses (including, but not limited to, educational, research or personal uses), the dataset is distributed under the [CC-BY-SA](http://creativecommons.org/licenses/by-sa/4.0/) license. Creative Commons License 41 | 42 | For commercial uses, please [contact the maintainer](mailto:gousiosg@gmail.com) for more information. Usually, a sizable donation to the project will be enough 43 | to grant you full access.
44 | 45 | #### _Who is behind GHTorrent?_ 46 | 47 | GHTorrent was initially created and is currently maintained by [Georgios 48 | Gousios](http://gousios.org), with initial design support and ideas from 49 | [Diomidis Spinellis](http://spinellis.gr). Several users have contributed code, 50 | ideas and support over time. Here is a (hopefully not partial) list of them: 51 | 52 | Sebastian Bates, Derek Brown, Arie van Deursen, Daniel German, Jeff McAffer, Bogdan Vasilescu 53 | 54 | Financial support has been provided by the following organizations: 55 | 56 | * TU Delft: purchase and running costs for initial servers (2012 -- late 2015) 57 | * Microsoft: donation of Azure tokens for running the project infrastructure 58 | (late 2015 -- late 2016) 59 | 60 | #### _How is GHTorrent different from Github Archive?_ 61 | 62 | [Github Archive](http://githubarchive.org) collects and stores the GitHub event 63 | stream. In addition to that, GHTorrent applies dependency based retrieval on all 64 | entities (e.g. commits, pull requests etc) that are linked from the events and 65 | stores the results in two databases: a raw data one (MongoDB) that stores the 66 | unprocessed responses from GitHub API and a relational one (MySQL) that stores 67 | links between the entities (e.g. commits are linked to projects). Using 68 | GHTorrent, developers can obtain an up-to-date, relational view of their 69 | project’s GitHub metadata, which can be used for answering questions regarding 70 | their project’s processes. 71 | 72 | ## How can I...? 73 | 74 | #### _...contribute to GHTorrent?_ 75 | 76 | Please read the [contribution guide](contrib.html). 77 | 78 | #### _... cite the GHTorrent data set?_ 79 | 80 | Georgios Gousios: [The GHTorrent dataset and tool 81 | suite](http://www.gousios.gr/bibliography/G13.html).
MSR 2013: 233-236 82 | 83 | {%highlight text%} 84 | @inproceedings{Gousi13, 85 | author = {Gousios, Georgios}, 86 | title = {The GHTorrent dataset and tool suite}, 87 | booktitle = {Proceedings of the 10th Working Conference on Mining Software 88 | Repositories}, 89 | series = {MSR '13}, 90 | year = {2013}, 91 | isbn = {978-1-4673-2936-1}, 92 | location = {San Francisco, CA, USA}, 93 | pages = {233--236}, 94 | numpages = {4}, 95 | url = {http://dl.acm.org/citation.cfm?id=2487085.2487132}, 96 | acmid = {2487132}, 97 | publisher = {IEEE Press}, 98 | address = {Piscataway, NJ, USA}, 99 | } 100 | {%endhighlight%} 101 | 102 | #### _...download the data?_ 103 | 104 | You don't need to. GHTorrent offers a multitude of [online 105 | services](services.html) that enable access to almost realtime versions 106 | of the datastores. If you really want to, you can get all the data from 107 | the [downloads](downloads.html) page. 108 | 109 | #### _...use the data for my private project?_ 110 | 111 | See the licensing information above. 112 | 113 | ## Data processing 114 | 115 | #### _What quality guarantees does GHTorrent offer?_ 116 | 117 | The GHTorrent data come as is with no quality guarantees. However, we are 118 | actively seeking to fix systematic (i.e. errors that are repeated across the 119 | whole dataset) data collection errors. Please [open an 120 | issue](https://github.com/gousiosg/github-mirror/issues) if you find one. As 121 | GHTorrent is essentially a data sync operation over unreliable networks, 122 | spurious inconsistencies such as (minor) holes in data collection are 123 | unavoidable. 124 | 125 | #### _I've seen weird commit timestamps_ 126 | 127 | Git records the commit timestamp on the developer's workstation. If the clock 128 | is misconfigured, timestamps will be weird. We have seen timestamps such 129 | as `0000-01-01 00:00` or `2034-12-31 23:59`. GitHub and GHTorrent do not 130 | process the timestamps in any way.
131 | 132 | #### _My data is out of date_ 133 | 134 | GitHub only creates events when an entity is created and not when it is updated or deleted. It is therefore not possible to be completely up-to-date with changes in users (e.g. updated location) and repositories (e.g. renames). GHTorrent tries its best to stay up to date by refreshing all users and all repos every X months. As the DB contains 12M+ users and 30M+ repos, this process may take a while and it can also fail due to spurious reasons. 135 | 136 | ## Copyright and Privacy 137 | 138 | #### _Who owns the data that GHTorrent shares?_ 139 | 140 | The copyright situation is very complicated; in essence, GitHub owns copyright 141 | to the data formats for the API responses, users own copyright of the content 142 | they create and the GHTorrent creator has copyright on the GHTorrent database 143 | schemata. 144 | 145 | #### _What types of privacy guarantees does GHTorrent offer?_ 146 | 147 | GHTorrent collects publicly available data from the GitHub API. 148 | 149 | #### _How does GHTorrent handle my personal information?_ 150 | 151 | By personal information, we mean data that identify a real person uniquely. In 152 | the context of GHTorrent, these are emails and real names. 153 | 154 | As of Mar 2016, GHTorrent does not distribute any personal information by 155 | default. Researchers whose research requires access to personal data 156 | can use [this form](pers-data.html) to obtain it. 157 | 158 | #### _Can I get more information?_ 159 | 160 | Yes. Please read the following Slidedeck. If you are still in doubt, 161 | please contact us. 162 |
163 | 164 |
165 | 166 | #### _How can I opt out?_ 167 | 168 | We understand that being part of such a big dataset can have consequences for 169 | your online privacy. For this reason (and also to comply with legal data 170 | processing requirements), you can opt out of data collection. If you want to 171 | opt out, please [send us an email](mailto:gousiosg@gmail.com). 172 | 173 | Opting out means that we will replace your email in the database with 174 | `no-spam@ghtorrent.org` and remove your real name. 175 | 176 | #### _Contacting users for surveys_ 177 | 178 | (by @slang800) 179 | 180 | Contacting GitHub users is sometimes necessary for research projects, but 181 | certain people regard this as spam and do not appreciate it. Even people who 182 | would ordinarily be willing to help in surveys can end up becoming hostile to 183 | requests due to the frequency with which they receive them. Due to the number of 184 | researchers who study the free software community, we have to be mindful of how 185 | many emails we are sending. Here are some tips to avoid annoying people: 186 | 187 | - Do not contact users who have signed up for the 188 | [do-not-survey-list](https://github.com/slang800/do-not-survey-list). These 189 | users have explicitly stated that they don't want to be involved in surveys. 190 | 191 | - Try to limit your data collection to a sample of users. While it may be 192 | tempting to contact all 12 million developers, it is also likely to leave a 193 | bad impression of researchers in general, and can make future studies 194 | difficult. 195 | 196 | - Don't contact people repeatedly if they fail to reply to your first email. If 197 | they weren't interested the first time, follow-ups are much more likely to 198 | bother them. 199 | 200 | - Don't contact people who have chosen to hide their email address from their 201 | profile page. People frequently assume that hiding their email from their 202 | profile will prevent them from being contacted.
Sometimes, they don't even 203 | realize that every commit they make is signed with their email, so we 204 | shouldn't assume that users are consenting to being contacted, without 205 | checking the display settings on their profile. 206 | 207 | - Be especially careful when contacting highly-active developers. Not only do 208 | these people receive a massive amount of regular email from their 209 | participation in the free software community, but they are also sent a 210 | higher-than-average number of requests for surveys too. 211 | 212 | {% include comments.html%} 213 | -------------------------------------------------------------------------------- /msr14.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: MSR 2014 Mining Challenge Dataset 4 | tagline: 5 | --- 6 | ### Versions 7 | 8 | After the initial release of the dataset, the users found errors and missing 9 | features. The list of versions along with the fixes is presented in the table 10 | below. Only the latest version is offered for download. 11 | 12 | *You are advised to always run queries against the newest version.* If you have already downloaded an older version and the described fix does not 13 | affect your experiment, you could skip the update. 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
VersionRelease dateFixed error
1.313 Dec 2013Missing project members for some projects is now fixed
1.222 Oct 2013user_id in table commit_comments not set correctly.
1.19 Oct 2013 38 | Table commit_comments was missing data. Some commits were missing from 39 | some projects. 40 |
1.028 Sep 2013
49 | 50 | ### Dataset description 51 | 52 | The MSR 2014 challenge dataset is a (very) trimmed down version of the original 53 | GHTorrent dataset. It includes data from the top-10 starred software projects 54 | for the top programming languages on Github, which gives 90 projects and their 55 | forks. For each project, we retrieved all data including issues, pull requests, 56 | organizations, followers, stars and labels (milestones and events not 57 | included). The dataset was constructed from scratch to ensure the latest 58 | information is in it. 59 | 60 | Similarly to GHTorrent itself, the MSR challenge dataset comes in two flavours: 61 | 62 | * A [MongoDB database dump](http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mongo.tar.gz) containing the results of querying the Github API. See [format here](mongo.html). 63 | * A [MySQL database dump](http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mysql.gz) containing a queryable version of important fields extracted from the raw data. See [schema here](relational.html).
64 | 65 | The included projects are the following: 66 | 67 | 68 | [akka/akka](http://github.com/akka/akka) 69 | [devtools/hadley](http://github.com/hadley/devtools) 70 | [ProjectTemplate/johnmyleswhite](http://github.com/johnmyleswhite/ProjectTemplate) 71 | [stat-cookbook/mavam](http://github.com/mavam/stat-cookbook) 72 | [hiphop-php/facebook](http://github.com/facebook/hiphop-php) 73 | [knitr/yihui](http://github.com/yihui/knitr) 74 | [shiny/rstudio](http://github.com/rstudio/shiny) 75 | [folly/facebook](http://github.com/facebook/folly) 76 | [mongo/mongodb](http://github.com/mongodb/mongo) 77 | [doom3.gpl/TTimo](http://github.com/TTimo/doom3.gpl) 78 | [phantomjs/ariya](http://github.com/ariya/phantomjs) 79 | [TrinityCore/TrinityCore](http://github.com/TrinityCore/TrinityCore) 80 | [MaNGOS/mangos](http://github.com/mangos/MaNGOS) 81 | [bitcoin/bitcoin](http://github.com/bitcoin/bitcoin) 82 | [mosh/keithw](http://github.com/keithw/mosh) 83 | [xbmc/xbmc](http://github.com/xbmc/xbmc) 84 | [http-parser/joyent](http://github.com/joyent/http-parser) 85 | [beanstalkd/kr](http://github.com/kr/beanstalkd) 86 | [redis/antirez](http://github.com/antirez/redis) 87 | [ccv/liuliu](http://github.com/liuliu/ccv) 88 | [memcached/memcached](http://github.com/memcached/memcached) 89 | [openFrameworks/openframeworks](http://github.com/openframeworks/openFrameworks) 90 | [libgit2/libgit2](http://github.com/libgit2/libgit2) 91 | [redcarpet/vmg](http://github.com/vmg/redcarpet) 92 | [libuv/joyent](http://github.com/joyent/libuv) 93 | [SignalR/SignalR](http://github.com/SignalR/SignalR) 94 | [SparkleShare/hbons](http://github.com/hbons/SparkleShare) 95 | [plupload/moxiecode](http://github.com/moxiecode/plupload) 96 | [mono/mono](http://github.com/mono/mono) 97 | [Nancy/NancyFx](http://github.com/NancyFx/Nancy) 98 | [ServiceStack/ServiceStack](http://github.com/ServiceStack/ServiceStack) 99 | [AutoMapper/AutoMapper](http://github.com/AutoMapper/AutoMapper) 100 | 
[RestSharp/restsharp](http://github.com/restsharp/RestSharp) 101 | [ravendb/ravendb](http://github.com/ravendb/ravendb) 102 | [MiniProfiler/SamSaffron](http://github.com/SamSaffron/MiniProfiler) 103 | [storm/nathanmarz](http://github.com/nathanmarz/storm) 104 | [elasticsearch/elasticsearch](http://github.com/elasticsearch/elasticsearch) 105 | [ActionBarSherlock/JakeWharton](http://github.com/JakeWharton/ActionBarSherlock) 106 | [facebook-android-sdk/facebook](http://github.com/facebook/facebook-android-sdk) 107 | [clojure/clojure](http://github.com/clojure/clojure) 108 | [CraftBukkit/Bukkit](http://github.com/Bukkit/CraftBukkit) 109 | [netty/netty](http://github.com/netty/netty) 110 | [android/github](http://github.com/github/android) 111 | [node/joyent](http://github.com/joyent/node) 112 | [jquery/jquery](http://github.com/jquery/jquery) 113 | [html5-boilerplate/h5bp](http://github.com/h5bp/html5-boilerplate) 114 | [impress.js/bartaz](http://github.com/bartaz/impress.js) 115 | [d3/mbostock](http://github.com/mbostock/d3) 116 | [chosen/harvesthq](http://github.com/harvesthq/chosen) 117 | [Font-Awesome/FortAwesome](http://github.com/FortAwesome/Font-Awesome) 118 | [three.js/mrdoob](http://github.com/mrdoob/three.js) 119 | [foundation/zurb](http://github.com/zurb/foundation) 120 | [symfony/symfony](http://github.com/symfony/symfony) 121 | [CodeIgniter/EllisLab](http://github.com/EllisLab/CodeIgniter) 122 | [php-sdk/facebook](http://github.com/facebook/php-sdk) 123 | [zf2/zendframework](http://github.com/zendframework/zf2) 124 | [cakephp/cakephp](http://github.com/cakephp/cakephp) 125 | [ThinkUp/ginatrapani](http://github.com/ginatrapani/ThinkUp) 126 | [phpunit/sebastianbergmann](http://github.com/sebastianbergmann/phpunit) 127 | [Slim/codeguy](http://github.com/codeguy/Slim) 128 | [django/django](http://github.com/django/django) 129 | [tornado/facebook](http://github.com/facebook/tornado) 130 | [httpie/jkbr](http://github.com/jkbr/httpie) 131 | 
[flask/mitsuhiko](http://github.com/mitsuhiko/flask) 132 | [requests/kennethreitz](http://github.com/kennethreitz/requests) 133 | [symfony/xphere-forks](http://github.com/xphere-forks/symfony) 134 | [reddit/reddit](http://github.com/reddit/reddit) 135 | [boto/boto](http://github.com/boto/boto) 136 | [django-debug-toolbar/django-debug-toolbar](http://github.com/django-debug-toolbar/django-debug-toolbar) 137 | [Sick-Beard/midgetspy](http://github.com/midgetspy/Sick-Beard) 138 | [django-cms/divio](http://github.com/divio/django-cms) 139 | [rails/rails](http://github.com/rails/rails) 140 | [homebrew/mxcl](http://github.com/mxcl/homebrew) 141 | [jekyll/mojombo](http://github.com/mojombo/jekyll) 142 | [gitlabhq/gitlabhq](http://github.com/gitlabhq/gitlabhq) 143 | [diaspora/diaspora](http://github.com/diaspora/diaspora) 144 | [devise/plataformatec](http://github.com/plataformatec/devise) 145 | [blueprint-css/joshuaclayton](http://github.com/joshuaclayton/blueprint-css) 146 | [octopress/imathis](http://github.com/imathis/octopress) 147 | [vinc.cc/vinc](http://github.com/vinc/vinc.cc) 148 | [paperclip/thoughtbot](http://github.com/thoughtbot/paperclip) 149 | [compass/chriseppstein](http://github.com/chriseppstein/compass) 150 | [finagle/twitter](http://github.com/twitter/finagle) 151 | [kestrel/robey](http://github.com/robey/kestrel) 152 | [flockdb/twitter](http://github.com/twitter/flockdb) 153 | [gizzard/twitter](http://github.com/twitter/gizzard) 154 | [sbt/sbt](http://github.com/sbt/sbt) 155 | [scala/scala](http://github.com/scala/scala) 156 | [scalatra/scalatra](http://github.com/scalatra/scalatra) 157 | [zipkin/twitter](http://github.com/twitter/zipkin) 158 | 159 | ### Importing and using 160 | 161 | The following instructions assume an OSX or Linux based host. 
162 | 163 | #### MongoDB 164 | 165 | {%highlight bash%} 166 | 167 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mongo.tar.gz 168 | $ tar zxvf msr14-mongo.tar.gz 169 | $ mongorestore 170 | $ mongo msr14 171 | mongo> db.commits.count() 172 | 601080 173 | mongo> db.issues.count() 174 | 126308 175 | {%endhighlight %} 176 | 177 | #### MySQL 178 | 179 | {%highlight bash%} 180 | $ wget http://ghtorrent-downloads.ewi.tudelft.nl/datasets/msr14-mysql.gz 181 | $ mysql -u root -p 182 | mysql> create user 'msr14'@'localhost' identified by 'msr14'; 183 | mysql> create database msr14; 184 | mysql> GRANT ALL PRIVILEGES ON msr14.* to msr14@'localhost'; 185 | mysql> flush privileges; 186 | # Exit MySQL prompt 187 | $ zcat msr14-mysql.gz |mysql -u msr14 -p msr14 188 | $ mysql -u msr14 -p msr14 189 | mysql> select language,count(*) from projects where forked_from is null group by language; 190 | +------------+----------+ 191 | | language | count(*) | 192 | +------------+----------+ 193 | | C | 10 | 194 | | C# | 8 | 195 | | C++ | 8 | 196 | | CSS | 3 | 197 | | Go | 1 | 198 | | Java | 8 | 199 | | JavaScript | 9 | 200 | | PHP | 9 | 201 | | Python | 10 | 202 | | R | 4 | 203 | | Ruby | 10 | 204 | | Scala | 9 | 205 | | TypeScript | 1 | 206 | +------------+----------+ 207 | 13 rows in set (0.01 sec) 208 | {%endhighlight %} 209 | 210 | ### FAQ 211 | 212 | Answers to frequently asked questions 213 | 214 | #### Why a new dataset? 215 | 216 | For practical reasons. The dataset is small enough to be used on a laptop, 217 | yet rich enough to do really interesting research with it. 218 | 219 | #### What are the hardware requirements? 220 | 221 | We have successfully imported and used both dumps on a 2011 MacBook Air with 4GB 222 | of RAM. Your mileage may vary, but relatively new systems with more than 4GB RAM should have no trouble with both databases. If you only need to use the MySQL data dump, the hardware requirements are even lower. 223 | 224 | #### Why two databases? 
Do I need both? 225 | 226 | Not necessarily. The MySQL database can readily cover many aspects of activity 227 | on Github. Perhaps the only reason to use the MongoDB dump is to analyse commit contents, branches affected by pull requests or milestones, which are not included in MySQL. 228 | 229 | #### How can I ask a question about the dataset? 230 | 231 | Your question and the potential answer might be useful for other people as well, 232 | so please use the form below. *Please note that I will not answer 233 | questions sent to my email.* 234 | 235 | {% include comments.html%} 236 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "rubygems" 2 | require 'rake' 3 | require 'yaml' 4 | require 'time' 5 | 6 | SOURCE = "." 7 | CONFIG = { 8 | 'version' => "0.2.13", 9 | 'themes' => File.join(SOURCE, "_includes", "themes"), 10 | 'layouts' => File.join(SOURCE, "_layouts"), 11 | 'posts' => File.join(SOURCE, "_posts"), 12 | 'post_ext' => "md", 13 | 'theme_package_version' => "0.1.0" 14 | } 15 | 16 | # Path configuration helper 17 | module JB 18 | class Path 19 | SOURCE = "." 20 | Paths = { 21 | :layouts => "_layouts", 22 | :themes => "_includes/themes", 23 | :theme_assets => "assets/themes", 24 | :theme_packages => "_theme_packages", 25 | :posts => "_posts" 26 | } 27 | 28 | def self.base 29 | SOURCE 30 | end 31 | 32 | # build a path relative to configured path settings. 33 | def self.build(path, opts = {}) 34 | opts[:root] ||= SOURCE 35 | path = "#{opts[:root]}/#{Paths[path.to_sym]}/#{opts[:node]}".split("/") 36 | path.compact! 
37 | File.__send__ :join, path 38 | end 39 | 40 | end #Path 41 | end #JB 42 | 43 | # Usage: rake post title="A Title" [date="2012-02-09"] 44 | desc "Begin a new post in #{CONFIG['posts']}" 45 | task :post do 46 | abort("rake aborted: '#{CONFIG['posts']}' directory not found.") unless FileTest.directory?(CONFIG['posts']) 47 | title = ENV["title"] || "new-post" 48 | slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') 49 | begin 50 | date = (ENV['date'] ? Time.parse(ENV['date']) : Time.now).strftime('%Y-%m-%d') 51 | rescue Exception => e 52 | puts "Error - date format must be YYYY-MM-DD, please check you typed it correctly!" 53 | exit -1 54 | end 55 | filename = File.join(CONFIG['posts'], "#{date}-#{slug}.#{CONFIG['post_ext']}") 56 | if File.exist?(filename) 57 | abort("rake aborted!") if ask("#{filename} already exists. Do you want to overwrite?", ['y', 'n']) == 'n' 58 | end 59 | 60 | puts "Creating new post: #{filename}" 61 | open(filename, 'w') do |post| 62 | post.puts "---" 63 | post.puts "layout: post" 64 | post.puts "title: \"#{title.gsub(/-/,' ')}\"" 65 | post.puts 'description: ""' 66 | post.puts "category: " 67 | post.puts "tags: []" 68 | post.puts "---" 69 | post.puts "{% include JB/setup %}" 70 | end 71 | end # task :post 72 | 73 | # Usage: rake page name="about.html" 74 | # You can also specify a sub-directory path. 75 | # If you don't specify a file extention we create an index.html at the path specified 76 | desc "Create a new page." 77 | task :page do 78 | name = ENV["name"] || "new-page.md" 79 | filename = File.join(SOURCE, "#{name}") 80 | filename = File.join(filename, "index.html") if File.extname(filename) == "" 81 | title = File.basename(filename, File.extname(filename)).gsub(/[\W\_]/, " ").gsub(/\b\w/){$&.upcase} 82 | if File.exist?(filename) 83 | abort("rake aborted!") if ask("#{filename} already exists. 
Do you want to overwrite?", ['y', 'n']) == 'n' 84 | end 85 | 86 | mkdir_p File.dirname(filename) 87 | puts "Creating new page: #{filename}" 88 | open(filename, 'w') do |post| 89 | post.puts "---" 90 | post.puts "layout: page" 91 | post.puts "title: \"#{title}\"" 92 | post.puts 'description: ""' 93 | post.puts "---" 94 | post.puts "{% include JB/setup %}" 95 | end 96 | end # task :page 97 | 98 | desc "Launch preview environment" 99 | task :preview do 100 | system "jekyll --auto --server" 101 | end # task :preview 102 | 103 | # Public: Alias - Maintains backwards compatability for theme switching. 104 | task :switch_theme => "theme:switch" 105 | 106 | namespace :theme do 107 | 108 | # Public: Switch from one theme to another for your blog. 109 | # 110 | # name - String, Required. name of the theme you want to switch to. 111 | # The the theme must be installed into your JB framework. 112 | # 113 | # Examples 114 | # 115 | # rake theme:switch name="the-program" 116 | # 117 | # Returns Success/failure messages. 118 | desc "Switch between Jekyll-bootstrap themes." 119 | task :switch do 120 | theme_name = ENV["name"].to_s 121 | theme_path = File.join(CONFIG['themes'], theme_name) 122 | settings_file = File.join(theme_path, "settings.yml") 123 | non_layout_files = ["settings.yml"] 124 | 125 | abort("rake aborted: name cannot be blank") if theme_name.empty? 
126 | abort("rake aborted: '#{theme_path}' directory not found.") unless FileTest.directory?(theme_path) 127 | abort("rake aborted: '#{CONFIG['layouts']}' directory not found.") unless FileTest.directory?(CONFIG['layouts']) 128 | 129 | Dir.glob("#{theme_path}/*") do |filename| 130 | next if non_layout_files.include?(File.basename(filename).downcase) 131 | puts "Generating '#{theme_name}' layout: #{File.basename(filename)}" 132 | 133 | open(File.join(CONFIG['layouts'], File.basename(filename)), 'w') do |page| 134 | if File.basename(filename, ".html").downcase == "default" 135 | page.puts "---" 136 | page.puts File.read(settings_file) if File.exist?(settings_file) 137 | page.puts "---" 138 | else 139 | page.puts "---" 140 | page.puts "layout: default" 141 | page.puts "---" 142 | end 143 | page.puts "{% include JB/setup %}" 144 | page.puts "{% include themes/#{theme_name}/#{File.basename(filename)} %}" 145 | end 146 | end 147 | 148 | puts "=> Theme successfully switched!" 149 | puts "=> Reload your web-page to check it out =)" 150 | end # task :switch 151 | 152 | # Public: Install a theme using the theme packager. 153 | # Version 0.1.0 simple 1:1 file matching. 154 | # 155 | # git - String, Optional path to the git repository of the theme to be installed. 156 | # name - String, Optional name of the theme you want to install. 157 | # Passing name requires that the theme package already exist. 158 | # 159 | # Examples 160 | # 161 | # rake theme:install git="https://github.com/jekyllbootstrap/theme-twitter.git" 162 | # rake theme:install name="cool-theme" 163 | # 164 | # Returns Success/failure messages. 165 | desc "Install theme" 166 | task :install do 167 | if ENV["git"] 168 | manifest = theme_from_git_url(ENV["git"]) 169 | name = manifest["name"] 170 | else 171 | name = ENV["name"].to_s.downcase 172 | end 173 | 174 | packaged_theme_path = JB::Path.build(:theme_packages, :node => name) 175 | 176 | abort("rake aborted! 
177 | => ERROR: 'name' cannot be blank") if name.empty? 178 | abort("rake aborted! 179 | => ERROR: '#{packaged_theme_path}' directory not found. 180 | => Installable themes can be added via git. You can find some here: http://github.com/jekyllbootstrap 181 | => To download+install run: `rake theme:install git='[PUBLIC-CLONE-URL]'` 182 | => example : rake theme:install git='git@github.com:jekyllbootstrap/theme-the-program.git' 183 | ") unless FileTest.directory?(packaged_theme_path) 184 | 185 | manifest = verify_manifest(packaged_theme_path) 186 | 187 | # Get relative paths to packaged theme files 188 | # Exclude directories as they'll be recursively created. Exclude meta-data files. 189 | packaged_theme_files = [] 190 | FileUtils.cd(packaged_theme_path) { 191 | Dir.glob("**/*.*") { |f| 192 | next if ( FileTest.directory?(f) || f =~ /^(manifest|readme|packager)/i ) 193 | packaged_theme_files << f 194 | } 195 | } 196 | 197 | # Mirror each file into the framework making sure to prompt if already exists. 198 | packaged_theme_files.each do |filename| 199 | file_install_path = File.join(JB::Path.base, filename) 200 | if File.exist? file_install_path 201 | next if ask("#{file_install_path} already exists. Do you want to overwrite?", ['y', 'n']) == 'n' 202 | else 203 | mkdir_p File.dirname(file_install_path) 204 | cp_r File.join(packaged_theme_path, filename), file_install_path 205 | end 206 | end 207 | 208 | puts "=> #{name} theme has been installed!" 209 | puts "=> ---" 210 | if ask("=> Want to switch themes now?", ['y', 'n']) == 'y' 211 | system("rake switch_theme name='#{name}'") 212 | end 213 | end 214 | 215 | # Public: Package a theme using the theme packager. 216 | # The theme must be structured using valid JB API. 217 | # In other words packaging is essentially the reverse of installing. 218 | # 219 | # name - String, Required name of the theme you want to package. 
220 | # 221 | # Examples 222 | # 223 | # rake theme:package name="twitter" 224 | # 225 | # Returns Success/failure messages. 226 | desc "Package theme" 227 | task :package do 228 | name = ENV["name"].to_s.downcase 229 | theme_path = JB::Path.build(:themes, :node => name) 230 | asset_path = JB::Path.build(:theme_assets, :node => name) 231 | 232 | abort("rake aborted: name cannot be blank") if name.empty? 233 | abort("rake aborted: '#{theme_path}' directory not found.") unless FileTest.directory?(theme_path) 234 | abort("rake aborted: '#{asset_path}' directory not found.") unless FileTest.directory?(asset_path) 235 | 236 | ## Mirror theme's template directory (_includes) 237 | packaged_theme_path = JB::Path.build(:themes, :root => JB::Path.build(:theme_packages, :node => name)) 238 | mkdir_p packaged_theme_path 239 | cp_r theme_path, packaged_theme_path 240 | 241 | ## Mirror theme's asset directory 242 | packaged_theme_assets_path = JB::Path.build(:theme_assets, :root => JB::Path.build(:theme_packages, :node => name)) 243 | mkdir_p packaged_theme_assets_path 244 | cp_r asset_path, packaged_theme_assets_path 245 | 246 | ## Log packager version 247 | packager = {"packager" => {"version" => CONFIG["theme_package_version"].to_s } } 248 | open(JB::Path.build(:theme_packages, :node => "#{name}/packager.yml"), "w") do |page| 249 | page.puts packager.to_yaml 250 | end 251 | 252 | puts "=> '#{name}' theme is packaged and available at: #{JB::Path.build(:theme_packages, :node => name)}" 253 | end 254 | 255 | end # end namespace :theme 256 | 257 | # Internal: Download and process a theme from a git url. 258 | # Notice we don't know the name of the theme until we look it up in the manifest. 259 | # So we'll have to change the folder name once we get the name. 260 | # 261 | # url - String, Required url to git repository. 
262 | # 263 | # Returns theme manifest hash 264 | def theme_from_git_url(url) 265 | tmp_path = JB::Path.build(:theme_packages, :node => "_tmp") 266 | abort("rake aborted: system call to git clone failed") if !system("git clone #{url} #{tmp_path}") 267 | manifest = verify_manifest(tmp_path) 268 | new_path = JB::Path.build(:theme_packages, :node => manifest["name"]) 269 | if File.exist?(new_path) && ask("=> #{new_path} theme package already exists. Override?", ['y', 'n']) == 'n' 270 | remove_dir(tmp_path) 271 | abort("rake aborted: '#{manifest["name"]}' already exists as theme package.") 272 | end 273 | 274 | remove_dir(new_path) if File.exist?(new_path) 275 | mv(tmp_path, new_path) 276 | manifest 277 | end 278 | 279 | # Internal: Process theme package manifest file. 280 | # 281 | # theme_path - String, Required. File path to theme package. 282 | # 283 | # Returns theme manifest hash 284 | def verify_manifest(theme_path) 285 | manifest_path = File.join(theme_path, "manifest.yml") 286 | manifest_file = File.open( manifest_path ) 287 | abort("rake aborted: repo must contain valid manifest.yml") unless File.exist? 
manifest_file 288 | manifest = YAML.load( manifest_file ) 289 | manifest_file.close 290 | manifest 291 | end 292 | 293 | def ask(message, valid_options) 294 | if valid_options 295 | answer = get_stdin("#{message} #{valid_options.to_s.gsub(/"/, '').gsub(/, /,'/')} ") while !valid_options.include?(answer) 296 | else 297 | answer = get_stdin(message) 298 | end 299 | answer 300 | end 301 | 302 | def get_stdin(message) 303 | print message 304 | STDIN.gets.chomp 305 | end 306 | 307 | #Load custom rake scripts 308 | Dir['_rake/*.rake'].each { |r| load r } 309 | -------------------------------------------------------------------------------- /relational.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: The relational DB schema 4 | tagline: 5 | --- 6 | 7 | 8 | 9 | [Download](files/schema.png) [Download PDF](files/schema.pdf) 10 | 11 | ## Entities and their relationships 12 | 13 | #### users 14 | Github users. 15 | 16 | * A user has a unique user name or email. May contain artificially generated user names, see [commits](relational.html#commits) below. 17 | * There are two `type`s of users, `USER`s and `ORG`anizations. 18 | * Users can be *real* or *fake*. Real users can own projects and perform 19 | actions such as open issues, create pull requests and push commits. Fake 20 | users only appear as authors or committers of commits. Fake users are marked 21 | by the `fake` field. 22 | * Organizations are meta users that point to a collection of users. The members of organizations can be found in `organization_members`. Organization users can only own projects and they do not perform any other actions. 23 | * Users may be marked as `deleted`. This means that the user was once active on 24 | GitHub but GHTorrent can no longer get his/her details. 25 | 26 | *Update Nov 2015:* User entries are now geocoded. 
The location field remains 27 | intact, while 5 fields have been added with information about the 28 | geographic location of the user. The Open Street Maps API has been used 29 | to do the mapping of the location field to the user's geocode. As a result, 30 | the state and city fields are stored in the local language of the geocoded 31 | area. Also, many users do not report their location or their location 32 | is filled in with random information; in those cases, no geocoding information 33 | is available. 34 | 35 | {% highlight sql %} 36 | -- See where most commits come from today 37 | select u.country_code, count(*) 38 | from commits c, users u 39 | where c.author_id = u.id 40 | and date(c.created_at) = date(now()) 41 | group by u.country_code 42 | {% endhighlight %} 43 | 44 | 45 | *Update Mar 2016:* User personal data (emails and real names) are excluded 46 | from the downloaded dump, while configuration disallows access to those 47 | fields for the online access services for the MySQL database. 48 | 49 | 50 | #### organization\_members 51 | Users that are members of an organization. 52 | 53 | * The `created_at` field is only filled in accurately for memberships for which 54 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the 55 | latest date that the corresponding user or organization has been created. 56 | 57 | *Update Nov 2015:* Organizations can now select whether membership information 58 | is revealed to external parties. This means that the information in this 59 | table may no longer be accurate. 60 | 61 | #### projects 62 | Information about repositories. A repository is always owned by a user. 63 | 64 | * The `forked_from` field is empty unless the 65 | project is a fork in which case it contains the `id` of the project the project 66 | is forked from. 67 | 68 | * The `deleted` field means that the project has been deleted from Github. 
69 | 70 | * The `updated_at` field indicates when the last full update was done for 71 | this project. 72 | 73 | #### project\_members 74 | Users that have commit access to the repository. 75 | 76 | The `created_at` field is only filled in accurately for memberships for which 77 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the 78 | latest date that the corresponding user or project has been created. 79 | 80 | *Update Nov 2014:* GitHub has disabled the API end point used to retrieve 81 | members of an organization. GHTorrent uses the `MemberEvent` event to 82 | approximate memberships, but this is not always accurate. You are thus advised 83 | to use heuristics (e.g. the committers + mergers of pull requests) to calculate membership, 84 | such as the following: 85 | 86 | {% highlight sql %} 87 | -- Get active core team participants for the last 3 months 88 | select distinct(u.login) as login 89 | from commits c, users u, project_commits pc, users u1, projects p 90 | where u.id = c.committer_id 91 | and u.fake is false 92 | and pc.commit_id = c.id 93 | and pc.project_id = p.id 94 | and p.owner_id = u1.id 95 | and p.name = 'rails' 96 | and u1.login = 'rails' 97 | and c.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH) 98 | union 99 | select distinct(u.login) as login 100 | from pull_requests pr, projects p, users u, users u1, pull_request_history prh 101 | where u.id = prh.actor_id 102 | and prh.action = 'merged' 103 | and u1.id = p.owner_id 104 | and prh.pull_request_id = pr.id 105 | and pr.base_repo_id = p.id 106 | and prh.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH) 107 | and p.name = 'rails' 108 | and u1.login = 'rails' 109 | {% endhighlight %} 110 | 111 | 112 | #### project\_languages 113 | Languages that are used in the repository along with **byte counts** for 114 | all files in those languages. 115 | 116 | Multiple entries can exist per project. 
The `created_at` field is filled in with 117 | the latest timestamp the query for a specific `project_id` was done. 118 | 119 | The table is filled in when the project has been first inserted or when 120 | an update round for all projects is made. 121 | 122 | {% highlight sql %} 123 | -- Get the latest byte count for languages in Ruby on Rails 124 | select * 125 | from project_languages 126 | where project_id = 1334 127 | order by created_at desc 128 | 129 | {% endhighlight %} 130 | 131 | #### commits 132 | Unique commits. 133 | 134 | * Each commit is identified globally through its `sha` field. If the author or 135 | the committer has not configured his [Github email address](https://help.github.com/articles/setting-your-email-in-git), no resolution to 136 | a `user` entry is possible. In that case, GHTorrent generates artificial users using the provided email in the Git commit author or committer fields. If the user 137 | then configures his Github account, GHTorrent will update the artificial user 138 | accordingly. 139 | 140 | * The `project_id` field contains a link to the project that this commit has 141 | been first associated with. This might not be the project this commit was 142 | initially pushed to, e.g. in case the fork is processed before the parent. 143 | See [project\_commits](relational.html#project_commits). 144 | 145 | * The `project_id` field may be null when the repository has been 146 | deleted at the time the commit is processed. This situation might happen when 147 | retrospectively processing pull requests for a repository and the 148 | repository which the pull request originates from has been deleted. 149 | 150 | #### commit\_parents 151 | The parent commit(s) for each commit, as specified by Git. 152 | 153 | #### project\_commits 154 | The commits belonging to the history of a project. 155 | 156 | More than one project can share the same commits if one is a fork of the other. 
157 | 158 | #### commit\_comments 159 | Code review comments on commits. 160 | 161 | These are comments on individual commits. If a commit is associated with a pull 162 | request, then its comments are in the 163 | [pull\_request\_comments](relational.html#pull_request_comments) table. 164 | 165 | #### followers 166 | A follower to a user. 167 | 168 | The `created_at` field is only filled in accurately for followships for which 169 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the 170 | latest date that the corresponding user or follower has been created. 171 | 172 | #### watchers 173 | Users that have starred (was [watched](https://github.com/blog/1204-notifications-stars)) a project 174 | 175 | The `created_at` field is only filled in accurately for starrings for which 176 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the 177 | latest date that the corresponding user or project has been created. 178 | 179 | #### pull\_requests 180 | A pull request initiated from `head_repo_id`:`head_commit_id` to `base_repo_id`:`base_commit_id` 181 | 182 | * Pull requests can be in various states. The states and their transitions 183 | are recorded in the [pull\_request\_history](relational.html#pull_request_history) table. 184 | * The `pullreq_id` field is Github's pull request unique identifier 185 | * The `intra_branch` field signifies that the head and base repositories are the 186 | same 187 | * If the head repository is NULL, this means that the corresponding project had been deleted when GHTorrent processed the pull request. 188 | 189 | #### pull\_request\_history 190 | An event in the pull request lifetime 191 | 192 | The `action` field can take the following values 193 | 194 | * `opened`: When the pull request has been opened 195 | * `closed`: When the pull request has been closed 196 | * `merged`: When Github detected that the pull request has been merged. No merges 197 | outside Github (i.e. 
Git based) are reported 198 | * `reopened`: When a pull request is opened after being closed 199 | * `synchronize`: When new commits are added/removed to the head repository 200 | 201 | #### pull\_request\_commits 202 | A commit associated with a pull request 203 | 204 | The list is additive. This means if a rebase with commit squashing takes place after the commits of a pull request have been processed, the old commits will not be deleted. 205 | 206 | #### pull\_request\_comments 207 | A code review comment on a commit associated with a pull request 208 | 209 | The list is additive. If commits are squashed on the head repo, the comments 210 | remain intact. 211 | 212 | #### issues 213 | An issue associated with a repository 214 | 215 | * The `assignee` field is filled in with the user to which the issue was 216 | assigned at the time the issue was processed. 217 | * Issues have history recorded in the [issue\_events](relational.html#issue_events) table. 218 | * For every pull request, GHTorrent creates a corresponding issue. The 219 | `pull_request_id` field points to the associated pull request 220 | * The `issue_id` field is the unique identifier given to the issue by Github. 221 | 222 | #### issue\_events 223 | An event on an issue 224 | 225 | * The `action` field can have the following values: 226 | * `subscribed`: When a user subscribes to receive notifications about the issue. 227 | * `mentioned`: When a user is mentioned by another user (@user notation) 228 | * `closed`: When the issue has been closed 229 | * `referenced`: The issue was referenced in a commit (using the 230 | [fixes: conventions](https://github.com/blog/831-issues-2-0-the-next-generation)) 231 | * `assigned`: When the issue has been assigned to an actor. 232 | * `reopened`: When a closed issue is reopened 233 | * `unsubscribed`: When a user unsubscribed from issue. 234 | * `merged`: When the pull request pointed by the issue has been merged. 235 | * `head_ref_cleaned`: (Not documented) ? 
236 | * `head_ref_deleted`: (Not documented) When the branch of the head repository has been deleted 237 | * `head_ref_restored`: (Not documented) When the head repository of a pull 238 | request has been restored (using the restore branch functionality). 239 | 240 | * The `action_specific` field gets filled in with the `commit_id` of the last 241 | commit when a pull request has been closed, merged or referenced. 242 | 243 | #### issue\_comments 244 | An entry to the issue discussion. This table is always filled in with pull 245 | request (or issue) discussion comments, irrespective of whether the repository 246 | has issues enabled or not. 247 | 248 | #### repo\_labels 249 | A label to be assigned to an issue affecting this repository. 250 | 251 | #### issue\_labels 252 | A label that has been assigned to an issue 253 | 254 | ## Example queries 255 | 256 | #### List commits for a repository 257 | 258 | {%highlight sql%} 259 | select c.* 260 | from commits c, project_commits pc, projects p, users u 261 | where u.login = 'rails' 262 | and p.name = 'rails' and p.owner_id = u.id 263 | and p.id = pc.project_id 264 | and c.id = pc.commit_id 265 | order by c.created_at desc 266 | {%endhighlight%} 267 | 268 | #### Get all actions for a pull request 269 | 270 | {%highlight sql%} 271 | select user, action, created_at from 272 | ( 273 | select prh.action as action, prh.created_at as created_at, u.login as user 274 | from pull_request_history prh, users u 275 | where prh.pull_request_id = ? 276 | and prh.actor_id = u.id 277 | union 278 | select ie.action as action, ie.created_at as created_at, u.login as user 279 | from issues i, issue_events ie, users u 280 | where ie.issue_id = i.id 281 | and i.pull_request_id = ? 282 | and ie.actor_id = u.id 283 | union 284 | select 'discussed' as action, ic.created_at as created_at, u.login as user 285 | from issues i, issue_comments ic, users u 286 | where ic.issue_id = i.id 287 | and u.id = ic.user_id 288 | and i.pull_request_id = ? 
289 | union 290 | select 'reviewed' as action, prc.created_at as created_at, u.login as user 291 | from pull_request_comments prc, users u 292 | where prc.user_id = u.id 293 | and prc.pull_request_id = ? 294 | ) as actions 295 | order by created_at; 296 | {%endhighlight%} 297 | 298 | #### Get participants in an issue or pull request 299 | 300 | {%highlight sql%} 301 | select distinct(user_id) from 302 | ( 303 | select user_id 304 | from pull_request_comments 305 | where pull_request_id = ? 306 | union 307 | select user_id 308 | from issue_comments ic, issues i 309 | where i.id = ic.issue_id and i.pull_request_id = ? 310 | ) as participants 311 | {%endhighlight%} 312 | 313 | #### Get all users in NL that committed to a Java project today 314 | 315 | {%highlight sql%} 316 | select u.login 317 | from users u, commits c, projects p, project_commits pc 318 | where date(c.created_at) = date(now()) 319 | and pc.commit_id = c.id 320 | and c.author_id = u.id 321 | and u.country_code = 'nl' 322 | and 'java' = (select pl.language 323 | from project_langauges pl 324 | where pl.project_id = p.id 325 | order by pl.created_at desc, pl.bytes desc 326 | limit 1) 327 | {%endhighlight%} 328 | 329 | -------------------------------------------------------------------------------- /pullreq-perf/report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Performance report for `r owner`/`r repo` 4 | --- 5 | 6 | ```{r preample, include=FALSE} 7 | 8 | # 9 | # (c) 2012 -- 2014 Georgios Gousios 10 | # 11 | # BSD licensed, see LICENSE in top level dir 12 | # 13 | 14 | library(ggplot2) 15 | library(reshape) 16 | library(plyr) 17 | library(sqldf) 18 | 19 | unwrap <- function(str) { 20 | strwrap(str, width=10000, simplify=TRUE) 21 | } 22 | 23 | # Get the project id 24 | q <- " 25 | select p.id 26 | from projects p, users u 27 | where u.id = p.owner_id 28 | and u.login='%s' 29 | and p.name = '%s' 30 | and p.forked_from is null 
31 | " 32 | 33 | res <- dbSendQuery(db, sprintf(unwrap(q), owner, repo)) 34 | df <- fetch(res, n = -1) 35 | pid <- df$id[[1]] 36 | ``` 37 | 38 | ### Pull request backlog 39 | ```{r plot6, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE} 40 | 41 | q <- " 42 | select pr.pullreq_id, min(prh1.created_at) as opened, ifnull(max(prh2.created_at), now()) as closed 43 | from pull_request_history prh1, 44 | pull_requests pr left outer join pull_request_history prh2 45 | on pr.id = prh2.pull_request_id 46 | and prh2.action = 'closed' 47 | where pr.id = prh1.pull_request_id 48 | and prh1.action = 'opened' 49 | and pr.base_repo_id = %d 50 | group by pr.id 51 | order by pr.pullreq_id desc 52 | " 53 | 54 | res <- dbSendQuery(db, sprintf(unwrap(q), pid)) 55 | 56 | pullreq.open.close <- fetch(res, n = -1) 57 | pullreq.open.close$opened <- as.POSIXct(pullreq.open.close$opened) 58 | pullreq.open.close$closed <- as.POSIXct(pullreq.open.close$closed) 59 | 60 | pullreq.open.close$mopen <- strftime(pullreq.open.close$opened, format="%Y-%m") 61 | pullreq.open.close$mclose <- strftime(pullreq.open.close$closed, format="%Y-%m") 62 | 63 | backlog.stats <- aggregate(pullreq_id ~ mopen, pullreq.open.close, length) 64 | backlog.stats <- rename(backlog.stats, c('mopen' = 'month', 'pullreq_id' = 'New pullreqs')) 65 | 66 | a <- aggregate(pullreq_id ~ mclose, subset(pullreq.open.close, mopen == mclose), length) 67 | backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mclose', sort = FALSE, all = TRUE) 68 | backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0 69 | backlog.stats <- rename(backlog.stats, c('pullreq_id' = 'New and closed')) 70 | 71 | a <- aggregate(pullreq_id ~ mopen, subset(pullreq.open.close, mopen != mclose), length) 72 | backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mopen', sort = FALSE, all = TRUE) 73 | backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0 74 | backlog.stats <- 
rename(backlog.stats, c('pullreq_id' = 'New and left open')) 75 | 76 | a <- aggregate(pullreq_id ~ mclose, subset(pullreq.open.close, mopen != mclose), length) 77 | backlog.stats <- merge(backlog.stats, a, by.x = 'month', by.y = 'mclose', sort = FALSE, all = TRUE) 78 | backlog.stats[c("pullreq_id")][is.na(backlog.stats[c("pullreq_id")])] <- 0 79 | backlog.stats <- rename(backlog.stats, c('pullreq_id' = 'Old and closed')) 80 | 81 | backlog.stats$month <- sprintf("%s-01", backlog.stats$month) 82 | backlog.stats$month <- strptime(backlog.stats$month, "%Y-%m-%d") 83 | backlog.stats$month <- as.POSIXct(backlog.stats$month) 84 | backlog.stats <- backlog.stats[!names(backlog.stats) %in% c("New pullreqs")] 85 | backlog.stats <- backlog.stats[order(backlog.stats[,1]),] 86 | backlog.stats <- backlog.stats[-nrow(backlog.stats),] # drop the most recent month (presumably incomplete: open pull requests carry now() as their close time) 87 | 88 | backlog.stats <- melt(backlog.stats, id=c("month")) 89 | 90 | ggplot(backlog.stats) + 91 | aes(x = month, y = value, fill = variable) + 92 | scale_fill_discrete(name = "Per month") + 93 | geom_bar(stat = "identity") + 94 | scale_x_datetime("Date") 95 | 96 | 97 | ``` 98 | 99 | The pull request backlog presents the number of pull requests processed 100 | per month. 101 | Even though a month is a relatively coarse-grained period for pull requests 102 | (where review and acceptance/rejection 103 | [happen very fast](http://www.gousios.gr/bibliography/GPD14.html)), the 104 | backlog view can be helpful to get an idea of the overall activity within the 105 | project. 
106 | 107 | ### Slow Pull Request lifelines 108 | ```{r plot5, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE} 109 | lifetime.secs <- as.numeric(difftime(pullreq.open.close$closed, pullreq.open.close$opened, units = "secs")) # pin the unit: the text below converts the cutoff from seconds to days 110 | perc.09 <- as.numeric(quantile(lifetime.secs, 0.9)) 111 | num.slow10 <- nrow(subset(pullreq.open.close, lifetime.secs > perc.09)) 112 | num.fast90 <- nrow(subset(pullreq.open.close, lifetime.secs <= perc.09)) 113 | slow.10 <- subset(pullreq.open.close, lifetime.secs > perc.09) 114 | 115 | ggplot(slow.10) + 116 | geom_point(aes(y = pullreq_id, x = closed), colour = "red") + 117 | geom_point(aes(y = pullreq_id, x = opened), colour = "green") + 118 | geom_segment(aes(y=pullreq_id, yend = pullreq_id, x = opened, xend = closed), alpha = 0.4) + 119 | scale_y_discrete("Pull Request Number", breaks = NULL) + 120 | scale_x_datetime("Time (open/close)") 121 | ``` 122 | In this plot, we can see the lifelines of the slowest 10% of pull requests. 123 | For this project, the cutoff is `r perc.09 / 3600 /24 ` days. `r num.slow10` 124 | pull requests were processed slower than that, while `r num.fast90` were 125 | faster. The line represents the time between opening and closing the pull request. 126 | Pull requests whose end time aligns at the right edge of the plot are still open 127 | at the time of building this report. Generally, it is considered good practice 128 | to avoid having pull requests open for long. 
129 | 130 | 131 | ### Source of commits 132 | ```{r plot1, echo=FALSE, fig.align='center', warning=FALSE} 133 | 134 | q <- " 135 | select a.month, a.total_commits - b.commits_from_pull_reqs as direct, b.commits_from_pull_reqs as pullreq 136 | from ( 137 | select last_day(c.created_at) as month, p.id as prid, count(c.id) as total_commits 138 | from commits c, projects p, project_commits pc 139 | where p.id=%d 140 | and p.id = pc.project_id 141 | and c.id = pc.commit_id 142 | group by last_day(c.created_at), p.id 143 | ) as a, ( 144 | select last_day(c.created_at) as month, p.id as prid, count(prc.commit_id) as commits_from_pull_reqs 145 | from projects p, pull_requests pr, pull_request_commits prc, commits c, project_commits pc 146 | where p.id = %d 147 | and exists( 148 | select prh.action 149 | from pull_request_history prh 150 | where prh.pull_request_id = pr.id 151 | and last_day(prh.created_at) between last_day(c.created_at) and 152 | date_add(last_day(c.created_at), INTERVAL 1 MONTH) 153 | and prh.action='merged') 154 | and p.id = pr.base_repo_id and prc.commit_id = c.id 155 | and pc.project_id = p.id 156 | and pc.commit_id = c.id 157 | and pr.id = prc.pull_request_id 158 | group by last_day(c.created_at), p.id) as b 159 | where a.prid = b.prid and a.month = b.month 160 | order by a.month desc" 161 | 162 | res <- dbSendQuery(db, sprintf(unwrap(q), pid, pid)) 163 | df <- fetch(res, n = -1) 164 | df$month <- as.POSIXct(df$month) 165 | # reshape to long form: one row per month and commit source 166 | df <- melt(df, id=c('month')) 167 | df <- rename(df, c("variable"="commit_source")) 168 | 169 | ggplot(df) + 170 | aes(x = month, y = value, fill = commit_source) + 171 | scale_x_datetime() + 172 | geom_bar(stat="identity") + 173 | xlab("Date") + 174 | ylab("Commits") + 175 | scale_fill_discrete(name = "source") 176 | ``` 177 | 178 | This figure presents the source of commits in your project. 
The more commits 179 | come from pull requests, the more open the project process is to accepting 180 | contributions. However, pull requests may be used internally (across project 181 | branches) so this might not entirely reflect the actual situation. 182 | 183 | ### Commits from the project community as percentage of total 184 | ```{r plot2, fig.keep='last', echo=FALSE, fig.align='center', warning=FALSE} 185 | q <- " 186 | select a.mon as date, a.intern as intern, b.extern as extern 187 | from ( 188 | select last_day(c.created_at) as mon, count(*) as intern 189 | from commits c, project_commits pc, project_members pm 190 | where c.id = pc.commit_id 191 | and pm.repo_id = pc.project_id 192 | and c.author_id = pm.user_id 193 | and pc.project_id = %d 194 | group by mon order by mon) as a, 195 | (select last_day(c.created_at) as mon, count(*) as extern 196 | from commits c, project_commits pc 197 | where c.id = pc.commit_id 198 | and not exists ( 199 | select * 200 | from project_members pm 201 | where c.author_id = pm.user_id 202 | and pm.repo_id = pc.project_id) 203 | and pc.project_id = %d 204 | group by mon 205 | order by mon) as b 206 | where a.mon = b.mon 207 | and a.mon > from_unixtime(1312156800)" 208 | 209 | res <- dbSendQuery(db, sprintf(unwrap(q), pid, pid)) 210 | df <- fetch(res, n = -1) 211 | df$date <- as.POSIXct(df$date) 212 | df$ratio <- (df$extern / (df$intern + df$extern)) * 100 # share of commits by non-members, in percent 213 | 214 | ggplot(df) + 215 | aes(x = date, y = ratio) + 216 | scale_x_datetime() + 217 | geom_line(size = 2) + 218 | stat_smooth(method = "loess", formula = y ~ x^2, size = 2, alpha = 0) + 219 | xlab("Date") + ylab("Percentage of commits from community") 220 | 221 | ``` 222 | 223 | Percentage of total commits (and trendline) coming from the community. The more 224 | commits coming from the community, the more this project is a community effort. 
225 | 226 | ### Comments and commenters from the community 227 | ```{r plot3, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE} 228 | q <- " 229 | select last_day(a.mon) as mon, ( 230 | select count(pm.user_id) 231 | from project_members pm 232 | where pm.user_id = a.user_id and pm.repo_id = a.p_id) as is_member, 233 | count(distinct user_id) as num_users, 234 | sum(a.cnt) as num_comments 235 | from ( 236 | select last_day(ic.created_at) as mon, pr.base_repo_id as p_id, ic.user_id as user_id, count(ic.comment_id) as cnt 237 | from projects p 238 | join pull_requests pr on p.id = pr.base_repo_id 239 | left outer join issues i on pr.pullreq_id = i.issue_id 240 | left outer join issue_comments ic on i.id = ic.issue_id 241 | where p.forked_from is null 242 | and p.id = %d 243 | and pr.base_repo_id = i.repo_id 244 | group by mon, pr.base_repo_id, ic.user_id) as a, 245 | projects p 246 | where p.id = a.p_id 247 | group by mon, is_member 248 | " 249 | res <- dbSendQuery(db, sprintf(unwrap(q), pid)) 250 | df <- fetch(res, n = -1) 251 | df <- subset(df, !is.na(mon)) 252 | df$is_member <- factor(df$is_member) 253 | df$mon <- as.POSIXct(df$mon) 254 | 255 | q <- " 256 | select d.mon, ( 257 | select sum(df1.num_comments) 258 | from df df1 259 | where df1.mon = d.mon 260 | and df1.is_member = 0) *100/sum(d.num_comments) as comments, 261 | (select sum(df1.num_users) 262 | from df df1 263 | where df1.mon = d.mon 264 | and df1.is_member = 0) * 100/sum(d.num_users) as commenters 265 | from df d 266 | group by d.mon 267 | " 268 | 269 | df <- sqldf(q, drv="SQLite") 270 | df <- melt(df, 'mon', na.rm = TRUE) 271 | df$variable <- as.factor(df$variable) 272 | df$value <- as.numeric(as.character(df$value)) 273 | 274 | ggplot(df, aes(x = mon, y = value, fill = variable)) + 275 | scale_x_datetime() + 276 | geom_bar(position = 'dodge', stat = "identity") + 277 | xlab("Date") + ylab("% from community") + 278 | facet_grid(. 
~ variable) + 279 | theme(legend.position="none") + 280 | scale_y_continuous(limits = c(0, 100)) 281 | 282 | ``` 283 | Percentage of comments (left) and people that commented (right) coming from 284 | outside the project's core development team. The more comments coming from the 285 | community, the more welcoming the project is to outsiders. 286 | 287 | ### Project forks: Total and contributing 288 | ```{r plot4, message=FALSE, fig.align='center', echo=FALSE, fig.width=9, warning=FALSE} 289 | q <- " 290 | select last_day(p.created_at) as month, count(*) as created 291 | from projects p 292 | where p.forked_from = ( 293 | select p.id 294 | from projects p 295 | where p.id = %d) 296 | group by month" 297 | 298 | res <- dbSendQuery(db, sprintf(unwrap(q), pid)) 299 | forks <- fetch(res, n = -1) 300 | 301 | q <- " 302 | select last_day(p.created_at) as month, count(*) as contributing 303 | from projects p 304 | where p.forked_from = ( 305 | select p.id 306 | from projects p 307 | where p.id = %d) 308 | and exists ( 309 | select * 310 | from pull_requests pr 311 | where pr.head_repo_id = p.id) 312 | group by month 313 | " 314 | 315 | res <- dbSendQuery(db, sprintf(unwrap(q), pid)) 316 | contrib <- fetch(res, n = -1) 317 | df <- merge(forks, contrib, by = 'month', all.x = TRUE) # keep months that have forks but no contributing forks 318 | df$contributing[is.na(df$contributing)] <- 0 319 | df$month <- as.POSIXct(df$month) 320 | df <- melt(df, id=c('month')) 321 | df <- rename(df, c("variable"="forks")) 322 | 323 | ggplot(df) + 324 | aes(x = month, y = value, fill = forks) + 325 | scale_x_datetime() + 326 | geom_freqpoly(aes(group = forks, colour = forks), stat="identity", size = 2) + 327 | xlab("Date") + ylab("Number of forks") 328 | ``` 329 | 330 | This is a plot of forks created per month versus forks contributing code back 331 | (in the form of pull requests) per month. Ideally, all forks should contribute 332 | back. In a healthy community, the monthly number of forks contributing should be 333 | increasing, as the total number of forks increases. 334 | 335 | 
336 | Generated at: `r date()` 337 | -------------------------------------------------------------------------------- /assets/themes/twitter/bootstrap/css/bootstrap-responsive.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Responsive v2.3.0 3 | * 4 | * Copyright 2012 Twitter, Inc 5 | * Licensed under the Apache License v2.0 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Designed and built with all the love in the world @twitter by @mdo and @fat. 9 | */.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;line-height:0;content:""}.clearfix:after{clear:both}.hide-text{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.input-block-level{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}@-ms-viewport{width:device-width}.hidden{display:none;visibility:hidden}.visible-phone{display:none!important}.visible-tablet{display:none!important}.hidden-desktop{display:none!important}.visible-desktop{display:inherit!important}@media(min-width:768px) and (max-width:979px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-tablet{display:inherit!important}.hidden-tablet{display:none!important}}@media(max-width:767px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-phone{display:inherit!important}.hidden-phone{display:none!important}}.visible-print{display:none!important}@media print{.visible-print{display:inherit!important}.hidden-print{display:none!important}}@media(min-width:1200px){.row{margin-left:-30px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:30px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom 
.container{width:1170px}.span12{width:1170px}.span11{width:1070px}.span10{width:970px}.span9{width:870px}.span8{width:770px}.span7{width:670px}.span6{width:570px}.span5{width:470px}.span4{width:370px}.span3{width:270px}.span2{width:170px}.span1{width:70px}.offset12{margin-left:1230px}.offset11{margin-left:1130px}.offset10{margin-left:1030px}.offset9{margin-left:930px}.offset8{margin-left:830px}.offset7{margin-left:730px}.offset6{margin-left:630px}.offset5{margin-left:530px}.offset4{margin-left:430px}.offset3{margin-left:330px}.offset2{margin-left:230px}.offset1{margin-left:130px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid [class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.564102564102564%;*margin-left:2.5109110747408616%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.564102564102564%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.45299145299145%;*width:91.39979996362975%}.row-fluid .span10{width:82.90598290598291%;*width:82.8527914166212%}.row-fluid .span9{width:74.35897435897436%;*width:74.30578286961266%}.row-fluid .span8{width:65.81196581196582%;*width:65.75877432260411%}.row-fluid .span7{width:57.26495726495726%;*width:57.21176577559556%}.row-fluid .span6{width:48.717948717948715%;*width:48.664757228587014%}.row-fluid .span5{width:40.17094017094017%;*width:40.11774868157847%}.row-fluid .span4{width:31.623931623931625%;*width:31.570740134569924%}.row-fluid .span3{width:23.076923076923077%;*width:23.023731587561375%}.row-fluid .span2{width:14.52991452991453%;*width:14.476723040552828%}.row-fluid .span1{width:5.982905982905983%;*width:5.929714493544281%}.row-fluid 
.offset12{margin-left:105.12820512820512%;*margin-left:105.02182214948171%}.row-fluid .offset12:first-child{margin-left:102.56410256410257%;*margin-left:102.45771958537915%}.row-fluid .offset11{margin-left:96.58119658119658%;*margin-left:96.47481360247316%}.row-fluid .offset11:first-child{margin-left:94.01709401709402%;*margin-left:93.91071103837061%}.row-fluid .offset10{margin-left:88.03418803418803%;*margin-left:87.92780505546462%}.row-fluid .offset10:first-child{margin-left:85.47008547008548%;*margin-left:85.36370249136206%}.row-fluid .offset9{margin-left:79.48717948717949%;*margin-left:79.38079650845607%}.row-fluid .offset9:first-child{margin-left:76.92307692307693%;*margin-left:76.81669394435352%}.row-fluid .offset8{margin-left:70.94017094017094%;*margin-left:70.83378796144753%}.row-fluid .offset8:first-child{margin-left:68.37606837606839%;*margin-left:68.26968539734497%}.row-fluid .offset7{margin-left:62.393162393162385%;*margin-left:62.28677941443899%}.row-fluid .offset7:first-child{margin-left:59.82905982905982%;*margin-left:59.72267685033642%}.row-fluid .offset6{margin-left:53.84615384615384%;*margin-left:53.739770867430444%}.row-fluid .offset6:first-child{margin-left:51.28205128205128%;*margin-left:51.175668303327875%}.row-fluid .offset5{margin-left:45.299145299145295%;*margin-left:45.1927623204219%}.row-fluid .offset5:first-child{margin-left:42.73504273504273%;*margin-left:42.62865975631933%}.row-fluid .offset4{margin-left:36.75213675213675%;*margin-left:36.645753773413354%}.row-fluid .offset4:first-child{margin-left:34.18803418803419%;*margin-left:34.081651209310785%}.row-fluid .offset3{margin-left:28.205128205128204%;*margin-left:28.0987452264048%}.row-fluid .offset3:first-child{margin-left:25.641025641025642%;*margin-left:25.53464266230224%}.row-fluid .offset2{margin-left:19.65811965811966%;*margin-left:19.551736679396257%}.row-fluid .offset2:first-child{margin-left:17.094017094017094%;*margin-left:16.98763411529369%}.row-fluid 
.offset1{margin-left:11.11111111111111%;*margin-left:11.004728132387708%}.row-fluid .offset1:first-child{margin-left:8.547008547008547%;*margin-left:8.440625568285142%}input,textarea,.uneditable-input{margin-left:0}.controls-row [class*="span"]+[class*="span"]{margin-left:30px}input.span12,textarea.span12,.uneditable-input.span12{width:1156px}input.span11,textarea.span11,.uneditable-input.span11{width:1056px}input.span10,textarea.span10,.uneditable-input.span10{width:956px}input.span9,textarea.span9,.uneditable-input.span9{width:856px}input.span8,textarea.span8,.uneditable-input.span8{width:756px}input.span7,textarea.span7,.uneditable-input.span7{width:656px}input.span6,textarea.span6,.uneditable-input.span6{width:556px}input.span5,textarea.span5,.uneditable-input.span5{width:456px}input.span4,textarea.span4,.uneditable-input.span4{width:356px}input.span3,textarea.span3,.uneditable-input.span3{width:256px}input.span2,textarea.span2,.uneditable-input.span2{width:156px}input.span1,textarea.span1,.uneditable-input.span1{width:56px}.thumbnails{margin-left:-30px}.thumbnails>li{margin-left:30px}.row-fluid .thumbnails{margin-left:0}}@media(min-width:768px) and (max-width:979px){.row{margin-left:-20px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:20px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom 
.container{width:724px}.span12{width:724px}.span11{width:662px}.span10{width:600px}.span9{width:538px}.span8{width:476px}.span7{width:414px}.span6{width:352px}.span5{width:290px}.span4{width:228px}.span3{width:166px}.span2{width:104px}.span1{width:42px}.offset12{margin-left:764px}.offset11{margin-left:702px}.offset10{margin-left:640px}.offset9{margin-left:578px}.offset8{margin-left:516px}.offset7{margin-left:454px}.offset6{margin-left:392px}.offset5{margin-left:330px}.offset4{margin-left:268px}.offset3{margin-left:206px}.offset2{margin-left:144px}.offset1{margin-left:82px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid [class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.7624309392265194%;*margin-left:2.709239449864817%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.7624309392265194%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.43646408839778%;*width:91.38327259903608%}.row-fluid .span10{width:82.87292817679558%;*width:82.81973668743387%}.row-fluid .span9{width:74.30939226519337%;*width:74.25620077583166%}.row-fluid .span8{width:65.74585635359117%;*width:65.69266486422946%}.row-fluid .span7{width:57.18232044198895%;*width:57.12912895262725%}.row-fluid .span6{width:48.61878453038674%;*width:48.56559304102504%}.row-fluid .span5{width:40.05524861878453%;*width:40.00205712942283%}.row-fluid .span4{width:31.491712707182323%;*width:31.43852121782062%}.row-fluid .span3{width:22.92817679558011%;*width:22.87498530621841%}.row-fluid .span2{width:14.3646408839779%;*width:14.311449394616199%}.row-fluid .span1{width:5.801104972375691%;*width:5.747913483013988%}.row-fluid .offset12{margin-left:105.52486187845304%;*margin-left:105.41847889972962%}.row-fluid 
.offset12:first-child{margin-left:102.76243093922652%;*margin-left:102.6560479605031%}.row-fluid .offset11{margin-left:96.96132596685082%;*margin-left:96.8549429881274%}.row-fluid .offset11:first-child{margin-left:94.1988950276243%;*margin-left:94.09251204890089%}.row-fluid .offset10{margin-left:88.39779005524862%;*margin-left:88.2914070765252%}.row-fluid .offset10:first-child{margin-left:85.6353591160221%;*margin-left:85.52897613729868%}.row-fluid .offset9{margin-left:79.8342541436464%;*margin-left:79.72787116492299%}.row-fluid .offset9:first-child{margin-left:77.07182320441989%;*margin-left:76.96544022569647%}.row-fluid .offset8{margin-left:71.2707182320442%;*margin-left:71.16433525332079%}.row-fluid .offset8:first-child{margin-left:68.50828729281768%;*margin-left:68.40190431409427%}.row-fluid .offset7{margin-left:62.70718232044199%;*margin-left:62.600799341718584%}.row-fluid .offset7:first-child{margin-left:59.94475138121547%;*margin-left:59.838368402492065%}.row-fluid .offset6{margin-left:54.14364640883978%;*margin-left:54.037263430116376%}.row-fluid .offset6:first-child{margin-left:51.38121546961326%;*margin-left:51.27483249088986%}.row-fluid .offset5{margin-left:45.58011049723757%;*margin-left:45.47372751851417%}.row-fluid .offset5:first-child{margin-left:42.81767955801105%;*margin-left:42.71129657928765%}.row-fluid .offset4{margin-left:37.01657458563536%;*margin-left:36.91019160691196%}.row-fluid .offset4:first-child{margin-left:34.25414364640884%;*margin-left:34.14776066768544%}.row-fluid .offset3{margin-left:28.45303867403315%;*margin-left:28.346655695309746%}.row-fluid .offset3:first-child{margin-left:25.69060773480663%;*margin-left:25.584224756083227%}.row-fluid .offset2{margin-left:19.88950276243094%;*margin-left:19.783119783707537%}.row-fluid .offset2:first-child{margin-left:17.12707182320442%;*margin-left:17.02068884448102%}.row-fluid .offset1{margin-left:11.32596685082873%;*margin-left:11.219583872105325%}.row-fluid 
.offset1:first-child{margin-left:8.56353591160221%;*margin-left:8.457152932878806%}input,textarea,.uneditable-input{margin-left:0}.controls-row [class*="span"]+[class*="span"]{margin-left:20px}input.span12,textarea.span12,.uneditable-input.span12{width:710px}input.span11,textarea.span11,.uneditable-input.span11{width:648px}input.span10,textarea.span10,.uneditable-input.span10{width:586px}input.span9,textarea.span9,.uneditable-input.span9{width:524px}input.span8,textarea.span8,.uneditable-input.span8{width:462px}input.span7,textarea.span7,.uneditable-input.span7{width:400px}input.span6,textarea.span6,.uneditable-input.span6{width:338px}input.span5,textarea.span5,.uneditable-input.span5{width:276px}input.span4,textarea.span4,.uneditable-input.span4{width:214px}input.span3,textarea.span3,.uneditable-input.span3{width:152px}input.span2,textarea.span2,.uneditable-input.span2{width:90px}input.span1,textarea.span1,.uneditable-input.span1{width:28px}}@media(max-width:767px){body{padding-right:20px;padding-left:20px}.navbar-fixed-top,.navbar-fixed-bottom,.navbar-static-top{margin-right:-20px;margin-left:-20px}.container-fluid{padding:0}.dl-horizontal dt{float:none;width:auto;clear:none;text-align:left}.dl-horizontal dd{margin-left:0}.container{width:auto}.row-fluid{width:100%}.row,.thumbnails{margin-left:0}.thumbnails>li{float:none;margin-left:0}[class*="span"],.uneditable-input[class*="span"],.row-fluid [class*="span"]{display:block;float:none;width:100%;margin-left:0;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.span12,.row-fluid .span12{width:100%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="offset"]:first-child{margin-left:0}.input-large,.input-xlarge,.input-xxlarge,input[class*="span"],select[class*="span"],textarea[class*="span"],.uneditable-input{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.input-prepend 
input,.input-append input,.input-prepend input[class*="span"],.input-append input[class*="span"]{display:inline-block;width:auto}.controls-row [class*="span"]+[class*="span"]{margin-left:0}.modal{position:fixed;top:20px;right:20px;left:20px;width:auto;margin:0}.modal.fade{top:-100px}.modal.fade.in{top:20px}}@media(max-width:480px){.nav-collapse{-webkit-transform:translate3d(0,0,0)}.page-header h1 small{display:block;line-height:20px}input[type="checkbox"],input[type="radio"]{border:1px solid #ccc}.form-horizontal .control-label{float:none;width:auto;padding-top:0;text-align:left}.form-horizontal .controls{margin-left:0}.form-horizontal .control-list{padding-top:0}.form-horizontal .form-actions{padding-right:10px;padding-left:10px}.media .pull-left,.media .pull-right{display:block;float:none;margin-bottom:10px}.media-object{margin-right:0;margin-left:0}.modal{top:10px;right:10px;left:10px}.modal-header .close{padding:10px;margin:-10px}.carousel-caption{position:static}}@media(max-width:979px){body{padding-top:0}.navbar-fixed-top,.navbar-fixed-bottom{position:static}.navbar-fixed-top{margin-bottom:20px}.navbar-fixed-bottom{margin-top:20px}.navbar-fixed-top .navbar-inner,.navbar-fixed-bottom .navbar-inner{padding:5px}.navbar .container{width:auto;padding:0}.navbar .brand{padding-right:10px;padding-left:10px;margin:0 0 0 -5px}.nav-collapse{clear:both}.nav-collapse .nav{float:none;margin:0 0 10px}.nav-collapse .nav>li{float:none}.nav-collapse .nav>li>a{margin-bottom:2px}.nav-collapse .nav>.divider-vertical{display:none}.nav-collapse .nav .nav-header{color:#777;text-shadow:none}.nav-collapse .nav>li>a,.nav-collapse .dropdown-menu a{padding:9px 15px;font-weight:bold;color:#777;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.nav-collapse .btn{padding:4px 10px 4px;font-weight:normal;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.nav-collapse .dropdown-menu li+li a{margin-bottom:2px}.nav-collapse .nav>li>a:hover,.nav-collapse 
.nav>li>a:focus,.nav-collapse .dropdown-menu a:hover,.nav-collapse .dropdown-menu a:focus{background-color:#f2f2f2}.navbar-inverse .nav-collapse .nav>li>a,.navbar-inverse .nav-collapse .dropdown-menu a{color:#999}.navbar-inverse .nav-collapse .nav>li>a:hover,.navbar-inverse .nav-collapse .nav>li>a:focus,.navbar-inverse .nav-collapse .dropdown-menu a:hover,.navbar-inverse .nav-collapse .dropdown-menu a:focus{background-color:#111}.nav-collapse.in .btn-group{padding:0;margin-top:5px}.nav-collapse .dropdown-menu{position:static;top:auto;left:auto;display:none;float:none;max-width:none;padding:0;margin:0 15px;background-color:transparent;border:0;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none}.nav-collapse .open>.dropdown-menu{display:block}.nav-collapse .dropdown-menu:before,.nav-collapse .dropdown-menu:after{display:none}.nav-collapse .dropdown-menu .divider{display:none}.nav-collapse .nav>li>.dropdown-menu:before,.nav-collapse .nav>li>.dropdown-menu:after{display:none}.nav-collapse .navbar-form,.nav-collapse .navbar-search{float:none;padding:10px 15px;margin:10px 0;border-top:1px solid #f2f2f2;border-bottom:1px solid #f2f2f2;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);-moz-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1)}.navbar-inverse .nav-collapse .navbar-form,.navbar-inverse .nav-collapse .navbar-search{border-top-color:#111;border-bottom-color:#111}.navbar .nav-collapse .nav.pull-right{float:none;margin-left:0}.nav-collapse,.nav-collapse.collapse{height:0;overflow:hidden}.navbar .btn-navbar{display:block}.navbar-static .navbar-inner{padding-right:10px;padding-left:10px}}@media(min-width:980px){.nav-collapse.collapse{height:auto!important;overflow:visible!important}} 10 | 
--------------------------------------------------------------------------------