├── 404.html ├── local.css ├── _plugins ├── ext.rb ├── mdhtml.rb └── debug.rb ├── files ├── mslogo.png ├── rulogo.gif ├── schema.pdf ├── schema.png ├── tudelftlogo.png └── ghtorrent-data.pdf ├── vm.md ├── README.md ├── Gemfile ├── .gitignore ├── assets └── themes │ └── twitter │ ├── bootstrap │ ├── img │ │ ├── glyphicons-halflings.png │ │ └── glyphicons-halflings-white.png │ └── css │ │ └── bootstrap-responsive.min.css │ └── css │ └── style.css ├── sitemap.txt ├── pages.html ├── _layouts ├── page.html └── default.html ├── dumps ├── update-downloads.sh ├── index.erb ├── run-all.sh ├── index.rb └── ght-periodic-dump ├── stats ├── extract-events.sh ├── extract-stats.sh ├── genstats.sh ├── index.md └── api-stats.R ├── docs.md ├── _includes └── comments.html ├── atom.xml ├── _config.yml ├── contrib.md ├── downloads.md ├── basedupon.md ├── lean.html ├── mysql.md ├── services.md ├── raw.md ├── pullreq-perf ├── openess-report.R ├── index.Rmd └── report.Rmd ├── vissoft14.md ├── syntax.css ├── geninst.md ├── mongo.md ├── cookbook.md ├── leanprogress.html ├── index.md ├── gcloud.md ├── streaming.md ├── pers-data.md ├── halloffame.md ├── ght-ubuntu.md ├── _bibliography └── references.bib ├── faq.md ├── msr14.md ├── Rakefile └── relational.md /404.html: -------------------------------------------------------------------------------- 1 | Sorry this page does not exist =( 2 | -------------------------------------------------------------------------------- /local.css: -------------------------------------------------------------------------------- 1 | h4 { 2 | font-weight: bold; 3 | } 4 | -------------------------------------------------------------------------------- /_plugins/ext.rb: -------------------------------------------------------------------------------- 1 | require 'jekyll/scholar' 2 | require 'pp' 3 | puts 'Loaded scholar' 4 | 5 | -------------------------------------------------------------------------------- /files/mslogo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/mslogo.png -------------------------------------------------------------------------------- /files/rulogo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/rulogo.gif -------------------------------------------------------------------------------- /files/schema.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.pdf -------------------------------------------------------------------------------- /files/schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/schema.png -------------------------------------------------------------------------------- /vm.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Client VM 4 | tagline: 5 | --- 6 | 7 | Under Construction! 
8 | -------------------------------------------------------------------------------- /files/tudelftlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/tudelftlogo.png -------------------------------------------------------------------------------- /files/ghtorrent-data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/files/ghtorrent-data.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Installing 2 | 3 | ```` 4 | su 5 | apt-get install ruby 6 | gem install jekyll jekyll-scholar 7 | ```` 8 | 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'rouge' 4 | gem 'rdiscount' 5 | gem 'jekyll-watch' 6 | gem 'jekyll-scholar' 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _site/* 2 | _theme_packages/* 3 | pullreq-perf/*/ 4 | *~ 5 | Thumbs.db 6 | .DS_Store 7 | 8 | !.gitkeep 9 | 10 | .rbenv-version 11 | .rvmrc 12 | -------------------------------------------------------------------------------- /assets/themes/twitter/bootstrap/img/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png -------------------------------------------------------------------------------- /assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ice3man543/ghtorrent.org/master/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png -------------------------------------------------------------------------------- /sitemap.txt: -------------------------------------------------------------------------------- 1 | --- 2 | # Remember to set production_url in your _config.yml file! 3 | title : Sitemap 4 | --- 5 | {% for page in site.pages %} 6 | {{site.production_url}}{{ page.url }}{% endfor %} 7 | {% for post in site.posts %} 8 | {{site.production_url}}{{ post.url }}{% endfor %} -------------------------------------------------------------------------------- /pages.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Pages 4 | header: Pages 5 | group: navigation 6 | --- 7 | 8 |
| 11 | | 12 | |
|---|---|
![]() |
17 | ![]() |
19 |
![]() |
22 | ![]() |
23 |
| Dump date | 8 | <% @collections.sort.each do |c| %> 9 |<%=c.gsub("_", " ") %> | 10 | <% end %> 11 |
|---|---|
| <%= @d.date %> | 17 | <% for @col in @collections.sort %> 18 | <% @t = @d.torrents[@col] %> 19 |<% unless @t.nil? %> 20 | <%= @t.size%> MB 21 | <% end %> 22 | | 23 | <% end %> 24 |
#{obj.class}\n#{obj.pretty_inspect}"
33 | end
34 |
35 | end # DebugFilter
36 | end # Jekyll
37 |
38 | Liquid::Template.register_filter(Jekyll::DebugFilter)
--------------------------------------------------------------------------------
/dumps/run-all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Invokes ./ght-periodic-dump once per consecutive two-month window (-f = from, -t = to).
3 | #./ght-periodic-dump -f '2012-01-31 00:00' -t '2012-03-31 00:00'
4 | #./update-downloads.sh
5 | # Commented-out windows below are presumably ones that were already dumped -- TODO confirm.
6 | #./ght-periodic-dump -f '2012-03-31 00:00' -t '2012-05-31 00:00'
7 | #./update-downloads.sh
8 |
9 | #./ght-periodic-dump -f '2012-05-31 00:00' -t '2012-07-31 00:00'
10 | #./update-downloads.sh
11 |
12 | #./ght-periodic-dump -f '2012-07-31 00:00' -t '2012-09-30 00:00'
13 | #./update-downloads.sh
14 |
15 | #./ght-periodic-dump -f '2012-09-30 00:00' -t '2012-11-30 00:00'
16 | #./update-downloads.sh
17 |
18 | #./ght-periodic-dump -f '2012-11-30 00:00' -t '2013-01-30 00:00'
19 | #./update-downloads.sh
20 |
21 | #./ght-periodic-dump -f '2013-01-30 00:00' -t '2013-03-30 00:00'
22 | #./update-downloads.sh
23 |
24 | #./ght-periodic-dump -f '2013-03-30 00:00' -t '2013-05-30 00:00'
25 | #./update-downloads.sh
26 |
27 | #./ght-periodic-dump -f '2013-05-30 00:00' -t '2013-07-30 00:00'
28 | #./update-downloads.sh
29 |
30 | #./ght-periodic-dump -f '2013-07-30 00:00' -t '2013-09-30 00:00'
31 | #./update-downloads.sh
32 | # Only the two windows below are currently active. Paths are relative, so run from this directory.
33 | ./ght-periodic-dump -f '2013-09-30 00:00' -t '2013-11-30 00:00'
34 | #./update-downloads.sh
35 | # NOTE(review): update-downloads.sh is disabled after both active dumps -- confirm this is intentional.
36 | ./ght-periodic-dump -f '2013-11-30 00:00' -t '2014-01-30 00:00'
37 | #./update-downloads.sh
38 |
39 |
--------------------------------------------------------------------------------
/contrib.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Contributing to GHTorrent
4 | tagline:
5 | ---
6 |
7 | Are you considering contributing to GHTorrent? That's great! We value any
8 | contribution, no matter how small, big, simple or sophisticated it is. If you
9 | decide to send a pull request, we will actively help you to get your pull
10 | request integrated.
11 |
12 | ## Setting up GHTorrent
13 |
14 | The first thing to do is to ensure that you have a working GHTorrent
15 | environment. To do so, please consult the top level
16 | [README.md](https://github.com/gousiosg/github-mirror/blob/master/README.md) file with
17 | instructions on doing so.
18 |
19 | ## TODO list
20 |
21 | The TODO list is maintained as a collection of open [GitHub
22 | issues](https://github.com/gousiosg/github-mirror). Please feel free to adopt
23 | any of those by @mentioning the @ghtorrent user.
24 |
25 | ## New features
26 |
27 | Do you have a cool idea that will make GHTorrent 100x (or 0.01x) better? That's
28 | great! We look forward to reviewing your pull requests! We however advise you
29 | to:
30 |
31 | 1. Read the [open issue list](https://github.com/gousiosg/github-mirror/issues)
32 | 2. Contact the [GHTorrent mailing list](). The maintainers will help you
33 | implement your proposed feature as efficiently as possible and make sure
34 | that it does not conflict with any change currently planned.
35 |
--------------------------------------------------------------------------------
/downloads.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Downloads
4 | tagline:
5 | ---
6 |
7 | ### What am I downloading?
8 |
9 | * The MySQL dump is a full, up to date database dump. You can use it for querying the latest available data.
10 | * The MongoDB dumps are incremental. They are provided mostly for reference and backup purposes, as they may contain duplicates. The reasons for this are the following:
11 | * When refreshing project data, old records are deleted and new ones are added. This cannot be reflected in the dumps (it is not practical to regenerate all dumps every time).
12 | * The dumps have already been restored once, hence the dump dates do not
13 | represent the actual data generation dates.
14 |
15 | For those reasons, we recommend using the MongoDB data through our [query
16 | service](http://ghtorrent.org/raw.html).
17 |
18 | ### Which is the applicable license?
19 |
20 | See [here](faq.html)
21 |
22 | ### MySQL database dumps
23 | As of MySQL dump
24 | `mysql-2015-09-25`, we are distributing CSV files (one file per table) instead
25 | of `mysqldump` based backups. The provided archive expands to a directory
26 | including a restore script and instructions on how to do the restore. See more
27 | information [here](https://github.com/gousiosg/github-mirror/tree/master/sql).
28 |
29 | You can also [query MySQL](/dblite). It is always loaded with the latest
30 | dump.
31 |
--------------------------------------------------------------------------------
/assets/themes/twitter/css/style.css:
--------------------------------------------------------------------------------
1 | /* Override some defaults */
2 | html, body { /* light grey page background */
3 | background-color: #eee;
4 | }
5 | .navbar { /* drop the navbar's bottom margin */
6 | margin-bottom: 0;
7 | }
8 | .container > footer { /* footers that are direct children of .container */
9 | margin-top: 20px;
10 | }
11 | .container > footer p {
12 | text-align: center; /* center align it with the container */
13 | }
14 |
15 | /* The white background content wrapper */
16 | .content {
17 | background-color: #fff;
18 | padding: 20px;
19 | margin: 0 -20px; /* negative indent the amount of the padding to maintain the grid system */
20 | -webkit-border-radius: 0 0 6px 6px; /* only the two bottom corners are rounded; vendor-prefixed copies precede the standard property */
21 | -moz-border-radius: 0 0 6px 6px;
22 | border-radius: 0 0 6px 6px;
23 | -webkit-box-shadow: 0 1px 2px rgba(0,0,0,.15); /* faint 1px drop shadow below the wrapper */
24 | -moz-box-shadow: 0 1px 2px rgba(0,0,0,.15);
25 | box-shadow: 0 1px 2px rgba(0,0,0,.15);
26 | }
27 |
28 | /* Page header tweaks */
29 | .page-header {
30 | background-color: #f5f5f5;
31 | padding: 20px 20px 10px;
32 | margin: -20px -20px 20px; /* NOTE(review): presumably cancels the 20px padding of .content so the header spans edge to edge -- confirm against the layouts */
33 | }
34 |
35 | .topbar .btn { /* borderless buttons inside .topbar */
36 | border: 0;
37 | }
38 |
39 |
40 | /* tag_box ======================================================== */
41 |
42 | .tag_box { /* strip list bullets and margins */
43 | list-style:none;
44 | margin:0;
45 | padding:5px 0 ;
46 | overflow:hidden;
47 | }
48 | .tag_box li {
49 | line-height:28px;
50 | }
51 | .tag_box.inline li { /* .inline variant: items are floated left */
52 | float:left;
53 | }
54 | .tag_box a { /* each link is drawn as a small rounded grey box */
55 | padding: 3px 6px;
56 | margin: 2px;
57 | background: #eee;
58 | color:#005F6B;
59 | border-radius: 3px;
60 | text-decoration:none;
61 | }
62 | .tag_box a span{ /* span inside a link: smaller text, raised like a superscript */
63 | vertical-align:super;
64 | font-size:0.8em;
65 | }
66 | .tag_box a.active { /* .active link: green background with white text */
67 | background:#57A957;
68 | border:1px solid #4C964D;
69 | color:#FFF;
70 | }
71 |
--------------------------------------------------------------------------------
/dumps/index.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'erb'
5 | require 'set'
6 | require 'date'
7 |
# View model handed to the ERB template: holds the set of dumps and the set
# of collection names found in them. The last-update date is stored in
# @last_update; it has no reader and is reachable only through the binding
# returned by #get_binding.
class GHTorrent
  attr_reader :collections, :dumps

  # last_update - value stored as @last_update for use by the template.
  def initialize(last_update)
    @collections = Set.new
    @dumps = Set.new
    @last_update = last_update
  end

  # Registers a dump; duplicates are collapsed by the Set. Returns the set.
  def add_dump(dump)
    @dumps.add(dump)
  end

  # Registers a collection name; duplicates are collapsed by the Set.
  # Returns the set.
  def add_collection(col)
    @collections.add(col)
  end

  # Kernel#binding is private, so this public wrapper lets callers evaluate
  # a template in the context of this instance.
  def get_binding
    binding
  end
end
32 |
# One dump: the torrents that share a single dump date.
class Dump
  attr_reader :torrents, :date

  # torrents - the torrents belonging to this dump (the script passes a Hash
  #            keyed by collection name).
  # date     - the date shared by those torrents.
  def initialize(torrents, date)
    @torrents, @date = torrents, date
  end
end
42 |
# Read-only record describing one downloadable archive.
class Torrent
  attr_reader :url, :name, :size, :date

  # url  - download URL of the archive.
  # name - name of the dumped collection.
  # size - archive size (the script passes whole megabytes).
  # date - dump date.
  def initialize(url, name, size, date)
    @url, @name = url, name
    @size, @date = size, date
  end
end
55 |
# Base URL under which the dump archives are published.
url_prefix = "http://ghtorrent.org/downloads"

# Load the template. File.read closes the file handle, which the previous
# File.open(...).read left open.
rhtml = ERB.new(File.read("index.erb"))

# Directory to scan for .torrent files: first command line argument,
# defaulting to the current directory.
dir = ARGV.shift || "."

torrents = Dir.entries(dir).map do |f|
  # Torrent files are named <collection>-<word>.<date>.torrent; capture the
  # collection name and the dump date. The pattern is anchored at the end so
  # names such as "x.torrent.bak" are ignored.
  matches = /([a-z0-9_]+)-[a-z]+\.(.*)\.torrent\z/.match(f)
  next if matches.nil?

  # The archive sits next to the torrent file. Replace only the trailing
  # extension: the previous unescaped, unanchored /.torrent/ also rewrote
  # any "?torrent" inside the name (e.g. "ghtorrent").
  dump = f.sub(/\.torrent\z/, ".tar.gz")
  dump_path = File.join(dir, dump)

  # A torrent without its archive used to abort the whole run with
  # Errno::ENOENT; skip it and report it instead.
  unless File.file?(dump_path)
    warn "Skipping #{f}: #{dump_path} not found"
    next
  end

  # Archive size in whole megabytes (integer division).
  size = File.size(dump_path) / 1024 / 1024
  date = Date.parse(matches[2])

  # Archives smaller than 1 MB are not listed.
  Torrent.new(url_prefix + "/" + dump, matches[1], size, date) if size > 0
end.compact

all_dates = torrents.map(&:date).to_set

# One Dump per date, holding that date's torrents keyed by collection name
# (a later torrent with the same name replaces an earlier one). group_by
# makes a single pass instead of re-scanning all torrents for every date.
all_dumps = torrents.group_by(&:date).map do |d, date_torrents|
  name_torrents = date_torrents.each_with_object({}) { |t, acc| acc[t.name] = t }
  Dump.new(name_torrents, d)
end

# Most recent dump date; nil when no torrents were found.
max_date = all_dates.max

ghtorrent = GHTorrent.new(max_date)
all_dumps.each do |x|
  ghtorrent.add_dump x
  x.torrents.each_value { |t| ghtorrent.add_collection t.name }
end
104 |
105 | puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/| Version | 14 |Release date | 15 |Fixed error | 16 |
|---|---|---|
| 1.0 | 21 |1 Mar 2014 | 22 |23 | |
| Collection name | 16 |Github API URL | 17 |Documentation URL | 18 |
|---|---|---|
| commit_comments | 23 |#{user}/#{repo}/commits/#{sha}/comments | 24 |commit comments | 25 |
| commits | 28 |repos/#{user}/#{repo}/commits | 29 |commits | 30 |
| events | 33 |events | 34 |events | 35 |
| followers | 38 |users/#{user}/followers | 39 |followers list | 40 |
| forks | 43 |repos/#{user}/#{repo}/forks | 44 |forks list | 45 |
| issues | 48 |/repos/#{owner}/#{repo}/issues | 49 |issues for a repo | 50 |
| issue_comments | 53 |repos/#{owner}/#{repo}/issues/comments/#{comment_id} | 54 |issue comments | 55 |
| issue_events | 58 |repos/#{owner}/#{repo}/issues/events/#{event_id} | 59 |issue events | 60 |
| org_members | 63 |orgs/#{org}/members | 64 |organization members | 65 |
| pull_request_comments | 68 |repos/#{owner}/#{repo}/pulls/#{pullreq_id}/comments | 69 |pull request review comments | 70 |
| pull_requests | 73 |repos/#{user}/#{repo}/pulls | 74 |pull requests | 75 |
| repo_collaborators | 78 |repos/#{user}/#{repo}/collaborators | 79 |repo collaborators | 80 |
| repo_labels | 83 |repos/#{owner}/#{repo}/issues/#{issue_id}/labels | 84 |issue labels | 85 |
| repos | 88 |repos/#{user}/#{repo} | 89 |repositories | 90 |
| users | 93 |users/#{user} | 94 |users | 95 |
| watchers | 98 |repos/#{user}/#{repo}/stargazers | 99 |stargazers | 100 |
89 |
93 |
97 |
41 |
42 | For commercial uses, please [contact the maintainer](mailto:gousiosg@gmail.com) for more information. Usually, a sizable donation to the project will be enough
43 | to grant you full access.
44 |
45 | #### _Who is behind GHTorrent?_
46 |
47 | GHTorrent was initially created and is currently maintained by [Georgios
48 | Gousios](http://gousios.org), with initial design support and ideas from
49 | [Diomidis Spinellis](http://spinellis.gr). Several users have contributed code,
50 | ideas and support over time. Here is a (hopefully not partial) list of them:
51 |
52 | Sebastian Bates, Derek Brown, Arie van Deursen, Daniel German, Jeff McAffer, Bogdan Vasilescu
53 |
54 | Financial support has been provided by the following organizations:
55 |
56 | * TU Delft: purchase and running costs for initial servers (2012 -- late 2015)
57 | * Microsoft: donation of Azure tokens for running the project infrastructure
58 | (late 2015 -- late 2016)
59 |
60 | #### _How is GHTorrent different from Github Archive?_
61 |
62 | [Github Archive](http://githubarchive.org) collects and stores the GitHub event
63 | stream. In addition to that, GHTorrent applies dependency based retrieval on all
64 | entities (e.g. commits, pull requests etc) that are linked from the events and
65 | stores the results in two databases: a raw data one (MongoDB) that stores the
66 | unprocessed responses from GitHub API and a relational one (MySQL) that stores
67 | links between the entities (e.g. commits are linked to projects). Using
68 | GHTorrent, developers can obtain an up-to-date, relational view of their
69 | project’s GitHub metadata, which can be used for answering questions regarding
70 | their project’s processes.
71 |
72 | ## How can I...?
73 |
74 | #### _...contribute to GHTorrent?_
75 |
76 | Please read the [contribution guide](contrib.html).
77 |
78 | #### _... cite the GHTorrent data set?_
79 |
80 | Georgios Gousios: [The GHTorrent dataset and tool
81 | suite](http://www.gousios.gr/bibliography/G13.html). MSR 2013: 233-236
82 |
83 | {%highlight text%}
84 | @inproceedings{Gousi13,
85 | author = {Gousios, Georgios},
86 | title = {The GHTorrent dataset and tool suite},
87 | booktitle = {Proceedings of the 10th Working Conference on Mining Software
88 | Repositories},
89 | series = {MSR '13},
90 | year = {2013},
91 | isbn = {978-1-4673-2936-1},
92 | location = {San Francisco, CA, USA},
93 | pages = {233--236},
94 | numpages = {4},
95 | url = {http://dl.acm.org/citation.cfm?id=2487085.2487132},
96 | acmid = {2487132},
97 | publisher = {IEEE Press},
98 | address = {Piscataway, NJ, USA},
99 | }
100 | {%endhighlight%}
101 |
102 | #### _...download the data?_
103 |
104 | You don't need to. GHTorrent offers a multitude of [online
105 | services](services.html) that enable access to almost realtime versions
106 | of the datastores. If you really want to, you can get all the data from
107 | the [downloads](downloads.html) page.
108 |
109 | #### _...use the data for my private project?_
110 |
111 | See the licensing information above.
112 |
113 | ## Data processing
114 |
115 | #### _What quality guarantees does GHTorrent offer?_
116 |
117 | The GHTorrent data come as is with no quality guarantees. However, we are
118 | actively seeking to fix systematic (i.e. errors that are repeated across the
119 | whole dataset) data collection errors. Please [open an
120 | issue](https://github.com/gousiosg/github-mirror/issues) if you find one. As
121 | GHTorrent is essentially a data sync operation over unreliable networks,
122 | spurious inconsistencies such as (minor) holes in data collection are
123 | unavoidable.
124 |
125 | #### _I've seen weird commit timestamps_
126 |
127 | Git records the commit timestamp on the developer's workstation. If the clock
128 | is misconfigured, timestamps will be weird. We have seen timestamps such
129 | as `0000-01-01 00:00` or `2034-12-31 23:59`. GitHub and GHTorrent do not
130 | process the timestamps in any way.
131 |
132 | #### _My data is out of date_
133 |
134 | Github only creates events when an entity is created and not when it is updated or deleted. It is therefore not possible to be completely up-to-date with changes in users (e.g. updated location) and repositories (e.g. renames). GHTorrent tries its best to stay up to date by refreshing all users and all repos every X months. As the DB contains 12M+ users and 30M+ repos, this process may take a while and it can also fail due to spurious reasons.
135 |
136 | ## Copyright and Privacy
137 |
138 | #### _Who owns the data that GHTorrent shares?_
139 |
140 | The copyright situation is very complicated; in essence, GitHub owns copyright
141 | to the data formats for the API responses, users own copyright of the content
142 | they create and the GHTorrent creator has copyright on the GHTorrent database
143 | schemata.
144 |
145 | #### _What types of privacy guarantees does GHTorrent offer?_
146 |
147 | GHTorrent collects publicly available data from the GitHub API.
148 |
149 | #### _How does GHTorrent handle my personal information?_
150 |
151 | By personal information, we mean data that identify a real person uniquely. In
152 | the context of GHTorrent, these are emails and real names.
153 |
154 | As of Mar 2016, GHTorrent does not distribute any personal information by
155 | default. Researchers whose research requires access to personal data
156 | can use [this form](pers-data.html) to obtain it.
157 |
158 | #### _Can I get more information?_
159 |
160 | Yes. Please read the following Slidedeck. If you are still in doubt,
161 | please contact us.
162 | | Version | 19 |Release date | 20 |Fixed error | 21 |
|---|---|---|
| 1.3 | 26 |13 Dec 2013 | 27 |Missing project members for some projects is now fixed | 28 |
| 1.2 | 31 |22 Oct 2013 | 32 |user_id in table commit_comments not set correctly. | 33 |
| 1.1 | 36 |9 Oct 2013 | 37 |38 | Table commit_comments was missing data. Some commits were missing from 39 | some projects. 40 | | 41 |
| 1.0 | 44 |28 Sep 2013 | 45 |46 | |
8 |
9 | [Download](files/schema.png) [Download PDF](files/schema.pdf)
10 |
11 | ## Entities and their relationships
12 |
13 | #### users
14 | Github users.
15 |
16 | * A user has a unique user name or email. May contain artificially generated user names, see [commits](relational.html#commits) below.
17 | * There are two `type`s of users, `USER`s and `ORG`anizations.
18 | * Users can be *real* or *fake*. Real users can own projects and perform
19 | actions such as open issues, create pull requests and push commits. Fake
20 | users only appear as authors or committers of commits. Fake users are marked
21 | by the `fake` field.
22 | * Organizations are meta users that point to a collection of users. The members of organizations can be found in `organization_members`. Organization users can only own projects and they do not perform any other actions.
23 | * Users may be marked as `deleted`. This means that the user was once active on
24 | GitHub but GHTorrent can no longer get his/her details.
25 |
26 | *Update Nov 2015:* User entries are now geocoded. The location field remains
27 | intact, while 5 fields have been added with information about the
28 | geographic location of the user. The Open Street Maps API has been used
29 | to do the mapping of the location field to the user's geocode. As a result,
30 | the state and city fields are stored in the local language of the geocoded
31 | area. Also, many users do not report their location or their location
32 | is filled in with random information; in those cases, no geocoding information
33 | is available.
34 |
35 | {% highlight sql %}
36 | -- See where most commits come from today
37 | select u.country_code, count(*)
38 | from commits c, users u
39 | where c.author_id = u.id
40 | and date(c.created_at) = date(now())
41 | group by u.country_code
42 | {% endhighlight %}
43 |
44 |
45 | *Update Mar 2016:* User personal data (emails and real names) are excluded
46 | from the downloaded dump, while configuration disallows access to those
47 | fields for the online access services for the MySQL database.
48 |
49 |
50 | #### organization\_members
51 | Users that are members of an organization.
52 |
53 | * The `created_at` field is only filled in accurately for memberships for which
54 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
55 | latest date that the corresponding user or organization has been created.
56 |
57 | *Update Nov 2015:* Organizations can now select whether membership information
58 | is revealed to external parties. This means that information about this
59 | table can no longer be accurate.
60 |
61 | #### projects
62 | Information about repositories. A repository is always owned by a user.
63 |
64 | * The `forked_from` field is empty unless the
65 | project is a fork in which case it contains the `id` of the project the project
66 | is forked from.
67 |
68 | * The `deleted` field means that the project has been deleted from Github.
69 |
70 | * The `updated_at` field indicates when the last full update was done for
71 | this project.
72 |
73 | #### project\_members
74 | Users that have commit access to the repository.
75 |
76 | The `created_at` field is only filled in accurately for memberships for which
77 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
78 | latest date that the corresponding user or project has been created.
79 |
80 | *Update Nov 2014:* GitHub has disabled the API end point used to retrieve
81 | members to an organization. GHTorrent uses the `MemberEvent` event to
82 | approximate memberships, but this is not always accurate. You are thus advised
83 | to use heuristics (e.g. the committers + mergers of pull requests) to calculate membership,
84 | such as the following:
85 |
86 | {% highlight sql %}
87 | -- Get active core team participants for the last 3 months
88 | select distinct(u.login) as login
89 | from commits c, users u, project_commits pc, users u1, projects p
90 | where u.id = c.committer_id
91 | and u.fake is false
92 | and pc.commit_id = c.id
93 | and pc.project_id = p.id
94 | and p.owner_id = u1.id
95 | and p.name = 'rails'
96 | and u1.login = 'rails'
97 | and c.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH)
98 | union
99 | select distinct(u.login) as login
100 | from pull_requests pr, projects p, users u, users u1, pull_request_history prh
101 | where u.id = prh.actor_id
102 | and prh.action = 'merged'
103 | and u1.id = p.owner_id
104 | and prh.pull_request_id = pr.id
105 | and pr.base_repo_id = p.id
106 | and prh.created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH)
107 | and p.name = 'rails'
108 | and u1.login = 'rails'
109 | {% endhighlight %}
110 |
111 |
112 | #### project\_languages
113 | Languages that are used in the repository along with **byte counts** for
114 | all files in those languages.
115 |
116 | Multiple entries can exist per project. The `created_at` field is filled in with
117 | the latest timestamp the query for a specific `project_id` was done.
118 |
119 | The table is filled in when the project has been first inserted or when
120 | an update round for all projects is made.
121 |
122 | {% highlight sql %}
123 | -- Get the latest byte count for languages in Ruby on Rails
124 | select *
125 | from project_languages
126 | where project_id = 1334
127 | order by created_at desc
128 |
129 | {% endhighlight %}
130 |
131 | #### commits
132 | Unique commits.
133 |
134 | * Each commit is identified globally through its `sha` field. If the author or
135 | the committer has not configured his [Github email address](https://help.github.com/articles/setting-your-email-in-git), no resolution to
136 | a `user` entry is possible. In that case, GHTorrent generates artificial users using the provided email in the Git commit author or committer fields. If the user
137 | then configures his Github account, GHTorrent will update the artificial user
138 | accordingly.
139 |
140 | * The `project_id` field contains a link to the project that this commit has
141 | been first associated with. This might not be the project this commit was
142 | initially pushed to, e.g. in case the fork is processed before the parent.
143 | See [project\_commits](relational.html#project_commits).
144 |
145 | * The `project_id` field may be null when the repository has been
146 | deleted at the time the commit is processed. This situation might happen when
147 | retrospectively processing pull requests for a repository and the
148 | repository which the pull request originates from has been deleted.
149 |
150 | #### commit\_parents
151 | The parent commit(s) for each commit, as specified by Git.
152 |
153 | #### project\_commits
154 | The commits belonging to the history of a project.
155 |
156 | More than one project can share the same commits if one is a fork of the other.
157 |
158 | #### commit\_comments
159 | Code review comments on commits.
160 |
161 | These are comments on individual commits. If a commit is associated with a pull
162 | request, then its comments are in the
163 | [pull\_request\_comments](relational.html#pull_request_comments) table.
164 |
165 | #### followers
166 | A follower to a user.
167 |
168 | The `created_at` field is only filled in accurately for followships for which
169 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
170 | latest date that the corresponding user or follower has been created.
171 |
172 | #### watchers
173 | Users that have starred (was [watched](https://github.com/blog/1204-notifications-stars)) a project
174 |
175 | The `created_at` field is only filled in accurately for starrings for which
176 | GHTorrent has recorded a corresponding event. Otherwise, it is filled in with the
177 | latest date that the corresponding user or project has been created.
178 |
179 | #### pull\_requests
180 | A pull request initiated from `head_repo_id`:`head_commit_id` to `base_repo_id`:`base_commit_id`
181 |
182 | * Pull requests can be in various states. The states and their transitions
183 | are recorded in the [pull\_request\_history](relational.html#pull_request_history) table.
184 | * The `pullreq_id` field is Github's pull request unique identifier
185 | * The `intra_branch` field signifies that the head and base repositories are the
186 | same
187 | * If the head repository is NULL, this means that the corresponding project had been deleted when GHTorrent processed the pull request.
188 |
189 | #### pull\_request\_history
190 | An event in the pull request lifetime
191 |
192 | The `action` field can take the following values
193 |
194 | * `opened`: When the pull request has been opened
195 | * `closed`: When the pull request has been closed
196 | * `merged`: When Github detected that the pull request has been merged. No merges
197 | outside Github (i.e. Git based) are reported
198 | * `reopened`: When a pull request is opened after being closed
199 | * `synchronize`: When new commits are added/removed to the head repository
200 |
201 | #### pull\_request\_commits
202 | A commit associated with a pull request
203 |
204 | The list is additive. This means if a rebase with commit squashing takes place after the commits of a pull request have been processed, the old commits will not be deleted.
205 |
206 | #### pull\_request\_comments
207 | A code review comment on a commit associated with a pull request
208 |
209 | The list is additive. If commits are squashed on the head repo, the comments
210 | remain intact.
211 |
212 | #### issues
213 | An issue associated with a repository
214 |
215 | * The `assignee` field is filled in with the user to which the issue was
216 | assigned at the time the issue was processed.
217 | * Issues have history recorded in the [issue\_events](relational.html#issue_events) table.
218 | * For every pull request, GHTorrent creates a corresponding issue. The
219 | `pull_request_id` field points to the associated pull request
220 | * The `issue_id` field is the unique identifier given to the issue by Github.
221 |
222 | #### issue\_events
223 | An event on an issue
224 |
225 | * The `action` field can have the following values:
226 | * `subscribed`: When a user subscribes to receive notifications about the issue.
227 | * `mentioned`: When a user is mentioned by another user (@user notation)
228 | * `closed`: When the issue has been closed
229 | * `referenced`: The issue was referenced in a commit (using the
230 | [fixes: conventions](https://github.com/blog/831-issues-2-0-the-next-generation))
231 | * `assigned`: When the issue has been assigned to an actor.
232 | * `reopened`: When a closed issue is reopened
233 | * `unsubscribed`: When a user unsubscribes from the issue.
234 | * `merged`: When the pull request pointed to by the issue has been merged.
235 | * `head_ref_cleaned`: (Not documented) ?
236 | * `head_ref_deleted`: (Not documented) When the branch of the head repository has been deleted
237 | * `head_ref_restored`: (Not documented) When the head repository of a pull
238 | request has been restored (using the restore branch functionality).
239 |
240 | * The `action_specific` field gets filled in with the `commit_id` of the last
241 | commit when a pull request has been closed, merged or referenced.
242 |
243 | #### issue\_comments
244 | An entry to the issue discussion. This table is always filled in with pull
245 | request (or issue) discussion comments, irrespective of whether the repository
246 | has issues enabled or not.
247 |
248 | #### repo\_labels
249 | A label to be assigned to an issue affecting this repository.
250 |
251 | #### issue\_labels
252 | A label that has been assigned to an issue
253 |
254 | ## Example queries
255 |
256 | #### List commits for a repository
257 |
258 | {%highlight sql%}
259 | select c.*
260 | from commits c, project_commits pc, projects p, users u
261 | where u.login = 'rails' and p.owner_id = u.id -- the project must be owned by this user
262 | and p.name = 'rails'
263 | and p.id = pc.project_id
264 | and c.id = pc.commit_id
265 | order by c.created_at desc -- newest commits first
266 | {%endhighlight%}
267 |
268 | #### Get all actions for a pull request
269 |
270 | {%highlight sql%}
271 | select user, action, created_at from
272 | (
273 | select prh.action as action, prh.created_at as created_at, u.login as user -- pull request lifetime events
274 | from pull_request_history prh
275 | join users u on u.id = prh.actor_id
276 | where prh.pull_request_id = ?
277 | union
278 | select ie.action as action, ie.created_at as created_at, u.login as user -- events on the issue backing the pull request
279 | from issues i
280 | join issue_events ie on ie.issue_id = i.id
281 | join users u on u.id = ie.actor_id
282 | where i.pull_request_id = ?
283 | union
284 | select 'discussed' as action, ic.created_at as created_at, u.login as user -- discussion comments
285 | from issues i
286 | join issue_comments ic on ic.issue_id = i.id
287 | join users u on u.id = ic.user_id
288 | where i.pull_request_id = ?
289 | union
290 | select 'reviewed' as action, prc.created_at as created_at, u.login as user -- code review comments
291 | from pull_request_comments prc
292 | join users u on u.id = prc.user_id
293 | where prc.pull_request_id = ?
294 | ) as actions
295 | order by created_at;
296 | {%endhighlight%}
297 |
298 | #### Get participants in an issue or pull request
299 |
300 | {%highlight sql%}
301 | select distinct participants.user_id from
302 | (
303 | select prc.user_id -- authors of code review comments
304 | from pull_request_comments prc
305 | where prc.pull_request_id = ?
306 | union
307 | select ic.user_id -- authors of discussion comments
308 | from issues i join issue_comments ic on ic.issue_id = i.id
309 | where i.pull_request_id = ?
310 | ) as participants
311 | {%endhighlight%}
312 |
313 | #### Get all users in NL that committed to a Java project today
314 |
315 | {%highlight sql%}
316 | select u.login
317 | from users u, commits c, projects p, project_commits pc
318 | where date(c.created_at) = date(now())
319 | and pc.commit_id = c.id and pc.project_id = p.id -- tie each commit to the project it belongs to
320 | and c.author_id = u.id
321 | and u.country_code = 'nl'
322 | and 'java' = (select pl.language -- first language by newest created_at, then by bytes
323 | from project_languages pl
324 | where pl.project_id = p.id
325 | order by pl.created_at desc, pl.bytes desc
326 | limit 1)
327 | {%endhighlight%}
328 |
329 |
--------------------------------------------------------------------------------
/pullreq-perf/report.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Performance report for `r owner`/`r repo`
4 | ---
5 |
6 | ```{r preample, include=FALSE}
7 |
8 | #
9 | # (c) 2012 -- 2014 Georgios Gousios