├── .gitignore ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── crawler.rb └── words.rb /.gitignore: -------------------------------------------------------------------------------- 1 | githubapi.conf 2 | commits.txt 3 | commits.gz* 4 | words.txt 5 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'octokit' 4 | 5 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | addressable (2.3.6) 5 | faraday (0.9.1) 6 | multipart-post (>= 1.2, < 3) 7 | multipart-post (2.0.0) 8 | octokit (3.7.0) 9 | sawyer (~> 0.6.0, >= 0.5.3) 10 | sawyer (0.6.0) 11 | addressable (~> 2.3.5) 12 | faraday (~> 0.8, < 0.10) 13 | 14 | PLATFORMS 15 | ruby 16 | 17 | DEPENDENCIES 18 | octokit 19 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Masayuki Morita 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # commit-crawler 2 | 3 | Crawler for GitHub commit messages 4 | 5 | This is a crawler program which collects commit messages from popular GitHub repositories. 6 | 7 | ## How To Use 8 | Get an access token for the GitHub API. 9 | 10 | see: Creating an access token for command-line use 11 | https://help.github.com/articles/creating-an-access-token-for-command-line-use/ 12 | 13 | Git clone and bundle. 14 | ``` 15 | $ git clone https://github.com/minamijoyo/commit-crawler 16 | $ cd commit-crawler 17 | $ bundle install 18 | ``` 19 | 20 | Export your access token as an environment variable. 21 | ``` 22 | $ echo "export GITHUBAPI_ACCESS_TOKEN=xxxxx" > githubapi.conf 23 | $ source githubapi.conf 24 | ``` 25 | 26 | Run the crawler. 27 | ``` 28 | $ ruby crawler.rb 29 | ``` 30 | The results will be written to "commits.txt". 
# --------------------------------------------------------------------------------
# /crawler.rb:
# --------------------------------------------------------------------------------
# coding: utf-8
require 'octokit'

# Crawler for commit messages of popular GitHub repositories.
#
# Searches GitHub for repositories matching a query and writes one line per
# commit ("repo, sha, first line of the commit message") to an output handle.
class Crawler
  # Query used by #crawl when the caller does not supply one
  DEFAULT_QUERY = 'stars:>10000'.freeze

  # Initializes a new crawler
  #
  # @param [Hash] opts the options to initialize
  # @option opts [String] :access_token The token for GitHub API
  # @option opts [#puts] :file The file handle for output
  def initialize(opts = {})
    @client = Octokit::Client.new(:access_token => opts[:access_token])
    @file = opts[:file]

    # Looking up the authenticated user issues an API request, so a problem
    # with the token presumably surfaces here rather than mid-crawl.
    puts "Login user: #{@client.user.login}"
  end

  # Crawl GitHub: search for target repositories, then dump the commit
  # messages of each one to the output file.
  #
  # @param [String] query Search query selecting the repositories to crawl
  def crawl(query = DEFAULT_QUERY)
    puts "Search repositories on GitHub"

    # search for popular repositories
    @repos = search_repos(query)
    puts "Target repositories: #{@repos}"

    # crawl each repository in turn
    @repos.each { |repo| crawl_repo(repo) }
  end

  private

  # Search target repositories for crawl
  #
  # @param [String] query Query keyword for search
  # @return [Array<String>] An array of repo's full_name
  def search_repos(query)
    # auto_paginate is enabled only for the search so that every result page
    # is collected in a single call
    @client.auto_paginate = true
    results = @client.search_repos(query, :per_page => 100)
    results.items.map(&:full_name)
  ensure
    # always switch it back off, even when the search raises: #commits_list
    # walks the pages by hand and relies on auto_paginate being disabled
    @client.auto_paginate = false
  end

  # Crawl a repository
  #
  # @param [String] repo Repository full_name such as "minamijoyo/commit_messages"
  def crawl_repo(repo)
    commits_list(repo)
  end

  # Fetch every page of commits of a repository and write them to the output
  #
  # @param [String] repo Repository full_name
  def commits_list(repo)
    puts "#{Time.now}: Get commit messages on github:#{repo}, rate limit: #{@client.rate_limit}"

    # wait here if the API quota is used up
    check_rate_limit_and_sleep

    # first page
    first_page = @client.commits(repo, :per_page => 100)
    puts_file parse_commits(repo, first_page)

    # follow the "next" relation until the last page has been read
    response = @client.last_response
    while response && response.rels[:next]
      response = response.rels[:next].get
      puts_file parse_commits(repo, response.data)

      # adjust request speed on each page
      sleep 1
    end
  end

  # Extract necessary data from a page of commits
  #
  # @param [String] repo Repository full_name
  # @param [Enumerable] commits_response Commit resources of one page
  # @return [Array<Hash>] One hash per commit with :repo, :sha and :message
  def parse_commits(repo, commits_response)
    commits_response.map do |commit|
      # Keep only the first line of the message; a nil or empty message
      # becomes "" instead of raising.
      subject = commit[:commit][:message].to_s.lines.first.to_s.chomp
      { :repo => repo, :sha => commit[:sha], :message => subject }
    end
  end

  # Write results to the output file, one "repo, sha, message" line per commit
  #
  # @param [Array<Hash>] commits An array of hashes as returned by parse_commits
  def puts_file(commits)
    commits.each do |commit|
      @file.puts "#{commit[:repo]}, #{commit[:sha]}, #{commit[:message]}"
    end
  end

  # Check the rate limitation of the GitHub API and sleep until it resets
  # when no requests are left.
  def check_rate_limit_and_sleep
    limit = @client.rate_limit
    # 0 is truthy in Ruby, so the remaining count must be compared explicitly
    while limit.remaining.to_i <= 0
      puts "current rate limit: #{limit}"
      # limit resets after reset_in seconds
      reset_in = limit.resets_in.to_i
      puts "rate limit sleep in #{reset_in}"
      # never pass a negative duration to sleep; the extra second avoids
      # waking up just before the reset
      sleep([reset_in, 0].max + 1)
      # NOTE(review): rate_limit! is expected to query the API afresh, whereas
      # rate_limit may reuse the previous response - confirm against the
      # Octokit version in Gemfile.lock
      limit = @client.rate_limit!
    end
  end
end

# Main
#
#
puts "Start initialization"

# GitHub API access token must be exported (an empty value counts as missing)
access_token = ENV['GITHUBAPI_ACCESS_TOKEN']
raise "GITHUBAPI_ACCESS_TOKEN must be exported !!" if access_token.nil? || access_token.empty?

# results file name
filename = "commits.txt"

# Open file for results
File.open(filename, "w") do |file|
  # set no buffering so progress is visible while the crawl is running
  file.sync = true
  $stdout.sync = true

  # initialize crawler
  crawler = Crawler.new(:access_token => access_token, :file => file)
  # run crawl main
  crawler.crawl
end

puts "End of program"
# --------------------------------------------------------------------------------
# /words.rb:
# --------------------------------------------------------------------------------
# coding: utf-8

# Count how often each word occurs in the commit messages stored in
# commits.txt and print "word, count" lines, most frequent first.
words = Hash.new(0)

File.foreach('commits.txt') do |line|
  # each line is "repo, sha, message"; the limit of 3 keeps any ", " inside
  # the message itself intact
  _repo_full_name, _sha, message = line.split(', ', 3)
  # skip lines that do not have all three fields
  next if message.nil?

  # splitting on non-word characters also discards the trailing newline
  message.split(/\W+/).each do |word|
    # words of one or two characters are ignored
    words[word.downcase] += 1 if word.length > 2
  end
end

# highest count first, ties in alphabetical order
words.sort_by { |word, count| [-count, word] }.each do |word, count|
  puts "#{word}, #{count}"
end
# --------------------------------------------------------------------------------