├── LICENSE ├── README.md ├── bq ├── 996icu.sql ├── bq_api_example.py ├── bq_helper_example.py ├── github_overview.sql └── github_rank.sql ├── gharchive └── parse_json.py └── links.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 jiawei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mining-Github 2 | 通过分析Github上的开发者以及项目的行为和特征,得出一些有意义的结果,看行业发展趋势 3 | -------------------------------------------------------------------------------- /bq/996icu.sql: -------------------------------------------------------------------------------- 1 | 2 | --996ICU 2019/03 日均 issue,北京属于东八区+8 3 | 4 | SELECT DATE(DATE_ADD(created_at, 8, "HOUR")) AS a_day,COUNT(*) as counts 5 | FROM TABLE_QUERY([githubarchive:month],'table_id CONTAINS "2019"') 6 | WHERE type = 'IssuesEvent' AND repo.name = '996icu/996.ICU' AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'opened' 7 | GROUP BY a_day 8 | ORDER BY a_day 9 | 10 | 11 | --所有的issue中title出现次数排名 12 | SELECT JSON_EXTRACT_SCALAR(payload, '$.issue.title') as title,COUNT(*) AS counts 13 | FROM [githubarchive:month.201903] 14 | WHERE type = 'IssuesEvent' AND repo.name = '996icu/996.ICU' AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'opened' 15 | GROUP BY title 16 | ORDER BY counts DESC 17 | 18 | --all issue title 19 | SELECT JSON_EXTRACT_SCALAR(payload, '$.issue.title') as title 20 | FROM [githubarchive:month.201903] 21 | WHERE type = 'IssuesEvent' AND repo.name = '996icu/996.ICU' AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'opened' 22 | ORDER BY created_at 23 | 24 | 25 | --每个issue 的comment个数统计 26 | SELECT JSON_EXTRACT_SCALAR(payload, '$.issue.title') AS title,COUNT(*) AS counts 27 | FROM [githubarchive:month.201903] 28 | WHERE type = 'IssueCommentEvent' AND repo.name = '996icu/996.ICU' AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'created' 29 | GROUP BY title 30 | ORDER BY counts desc 31 | 32 | 33 | --issue comment 公司员工个数排名 34 | SELECT company, COUNT(*) AS comments,COUNT(DISTINCT login) as user_number 35 | FROM [githubarchive:month.201903] AS a 36 | JOIN [ghtorrent-bq:ght_2018_04_01.users] AS b 37 | ON a.actor.login=b.login 38 | WHERE b.company != '\\N' AND a.type='IssueCommentEvent' AND a.repo.name = '996icu/996.ICU' 39 | GROUP BY company 40 | ORDER BY user_number DESC 41 | 42 | 43 | --issue comments 公司总量排名 44 | SELECT company, COUNT(*) AS comments,COUNT(DISTINCT login) as user_number,group_concat(login) 45 | FROM [githubarchive:month.201903] AS a 46 | JOIN [ghtorrent-bq:ght_2018_04_01.users] AS b 47 | ON a.actor.login=b.login 48 | WHERE b.company != '\\N' AND a.type='IssueCommentEvent' AND a.repo.name = '996icu/996.ICU' 49 | GROUP BY company 50 | ORDER BY pushes DESC 51 | 52 | --issue comments 国家排名 53 | SELECT country_code, COUNT(*) AS comments,COUNT(DISTINCT login) as user_number 54 | FROM [githubarchive:month.201903] AS a 55 | JOIN [ghtorrent-bq:ght_2018_04_01.users] AS b 56 | ON a.actor.login=b.login 57 | WHERE b.country_code != '\\N' AND a.type='IssueCommentEvent' AND a.repo.name = '996icu/996.ICU' 58 | GROUP BY country_code 59 | ORDER BY comments DESC 60 | 61 | 62 | --commit message内容 63 | SELECT JSON_EXTRACT_SCALAR(payload, '$.commits[0].message') AS message,created_at 64 | FROM TABLE_DATE_RANGE([githubarchive:month.],TIMESTAMP('2019-01'),TIMESTAMP('2019-10')) 65 | WHERE type = 'PushEvent' AND repo.name = '996icu/996.ICU' 66 | ORDER BY created_at 67 | 68 | 69 | --commit message 出现频率最高的排名 70 | SELECT JSON_EXTRACT_SCALAR(payload, '$.commits[0].message') AS message,COUNT(*) AS counts 71 | FROM TABLE_QUERY([githubarchive:month],'table_id CONTAINS "2019"') 72 | WHERE type = 'PushEvent' AND repo.name = '996icu/996.ICU' 73 | GROUP BY message 74 | ORDER BY counts DESC 75 | 76 | 77 | --commit记录作者提交次数 78 | SELECT JSON_EXTRACT_SCALAR(payload, '$.commits[0].author.name') AS name,COUNT(*) AS counts 79 | FROM TABLE_QUERY([githubarchive:month],'table_id CONTAINS "2019"') 80 | WHERE type = 'PushEvent' AND repo.name = '996icu/996.ICU' 81 | GROUP BY name 82 | ORDER BY counts desc 83 | 84 | 85 | --该项目都是由996icu push代码的,其他贡献者有commit权限 86 | SELECT repo.name,actor.id,actor.login,JSON_EXTRACT_SCALAR(payload, '$.commits[0].author.name') AS name,id,other 87 | FROM [githubarchive:month.201903] 88 | WHERE type = 'PushEvent' AND repo.name = '996icu/996.ICU' 89 | ORDER BY created_at desc 90 | 91 | -------------------------------------------------------------------------------- /bq/bq_api_example.py: -------------------------------------------------------------------------------- 1 | 2 | from google.cloud import bigquery 3 | client = bigquery.Client() 4 | query_sql = '''SELECT lang.name,COUNT(*) AS counts FROM `bigquery-public-data.github_repos.languages` , UNNEST(language) as lang 5 | GROUP BY lang.name 6 | ORDER BY counts DESC 7 | LIMIT 10''' 8 | 9 | query_job = client.query( 10 | query_sql, 11 | # Location must match that of the dataset(s) referenced in the query. 12 | location="US", 13 | ) 14 | 15 | for row in query_job: 16 | print(row['name'],row['counts']) 17 | -------------------------------------------------------------------------------- /bq/bq_helper_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import bq_helper 3 | os.environ['GOOGLE_APPLICATION_CREDENTIALS']='xxx.json' 4 | github_repos = bq_helper.BigQueryHelper(active_project= "bigquery-public-data", 5 | dataset_name = "github_repos") 6 | 7 | query_sql= '''SELECT lang.name,COUNT(*) AS counts FROM `bigquery-public-data.github_repos.languages` , UNNEST(language) as lang 8 | GROUP BY lang.name 9 | ORDER BY counts DESC 10 | LIMIT 10''' 11 | print('estimate query size:',github_repos.estimate_query_size(query_sql)) 12 | 13 | #BigQuery data as a Pandas DataFrame 14 | github_repo_sizes = github_repos.query_to_pandas_safe(query_sql, max_gb_scanned=23) 15 | 16 | print(github_repo_sizes.head(5)['name']) 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /bq/github_overview.sql: -------------------------------------------------------------------------------- 1 | --bq会用到三个关于Github数据集bigquery-public-data:github_repos,githubarchive,ghtorrent-bq 2 | --bigquery-public-data:github_repos,githubarchive更新频繁,ghtorrent-bq好久没有更新,但是可以手动导入其官网数据 3 | --三个数据集都有较为实用的信息,哪个方便使用哪一个 4 | 5 | 6 | 7 | --查询Github上总的仓库数量 8 | --使用bigquery-public-data:github_repos.languages数据表 9 | --3347497 repos by 2019 10/05 10 | 11 | SELECT COUNT(*) 12 | FROM [bigquery-public-data:github_repos.languages] 13 | 14 | 15 | 16 | --查询Github上总commits数量 17 | --使用bigquery-public-data:github_repos.commits数据表 18 | -- 235186010 commits by 2019 10/05 19 | 20 | SELECT COUNT(*) 21 | FROM [bigquery-public-data:github_repos.commits] 22 | 23 | 24 | --查询Github上总编程语言个数 25 | --可以使用bigquery-public-data:github_repos.languages数据表 26 | --注意的是结果中将Makefile,json,yaml等文件格式也标记成了编程语言 27 | -- 386 languages by 2019 10/05 28 | 29 | SELECT language.name,COUNT(*) counts 30 | FROM [bigquery-public-data:github_repos.languages] 31 | GROUP BY language.name 32 | ORDER BY counts DESC; 33 | 34 | 35 | --查询Github上license种类 36 | --可以使用bigquery-public-data:github_repos.licenses数据表 37 | -- 15 licenses by 2019 10/05 38 | 39 | SELECT license,COUNT(*) counts 40 | FROM [bigquery-public-data:github_repos.licenses] 41 | GROUP BY license 42 | ORDER BY counts DESC; 43 | 44 | 45 | --查询github上有多少个独立的注册用户 46 | --24154883 users by 2018 04/01 47 | 48 | SELECT COUNT(*) 49 | FROM [ghtorrent-bq:ght_2018_04_01.users] 50 | 51 | 52 | --查询github上有多少个曾经提交记录的用户 -------------------------------------------------------------------------------- /bq/github_rank.sql: -------------------------------------------------------------------------------- 1 | --bq会用到三个关于Github数据集bigquery-public-data:github_repos,githubarchive,ghtorrent-bq 2 | --bigquery-public-data:github_repos,githubarchive更新频繁,ghtorrent-bq好久没有更新,但是可以手动导入其官网数据 3 | --三个数据集都有较为实用的信息,哪个方便使用哪一个 4 | 5 | 6 | 7 | --中国高校2018年push数量100强 8 | --事件类型为PushEvent的payload含有email 9 | --全球教育机构只要将过滤条件改为.edu即可 10 | 11 | SELECT REGEXP_EXTRACT(email, r'@(.*)') AS domain, COUNT(*) AS counts 12 | FROM ( 13 | SELECT REGEXP_EXTRACT(payload, r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)') AS email 14 | FROM [githubarchive:year.2018] 15 | WHERE REGEXP_EXTRACT(payload, r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)') IS NOT null 16 | GROUP BY email 17 | HAVING email CONTAINS '.edu.cn' 18 | ) 19 | GROUP BY domain 20 | ORDER BY counts DESC 21 | LIMIT 10; 22 | 23 | 24 | --查看具有国家信息的用户占比 25 | --0.07504958728220708 repos by 2018 04/01 26 | 27 | SELECT SUM(country_code!='\\N')/COUNT(*) 28 | FROM [ghtorrent-bq:ght_2018_04_01.users] 29 | 30 | 31 | --ghtorrent-bq数据集users表记录了注册者的国家信息,有的用户没有 32 | --由于users只更新到2018/04/01,联合users表和githubarchive:year.2017表 33 | --Github 各个国家push数量,用户数量排名 34 | 35 | SELECT country_code, COUNT(*) AS pushes,COUNT(DISTINCT login) as user,INTEGER(COUNT(*)/COUNT(DISTINCT login)) 36 | FROM [githubarchive:year.2017] a 37 | JOIN [ghtorrent-bq:ght_2018_04_01.users] b 38 | ON a.actor.login=b.login 39 | WHERE country_code != '\\N' 40 | AND a.type='PushEvent' 41 | GROUP BY country_code 42 | ORDER BY pushes DESC 43 | 44 | 45 | --公司贡献排名 by 2018.01 46 | 47 | SELECT company , COUNT(*) AS pushes 48 | FROM [githubarchive:year.2017] a 49 | JOIN [ghtorrent-bq:ght_2018_04_01.users] b 50 | ON a.actor.login=b.login 51 | WHERE company != '\\N' 52 | AND a.type='PushEvent' 53 | GROUP BY company 54 | ORDER BY pushes DESC 55 | LIMIT 100 56 | 57 | 58 | --2019年以来被fork次数最多的项目,一定程度上代表了该项目的流行程度 59 | 60 | SELECT table2019.repo.name AS repo_name,COUNT(DISTINCT table2019.actor.id) AS fork_counts 61 | FROM 62 | ( 63 | SELECT * 64 | FROM TABLE_QUERY([githubarchive:month],'table_id CONTAINS "2019"') 65 | ) AS table2019 66 | WHERE table2019.type = 'ForkEvent' 67 | GROUP BY repo_name 68 | ORDER BY fork_counts DESC 69 | LIMIT 20 70 | 71 | 72 | --哪个项目被提交的次数最多 73 | 74 | SELECT repo_name, COUNT(*) AS commit_num 75 | FROM [bigquery-public-data:github_repos.commits] 76 | GROUP BY repo_name 77 | ORDER BY commit_num DESC 78 | 79 | 80 | --2019/01/01年以来提交最频繁的项目 81 | 82 | SELECT repo_name, COUNT(*) AS commit_num 83 | FROM [bigquery-public-data:github_repos.commits] 84 | WHERE committer.date.seconds >= 1546272000 85 | GROUP BY repo_name 86 | ORDER BY commit_num DESC 87 | 88 | 89 | --查询github修改代码次数最多的大神 90 | --2408004 人提交过代码 by 2019/10/05 91 | 92 | SELECT author.name,COUNT(*) AS commits 93 | FROM [bigquery-public-data:github_repos.commits] 94 | GROUP BY author.name 95 | ORDER BY commits DESC 96 | 97 | 98 | --2019/01/01以来修改代码次数用户排名 99 | --author 是 patch 的作者,committer 是把 patch 应用到 repository 里的人 (很多项目限制只有少数人可以 apply patch,但大家都可以把 patch 发送给这些人) 100 | --看看有没有你熟悉的大神,我严重怀疑提交次数过万的大概率是全职 101 | 102 | SELECT author.name,COUNT(*) AS commits 103 | FROM [bigquery-public-data:github_repos.commits] 104 | WHERE committer.date.seconds >= 1546272000 105 | GROUP BY author.name 106 | ORDER BY commits DESC 107 | 108 | 109 | --2019 star数TOP10 110 | 111 | SELECT table2019.repo.name AS repo_name,COUNT(DISTINCT table2019.actor.id) AS watch_counts 112 | FROM 113 | ( 114 | SELECT * 115 | FROM TABLE_QUERY([githubarchive:month],'table_id CONTAINS "2019"') 116 | ) AS table2019 117 | WHERE table2019.type = 'WatchEvent' AND JSON_EXTRACT_SCALAR(table2019.payload, '$.action') = 'started' 118 | GROUP BY repo_name 119 | ORDER BY watch_counts DESC 120 | LIMIT 10 121 | 122 | 123 | --中国程序员的数量 124 | 125 | SELECT * 126 | FROM [ghtorrent-bq:ght_2018_04_01.users] 127 | WHERE country_code = 'cn' 128 | 129 | -------------------------------------------------------------------------------- /gharchive/parse_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import gzip 5 | 6 | def jsonReader(inputJsonFilePath,pos): 7 | 8 | flag = False 9 | with gzip.open(inputJsonFilePath, 'r') as jsonContent: 10 | for rowNumber, line in enumerate(jsonContent, start=1): 11 | try: 12 | 13 | #此处加上flag的目的在于,当程序挂掉时候,可以根据域名从指定位置开始,不必重头开始跑 14 | if rowNumber == pos: 15 | flag = True 16 | 17 | if not flag: 18 | continue 19 | 20 | line = line.strip() 21 | if len(line) <= 0: 22 | continue 23 | 24 | jsonObject = json.loads(line) 25 | 26 | repoInfo = jsonObject.get('repo',None) 27 | 28 | 29 | if repoInfo == '' or repoInfo == None: 30 | continue 31 | 32 | print(repoInfo) 33 | except Exception as e: 34 | print(e) 35 | 36 | if __name__ == '__main__': 37 | 38 | jsonReader('2019-09-19-10.json.gz',1) -------------------------------------------------------------------------------- /links.md: -------------------------------------------------------------------------------- 1 | Bigquery上关于Github三个数据集链接如下: 2 | 3 | [bigquery-public-data数据集github_repos.languages数据表](https://bigquery.cloud.google.com/table/bigquery-public-data:github_repos.languages) 4 | 5 | [githubarchive数据集githubarchive:year.2018数据表](https://bigquery.cloud.google.com/table/githubarchive:year.2018) 6 | 7 | [ghtorrent-bq数据集ght_2018_04_01.users数据表](https://bigquery.cloud.google.com/table/ghtorrent-bq:ght_2018_04_01.users) 8 | 9 | Json格式数据 10 | 11 | [下载Github json格式数据方法](https://www.gharchive.org/) 12 | 13 | MySQL格式数据 14 | 15 | [下载Github MySQL格式数据方法](http://ghtorrent.org/downloads.html) 16 | 17 | github 提供的API获取数据 18 | --------------------------------------------------------------------------------