├── _build └── .keepme ├── .gitignore ├── requirements.txt ├── setup.cfg ├── Makefile ├── README.md ├── quickfind.sh ├── etc └── conf.yml-example └── bin └── git_stats.py /_build/.keepme: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build/* 2 | etc/*.yml 3 | !.keepme 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | future 2 | gitpython 3 | pyyaml 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | max-line-length = 100 3 | 4 | [flake8] 5 | max-line-length = 100 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: 3 | @echo "build Clone/update repositories" 4 | @echo "show Show statistics" 5 | 6 | build: 7 | ./bin/git_stats.py -c etc/conf.yml -i 8 | 9 | show: 10 | ./bin/git_stats.py -c etc/conf.yml 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # git-stats 2 | 3 | ## Usage 4 | 5 | ```bash 6 | # Initialize or update repositories 7 | ./bin/git_stats.py -c etc/conf.yml -i 8 | 9 | # Gather statistics 10 | ./bin/git_stats.py -c etc/conf.yml 11 | ``` 12 | 13 | ```bash 14 | # Search for a word in all commit messages 15 | # Must have run `./bin/git_stats.py -c etc/conf.yml -i` first 16 | ./quickfind.sh <word> 17 | ``` 18 | -------------------------------------------------------------------------------- /quickfind.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "${#}" -ne "1" ]; then 4 | echo "Usage: ./quickfind.sh " 5 | exit 1 6 | fi 7 | 8 | WORD="${1}" 9 | 10 | find . \ 11 | -type d \ 12 | -name \.git \ 13 | -print0 \ 14 | | xargs \ 15 | -0 \ 16 | -n1 \ 17 | -P"$(nproc)" \ 18 | sh -c "cd \$1; git log 2>/dev/null | grep -Ei '.*\b(${WORD})(\b|s|z|d|es|ed|er|rs|ers|or|ors|ing|in|-)?(\b|\s|-|_|$).*' && echo repository \$1" -- 19 | 20 | 21 | # sh -c "cd \$1; git log --before='2019-01-01' --after='2017-12-31' 2>/dev/null | grep -Ei '.*\b(${WORD})(\b|s|z|d|es|ed|er|rs|ers|or|ors|ing|in|-)?(\b|\s|-|_|$).*' && echo repository \$1" -- 22 | 23 | -------------------------------------------------------------------------------- /etc/conf.yml-example: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Destination for temporary storing (cloning) the repositories for analysis 4 | tmpdir: /home/cytopia/repo/cytopia/git-stats/_build 5 | 6 | wordlist: 7 | # ---- common ----# 8 | - security 9 | - hotfix 10 | - fix 11 | - wip 12 | - temporary 13 | - temporarily 14 | - bug 15 | - buggy 16 | - hack 17 | - crash 18 | - disable 19 | - enable 20 | - comment 21 | - update 22 | - upgrade 23 | - typo 24 | - issue 25 | - fatal 26 | - exception 27 | - documentation 28 | - improve 29 | - clean 30 | # ---- security ----# 31 | - password 32 | - token 33 | - secret 34 | - ssh 35 | - private 36 | - backdoor 37 | # ---- pro ----# 38 | - boss 39 | - '1337' 40 | - 'l337' 41 | - 'l33t' 42 | - '133t' 43 | - haxor 44 | - haxxor 45 | - h4x0r 46 | - suxor 47 | - suxxor 48 | - n00b 49 | - pr0n 50 | - pwn 51 | - wayne 52 | # ---- religious ----# 53 | - jesus 54 | - god 55 | # ---- swear words ----# 56 | - facial 57 | - shit 58 | - shithead 59 | - fuck 60 | - fuckhead 61 | - motherfuck 62 | - bitch 63 | - bitchass 64 | - bitcharse 65 | - bitchfuck 66 | - butt 67 | - buttfuck 68 | - butthead 69 | - damn 70 | - cunt 
71 | - pussy 72 | - dick 73 | - dickhead 74 | - prick 75 | - ass 76 | - asshole 77 | - arse 78 | - arsehole 79 | - crap 80 | - bastard 81 | - hell 82 | - piss 83 | - suck 84 | - whore 85 | 86 | 87 | # List of repositories to check (http(s)?:// or user@host:/ will work) 88 | repositories: 89 | - git@github.com:cytopia/git-stats.git 90 | - https://github.com/cytopia/devilbox 91 | -------------------------------------------------------------------------------- /bin/git_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2019 cytopia 7 | """Get git statistics.""" 8 | 9 | 10 | # ------------------------------------------------------------------------------------------------- 11 | # IMPORTS 12 | # ------------------------------------------------------------------------------------------------- 13 | 14 | # Default import 15 | import os 16 | import getopt 17 | import tempfile 18 | import re 19 | import sys 20 | from operator import itemgetter 21 | 22 | # External dependencies 23 | from git import Repo, GitError, Git 24 | # External dependencies 25 | import yaml 26 | 27 | 28 | # ------------------------------------------------------------------------------------------------- 29 | # GLOBALS 30 | # ------------------------------------------------------------------------------------------------- 31 | 32 | REPO_PATH = '/home/cytopia/repo/cytopia/git-stats/_build/' 33 | 34 | CONFIG_PATH = os.sep.join((os.path.expanduser('~'), '.config/git-stats/conf.yml')) 35 | TMPDIR_PATH = tempfile.gettempdir() if tempfile.gettempdir() is not None else '/tmp' 36 | 37 | CONFIG_DEFS = { 38 | 'tmpdir': TMPDIR_PATH, 39 | 'wordlist': [], 40 | 'repositories': [] 41 | } 42 | 43 | 44 | # ------------------------------------------------------------------------------------------------- 45 | # HELPER FUNCTIONS 46 | # 
# -------------------------------------------------------------------------------------------------


def print_help():
    """Show git-stats help."""
    print('Usage: git_stats.py [-c <file>] [-t <dir>] [-i]')
    print('       git_stats.py -h | -v')
    print()
    print('Options:')
    print('  -c, --config <file>  Path to the YAML configuration file')
    print('  -t, --tmpdir <dir>   Existing directory holding the cloned repositories')
    print('                       (overrides "tmpdir" from the configuration file)')
    print('  -i, --init           Clone missing repositories and fetch existing ones')
    print('                       before gathering statistics')
    print('  -v, --version        Show version')
    print('  -h, --help           Show this help')


def print_version():
    """Show git-stats version."""
    print('Version')


# -------------------------------------------------------------------------------------------------
# GIT FUNCTIONS
# -------------------------------------------------------------------------------------------------

# Last path component of a git URL; a trailing ".git" and a trailing slash are both optional.
_REPO_NAME_RE = re.compile(r'^.+?/([-_a-zA-Z0-9.]+?)(?:\.git)?/?$', re.IGNORECASE)

# Forgiving word match: the word itself, an optional common suffix, then a word delimiter.
_WORD_TAIL = r'(?:\b|s|z|d|es|ed|er|rs|ers|or|ors|ing|in|-)?(?:\b|\s|-|_|$)'

# A `git log --shortstat` summary line, e.g. " 3 files changed, 10 insertions(+), 2 deletions(-)".
# The summary is indented while `--oneline` subject lines start with the commit hash, so
# anchoring on the leading whitespace keeps commit subjects such as "remove 3 files" out.
_SHORTSTAT_RE = re.compile(
    r'^\s+(\d+) files? changed'
    r'(?:, (\d+) insertions?\(\+\))?'
    r'(?:, (\d+) deletions?\(-\))?'
)

# Horizontal rule used around every section title of the report.
_RULE = '---------------------------------------------------------------------------------------'


def validate_repos(repo_paths):
    """Ensure repositories already exist.

    Prints an error to stderr and exits with status 1 on the first path that
    cannot be opened as a git repository.
    """
    for repo_path in repo_paths:
        try:
            Repo(repo_path)
        except GitError:
            print('[ERR] Repo does not yet exist: %s' % repo_path, file=sys.stderr)
            print('Run with --init first, see --help', file=sys.stderr)
            sys.exit(1)


def clone(git_url, tmp_dir, init):
    """Clone a git repository and return its path.

    The local directory name is the last component of ``git_url``, with or
    without a trailing ``.git`` (both ``user@host:org/name.git`` and
    ``https://host/org/name`` are accepted).

    When ``init`` is false nothing is cloned or fetched; only the path where
    the repository is expected is returned. When ``init`` is true an existing
    repository gets its ``origin`` remote fetched, otherwise the repository is
    cloned. Exits with status 1 if no repository name can be derived from
    ``git_url``.
    """
    found = _REPO_NAME_RE.search(git_url)
    if found is None:
        print('[ERR] Cannot determine repository name from: %s' % git_url, file=sys.stderr)
        sys.exit(1)
    # The leading os.sep anchors a relative tmp_dir at the filesystem root;
    # an absolute tmp_dir is used unchanged.
    repo_path = os.path.join(os.sep, tmp_dir, found.group(1))

    if not init:
        return repo_path

    try:
        repo = Repo(repo_path)
    except GitError:
        # Not a repository yet: clone it
        print('Cloning : ' + repo_path)
        Repo.clone_from(git_url, repo_path)
    else:
        # NOTE(review): fetch only updates remote-tracking refs, the checked-out branch that
        # the statistics are read from is not advanced -- confirm this is intended.
        print('Updating: ' + repo_path)
        repo.remotes.origin.fetch()

    return repo_path


def _get_git_log(git_path, start_date, end_date, *args):
    """Get commit logs and return empty in case of no branch.

    ``start_date`` / ``end_date`` are optional and map to ``--after`` /
    ``--before``; ``args`` are passed to ``git log`` verbatim. Returns the raw
    command output, or an empty string if the git command fails.
    """
    # No shell is involved, so the values must not be wrapped in quote characters.
    options = []
    if start_date:
        options.append('--after=' + start_date)
    if end_date:
        options.append('--before=' + end_date)
    options.extend(args)

    try:
        return Git(git_path).log(*options)
    except GitError:
        return ''


def _get_git_words(git_paths, email, wordlist, start_date=None, end_date=None):
    """Get count on words per commit message.

    Returns a dict mapping every entry of ``wordlist`` to the number of
    ``--oneline`` log lines authored by ``email`` (across all ``git_paths``)
    that contain the word, case-insensitively, optionally followed by a common
    suffix such as ``s``, ``ed`` or ``ing``.
    """
    # Compile once per word instead of once per word and message. Words are
    # escaped so that entries containing regex meta characters are matched literally.
    patterns = {
        word: re.compile(r'\b(' + re.escape(str(word)) + ')' + _WORD_TAIL, re.I)
        for word in wordlist
    }
    words = dict.fromkeys(wordlist, 0)

    for path in git_paths:
        messages = _get_git_log(
            path,
            start_date,
            end_date,
            '--author=' + email,
            '--oneline'
        ).splitlines()

        for word, pattern in patterns.items():
            words[word] += sum(1 for message in messages if pattern.search(message))

    return words


def _get_git_files_adds_dels(git_paths, email, start_date=None, end_date=None):
    """Return count for changed files, additions and deletions per email.

    Returns a dict with the totals across all commits (``files``, ``adds``,
    ``dels``) and the largest value seen in a single commit (``max_files``,
    ``max_adds``, ``max_dels``).
    """
    totals = [0, 0, 0]
    maxima = [0, 0, 0]

    for path in git_paths:
        lines = _get_git_log(
            path,
            start_date,
            end_date,
            '--author=' + email,
            '--oneline',
            '--shortstat'
        ).splitlines()

        for line in lines:
            found = _SHORTSTAT_RE.match(line)
            if found is None:
                continue
            # git omits the insertions/deletions part when it is zero
            counts = [int(group) if group else 0 for group in found.groups()]
            totals = [total + count for total, count in zip(totals, counts)]
            maxima = [max(peak, count) for peak, count in zip(maxima, counts)]

    return {
        'files': totals[0],
        'adds': totals[1],
        'dels': totals[2],
        'max_files': maxima[0],
        'max_adds': maxima[1],
        'max_dels': maxima[2]
    }


def _get_git_contributor_commit_count(git_paths, email, start_date=None, end_date=None):
    """Return the number of commits across all repositories for one contributor email."""
    return sum(
        len(_get_git_log(
            path,
            start_date,
            end_date,
            '--author=' + email,
            '--format=%H'
        ).splitlines())
        for path in git_paths
    )


def _get_git_contributor_emails(git_paths, start_date=None, end_date=None):
    """Retrieve all contributor emails on all repositories uniquely.

    Both author and committer emails are collected. The result is sorted so
    that the order (and with it the order of ties in the report) is stable
    between runs.
    """
    contributors = set()

    for path in git_paths:
        # One log call yields author and committer email on separate lines
        contributors.update(
            _get_git_log(
                path,
                start_date,
                end_date,
                '--format=%aE%n%cE'
            ).split()
        )

    return sorted(contributors)


def get_statistics(git_paths, start_date, end_date, wordlist):
    """Gather per-contributor statistics across all repositories.

    Returns a list with one dict per contributor email holding ``email``,
    ``commits``, ``files``, ``adds``, ``dels``, ``max_files``, ``max_adds``,
    ``max_dels`` and ``words`` (word -> number of matching commit messages).
    """
    stats = []

    for email in _get_git_contributor_emails(git_paths, start_date, end_date):
        entry = {
            'email': email,
            'commits': _get_git_contributor_commit_count(git_paths, email, start_date, end_date),
            'words': _get_git_words(git_paths, email, wordlist, start_date, end_date)
        }
        entry.update(_get_git_files_adds_dels(git_paths, email, start_date, end_date))
        stats.append(entry)

    return stats


# -------------------------------------------------------------------------------------------------
# SYSTEM FUNCTIONS
# -------------------------------------------------------------------------------------------------


def read_config(path):
    """Read configuration from file.

    Returns a new dict that always contains ``tmpdir``, ``wordlist`` and
    ``repositories``; keys that are missing or empty in the file fall back to
    the defaults. A missing file, an unparsable file or a file whose top level
    is not a mapping yields the defaults.
    """
    data = None
    if os.path.isfile(path):
        with open(path, 'r') as stream:
            try:
                # safe_load never constructs arbitrary Python objects and, unlike a bare
                # yaml.load(), does not require an explicit Loader argument.
                data = yaml.safe_load(stream)
            except yaml.YAMLError as err:
                print('[ERR] Cannot read yaml file', file=sys.stderr)
                print(str(err), file=sys.stderr)

    if data is not None and not isinstance(data, dict):
        print('[ERR] Configuration must be a mapping: ' + path, file=sys.stderr)
        data = None

    # Start from a copy of the defaults so the module-level dict is never handed out
    settings = {
        key: list(value) if isinstance(value, list) else value
        for key, value in CONFIG_DEFS.items()
    }
    if data:
        settings.update({key: value for key, value in data.items() if value is not None})

    # Unquoted numeric entries (e.g. 1337) are parsed as integers by YAML
    settings['wordlist'] = [str(word) for word in settings['wordlist']]

    return settings


def parse_args(argv):
    """Parse command line arguments.

    Returns a dict that contains only the options actually given: ``init``
    (True), ``config`` (existing file) and ``tmpdir`` (existing directory).
    Exits with status 2 on invalid usage and with status 0 after printing the
    help or version.
    """
    options = {}

    try:
        opts, _ = getopt.getopt(argv, 'c:t:hvi', [
            'config=',
            'tmpdir=',
            'init',
            'version',
            'help'
        ])
    except getopt.GetoptError as err:
        # GetoptError is not iterable; str() yields its message
        print(str(err), file=sys.stderr)
        print('Type --help for help', file=sys.stderr)
        sys.exit(2)

    for opt, arg in opts:
        # Show help screen
        if opt in ('-h', '--help'):
            print_help()
            sys.exit()
        # Show version
        elif opt in ('-v', '--version'):
            print_version()
            sys.exit()
        # Do we initialize?
        elif opt in ('-i', '--init'):
            options['init'] = True
        # Get alternative configuration file
        elif opt in ('-c', '--config'):
            if not os.path.isfile(arg):
                print('[ERR] ' + opt + ' specified config does not exist: ' + arg, file=sys.stderr)
                sys.exit(2)
            options['config'] = arg
        # Get alternative temporary directory
        elif opt in ('-t', '--tmpdir'):
            if not os.path.isdir(arg):
                print('[ERR] ' + opt + ' specified directory does not exist: ' + arg,
                      file=sys.stderr)
                sys.exit(2)
            options['tmpdir'] = arg

    return options


# -------------------------------------------------------------------------------------------------
# MAIN
# -------------------------------------------------------------------------------------------------

def _print_ranking(title, statistics, value_of, limit):
    """Print one report section: the ``limit`` highest non-zero values with their email."""
    print()
    print(_RULE)
    print(title)
    print(_RULE)
    # sorted() is stable, contributors with equal values keep their incoming order
    for item in sorted(statistics, key=value_of, reverse=True)[:limit]:
        value = value_of(item)
        if value > 0:
            print('{:9,d} {}'.format(value, item['email']))


def main(argv):
    """Start of main entrypoint."""
    # Report parameters are currently fixed: top 10 contributors for the year 2019
    show_top = 10
    start_date = '2019-01-01'
    end_date = '2019-12-31'

    # Parse command line options and read config file
    options = parse_args(argv)
    settings = read_config(options.get('config', CONFIG_PATH))

    # Overwrite settings with command line options
    settings['tmpdir'] = options.get('tmpdir', settings['tmpdir'])

    # Get Git repositories (cloned/fetched only when --init was given)
    init = options.get('init', False)
    repo_paths = [clone(repo, settings['tmpdir'], init) for repo in settings['repositories']]

    # Ensure repositories have been cloned already
    validate_repos(repo_paths)

    # Get Statistics
    statistics = get_statistics(repo_paths, start_date, end_date, settings['wordlist'])

    sections = (
        (' NUMBER OF COMMITS', 'commits'),
        (' CHANGED FILES', 'files'),
        (' MAX CHANGED FILES PER COMMIT', 'max_files'),
        (' LINES OF ADDITIONS', 'adds'),
        (' MAX LINES OF ADDITIONS PER COMMIT', 'max_adds'),
        (' LINES OF DELETIONS', 'dels'),
        (' MAX LINES OF DELETIONS PER COMMIT', 'max_dels'),
    )
    for title, key in sections:
        _print_ranking(title, statistics, itemgetter(key), show_top)

    for word in settings['wordlist']:
        # Bind the current word as a default to avoid the late-binding closure pitfall
        _print_ranking(
            ' WORD: ' + str(word),
            statistics,
            lambda item, word=word: item['words'].get(word, 0),
            show_top
        )


if __name__ == '__main__':
    main(sys.argv[1:])