├── .codecov.yml ├── .coveragerc ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── gitrisky ├── __init__.py ├── cli.py ├── gitcmds.py ├── model.py ├── parsing.py └── tests │ ├── __init__.py │ ├── test_cli.py │ ├── test_gitcmds.py │ └── test_model.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | 3 | status: 4 | project: off 5 | patch: off 6 | changes: off 7 | 8 | 9 | comment: false 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = gitrisky/tests/* 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # vim undo history 104 | *.un~ 105 | 106 | # pypi credentials 107 | .pypirc 108 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: pip 4 | 5 | python: 6 | - "3.5" 7 | - "3.6" 8 | 9 | # command to install dependencies 10 | install: 11 | - pip install -r requirements.txt 12 | - pip install -r requirements-dev.txt 13 | - python3 setup.py install 14 | 15 | # command to run tests 16 | script: 17 | - pytest --cov=gitrisky 18 | - flake8 19 | 20 | # upload test coverage report to codecov 21 | after_success: 22 | - codecov 23 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of 4 | fostering an open and welcoming community, we pledge to respect all people who 5 | contribute through reporting issues, posting feature requests, updating 6 | documentation, submitting pull requests or patches, and other activities. 7 | 8 | We are committed to making participation in this project a harassment-free 9 | experience for everyone, regardless of level of experience, gender, gender 10 | identity and expression, sexual orientation, disability, personal appearance, 11 | body size, race, ethnicity, age, religion, or nationality. 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery 16 | * Personal attacks 17 | * Trolling or insulting/derogatory comments 18 | * Public or private harassment 19 | * Publishing other's private information, such as physical or electronic 20 | addresses, without explicit permission 21 | * Other unethical or unprofessional conduct 22 | 23 | Project maintainers have the right and responsibility to remove, edit, or 24 | reject comments, commits, code, wiki edits, issues, and other contributions 25 | that are not aligned to this Code of Conduct, or to ban temporarily or 26 | permanently any contributor for other behaviors that they deem inappropriate, 27 | threatening, offensive, or harmful. 28 | 29 | By adopting this Code of Conduct, project maintainers commit themselves to 30 | fairly and consistently applying these principles to every aspect of managing 31 | this project. Project maintainers who do not follow or enforce the Code of 32 | Conduct may be permanently removed from the project team. 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces 35 | when an individual is representing the project or its community. 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 38 | reported by contacting a project maintainer at henry.hinnefeld@gmail.com. 39 | All complaints will be reviewed and investigated and will result in a response 40 | that is deemed necessary and appropriate to the circumstances. Maintainers are 41 | obligated to maintain confidentiality with regard to the reporter of an 42 | incident. 43 | 44 | 45 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 46 | version 1.3.0, available at 47 | [http://contributor-covenant.org/version/1/3/0/][version] 48 | 49 | [homepage]: http://contributor-covenant.org 50 | [version]: http://contributor-covenant.org/version/1/3/0/ 51 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to gitrisky 2 | 3 | We welcome bug reports and pull requests from everyone! 4 | This project is intended to be a safe, welcoming space for collaboration, and 5 | contributors are expected to adhere to the 6 | [Contributor Covenant](http://contributor-covenant.org) code of conduct. 7 | 8 | 9 | ## Getting Started 10 | 11 | 1. Fork it ( https://github.com/hinnefe2/gitrisky/fork ). 12 | 2. Install it, with the development dependencies. See `README.md`. 13 | 3. Make sure you are able to run the test suite locally (`py.test gitrisky`). 14 | 4. Create a feature branch (`git checkout -b my-new-feature`). 15 | 5. Make your change. Don't forget tests. 16 | 6. Make sure the test suite and style checks pass 17 | (`py.test gitrisky && flake8`). 18 | 7. Commit your changes (`git commit -am 'Add some feature'`). 19 | 8. Push to the branch (`git push origin my-new-feature`). 20 | 9. Create a new pull request. 21 | 10. If the build fails, address any issues. 22 | 23 | ## Tips 24 | 25 | - Contributions must conform to the guidelines encoded by `flake8`, based on 26 | PEP-8. 27 | 28 | Thank you for taking the time to contribute! 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Henry Hinnefeld 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gitrisky 2 | [![MIT License](https://img.shields.io/github/license/mashape/apistatus.svg)](http://opensource.org/licenses/MIT) 3 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/gitrisky.svg)](https://pypi.python.org/pypi/gitrisky/) 4 | [![Build Status](https://travis-ci.com/hinnefe2/gitrisky.svg?branch=master)](https://travis-ci.com/hinnefe2/gitrisky) 5 | [![codecov](https://codecov.io/gh/hinnefe2/gitrisky/branch/master/graph/badge.svg)](https://codecov.io/gh/hinnefe2/gitrisky) 6 | ![hasbadges](https://z2x6abi6e2.execute-api.us-east-1.amazonaws.com/v1/hasbadges?user=hinnefe2&repo=gitrisky) 7 | 8 | 9 | Predict code bug risk with git metadata 10 | 11 | 12 | ## Installation 13 | Installation with `pip` is recommended: 14 | ``` 15 | pip install gitrisky 16 | ``` 17 | Note that `gitrisky` requires `numpy`. If you don't already have it `pip` will 18 | try to install it for you, but this can result in a suboptimal build, see e.g. 19 | [here](https://github.com/scikit-learn/scikit-learn/issues/2569). 20 | 21 | For development a few additional dependencies are required: 22 | ``` 23 | pip install -r requirements-dev.txt 24 | ``` 25 | 26 | ## Usage 27 | `gitrisky` is installed as a command line tool. 28 | ``` 29 | Usage: gitrisky [OPTIONS] COMMAND [ARGS]... 30 | 31 | Options: 32 | --help Show this message and exit. 33 | 34 | Commands: 35 | predict Score a git commit bug risk model. 36 | train Train a git commit bug risk model. 37 | ``` 38 | The typical workflow is to first train a model on the existing commit history 39 | of a repository: 40 | ``` 41 | $ cd repo/ 42 | $ gitrisky train 43 | Model trained on 69 training examples with 14 positive cases 44 | ``` 45 | and then use the trained model to score subsequent commits: 46 | ``` 47 | $ gitrisky predict 48 | Commit 910cdb3c has a bug score of 0.2 / 1.0 49 | ``` 50 | When invoked without any extra arguments `gitrisky predict` will score the most 51 | recent commit. You can also score a particular commit with the `-c` flag: 52 | ``` 53 | $ gitrisky predict -c 470741f 54 | Commit 470741f has a bug score of 0.7 / 1.0 55 | ``` 56 | 57 | ## How does it work? 58 | See this [PyData talk](https://www.youtube.com/watch?v=2yzWrI3zGY0) for an explanation of how `gitrisky` works. 59 | 60 | 61 | ## Contributing 62 | Contributions are welcome! Please see `CONTRIBUTING.md` for information about 63 | contributing to this project. 64 | 65 | 66 | ## License 67 | The code in this project is licensed under the MIT license. See `LICENSE` for details. 68 | 69 | 70 | ## Acknowledgements 71 | The initial prototype of `gitrisky` was developed at 72 | [Civis Analytics](https://github.com/civisanalytics) during my 'Hack Time' 73 | (time explicitly allotted to explore offbeat ideas) . 74 | -------------------------------------------------------------------------------- /gitrisky/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hinnefe2/gitrisky/fd48b891f1f599a0b6d055e533c861ba63833dee/gitrisky/__init__.py -------------------------------------------------------------------------------- /gitrisky/cli.py: -------------------------------------------------------------------------------- 1 | """This module contains cli commands to train and score gitrisky models""" 2 | 3 | import sys 4 | import click 5 | 6 | from .model import create_model, save_model, load_model 7 | from .gitcmds import get_latest_commit 8 | from .parsing import get_features, get_labels 9 | 10 | 11 | @click.group() 12 | def cli(): 13 | pass 14 | 15 | 16 | @cli.command() 17 | def train(): 18 | """Train a git commit bug risk model. 19 | 20 | This will save a pickled sklearn model to a file in the toplevel directory 21 | for this repository. 22 | """ 23 | 24 | # get the features and labels by parsing the git logs 25 | features = get_features() 26 | 27 | # we can't train a model without positive training examples so we fail with 28 | # an informative error message 29 | try: 30 | labels = get_labels() 31 | except ValueError: 32 | # TODO: update this message once we support more / custom bug tags 33 | print('Failed to find any bug commits by parsing commit logs.\n' 34 | 'gitrisky looks for commit messages containing "bug" or "fix" ' 35 | 'and this repo appears not to have any.') 36 | sys.exit(1) 37 | 38 | # instantiate and train a model 39 | model = create_model() 40 | model.fit(features, labels) 41 | 42 | print('Model trained on {n} training examples with {n_bug} positive cases' 43 | .format(n=len(features), n_bug=sum(labels))) 44 | 45 | # pickle the model to a file in the top level repo directory 46 | save_model(model) 47 | 48 | 49 | @cli.command() 50 | @click.option('-c', '--commit', type=str) 51 | def predict(commit): 52 | """Score a git commit bug risk model. 53 | 54 | Parameters 55 | ---------- 56 | commit: str 57 | The hash of the commit to score. 58 | 59 | Raises 60 | ------ 61 | NotFittedError 62 | If a gitrisky model has not yet been trained on the currrent repo. 63 | """ 64 | 65 | try: 66 | model = load_model() 67 | except FileNotFoundError: 68 | print('could not find trained model. ' 69 | 'have you run "gitrisky train" yet?') 70 | sys.exit(1) 71 | 72 | if commit is None: 73 | commit = get_latest_commit() 74 | 75 | features = get_features(commit) 76 | 77 | # pull out just the postive class probability 78 | [(_, score)] = model.predict_proba(features) 79 | 80 | print('Commit {commit} has a bug score of {score} / 1.0' 81 | .format(commit=commit, score=score)) 82 | -------------------------------------------------------------------------------- /gitrisky/gitcmds.py: -------------------------------------------------------------------------------- 1 | """This module contains functions which invoke git cli commands.""" 2 | 3 | import re 4 | 5 | from collections import defaultdict 6 | from subprocess import check_output 7 | 8 | 9 | def _run_bash_command(bash_cmd): 10 | """Execute a bash command and capture the resulting stdout. 11 | 12 | Parameters 13 | ---------- 14 | bash_cmd : str 15 | The bash command to run. 16 | 17 | Returns 18 | ------- 19 | stdout : str 20 | The resulting stdout output. 21 | """ 22 | 23 | stdout = check_output(bash_cmd.split()).decode('utf-8').rstrip('\n') 24 | 25 | return stdout 26 | 27 | 28 | def trim_hash(commit): 29 | """Trim a commit hash to 8 characters.""" 30 | 31 | return commit[:8] 32 | 33 | 34 | def get_latest_commit(): 35 | """Get the hash of the most recent commit. 36 | 37 | Returns 38 | ------- 39 | hash : str 40 | The 8 character hash of the most recent commit 41 | """ 42 | 43 | bash_cmd = 'git log -1 --pretty=format:"%H"' 44 | 45 | stdout = _run_bash_command(bash_cmd) 46 | 47 | # single line outputs get quoted by check_output for some reason 48 | stdout = stdout.replace('"', '') 49 | 50 | return trim_hash(stdout) 51 | 52 | 53 | def get_git_log(commit=None): 54 | """Get the git log entry for one or more commits. 55 | 56 | This will return log entries with the format generated by the '--stat' 57 | option. 58 | 59 | Parameters 60 | ---------- 61 | commit : str, optional 62 | The hash of the commit to get log entries for. If not given this will 63 | return log entries for all commits. 64 | 65 | Returns 66 | ------- 67 | logstr : str 68 | A single string containing the output of a git log command.` 69 | 70 | """ 71 | 72 | if commit is not None: 73 | bash_cmd = \ 74 | 'git --no-pager log --stat -1 {commit}'.format(commit=commit) 75 | else: 76 | bash_cmd = 'git --no-pager log --stat' 77 | 78 | stdout = _run_bash_command(bash_cmd) 79 | 80 | return stdout 81 | 82 | 83 | def get_bugfix_commits(): 84 | """Get the commits whose commit messages contain BUG or FIX. 85 | 86 | Returns 87 | ------- 88 | commits : list(str) 89 | A list of commit hashes. 90 | 91 | Raises 92 | ------ 93 | ValueError 94 | If there are no bugfix commits (i.e. no commits which fix a bug 95 | according to the commit messages). 96 | """ 97 | 98 | # TODO: add option to specify custom bugfix tags 99 | bash_cmd = "git log -i --all --grep BUG --grep FIX --pretty=format:%h" 100 | 101 | stdout = _run_bash_command(bash_cmd) 102 | 103 | # filter out empty strings 104 | commits = [commit for commit in stdout.split('\n') if commit] 105 | 106 | if not commits: 107 | raise ValueError('No bug fix commits found') 108 | 109 | return commits 110 | 111 | 112 | def _get_commit_filenames(commit_hash): 113 | """Get the filename(s) of files which were modified by a specific commit. 114 | 115 | Parameters 116 | ---------- 117 | commit_hash: str 118 | The hash of a commit. 119 | 120 | Returns 121 | ------- 122 | filenames: list(str) 123 | A list of the filenames which were modified by the specified commit. 124 | """ 125 | 126 | commit_hash = trim_hash(commit_hash) 127 | 128 | bash_cmd = ('git --no-pager diff {commit_hash} {commit_hash}^ --name-only' 129 | .format(commit_hash=commit_hash)) 130 | 131 | stdout = _run_bash_command(bash_cmd) 132 | 133 | # note that .split() always returns a list, even if the string wasn't split 134 | filenames = stdout.split('\n') 135 | 136 | return filenames 137 | 138 | 139 | def _get_commit_lines(commit_hash, filenames): 140 | """Get the line numbers which were modified in each file by a given commit. 141 | 142 | Parameters 143 | ---------- 144 | commit_hash: str 145 | The hash of a commit. 146 | filenames: list(str) 147 | A list of the filenames which were modified by the specified commit. 148 | 149 | Returns 150 | ------- 151 | fname_lines: dict{str: list} 152 | A dictionary keyed on filename and valued with a list of 153 | (start_line, number_of_lines) tuples. 154 | """ 155 | 156 | commit_hash = trim_hash(commit_hash) 157 | fname_lines = defaultdict(lambda: []) 158 | 159 | for fname in filenames: 160 | 161 | bash_cmd = ('git --no-pager diff {commit}^ {commit} -U0 -- {fname}' 162 | .format(commit=commit_hash, fname=fname)) 163 | 164 | stdout = _run_bash_command(bash_cmd) 165 | 166 | # pull out the header line of each diff section 167 | headers = [l for l in stdout.split('\n') if '@@' in l] 168 | 169 | # header will look like @@ -198,2 +198,2 @@ 170 | for header in headers: 171 | 172 | # the .group(1) bit will pull out the part prefixed by '+' 173 | match = re.match('@@ -(.*) +(.*) @@', header).group(1) 174 | 175 | # header looks like @@ -198 +198 @@ if only one line changes 176 | if ',' in match: 177 | start, n_lines = match.split(',') 178 | else: 179 | start, n_lines = match, '1' 180 | 181 | if int(n_lines) > 0: 182 | fname_lines[fname].append((start, n_lines)) 183 | 184 | return fname_lines 185 | 186 | 187 | def _get_blame_commit(commit_hash, filenames, fname_lines): 188 | """Get the commits which last touched the lines changed by a given commit. 189 | 190 | Parameters 191 | ---------- 192 | commit_hash: str 193 | The hash of a commit. 194 | filenames: list(str) 195 | A list of the filenames which were modified by the specified commit. 196 | fname_lines: dict{str: list} 197 | A dictionary keyed on filename and valued with a list of 198 | (start_line, number_of_lines) tuples. 199 | 200 | Returns 201 | ------- 202 | buggy_commits: set 203 | A set containing the hashes of the commits which last modified the 204 | lines modified by the given commit. 205 | """ 206 | 207 | commit_hash = trim_hash(commit_hash) 208 | buggy_commits = set() 209 | 210 | for fname in filenames: 211 | 212 | for start, n_lines in fname_lines[fname]: 213 | 214 | bash_cmd = \ 215 | ('git --no-pager blame -L{start},+{n} {commit}^ -- {fname}' 216 | .format(start=start, 217 | n=n_lines, 218 | commit=commit_hash, 219 | fname=fname)) 220 | 221 | stdout = _run_bash_command(bash_cmd) 222 | 223 | changed_lines = stdout.split('\n') 224 | buggy_commits = \ 225 | buggy_commits.union([l.split(' ')[0] for l in changed_lines]) 226 | 227 | return buggy_commits 228 | 229 | 230 | def link_fixes_to_bugs(fix_commits): 231 | """Link a bugfix commit to the commits which introduced the bug it fixes. 232 | 233 | Parameters 234 | ---------- 235 | fix_commits: list(str) 236 | A list of hashes for commits which fix bugs. 237 | 238 | Returns 239 | ------- 240 | bug_commits: list(str) 241 | A list of hashes for commits which introduced bugs. 242 | """ 243 | 244 | bug_commits = set() 245 | 246 | for commit in fix_commits: 247 | 248 | # trim the hash to 8 characters 249 | commit = trim_hash(commit) 250 | 251 | # get the files modified by the commit 252 | filenames = _get_commit_filenames(commit) 253 | 254 | # get the lines in each file modified by the commit 255 | fname_lines = _get_commit_lines(commit, filenames) 256 | 257 | # get the last commit to modify those lines 258 | origin_commits = _get_blame_commit(commit, filenames, fname_lines) 259 | 260 | bug_commits = bug_commits.union(origin_commits) 261 | 262 | return list(bug_commits) 263 | -------------------------------------------------------------------------------- /gitrisky/model.py: -------------------------------------------------------------------------------- 1 | """This module contains code to load and save gitrisky models""" 2 | 3 | import os 4 | import pickle 5 | 6 | from git import Repo 7 | from sklearn.ensemble import RandomForestClassifier 8 | 9 | 10 | def _get_model_path(): 11 | """Get the full path of the gitrisky model. 12 | 13 | For now this is hardcoded to be '/gitrisky.model'. 14 | 15 | Returns 16 | ------- 17 | path : str 18 | The full path to the gitrisky model 19 | """ 20 | 21 | # use gitpython.Repo to find the repository's top level directory 22 | repo_dir = Repo('.', search_parent_directories=True).working_tree_dir 23 | model_path = os.path.join(repo_dir, 'gitrisky.model') 24 | 25 | return model_path 26 | 27 | 28 | def create_model(): 29 | """Create a new model. 30 | 31 | Returns 32 | ------- 33 | model : scikit-learn model 34 | A saved scikit-learn model 35 | """ 36 | 37 | # instantiate a new model 38 | # TODO: replace this with a gridsearchCV object for hyperparameter tuning 39 | model = RandomForestClassifier() 40 | 41 | return model 42 | 43 | 44 | def load_model(): 45 | """Load a model from a pickle file. 46 | 47 | Returns 48 | ------- 49 | model : scikit-learn model 50 | A saved scikit-learn model 51 | 52 | Raises 53 | ------ 54 | FileNotFoundError 55 | If the trained model pickle file can't be found 56 | """ 57 | 58 | model_path = _get_model_path() 59 | 60 | with open(model_path, 'rb') as infile: 61 | model = pickle.load(infile) 62 | 63 | return model 64 | 65 | 66 | def save_model(model): 67 | """Save a model to a pickle file. 68 | 69 | Parameters 70 | ---------- 71 | model : scikit-learn model 72 | The scikit-learn model to save. 73 | """ 74 | 75 | model_path = _get_model_path() 76 | 77 | with open(model_path, 'wb') as outfile: 78 | pickle.dump(model, outfile) 79 | -------------------------------------------------------------------------------- /gitrisky/parsing.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions which extract features from git log entries. 3 | """ 4 | 5 | import re 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from collections import defaultdict 10 | from .gitcmds import get_git_log, get_bugfix_commits, link_fixes_to_bugs, \ 11 | trim_hash 12 | 13 | 14 | def split_commits(whole_log): 15 | """Split the output of git log into separate entries per commit. 16 | 17 | Parameters 18 | ---------- 19 | whole_log: str 20 | A string containing the entire git log. 21 | 22 | Returns 23 | ------- 24 | list(str) 25 | A list of log entries, with each commit as its own string. 26 | """ 27 | 28 | lines = whole_log.splitlines() 29 | 30 | # find the indices which separate each commit's entry 31 | commit_line_idxs = [i for i, line in enumerate(lines) 32 | if re.match(r'^commit \w{40}$', line)] 33 | 34 | # split the lines from the whole log into subsets for each log entry 35 | commit_lines = np.array_split(lines, commit_line_idxs) 36 | 37 | return ["\n".join(arr) for arr in commit_lines[1:]] 38 | 39 | 40 | def parse_commit(commit_str): 41 | """Extract features from the text of a commit log entry. 42 | 43 | Parameters 44 | ---------- 45 | commit_str: str 46 | The text of a commit log entry. 47 | 48 | Returns 49 | ------- 50 | feats: defaultdict 51 | A dictionary of feature values. 52 | """ 53 | 54 | feats = defaultdict(lambda: None) 55 | lines = commit_str.splitlines() 56 | 57 | # parse the commit line 58 | commit_line = [line for line in lines if line.startswith('commit')][0] 59 | feats['hash'] = \ 60 | trim_hash(re.match(r'commit (\w{40})', commit_line).group(1)) 61 | 62 | # NOTE: skip string features for now because the one-hot encoding is a pain 63 | # parse the author line 64 | # author_line = [line for line in lines if line.startswith('Author:')][0] 65 | # author_matches = re.match(r'Author: (.+) <(.+)>', author_line) 66 | # feats['user'] = author_matches.group(1) 67 | # feats['email'] = author_matches.group(2) 68 | 69 | # parse the date line 70 | time_line = [line for line in lines if line.startswith('Date:')][0] 71 | timestamp = re.match(r'Date: (.*)', time_line).group(1) 72 | # TODO: fix the hardcoded timezone 73 | created_at = pd.to_datetime(timestamp, utc=True).tz_convert('US/Central') 74 | feats['dayofweek'] = created_at.dayofweek 75 | feats['hour'] = created_at.hour 76 | 77 | # parse the body lines 78 | body_lines = [line.lstrip() for line in lines if line.startswith(' ')] 79 | feats['len_message'] = len('\n'.join(body_lines)) 80 | 81 | # NOTE: skip string features for now because the one-hot encoding is a pain 82 | # feats['tag'] = body_lines[0].split()[0].rstrip(':') 83 | 84 | # if this is a merge commit fill some fields with NaNs 85 | if any([line.startswith('Merge:') for line in lines]): 86 | # feats['tag'] = 'MERGE' 87 | feats['changed_files'] = np.NaN 88 | feats['additions'] = np.NaN 89 | feats['deletions'] = np.NaN 90 | 91 | return feats 92 | 93 | # parse the changes line 94 | changes_line = lines[-1] 95 | 96 | changed_regex = r' ([0-9]+) file[s]{0,1} changed' 97 | insert_regex = r'.* ([0-9]+) insertion[s]{0,1}' 98 | delete_regex = r'.* ([0-9]+) deletion[s]{0,1}' 99 | 100 | if re.match(changed_regex, changes_line): 101 | feats['changed_files'] = \ 102 | int(re.match(changed_regex, changes_line).group(1)) 103 | 104 | if re.match(insert_regex, changes_line): 105 | feats['additions'] = int(re.match(insert_regex, changes_line).group(1)) 106 | 107 | if re.match(delete_regex, changes_line): 108 | feats['deletions'] = int(re.match(delete_regex, changes_line).group(1)) 109 | 110 | return feats 111 | 112 | 113 | def get_features(commit=None): 114 | """Get commit-level features. 115 | 116 | Parameters 117 | ---------- 118 | commit : str, optional 119 | The hash of the commit to get features for. If not given this will 120 | return features for all commits. 121 | 122 | Returns 123 | ------- 124 | features : pd.DataFrame of shape [n_commits, n_features] 125 | The features to use for modeling. The dataframe is indexed by commit 126 | hash. 127 | """ 128 | 129 | logstr = get_git_log(commit) 130 | 131 | feats = pd.DataFrame([parse_commit(c) for c in split_commits(logstr)]) 132 | 133 | feats = feats.set_index('hash').fillna(0) 134 | 135 | return feats 136 | 137 | 138 | def get_labels(): 139 | """Get a label for each commit indicating whether it introduced a bug. 140 | 141 | Returns 142 | ------- 143 | labels : pd.Series of shape (n_commits,) 144 | The labels to use for modeling. The dataframe is indexed by commit 145 | hash. 146 | """ 147 | 148 | feats = get_features() 149 | 150 | fix_commits = get_bugfix_commits() 151 | 152 | bug_commits = link_fixes_to_bugs(fix_commits) 153 | 154 | labels = feats.index.isin(bug_commits).astype(int) 155 | 156 | # convert to DataFrame so everything is the same type 157 | return pd.Series(data=labels, index=feats.index, name='label') 158 | -------------------------------------------------------------------------------- /gitrisky/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hinnefe2/gitrisky/fd48b891f1f599a0b6d055e533c861ba63833dee/gitrisky/tests/__init__.py -------------------------------------------------------------------------------- /gitrisky/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import mock 2 | 3 | from click.testing import CliRunner 4 | from gitrisky.cli import cli 5 | 6 | 7 | @mock.patch('gitrisky.cli.get_features') 8 | @mock.patch('gitrisky.cli.get_labels') 9 | @mock.patch('gitrisky.cli.create_model') 10 | @mock.patch('gitrisky.cli.save_model') 11 | def test_cli_train(m_save_model, m_create_model, m_get_labels, m_get_features): 12 | 13 | # make some fake features and labels 14 | m_get_features.return_value = [[1, 1], [2, 2]] 15 | m_get_labels.return_value = [0, 1] 16 | 17 | # test the 'gitrisky train' cli command 18 | runner = CliRunner() 19 | result = runner.invoke(cli, ['train']) 20 | 21 | assert result.exit_code == 0 22 | assert result.output == \ 23 | 'Model trained on 2 training examples with 1 positive cases\n' 24 | 25 | for mck in [m_save_model, m_create_model, m_get_labels, m_get_features]: 26 | assert mck.call_count == 1 27 | 28 | 29 | @mock.patch('gitrisky.cli.get_features') 30 | @mock.patch('gitrisky.cli.get_labels') 31 | @mock.patch('gitrisky.cli.create_model') 32 | @mock.patch('gitrisky.cli.save_model') 33 | def test_cli_train_no_bugs(m_save_model, m_create_model, m_get_labels, 34 | m_get_features): 35 | 36 | # make some fake features and labels 37 | m_get_features.return_value = [[1, 1], [2, 2]] 38 | m_get_labels.side_effect = ValueError('No bug commits found') 39 | 40 | # test the 'gitrisky train' cli command 41 | runner = CliRunner() 42 | result = runner.invoke(cli, ['train']) 43 | 44 | assert result.exit_code == 1 45 | assert result.output == ( 46 | 'Failed to find any bug commits by parsing commit logs.\n' 47 | 'gitrisky looks for commit messages containing "bug" or "fix" ' 48 | 'and this repo appears not to have any.\n') 49 | 50 | 51 | @mock.patch('gitrisky.cli.get_features') 52 | @mock.patch('gitrisky.cli.get_latest_commit') 53 | @mock.patch('gitrisky.cli.load_model') 54 | def test_cli_predict(m_load_model, m_get_latest_commit, m_get_features): 55 | 56 | runner = CliRunner() 57 | 58 | model = mock.MagicMock() 59 | model.predict_proba.return_value = [(0.1, 0.9)] 60 | 61 | m_load_model.return_value = model 62 | m_get_latest_commit.return_value = 'abcd' 63 | 64 | # test what happens when we don't specify a commit 65 | result = runner.invoke(cli, ['predict']) 66 | 67 | assert m_get_features.called_with('abcd') 68 | assert result.output == 'Commit abcd has a bug score of 0.9 / 1.0\n' 69 | assert result.exit_code == 0 70 | 71 | # test what happens when we specify a commit 72 | result = runner.invoke(cli, ['predict', '-c', '12345']) 73 | 74 | assert m_get_features.called_with('12345') 75 | assert result.output == 'Commit 12345 has a bug score of 0.9 / 1.0\n' 76 | assert result.exit_code == 0 77 | 78 | # test what happens when we can't load the model 79 | m_load_model.side_effect = FileNotFoundError() 80 | 81 | result = runner.invoke(cli, ['predict']) 82 | 83 | assert result.output == \ 84 | 'could not find trained model. have you run "gitrisky train" yet?\n' 85 | assert result.exit_code == 1 86 | -------------------------------------------------------------------------------- /gitrisky/tests/test_gitcmds.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | 4 | import numpy as np 5 | 6 | from collections import defaultdict 7 | 8 | from gitrisky.gitcmds import _run_bash_command, trim_hash, get_latest_commit, \ 9 | get_git_log, get_bugfix_commits, _get_commit_filenames, \ 10 | _get_commit_lines, _get_blame_commit, link_fixes_to_bugs 11 | 12 | 13 | @mock.patch('gitrisky.gitcmds.check_output') 14 | def test_run_bash_command(mock_co): 15 | 16 | cmd = 'some bash command' 17 | output = 'output from some command\n'.encode('utf-8') 18 | 19 | mock_co.return_value = output 20 | 21 | stdout = _run_bash_command(cmd) 22 | 23 | assert mock_co.called_with('some', 'bash', 'command') 24 | assert stdout == 'output from some command' 25 | 26 | 27 | def test_trim_hash(): 28 | 29 | long_hash = '123456789abcdefg' 30 | 31 | assert trim_hash(long_hash) == long_hash[:8] 32 | 33 | 34 | @mock.patch('gitrisky.gitcmds._run_bash_command') 35 | def test_get_latest_commit(mock_runbc): 36 | 37 | stdout = '4db4fc24afe7565ac65fdb272c7c157c43aace77' 38 | 39 | mock_runbc.return_value = stdout 40 | 41 | commit = get_latest_commit() 42 | 43 | assert isinstance(commit, str) 44 | assert len(commit) == 8 45 | assert commit == stdout[:8] 46 | assert mock_runbc.called_with('git log -1 --pretty=format:"%H"') 47 | 48 | 49 | @mock.patch('gitrisky.gitcmds._run_bash_command') 50 | def test_get_git_log(mock_runbc): 51 | 52 | stdout = ("Merge: 910cdb3 bbb59ea\n" 53 | "Author: Henry Hinnefeld \n" 54 | "Date: Sun Feb 4 15:55:45 2018 -0600\n" 55 | "\n" 56 | " Merge pull request #10 from hinnefe2/write_readme\n" 57 | "\n" 58 | " Write readme") 59 | 60 | mock_runbc.return_value = stdout 61 | 62 | # test calling with a commit specified 63 | log = get_git_log('1234abcd') 64 | 65 | assert mock_runbc.called_with('git --no-pager log --stat -1 1234abcd') 66 | assert log == stdout 67 | 68 | # test calling with no commit specified 69 | log = get_git_log() 70 | 71 | assert mock_runbc.called_with('git --no-pager log --stat') 72 | assert log == stdout 73 | 74 | 75 | @mock.patch('gitrisky.gitcmds._run_bash_command') 76 | def test_get_bugfix_commits(mock_runbc): 77 | 78 | stdout = "671e13d\n4fe1c42\n3e10227\n91d54e3\n2c3dca4" 79 | 80 | mock_runbc.return_value = stdout 81 | 82 | commits = get_bugfix_commits() 83 | 84 | assert mock_runbc.called_with( 85 | 'git log -i --all --grep BUG --grep FIX --pretty=format:%h') 86 | assert np.array_equal(commits, ['671e13d', '4fe1c42', '3e10227', 87 | '91d54e3', '2c3dca4']) 88 | 89 | 90 | @mock.patch('gitrisky.gitcmds._run_bash_command') 91 | def test_get_bugfix_commits_no_bugs(mock_runbc): 92 | 93 | stdout = "\n" 94 | 95 | mock_runbc.return_value = stdout 96 | 97 | with pytest.raises(ValueError): 98 | get_bugfix_commits() 99 | 100 | 101 | @mock.patch('gitrisky.gitcmds._run_bash_command') 102 | def test_get_commit_filenames(mock_runbc): 103 | 104 | stdout = "gitrisky/cli.py\ngitrisky/model.py" 105 | 106 | mock_runbc.return_value = stdout 107 | 108 | fnames = _get_commit_filenames('dc95b21') 109 | 110 | # try when _get_commit_filenames returns multiple filenames 111 | assert isinstance(fnames, list) 112 | assert mock_runbc.called_with( 113 | 'git --no-pager diff dc95b21 dc95b21^ --name-only') 114 | assert np.array_equal(fnames, ['gitrisky/cli.py', 'gitrisky/model.py']) 115 | 116 | # try when _get_commit_filenames returns single filename 117 | stdout = "gitrisky/cli.py" 118 | mock_runbc.return_value = stdout 119 | 120 | fnames = _get_commit_filenames('dc95b21') 121 | 122 | assert isinstance(fnames, list) 123 | 124 | 125 | @mock.patch('gitrisky.gitcmds._run_bash_command') 126 | def test_get_commit_lines(mock_runbc): 127 | 128 | # specify different return values for repeated calls to mock_runbc 129 | # note that this isn't the complete output of the relevant git command, 130 | # just the important bits 131 | stdout1 = ("@@ -5,3 +5 @@ import click\n" 132 | "@@ -30 +28 @@ def train():") 133 | stdout2 = ("@@ -6,0 +7 @@ from git import Repo\n" 134 | "@@ -26,0 +28,16 @@ def _get_model_path():") 135 | 136 | mock_runbc.side_effect = [stdout1, stdout2] 137 | 138 | lines = \ 139 | _get_commit_lines('dc95b21', ['gitrisky/cli.py', 'gitrisky/model.py']) 140 | 141 | assert mock_runbc.called_with( 142 | 'git --no-pager diff dc95b21^ dc95b21 -U0 -- gitrisky/cli.py') 143 | assert mock_runbc.called_with( 144 | 'git --no-pager diff dc95b21^ dc95b21 -U0 -- gitrisky/model.py') 145 | 146 | assert isinstance(lines, defaultdict) 147 | 148 | # we deleted 3 lines after line 5 and 1 line after line 30 in cli.py 149 | assert np.array_equal(lines['gitrisky/cli.py'], [('5', '3'), ('30', '1')]) 150 | 151 | # we didn't delete any lines in model.py 152 | assert np.array_equal(lines['gitrisky/model.py'], []) 153 | 154 | 155 | @mock.patch('gitrisky.gitcmds._run_bash_command') 156 | def test_get_blame_commit(mock_runbc): 157 | 158 | stdout1 = ("c668b98e (Henry Hinnefeld 2018-01-22 07:33:22 -0600 30) model = RandomForestClassifier()") # noqa 159 | stdout2 = ("c668b98e gitrisky/cli.py (Henry Hinnefeld 2018-01-22 07:33:22 -0600 5) from sklearn.ensemble import RandomForestClassifier\n" # noqa 160 | "2f0b9d3b cli.py (Henry Hinnefeld 2018-01-21 20:03:36 -0600 6) \n" # noqa 161 | "209879e0 gitrisky/cli.py (Henry Hinnefeld 2018-01-22 07:34:16 -0600 7) from .model import save_model, load_model") # noqa 162 | mock_runbc.side_effect = [stdout1, stdout2] 163 | 164 | filenames = ['gitrisky/cli.py', 'gitrisky/model.py'] 165 | fname_lines = {'gitrisky/cli.py': [('5', '3'), ('30', '1')], 166 | 'gitrisky/model.py': []} 167 | 168 | bug_commits = _get_blame_commit('dc95b21', filenames, fname_lines) 169 | 170 | # check we called the right git commands 171 | assert mock_runbc.called_with( 172 | 'git --no-pager blame -L5,+3 dc95b21^ -- gitrisky/cli.py') 173 | assert mock_runbc.called_with( 174 | 'git --no-pager blame -L30,+1 dc95b21^ -- gitrisky/cli.py') 175 | 176 | assert isinstance(bug_commits, set) 177 | assert bug_commits == set(['c668b98e', '2f0b9d3b', '209879e0']) 178 | 179 | 180 | def test_link_fixes_to_bugs(): 181 | 182 | # NOTE: this is effectively an integration test because the 183 | # link_fixes_to_bugs function just chains all the other functions 184 | 185 | # NOTE: this test actually runs against the gitrisky repo history 186 | 187 | fix_commits = ['3e10227', '2c3dca4'] 188 | 189 | bug_commits = link_fixes_to_bugs(fix_commits) 190 | 191 | assert isinstance(bug_commits, list) 192 | assert set(bug_commits) == set(['d90875b0', 'e359f619', 'bb47087b']) 193 | -------------------------------------------------------------------------------- /gitrisky/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pickle 3 | 4 | from collections import namedtuple 5 | from tempfile import NamedTemporaryFile 6 | from gitrisky.model import _get_model_path, create_model, load_model, \ 7 | save_model 8 | 9 | 10 | @mock.patch('gitrisky.model.Repo') 11 | def test_get_model_path(mock_Repo): 12 | 13 | # fake version of the Repo class 14 | Repo = namedtuple('Repo', 'working_tree_dir') 15 | fake_instance = Repo('path/to/repo') 16 | 17 | # make the mocked constructor return the fake instance 18 | mock_Repo.return_value = fake_instance 19 | 20 | path = _get_model_path() 21 | 22 | assert isinstance(path, str) 23 | assert path == 'path/to/repo/gitrisky.model' 24 | 25 | 26 | def test_create_model(): 27 | 28 | model = create_model() 29 | 30 | assert hasattr(model, 'fit') 31 | assert callable(getattr(model, 'fit')) 32 | 33 | assert hasattr(model, 'predict_proba') 34 | assert callable(getattr(model, 'predict_proba')) 35 | 36 | 37 | @mock.patch('gitrisky.model._get_model_path') 38 | def test_load_model(mock_gmp): 39 | 40 | with NamedTemporaryFile() as tmpfile: 41 | 42 | pickle.dump('fake model', tmpfile) 43 | tmpfile.seek(0) 44 | 45 | mock_gmp.return_value = tmpfile.name 46 | model = load_model() 47 | 48 | assert model == 'fake model' 49 | 50 | 51 | @mock.patch('gitrisky.model._get_model_path') 52 | def test_save_model(mock_gmp): 53 | 54 | with NamedTemporaryFile() as tmpfile: 55 | 56 | mock_gmp.return_value = tmpfile.name 57 | save_model('fake model') 58 | 59 | tmpfile.seek(0) 60 | model = pickle.load(tmpfile) 61 | 62 | assert model == 'fake model' 63 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | codecov 2 | flake8==3.5.0 3 | mock==2.0.0 4 | pytest==3.4.0 5 | pytest-cov==2.5.1 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.7 2 | GitPython>=2.1 3 | numpy>=1.13 4 | pandas>=0.20 5 | scikit-learn>=0.19.0 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | license_file = LICENSE 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | VERSION = '0.1.3' 4 | 5 | 6 | setup( 7 | 8 | name='gitrisky', 9 | 10 | description='Predict code bug risk with git metadata', 11 | 12 | version=VERSION, 13 | 14 | url='https://github.com/hinnefe2/gitrisky', 15 | 16 | download_url=('https://github.com/hinnefe2/gitrisky/archive/{}.tar.gz' 17 | .format(VERSION)), 18 | 19 | author='J. Henry Hinnefeld', 20 | 21 | author_email='henry.hinnefeld@gmail.com', 22 | 23 | packages=find_packages(), 24 | 25 | install_requires=[ 26 | 'click>=6.7', 27 | 'GitPython>=2.1', 28 | 'numpy>=1.13', 29 | 'pandas>=0.20', 30 | 'scikit-learn>=0.19', 31 | 'scipy>=0.19', 32 | ], 33 | 34 | classifiers=[ 35 | 'Programming Language :: Python', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | 'Topic :: Software Development :: Version Control :: Git', 39 | 'License :: OSI Approved :: MIT License', 40 | 'Natural Language :: English', 41 | ], 42 | 43 | include_package_data=True, 44 | 45 | entry_points={ 46 | 'console_scripts': ['gitrisky=gitrisky.cli:cli'], 47 | }, 48 | ) 49 | --------------------------------------------------------------------------------