├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── codesort.py ├── graph.png ├── sonar-project.properties ├── test_codesort.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | dist: trusty 5 | addons: 6 | sonarcloud: 7 | organization: "jeffgreenca-github" 8 | install: 9 | - pip install tox-travis 10 | script: 11 | - tox -e lint-check 12 | - tox 13 | - sonar-scanner 14 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # codesort - https://github.com/jeffgreenca/codesort 2 | 3 | # build networkit 4 | FROM python:3.6-stretch as builder 5 | RUN apt-get update && apt-get install -y \ 6 | cmake \ 7 | build-essential \ 8 | && rm -rf /var/lib/apt/lists/* 9 | RUN pip install \ 10 | gitpython==2.1.11 \ 11 | networkit==5.0 12 | 13 | # codesort image 14 | FROM python:3.6-stretch 15 | LABEL maintainer="jeff.green.ca@gmail.com" 16 | COPY --from=builder /usr/local/lib/python3.6 /usr/local/lib/python3.6 17 | WORKDIR /app 18 | COPY codesort.py . 19 | ENTRYPOINT ["/usr/local/bin/python3.6", "codesort.py", "/repo"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jeff Green 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/jeffgreenca/codesort.svg?branch=master)](https://travis-ci.org/jeffgreenca/codesort) [![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=jeffgreenca_codesort&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=jeffgreenca_codesort) ![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/jeffgreenca/codesort.svg?style=for-the-badge) ![Docker Cloud Automated build](https://img.shields.io/docker/cloud/automated/jeffgreenca/codesort.svg?style=for-the-badge) 2 | 3 | # codesort 4 | 5 | Given a git repository, identify the most "central" source files based on 6 | commit history. 7 | 8 | ``` 9 | $ cd /path/to/your/repo 10 | $ docker run --rm -v "$PWD":/repo:ro jeffgreenca/codesort 11 | ``` 12 | 13 | When approaching an unknown code base, for example as a maintenance programmer, 14 | this provides a clue about which source files to examine first. 15 | 16 | ![annotated graph via Cytoscape](graph.png) 17 | > Example graph rendered via [Cytoscape](https://cytoscape.org/) from codesort 18 | > output 19 | 20 | ## summary and credits 21 | This follows [Aron Lurie's method 22 | @medium](https://medium.com/@a.lurie_78598/using-graph-theory-to-decide-where-to-start-reading-source-code-74a1e2ddf72). 23 | In short, compute [betweenness 24 | centrality](https://en.wikipedia.org/wiki/Betweenness_centrality) on a graph 25 | constructed from reading commit history. 26 | 27 | Vertices of the graph represent individual files in the repository, and edges 28 | are added between vertices (u, v) when files u and v appear in the same commit. 29 | Edge weights are assigned based on the inverse count of commits wherein the two 30 | vertices appear together (so, files that are highly correlated have a low edge 31 | weight). 32 | 33 | ## usage example 34 | 35 | ### run with docker 36 | Mount your repository to `/repo` and run the container: 37 | ``` 38 | $ docker run --rm -v /path/to/your/repo:/repo:ro jeffgreenca/codesort 39 | 25.00% app/file1.py 40 | 8.00% app/lib/__init__.py 41 | 4.00% tox.ini 42 | 2.00% Dockerfile 43 | ... 44 | ``` 45 | 46 | The output format is `scorefilepath` per line where `score` is the ranking 47 | of betweenness centrality score, descending. 48 | 49 | ## advanced usage 50 | 51 | ``` 52 | $ docker run --rm jeffgreenca/codesort --help 53 | 54 | usage: codesort.py [-h] [-v] [-n N] [-c N] [-b] [-r] [--include INCLUDE] 55 | [--exclude EXCLUDE] 56 | repo 57 | 58 | List most "important" files in a git repo. Implements Aron Lurie's method, see 59 | details at: https://bit.ly/2v6M3X0 60 | 61 | positional arguments: 62 | repo Path to target repository 63 | 64 | optional arguments: 65 | -h, --help show this help message and exit 66 | -v, --verbose Show timings 67 | -n N, --num-results N 68 | Return only top N results 69 | -c N, --commits N Max number of commits to traverse 70 | -b, --bare Return sorted filenames (without scores) 71 | -r, --raw Show raw scores (default is percentage rank) 72 | --include INCLUDE Include files in repo matching glob pattern(s) (comma 73 | separated) 74 | --exclude EXCLUDE Exclude files in repo matching glob pattern(s) (comma 75 | separated) 76 | ``` 77 | 78 | > NOTE: when using the docker image, arg `repo` is already specified for you. 79 | 80 | ### include and exclude patterns 81 | 82 | You can select which files to include, and/or files to ignore, using standard 83 | glob patterns. 84 | 85 | For example, to consider only Python source files: 86 | ``` 87 | $ docker run ... codesort --include "*.py" 88 | app.py 89 | lib/app.py 90 | tests/test_app.py 91 | ``` 92 | 93 | To consider only Python source files, and exclude the `tests` base folder: 94 | ``` 95 | $ docker run ... codesort --include "*.py" --exclude "tests/*" 96 | app.py 97 | lib/app.py 98 | ``` 99 | 100 | Multiple patterns are can be specified via a comma-delimited list: 101 | ``` 102 | $ docker run ... codesort --include "*.py,*.go,src/*" 103 | app.py 104 | main.go 105 | src/config.yml 106 | ``` 107 | 108 | ## about `networkit` 109 | 110 | Initially, I used the friendly, approachable 111 | [networkx](http://networkx.github.io/) package. It has an excellent API and 112 | installs easily. 113 | 114 | Unfortunately, my `networkx` based implementation was painfully slow for large 115 | repositories. 116 | 117 | I switched to [networkit](https://networkit.github.io/) for a significant speed 118 | boost. This came at the cost of more development time to get the install 119 | working, and slightly more complex code due to additional record-keeping 120 | requirements, but pays off when running `codesort` on large repositories (and 121 | frankly, if the repository isn't large, why bother using this tool anyway?). 122 | 123 | ## contributing 124 | 125 | Contributions welcome. Please apply [black](https://github.com/python/black). 126 | 127 | ## code of conduct 128 | 129 | Be kind. 130 | -------------------------------------------------------------------------------- /codesort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # codesort.py - jeffgreenca 2019 3 | from git import Repo 4 | 5 | from collections import Counter 6 | from itertools import combinations 7 | import argparse 8 | import fnmatch 9 | import re 10 | import time 11 | 12 | # Temporary hack to suppress networkit stdout "warnings" 13 | # thanks https://codingdose.info/2018/03/22/supress-print-output-in-python/ 14 | # TODO fix after patch to networkit lands switching to system warnings lib 15 | import io 16 | from contextlib import redirect_stdout 17 | 18 | null = io.StringIO() 19 | with redirect_stdout(null): 20 | from networkit import graph, centrality 21 | 22 | 23 | def iter_files_per_commit(r, limit=None): 24 | """Iterate over lists of files per commit, by calling git log""" 25 | sep = "<|>" 26 | kwargs = {"name_only": True, "format": "format:%s" % sep} 27 | if limit: 28 | kwargs["max_count"] = limit 29 | log = r.git.log(**kwargs) 30 | for commit in log.split(sep + "\n"): 31 | files = [f.strip() for f in commit.split("\n") if f.strip()] 32 | if files: 33 | yield files 34 | 35 | 36 | # verbose display of runtime durations 37 | def start(x): 38 | if verbose: 39 | print("%s..." % x, end="", flush=True) 40 | return time.time() 41 | 42 | 43 | def finish(s): 44 | if verbose: 45 | print("ok (%ss)" % round(time.time() - s, 2), flush=True) 46 | 47 | 48 | def _top_x_hits(bb, x, raw=False): 49 | """Return nicely formatted list of scores by file""" 50 | if not raw: 51 | total = sum((score for _, score in bb)) 52 | for node, score in sorted(bb, key=lambda x: x[1], reverse=True)[:x]: 53 | if raw: 54 | yield ("%8.6f" % round(score, 6), node) 55 | else: 56 | yield ("%5.1f%%" % (score * 100 / total), node) 57 | 58 | 59 | def _gen_filter_files_func(include_pats, exclude_pats): 60 | """Return a function that filters a list of files by glob patterns""" 61 | regex_include = tuple( 62 | re.compile(rx) for rx in (fnmatch.translate(p) for p in include_pats) 63 | ) 64 | regex_exclude = tuple( 65 | re.compile(rx) for rx in (fnmatch.translate(p) for p in exclude_pats) 66 | ) 67 | 68 | def filter_func(files): 69 | for f in files: 70 | if any((r.match(f) for r in regex_exclude)): 71 | continue 72 | if not regex_include or any((r.match(f) for r in regex_include)): 73 | yield f 74 | 75 | return filter_func 76 | 77 | 78 | def main( 79 | repo_path, 80 | count, 81 | limit, 82 | bare=False, 83 | single=False, 84 | export=None, 85 | show_raw_scores=False, 86 | include=None, 87 | exclude=None, 88 | ): 89 | """List most "important" files in a git repo. 90 | 91 | Implements Aron Lurie's method, see details at: 92 | https://bit.ly/2v6M3X0 93 | """ 94 | repo = Repo(repo_path) 95 | 96 | include_pats = include.split(",") if include else [] 97 | exclude_pats = exclude.split(",") if exclude else [] 98 | filter_files = _gen_filter_files_func(include_pats, exclude_pats) 99 | 100 | s = start("counting togetherness") 101 | togetherness = Counter() 102 | file_to_id = dict() 103 | id_to_file = dict() 104 | i = 0 105 | for related_files in iter_files_per_commit(repo, limit): 106 | related_files_by_id = [] 107 | for f in filter_files(related_files): 108 | try: 109 | related_files_by_id.append(file_to_id[f]) 110 | except KeyError: 111 | related_files_by_id.append(i) 112 | file_to_id[f] = i 113 | id_to_file[i] = f 114 | i += 1 115 | for edge in combinations(related_files_by_id, 2): 116 | togetherness[edge] += 1 117 | finish(s) 118 | 119 | s = start("building networkit graph") 120 | g = graph.Graph(weighted=True) 121 | for i in range(len(file_to_id)): 122 | g.addNode() 123 | 124 | for e, t in togetherness.items(): 125 | g.addEdge(e[0], e[1], 1 / t) 126 | finish(s) 127 | 128 | s = start("computing betweenness") 129 | # accurate, slow calculation 130 | b = centrality.Betweenness(g, normalized=True) 131 | # TODO - maybe allow toggling between accurate and estimate methods 132 | # faster but not as precise (10x better in a benchmark test) 133 | # b = networkit.centrality.EstimateBetweenness(g, 128, normalized=True, parallel=True) 134 | b.run() 135 | bb = b.ranking() 136 | finish(s) 137 | 138 | if export: 139 | raise NotImplementedError("Not implemented for networkit") 140 | # TODO implement networkit based export 141 | # consider need for node id to filename conversion 142 | # s = start("saving graph to %s" % export) 143 | # networkx.set_node_attributes(graph, values=bb, name="betweenness") 144 | # networkx.write_graphml(graph, export) 145 | # finish(s) 146 | 147 | for hit in _top_x_hits(bb, count, show_raw_scores): 148 | if bare: 149 | print(f"{id_to_file[hit[1]]}") 150 | else: 151 | print(f"{hit[0]}\t{id_to_file[hit[1]]}") 152 | 153 | 154 | if __name__ == "__main__": 155 | parser = argparse.ArgumentParser(description=main.__doc__) 156 | parser.add_argument("-v", "--verbose", action="store_true", help="Show timings") 157 | parser.add_argument( 158 | "-n", 159 | "--num-results", 160 | default=24, 161 | type=int, 162 | metavar="N", 163 | help="Return only top N results", 164 | ) 165 | parser.add_argument( 166 | "-c", 167 | "--commits", 168 | default=None, 169 | type=int, 170 | metavar="N", 171 | help="Max number of commits to traverse", 172 | ) 173 | parser.add_argument( 174 | "-b", 175 | "--bare", 176 | action="store_true", 177 | help="Return sorted filenames (without scores)", 178 | ) 179 | # TODO implement networkit based export 180 | # parser.add_argument( 181 | # "-e", "--export", type=str, metavar="FILE", help="Save graph in GraphML format" 182 | # ) 183 | parser.add_argument( 184 | "-r", 185 | "--raw", 186 | action="store_true", 187 | help="Show raw scores (default is percentage rank)", 188 | ) 189 | parser.add_argument( 190 | "--include", 191 | help="Include files in repo matching glob pattern(s) (comma separated)", 192 | ) 193 | parser.add_argument( 194 | "--exclude", 195 | help="Exclude files in repo matching glob pattern(s) (comma separated)", 196 | ) 197 | parser.add_argument("repo", help="Path to target repository") 198 | args = parser.parse_args() 199 | verbose = args.verbose 200 | main( 201 | args.repo, 202 | count=args.num_results, 203 | limit=args.commits, 204 | bare=args.bare, 205 | export=False, 206 | show_raw_scores=args.raw, 207 | include=args.include, 208 | exclude=args.exclude, 209 | ) 210 | -------------------------------------------------------------------------------- /graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeffgreenca/codesort/346c83857219f1e89cc71e0e0649a1e2171107e1/graph.png -------------------------------------------------------------------------------- /sonar-project.properties: -------------------------------------------------------------------------------- 1 | sonar.projectKey=jeffgreenca_codesort 2 | sonar.organization=jeffgreenca-github 3 | sonar.sources=. 4 | sonar.host.url=https://sonarcloud.io 5 | sonar.python.coverage.reportPath=coverage.xml 6 | -------------------------------------------------------------------------------- /test_codesort.py: -------------------------------------------------------------------------------- 1 | # unit tests (via pytest) 2 | from unittest.mock import MagicMock 3 | import sys 4 | 5 | # TODO only mock networkit (as it is a lengthy install) 6 | sys.modules["git"] = MagicMock() 7 | sys.modules["networkit"] = MagicMock() 8 | import codesort 9 | 10 | del sys.modules["git"] 11 | del sys.modules["networkit"] 12 | 13 | 14 | def test_noop(): 15 | assert True 16 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py36 3 | skipsdist=True 4 | 5 | [testenv] 6 | deps=pytest-cov 7 | commands=pytest --cov=codesort --cov-report=term --cov-report=xml 8 | 9 | [testenv:lint] 10 | deps=black 11 | commands=black . 12 | 13 | [testenv:lint-check] 14 | deps=black 15 | commands=black --diff --check . 16 | --------------------------------------------------------------------------------