├── .github └── workflows │ └── ci-cd.yml ├── Dockerfile ├── LICENSE ├── README.md ├── container ├── root │ └── .config │ │ └── matplotlib │ │ └── matplotlibrc └── run.sh ├── docker-compose.yml ├── git_of_theseus ├── __init__.py ├── analyze.py ├── line_plot.py ├── stack_plot.py ├── survival_plot.py └── utils.py ├── pics ├── git-angular-survival.png ├── git-angular.png ├── git-git-authors-normalized.png ├── git-git-survival-exp-fit.png ├── git-git-survival.png ├── git-git.png ├── git-httpd-survival.png ├── git-httpd.png ├── git-kubernetes-authors.png ├── git-linux-survival.png ├── git-linux.png ├── git-node-survival.png ├── git-node.png ├── git-projects-survival-exp-fit.png ├── git-projects-survival.png ├── git-rails-survival.png ├── git-rails.png ├── git-rust.png ├── git-tensorflow.png └── trireme.jpg └── setup.py /.github/workflows/ci-cd.yml: -------------------------------------------------------------------------------- 1 | name: Build and run 2 | 3 | on: [push] 4 | 5 | env: 6 | DISPLAY: ":99.0" 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 14 | name: Tests on Python ${{ matrix.python-version }} 15 | steps: 16 | - name: Checkout Repository 17 | uses: actions/checkout@v3 18 | 19 | - name: Install Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install package 25 | run: | 26 | pip install . 
27 | pip install scipy 28 | 29 | - name: Run tests 30 | run: | 31 | git clone https://github.com/erikbern/git-of-theseus 32 | git-of-theseus-analyze git-of-theseus --outdir got 33 | git-of-theseus-stack-plot got/cohorts.json 34 | git-of-theseus-stack-plot got/cohorts.json --normalize 35 | git-of-theseus-stack-plot got/exts.json 36 | git-of-theseus-stack-plot got/authors.json 37 | git-of-theseus-line-plot got/authors.json 38 | git-of-theseus-line-plot got/dirs.json 39 | git-of-theseus-survival-plot got/survival.json --exp-fit 40 | git-of-theseus-analyze --help 41 | git-of-theseus-stack-plot --help 42 | git-of-theseus-survival-plot --help 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7.12 2 | MAINTAINER Jim DeLois 3 | 4 | COPY ./container/ / 5 | COPY ./ /got/ 6 | 7 | RUN pip install -e /got && \ 8 | apt-get update -q && \ 9 | apt-get install -yqq git 10 | 11 | #VOLUME ["/output"] 12 | 13 | WORKDIR /got/ 14 | 15 | CMD ["python"] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2012-2016 Spotify AB 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![pypi badge](https://img.shields.io/pypi/v/git-of-theseus.svg?style=flat)](https://pypi.python.org/pypi/git-of-theseus) 2 | 3 | Some scripts to analyze Git repos. Produces cool looking graphs like this (running it on [git](https://github.com/git/git) itself): 4 | 5 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-git.png) 6 | 7 | Installing 8 | ---------- 9 | 10 | Run `pip install git-of-theseus` 11 | 12 | Running 13 | ------- 14 | 15 | First, you need to run `git-of-theseus-analyze <path to repo>` (see `git-of-theseus-analyze --help` for a bunch of config). This will analyze a repository and might take quite some time. 16 | 17 | After that, you can generate plots! Some examples: 18 | 19 | 1. Run `git-of-theseus-stack-plot cohorts.json` will create a stack plot showing the total amount of code broken down into cohorts (what year the code was added) 20 | 1. 
Run `git-of-theseus-line-plot authors.json --normalize` will show a plot of the % of code contributed by the top 20 authors 21 | 1. Run `git-of-theseus-survival-plot survival.json` 22 | 23 | You can run `--help` to see various options. 24 | 25 | If you want to plot multiple repositories, you have to run `git-of-theseus-analyze` separately for each project and store the data in separate directories using the `--outdir` flag. Then you can run `git-of-theseus-survival-plot <foo/survival.json> <bar/survival.json>` (optionally with the `--exp-fit` flag to fit an exponential decay) 26 | 27 | Help 28 | ---- 29 | 30 | `AttributeError: Unknown property labels` – upgrade matplotlib if you are seeing this. `pip install matplotlib --upgrade` 31 | 32 | Some pics 33 | --------- 34 | 35 | Survival of a line of code in a set of interesting repos: 36 | 37 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-projects-survival.png) 38 | 39 | This curve is produced by the `git-of-theseus-survival-plot` script and shows the *percentage of lines in a commit that are still present after x years*. It aggregates it over all commits, no matter what point in time they were made. So for *x=0* it includes all commits, whereas for *x>0* not all commits are counted (because we would have to look into the future for some of them). The survival curves are estimated using [Kaplan-Meier](https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator). 40 | 41 | You can also add an exponential fit: 42 | 43 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-projects-survival-exp-fit.png) 44 | 45 | Linux – stack plot: 46 | 47 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-linux.png) 48 | 49 | This curve is produced by the `git-of-theseus-stack-plot` script and shows the total number of lines in a repo broken down into cohorts by the year the code was added. 
50 | 51 | Node – stack plot: 52 | 53 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-node.png) 54 | 55 | Rails – stack plot: 56 | 57 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-rails.png) 58 | 59 | Tensorflow – stack plot: 60 | 61 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-tensorflow.png) 62 | 63 | Rust – stack plot: 64 | 65 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-rust.png) 66 | 67 | Plotting other stuff 68 | -------------------- 69 | 70 | `git-of-theseus-analyze` will write `exts.json`, `cohorts.json` and `authors.json`. You can run `git-of-theseus-stack-plot authors.json` to plot author statistics as well, or `git-of-theseus-stack-plot exts.json` to plot file extension statistics. For author statistics, you might want to create a [.mailmap](https://git-scm.com/docs/gitmailmap) file in the root directory of the repository to deduplicate authors. If you need to create a .mailmap file the following command can list the distinct author-email combinations in a repository: 71 | 72 | Mac / Linux 73 | 74 | ```shell 75 | git log --pretty=format:"%an %ae" | sort | uniq 76 | ``` 77 | 78 | Windows Powershell 79 | 80 | ```powershell 81 | git log --pretty=format:"%an %ae" | Sort-Object | Select-Object -Unique 82 | ``` 83 | 84 | For instance, here's the author statistics for [Kubernetes](https://github.com/kubernetes/kubernetes): 85 | 86 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-kubernetes-authors.png) 87 | 88 | You can also normalize it to 100%. Here's author statistics for Git: 89 | 90 | ![git](https://raw.githubusercontent.com/erikbern/git-of-theseus/master/pics/git-git-authors-normalized.png) 91 | 92 | Other stuff 93 | ----------- 94 | 95 | [Markovtsev Vadim](https://twitter.com/tmarkhor) implemented a very similar analysis that claims to be 20%-6x faster than Git of Theseus. 
It's named [Hercules](https://github.com/src-d/hercules) and there's a great [blog post](https://web.archive.org/web/20180918135417/https://blog.sourced.tech/post/hercules.v4/) about all the complexity going into the analysis of Git history. 96 | -------------------------------------------------------------------------------- /container/root/.config/matplotlib/matplotlibrc: -------------------------------------------------------------------------------- 1 | backend: agg -------------------------------------------------------------------------------- /container/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python /got/analyze.py /subject 4 | 5 | python /got/stack_plot.py --outfile=/output/stack_plot.png /got/cohorts.json 6 | 7 | CMD="python /got/survival_plot.py" 8 | 9 | if [ "$GOT_SURVIVAL_YEARS" ]; then 10 | CMD="${CMD} --years=${GOT_SURVIVAL_YEARS}" 11 | fi 12 | 13 | if [ "$GOT_SURVIVAL_FIT" ]; then 14 | CMD="${CMD} --exp-fit" 15 | fi 16 | 17 | CMD="${CMD} --outfile=/output/survival_plot.png /got/survival.json" 18 | $CMD 19 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | got: 2 | build: . 
3 | volumes: 4 | - ${GOT_REPOSITORY}:/subject 5 | - ./output:/output 6 | command: "/run.sh" 7 | environment: 8 | GOT_SURVIVAL_FIT: ${GOT_SURVIVAL_FIT} 9 | GOT_SURVIVAL_YEARS: ${GOT_SURVIVAL_YEARS} 10 | stdin_open: true 11 | tty: true -------------------------------------------------------------------------------- /git_of_theseus/__init__.py: -------------------------------------------------------------------------------- 1 | from git_of_theseus.analyze import analyze, analyze_cmdline 2 | from git_of_theseus.survival_plot import survival_plot, survival_plot_cmdline 3 | from git_of_theseus.stack_plot import stack_plot, stack_plot_cmdline 4 | from git_of_theseus.line_plot import line_plot, line_plot_cmdline 5 | -------------------------------------------------------------------------------- /git_of_theseus/analyze.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright 2016 Erik Bernhardsson 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import datetime 19 | import functools 20 | import json 21 | import multiprocessing 22 | import os 23 | import signal 24 | import warnings 25 | from pathlib import Path 26 | 27 | import git 28 | import pygments.lexers 29 | from tqdm import tqdm 30 | from wcmatch import fnmatch 31 | 32 | # Some filetypes in Pygments are not necessarily computer code, but configuration/documentation. 
def get_top_dir(path):
    """Return the leading directory component of `path`, followed by "/".

    Only the directory part of `path` is considered, so a path without any
    directory part (e.g. "setup.py") yields just "/".
    """
    # Git/GitPython on Windows also returns paths with '/'s, so "/" is the
    # only separator that has to be handled when picking the first component.
    parent_dir = os.path.dirname(path)
    first_component, _, _ = parent_dir.partition("/")
    return first_component + "/"
class BlameDriver:
    """Drive a pool of `BlameProc` workers and accumulate their results.

    Jobs are sent to the workers as ``(path, commit_hexsha)`` tuples on `q`;
    each worker answers with ``(path, histogram)`` on `ret_q`. The histograms
    are folded into `cur_y` (running totals per key tuple) and remembered per
    file in `last_file_y`. Both dicts are supplied by the caller and mutated
    in place.
    """

    def __init__(
        self,
        repo_dir,
        proc_count,
        last_file_y,
        cur_y,
        blame_kwargs,
        commit2cohort,
        use_mailmap,
        quiet,
    ):
        """Store the configuration, create the queues and start the workers.

        Args:
            repo_dir: Repository location handed to every `BlameProc`.
            proc_count: Desired number of worker processes.
            last_file_y: Dict updated with the latest histogram of each file.
            cur_y: Dict updated with the summed line counts per key tuple.
            blame_kwargs: Extra keyword arguments forwarded to the workers.
            commit2cohort: Commit-to-cohort mapping forwarded to the workers.
            use_mailmap: Forwarded to the workers.
            quiet: When true, no progress messages are printed.
        """
        self.repo_dir = repo_dir
        self.proc_count = proc_count
        self.q = multiprocessing.Queue()
        self.ret_q = multiprocessing.Queue()
        # Workers only pick up jobs while this flag is set (see pause/resume).
        self.run_flag = multiprocessing.Event()
        self.run_flag.set()
        self.last_file_y = last_file_y
        self.cur_y = cur_y
        self.blame_kwargs = blame_kwargs
        self.commit2cohort = commit2cohort
        self.use_mailmap = use_mailmap
        self.quiet = quiet
        self.proc_pool = []
        # The pool is empty here, so there is never anything to shut down.
        # Pass the flag explicitly instead of leaking `proc_count` into the
        # boolean `spawn_only` parameter.
        self.spawn_process(spawn_only=True)

    def spawn_process(self, spawn_only=False):
        """Bring the pool size in line with `self.proc_count`.

        Missing workers are started. Surplus workers are shut down unless
        `spawn_only` is true, in which case the surplus is left untouched.
        """
        n = self.proc_count - len(self.proc_pool)
        if n == 0:
            return
        if n < 0:
            if not spawn_only:
                self._despawn_process(-n)
            return
        if not self.quiet:
            print("\n\nStarting up processes: ", end="")
        for i in range(n):
            proc = BlameProc(
                self.repo_dir,
                self.q,
                self.ret_q,
                self.run_flag,
                self.blame_kwargs,
                self.commit2cohort,
                self.use_mailmap,
            )
            self.proc_pool.append(proc)
            proc.start()
            if not self.quiet:
                print(
                    ("" if i == 0 else ", ") + proc.name,
                    end="\n" if i == n - 1 else "",
                )

    def _despawn_process(self, n):
        """Ask `n` workers to exit, wait for them and drop them from the pool.

        One ``(None, None)`` sentinel is queued per worker to stop; whichever
        workers pick the sentinels up are the ones that terminate.
        """
        # Never wait for more workers than the pool holds; that could not
        # terminate.
        n = min(n, len(self.proc_pool))
        for _ in range(n):
            self.q.put((None, None))

        while True:
            exited = [proc for proc in self.proc_pool if not proc.is_alive()]
            if len(exited) >= n:
                break
            # Block briefly on a live worker instead of spinning on is_alive().
            for proc in self.proc_pool:
                if proc.is_alive():
                    proc.join(timeout=0.05)
                    break

        for proc in exited:
            proc.join()
        self.proc_pool = [proc for proc in self.proc_pool if proc not in exited]
        if not self.quiet:
            print(
                "\n\nShutting down processes: "
                + ", ".join(proc.name for proc in exited)
            )

    def fetch(self, commit, check_entries, bar):
        """Blame every entry of `check_entries` at `commit` and merge the results.

        Args:
            commit: Object whose `hexsha` identifies the commit to blame at.
            check_entries: Objects whose `path` names the files to blame.
            bar: Progress bar; `update()` is called once per finished file.

        Returns:
            `self.cur_y`, after the histogram of every entry has been added.
        """
        self.spawn_process()
        total_entries = len(check_entries)

        for entry in check_entries:
            self.q.put((entry.path, commit.hexsha))

        for _ in range(total_entries):
            path, file_y = self.ret_q.get()

            for key_tuple, file_locs in file_y.items():
                self.cur_y[key_tuple] = self.cur_y.get(key_tuple, 0) + file_locs
            self.last_file_y[path] = file_y

            # While paused, hold here so the progress bar stays still too.
            self.run_flag.wait()
            bar.update()

        return self.cur_y

    def pause(self):
        """Clear the run flag so that waiting on it blocks."""
        self.run_flag.clear()

    def resume(self):
        """Set the run flag again, releasing everything waiting on it."""
        self.run_flag.set()
254 | blame_kwargs = {} 255 | if ignore_whitespace: 256 | blame_kwargs["w"] = True 257 | master_commits = [] # only stores a subset 258 | commit2cohort = {} 259 | curve_key_tuples = set() # Keys of each curve that will be tracked 260 | tqdm_args = { 261 | "smoothing": 0.025, # Exponential smoothing is still rather jumpy, a tiny number will do 262 | "disable": quiet, 263 | "dynamic_ncols": True, 264 | } 265 | 266 | if not os.path.exists(outdir): 267 | os.makedirs(outdir) 268 | 269 | # Check if specified branch exists 270 | try: 271 | repo.git.show_ref("refs/heads/{:s}".format(branch), verify=True) 272 | except git.exc.GitCommandError: 273 | default_branch = repo.active_branch.name 274 | warnings.warn( 275 | "Requested branch: '{:s}' does not exist. Falling back to default branch: '{:s}'".format( 276 | branch, default_branch 277 | ) 278 | ) 279 | 280 | branch = default_branch 281 | 282 | if not quiet and repo.git.version_info < (2, 31, 0): 283 | print( 284 | "Old Git version {:d}.{:d}.{:d} detected. There are optimizations available in version 2.31.0 which speed up performance".format( 285 | *repo.git.version_info 286 | ) 287 | ) 288 | 289 | if opt: 290 | if not quiet: 291 | print( 292 | "Generating git commit-graph... 
If you wish, this file is deletable later at .git/objects/info" 293 | ) 294 | repo.git.execute( 295 | ["git", "commit-graph", "write", "--changed-paths"] 296 | ) # repo.git.commit_graph('write --changed-paths') doesn't work for some reason 297 | 298 | desc = "{:<55s}".format("Listing all commits") 299 | for commit in tqdm( 300 | repo.iter_commits(branch), desc=desc, unit=" Commits", **tqdm_args 301 | ): 302 | cohort = datetime.datetime.utcfromtimestamp(commit.committed_date).strftime( 303 | cohortfm 304 | ) 305 | commit2cohort[commit.binsha] = cohort 306 | curve_key_tuples.add(("cohort", cohort)) 307 | if use_mailmap: 308 | author_name, author_email = get_mailmap_author_name_email( 309 | repo, commit.author.name, commit.author.email 310 | ) 311 | else: 312 | author_name, author_email = commit.author.name, commit.author.email 313 | curve_key_tuples.add(("author", author_name)) 314 | curve_key_tuples.add(("domain", author_email.split("@")[-1])) 315 | 316 | desc = "{:<55s}".format("Backtracking the master branch") 317 | with tqdm(desc=desc, unit=" Commits", **tqdm_args) as bar: 318 | commit = repo.head.commit 319 | last_date = None 320 | while True: 321 | if last_date is None or commit.committed_date < last_date - interval: 322 | master_commits.append(commit) 323 | last_date = commit.committed_date 324 | bar.update() 325 | if not commit.parents: 326 | break 327 | commit = commit.parents[0] 328 | del commit 329 | 330 | if ignore and not only: 331 | only = ["**"] # stupid fix 332 | def_ft_str = "+({:s})".format("|".join(default_filetypes)) 333 | path_match_str = "{:s}|!+({:s})".format("|".join(only), "|".join(ignore)) 334 | path_match_zero = len(only) == 0 and len(ignore) == 0 335 | ok_entry_paths = dict() 336 | all_entries = [] 337 | 338 | def entry_path_ok(path): 339 | # All this matching is slow so let's cache it 340 | if path not in ok_entry_paths: 341 | ok_entry_paths[path] = ( 342 | all_filetypes 343 | or fnmatch.fnmatch( 344 | os.path.split(path)[-1], def_ft_str, 
flags=fnmatch.EXTMATCH 345 | ) 346 | ) and ( 347 | path_match_zero 348 | or fnmatch.fnmatch( 349 | path, 350 | path_match_str, 351 | flags=fnmatch.NEGATE | fnmatch.EXTMATCH | fnmatch.SPLIT, 352 | ) 353 | ) 354 | return ok_entry_paths[path] 355 | 356 | def get_entries(commit): 357 | tmp = [ 358 | MiniEntry(entry) 359 | for entry in commit.tree.traverse() 360 | if entry.type == "blob" and entry_path_ok(entry.path) 361 | ] 362 | all_entries.append(tmp) 363 | return tmp 364 | 365 | master_commits = master_commits[::-1] # Reverse it so it's chnological ascending 366 | entries_total = 0 367 | desc = "{:<55s}".format("Discovering entries & caching filenames") 368 | with tqdm( 369 | desc="{:<55s}".format("Entries Discovered"), 370 | unit=" Entries", 371 | position=1, 372 | **tqdm_args, 373 | ) as bar: 374 | for i, commit in enumerate( 375 | tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args) 376 | ): 377 | for entry in get_entries(commit): 378 | entries_total += 1 379 | _, ext = os.path.splitext(entry.path) 380 | curve_key_tuples.add(("ext", ext)) 381 | curve_key_tuples.add(("dir", get_top_dir(entry.path))) 382 | bar.update() 383 | master_commits[i] = MiniCommit( 384 | commit 385 | ) # Might have cached the entries, we don't want that 386 | 387 | # We don't need these anymore, let GC Cleanup 388 | del repo 389 | del ok_entry_paths 390 | del commit 391 | # End GC Cleanup 392 | 393 | curves = {} # multiple y axis, in the form key_tuple: Array[y-axis points] 394 | ts = [] # x axis 395 | last_file_y = ( 396 | {} 397 | ) # Contributions of each individual file to each individual curve, when the file was last seen 398 | cur_y = {} # Sum of all contributions between files towards each individual curve 399 | blamer = BlameDriver( 400 | repo_dir, 401 | procs, 402 | last_file_y, 403 | cur_y, 404 | blame_kwargs, 405 | commit2cohort, 406 | use_mailmap, 407 | quiet, 408 | ) 409 | commit_history = ( 410 | {} 411 | ) # How many lines of a commit (by SHA) still exist 
at a given time 412 | last_file_hash = {} # File SHAs when they were last seen 413 | 414 | # Allow script to be paused and process count to change 415 | def handler(a, b): 416 | try: 417 | blamer.pause() 418 | print("\n\nProcess paused") 419 | x = int( 420 | input( 421 | "0. Exit\n1. Continue\n2. Modify process count\nSelect an option: " 422 | ) 423 | ) 424 | 425 | if x == 1: 426 | return blamer.resume() 427 | elif x == 2: 428 | x = int( 429 | input( 430 | "\n\nCurrent Processes: {:d}\nNew Setting: ".format( 431 | blamer.proc_count 432 | ) 433 | ) 434 | ) 435 | if x > 0: 436 | blamer.proc_count = x 437 | blamer.spawn_process(spawn_only=True) 438 | return blamer.resume() 439 | os._exit(1) # sys.exit() does weird things 440 | except: 441 | pass 442 | handler(None, None) 443 | 444 | if not quiet: 445 | signal.signal(signal.SIGINT, handler) 446 | 447 | desc = "{:<55s}".format( 448 | "Analyzing commit history with {:d} processes".format(procs) 449 | ) 450 | with tqdm( 451 | desc="{:<55s}".format("Entries Processed"), 452 | total=entries_total, 453 | unit=" Entries", 454 | position=1, 455 | maxinterval=1, 456 | miniters=100, 457 | **tqdm_args, 458 | ) as bar: 459 | cbar = tqdm(master_commits, desc=desc, unit=" Commits", position=0, **tqdm_args) 460 | for commit in cbar: 461 | t = datetime.datetime.utcfromtimestamp(commit.committed_date) 462 | ts.append(t) # x axis 463 | 464 | # START: Fast diff, to reduce no. of files checked via blame. 
465 | # File hashes are checked against previous iteration 466 | entries = all_entries.pop( 467 | 0 468 | ) # all_entries grows smaller as curves grows larger 469 | 470 | check_entries = [] 471 | cur_file_hash = {} 472 | for entry in entries: 473 | cur_file_hash[entry.path] = entry.binsha 474 | if entry.path in last_file_hash: 475 | if last_file_hash[entry.path] != entry.binsha: # Modified file 476 | for key_tuple, count in last_file_y[entry.path].items(): 477 | cur_y[key_tuple] -= count 478 | check_entries.append(entry) 479 | else: # Identical file 480 | bar.update() 481 | del last_file_hash[ 482 | entry.path 483 | ] # Identical/Modified file removed, leaving deleted files behind 484 | else: # Newly added file 485 | check_entries.append(entry) 486 | for deleted_path in last_file_hash.keys(): # Deleted files 487 | for key_tuple, count in last_file_y[deleted_path].items(): 488 | cur_y[key_tuple] -= count 489 | last_file_hash = cur_file_hash 490 | # END: Fast diff 491 | 492 | # Multiprocess blame checker, updates cur_y & last_file_y 493 | blamer.fetch(commit, check_entries, bar) 494 | cbar.set_description( 495 | "{:<55s}".format( 496 | "Analyzing commit history with {:d} processes".format( 497 | len(blamer.proc_pool) 498 | ) 499 | ), 500 | False, 501 | ) 502 | 503 | for key_tuple, count in cur_y.items(): 504 | key_category, key = key_tuple 505 | if key_category == "sha": 506 | commit_history.setdefault(key, []).append( 507 | (commit.committed_date, count) 508 | ) 509 | 510 | for key_tuple in curve_key_tuples: 511 | curves.setdefault(key_tuple, []).append(cur_y.get(key_tuple, 0)) 512 | 513 | signal.signal(signal.SIGINT, signal.default_int_handler) 514 | 515 | def dump_json(output_fn, key_type, label_fmt=lambda x: x): 516 | key_items = sorted(k for t, k in curve_key_tuples if t == key_type) 517 | fn = os.path.join(outdir, output_fn) 518 | if not quiet: 519 | print("Writing %s data to %s" % (key_type, fn)) 520 | f = open(fn, "w") 521 | json.dump( 522 | { 523 | "y": 
@functools.lru_cache(maxsize=None)
def get_mailmap_author_name_email(repo, author_name, author_email):
    """Return the ``(name, email)`` pair for an author after applying .mailmap.

    The identity is formatted as ``"name <email>"``, passed to
    ``repo.git.check_mailmap`` and the answer is split back into its two
    parts. Results are memoized per ``(repo, author_name, author_email)``.

    NOTE(review): the cache is unbounded and keyed on ``repo``, so it keeps
    that object alive for the lifetime of the process.
    NOTE(review): the parsing assumes the reply has the form
    ``"Name <email>"``; a reply without ``" <"`` raises ``ValueError`` on
    unpacking -- confirm whether that can happen for empty author names.
    """
    pre_mailmap_author_email = f"{author_name} <{author_email}>"
    mail_mapped_author_email: str = repo.git.check_mailmap(pre_mailmap_author_email)
    # Drop the trailing ">" and split on the first " <" separating name and email.
    mailmap_name, mailmap_email = mail_mapped_author_email[:-1].split(" <", maxsplit=1)
    return mailmap_name, mailmap_email


def analyze_cmdline():
    """Parse the command line and run ``analyze`` with the parsed options.

    Every option is forwarded to ``analyze`` as a keyword argument named
    after the option. A ``KeyboardInterrupt`` ends the process with exit
    status 1; any other exception propagates unchanged.
    """
    import sys  # local import: the file's top-level import block is not visible here

    parser = argparse.ArgumentParser(description="Analyze git repo")
    parser.add_argument(
        "--cohortfm",
        default="%Y",
        type=str,
        help='A Python datetime format string such as "%%Y" for creating cohorts (default: %(default)s)',
    )
    parser.add_argument(
        "--interval",
        default=7 * 24 * 60 * 60,
        type=int,
        help="Min difference between commits to analyze (default: %(default)ss)",
    )
    parser.add_argument(
        "--ignore",
        default=[],
        action="append",
        help="File patterns that should be ignored (can provide multiple, will each subtract independently). Uses glob syntax and generally needs to be shell escaped. For instance, to ignore a subdirectory `foo/`, run `git-of-theseus . --ignore 'foo/**'`.",
    )
    parser.add_argument(
        "--only",
        default=[],
        action="append",
        help="File patterns that can match. Multiple can be provided. If at least one is provided, every file has to match at least one. Uses glob syntax and typically has to be shell escaped. In order to analytize a subdirectory `bar/`, run `git-of-theseus . --only 'bar/**'`",
    )
    parser.add_argument(
        "--outdir",
        default=".",
        help="Output directory to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--branch",
        default="master",
        type=str,
        help="Branch to track (default: %(default)s)",
    )
    # A store_true flag is a boolean; the previous ``default=[]`` only worked
    # because an empty list happens to be falsy.
    parser.add_argument(
        "--ignore-whitespace",
        action="store_true",
        help="Ignore whitespace changes when running git blame.",
    )
    parser.add_argument(
        "--all-filetypes",
        action="store_true",
        help="Include all files (if not set then will only analyze %s"
        % ",".join(default_filetypes),
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Disable all console output (default: %(default)s)",
    )
    parser.add_argument(
        "--procs",
        default=2,
        type=int,
        help="Number of processes to use. There is a point of diminishing returns, and RAM may become an issue on large repos (default: %(default)s)",
    )
    parser.add_argument(
        "--opt",
        action="store_true",
        help="Generates git commit-graph; Improves performance at the cost of some (~80KB/kCommit) disk space (default: %(default)s)",
    )
    parser.add_argument("repo_dir")
    kwargs = vars(parser.parse_args())

    try:
        analyze(**kwargs)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to stop: exit quietly with a failure code.
        sys.exit(1)


if __name__ == "__main__":
    analyze_cmdline()
# -*- coding: utf-8 -*-
#
# Copyright 2016 Erik Bernhardsson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import matplotlib

matplotlib.use("Agg")

import argparse
import json
import sys

import dateutil.parser
import numpy
from matplotlib import pyplot

from .utils import generate_n_colors


def line_plot(
    input_fn, display=False, outfile="line_plot.png", max_n=20, normalize=False
):
    """Draw one line per data series from a JSON file and save the figure.

    The JSON document must provide the keys ``"y"`` (one list of values per
    series), ``"ts"`` (timestamps parseable by ``dateutil``) and ``"labels"``
    (one label per series).

    Args:
        input_fn: Path of the JSON file to read.
        display: If true, call ``pyplot.show()`` after saving.
        outfile: Path the figure is written to.
        max_n: Maximum number of series to draw. When there are more, only
            the ``max_n`` series with the largest peak value are kept, and
            they are drawn in label order.
        normalize: If true, plot each series as a percentage of the sum of
            ALL series (the sum is taken before the ``max_n`` cut).
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(input_fn) as f:
        data = json.load(f)  # TODO do we support multiple arguments here?
    y = numpy.array(data["y"])
    # Column totals over every series, taken before any series is dropped.
    y_sums = numpy.sum(y, axis=0)
    if y.shape[0] > max_n:
        js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
        top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
        y = numpy.array([y[j] for j in top_js])
        labels = [data["labels"][j] for j in top_js]
    else:
        labels = data["labels"]
    if normalize:
        y = 100.0 * y / y_sums
    pyplot.figure(figsize=(16, 12), dpi=120)
    pyplot.style.use("ggplot")
    ts = [dateutil.parser.parse(t) for t in data["ts"]]
    colors = generate_n_colors(len(labels))
    for color, label, series in zip(colors, labels, y):
        pyplot.plot(ts, series, color=color, label=label, linewidth=3)
    pyplot.legend(loc=2)
    if normalize:
        pyplot.ylabel("Share of lines of code (%)")
        pyplot.ylim([0, 100])
    else:
        pyplot.ylabel("Lines of code")
    # The layout must be adjusted BEFORE saving, otherwise the file on disk
    # does not get the tight layout.
    pyplot.tight_layout()
    print("Writing output to %s" % outfile)
    pyplot.savefig(outfile)
    if display:
        pyplot.show()


def line_plot_cmdline():
    """Parse the command line and forward the options to ``line_plot``."""
    parser = argparse.ArgumentParser(description="Plot line plot")
    parser.add_argument("--display", action="store_true", help="Display plot")
    parser.add_argument(
        "--outfile",
        default="line_plot.png",
        type=str,
        help="Output file to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--max-n",
        default=20,
        type=int,
        help="Max number of dataseries (default: %(default)s)",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Plot the share of each, so it adds up to 100%%",
    )
    parser.add_argument("input_fn")
    kwargs = vars(parser.parse_args())

    line_plot(**kwargs)


if __name__ == "__main__":
    line_plot_cmdline()
# -*- coding: utf-8 -*-
#
# Copyright 2016 Erik Bernhardsson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import matplotlib

matplotlib.use("Agg")

import argparse
import json
import sys

import dateutil.parser
import numpy
from matplotlib import pyplot

from .utils import generate_n_colors


def stack_plot(
    input_fn, display=False, outfile="stack_plot.png", max_n=20, normalize=False
):
    """Draw a stacked area chart from a JSON file and save the figure.

    The JSON document must provide the keys ``"y"`` (one list of values per
    series), ``"ts"`` (timestamps parseable by ``dateutil``) and ``"labels"``
    (one label per series).

    Args:
        input_fn: Path of the JSON file to read.
        display: If true, call ``pyplot.show()`` after saving.
        outfile: Path the figure is written to.
        max_n: Maximum number of individual series. When there are more, the
            ``max_n`` series with the largest peak value are kept (in label
            order) and the remaining ones are summed into an ``"other"``
            series that is stacked last.
        normalize: If true, scale every point in time so the stack adds up
            to 100.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(input_fn) as f:
        data = json.load(f)  # TODO do we support multiple arguments here?
    y = numpy.array(data["y"])
    if y.shape[0] > max_n:
        js = sorted(range(len(data["labels"])), key=lambda j: max(y[j]), reverse=True)
        # Sum the dropped rows with an explicit axis; calling numpy.sum() on a
        # generator is deprecated.
        other_sum = y[js[max_n:]].sum(axis=0)
        top_js = sorted(js[:max_n], key=lambda j: data["labels"][j])
        y = numpy.array([y[j] for j in top_js] + [other_sum])
        labels = [data["labels"][j] for j in top_js] + ["other"]
    else:
        labels = data["labels"]
    if normalize:
        y = 100.0 * numpy.array(y) / numpy.sum(y, axis=0)
    pyplot.figure(figsize=(16, 12), dpi=120)
    pyplot.style.use("ggplot")
    ts = [dateutil.parser.parse(t) for t in data["ts"]]
    colors = generate_n_colors(len(labels))
    pyplot.stackplot(ts, numpy.array(y), labels=labels, colors=colors)
    pyplot.legend(loc=2)
    if normalize:
        pyplot.ylabel("Share of lines of code (%)")
        pyplot.ylim([0, 100])
    else:
        pyplot.ylabel("Lines of code")
    # The layout must be adjusted BEFORE saving, otherwise the file on disk
    # does not get the tight layout.
    pyplot.tight_layout()
    print("Writing output to %s" % outfile)
    pyplot.savefig(outfile)
    if display:
        pyplot.show()


def stack_plot_cmdline():
    """Parse the command line and forward the options to ``stack_plot``."""
    parser = argparse.ArgumentParser(description="Plot stack plot")
    parser.add_argument("--display", action="store_true", help="Display plot")
    parser.add_argument(
        "--outfile",
        default="stack_plot.png",
        type=str,
        help="Output file to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--max-n",
        default=20,
        type=int,
        help='Max number of dataseries (will roll everything else into "other") (default: %(default)s)',
    )
    parser.add_argument(
        "--normalize", action="store_true", help="Normalize the plot to 100%%"
    )
    parser.add_argument("input_fn")
    kwargs = vars(parser.parse_args())

    stack_plot(**kwargs)


if __name__ == "__main__":
    stack_plot_cmdline()
# -*- coding: utf-8 -*-
#
# Copyright 2016 Erik Bernhardsson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import matplotlib

matplotlib.use("Agg")

import argparse
import collections
import json
import math
import os
import sys

import numpy
from matplotlib import pyplot


def survival_plot(
    input_fns, exp_fit=False, display=False, outfile="survival_plot", years=5
):
    """Plot the percentage of lines still present as a function of age.

    Each input file is a JSON object mapping a commit identifier to a list
    of ``[timestamp, surviving_line_count]`` pairs. One survival curve is
    drawn per input file.

    Args:
        input_fns: Paths of the JSON files to read.
        exp_fit: If true, draw the per-file curves in gray and add a single
            exponential fit over all files (requires SciPy).
        display: If true, call ``pyplot.show()`` after saving.
        outfile: Path the figure is written to.
        years: Upper limit of the x axis, in years.
    """
    all_deltas = []
    YEAR = 365.25 * 24 * 60 * 60  # seconds per year
    pyplot.figure(figsize=(13, 8))
    pyplot.style.use("ggplot")

    for fn in input_fns:
        print("reading %s" % fn)
        # Close the input file deterministically instead of leaking the handle.
        with open(fn) as f:
            commit_history = json.load(f)

        print("counting %d commits" % len(commit_history))
        # deltas[age] = (change in surviving lines, change in lines at risk)
        deltas = collections.defaultdict(lambda: numpy.zeros(2))
        total_n = 0
        for commit, history in commit_history.items():
            t0, orig_count = history[0]
            total_n += orig_count
            last_count = orig_count
            for t, count in history[1:]:
                deltas[t - t0] += (count - last_count, 0)
                last_count = count
            # At its last observation the commit leaves the population.
            deltas[history[-1][0] - t0] += (-last_count, -orig_count)

        all_deltas.append((total_n, deltas))
        print("adding %d deltas..." % len(deltas))
        P = 1.0
        xs = []
        ys = []
        for t in sorted(deltas.keys()):
            delta_k, delta_n = deltas[t]
            xs.append(t / YEAR)
            ys.append(100.0 * P)
            P *= 1 + delta_k / total_n
            total_n += delta_n
            if P < 0.05:
                break

        print("plotting...")
        if exp_fit:
            pyplot.plot(xs, ys, color="darkgray")
        else:
            # Label the curve with the directory part of the path, if any.
            pyplot.plot(xs, ys, label=os.path.dirname(fn) or None)

    def fit(k):
        """Return the squared-error loss of decay rate ``k`` over all inputs."""
        # The optimizer passes a 1-element array; convert it explicitly since
        # implicit array-to-scalar conversion inside math.exp is deprecated.
        k = float(numpy.ravel(k)[0])
        loss = 0.0
        for total_n, deltas in all_deltas:
            P = 1.0
            for t in sorted(deltas.keys()):
                delta_k, delta_n = deltas[t]
                pred = total_n * math.exp(-k * t / YEAR)
                loss += (total_n * P - pred) ** 2
                P *= 1 + delta_k / total_n
                total_n += delta_n
        print(k, loss)
        return loss

    if exp_fit:
        try:
            import scipy.optimize
        except ImportError:
            sys.exit("Scipy is a required dependency when using the --exp-fit flag")

        print("fitting exponential function")
        k = scipy.optimize.fmin(fit, 0.5, maxiter=50)[0]
        ts = numpy.linspace(0, years, 1000)
        ys = [100.0 * math.exp(-k * t) for t in ts]
        pyplot.plot(
            ts,
            ys,
            color="red",
            label="Exponential fit, half-life = %.2f years" % (math.log(2) / k),
        )

    pyplot.xlabel("Years")
    pyplot.ylabel("%")
    pyplot.xlim([0, years])
    pyplot.ylim([0, 100])
    pyplot.title("% of lines still present in code after n years")
    pyplot.legend()
    pyplot.tight_layout()
    pyplot.savefig(outfile)
    if display:
        pyplot.show()


def survival_plot_cmdline():
    """Parse the command line and forward the options to ``survival_plot``."""
    parser = argparse.ArgumentParser(description="Plot survival plot")
    parser.add_argument("--exp-fit", action="store_true", help="Plot exponential fit")
    parser.add_argument("--display", action="store_true", help="Display plot")
    parser.add_argument(
        "--outfile",
        default="survival_plot.png",
        type=str,
        help="Output file to store results (default: %(default)s)",
    )
    parser.add_argument(
        "--years",
        type=float,
        default=5,
        help="Number of years on x axis (default: %(default)s)",
    )
    parser.add_argument("input_fns", nargs="*")
    kwargs = vars(parser.parse_args())

    survival_plot(**kwargs)


if __name__ == "__main__":
    survival_plot_cmdline()
import itertools

import numpy


def generate_n_colors(n):
    """Return ``n`` RGB tuples that are spread far apart from each other.

    The first colour is always ``(0.9, 0.4, 0.4)``. Every further colour is
    picked greedily from a 6x6x6 grid of channel values between 0.4 and 0.9:
    the grid point whose nearest already-chosen colour is farthest away wins
    (ties go to the first grid point in iteration order).

    Args:
        n: Number of colours wanted. Values ``<= 0`` give an empty list.

    Returns:
        A list of ``n`` 3-tuples of channel values.
    """
    if n <= 0:
        # Previously one colour was returned even when none was requested.
        return []

    vs = numpy.linspace(0.4, 0.9, 6)
    # Build the candidate grid once instead of once per chosen colour.
    candidates = list(itertools.product(vs, vs, vs))
    colors = [(0.9, 0.4, 0.4)]

    def squared_distance(a, b):
        # Squared Euclidean distance; the square root is irrelevant for ranking.
        return sum((x - y) ** 2 for x, y in zip(a, b))

    while len(colors) < n:
        new_color = max(
            candidates,
            key=lambda a: min(squared_distance(a, b) for b in colors),
        )
        colors.append(new_color)
    return colors
https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-git-survival.png -------------------------------------------------------------------------------- /pics/git-git.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-git.png -------------------------------------------------------------------------------- /pics/git-httpd-survival.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-httpd-survival.png -------------------------------------------------------------------------------- /pics/git-httpd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-httpd.png -------------------------------------------------------------------------------- /pics/git-kubernetes-authors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-kubernetes-authors.png -------------------------------------------------------------------------------- /pics/git-linux-survival.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-linux-survival.png -------------------------------------------------------------------------------- /pics/git-linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-linux.png 
-------------------------------------------------------------------------------- /pics/git-node-survival.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-node-survival.png -------------------------------------------------------------------------------- /pics/git-node.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-node.png -------------------------------------------------------------------------------- /pics/git-projects-survival-exp-fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-projects-survival-exp-fit.png -------------------------------------------------------------------------------- /pics/git-projects-survival.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-projects-survival.png -------------------------------------------------------------------------------- /pics/git-rails-survival.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-rails-survival.png -------------------------------------------------------------------------------- /pics/git-rails.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/git-of-theseus/961bda027ffa9fcd8bbe99d5b8809cc0eaa86464/pics/git-rails.png -------------------------------------------------------------------------------- 
# Packaging script for git-of-theseus.
#
# ``install_requires`` and ``entry_points`` are setuptools features, and
# ``distutils`` was removed from the standard library in Python 3.12, so
# prefer setuptools and fall back to distutils only where it is missing.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(
    name="git-of-theseus",
    version="0.3.4",
    description="Plot stats on Git repositories",
    author="Erik Bernhardsson",
    author_email="mail@erikbern.com",
    url="https://github.com/erikbern/git-of-theseus",
    packages=["git_of_theseus"],
    install_requires=[
        "gitpython",
        "numpy",
        "tqdm",
        "wcmatch",
        "pygments",
        "matplotlib",
    ],
    # NOTE(review): three of these targets point at the package root, so they
    # presumably rely on git_of_theseus/__init__.py re-exporting the
    # *_cmdline functions -- confirm.
    entry_points={
        "console_scripts": [
            "git-of-theseus-analyze=git_of_theseus.analyze:analyze_cmdline",
            "git-of-theseus-survival-plot=git_of_theseus:survival_plot_cmdline",
            "git-of-theseus-stack-plot=git_of_theseus:stack_plot_cmdline",
            "git-of-theseus-line-plot=git_of_theseus:line_plot_cmdline",
        ]
    },
)