├── __init__.py ├── gitoutput ├── images ├── file_range.png ├── commit_range.png ├── state_filter.png ├── commit_file_range.png └── complete_visual_history.png ├── creation.sh ├── example_githistoryvis.py ├── LICENSE ├── .gitignore ├── git_history_test_git.py ├── README.md └── githistoryvis.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gitoutput: -------------------------------------------------------------------------------- 1 | COMMIT 428b104 2 | M .gitignore 3 | 4 | COMMIT f452ee7 5 | M README.md 6 | M githistoryvis.py 7 | -------------------------------------------------------------------------------- /images/file_range.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpixo/git_history_visualizer/HEAD/images/file_range.png -------------------------------------------------------------------------------- /images/commit_range.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpixo/git_history_visualizer/HEAD/images/commit_range.png -------------------------------------------------------------------------------- /images/state_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpixo/git_history_visualizer/HEAD/images/state_filter.png -------------------------------------------------------------------------------- /images/commit_file_range.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpixo/git_history_visualizer/HEAD/images/commit_file_range.png -------------------------------------------------------------------------------- /images/complete_visual_history.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpixo/git_history_visualizer/HEAD/images/complete_visual_history.png -------------------------------------------------------------------------------- /creation.sh: -------------------------------------------------------------------------------- 1 | # initialize repository 2 | git init 3 | 4 | # do stuff with the files 5 | touch / rm / write 6 | 7 | # extract all the commits 8 | git --no-pager log --reverse --oneline 9 | 10 | # extract all the filenames 11 | git --no-pager log --reverse --name-only --oneline --pretty='format:' | sed '/^$/d' | sort | uniq 12 | 13 | # extract the whole git history 14 | git --no-pager log --reverse --name-status --oneline --pretty='format:COMMIT %h %s' | tr '\t' ' ' | sed -E -e 's/( )+/ /g' -e '/^$/d' 15 | -------------------------------------------------------------------------------- /example_githistoryvis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import githistoryvis as ghv 3 | 4 | # put here the desired git repo path 5 | import os 6 | path = os.getcwd() 7 | 8 | # define the object linked to your repository 9 | # essentially add the gt.path variable 10 | gt = ghv.git_history(path) 11 | # get the history : define gt.all_commits, gt.commits, gt.all_files 12 | gt.get_history() 13 | 14 | # if the git log data are in a file somewhere, 15 | # read the file in a string and pass it 16 | with open('gitoutput', 'r') as file: 17 | data = file.read() 18 | gt.get_history(gitcommitlist=data) 19 | 20 | # Pandas is needed here 21 | # define the datamatrix : define gt.datamatrix, 22 | # a pandas.DataFrame with categorical columns 23 | gt.definedatamatrix() 24 | 25 | # new compact version 26 | gt = ghv.git_history(path, get_history=True, definedatamatrix=True) 27 | 28 | # visualization 29 | import matplotlib 30 | from matplotlib import pyplot as plt 31 | 32 | # play with 
size and figsize to find yours 33 | gt.plot_history_df(plt, gt.datamatrix, size=300, figsize=[12, 10.5], outpath=path+os.sep+'images/complete_visual_history.png') 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 kidpixo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/ipythonnotebook 2 | 3 | ### Vim ### 4 | # swap 5 | [._]*.s[a-w][a-z] 6 | [._]s[a-w][a-z] 7 | # session 8 | Session.vim 9 | # temporary 10 | .netrwhist 11 | *~ 12 | # auto-generated tag files 13 | tags 14 | ### IPythonNotebook ### 15 | # Temporary data 16 | .ipynb_checkpoints 17 | 18 | MANIFEST 19 | build 20 | dist 21 | _build 22 | docs/man/*.gz 23 | docs/source/api/generated 24 | docs/source/config/options 25 | docs/source/interactive/magics-generated.txt 26 | docs/gh-pages 27 | jupyter_notebook/notebook/static/mathjax 28 | jupyter_notebook/static/style/*.map 29 | *.py[co] 30 | __pycache__ 31 | *.egg-info 32 | *~ 33 | *.bak 34 | .ipynb_checkpoints 35 | .tox 36 | .DS_Store 37 | \#*# 38 | .#* 39 | .coverage 40 | # Byte-compiled / optimized / DLL files 41 | __pycache__/ 42 | *.py[cod] 43 | 44 | # C extensions 45 | *.so 46 | 47 | # Distribution / packaging 48 | .Python 49 | env/ 50 | build/ 51 | develop-eggs/ 52 | dist/ 53 | downloads/ 54 | eggs/ 55 | .eggs/ 56 | lib/ 57 | lib64/ 58 | parts/ 59 | sdist/ 60 | var/ 61 | *.egg-info/ 62 | .installed.cfg 63 | *.egg 64 | 65 | # PyInstaller 66 | # Usually these files are written by a python script from a template 67 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
68 | *.manifest 69 | *.spec 70 | 71 | # Installer logs 72 | pip-log.txt 73 | pip-delete-this-directory.txt 74 | 75 | # Unit test / coverage reports 76 | htmlcov/ 77 | .tox/ 78 | .coverage 79 | .coverage.* 80 | .cache 81 | nosetests.xml 82 | coverage.xml 83 | *,cover 84 | 85 | # Translations 86 | *.mo 87 | *.pot 88 | 89 | # Django stuff: 90 | *.log 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | target/ 97 | -------------------------------------------------------------------------------- /git_history_test_git.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[18]: 5 | 6 | import githistoryvis as ghv 7 | 8 | 9 | # ### Gather the data 10 | # 11 | # Githistoryvis exposes the class `git_history`. 12 | # 13 | # The inizialization: 14 | # 15 | # ```python 16 | # foo = git_history(PATH) 17 | # ``` 18 | # sets the attribute `foo.path` that point to the git respository in PATH. 19 | # 20 | # Also `def_states` (and `def_states_explain`) are defined at inizialitation. 21 | # They are used to transform the state in the dataframe to number for visualization and define the legend. 22 | # 23 | # You can overwrite them at your own risk. 
24 | # 25 | # ```python 26 | # def_states = { 27 | # 'A' : 120., 28 | # 'M' : 180., 29 | # 'S' : 255., # custom value, Static 30 | # 'D' : 240., 31 | # 'N' : 128., # custom value, Non existent 32 | # } 33 | # 34 | # def_states_explain = { 35 | # 'A' : 'Added', 36 | # 'D' : 'Deleted', 37 | # 'M' : 'Modified', 38 | # 'S' : 'Static', 39 | # 'N' : 'Non existent' 40 | # } 41 | # ``` 42 | # 43 | # 44 | # The method 45 | # 46 | # ```python 47 | # foo.get_history() 48 | # ``` 49 | # extracts the git log, and define: 50 | # 51 | # - foo.all_commits = the whole git log 52 | # - foo.commits = the commits SHA-1 53 | # - foo.all_files = all the unique file ever existed 54 | # 55 | # 56 | 57 | # In[19]: 58 | 59 | import os 60 | 61 | path = os.getcwd() # put here the desired git repo path 62 | 63 | gt = ghv.git_history(path) 64 | 65 | gt.get_history() 66 | 67 | 68 | # ### Visualize the data 69 | # 70 | # We define a pandas DataFrame to contain all the files (Rows) and the status (Columns). 71 | # 72 | # This Grid represent the status of each file at each step or commit. 73 | # 74 | # The inizial stata for all the files is `N` or `Non existent`, they are updated in the sequential reding of `git_history.all_commits` object. 75 | 76 | # ## Deserialize and structure the data 77 | # 78 | # The data gather in `githistoryvis.git_history()` object are deserialized and gathered in a pandas DataFrame by the `githistoryvis.definedatamatrix()` method. 79 | 80 | # In[20]: 81 | 82 | gt.definedatamatrix() 83 | gt.datamatrix 84 | 85 | 86 | # ## Visualize the data 87 | # 88 | # The data from the pandas DataFrame coul be visualized by this simple example routine. 89 | # 90 | # The arguments are: 91 | # 92 | # - size (default 200) : the size of the pyplot.scatteplot. 93 | # - figsize (default [9,7]) : size of the pyplot.figure. 94 | # - linewidths (default 3) : width of the pyplot.scatteplot outer lines. 95 | # - outpath : if defined, the figure will be saved without visualization. 
96 | 97 | # In[21]: 98 | 99 | import matplotlib 100 | from matplotlib import pyplot as plt 101 | get_ipython().magic(u'matplotlib inline') 102 | 103 | 104 | # In[22]: 105 | 106 | gt.plot_history_df(plt,gt.datamatrix,size= 300, figsize = [12,10.5]) 107 | gt.plot_history_df(plt,gt.datamatrix,size= 300, figsize = [12,10.5],outpath=path+os.sep+'images/complete_visual_history.png') 108 | 109 | 110 | # In[24]: 111 | 112 | # filtering the history on: 113 | # a commit range 114 | plot_df_commit_range = gt.datamatrix.ix[:,'a4cb9a1':'1222c5e'] 115 | gt.plot_history_df(plt,plot_df_commit_range,size= 300, figsize= [3,10]) 116 | gt.plot_history_df(plt,plot_df_commit_range,size= 300, figsize= [3,10], outpath=path+os.sep+'images/commit_range.png') 117 | 118 | 119 | # In[25]: 120 | 121 | # filtering the history on: 122 | # a file range: all files not ending with txt 123 | plot_df_file_range = gt.datamatrix[~gt.datamatrix.index.str.contains('txt$')] 124 | gt.plot_history_df(plt,plot_df_file_range,size= 300, figsize= [11.5,8.5]) 125 | gt.plot_history_df(plt,plot_df_file_range,size= 300, figsize= [11.5,8.5], outpath=path+os.sep+'images/file_range.png') 126 | 127 | 128 | # In[26]: 129 | 130 | # filtering the history on: 131 | # a commit range AND a file range: all files not ending with txt 132 | plot_df_commit_file_range = gt.datamatrix.ix[:,'a4cb9a1':'1222c5e'][~gt.datamatrix.index.str.contains('txt$')] 133 | gt.plot_history_df(plt,plot_df_commit_file_range,size= 300,figsize= [3.5,8.5]) 134 | gt.plot_history_df(plt,plot_df_commit_file_range,size= 300,figsize= [3.5,8.5],outpath=path+os.sep+'images/commit_file_range.png') 135 | 136 | 137 | # In[27]: 138 | 139 | # filtering the history on: 140 | # a commit range AND a file range: all files not ending with txt 141 | plot_df_state_filter = gt.datamatrix[gt.datamatrix[gt.datamatrix.columns[-1]] != 'N'] 142 | gt.plot_history_df(plt,plot_df_state_filter,size= 300,figsize= [11,6]) 143 | gt.plot_history_df(plt,plot_df_state_filter,size= 
300,figsize= [11,6],outpath=path+os.sep+'images/state_filter.png') 144 | 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Git history visualizer 2 | 3 | [![Join the chat at https://gitter.im/kidpixo/git_history_visualizer](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/kidpixo/git_history_visualizer?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | This script solves this problem: 6 | 7 | *I want to visualize the history of all the files in a git repository [in one branch]* 8 | 9 | The idea is to extract the whole commit log via the `git` command (you should have it on your machine) and process it to have: 10 | 11 | - the list of all the files that ever existed in this branch 12 | - the list of all the commits (at this stage we use the short SHA-1) 13 | 14 | # Requirements 15 | 16 | - Pandas (for data handling) 17 | - Matplotlib (for image generation) 18 | 19 | # Documentation 20 | 21 | `git_history` is the common base class for all git history. 22 | 23 | ### Initialization 24 | 25 | foo = git_history(PATH, get_history=False, definedatamatrix=False) 26 | 27 | Optionally, set get_history and definedatamatrix to True to have the whole process done in place, instead of calling each method. 28 | 29 | At initialization the attribute `self.path` is set to point to the git repository in PATH. 30 | 31 | Also `def_states` (and `def_states_explain`) are defined at initialization. They are used to transform the states in the dataframe to numbers for visualization and to define the legend. You can overwrite them at your own risk. 
32 | 33 | # this is used as colorcode in the datamatrix 34 | def_states = { 35 | u'A': 120, 36 | u'C': 25, 37 | u'B': 51, 38 | u'D': 240, 39 | u'M': 180, 40 | u'R': 102, 41 | u'U': 204, 42 | u'T': 76, 43 | u'X': 153, 44 | u'S': 255, # custom value, Static 45 | u'N': None, # custom value, Non existent 46 | } 47 | 48 | # this is only a human-readable format 49 | def_states_explain = { 50 | u'A': u'added', 51 | u'C': u'copied', 52 | u'D': u'deleted', 53 | u'M': u'modified', 54 | u'R': u'renamed', 55 | u'T': u'type changed', 56 | u'U': u'unmerged', 57 | u'X': u'unknown', 58 | u'B': u'pairing broken', 59 | u'S': u'Static', 60 | u'N': u'Non existent' 61 | } 62 | 63 | 64 | ### Methods 65 | 66 | The method 67 | 68 | foo.get_history([prettyformat='%h'],[gitcommitlist=False]) 69 | 70 | extracts the git log and defines: 71 | 72 | - foo.all_commits = the whole git log 73 | - foo.commits = the commits SHA-1 74 | - foo.all_files = all the unique files that ever existed 75 | 76 | arguments: 77 | 78 | prettyformat, default %h 79 | 80 | optional, accepts one of the git pretty formats, see http://git-scm.com/docs/pretty-formats. For example, get the whole commit text with '%s' and write your own parser for self.decodelog(). 81 | 82 | Default is '%h', the short SHA-1 of the commit. 
83 | 84 | gitcommitlist, default False 85 | 86 | optional, if present should be a string with the result of: 87 | 88 | git -C PATH --no-pager log --reverse --name-status --oneline --pretty="format:COMMIT%x09%h" 89 | 90 | For example, execute this command on a remote machine and store the result in a file, read the content 91 | 92 | with open('gitoutput', 'r') as file: 93 | data = file.read() 94 | 95 | and pass the result to the `get_history` method: 96 | 97 | gt.get_history(gitcommitlist=data) 98 | 99 | 100 | ### Status 101 | 102 | From the official git-log Documentation, http://git-scm.com/docs/git-log for files status: 103 | 104 | - A : file **A**dded 105 | - D : file **D**eleted 106 | - M : file **M**odified 107 | - C : **C**opied 108 | - R : **R**enamed 109 | - T : **T**ype changed 110 | - U : **U**nmerged 111 | - X : unknown 112 | - B : pairing **B**roken 113 | 114 | 115 | Custom defined status: 116 | 117 | - S : file is **S**tatic (nothing happened) 118 | - N : file is **N**on existent 119 | 120 | See http://git-scm.com/docs/git-log : 121 | 122 | > ... 123 | > 124 | > --diff-filter=[(A|C|D|M|R|T|U|X|B)…[*]] 125 | > 126 | > Select only files that are Added (A), Copied (C), Deleted (D), 127 | > Modified (M), Renamed (R), have their type (i.e. regular file, 128 | > symlink, submodule, …) changed (T), are Unmerged (U), are 129 | > Unknown (X), or have had their pairing Broken (B). Any combination 130 | > of the filter characters (including none) can be used. When * 131 | > (All-or-none) is added to the combination, all paths are selected if 132 | > there is any file that matches other criteria in the comparison; 133 | > if there is no file that matches other criteria, nothing is selected. 134 | > ... 
135 | 136 | 137 | 138 | # Example 139 | 140 | The simplest way to get your image is to open [example_githistoryvis.py](https://github.com/kidpixo/git_history_visualizer/blob/master/example_githistoryvis.py), change the repository path and the output path, save and run `python example_githistoryvis.py`. 141 | 142 | To get a better look at what is happening, the notebook and the python script included ([git_history_test_git.ipynb](https://github.com/kidpixo/git_history_visualizer/blob/master/git_history_test_git.ipynb) and [git_history_test_git.py](https://github.com/kidpixo/git_history_visualizer/blob/master/git_history_test_git.py)) are extended examples. 143 | 144 | Change the path at the beginning with your repository path and play with the visualization at the end. 145 | 146 | This example is on this very repository. The first `*txt` files were only placeholders. 147 | 148 | This is the complete visual history of this repository using 149 | 150 | ```python 151 | plot_history_df(gt.datamatrix,size= 300, figsize = [10,14]) 152 | ``` 153 | 154 | ![](images/complete_visual_history.png) 155 | 156 | 157 | This is a commit range, using pandas' [Indexing and Selecting Data](http://pandas.pydata.org/pandas-docs/stable/indexing.html) capabilities: 158 | 159 | ```python 160 | plot_df_commit_range = gt.datamatrix.ix[:,'a4cb9a1':'1222c5e'] 161 | plot_history_df(plot_df_commit_range,size= 300, figsize= [3,13]) 162 | ``` 163 | 164 | ![](images/commit_range.png) 165 | 166 | This is a range of files, using 167 | 168 | ```python 169 | plot_df_file_range = gt.datamatrix[~gt.datamatrix.index.str.contains('txt$')] 170 | plot_history_df(plot_df_file_range,size= 300, figsize= [10,11.5]) 171 | ``` 172 | 173 | ![](images/file_range.png) 174 | 175 | This combines the two filters, using 176 | 177 | ```python 178 | plot_df_commit_file_range = gt.datamatrix.ix[:,'a4cb9a1':'1222c5e'] 179 | [~gt.datamatrix.index.str.contains('txt$')] 180 | ``` 181 | 182 | 
class git_history(object):
    """Collect and visualize the per-file history of a git repository.

    Initialization::

        foo = git_history(PATH, get_history=False, definedatamatrix=False)

    ``PATH`` is stored in ``self.path`` and points to the git repository.
    Optionally set ``get_history`` and ``definedatamatrix`` to True to
    run both steps from the constructor instead of calling each method.

    ``def_states`` and ``def_states_explain`` are class attributes.  They
    map the one-letter state stored in the dataframe to a number (used
    as colour code in the plot) and to a human-readable legend label.
    You can overwrite them at your own risk.

    Methods:

    get_history

        foo.get_history(prettyformat='%h', gitcommitlist=False)

        extracts the git log and defines:

        - foo.all_commits = the whole git log, as [state, label] pairs
        - foo.commits     = the commit labels (short SHA-1 by default)
        - foo.all_files   = all the unique files that ever existed

        Arguments

        prettyformat, default '%h'

            optional, one of the git pretty formats, see
            http://git-scm.com/docs/pretty-formats.
            For example, get the whole commit text with '%s' and write
            your own parser for self.decodelog().
            Default is '%h', the short SHA-1 of the commit.

        gitcommitlist, default False

            optional, if present it should be a string with the result of:

            git -C PATH --no-pager log --reverse --name-status --oneline --pretty="format:COMMIT%x09%h"

            For example, execute this command on a remote machine, store
            the result in a file, read the content

                with open('gitoutput', 'r') as file:
                    data = file.read()

            and pass the result to the get_history method:

                foo.get_history(gitcommitlist=data)

    to_dict

        foo.to_dict()

        converts the full list of commits to an OrderedDict like:

        {
            'hash' : {
                'file_name': 'status'
            }
        }

    Status

    From the official git-log documentation, http://git-scm.com/docs/git-log
    for files status:

    - A : file **A**dded
    - D : file **D**eleted
    - M : file **M**odified
    - C : **C**opied
    - R : **R**enamed
    - T : **T**ype changed
    - U : **U**nmerged
    - X : unknown
    - B : pairing **B**roken

    Custom defined status:

    - S : file is **S**tatic (nothing happened)
    - N : file is **N**on existent

    See http://git-scm.com/docs/git-log :

        --diff-filter=[(A|C|D|M|R|T|U|X|B)...[*]]

        Select only files that are Added (A), Copied (C), Deleted (D),
        Modified (M), Renamed (R), have their type (i.e. regular file,
        symlink, submodule, ...) changed (T), are Unmerged (U), are
        Unknown (X), or have had their pairing Broken (B). Any combination
        of the filter characters (including none) can be used. When *
        (All-or-none) is added to the combination, all paths are selected if
        there is any file that matches other criteria in the comparison;
        if there is no file that matches other criteria, nothing is selected.
    """

    # numeric colour code of every state (looked up in the colormap)
    def_states = {
        u'A': 120,
        u'C': 25,
        u'B': 51,
        u'D': 240,
        u'M': 180,
        u'R': 102,
        u'U': 204,
        u'T': 76,
        u'X': 153,
        u'S': 255,   # custom value, Static
        u'N': None,  # custom value, Non existent (not drawn)
    }

    # human-readable label of every state, used for the legend
    def_states_explain = {
        u'A': u'added',
        u'C': u'copied',
        u'D': u'deleted',
        u'M': u'modified',
        u'R': u'renamed',
        u'T': u'type changed',
        u'U': u'unmerged',
        u'X': u'unknown',
        u'B': u'pairing broken',
        u'S': u'Static',
        u'N': u'Non existent'
    }

    def __init__(self, repo_path, get_history=False, definedatamatrix=False):
        """Store the repository path and optionally load the history.

        repo_path        : path of the git repository (passed to ``git -C``).
        get_history      : if true, call ``get_history()`` right away.
        definedatamatrix : if true, call ``definedatamatrix()`` right away.
        """
        self.path = repo_path
        # Start from an empty history so every method is callable even
        # before get_history() has run.
        self.all_commits = []
        self.commits = []
        self.all_files = []
        if get_history:
            self.get_history()
        if definedatamatrix:
            self.definedatamatrix()

    @staticmethod
    def _parse_log(text):
        """Turn raw ``git log --name-status`` text into [state, label] pairs.

        Only lines containing a tab are kept.  ``COMMIT<TAB>label`` lines
        become ``['COMMIT', label]``.  Plain status lines become
        ``[state, path]``.  Rename/copy lines carry a similarity score and
        two paths (``R100<TAB>old<TAB>new``): they are normalized to the
        bare state letter on the destination path, and a rename also
        marks the old path as deleted.
        """
        entries = []
        for line in text.replace('\r', '\n').split('\n'):
            if '\t' not in line:
                continue
            fields = line.split('\t')
            if fields[0] == 'COMMIT':
                # a custom pretty format may itself contain tabs
                entries.append(['COMMIT', '\t'.join(fields[1:])])
            elif len(fields) == 2:
                entries.append(fields)
            else:
                code = fields[0][:1]
                if code == 'R':
                    entries.append(['D', fields[1]])
                entries.append([code, fields[-1]])
        return entries

    def get_history(self, gitcommitlist=False, **kwargs):
        """Read the git log and fill all_commits, commits and all_files.

        gitcommitlist : optional string holding the output of
            git -C PATH --no-pager log --reverse --name-status --oneline --pretty="format:COMMIT%x09%h"
            When it is False or None, git is executed on ``self.path``.
        prettyformat  : keyword only, git pretty format used for the
            commit label when git is executed (default '%h').

        Raises subprocess.CalledProcessError if the git command fails.
        """
        prettyformat = kwargs.get('prettyformat', '%h')

        if gitcommitlist is None or gitcommitlist is False:
            import subprocess
            # Argument list, no shell: the repository path and the format
            # are never interpreted by a shell, whatever they contain.
            log_text = subprocess.check_output(
                ['git', '-C', '{}'.format(self.path), '--no-pager', 'log',
                 '--reverse', '--name-status', '--oneline',
                 '--pretty=format:COMMIT\t{}'.format(prettyformat)],
                universal_newlines=True)
        else:
            log_text = gitcommitlist

        self.all_commits = self._parse_log(log_text)
        self.decodelog()

    def to_dict(self):
        """Return an OrderedDict ``{commit label: {file name: state}}``.

        Commits appear in log order.  A commit without file changes maps
        to an empty dict, and an empty history gives an empty
        OrderedDict.  File entries found before the first commit line are
        ignored.
        """
        import collections

        history = collections.OrderedDict()
        current = None
        for state, label in self.all_commits:
            if state == 'COMMIT':
                current = history.setdefault(label, {})
            elif current is not None:
                current[label] = state
        return history

    def decodelog(self):
        """Derive ``self.commits`` and ``self.all_files`` from all_commits."""
        # commit labels, in log order
        self.commits = [entry[1] for entry in self.all_commits
                        if entry[0] == 'COMMIT']
        # every file that ever appeared in the history, sorted
        self.all_files = sorted({entry[1] for entry in self.all_commits
                                 if entry[0] != 'COMMIT'})

    def definedatamatrix(self):
        """Build ``self.datamatrix``: files as rows, commits as columns.

        Every cell starts as 'N' (non existent).  At each commit the
        files that existed in the previous commit (state neither 'N' nor
        'D') are marked 'S' (static), then the states listed in the log
        for that commit overwrite them.  Columns are stored as pandas
        categoricals.
        """
        import pandas as pd

        matrix = pd.DataFrame(index=self.all_files, columns=self.commits,
                              dtype=object)
        matrix = matrix.fillna('N')

        previous = current = None
        for state, label in self.all_commits:
            if state == 'COMMIT':
                previous, current = current, label
                if previous is not None:
                    # carry over every file alive in the previous commit
                    alive = ~matrix[previous].isin(['N', 'D'])
                    matrix.loc[alive, current] = 'S'
            elif current is not None:
                matrix.loc[label, current] = state

        self.datamatrix = matrix.apply(lambda col: col.astype('category'))

    def plot_history_df(self, plt, dataframe, **kwargs):
        """Plot a history dataframe as one row of coloured dots per file.

        Arguments:

        - plt : the imported matplotlib.pyplot module.
        - dataframe : a dataframe of states, e.g. self.datamatrix or a
          selection of it.
        - size (default 500) : marker size given to scatter.
        - figsize (default [10, 12]) : size of the figure.
        - linewidths (default 3) : width of the marker outlines and of the
          line joining the markers of a file.
        - outpath : if defined, the figure is saved there and closed.
        - legend : if defined to any value, a legend row is added.
        """
        size = kwargs.get('size', 500)
        figsize = kwargs.get('figsize', [10, 12])
        linewidths = kwargs.get('linewidths', 3)

        nan = float('nan')

        def colour(state):
            # states without a colour code (None) become NaN
            value = self.def_states[state]
            return nan if value is None else float(value)

        # 'spectral' was renamed 'nipy_spectral' and later removed
        cmap = getattr(plt.cm, 'nipy_spectral', None) or plt.cm.spectral

        n_rows = len(dataframe.index)
        n_cols = len(dataframe.columns)
        x = list(range(n_cols))

        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111, frameon=False)
        for row, states in enumerate(dataframe.values.tolist()):
            y = [row] * n_cols
            ax.scatter(x, y, s=size, c=[colour(s) for s in states], alpha=1,
                       marker='o', linewidths=linewidths, cmap=cmap,
                       vmin=0, vmax=255)
            ax.plot(x, y, c='k', zorder=0, linewidth=linewidths)

        ax.set_xticks(x)
        ax.set_xticklabels(dataframe.columns, rotation=90)
        ax.set_xlabel('commits sha-1 (time arrow to the right ->)')
        ax.set_xlim([-.5, n_cols - 0.5])

        ax.set_yticks(list(range(n_rows)))
        ax.set_yticklabels(dataframe.index.tolist())
        ax.set_ylabel('file names')
        ax.set_ylim([-.5, n_rows + 0.5])
        ax.tick_params(axis='both', which='both', length=0, width=0)

        if 'legend' in kwargs:
            w = 0.3
            ax2 = fig.add_axes([(1. - w) / 2., -.01, w, 0.035], frameon=False)

            # one key order for colours AND labels, so they always match
            keys = list(self.def_states)
            hidden = self.def_states['N']
            colors = [nan if self.def_states[k] == hidden else colour(k)
                      for k in keys]
            labels = [self.def_states_explain.get(k, k) for k in keys]

            lx = list(range(len(keys)))
            ly = [1] * len(keys)
            ax2.scatter(lx, ly, s=size, c=colors, alpha=1, marker='o',
                        linewidths=linewidths, cmap=cmap, vmin=0, vmax=255)
            ax2.plot(lx, ly, c='k', zorder=0, linewidth=linewidths)

            ax2.set_xticks(lx)
            ax2.set_xticklabels(labels)
            ax2.set_xlabel('Legend')
            ax2.set_xlim([-.5, len(lx) - 0.5])
            ax2.set_ylim([0.99, 1.01])
            ax2.tick_params(axis='both', which='both', length=0, width=0,
                            labelleft=False)

        if 'outpath' in kwargs:
            fig.savefig(kwargs['outpath'], bbox_inches='tight', pad_inches=0)
            plt.close(fig)