├── .gitignore
├── Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf
├── Code_Complexity.ipynb
├── LICENSE
├── README.md
├── code
│   ├── __init__.py
│   ├── complexity_metrics.py
│   ├── gsheet_utils.py
│   ├── over_time.py
│   └── parse_code.py
├── requirements.in
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,jupyternotebooks

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook

# IPython

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,jupyternotebooks

--------------------------------------------------------------------------------
/Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corriebar/code-complexity/b772a1b6c599b9d551e8d5ab4481cc5395aa3c8d/Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf

--------------------------------------------------------------------------------
/Code_Complexity.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "dac9edb2-734a-4ea6-8b66-991b1ddd7cb0",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Code Complexity Summary\n",
    "\n",
    "This notebook generates code complexity summaries and writes them to a Google Sheet.\n",
    "\n",
    "For each repository, one tab is added.\n",
    "Re-running the notebook later appends a new snapshot (stamped with `extract_date`) whenever the metrics have changed, so each tab accumulates a history over time.\n",
    "\n",
    "Before running the code, make sure every repository is on an up-to-date master without any untracked Python files.\n",
    "\n",
    "For more on code complexities and the measures used in this notebook, check these [slides](Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d6936daa-eb5e-44c1-81bf-3389da68153d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "from pathlib import Path\n",
    "\n",
    "root = Path.cwd().parent\n",
    "repos = [\n",
    "    root / \"code-complexity\",\n",
    "    #...\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a2297d47-d4b3-4351-99db-fdd62f8f6c0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "import gspread\n",
    "\n",
    "from code.gsheet_utils import apply_formatting, return_data_to_write\n",
    "from code.complexity_metrics import get_repo_complexities\n",
    "\n",
    "import warnings\n",
    "warnings.simplefilter(action=\"ignore\", category=Warning)\n",
    "\n",
    "from plotnine import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bcdfc656-b500-4086-990d-f8d461ef94b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "creds_dict = {}  # your creds dict\n",
    "scopes = [\"https://www.googleapis.com/auth/drive\"]\n",
    "gc = gspread.service_account_from_dict(creds_dict, scopes)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a258c10-7104-4c0b-9e0f-619fc67706e7",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Run for a single repo\n",
    "\n",
    "If you only want to get the results for a single repo (or folder), you can run the following command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0b34e184-e28d-4ff0-8dfa-f24a8443371f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
\n", 109 | "\n", 122 | "\n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | "
repofilefunction_namefunc_linenofunc_lengthcognitive_complexitysum_expression_complexitymax_expression_complexitynum_argumentsnum_returnsnum_module_expressionsmodule_complexityextract_date
3code-complexity/code/parse_code.pyiterate_over_expressions33311218.06.01040.02023-04-18
1code-complexity/code/parse_code.pyget_all_python_files1310818.03.52140.02023-04-18
6code-complexity/code/gsheet_utils.pyreturn_data_to_write45829.52.531147.52023-04-18
\n", 192 | "
" 193 | ], 194 | "text/plain": [ 195 | " repo file function_name \n", 196 | "3 code-complexity /code/parse_code.py iterate_over_expressions \\\n", 197 | "1 code-complexity /code/parse_code.py get_all_python_files \n", 198 | "6 code-complexity /code/gsheet_utils.py return_data_to_write \n", 199 | "\n", 200 | " func_lineno func_length cognitive_complexity sum_expression_complexity \n", 201 | "3 33 31 12 18.0 \\\n", 202 | "1 13 10 8 18.0 \n", 203 | "6 45 8 2 9.5 \n", 204 | "\n", 205 | " max_expression_complexity num_arguments num_returns \n", 206 | "3 6.0 1 0 \\\n", 207 | "1 3.5 2 1 \n", 208 | "6 2.5 3 1 \n", 209 | "\n", 210 | " num_module_expressions module_complexity extract_date \n", 211 | "3 4 0.0 2023-04-18 \n", 212 | "1 4 0.0 2023-04-18 \n", 213 | "6 14 7.5 2023-04-18 " 214 | ] 215 | }, 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "repo = 'code-complexity'\n", 223 | "\n", 224 | "df = get_repo_complexities(repos[0])\n", 225 | "\n", 226 | "df.head(3)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "ca183151-df4d-430a-a021-943a012ccd68", 232 | "metadata": {}, 233 | "source": [ 234 | "## Save to Google Sheet" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "id": "8efa54fa-2a4b-4daa-9664-d86a5e62f813", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "sheet_id = \"\"\n", 245 | "url = f\"https://docs.google.com/spreadsheets/d/{sheet_id}}/\"\n", 246 | "sheet = gc.open_by_url(url)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "2857b1c9-e8af-4907-a175-ce59bbbe4409", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "tabs = [wksh.title for wksh in sheet.worksheets()]\n", 257 | "all_repos = []\n", 258 | "for repo in repos:\n", 259 | " new_df = get_repo_complexities(repo)\n", 260 | " \n", 261 | " if not repo in tabs:\n", 262 | " wksht = sheet.add_worksheet(title=repo, rows=1000, cols=26, index=0)\n", 263 | " df = new_df\n", 264 | " else:\n", 265 | " df = return_data_to_write(sheet, repo.name, new_df)\n", 266 | " wksht = sheet.worksheet(repo.name)\n", 267 | " apply_formatting(wksht, df)\n", 268 | "\n", 269 | " \n", 270 | " all_repos.append(df)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "id": "43942b9d-d69b-4d4a-ab44-54ca6abfa051", 276 | "metadata": {}, 277 | "source": [ 278 | "## Summary Statistics" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 8, 284 | "id": "7637e4fb-8c16-4e01-aef7-6265e5e4d362", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "df_all = pd.concat(all_repos, ignore_index=True).query('repo != \"nan\" & repo.notna()')\n", 289 | "\n", 290 | "df_all['extract_date'] = df_all['extract_date'].replace('nan', None).fillna(method='ffill')" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 9, 296 | "id": "8dde96de-7116-4e9a-a3ab-fa99649151bf", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "plot_df = (df_all\n", 301 | " .groupby(['repo', 'extract_date'])\n", 302 | " .cognitive_complexity.agg(['mean', 'max', 'median'])\n", 303 | " .reset_index()\n", 304 | " )\n", 305 | "\n", 306 | "plot_df['extract_date'] = pd.to_datetime(plot_df['extract_date'])\n" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 15, 312 | "id": "0d1c141c-4e2e-4001-b2a2-ff501edea21e", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | 
"(ggplot(plot_df, aes(x='extract_date', y='mean', color='repo'))\n", 317 | " + geom_line(show_legend=False, size=1.5)\n", 318 | " + geom_point(show_legend=False, size=2)\n", 319 | " + scale_x_date(date_labels='%b %Y', breaks=plot_df.extract_date.unique())\n", 320 | " + scale_color_brewer(type='qual', palette='Set2')\n", 321 | " + labs(x='', y='Complexity', title='Code Complexity of our Repos over Time')\n", 322 | " + theme_minimal()\n", 323 | " + theme(figure_size=(8,15),\n", 324 | " legend_position='bottom')\n", 325 | ").draw()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 14, 331 | "id": "1685a242-9201-4ef3-957c-26f041d33dc1", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "(ggplot(plot_df, aes(x='extract_date', y='mean', color='repo'))\n", 336 | " + geom_line(show_legend=False)\n", 337 | " + geom_point(show_legend=False)\n", 338 | " + scale_x_date(date_labels='%d %b %Y', breaks=plot_df['extract_date'].unique())\n", 339 | " + scale_color_brewer(type='qual', palette='Paired')\n", 340 | " + labs(x='', y='Max Cognitive Complexity', title='Max Cognitive Complexity of our Repos over Time')\n", 341 | " + theme_minimal()\n", 342 | " + theme(figure_size=(10,6),\n", 343 | " legend_position='bottom')\n", 344 | ").draw()" 345 | ] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "code-complexity", 351 | "language": "python", 352 | "name": "code-complexity" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.11.1" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 5 369 | } 370 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Corrie Bartelheimer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code Complexity 2 | 3 | In the notebook [Code Complexity](Code_Complexity.ipynb) you can find an example flow on how to use the different [complexity metrics](code/complexity_metrics.py). 

To run the notebook, you might have to set up a Google Sheets [service account](https://docs.gspread.org/en/v5.7.1/oauth2.html#authentication) for authentication.
To install the necessary Python packages, run the following command in the root directory of the repository:
```
pip install -r requirements.txt
```

## Slides

The slides for the talk can be found [here](Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf).

--------------------------------------------------------------------------------
/code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corriebar/code-complexity/b772a1b6c599b9d551e8d5ab4481cc5395aa3c8d/code/__init__.py

--------------------------------------------------------------------------------
/code/complexity_metrics.py:
--------------------------------------------------------------------------------
import datetime
from pathlib import Path

from flake8_functions.function_length import get_function_start_row, get_function_last_row
from cognitive_complexity.api import get_cognitive_complexity
from flake8_expression_complexity.utils.complexity import get_expression_complexity

from flake8_functions.function_arguments_amount import get_arguments_amount_for
from flake8_functions.function_returns_amount import get_returns_amount_for

from code.parse_code import (
    iterate_over_expressions,
    get_function_definitions,
    parse_file,
    get_all_python_files,
)

import pandas as pd

COMPLEXITY_METRICS = [
    "func_length",
    "cognitive_complexity",
    "sum_expression_complexity",
    "max_expression_complexity",
    "num_arguments",
    "num_returns",
    "num_module_expressions",
    "module_complexity",
]
"""
Only the metric columns of the complexity analysis.

This can be used e.g. to summarize the DataFrame.
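
For instance (illustrative; assumes a DataFrame ``df`` from ``get_repo_complexities``)::

    df[COMPLEXITY_METRICS].mean()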
34 | """ 35 | 36 | COMPLEXITY_COLUMNS = [ 37 | "repo", 38 | "file", 39 | "function_name", 40 | "func_lineno", 41 | "extract_date", 42 | *COMPLEXITY_METRICS, 43 | ] 44 | """All columns to expect in a complexity analysis DataFrame.""" 45 | 46 | COLUMN_TYPES = { 47 | "repo": str, 48 | "file": str, 49 | "function_name": str, 50 | "func_lineno": "float64", 51 | "func_length": "float64", 52 | "cognitive_complexity": "float64", 53 | "sum_expression_complexity": "float64", 54 | "max_expression_complexity": "float64", 55 | "num_arguments": "float64", 56 | "num_returns": "float64", 57 | "num_module_expressions": "float64", 58 | "module_complexity": "float64", 59 | } 60 | 61 | 62 | def get_function_length(funcdef): 63 | function_start_row = get_function_start_row(funcdef) 64 | function_last_row = get_function_last_row(funcdef) 65 | return function_last_row - function_start_row + 1 66 | 67 | 68 | def get_complexity_per_function(funcdef): 69 | expression_complexities = [ 70 | get_expression_complexity(expr) for expr in iterate_over_expressions(funcdef) 71 | ] 72 | return { 73 | "function_name": funcdef.name, 74 | "func_lineno": funcdef.lineno, 75 | "func_length": get_function_length(funcdef), 76 | "cognitive_complexity": get_cognitive_complexity(funcdef), 77 | "sum_expression_complexity": sum(expression_complexities), 78 | "max_expression_complexity": max(expression_complexities), 79 | "num_arguments": get_arguments_amount_for(funcdef), 80 | "num_returns": get_returns_amount_for(funcdef), 81 | } 82 | 83 | 84 | def get_module_complexities(module): 85 | expressions_outside_functions = [exp for exp in iterate_over_expressions(module)] 86 | expression_complexities = [ 87 | get_expression_complexity(expr) for expr in expressions_outside_functions 88 | ] 89 | num_expressions = len(expressions_outside_functions) 90 | return { 91 | "num_module_expressions": num_expressions, 92 | "module_complexity": sum(expression_complexities), 93 | } 94 | 95 | 96 | def get_module_function_complexities(module): 97 | complexities = [] 98 | funcdefs = get_function_definitions(module) 99 | 100 | for funcdef in funcdefs: 101 | comp_dict = get_complexity_per_function(funcdef) 102 | complexities.append(comp_dict) 103 | 104 | return complexities 105 | 106 | 107 | def get_file_complexities(repo_path: Path, filepath: Path): 108 | module = parse_file(filepath) 109 | 110 | function_complexities = get_module_function_complexities(module) 111 | 112 | module_complexities = get_module_complexities(module) 113 | 114 | rel_path = str(filepath.relative_to(repo_path)) 115 | module_function_complexities = [ 116 | {**d, **module_complexities, "file": rel_path} 117 | for d in function_complexities 118 | ] 119 | return module_function_complexities 120 | 121 | 122 | def get_repo_complexities(repo_path): 123 | repo_path = Path(repo_path) 124 | repo_name = repo_path.name 125 | 126 | python_files = get_all_python_files(repo_path, repo_name) 127 | complexities = [] 128 | for file_path in python_files: 129 | module_function_complexities = get_file_complexities(repo_path, file_path) 130 | 131 | complexities.extend(module_function_complexities) 132 | 133 | df = pd.DataFrame(complexities, columns=COMPLEXITY_COLUMNS) 134 | df["repo"] = repo_name 135 | df = add_extract_date(df) 136 | return df[COMPLEXITY_COLUMNS].sort_values( 137 | by=["cognitive_complexity", "func_length"], ascending=False 138 | ) 139 | 140 | 141 | def add_extract_date(df): 142 | today = datetime.datetime.today() 143 | d = df.copy() 144 | d["extract_date"] = str(today.date()) 145 | return d 


def compare_old_new(old_df, new_df):
    sort_cols = ["repo", "file", "function_name"]
    compare_cols = [
        *sort_cols,
        "func_lineno",
        *COMPLEXITY_METRICS,
    ]
    old = old_df[compare_cols].sort_values(by=sort_cols).reset_index(drop=True).astype(COLUMN_TYPES)
    new = new_df[compare_cols].sort_values(by=sort_cols).reset_index(drop=True).astype(COLUMN_TYPES)
    is_equal = old.equals(new)
    return is_equal


def get_latest_data(old_data):
    return old_data.query("extract_date == extract_date.max()")

--------------------------------------------------------------------------------
/code/gsheet_utils.py:
--------------------------------------------------------------------------------
from string import ascii_uppercase
from typing import List

from gspread_dataframe import get_as_dataframe, set_with_dataframe
from gspread_formatting import get_conditional_format_rules
from gspread_formatting.dataframe import format_with_dataframe, set_frozen
from gspread_formatting.models import Color
from gspread_formatting.conditionals import (
    ConditionalFormatRule,
    GradientRule,
    GridRange,
    InterpolationPoint,
)

import pandas as pd

from code.complexity_metrics import COLUMN_TYPES, compare_old_new, get_latest_data


COLUMN_GRADIENTS = {
    "func_length": [0, 50, 80],
    "cognitive_complexity": [0, 7, 10],
    "sum_expression_complexity": [0, 50, 80],
    "max_expression_complexity": [0, 6, 9],
    "num_arguments": [0, 4, 7],
    "num_returns": [0, 3, 7],
}

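# Anchor colors for the conditional-format gradients:
# green at the minpoint, yellow at the midpoint, red at the maxpoint.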
GREEN = Color(0.34117648, 0.73333335, 0.5411765)
YELLOW = Color(0.9843137, 0.7372549, 0.015686275)
RED = Color(1, 0.42745098, 0.003921569)


def get_old_data(sheet, repo):
    types = {**COLUMN_TYPES, "extract_date": str}
    wksh = sheet.worksheet(repo)
    old_data = get_as_dataframe(wksh)
    old_df = old_data.dropna(axis=1, how="all")  # drop empty columns
    return old_df.dropna(axis=0, how="all").astype(types)


def return_data_to_write(sheet, repo, new_df):
    old_df = get_old_data(sheet, repo)
    latest = get_latest_data(old_df)
    if not compare_old_new(latest, new_df):
        print(f"Changes have been made in {repo}!")
        df = pd.concat([new_df, old_df], ignore_index=True)
    else:
        df = old_df
    return df


def get_cell_ranges(worksheet, cell_ranges: str | List[str]):
    ranges = []
    if isinstance(cell_ranges, str):
        cell_ranges = [cell_ranges]
    for cells in cell_ranges:
        grid_range = GridRange.from_a1_range(cells, worksheet)
        ranges.append(grid_range)
    return ranges


def get_conditional_format_rule(worksheet, cell_ranges: str | List[str], gradient_points: List[int]):
    minpoint, midpoint, maxpoint = gradient_points

    ranges = get_cell_ranges(worksheet, cell_ranges)
    rule = ConditionalFormatRule(
        ranges=ranges,
        gradientRule=GradientRule(
            minpoint=InterpolationPoint(color=GREEN, type="NUMBER", value=str(minpoint)),
            midpoint=InterpolationPoint(color=YELLOW, type="NUMBER", value=str(midpoint)),
            maxpoint=InterpolationPoint(color=RED, type="NUMBER", value=str(maxpoint)),
        ),
    )
    return rule


def map_colname_to_range(df, colname):
    col_index = list(df.columns).index(colname)
    if col_index >= len(ascii_uppercase):
        raise NotImplementedError("Columns beyond Z are not yet supported")
    col_letter = ascii_uppercase[col_index]
    return f"{col_letter}:{col_letter}"


def set_conditional_rules(df, rules, worksheet):
    for col_name, gradient_points in COLUMN_GRADIENTS.items():
        cell_range = map_colname_to_range(df, col_name)
        rule = get_conditional_format_rule(worksheet, cell_range, gradient_points)
        rules.append(rule)
    rules.save()


def apply_formatting(worksheet, df):
    set_with_dataframe(worksheet, df)
    format_with_dataframe(worksheet, df, include_column_header=True)
    set_frozen(worksheet, rows=1)

    rules = get_conditional_format_rules(worksheet)
    set_conditional_rules(df, rules, worksheet)

--------------------------------------------------------------------------------
/code/over_time.py:
--------------------------------------------------------------------------------
"""
Functions to jump through the git history of a repo.
"""
from datetime import datetime
import pathlib
import subprocess
from typing import Callable, Dict, Optional, Sequence, Tuple, TypeVar, Union

import pandas as pd

from code.complexity_metrics import COMPLEXITY_METRICS, get_repo_complexities

try:
    from fastprogress import progress_bar
except ModuleNotFoundError:
    # Fall back to a plain pass-through so the loops below can still iterate.
    progress_bar = lambda x: x


PathLike = Union[str, pathlib.Path]
T = TypeVar("T")


def git_log(dp: PathLike) -> Tuple[str, ...]:
    """Returns a tuple of all commit hashes in the git history (newest first)."""
    output = subprocess.check_output(["git", "-C", str(dp), "log", '--format=format:"%H"'])
    output = output.strip().decode("ascii")
    output = output.replace('"', "")
    return tuple(output.split("\n"))


def git_commit_timestamps(dp: PathLike) -> Dict[str, datetime]:
    """Returns a dict mapping each commit hash to its commit timestamp."""
    output = subprocess.check_output(["git", "-C", str(dp), "log", '--format=format:"%H|%ci"'])
    output = output.strip().decode("ascii")
    output = output.replace('"', "")
    result = {}
    for row in output.split("\n"):
        cid, ts = row.split("|")
        result[cid] = datetime.fromisoformat(ts)
    return result


def git_status(dp: PathLike) -> str:
    """Returns the git status message."""
    output = subprocess.check_output(["git", "-C", str(dp), "status"])
    output = output.strip().decode("ascii")
    return output


def git_current_branch(dp: PathLike) -> str:
    """Determines the name of the currently checked-out branch."""
    status = git_status(dp)
    return status.split("\n")[0].replace("On branch ", "")


def git_checkout(dp: PathLike, commit_or_branch: str):
    """Check out a specific branch or commit in the repository under `dp`."""
    output = subprocess.check_output(
        ["git", "-C", str(dp), "checkout", commit_or_branch], stderr=subprocess.DEVNULL
    )
    output = output.strip().decode("ascii")
    return output


def eval_by_commit(
    dp: PathLike,
    func: Callable[[PathLike], T],
    commits: Sequence[str],
    *,
    raise_on_error: bool = True,
) -> Dict[str, Optional[T]]:
    """Apply `func` to the `dp` for each of the `commits` and return the results.

    Requires the repository at `dp` to be in a clean `git status` state.
    In the end, the current branch will be checked out again.

    Parameters
    ----------
    dp
        Path to a local git repository.
    func
        A callable to apply at each commit.
        It should take one parameter `dp` and return something.
    commits
        A sequence of commits to execute the function at.
    raise_on_error
        If ``True``, exceptions other than SyntaxErrors are raised.

    Returns
    -------
    results
        Maps commit IDs to return values of the provided callable,
        or ``None`` in case of syntax errors at the respective commit.
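
    Examples
    --------
    A hypothetical call that counts the Python files at every commit
    (the repository path is a placeholder)::

        n_files = eval_by_commit(
            "path/to/repo",
            func=lambda dp: len(list(pathlib.Path(dp).glob("**/*.py"))),
            commits=git_log("path/to/repo"),
        )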
    """
    status = git_status(dp)
    if not ("working tree clean" in status or "nothing added to commit but untracked" in status):
        raise Exception(f"The git status of '{dp}' is unclean:\n\n{status}")
    branch = git_current_branch(dp)
    results = {}
    for commit in progress_bar(commits):
        try:
            git_checkout(dp, commit)
            results[commit] = func(dp)
        except SyntaxError:
            results[commit] = None
        except:
            print(f"Failed to apply function at commit {commit}")
            if raise_on_error:
                git_checkout(dp, branch)
                raise
    print(f"\nEvaluations completed. Checking out branch '{branch}'.")
    git_checkout(dp, branch)
    return results


def complexity_by_commit(dp: PathLike, commits: Optional[Sequence[str]] = None) -> pd.DataFrame:
    """Convenience wrapper around ``eval_by_commit`` to determine mean code complexity metrics over time.

    Parameters
    ----------
    dp
        Path to a local git repository.
    commits
        Optional sequence of commit IDs to run for.
        Defaults to the entire git history of the repo.

    Returns
    -------
    df
        DataFrame indexed by `commit_id`,
        with a `timestamp` column and the mean of each of the `COMPLEXITY_METRICS`.
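
    Examples
    --------
    An illustrative session (the path is a placeholder)::

        df = complexity_by_commit("path/to/repo")
        df.plot(x="timestamp", y="cognitive_complexity")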
131 | """ 132 | if not commits: 133 | commits = git_log(dp) 134 | 135 | results = eval_by_commit( 136 | dp=dp, 137 | func=lambda dp: get_repo_complexities(dp).set_index("repo")[COMPLEXITY_METRICS].mean(), 138 | commits=commits, 139 | ) 140 | 141 | # Summarize in a DataFrame with None-results as NA rows 142 | results_notna = {k:v for k,v in results.items() if v is not None} 143 | df = pd.DataFrame.from_dict(results_notna, orient="index") 144 | df.index.name = "commit_id" 145 | # Re-insert rows with None-results 146 | for k, v in results.items(): 147 | if v is None: 148 | df.loc[k] = pd.NA 149 | 150 | # Determine commit timestamps 151 | timestamps = git_commit_timestamps(dp) 152 | df["timestamp"] = [timestamps[row.Index] for row in df.itertuples()] 153 | return df.sort_values("timestamp") 154 | -------------------------------------------------------------------------------- /code/parse_code.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import itertools 3 | import os 4 | from pathlib import Path 5 | from typing import Callable, Iterable, List, Tuple 6 | 7 | 8 | _IGNORE_DIRS = { 9 | "notebooks", 10 | ".ipynb_checkpoints", 11 | os.sep + "build" + os.sep, 12 | } 13 | 14 | 15 | def parse_file(filepath): 16 | with open(filepath) as f: 17 | file_parsed = ast.parse(f.read()) 18 | return file_parsed 19 | 20 | 21 | def get_all_python_files(repo_path, repo) -> List[Path]: 22 | python_files = [] 23 | for fp_py in Path(repo_path).glob("**/*.py"): 24 | if fp_py.is_dir() or any(d in str(fp_py) for d in _IGNORE_DIRS): 25 | continue 26 | python_files.append(fp_py) 27 | return python_files 28 | 29 | 30 | def get_function_definitions(module): 31 | funcdefs = [ 32 | n for n in ast.walk(module) if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) 33 | ] 34 | return funcdefs 35 | 36 | 37 | def iterate_over_expressions(node: ast.AST) -> Iterable[ast.AST]: 38 | """Ignoring function definitions.""" 39 | additionals_subnodes_info: List[Tuple[Tuple, Callable]] = [ 40 | ((ast.If, ast.While), lambda n: [n.test]), 41 | ((ast.For,), lambda n: [n.iter]), 42 | ((ast.AsyncFor,), lambda n: [n.iter]), 43 | ((ast.With, ast.AsyncWith), lambda n: [s.context_expr for s in n.items]), 44 | ] 45 | nodes_with_subnodes = ( 46 | ast.FunctionDef, 47 | ast.AsyncFunctionDef, 48 | ast.If, 49 | ast.For, 50 | ast.AsyncFor, 51 | ast.Module, 52 | ast.ClassDef, 53 | ast.Try, 54 | ast.With, 55 | ast.AsyncWith, 56 | ast.While, 57 | ) 58 | for bases, subnodes_getter in additionals_subnodes_info: 59 | if isinstance(node, bases): 60 | yield from subnodes_getter(node) 61 | nodes_to_iter = ( 62 | _get_try_node_children(node) if isinstance(node, ast.Try) else getattr(node, "body", []) 63 | ) 64 | for child_node in nodes_to_iter: 65 | if isinstance(child_node, nodes_with_subnodes): 66 | if not isinstance(child_node, (ast.FunctionDef, ast.AsyncFunctionDef)): 67 | yield from iterate_over_expressions(child_node) 68 | else: 69 | yield child_node 70 | 71 | 72 | def _get_try_node_children(try_node: ast.Try): 73 | return itertools.chain(try_node.body, try_node.finalbody, *[n.body for n in try_node.handlers]) 74 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | flake8-functions 2 | flake8-expression-complexity 3 | cognitive-complexity 4 | gspread-dataframe 5 | gspread-formatting 6 | plotnine 7 | pandas 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
#    pip-compile requirements.in
#

astpretty==3.0.0
    # via flake8-expression-complexity
cachetools==5.3.0
    # via google-auth
certifi==2022.12.7
    # via requests
charset-normalizer==3.1.0
    # via requests
click==8.1.3
    # via mr-proper
cognitive-complexity==1.3.0
    # via -r requirements.in
contourpy==1.0.7
    # via matplotlib
cycler==0.11.0
    # via matplotlib
flake8==6.0.0
    # via flake8-expression-complexity
flake8-expression-complexity==0.0.11
    # via -r requirements.in
flake8-functions==0.0.8
    # via -r requirements.in
fonttools==4.39.3
    # via matplotlib
google-auth==2.17.3
    # via
    #   google-auth-oauthlib
    #   gspread
google-auth-oauthlib==1.0.0
    # via gspread
gspread==5.8.0
    # via
    #   gspread-dataframe
    #   gspread-formatting
gspread-dataframe==3.3.0
    # via -r requirements.in
gspread-formatting==1.1.2
    # via -r requirements.in
idna==3.4
    # via requests
kiwisolver==1.4.4
    # via matplotlib
matplotlib==3.7.1
    # via
    #   mizani
    #   plotnine
mccabe==0.7.0
    # via flake8
mizani==0.9.0
    # via plotnine
mr-proper==0.0.7
    # via flake8-functions
numpy==1.24.2
    # via
    #   contourpy
    #   matplotlib
    #   mizani
    #   pandas
    #   patsy
    #   plotnine
    #   scipy
    #   statsmodels
oauthlib==3.2.2
    # via requests-oauthlib
packaging==23.1
    # via
    #   matplotlib
    #   statsmodels
pandas==2.0.0
    # via
    #   -r requirements.in
    #   gspread-dataframe
    #   mizani
    #   plotnine
    #   statsmodels
patsy==0.5.3
    # via
    #   plotnine
    #   statsmodels
pillow==9.5.0
    # via matplotlib
plotnine==0.10.1
    # via -r requirements.in
pyasn1==0.4.8
    # via
    #   pyasn1-modules
    #   rsa
pyasn1-modules==0.2.8
    # via google-auth
pycodestyle==2.10.0
    # via flake8
pyflakes==3.0.1
    # via flake8
pyparsing==3.0.9
    # via matplotlib
python-dateutil==2.8.2
    # via
    #   matplotlib
    #   pandas
pytz==2023.3
    # via pandas
requests==2.28.2
    # via requests-oauthlib
requests-oauthlib==1.3.1
    # via google-auth-oauthlib
rsa==4.9
    # via google-auth
scipy==1.10.1
    # via
    #   mizani
    #   plotnine
    #   statsmodels
six==1.16.0
    # via
    #   google-auth
    #   gspread-dataframe
    #   patsy
    #   python-dateutil
statsmodels==0.13.5
    # via plotnine
stdlib-list==0.8.0
    # via mr-proper
tzdata==2023.3
    # via pandas
urllib3==1.26.15
    # via requests

# The following packages are considered to be unsafe in a requirements file:
# setuptools
--------------------------------------------------------------------------------