├── .gitignore
├── Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf
├── Code_Complexity.ipynb
├── LICENSE
├── README.md
├── code
│   ├── __init__.py
│   ├── complexity_metrics.py
│   ├── gsheet_utils.py
│   ├── over_time.py
│   └── parse_code.py
├── requirements.in
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,jupyternotebooks
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,jupyternotebooks
3 |
4 | ### JupyterNotebooks ###
5 | # gitignore template for Jupyter Notebooks
6 | # website: http://jupyter.org/
7 |
8 | .ipynb_checkpoints
9 | */.ipynb_checkpoints/*
10 |
11 | # IPython
12 | profile_default/
13 | ipython_config.py
14 |
15 | # Remove previous ipynb_checkpoints
16 | # git rm -r .ipynb_checkpoints/
17 |
18 | ### Python ###
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 |
24 | # C extensions
25 | *.so
26 |
27 | # Distribution / packaging
28 | .Python
29 | build/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | .eggs/
35 | lib/
36 | lib64/
37 | parts/
38 | sdist/
39 | var/
40 | wheels/
41 | share/python-wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | MANIFEST
46 |
47 | # PyInstaller
48 | # Usually these files are written by a python script from a template
49 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
50 | *.manifest
51 | *.spec
52 |
53 | # Installer logs
54 | pip-log.txt
55 | pip-delete-this-directory.txt
56 |
57 | # Unit test / coverage reports
58 | htmlcov/
59 | .tox/
60 | .nox/
61 | .coverage
62 | .coverage.*
63 | .cache
64 | nosetests.xml
65 | coverage.xml
66 | *.cover
67 | *.py,cover
68 | .hypothesis/
69 | .pytest_cache/
70 | cover/
71 |
72 | # Translations
73 | *.mo
74 | *.pot
75 |
76 | # Django stuff:
77 | *.log
78 | local_settings.py
79 | db.sqlite3
80 | db.sqlite3-journal
81 |
82 | # Flask stuff:
83 | instance/
84 | .webassets-cache
85 |
86 | # Scrapy stuff:
87 | .scrapy
88 |
89 | # Sphinx documentation
90 | docs/_build/
91 |
92 | # PyBuilder
93 | .pybuilder/
94 | target/
95 |
96 | # Jupyter Notebook
97 |
98 | # IPython
99 |
100 | # pyenv
101 | # For a library or package, you might want to ignore these files since the code is
102 | # intended to run in multiple environments; otherwise, check them in:
103 | # .python-version
104 |
105 | # pipenv
106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
109 | # install all needed dependencies.
110 | #Pipfile.lock
111 |
112 | # poetry
113 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
114 | # This is especially recommended for binary packages to ensure reproducibility, and is more
115 | # commonly ignored for libraries.
116 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
117 | #poetry.lock
118 |
119 | # pdm
120 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
121 | #pdm.lock
122 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
123 | # in version control.
124 | # https://pdm.fming.dev/#use-with-ide
125 | .pdm.toml
126 |
127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128 | __pypackages__/
129 |
130 | # Celery stuff
131 | celerybeat-schedule
132 | celerybeat.pid
133 |
134 | # SageMath parsed files
135 | *.sage.py
136 |
137 | # Environments
138 | .env
139 | .venv
140 | env/
141 | venv/
142 | ENV/
143 | env.bak/
144 | venv.bak/
145 |
146 | # Spyder project settings
147 | .spyderproject
148 | .spyproject
149 |
150 | # Rope project settings
151 | .ropeproject
152 |
153 | # mkdocs documentation
154 | /site
155 |
156 | # mypy
157 | .mypy_cache/
158 | .dmypy.json
159 | dmypy.json
160 |
161 | # Pyre type checker
162 | .pyre/
163 |
164 | # pytype static type analyzer
165 | .pytype/
166 |
167 | # Cython debug symbols
168 | cython_debug/
169 |
170 | # PyCharm
171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
173 | # and can be added to the global gitignore or merged into this file. For a more nuclear
174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
175 | #.idea/
176 |
177 | ### Python Patch ###
178 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
179 | poetry.toml
180 |
181 | # ruff
182 | .ruff_cache/
183 |
184 | # LSP config files
185 | pyrightconfig.json
186 |
187 | ### VisualStudioCode ###
188 | .vscode/*
189 | !.vscode/settings.json
190 | !.vscode/tasks.json
191 | !.vscode/launch.json
192 | !.vscode/extensions.json
193 | !.vscode/*.code-snippets
194 |
195 | # Local History for Visual Studio Code
196 | .history/
197 |
198 | # Built Visual Studio Code Extensions
199 | *.vsix
200 |
201 | ### VisualStudioCode Patch ###
202 | # Ignore all local history of files
203 | .history
204 | .ionide
205 |
206 | # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,jupyternotebooks
207 |
--------------------------------------------------------------------------------
/Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corriebar/code-complexity/b772a1b6c599b9d551e8d5ab4481cc5395aa3c8d/Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf
--------------------------------------------------------------------------------
/Code_Complexity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "id": "dac9edb2-734a-4ea6-8b66-991b1ddd7cb0",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "source": [
11 | "# Code Complexity Summary\n",
12 | "\n",
13 | "This notebook generates code complexity summaries and writes them to a Google Sheet.\n",
14 | "\n",
15 | "For each repository, one tab is added.\n",
16 | "\n",
17 | "Before running the code, make sure every repository is on an up-to-date master branch with no untracked Python files.\n",
18 | "\n",
19 | "For more on code complexities and the measures used in this notebook, check these [slides](Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf)."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 5,
25 | "id": "d6936daa-eb5e-44c1-81bf-3389da68153d",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "The autoreload extension is already loaded. To reload it, use:\n",
33 | " %reload_ext autoreload\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "%load_ext autoreload\n",
39 | "%autoreload 2\n",
40 | "\n",
41 | "from pathlib import Path\n",
42 | "\n",
43 | "root = Path.cwd().parent\n",
44 | "repos = [\n",
45 | " root / \"code-complexity\",\n",
46 | " #...\n",
47 | "]"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "id": "a2297d47-d4b3-4351-99db-fdd62f8f6c0b",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
60 | "import pandas as pd\n",
61 | "\n",
62 | "import gspread\n",
63 | "\n",
64 | "from code.gsheet_utils import apply_formatting, return_data_to_write\n",
65 | "from code.complexity_metrics import get_repo_complexities\n",
66 | "\n",
67 | "import warnings\n",
68 | "warnings.simplefilter(action=\"ignore\", category=Warning)\n",
69 | "\n",
70 | "from plotnine import *"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "bcdfc656-b500-4086-990d-f8d461ef94b4",
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "creds_dict = {} # your creds dict\n",
81 | "scopes = [\"https://www.googleapis.com/auth/drive\"]\n",
82 | "gc = gspread.service_account_from_dict(creds_dict, scopes)\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "id": "0a258c10-7104-4c0b-9e0f-619fc67706e7",
88 | "metadata": {
89 | "tags": []
90 | },
91 | "source": [
92 | "## Run for a single repo\n",
93 | "\n",
94 | "If you only want to get the results for a single repo (or folder), you can run the following command:"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "id": "0b34e184-e28d-4ff0-8dfa-f24a8443371f",
101 | "metadata": {
102 | "tags": []
103 | },
104 | "outputs": [
105 | {
106 | "data": {
194 | "text/plain": [
195 | " repo file function_name \n",
196 | "3 code-complexity /code/parse_code.py iterate_over_expressions \\\n",
197 | "1 code-complexity /code/parse_code.py get_all_python_files \n",
198 | "6 code-complexity /code/gsheet_utils.py return_data_to_write \n",
199 | "\n",
200 | " func_lineno func_length cognitive_complexity sum_expression_complexity \n",
201 | "3 33 31 12 18.0 \\\n",
202 | "1 13 10 8 18.0 \n",
203 | "6 45 8 2 9.5 \n",
204 | "\n",
205 | " max_expression_complexity num_arguments num_returns \n",
206 | "3 6.0 1 0 \\\n",
207 | "1 3.5 2 1 \n",
208 | "6 2.5 3 1 \n",
209 | "\n",
210 | " num_module_expressions module_complexity extract_date \n",
211 | "3 4 0.0 2023-04-18 \n",
212 | "1 4 0.0 2023-04-18 \n",
213 | "6 14 7.5 2023-04-18 "
214 | ]
215 | },
216 | "execution_count": 5,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "repo = 'code-complexity'\n",
223 | "\n",
224 | "df = get_repo_complexities(repos[0])\n",
225 | "\n",
226 | "df.head(3)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "id": "ca183151-df4d-430a-a021-943a012ccd68",
232 | "metadata": {},
233 | "source": [
234 | "## Save to Google Sheet"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 6,
240 | "id": "8efa54fa-2a4b-4daa-9664-d86a5e62f813",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "sheet_id = \"\"\n",
245 | "url = f\"https://docs.google.com/spreadsheets/d/{sheet_id}/\"\n",
246 | "sheet = gc.open_by_url(url)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "id": "2857b1c9-e8af-4907-a175-ce59bbbe4409",
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "tabs = [wksh.title for wksh in sheet.worksheets()]\n",
257 | "all_repos = []\n",
258 | "for repo in repos:\n",
259 | " new_df = get_repo_complexities(repo)\n",
260 | " \n",
261 | "    if repo.name not in tabs:\n",
262 | "        wksht = sheet.add_worksheet(title=repo.name, rows=1000, cols=26, index=0)\n",
263 | " df = new_df\n",
264 | " else:\n",
265 | " df = return_data_to_write(sheet, repo.name, new_df)\n",
266 | " wksht = sheet.worksheet(repo.name)\n",
267 | " apply_formatting(wksht, df)\n",
268 | "\n",
270 | " all_repos.append(df)"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "id": "43942b9d-d69b-4d4a-ab44-54ca6abfa051",
276 | "metadata": {},
277 | "source": [
278 | "## Summary Statistics"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 8,
284 | "id": "7637e4fb-8c16-4e01-aef7-6265e5e4d362",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "df_all = pd.concat(all_repos, ignore_index=True).query('repo != \"nan\" & repo.notna()')\n",
289 | "\n",
290 | "df_all['extract_date'] = df_all['extract_date'].replace('nan', None).ffill()"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 9,
296 | "id": "8dde96de-7116-4e9a-a3ab-fa99649151bf",
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "plot_df = (df_all\n",
301 | " .groupby(['repo', 'extract_date'])\n",
302 | " .cognitive_complexity.agg(['mean', 'max', 'median'])\n",
303 | " .reset_index()\n",
304 | " )\n",
305 | "\n",
306 | "plot_df['extract_date'] = pd.to_datetime(plot_df['extract_date'])\n"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 15,
312 | "id": "0d1c141c-4e2e-4001-b2a2-ff501edea21e",
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "(ggplot(plot_df, aes(x='extract_date', y='mean', color='repo'))\n",
317 | " + geom_line(show_legend=False, size=1.5)\n",
318 | " + geom_point(show_legend=False, size=2)\n",
319 | " + scale_x_date(date_labels='%b %Y', breaks=plot_df.extract_date.unique())\n",
320 | " + scale_color_brewer(type='qual', palette='Set2')\n",
321 | " + labs(x='', y='Complexity', title='Code Complexity of our Repos over Time')\n",
322 | " + theme_minimal()\n",
323 | " + theme(figure_size=(8,15),\n",
324 | " legend_position='bottom')\n",
325 | ").draw()"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 14,
331 | "id": "1685a242-9201-4ef3-957c-26f041d33dc1",
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "(ggplot(plot_df, aes(x='extract_date', y='mean', color='repo'))\n",
336 | " + geom_line(show_legend=False)\n",
337 | " + geom_point(show_legend=False)\n",
338 | " + scale_x_date(date_labels='%d %b %Y', breaks=plot_df['extract_date'].unique())\n",
339 | " + scale_color_brewer(type='qual', palette='Paired')\n",
340 | " + labs(x='', y='Max Cognitive Complexity', title='Max Cognitive Complexity of our Repos over Time')\n",
341 | " + theme_minimal()\n",
342 | " + theme(figure_size=(10,6),\n",
343 | " legend_position='bottom')\n",
344 | ").draw()"
345 | ]
346 | }
347 | ],
348 | "metadata": {
349 | "kernelspec": {
350 | "display_name": "code-complexity",
351 | "language": "python",
352 | "name": "code-complexity"
353 | },
354 | "language_info": {
355 | "codemirror_mode": {
356 | "name": "ipython",
357 | "version": 3
358 | },
359 | "file_extension": ".py",
360 | "mimetype": "text/x-python",
361 | "name": "python",
362 | "nbconvert_exporter": "python",
363 | "pygments_lexer": "ipython3",
364 | "version": "3.11.1"
365 | }
366 | },
367 | "nbformat": 4,
368 | "nbformat_minor": 5
369 | }
370 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Corrie Bartelheimer
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Complexity
2 |
3 | The notebook [Code Complexity](Code_Complexity.ipynb) walks through an example workflow using the different [complexity metrics](code/complexity_metrics.py).
4 | 
5 | To run the notebook, you might have to set up a Google Sheets [service account](https://docs.gspread.org/en/v5.7.1/oauth2.html#authentication) for authentication.
6 | To install the necessary Python packages, run the following command in the root directory of the repository:
7 | ```
8 | pip install -r requirements.txt
9 | ```
10 |
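11 | If you only want the metrics without the Google Sheet flow, here is a minimal sketch (the repository path is a placeholder):
12 | 
13 | ```python
14 | from code.complexity_metrics import get_repo_complexities
15 | 
16 | # Per-function complexity metrics, sorted by cognitive complexity
17 | df = get_repo_complexities("path/to/your/repo")
18 | print(df.head())
19 | ```
20 | 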
11 | ## Slides
12 |
13 | The slides for the talk can be found [here](Code_Cleanup-A_Data_Scientists_Guide_to_Sparkling_Code.pdf).
14 |
--------------------------------------------------------------------------------
/code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corriebar/code-complexity/b772a1b6c599b9d551e8d5ab4481cc5395aa3c8d/code/__init__.py
--------------------------------------------------------------------------------
/code/complexity_metrics.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from pathlib import Path
3 |
4 | from flake8_functions.function_length import get_function_start_row, get_function_last_row
5 | from cognitive_complexity.api import get_cognitive_complexity
6 | from flake8_expression_complexity.utils.complexity import get_expression_complexity
7 |
8 | from flake8_functions.function_arguments_amount import get_arguments_amount_for
9 | from flake8_functions.function_returns_amount import get_returns_amount_for
10 |
11 | from code.parse_code import (
12 | iterate_over_expressions,
13 | get_function_definitions,
14 | parse_file,
15 | get_all_python_files,
16 | )
17 |
18 | import pandas as pd
19 |
20 | COMPLEXITY_METRICS = [
21 | "func_length",
22 | "cognitive_complexity",
23 | "sum_expression_complexity",
24 | "max_expression_complexity",
25 | "num_arguments",
26 | "num_returns",
27 | "num_module_expressions",
28 | "module_complexity",
29 | ]
30 | """
31 | Only the metrics columns of the complexity analysis.
32 |
33 | This can be used e.g. to summarize the DataFrame.
34 | """
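35 | # Example (a usage sketch): summarize all metric columns of a complexity DataFrame
36 | #   df[COMPLEXITY_METRICS].mean()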
35 |
36 | COMPLEXITY_COLUMNS = [
37 | "repo",
38 | "file",
39 | "function_name",
40 | "func_lineno",
41 | "extract_date",
42 | *COMPLEXITY_METRICS,
43 | ]
44 | """All columns to expect in a complexity analysis DataFrame."""
45 |
46 | COLUMN_TYPES = {
47 | "repo": str,
48 | "file": str,
49 | "function_name": str,
50 | "func_lineno": "float64",
51 | "func_length": "float64",
52 | "cognitive_complexity": "float64",
53 | "sum_expression_complexity": "float64",
54 | "max_expression_complexity": "float64",
55 | "num_arguments": "float64",
56 | "num_returns": "float64",
57 | "num_module_expressions": "float64",
58 | "module_complexity": "float64",
59 | }
60 |
61 |
62 | def get_function_length(funcdef):
63 | function_start_row = get_function_start_row(funcdef)
64 | function_last_row = get_function_last_row(funcdef)
65 | return function_last_row - function_start_row + 1
66 |
67 |
68 | def get_complexity_per_function(funcdef):
69 | expression_complexities = [
70 | get_expression_complexity(expr) for expr in iterate_over_expressions(funcdef)
71 | ]
72 | return {
73 | "function_name": funcdef.name,
74 | "func_lineno": funcdef.lineno,
75 | "func_length": get_function_length(funcdef),
76 | "cognitive_complexity": get_cognitive_complexity(funcdef),
77 | "sum_expression_complexity": sum(expression_complexities),
78 |         "max_expression_complexity": max(expression_complexities, default=0),  # default guards functions with no yielded expressions
79 | "num_arguments": get_arguments_amount_for(funcdef),
80 | "num_returns": get_returns_amount_for(funcdef),
81 | }
82 |
83 |
84 | def get_module_complexities(module):
85 |     expressions_outside_functions = list(iterate_over_expressions(module))
86 | expression_complexities = [
87 | get_expression_complexity(expr) for expr in expressions_outside_functions
88 | ]
89 | num_expressions = len(expressions_outside_functions)
90 | return {
91 | "num_module_expressions": num_expressions,
92 | "module_complexity": sum(expression_complexities),
93 | }
94 |
95 |
96 | def get_module_function_complexities(module):
97 | complexities = []
98 | funcdefs = get_function_definitions(module)
99 |
100 | for funcdef in funcdefs:
101 | comp_dict = get_complexity_per_function(funcdef)
102 | complexities.append(comp_dict)
103 |
104 | return complexities
105 |
106 |
107 | def get_file_complexities(repo_path: Path, filepath: Path):
108 | module = parse_file(filepath)
109 |
110 | function_complexities = get_module_function_complexities(module)
111 |
112 | module_complexities = get_module_complexities(module)
113 |
114 | rel_path = str(filepath.relative_to(repo_path))
115 | module_function_complexities = [
116 | {**d, **module_complexities, "file": rel_path}
117 | for d in function_complexities
118 | ]
119 | return module_function_complexities
120 |
121 |
122 | def get_repo_complexities(repo_path):
123 | repo_path = Path(repo_path)
124 | repo_name = repo_path.name
125 |
126 | python_files = get_all_python_files(repo_path, repo_name)
127 | complexities = []
128 | for file_path in python_files:
129 | module_function_complexities = get_file_complexities(repo_path, file_path)
130 |
131 | complexities.extend(module_function_complexities)
132 |
133 | df = pd.DataFrame(complexities, columns=COMPLEXITY_COLUMNS)
134 | df["repo"] = repo_name
135 | df = add_extract_date(df)
136 | return df[COMPLEXITY_COLUMNS].sort_values(
137 | by=["cognitive_complexity", "func_length"], ascending=False
138 | )
139 |
140 |
141 | def add_extract_date(df):
142 | today = datetime.datetime.today()
143 | d = df.copy()
144 | d["extract_date"] = str(today.date())
145 | return d
146 |
147 |
148 | def compare_old_new(old_df, new_df):
149 | sort_cols = ["repo", "file", "function_name"]
150 | compare_cols = [
151 | *sort_cols,
152 | "func_lineno",
153 | "func_length",
154 | "cognitive_complexity",
155 | "sum_expression_complexity",
156 | "max_expression_complexity",
157 | "num_arguments",
158 | "num_returns",
159 | "num_module_expressions",
160 | "module_complexity",
161 | ]
162 | old = old_df[compare_cols].sort_values(by=sort_cols).reset_index(drop=True).astype(COLUMN_TYPES)
163 | new = new_df[compare_cols].sort_values(by=sort_cols).reset_index(drop=True).astype(COLUMN_TYPES)
164 | is_equal = old.equals(new)
165 | return is_equal
166 |
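167 | # Example (a sketch): check whether a fresh run differs from the latest stored snapshot
168 | #   is_unchanged = compare_old_new(get_latest_data(old_df), new_df)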
167 |
168 | def get_latest_data(old_data):
169 | return old_data.query("extract_date == extract_date.max()")
170 |
--------------------------------------------------------------------------------
/code/gsheet_utils.py:
--------------------------------------------------------------------------------
1 | from string import ascii_uppercase
2 | from typing import List
3 |
4 | from gspread_dataframe import get_as_dataframe, set_with_dataframe
5 | from gspread_formatting import get_conditional_format_rules
6 | from gspread_formatting.dataframe import format_with_dataframe, set_frozen
7 | from gspread_formatting.models import Color
8 | from gspread_formatting.conditionals import (
9 | ConditionalFormatRule,
10 | GradientRule,
11 | GridRange,
12 | InterpolationPoint,
13 | )
14 |
17 |
18 | import pandas as pd
19 |
20 | from code.complexity_metrics import COLUMN_TYPES, compare_old_new, get_latest_data
21 |
22 |
23 | COLUMN_GRADIENTS = {
24 | "func_length": [0, 50, 80],
25 | "cognitive_complexity": [0, 7, 10],
26 | "sum_expression_complexity": [0, 50, 80],
27 | "max_expression_complexity": [0, 6, 9],
28 | "num_arguments": [0, 4, 7],
29 | "num_returns": [0, 3, 7],
30 | }
31 |
32 | GREEN = Color(0.34117648, 0.73333335, 0.5411765)
33 | YELLOW = Color(0.9843137, 0.7372549, 0.015686275)
34 | RED = Color(1, 0.42745098, 0.003921569)
35 |
36 |
37 | def get_old_data(sheet, repo):
38 | types = {**COLUMN_TYPES, "extract_date": str}
39 | wksh = sheet.worksheet(repo)
40 | old_data = get_as_dataframe(wksh)
41 | old_df = old_data.dropna(axis=1, how="all") # drop empty columns
42 | return old_df.dropna(axis=0, how="all").astype(types)
43 |
44 |
45 | def return_data_to_write(sheet, repo, new_df):
46 | old_df = get_old_data(sheet, repo)
47 | latest = get_latest_data(old_df)
48 | if not compare_old_new(latest, new_df):
49 | print(f"Changes have been made in {repo}!")
50 | df = pd.concat([new_df, old_df], ignore_index=True)
51 | else:
52 | df = old_df
53 | return df
54 |
55 |
56 | def get_cell_ranges(worksheet, cell_ranges: str | List):
57 | ranges = []
58 | if isinstance(cell_ranges, str):
59 | cell_ranges = [cell_ranges]
60 | for cells in cell_ranges:
61 | grid_range = GridRange.from_a1_range(cells, worksheet)
62 | ranges.append(grid_range)
63 | return ranges
64 |
65 |
66 | def get_conditional_format_rule(worksheet, cell_ranges: str | List, gradient_points: List[int]):
67 | minpoint, midpoint, maxpoint = gradient_points
68 |
69 | ranges = get_cell_ranges(worksheet, cell_ranges)
70 | rule = ConditionalFormatRule(
71 | ranges=ranges,
72 | gradientRule=GradientRule(
73 | minpoint=InterpolationPoint(color=GREEN, type="NUMBER", value=str(minpoint)),
74 | midpoint=InterpolationPoint(color=YELLOW, type="NUMBER", value=str(midpoint)),
75 | maxpoint=InterpolationPoint(color=RED, type="NUMBER", value=str(maxpoint)),
76 | ),
77 | )
78 | return rule
79 |
80 |
81 | def map_colname_to_range(df, colname):
82 | col_index = list(df.columns).index(colname)
83 |     if col_index >= len(ascii_uppercase):
84 |         raise NotImplementedError(f"Column '{colname}' maps beyond 'Z'; double-letter ranges are not yet supported")
85 | col_letter = ascii_uppercase[col_index]
86 | return f"{col_letter}:{col_letter}"
87 |
88 |
89 | def set_conditional_rules(df, rules, worksheet):
90 | for col_name, gradient_points in COLUMN_GRADIENTS.items():
91 | cell_range = map_colname_to_range(df, col_name)
92 | rule = get_conditional_format_rule(worksheet, cell_range, gradient_points)
93 | rules.append(rule)
94 | rules.save()
95 |
96 |
97 | def apply_formatting(worksheet, df):
98 | set_with_dataframe(worksheet, df)
99 | format_with_dataframe(worksheet, df, include_column_header=True)
100 | set_frozen(worksheet, rows=1)
101 |
102 | rules = get_conditional_format_rules(worksheet)
103 | set_conditional_rules(df, rules, worksheet)
104 |
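105 | # Example usage (a sketch; the sheet URL and worksheet title are placeholders):
106 | #   import gspread
107 | #   gc = gspread.service_account()
108 | #   sheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/<sheet_id>/")
109 | #   apply_formatting(sheet.worksheet("my-repo"), df)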
--------------------------------------------------------------------------------
/code/over_time.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions to jump through the git history of a repo.
3 | """
4 | from datetime import datetime
5 | import pathlib
6 | import subprocess
7 | from typing import Callable, Dict, Optional, Sequence, Tuple, TypeVar, Union
8 |
9 | import pandas as pd
10 |
11 | from code.complexity_metrics import COMPLEXITY_METRICS, get_repo_complexities
12 |
13 | try:
14 | from fastprogress import progress_bar
15 | except ModuleNotFoundError:
16 |     progress_bar = lambda x: x  # fall back: iterate without a progress bar
17 |
18 |
19 | PathLike = Union[str, pathlib.Path]
20 | T = TypeVar("T")
21 |
22 |
23 | def git_log(dp: PathLike) -> Tuple[str, ...]:
24 | """Returns a tuple of all commit hashes in the git history (newest first)."""
25 | output = subprocess.check_output(["git", "-C", str(dp), "log", '--format=format:"%H"'])
26 | output = output.strip().decode("ascii")
27 | output = output.replace('"', "")
28 | return tuple(output.split("\n"))
29 |
30 |
31 | def git_commit_timestamps(dp: PathLike) -> Dict[str, datetime]:
32 |     """Returns a dict mapping commit hashes to their commit timestamps (newest first)."""
33 | output = subprocess.check_output(["git", "-C", str(dp), "log", '--format=format:"%H|%ci"'])
34 | output = output.strip().decode("ascii")
35 | output = output.replace('"', "")
36 | result = {}
37 | for row in output.split("\n"):
38 | cid, ts = row.split("|")
39 | result[cid] = datetime.fromisoformat(ts)
40 | return result
41 |
42 |
43 | def git_status(dp: PathLike) -> str:
44 | """Returns the git status message."""
45 | output = subprocess.check_output(["git", "-C", str(dp), "status"])
46 | output = output.strip().decode("ascii")
47 | return output
48 |
49 |
50 | def git_current_branch(dp: PathLike) -> str:
51 | """Determines the name of the currently checked-out branch."""
52 | status = git_status(dp)
53 | return status.split("\n")[0].replace("On branch ", "")
54 |
55 |
56 | def git_checkout(dp: PathLike, commit_or_branch: str):
57 | """Check out a specific branch or commit in the repository under `dp`."""
58 | output = subprocess.check_output(["git", "-C", str(dp), "checkout", commit_or_branch], stderr=subprocess.DEVNULL)
59 | output = output.strip().decode("ascii")
60 | return output
61 |
62 |
63 |
64 | def eval_by_commit(
65 | dp: PathLike,
66 | func: Callable[[PathLike], T],
67 | commits: Sequence[str],
68 | *,
69 | raise_on_error: bool = True,
70 | ) -> Dict[str, Optional[T]]:
71 | """Apply `func` to the `dp` for each of the `commits` and return the results.
72 |
73 | Requires the repository at `dp` to be in a clean `git status` state.
74 | In the end, the current branch will be checked out again.
75 |
76 | Parameters
77 | ----------
78 | dp
79 | Path to a local git repository.
80 | func
81 | A callable to apply at each commit.
82 | It should take one parameter `dp` and return something.
83 | commits
84 | A sequence of commits to execute the function at.
85 | raise_on_error
86 | If ``True``, exceptions other than SyntaxErrors are raised.
87 |
88 | Returns
89 | -------
90 | results
91 | Maps commit IDs to return values of the provided callable,
92 | or ``None`` in case of syntax errors at the respective commit.
93 | """
94 | status = git_status(dp)
95 | if not ("working tree clean" in status or "nothing added to commit but untracked" in status):
96 | raise Exception(f"The git status of '{dp}' is unclean:\n\n{status}")
97 | branch = git_current_branch(dp)
98 | results = {}
99 | for commit in progress_bar(commits):
100 | try:
101 | git_checkout(dp, commit)
102 | results[commit] = func(dp)
103 | except SyntaxError:
104 | results[commit] = None
105 |         except Exception:
106 | print(f"Failed to apply function at commit {commit}")
107 | if raise_on_error:
108 | git_checkout(dp, branch)
109 | raise
110 | print(f"\nEvaluations completed. Checking out branch '{branch}'.")
111 | git_checkout(dp, branch)
112 | return results
113 |
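114 | # Example usage (a sketch; the repository path is a placeholder):
115 | #   commits = git_log("path/to/repo")[:5]
116 | #   results = eval_by_commit(
117 | #       "path/to/repo",
118 | #       func=lambda dp: len(list(pathlib.Path(dp).glob("**/*.py"))),  # e.g. count Python files
119 | #       commits=commits,
120 | #   )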
114 |
115 | def complexity_by_commit(dp: PathLike, commits: Optional[Sequence[str]] = None) -> pd.DataFrame:
116 | """Convenience wrapper around ``eval_by_commit`` to determine mean code complexity metrics over time.
117 |
118 | Parameters
119 | ----------
120 | dp
121 | Path to a local git repository.
122 | commits
123 | Optional sequence of commits IDs to run for.
124 | Defaults to the entire git history of the repo.
125 |
126 | Returns
127 | -------
128 | df
129 | DataFrame indexed by `commit_id`,
130 |         with `timestamp` and mean `COMPLEXITY_METRICS` columns.
131 | """
132 | if not commits:
133 | commits = git_log(dp)
134 |
135 | results = eval_by_commit(
136 | dp=dp,
137 | func=lambda dp: get_repo_complexities(dp).set_index("repo")[COMPLEXITY_METRICS].mean(),
138 | commits=commits,
139 | )
140 |
141 | # Summarize in a DataFrame with None-results as NA rows
142 | results_notna = {k:v for k,v in results.items() if v is not None}
143 | df = pd.DataFrame.from_dict(results_notna, orient="index")
144 | df.index.name = "commit_id"
145 | # Re-insert rows with None-results
146 | for k, v in results.items():
147 | if v is None:
148 | df.loc[k] = pd.NA
149 |
150 | # Determine commit timestamps
151 | timestamps = git_commit_timestamps(dp)
152 | df["timestamp"] = [timestamps[row.Index] for row in df.itertuples()]
153 | return df.sort_values("timestamp")
154 |
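155 | # Example usage (a sketch; the repository path is a placeholder):
156 | #   history = complexity_by_commit("path/to/repo")
157 | #   history.plot(x="timestamp", y="cognitive_complexity")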
--------------------------------------------------------------------------------
/code/parse_code.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import itertools
3 | import os
4 | from pathlib import Path
5 | from typing import Callable, Iterable, List, Tuple
6 |
7 |
8 | _IGNORE_DIRS = {
9 | "notebooks",
10 | ".ipynb_checkpoints",
11 | os.sep + "build" + os.sep,
12 | }
13 |
14 |
15 | def parse_file(filepath):
16 | with open(filepath) as f:
17 | file_parsed = ast.parse(f.read())
18 | return file_parsed
19 |
20 |
21 | def get_all_python_files(repo_path, repo) -> List[Path]:
22 | python_files = []
23 | for fp_py in Path(repo_path).glob("**/*.py"):
24 | if fp_py.is_dir() or any(d in str(fp_py) for d in _IGNORE_DIRS):
25 | continue
26 | python_files.append(fp_py)
27 | return python_files
28 |
29 |
30 | def get_function_definitions(module):
31 | funcdefs = [
32 | n for n in ast.walk(module) if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
33 | ]
34 | return funcdefs
35 |
36 |
37 | def iterate_over_expressions(node: ast.AST) -> Iterable[ast.AST]:
38 |     """Yield the expressions under `node`, descending into compound statements but skipping nested function definitions."""
39 | additionals_subnodes_info: List[Tuple[Tuple, Callable]] = [
40 | ((ast.If, ast.While), lambda n: [n.test]),
41 | ((ast.For,), lambda n: [n.iter]),
42 | ((ast.AsyncFor,), lambda n: [n.iter]),
43 | ((ast.With, ast.AsyncWith), lambda n: [s.context_expr for s in n.items]),
44 | ]
45 | nodes_with_subnodes = (
46 | ast.FunctionDef,
47 | ast.AsyncFunctionDef,
48 | ast.If,
49 | ast.For,
50 | ast.AsyncFor,
51 | ast.Module,
52 | ast.ClassDef,
53 | ast.Try,
54 | ast.With,
55 | ast.AsyncWith,
56 | ast.While,
57 | )
58 | for bases, subnodes_getter in additionals_subnodes_info:
59 | if isinstance(node, bases):
60 | yield from subnodes_getter(node)
61 | nodes_to_iter = (
62 | _get_try_node_children(node) if isinstance(node, ast.Try) else getattr(node, "body", [])
63 | )
64 | for child_node in nodes_to_iter:
65 | if isinstance(child_node, nodes_with_subnodes):
66 | if not isinstance(child_node, (ast.FunctionDef, ast.AsyncFunctionDef)):
67 | yield from iterate_over_expressions(child_node)
68 | else:
69 | yield child_node
70 |
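71 | # Example (a sketch): count the expressions in a parsed module
72 | #   module = parse_file("code/parse_code.py")
73 | #   n_expressions = sum(1 for _ in iterate_over_expressions(module))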
71 |
72 | def _get_try_node_children(try_node: ast.Try):
73 | return itertools.chain(try_node.body, try_node.finalbody, *[n.body for n in try_node.handlers])
74 |
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
1 | flake8-functions
2 | flake8-expression-complexity
3 | cognitive-complexity
4 | gspread-dataframe
5 | gspread-formatting
6 | plotnine
7 | pandas
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.11
3 | # by the following command:
4 | #
5 | # pip-compile requirements.in
6 | #
7 |
8 | astpretty==3.0.0
9 | # via flake8-expression-complexity
10 | cachetools==5.3.0
11 | # via google-auth
12 | certifi==2022.12.7
13 | # via requests
14 | charset-normalizer==3.1.0
15 | # via requests
16 | click==8.1.3
17 | # via mr-proper
18 | cognitive-complexity==1.3.0
19 | # via -r requirements.in
20 | contourpy==1.0.7
21 | # via matplotlib
22 | cycler==0.11.0
23 | # via matplotlib
24 | flake8==6.0.0
25 | # via flake8-expression-complexity
26 | flake8-expression-complexity==0.0.11
27 | # via -r requirements.in
28 | flake8-functions==0.0.8
29 | # via -r requirements.in
30 | fonttools==4.39.3
31 | # via matplotlib
32 | google-auth==2.17.3
33 | # via
34 | # google-auth-oauthlib
35 | # gspread
36 | google-auth-oauthlib==1.0.0
37 | # via gspread
38 | gspread==5.8.0
39 | # via
40 | # gspread-dataframe
41 | # gspread-formatting
42 | gspread-dataframe==3.3.0
43 | # via -r requirements.in
44 | gspread-formatting==1.1.2
45 | # via -r requirements.in
46 | idna==3.4
47 | # via requests
48 | kiwisolver==1.4.4
49 | # via matplotlib
50 | matplotlib==3.7.1
51 | # via
52 | # mizani
53 | # plotnine
54 | mccabe==0.7.0
55 | # via flake8
56 | mizani==0.9.0
57 | # via plotnine
58 | mr-proper==0.0.7
59 | # via flake8-functions
60 | numpy==1.24.2
61 | # via
62 | # contourpy
63 | # matplotlib
64 | # mizani
65 | # pandas
66 | # patsy
67 | # plotnine
68 | # scipy
69 | # statsmodels
70 | oauthlib==3.2.2
71 | # via requests-oauthlib
72 | packaging==23.1
73 | # via
74 | # matplotlib
75 | # statsmodels
76 | pandas==2.0.0
77 | # via
78 | # -r requirements.in
79 | # gspread-dataframe
80 | # mizani
81 | # plotnine
82 | # statsmodels
83 | patsy==0.5.3
84 | # via
85 | # plotnine
86 | # statsmodels
87 | pillow==9.5.0
88 | # via matplotlib
89 | plotnine==0.10.1
90 | # via -r requirements.in
91 | pyasn1==0.4.8
92 | # via
93 | # pyasn1-modules
94 | # rsa
95 | pyasn1-modules==0.2.8
96 | # via google-auth
97 | pycodestyle==2.10.0
98 | # via flake8
99 | pyflakes==3.0.1
100 | # via flake8
101 | pyparsing==3.0.9
102 | # via matplotlib
103 | python-dateutil==2.8.2
104 | # via
105 | # matplotlib
106 | # pandas
107 | pytz==2023.3
108 | # via pandas
109 | requests==2.28.2
110 | # via requests-oauthlib
111 | requests-oauthlib==1.3.1
112 | # via google-auth-oauthlib
113 | rsa==4.9
114 | # via google-auth
115 | scipy==1.10.1
116 | # via
117 | # mizani
118 | # plotnine
119 | # statsmodels
120 | six==1.16.0
121 | # via
122 | # google-auth
123 | # gspread-dataframe
124 | # patsy
125 | # python-dateutil
126 | statsmodels==0.13.5
127 | # via plotnine
128 | stdlib-list==0.8.0
129 | # via mr-proper
130 | tzdata==2023.3
131 | # via pandas
132 | urllib3==1.26.15
133 | # via requests
134 |
135 | # The following packages are considered to be unsafe in a requirements file:
136 | # setuptools
137 |
--------------------------------------------------------------------------------