├── .gitignore ├── LICENSE ├── README.md ├── example ├── sample_code.ipynb ├── sample_data_ChatterBot_Requirements.csv ├── sample_viz.png └── sample_viz.svg ├── pycatflow ├── __init__.py ├── input.py └── viz.py ├── pyproject.toml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | 3 | example/pycatflow 4 | **/.DS_Store 5 | **/.idea/ 6 | 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | 139 | # pytype static type analyzer 140 | .pytype/ 141 | 142 | # Cython debug symbols 143 | cython_debug/ 144 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Marcus Burkhardt 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5531785.svg)](https://doi.org/10.5281/zenodo.5531785) 2 | 3 | # PyCatFlow 4 | 5 | This package is a visualization tool which allows the representation of temporal developments, based on categorical data. I wrote a short [article on Medium](https://medium.com/@bumatic/pycatflow-visualizing-categorical-data-over-time-b344102bcce2) in which I outline the basic idea of PyCatFlow and provide a Tutorial for non-programmers based on a [Jupyter Notebook with interactive widgets that can be run online](https://mybinder.org/v2/gist/bumatic/83c3423595cde010da7ad059c6b8b2f5/HEAD). 6 | 7 | ## Install 8 | 9 | PyCatFlow is available on PyPi: 10 | 11 | ```Shell 12 | $ pip3 install pycatflow 13 | ``` 14 | Alternatively you can download the repository and install the package by running 15 | the setup.py install routine. Make sure to install the requirements as well: 16 | 17 | ```python 18 | pip3 install -r requirements.txt 19 | python3 setup.py install 20 | ``` 21 | 22 | **Additional Requirements:** 23 | The visualization and export is based on the [drawSvg](https://pypi.org/project/drawSvg/) package that 24 | in turn requires cairo to be installed as an external requirement. Platform-specific instructions for installing cairo are available on the 25 | [cairo homepage](https://www.cairographics.org/download/). 
26 | 27 | On macOS cairo can be installed easily using [homebrew](https://brew.sh/): 28 | 29 | ```Bash 30 | $ brew install cairo 31 | ``` 32 | 33 | ## Basic usage 34 | 35 | The visualization library provides many functionalities for adjusting the visual output. A simple use case is however as follows: 36 | 37 | ```Python 38 | import pycatflow as pcf 39 | 40 | # Loading and parsing data: 41 | data = pcf.read_file("sample_data_ChatterBot_Requirements.csv", columns="column", nodes="items", categories="category", column_order="column order") 42 | 43 | # Generating the visualization 44 | viz = pcf.visualize(data, spacing=20, width=800, maxValue=20, minValue=2) 45 | viz.savePng('sample_viz.png') 46 | viz.saveSvg('sample_viz.svg') 47 | viz 48 | ``` 49 | 50 | The code and sample data are provided in the example folder. The data contains 51 | annual snapshots of requirements of the [ChatterBots framework](https://github.com/gunthercox/ChatterBot) 52 | developed and maintained by Gunther Cox. 53 | 54 | Running the above code creates this visualization: 55 | 56 | ![Sample Visualization](https://raw.githubusercontent.com/bumatic/PyCatFlow/main/example/sample_viz.svg) 57 | 58 | 59 | ## Credits & License 60 | 61 | PyCatFlow was conceptualized by Marcus Burkhardt and implemented in collaboration with Herbert Natta ([@herbertmn](https://github.com/herbertmn)). It is inspired by the Rankflow visualization tool developed by Bernhard Rieder. 62 | 63 | **Cite as:** Marcus Burkhardt, and Herbert Natta. 2021. “PyCatFlow: A Python Package for Visualizing Categorical Data over Time”. Zenodo. https://doi.org/10.5281/zenodo.5531785. 64 | 65 | The package is released under MIT License.
66 | 67 | 68 | -------------------------------------------------------------------------------- /example/sample_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pycatflow as pcf" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Load and parse data from file \n", 19 | "fname=\"sample_data_ChatterBot_Requirements.csv\"\n", 20 | "data = pcf.read_file(fname, columns=\"column\", nodes=\"items\", categories=\"category\", column_order=\"column order\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# Generate visualization\n", 30 | "\n", 31 | "'''\n", 32 | "visualize(data, spacing=50, node_size=10, width=None, height=None, minValue=1, maxValue=10, node_scaling=\"linear\",\n", 33 | " connection_type=\"semi-curved\", color_startEnd=True, color_categories=True, nodes_color=\"gray\",\n", 34 | " start_node_color=\"green\", end_node_color=\"red\", palette=None, show_labels=True,\n", 35 | " label_text=\"item\", label_font=\"sans-serif\", label_color=\"black\", label_size=5,\n", 36 | " label_shortening=\"clip\", label_position=\"nodes\", line_opacity=0.5, line_stroke_color=\"white\",\n", 37 | " line_stroke_width=0.5, line_stroke_thick=0.5, legend=True, sort_by=\"frequency\")\n", 38 | "'''\n", 39 | "\n", 40 | "viz = pcf.visualize(data, spacing=20, width=800, maxValue=20, minValue=2)\n", 41 | "\n", 42 | "#save visualization to files\n", 43 | "viz.savePng('sample_viz.png')\n", 44 | "viz.saveSvg('sample_viz.svg')\n", 45 | "\n", 46 | "#show visualization\n", 47 | "viz" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | 
"# Another option is to visualize the graph with curved connections.\n", 57 | "# The implementation of this connection type draws on https://github.com/bernorieder/RankFlow\n", 58 | "\n", 59 | "viz = pcf.visualize(data, spacing=20, width=800, maxValue=20, minValue=2, connection_type='curved')\n", 60 | "viz" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# The third option are straight connections between nodes\n", 70 | "viz = pcf.visualize(data, spacing=20, width=800, maxValue=20, minValue=2, connection_type='straight')\n", 71 | "\n", 72 | "# show visualization\n", 73 | "viz" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.8.8" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 4 105 | } 106 | -------------------------------------------------------------------------------- /example/sample_data_ChatterBot_Requirements.csv: -------------------------------------------------------------------------------- 1 | column items category column order 2 | 2015-09-08 fuzzywuzzy A_Requirements 1 3 | 2015-09-08 requests A_Requirements 1 4 | 2015-09-08 requests-oauthlib A_Requirements 1 5 | 2015-09-08 pymongo A_Requirements 1 6 | 2015-09-08 jsondatabase A_Requirements 1 7 | 2016-09-08 jsondatabase A_Requirements 2 8 | 2016-09-08 sqlalchemy A_Requirements 2 9 | 2016-09-08 textblob A_Requirements 2 10 | 2016-09-08 nltk A_Requirements 2 11 | 2016-09-08 pymongo A_Requirements 
2 12 | 2016-09-08 fuzzywuzzy A_Requirements 2 13 | 2016-09-08 python-twitter A_Requirements 2 14 | 2017-09-08 python-twitter A_Requirements 3 15 | 2017-09-08 pymongo A_Requirements 3 16 | 2017-09-08 jsondatabase A_Requirements 3 17 | 2017-09-08 chatterbot-corpus A_Requirements 3 18 | 2017-09-08 python-dateutil A_Requirements 3 19 | 2017-09-08 nltk A_Requirements 3 20 | 2017-09-08 sqlalchemy A_Requirements 3 21 | 2017-09-08 nose-exclude B_Developers Requirements 3 22 | 2017-09-08 pip B_Developers Requirements 3 23 | 2017-09-08 twine B_Developers Requirements 3 24 | 2017-09-08 twython B_Developers Requirements 3 25 | 2017-09-08 sphinx_rtd_theme B_Developers Requirements 3 26 | 2017-09-08 sphinx B_Developers Requirements 3 27 | 2017-09-08 factory-boy B_Developers Requirements 3 28 | 2017-09-08 nose B_Developers Requirements 3 29 | 2017-09-08 mock B_Developers Requirements 3 30 | 2017-09-08 flake8 B_Developers Requirements 3 31 | 2017-09-08 coveralls B_Developers Requirements 3 32 | 2018-09-08 chatterbot-corpus A_Requirements 4 33 | 2018-09-08 mathparse A_Requirements 4 34 | 2018-09-08 nltk A_Requirements 4 35 | 2018-09-08 pymongo A_Requirements 4 36 | 2018-09-08 python-twitter A_Requirements 4 37 | 2018-09-08 sqlalchemy A_Requirements 4 38 | 2018-09-08 pint A_Requirements 4 39 | 2018-09-08 python-dateutil A_Requirements 4 40 | 2018-09-08 sphinx B_Developers Requirements 4 41 | 2018-09-08 sphinx_rtd_theme B_Developers Requirements 4 42 | 2018-09-08 twython B_Developers Requirements 4 43 | 2018-09-08 twine B_Developers Requirements 4 44 | 2018-09-08 nose B_Developers Requirements 4 45 | 2018-09-08 flake8 B_Developers Requirements 4 46 | 2018-09-08 coveralls B_Developers Requirements 4 47 | 2018-09-08 factory-boy B_Developers Requirements 4 48 | 2019-09-08 pint A_Requirements 5 49 | 2019-09-08 mathparse A_Requirements 5 50 | 2019-09-08 nltk A_Requirements 5 51 | 2019-09-08 python-dateutil A_Requirements 5 52 | 2019-09-08 pyyaml A_Requirements 5 53 | 2019-09-08 spacy 
A_Requirements 5 54 | 2019-09-08 sqlalchemy A_Requirements 5 55 | 2019-09-08 pytz A_Requirements 5 56 | 2019-09-08 chatterbot-corpus A_Requirements 5 57 | 2019-09-08 coveralls B_Developers Requirements 5 58 | 2019-09-08 nose B_Developers Requirements 5 59 | 2019-09-08 twine B_Developers Requirements 5 60 | 2019-09-08 sphinx B_Developers Requirements 5 61 | 2019-09-08 sphinx_rtd_theme B_Developers Requirements 5 62 | 2019-09-08 spacy_en_core_web_sm-2.1.0 B_Developers Requirements 5 63 | 2019-09-08 spacy_de_core_news_sm-2.1.0 B_Developers Requirements 5 64 | 2019-09-08 pymongo B_Developers Requirements 5 65 | 2019-09-08 flake8 B_Developers Requirements 5 66 | 2019-09-08 twython B_Developers Requirements 5 67 | 2020-09-08 pytz A_Requirements 5 68 | 2020-09-08 python-dateutil A_Requirements 5 69 | 2020-09-08 sqlalchemy A_Requirements 5 70 | 2020-09-08 mathparse A_Requirements 5 71 | 2020-09-08 pint B_Developers Requirements 5 72 | 2020-09-08 flake8 B_Developers Requirements 5 73 | 2020-09-08 nltk B_Developers Requirements 5 74 | 2020-09-08 spacy_de_core_news_sm-2.1.0 B_Developers Requirements 5 75 | 2020-09-08 twython B_Developers Requirements 5 76 | 2020-09-08 spacy B_Developers Requirements 5 77 | 2020-09-08 sphinx B_Developers Requirements 5 78 | 2020-09-08 sphinx_rtd_theme B_Developers Requirements 5 79 | 2020-09-08 pyyaml B_Developers Requirements 5 80 | 2020-09-08 chatterbot-corpus B_Developers Requirements 5 81 | 2020-09-08 spacy_en_core_web_sm-2.1.0 B_Developers Requirements 5 82 | 2020-09-08 twine B_Developers Requirements 5 83 | 2020-09-08 pymongo B_Developers Requirements 5 84 | 2020-09-08 coveralls B_Developers Requirements 5 85 | 2020-09-08 nose B_Developers Requirements 5 86 | 2021-05-17 python-dateutil A_Requirements 6 87 | 2021-05-17 pytz A_Requirements 6 88 | 2021-05-17 mathparse A_Requirements 6 89 | 2021-05-17 sqlalchemy A_Requirements 6 90 | 2021-05-17 coveralls B_Developers Requirements 6 91 | 2021-05-17 flake8 B_Developers Requirements 6 92 | 
2021-05-17 nltk B_Developers Requirements 6 93 | 2021-05-17 nose B_Developers Requirements 6 94 | 2021-05-17 pint B_Developers Requirements 6 95 | 2021-05-17 pymongo B_Developers Requirements 6 96 | 2021-05-17 twine B_Developers Requirements 6 97 | 2021-05-17 twython B_Developers Requirements 6 98 | 2021-05-17 spacy B_Developers Requirements 6 99 | 2021-05-17 sphinx B_Developers Requirements 6 100 | 2021-05-17 sphinx_rtd_theme B_Developers Requirements 6 101 | 2021-05-17 pyyaml B_Developers Requirements 6 102 | 2021-05-17 chatterbot-corpus B_Developers Requirements 6 103 | 2021-05-17 spacy_en_core_web_sm-2.1.0 B_Developers Requirements 6 104 | 2021-05-17 spacy_de_core_news_sm-2.1.0 B_Developers Requirements 6 -------------------------------------------------------------------------------- /example/sample_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bumatic/PyCatFlow/e94d8c5eba4fc965b90b6c8cdccc3ee4be005435/example/sample_viz.png -------------------------------------------------------------------------------- /example/sample_viz.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 
151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 2015-09-08 278 | 2016-09-08 279 | 2017-09-08 280 | 2018-09-08 281 | 2019-09-08 282 | 2021-05-17 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | pymongo 340 | 341 | jsondatabase 342 | 343 | fuzzywuzzy 344 | 345 | requests 346 | 347 | requests-oauthlib 348 | 349 | pymongo 350 | 351 | nltk 352 | 353 | sqlalchemy 354 | 355 | jsondatabase 356 | 357 | python-twitter 358 | 359 | fuzzywuzzy 360 | 361 | textblob 362 | 363 | pymongo 364 | 365 | nltk 366 | 367 | sqlalchemy 368 | 369 | chatterbot-corpus 370 | 371 | coveralls 372 | 373 | flake8 374 | 375 | nose 376 | 377 | python-dateutil 378 | 379 | sphinx 380 | 381 | sphinx_rtd_theme 382 | 383 | twine 384 | 385 | twython 386 | 387 | jsondatabase 388 | 389 | python-twitter 390 | 391 | factory-boy 392 | 393 | mock 394 | 395 | nose-exclude 396 | 397 | pip 398 | 399 | pymongo 400 | 401 | nltk 402 | 403 | sqlalchemy 404 | 405 | chatterbot-corpus 406 | 407 | coveralls 408 | 409 | flake8 410 | 411 
| nose 412 | 413 | python-dateutil 414 | 415 | sphinx 416 | 417 | sphinx_rtd_theme 418 | 419 | twine 420 | 421 | twython 422 | 423 | mathparse 424 | 425 | pint 426 | 427 | python-twitter 428 | 429 | factory-boy 430 | 431 | pymongo 432 | 433 | nltk 434 | 435 | sqlalchemy 436 | 437 | chatterbot-corpus 438 | 439 | coveralls 440 | 441 | flake8 442 | 443 | nose 444 | 445 | python-dateutil 446 | 447 | sphinx 448 | 449 | sphinx_rtd_theme 450 | 451 | twine 452 | 453 | twython 454 | 455 | mathparse 456 | 457 | pint 458 | 459 | pytz 460 | 461 | pyyaml 462 | 463 | spacy 464 | 465 | spacy_de_core_news_sm-2.1.0 466 | 467 | spacy_en_core_web_sm-2.1.0 468 | 469 | pymongo 470 | 471 | nltk 472 | 473 | sqlalchemy 474 | 475 | chatterbot-corpus 476 | 477 | coveralls 478 | 479 | flake8 480 | 481 | nose 482 | 483 | python-dateutil 484 | 485 | sphinx 486 | 487 | sphinx_rtd_theme 488 | 489 | twine 490 | 491 | twython 492 | 493 | mathparse 494 | 495 | pint 496 | 497 | pytz 498 | 499 | pyyaml 500 | 501 | spacy 502 | 503 | spacy_de_core_news_sm-2.1.0 504 | 505 | spacy_en_core_web_sm-2.1.0 506 | Legend 507 | 508 | B_Developers Requirements 509 | 510 | A_Requirements 511 | -------------------------------------------------------------------------------- /pycatflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .input import * 2 | from .viz import * -------------------------------------------------------------------------------- /pycatflow/input.py: -------------------------------------------------------------------------------- 1 | def find_delimiter(data): 2 | """ 3 | This function finds and returns the delimiter of the given data. 
4 | 5 | Parameters: 6 | data (string): data in which to look for the used delimiter 7 | 8 | Returns: 9 | (string): delimiter used in given data 10 | """ 11 | if type(data) == str: 12 | headers = data.split("\n")[0] 13 | else: 14 | headers = data.decode("utf-8").split("\n")[0] 15 | delimiters = [",", ";", "\t"] # Removed: , "\s", "|" 16 | l = {} 17 | for d in delimiters: 18 | count = 0 19 | for c in headers: 20 | if c.find(d) != -1: 21 | count += 1 22 | l[d] = count 23 | return [k for k, v in l.items() if v == max(l.values())][0] 24 | 25 | 26 | def detect_dtype(data, prefix): 27 | """ 28 | Transforms objects inside data into the correct datatypes and returns a sorted and duplicate-free list. 29 | 30 | Parameters: 31 | data (list): a single column 32 | prefix (str): delete an unwanted prefix out of the data 33 | 34 | Returns: 35 | (list): original data without duplicates in correct datatypes 36 | """ 37 | t1 = [] 38 | t2 = [] 39 | for x in data: 40 | x = x.replace(prefix, "") 41 | try: 42 | t1.append(int(x)) 43 | t2.append("int") 44 | except ValueError: 45 | try: 46 | t1.append(float(x)) 47 | t2.append("float") 48 | except ValueError: 49 | from dateutil.parser import parse, ParserError 50 | try: 51 | t1.append(parse(x)) 52 | t2.append("date") 53 | except ParserError: 54 | t1.append(x) 55 | t2.append("string") 56 | continue 57 | t = [] 58 | for k in set(t2): 59 | [t.append(data[t1.index(h)]) for h in sorted([x for x, y in zip(t1, t2) if y == k]) if h not in t] 60 | return t 61 | 62 | 63 | def prepare_data(data, columns_data, node_data, category_data, orientation, sort_field, prefix): 64 | """ 65 | Arranges the data into a new format to make is usable for the flow visualisation. 
66 | 67 | Parameters: 68 | data (dict): original data transformed to a dict 69 | columns_data (str): Name of the column with temporal data (None if orientation="vertical") 70 | node_data (str): which column to use as nodes in the graph 71 | category_data (str): Name of the column containing optional categories of nodes 72 | orientation (str): Horizontal if the temporal data are in one columns, vertical if the temporal data are the name of the column 73 | sort_field (str): Optionally provide the name of a column determining the order of the time_field columns 74 | prefix (str): delete an unwanted prefix out of the data 75 | 76 | Returns: 77 | (dict): Dictionary of parsed data 78 | """ 79 | new_data = {} 80 | if orientation == 'horizontal': 81 | if sort_field is None: 82 | columns = detect_dtype(data[columns_data], prefix) 83 | else: 84 | columns = [] 85 | n_sort_field = [int(x) for x in data[sort_field]] 86 | [columns.append(data[columns_data][n_sort_field.index(x)]) for x in sorted(n_sort_field) if x not in columns] 87 | 88 | tags = data[node_data] 89 | counts = [[x for x in tags].count(x) for x in tags] 90 | if category_data is not None: 91 | for l in columns: 92 | d = {x: (z, y) for t, x, y, z in zip(data[columns_data], tags, data[category_data], counts) if l == t} 93 | new_data[l] = {k: v for k, v in d.items()} 94 | else: 95 | for l in columns: 96 | d = {x: z for t, x, z in zip(data[columns_data], tags, counts) if l == t} 97 | new_data[l] = {k: v for k, v in d.items()} 98 | else: 99 | if category_data is not None: 100 | columns = detect_dtype(list(data.keys()), prefix) 101 | 102 | tags = [] 103 | for l in columns: 104 | [tags.append(y) for y in data[l]] 105 | counts = [[x for x in tags].count(x) for x in tags] 106 | for l in columns: 107 | data[l+"_count"] = [counts[tags.index(x)] for x in data[l]] 108 | d = {x: (z, y) for x, y, z in zip(data[l], data[l + category_data], data[l + "_count"])} 109 | new_data[l] = {k: v for k, v in d.items()} 110 | else: 111 | 
types = detect_dtype(list(data.keys()), prefix) 112 | columns = detect_dtype(list(data.keys()), prefix) 113 | tags = [] 114 | for l in columns: 115 | [tags.append(y) for y in data[l]] 116 | counts = [[x for x in tags].count(x) for x in tags] 117 | for l in columns: 118 | data[l+"_count"] = [counts[tags.index(x)] for x in data[l]] 119 | d = {x: z for x, z in zip(data[l], data[l+"_count"])} 120 | new_data[l] = {k: v for k, v in d.items()} 121 | return new_data 122 | 123 | 124 | def read_file(filepath, 125 | columns=None, 126 | nodes=None, 127 | categories=None, 128 | column_order=None, 129 | orientation="horizontal", 130 | delimiter=None, 131 | line_delimiter=None, 132 | prefix=""): 133 | """ 134 | Loads data from file and returns structured data for visualisation. 135 | 136 | Parameters: 137 | filepath (str): Path to file 138 | columns (str): Name of the column with temporal data (leave None if orientation="vertical") 139 | nodes (str): Name of the column containing the node data 140 | categories (str): Name of the column containing optional categories of nodes 141 | column_order (str): Optionally provide the name of a column determining the order of the columns 142 | orientation (str): Horizontal if the temporal data are in one columns, vertical if the temporal data are the name of the column 143 | delimiter (str): Otpionally specify the delimiter, if None it will try to autodetect 144 | line_delimiter (str): optionally define the line_delimiter separator, by default \n 145 | prefix (str): delete an unwanted prefix out of the data 146 | 147 | Returns: 148 | (dict): Dictionary of parsed data 149 | """ 150 | 151 | with open(filepath, "rb") as f: 152 | data = f.read() 153 | if delimiter is None: 154 | delimiter = find_delimiter(data) 155 | else: 156 | delimiter = delimiter 157 | if line_delimiter is None: 158 | line_delimiter = "\n" 159 | else: 160 | line_delimiter = line_delimiter 161 | headers = data.decode("utf-8-sig").split(line_delimiter)[0].split(delimiter) 162 
| lines = data.decode("utf-8-sig").split(line_delimiter)[1:] 163 | lines = [line for line in lines if line != ''] 164 | data = {} 165 | for h in headers: 166 | data[h.replace('\r', '')] = [line.split(delimiter)[headers.index(h)].replace('\r', '') for line in lines] 167 | 168 | data = prepare_data(data, columns, nodes, categories, orientation, column_order, prefix) 169 | return data 170 | 171 | 172 | def read(data, 173 | columns=None, 174 | nodes=None, 175 | categories=None, 176 | column_order=None, 177 | orientation="horizontal", 178 | delimiter=None, 179 | line_delimiter=None, 180 | prefix=""): 181 | """ 182 | Parses a string into structured data for visualization. 183 | 184 | Parameters: 185 | data (str): String with records divided by line_delimiter and fields divided by delimiter; list of lists with the first element as list of headers; dictionary with headers as keys and values as lists 186 | columns (str): Name of the column with temporal data (leave None if orientation="vertical") 187 | nodes (str): Name of the column containing the node data 188 | categories (str): Name of the column containing optional categories of nodes 189 | column_order (str): Optionally provide the name of a column determining the order of the columns 190 | orientation (str): Horizontal if the temporal data are in one columns, vertical if the temporal data are the name of the column 191 | delimiter (str): Otpionally specify the delimiter, if None it will try to autodetect 192 | line_delimiter (str): optionally define the line_delimiter separator, by default \n 193 | prefix (str): delete an unwanted prefix out of the data 194 | 195 | Returns: 196 | (dict): Dictionary of parsed data 197 | """ 198 | 199 | if type(data) == str: 200 | if delimiter is None: 201 | delimiter = find_delimiter(data) 202 | else: 203 | delimiter = delimiter 204 | if line_delimiter is None: 205 | line_delimiter = "\n" 206 | else: 207 | line_delimiter = line_delimiter 208 | headers = 
data.split(line_delimiter)[0].split(delimiter) 209 | lines = data.split(line_delimiter)[1:] 210 | data = {} 211 | for h in headers: 212 | # data[h] = [line.split(delimiter)[headers.index(h)] for line in lines] 213 | data[h.replace('\r', '')] = [line.split(delimiter)[headers.index(h)].replace('\r', '') for line in lines] 214 | if type(data) == list: 215 | headers = data[0] 216 | lines = data[1:] 217 | data = {} 218 | for h in headers: 219 | data[h.replace('\r', '')] = [line.split(delimiter)[headers.index(h)].replace('\r', '') for line in lines] 220 | data = prepare_data(data, columns, nodes, categories, orientation, column_order, prefix) 221 | return data -------------------------------------------------------------------------------- /pycatflow/viz.py: -------------------------------------------------------------------------------- 1 | import drawSvg as draw 2 | from matplotlib import cm,colors 3 | import pycatflow as pcf 4 | import math 5 | import copy 6 | 7 | debug_legend = False 8 | 9 | class Node: 10 | def __init__(self, index, col_index, x, y, size, value, width, label, category): 11 | self.x = x 12 | self.index = index 13 | self.col_index = col_index 14 | self.y = y 15 | self.size = size 16 | self.value = value 17 | self.width = width 18 | self.label = label 19 | self.category = category 20 | 21 | 22 | def nodify(data, sort_by="frequency"): 23 | """ 24 | Takes data and creates a list containing information about the nodes for the graph. 

    Parameters:
        data (dict): output of the read_file/read functions, a dictionary with keys the temporal data, and values a dictionary with keys of the item and values or the frequency of the item or a tuple with the frequency and the category
        sort_by (str): "frequency" or "alphabetical" or "category", defaults to "frequency"

    Returns:
        (list): A list containing information about the nodes for the graph
    """
    d = {}
    # Re-order the items of every column according to the requested sort key.
    if sort_by == "frequency":
        for item in data.items():
            # Column values are either plain frequencies or (frequency, category)
            # tuples; detect the shape from the first value of the column.
            if type(item[1][next(iter(item[1]))]) == tuple:
                d[item[0]] = {k: v for k, v in sorted(item[1].items(),
                                                      key=lambda x: (-x[1][0], x[0]),
                                                      reverse=False)}
            else:
                d[item[0]] = {k: v for k, v in sorted(item[1].items(),
                                                      key=lambda x: (-x[1], x[0]),
                                                      reverse=False)}
    elif sort_by == "category":
        for item in data.items():
            d[item[0]] = {k: v for k, v in sorted(item[1].items(), key=lambda x: (x[1][1], x[0]), reverse=False)}
    elif sort_by == "alphabetical":
        for x, y in data.items():
            d[x] = {k: v for k, v in sorted(y.items())}

    # Column-wise parallel lists: item names, frequencies and categories.
    labels = [list(x[1].keys()) for x in d.items()]
    values = [[y[0] if type(y) == tuple else y for y in x[1].values()] for x in d.items()]
    categories = [[y[1] if type(y) == tuple else "null" for y in x[1].values()] for x in d.items()]
    headers = list(d.keys())
    node_x = 0  # NOTE(review): never used in this function (genSVG keeps its own node_x)
    count = 0   # global node index
    count2 = 0  # 1-based column index

    nodes = []
    sequence = {}

    for l, v, s in zip(labels, values, categories):
        # count2 < len(labels) is always true here, so this just increments per column.
        if count2 < len(labels):
            count2 += 1
        for x, y, z in zip(l, v, s):
            # Placeholder geometry (x=0, y=0, width=1); genSVG computes the real layout.
            nodes.append(Node(count, count2, 0, 0, y, y, 1, x, z))
            count += 1

    # Map each label to the indices of all nodes carrying it, so the renderer
    # can draw connecting flows between consecutive occurrences of an item.
    for n in nodes:
        if n.label in sequence.keys():
            sequence[n.label].append(n.index)
        else:
            sequence[n.label] = []
            sequence[n.label].append(n.index)

    return [headers, nodes, sequence]


def genSVG(nodes, spacing, node_size, width=None, height=None,
           minValue=1, maxValue=10, node_scaling="linear",
           connection_type="semi-curved", color_startEnd=True, color_categories=True, nodes_color="gray",
           start_node_color="green", end_node_color="red", palette=None, show_labels=True, label_text="item", label_font="sans-serif",
           label_color="black", label_size=5, label_shortening="clip", label_position="nodes", line_opacity=0.5,
           line_stroke_color="white", line_stroke_width=0.5, line_stroke_thick=0.5, legend=True):
    """
    Generates an SVG from data loaded via the read functions.

    Parameters:
        nodes (list): The output of nodify(), a list containing information about the nodes for the graph
        spacing (int): the space between the nodes, defaults to 50
        node_size (int): default node size, defaults to 10
        width (int): width of the visualization, defaults to None, if None they are generated from the size of the nodes, if they are specified the nodes will be rescaled to fit the space
        height (int): height of the visualization, defaults to None, if None they are generated from the size of the nodes, if they are specified the nodes will be rescaled to fit the space
        minValue (int): min size of a node, defaults to 1
        maxValue (int): max size of a node, defaults to 10
        node_scaling (str): "linear" or "log", defaults to "linear"
        connection_type (str): "semi-curved" or "curved" or "straight", defaults to "semi-curved"
        color_startEnd (bool): if True it marks the colors of the first and last appearance of a category, defaults to True
        color_categories (bool): if True the nodes and the lines are colored depending by the subcategory, defaults to True
        nodes_color (str): the color of the nodes if the previous two options are false, defaults to "gray", used also for the lines and for the middle nodes in case of startEnd option
        start_node_color (str): Defaults to "green"
        end_node_color (str): Defaults to "red"
        palette (tuple): a tuple with the name of the matplotlib palette and the number of colors ("viridis",12), defaults to None
        show_labels (bool): Defaults to True
        label_text (str): "item" shows the category, defaults to "item", "item_count" shows the category and the frequency, "item_category" shows the category and the subcategory
        label_font (str): Defaults to "sans-serif"
        label_color (str): Defaults to "black"
        label_size (int): Defaults to 5
        label_shortening (str): defaults to "clip", "clip" cuts the text when it overlaps the margin, "resize" changes the size of the font to fit the available space, "new_line" wraps the text when it overlaps the margin and it rescale the size if the two lines overlaps the bottom margin
        label_position (str): defaults to "nodes", "nodes" shows a label for each node, "start_end" shows a label for the first and last node of a sequence
        line_opacity (float): Defaults to 0.5
        line_stroke_color (str): Defaults to "white"
        line_stroke_width (float): Defaults to 0.5
        line_stroke_thick (float): Defaults to 0.5
        legend (bool): If True a Legend is included, defaults to True

    Returns:
        (drawSvg.drawing.Drawing): The finished graph
    """
    headers = nodes[0]
    nodes2 = copy.deepcopy(nodes[1])  # deep copy so the caller's Node list is not mutated
    sequence = nodes[2]

    # Translate the named default colors to the package's custom hex values.
    if start_node_color == "green":
        start_node_color = "#4BA167"
    if end_node_color == "red":
        end_node_color = "#A04B83"
    if nodes_color == "gray":
        nodes_color = "#EAEBEE"

    # Resizing of the nodes in relation to the canvas size and to the scaling option
    m = max([v.value for v in nodes[1]])  # largest node value, reference for scaling
    new_nodes = []
    if width is not None:
        # Fixed canvas width: derive column spacing and node width from it.
        dx = (width-(spacing*2))/len(headers)
        spacing2 = 2*(dx/3)
        node_size = dx/3
    else:
        spacing2 = spacing
    if height is not None:
        # Fixed canvas height: rescale node heights so the fullest column fits.
        l_col_index = [x.col_index for x in nodes2]
        l_col_index_max = max([l_col_index.count(y.col_index) for y in nodes2])  # node count of the fullest column
        sum_values = sum([x.value for x in nodes2 if l_col_index.count(x.col_index) == l_col_index_max])
        max_values = max([x.value for x in nodes2 if l_col_index.count(x.col_index) == l_col_index_max])
        if node_scaling == "linear":
            dy = ((height-(spacing*2)-(spacing/5))*max_values)/(sum_values+((maxValue/2)*l_col_index_max))
        else:
            dy = ((height-(spacing*2)-(spacing/5))*max_values)/(sum_values+((max_values/2)*l_col_index_max))
        spacingy = dy/3
        maxValue = 2*(dy/3)
    else:
        spacingy = spacing/5

    node_x = 0
    for n in nodes2:
        n.width = node_size
        # Nodes arrive ordered column by column: shift x one node width per new column.
        if n.col_index != nodes2[n.index-1].col_index and n.index > 0:
            node_x += node_size
        n.x += node_x

        if node_scaling == "linear":
            n.size = (((n.value+1)*maxValue)/m)+minValue
        elif node_scaling == "log":
            n.size = (((maxValue-minValue)/math.log(m))*math.log(n.value))+minValue

        new_nodes.append(n)

    # positioning of the nodes on the canvas (x,y)
    n_x_spacing = spacing
    n_y_spacing = spacing+spacingy
    points = []
    for n in new_nodes:

        if n.index > 0 and n.col_index == new_nodes[n.index-1].col_index:
            # Same column: stack below the previous node.
            n_y_spacing += spacingy+n.size
        else:
            # New column: reset the vertical offset ...
            n_y_spacing = spacing+spacingy+n.size
            # ... and advance the horizontal offset (except for the very first node).
            if n.index > 0 and n.col_index != new_nodes[n.index-1].col_index:
                n_x_spacing += spacing2

        points.append(pcf.Node(n.index, n.col_index, n.x + n_x_spacing, n.y + n_y_spacing,
                               n.size, n.value, n.width, n.label, n.category))

    # sizing of the canvas
    if width is None and height is None:
        width = spacing*4+max([x.x for x in points])
        height = spacing * 4 + max([x.y for x in points]) + ((sum([x.size for x in points]) / len(points)) * len(set([x.category for x in points])))
    elif height is None:
        height = spacing * 4 + max([x.y for x in points]) + ((sum([x.size for x in points]) / len(points)) * len(set([x.category for x in points])))
    elif width is None:
        width = spacing * 4 + max([x.x for x in points])

    # COLORS
    if palette is not None:
        # User-supplied matplotlib palette: (name, number_of_colors).
        palette = cm.get_cmap(palette[0], palette[1]).colors
        count = 0
        category_colors = {}
        # NOTE(review): indexing palette[count] after the increment skips palette[0]
        # and raises IndexError when there are >= len(palette) categories — confirm intended.
        for e in set([n.category for n in points]):
            if count < len(palette):
                count += 1
            category_colors[e] = colors.to_hex(palette[count])
    else:
        # DEFAULT PALETTE: the number of colors is set in relation to the length of the category list
        palette = cm.get_cmap("tab20c", len(set([n.category for n in points])) + 1).colors
        count = 0
        category_colors = {}
        for e in set([n.category for n in points]):
            if count < len(palette)-1:
                count += 1
            category_colors[e] = colors.to_hex(palette[count])

    d = draw.Drawing(width, height, displayInline=True)
    r = draw.Rectangle(0, 0, width, height, stroke_width=2, stroke='black', fill="white")  # white background
    d.append(r)

    # headers
    h_x_shift = [points[0].x]  # x position of each column

    for x in points:
        if x.x != points[x.index-1].x and x.index > 0:
            h_x_shift.append(x.x)

    # Horizontal distance between two columns.
    # NOTE(review): assumes at least two columns; h_x_shift[1] raises IndexError otherwise.
    n2 = h_x_shift[1]-h_x_shift[0]

    for h, x in zip(headers, h_x_shift):
        l = label_size
        if label_shortening == "resize":
            # Shrink the font until the header fits the column width (last column is exempt).
            while len(h)*(l/2) > n2+points[0].size-(n2/8) and l > 1:
                if x != max(h_x_shift):
                    l -= 1
                else:
                    break
            d.append(draw.Text(h, x=x, y=height - spacing, fontSize=l, font_family=label_font, fill=label_color))
        elif label_shortening == "clip":
            # Clip the header text to the column rectangle.
            clip = draw.ClipPath()
            clip.append(draw.Rectangle(x, height - spacing, n2, label_size))
            d.append(draw.Text(h, x=x, y=height - spacing, fontSize=l, font_family=label_font, clip_path=clip, fill=label_color))
        elif label_shortening == "new_line":
            if len(h)*(label_size/2) > n2+points[0].size-(n2/8):
                # Wrap the header into fixed-width chunks, shrinking the font if the
                # wrapped lines would overflow vertically.
                margin = int((n2+points[0].size-(n2/8))/(label_size/2))
                txt = [h[x:x+margin] for x in range(0, len(h), margin)]
                while len(txt)*l > (l+n2/5) and l > 1:
                    l -= 1
            else:
                txt = h
            d.append(draw.Text(txt, x=x, y=height-spacing, fontSize=l, font_family=label_font, fill=label_color))

    # lines
    for n in sequence.items():
        if len(n[1]) > 1:
            # One ribbon between every pair of consecutive occurrences of the item.
            for k in n[1][:-1]:
                if color_categories:
                    color = category_colors[points[k].category]
                else:
                    color = nodes_color
                if connection_type.lower() == "semi-curved":
                    p = draw.Path(fill=color, stroke=line_stroke_color, opacity=line_opacity, stroke_width=line_stroke_width)
                    p.M(points[k].x + points[k].width, height - points[k].y)
                    p.L(points[k].x + points[k].width, height - points[k].y + points[k].size)

                    if points[k].y == points[n[1][n[1].index(k)+1]].y:
                        # Same vertical position: a straight band is sufficient.
                        p.L(points[n[1][n[1].index(k)+1]].x, height - points[k].y + points[k].size)
                        p.L(points[n[1][n[1].index(k)+1]].x, height - points[k].y)

                    else:
                        # Different heights: route the band through the midpoint with
                        # quadratic Bezier segments (Q) extended by smooth T commands.
                        xMedium = ((points[n[1][n[1].index(k)+1]].x-(points[k].x+points[k].width))/2)+(points[k].x+points[k].width)
                        yMedium = (((height - points[k].y + points[k].size) - (height - points[n[1][n[1].index(k) + 1]].y + points[k].size)) / 2) + (height - points[n[1][n[1].index(k) + 1]].y)
                        yMedium2 = (((height - points[k].y) - (height - points[n[1][n[1].index(k) + 1]].y)) / 2) + (height - points[n[1][n[1].index(k) + 1]].y)
                        p.Q(points[k].x + points[k].width + (spacing/2), height - points[k].y + points[k].size, xMedium + line_stroke_thick, yMedium + points[k].size)
                        p.T(points[n[1][n[1].index(k)+1]].x, height - points[n[1][n[1].index(k) + 1]].y + points[n[1][n[1].index(k) + 1]].size)
                        p.L(points[n[1][n[1].index(k)+1]].x, height - points[n[1][n[1].index(k) + 1]].y)
                        p.Q(points[n[1][n[1].index(k)+1]].x - (spacing/2), height - points[n[1][n[1].index(k) + 1]].y, xMedium - line_stroke_thick, yMedium2)
                        p.T(points[k].x + points[k].width, height - points[k].y)

                    p.Z()
                    d.append(p)
                elif connection_type.lower() == 'curved':
                    p = draw.Path(fill=color, stroke=line_stroke_color, opacity=line_opacity,
                                  stroke_width=line_stroke_width)

                    size_start = points[k].size
                    size_end = points[n[1][n[1].index(k) + 1]].size

                    # Corner coordinates of the start and end nodes
                    # (y is measured from the top, hence the height - ... flip).
                    x1_start = points[k].x + points[k].width
                    y1_start = height - points[k].y + size_start

                    x1_end = points[n[1][n[1].index(k) + 1]].x
                    y1_end = height - points[n[1][n[1].index(k) + 1]].y + size_end

                    x2_start = x1_start
                    y2_start = y1_start - size_start

                    x2_end = x1_end
                    y2_end = y1_end - size_end

                    x_diff = x1_end - x1_start
                    y_diff = y2_start - y1_end
                    height_factor = 2
                    width_factor = 0

                    if points[k].y == points[n[1][n[1].index(k) + 1]].y:
                        # Same height: no curvature needed, draw a plain quadrilateral.
                        p.M(x1_start, y1_start)
                        p.L(x2_start, y2_start)
                        p.L(x2_end, y2_end)
                        p.L(x1_end, y1_end)
                        p.Z()
                        d.append(p)
                        pass

                    else:
                        # S-shaped ribbon: top edge from two quadratic Bezier segments ...
                        p.M(x1_start, y1_start)
                        cx1 = x1_end - (x_diff / 4 * 3)
                        cy1 = y1_start
                        ex1 = x1_end - (x_diff / 2)
                        ey1 = y1_end + (y_diff / 2)
                        p.Q(cx1, cy1, ex1, ey1)

                        cx2 = x1_start + (x_diff / 4 * 3)
                        cy2 = y1_end - (size_end / height_factor)
                        p.Q(cx2, cy2, x1_end, y1_end)
                        p.L(x2_end, y2_end)

                        # ... and the bottom edge back with two more quadratic segments.
                        cx3 = (x2_end - (x_diff / 4))
                        cy3 = (y2_end - (size_end / height_factor))
                        ex3 = (x2_end + ((x1_start - x1_end) / 2) - width_factor)
                        ey3 = (y2_end + (((y1_start - y1_end) / 2) - (((size_start + size_end) / 2)) / height_factor))
                        p.Q(cx3, cy3, ex3, ey3)

                        cx4 = x2_start + (x_diff / 4)
                        cy4 = y2_start
                        p.Q(cx4, cy4, x2_start, y2_start)

                        p.Z()
                        d.append(p)

                elif connection_type.lower() == 'straight':
                    # Straight trapezoid between the two node edges.
                    p = draw.Path(fill=color, stroke=line_stroke_color, opacity=line_opacity,
                                  stroke_width=line_stroke_width)
                    size_start = points[k].size
                    size_end = points[n[1][n[1].index(k) + 1]].size

                    x1_start = points[k].x + points[k].width
                    y1_start = height - points[k].y

                    x1_end = points[n[1][n[1].index(k) + 1]].x
                    y1_end = height - points[n[1][n[1].index(k) + 1]].y

                    x2_start = x1_start
                    y2_start = y1_start + size_start

                    x2_end = x1_end
                    y2_end = y1_end + size_end

                    p.M(x1_start, y1_start)
                    p.L(x2_start, y2_start)
                    p.L(x2_end, y2_end)
                    p.L(x1_end, y1_end)

                    p.Z()
                    d.append(p)

                else:
                    # Unknown connection_type: fail loudly after reporting it.
                    print('This connection type is not implemented.')
                    raise KeyError

    # nodes
    # return points
    # Highest column index, used to detect the last column.
    col_index_max = 0
    for node in points:
        if node.col_index > col_index_max:
            col_index_max = node.col_index

    for node in points:
        if color_startEnd == True and color_categories == True:
            # First occurrence of a label gets the start color.
            if node.label not in [n.label for n in points][:node.index]:
                color = start_node_color
            elif node.label not in [n.label for n in points][node.index+1:] and node.col_index < col_index_max: #and node.indexspacing-(spacing/8):
                # NOTE(review): original lines 378-406 appear to be missing from this
                # dump (the trailing comment above is visibly fused with later code).
                # Presumably the missing span assigned end/middle node colors, drew the
                # node rectangles and built the label text `txt`, ending in the
                # label_shortening == "resize" branch whose shrink loop follows here —
                # confirm against the complete pycatflow/viz.py.
                if node.x != max([n.x for n in points]) and l > 1:
                    l -= 1
                else:
                    break
            elif label_shortening == "clip":
                # Clip the node label to the space between two columns.
                clip = draw.ClipPath()
                clip.append(draw.Rectangle(node.x, height-node.y-(spacing/5), n2-(n2/8), node.size+2*(spacing/5)))
            elif label_shortening == "new_line":
                if len(txt)*(label_size/2) > n2-2*(n2/8):
                    # Wrap the label into fixed-width chunks, shrinking if the wrapped
                    # block would overflow the node height.
                    margin = int((n2-2*(n2/8))/(label_size/2))
                    txt = [txt[x:x+margin] for x in range(0, len(txt), margin)]
                    while len(txt)*l > node.size+2*(spacing/8) and l > 1:
                        l -= 1

            # Vertically center the label on the node.
            label_pos_y = height - node.y + (node.size/2) - (l/2)
            if label_position == "start_end":
                # Only label the first and last occurrence of a sequence.
                if node.label not in [n.label for n in points][:node.index] or node.label not in [n.label for n in points][node.index+1:] and node.index < len(points) and node.x != max([n.x for n in points]):
                    if label_shortening == "clip":
                        label = draw.Text(txt, x=node.x+node.width+(n2/8), y=label_pos_y,
                                          fontSize=l, font_family=label_font, fill=label_color, clip_path=clip)
                    else:
                        label = draw.Text(txt, x=node.x-(n2/8), y=label_pos_y,
                                          fontSize=l, font_family=label_font, fill=label_color, text_anchor="end")

            elif label_position == "nodes":
                if label_shortening == "clip":
                    label = draw.Text(txt, x=node.x+node.width+(n2/8), y=label_pos_y,
                                      fontSize=l, font_family=label_font, fill=label_color, clip_path=clip)
                else:
                    label = draw.Text(txt, x=node.x + node.width+(n2/8), y=label_pos_y,
                                      fontSize=l, font_family=label_font, fill=label_color)
            d.append(label)

    # Add legend to canvas
    if color_categories and legend:
        offset = 5  # Alternative: offset = spacing
        spacing_bottom = 5  # Alternative: spacing_bottom = spacing
        # Legend swatch height: average node size.
        symbol_size = sum([x.size for x in points])/len(points)

        legend_height = (symbol_size+offset) * len(category_colors)
        legend_header_y = legend_height + symbol_size + spacing_bottom + (offset)
        legend_header = draw.Text("Legend", x=points[0].x, y=legend_header_y, fontSize=label_size,
                                  font_family=label_font, fill=label_color)

        if debug_legend:
            print('Legend Title')
            print('legend_height: {}'.format(legend_height))
            print('legend_header_y: {}'.format(legend_header_y))
            print('points[0].x: {}'.format(points[0].x))
            # NOTE(review): the format string below has no '{}' placeholder, so the
            # value is never printed — likely meant 'legend_header_y: {}'.
            print('legend_header_y'.format(legend_header_y))
            print()

        d.append(legend_header)
        symbol_y_shift = 0
        # One color swatch plus category name per legend entry, stacked downwards.
        for e in category_colors.items():

            legend_label_y = spacing_bottom + legend_height + (symbol_size/2) - (label_size/2) - offset - symbol_y_shift

            symbol = draw.Rectangle(points[0].x, spacing_bottom+legend_height-offset-symbol_y_shift,
                                    points[0].width, symbol_size, fill=e[1], stroke=e[1]) #stroke="black"


            if debug_legend:
                print(e)
                print('points[0].x: {}'.format(points[0].x))
                print('spacing_bottom+legend_height-offset-symbol_y_shift: {}'.format(spacing_bottom+legend_height-offset-symbol_y_shift))
                print('points[0].width: {}'.format(points[0].width))
                print('symbol_size: {}'.format(symbol_size))
                print()

            name = draw.Text(e[0], x=points[0].x+node.width+(n2/12), y=legend_label_y,
                             fontSize=label_size, fill=label_color)
            d.append(symbol)
            d.append(name)
            # Move to the next row unless the bottom margin has been reached.
            if spacing_bottom+legend_height-(offset)-symbol_y_shift > spacing_bottom:
                symbol_y_shift += offset+symbol_size
            else:
                symbol_y_shift = 0

    return d


def visualize(data, spacing=50, node_size=10, width=None, height=None, minValue=1, maxValue=10, node_scaling="linear",
              connection_type="semi-curved", color_startEnd=True, color_categories=True, nodes_color="gray",
              start_node_color="green", end_node_color="red", palette=None, show_labels=True,
              label_text="item", label_font="sans-serif", label_color="black", label_size=5,
              label_shortening="clip", label_position="nodes", line_opacity=0.5, line_stroke_color="white",
              line_stroke_width=0.5, line_stroke_thick=0.5, legend=True, sort_by="frequency"):
    """
    Generates an SVG from data loaded via the read functions.

    Convenience wrapper: runs nodify() on the data and forwards everything to genSVG().

    Parameters:
        data (dict): output of the read_file/read functions, a dictionary with keys the temporal data, and values a dictionary with keys of the item and values or the frequency of the item or a tuple with the frequency and the category
        spacing (int): the space between the nodes, defaults to 50
        node_size (int): default node size, defaults to 10
        width (int): width of the visualization, defaults to None, if None they are generated from the size of the nodes, if they are specified the nodes will be rescaled to fit the space
        height (int): height of the visualization, defaults to None, if None they are generated from the size of the nodes, if they are specified the nodes will be rescaled to fit the space
        minValue (int): min size of a node, defaults to 1
        maxValue (int): max size of a node, defaults to 10
        node_scaling (str): "linear" or "log", defaults to "linear"
        connection_type (str): "semi-curved" or "curved" or "straight", defaults to "semi-curved"
        color_startEnd (bool): if True it marks the colors of the first and last appearance of a category, defaults to True
        color_categories (bool): if True the nodes and the lines are colored depending by the subcategory, defaults to True
        nodes_color (str): the color of the nodes if the previous two options are false, defaults to "gray", used also for the lines and for the middle nodes in case of startEnd option
        start_node_color (str): Defaults to "green"
        end_node_color (str): Defaults to "red"
        palette (tuple): a tuple with the name of the matplotlib palette and the number of colors ("viridis",12), defaults to None
        show_labels (bool): Defaults to True
        label_text (str): "item" shows the category, defaults to "item", "item_count" shows the category and the frequency, "item_category" shows the category and the subcategory
        label_font (str): Defaults to "sans-serif"
        label_color (str): Defaults to "black"
        label_size (int): Defaults to 5
        label_shortening (str): defaults to "clip", "clip" cuts the text when it overlaps the margin, "resize" changes the size of the font to fit the available space, "new_line" wraps the text when it overlaps the margin and it rescale the size if the two lines overlaps the bottom margin
        label_position (str): defaults to "nodes", "nodes" shows a label for each node, "start_end" shows a label for the first and last node of a sequence
        line_opacity (float): Defaults to 0.5
        line_stroke_color (str): Defaults to "white"
        line_stroke_width (float): Defaults to 0.5
        line_stroke_thick (float): Defaults to 0.5
        legend (bool): If True a Legend is included, defaults to True
        sort_by (str): "frequency" or "alphabetical" or "category", defaults to "frequency"

    Returns:
        (drawSvg.drawing.Drawing): The finished graph
    """

    nodes = pcf.nodify(data, sort_by=sort_by)
    viz = genSVG(nodes, spacing, node_size, width=width, height=height, minValue=minValue,
                 maxValue=maxValue, node_scaling=node_scaling, connection_type=connection_type,
                 color_startEnd=color_startEnd, color_categories=color_categories,
                 nodes_color=nodes_color, start_node_color=start_node_color,
                 end_node_color=end_node_color, palette=palette, show_labels=show_labels,
                 label_text=label_text, label_font=label_font, label_color=label_color,
                 label_size=label_size, label_shortening=label_shortening, label_position=label_position,
                 line_opacity=line_opacity, line_stroke_color=line_stroke_color,
                 line_stroke_width=line_stroke_width, line_stroke_thick=line_stroke_thick, legend=legend)
    return viz
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
drawSVG<2.0
matplotlib<3.9
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

# Use the README as the long description shown on PyPI.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="pycatflow",
    version="0.1.1a",
    author="Marcus Burkhardt",
    author_email="marcus.burkhardt@gmail.com",
    description="A tool for visualizing categorical data over time.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/bumatic/PyCatFlow",
    license="MIT",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    packages=setuptools.find_packages(),
    python_requires=">=3.6",
    # NOTE(review): the upper bounds presumably track the drawSvg 1.x and
    # matplotlib `cm.get_cmap` APIs used in pycatflow/viz.py — confirm before relaxing.
    install_requires=['drawSVG<2.0', 'matplotlib<3.9']
)