├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md └── cellPLATO ├── .gitignore ├── README.md ├── cellPLATO ├── Btrack_cellPLATO_2024.ipynb ├── __init__.py ├── cellPLATO_StepByStep_main.ipynb ├── cellPLATO_StepByStep_trackmate.ipynb ├── data_processing │ ├── __init__.py │ ├── cell_identifier.py │ ├── cleaning_formatting_filtering.py │ ├── cleaning_formatting_filtering_remotefix.py │ ├── clustering.py │ ├── data_io.py │ ├── data_wrangling.py │ ├── dimensionality_reduction.py │ ├── load_trackmate.py │ ├── measurements.py │ ├── migration_calculations.py │ ├── pipelines.py │ ├── shape_calculations.py │ ├── statistics.py │ ├── time_calculations.py │ └── trajectory_clustering.py ├── initialization │ ├── __init__.py │ ├── btrack_config.json │ ├── config.py │ └── initialization.py └── visualization │ ├── __init__.py │ ├── cluster_visualization.py │ ├── comparative_visualization.py │ ├── filter_visualization.py │ ├── low_dimension_visualization.py │ ├── panel_apps.py │ ├── plots_of_differences.py │ ├── scatterplots.py │ ├── small_multiples.py │ ├── superplots.py │ ├── timecourse_visualization.py │ └── trajectory_visualization.py ├── environment.yml ├── environment_oldversion.yml ├── images └── cellPLATOlogo.png ├── requirements.txt ├── setup.py └── tests └── testing.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Ll]og/ 33 | [Ll]ogs/ 34 | 35 | # Visual Studio 2015/2017 cache/options directory 36 | .vs/ 37 | # Uncomment if you have tasks that create the project's static files in wwwroot 38 | #wwwroot/ 39 | 40 | # Visual Studio 2017 auto generated files 41 | Generated\ Files/ 42 | 43 | # MSTest test Results 44 | [Tt]est[Rr]esult*/ 45 | [Bb]uild[Ll]og.* 46 | 47 | # NUnit 48 | *.VisualState.xml 49 | TestResult.xml 50 | nunit-*.xml 51 | 52 | # Build Results of an ATL Project 53 | [Dd]ebugPS/ 54 | [Rr]eleasePS/ 55 | dlldata.c 56 | 57 | # Benchmark Results 58 | BenchmarkDotNet.Artifacts/ 59 | 60 | # .NET Core 61 | project.lock.json 62 | project.fragment.lock.json 63 | artifacts/ 64 | 65 | # ASP.NET Scaffolding 66 | ScaffoldingReadMe.txt 67 | 68 | # StyleCop 69 | StyleCopReport.xml 70 | 71 | # Files built by Visual Studio 72 | *_i.c 73 | *_p.c 74 | *_h.h 75 | *.ilk 76 | *.meta 77 | *.obj 78 | *.iobj 79 | *.pch 80 | *.pdb 81 | *.ipdb 82 | *.pgc 83 | *.pgd 84 | *.rsp 85 | *.sbr 86 | *.tlb 87 | *.tli 88 | *.tlh 89 | *.tmp 90 | *.tmp_proj 91 | *_wpftmp.csproj 92 | *.log 93 | *.tlog 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ 
cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 
202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio 6 auto-generated project file (contains which files were open etc.) 
298 | *.vbp 299 | 300 | # Visual Studio 6 workspace and project file (working project files containing files to include in project) 301 | *.dsw 302 | *.dsp 303 | 304 | # Visual Studio 6 technical files 305 | *.ncb 306 | *.aps 307 | 308 | # Visual Studio LightSwitch build output 309 | **/*.HTMLClient/GeneratedArtifacts 310 | **/*.DesktopClient/GeneratedArtifacts 311 | **/*.DesktopClient/ModelManifest.xml 312 | **/*.Server/GeneratedArtifacts 313 | **/*.Server/ModelManifest.xml 314 | _Pvt_Extensions 315 | 316 | # Paket dependency manager 317 | .paket/paket.exe 318 | paket-files/ 319 | 320 | # FAKE - F# Make 321 | .fake/ 322 | 323 | # CodeRush personal settings 324 | .cr/personal 325 | 326 | # Python Tools for Visual Studio (PTVS) 327 | __pycache__/ 328 | *.pyc 329 | 330 | # Cake - Uncomment if you are using it 331 | # tools/** 332 | # !tools/packages.config 333 | 334 | # Tabs Studio 335 | *.tss 336 | 337 | # Telerik's JustMock configuration file 338 | *.jmconfig 339 | 340 | # BizTalk build output 341 | *.btp.cs 342 | *.btm.cs 343 | *.odx.cs 344 | *.xsd.cs 345 | 346 | # OpenCover UI analysis results 347 | OpenCover/ 348 | 349 | # Azure Stream Analytics local run output 350 | ASALocalRun/ 351 | 352 | # MSBuild Binary and Structured Log 353 | *.binlog 354 | 355 | # NVidia Nsight GPU debugger configuration file 356 | *.nvuser 357 | 358 | # MFractors (Xamarin productivity tool) working folder 359 | .mfractor/ 360 | 361 | # Local History for Visual Studio 362 | .localhistory/ 363 | 364 | # Visual Studio History (VSHistory) files 365 | .vshistory/ 366 | 367 | # BeatPulse healthcheck temp database 368 | healthchecksdb 369 | 370 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 371 | MigrationBackup/ 372 | 373 | # Ionide (cross platform F# VS Code tools) working folder 374 | .ionide/ 375 | 376 | # Fody - auto-generated XML schema 377 | FodyWeavers.xsd 378 | 379 | # VS Code files for those working on multiple tools 380 | .vscode/* 381 | !.vscode/settings.json 382 | !.vscode/tasks.json 383 | !.vscode/launch.json 384 | !.vscode/extensions.json 385 | *.code-workspace 386 | 387 | # Local History for Visual Studio Code 388 | .history/ 389 | 390 | # Windows Installer files from build outputs 391 | *.cab 392 | *.msi 393 | *.msix 394 | *.msm 395 | *.msp 396 | 397 | # JetBrains Rider 398 | *.sln.iml 399 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Michael Shannon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CPLogo](https://github.com/Michael-shannon/cellPLATO/blob/main/cellPLATO/images/cellPLATOlogo.png) 2 | 3 | An unsupervised method for identifying cell behaviour in heterogeneous cell trajectory data 4 | 5 | cellPLATO workflow: 6 | 7 | 1. takes tracking and segmentation data as input 8 | 2. measures morphology and migration at each timepoint for every cell 9 | 3. clusters cells with similar morphology and migration using UMAP and HDBSCAN 10 | 4. measures the similarity of behavioural sequences for each cell over time, and clusters them to form 'trajectories of behaviour' 11 | 5. de-abstractifies the clustered behaviours using graphics of exemplar cells, readouts of plasticity and comparisons between conditions 12 | 13 | ## Updates 14 | 15 | Published in the Journal of Cell Science on the 24th of June, 2024 [here](https://journals.biologists.com/jcs/article/137/20/jcs261887/352628) 16 | 17 | Movies for the paper can be found [here](https://drive.google.com/drive/folders/1wvCbWoywRdk0OWhcwwJhiIdAEz4yTKHt?usp=sharing) 18 | 19 | On the 14th of May 2023, at Journal of Cell Science's 'Imaging Cell Dynamics' conference in Lisbon, we presented cellPLATO's UMAP and HDBSCAN module to produce a fingerprint of cell behaviours in a heterogeneous population. Go [here](https://drive.google.com/drive/folders/1_f2GmdqbaF15FyesgxnsotuAu_XGh10o?usp=sharing) to see the poster! 20 | 21 | [![DOI](https://zenodo.org/badge/588728402.svg)](https://zenodo.org/badge/latestdoi/588728402) 22 | 23 | ## Description 24 | 25 | A Python data analysis package for time-lapse cell migration experiments, written in collaboration with [Tyler Sloan](https://github.com/tsloan1377) at [Quorumetrix](https://github.com/Quorumetrix). Used after segmentation (e.g. [Cellpose](https://github.com/MouseLand/cellpose)) and tracking (e.g. [Bayesian Tracker](https://github.com/quantumjot/btrack)) of large time-lapse microscopy datasets, cellPLATO measures morphokinetic information about each cell per timepoint and automatically makes statistical plots (plots of differences in Python, inspired by those in R by [Joachim Goedhart](https://github.com/JoachimGoedhart)). Users can pool/compare multiple replicates from multiple experimental conditions. Next, dimensionality reduction and cluster analysis are used to segregate cells into behavioural subtypes and produce a fingerprint for each condition (cells per behavioural subtype). Finally, exemplar cells are automatically selected and graphically displayed to disambiguate the nature of each quantified cell behavioural subtype. 26 | 27 | ## Installation instructions 28 | 29 | 1. Using the Anaconda terminal, cd to a directory where you want to install the software 30 | 2. Clone the repository onto your local machine: git clone 31 | 3. cd to the folder that contains 'environment.yml' and type: conda env create -f environment.yml 32 | 4. Activate the environment: conda activate cellPLATO 33 | 5. Install the rest of the packages: pip install -e .
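After installation, a quick optional way to confirm the environment resolved correctly is to import the dependencies that most often fail to build. This is only an illustrative sanity check (it assumes the environment.yml pulled in btrack, hdbscan and umap-learn); it is not part of cellPLATO itself:

```python
# Optional post-install sanity check -- illustrative only, not part of cellPLATO.
# Run inside the activated cellPLATO environment.
import btrack   # Bayesian Tracker, used upstream for tracking
import hdbscan  # density-based clustering used by cellPLATO
import umap     # UMAP embedding (installed as umap-learn)

print("btrack, hdbscan and umap imported successfully")
```

If the hdbscan import fails, the known issues below usually apply.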
34 | 35 | Known issues with installation: 36 | 37 | If you get the error: "Could not build wheels for hdbscan, which is required to install pyproject.toml-based projects" 38 | 39 | Please 1) check that you have a C++ compiler installed, and 2) install hdbscan using 'conda install -c conda-forge hdbscan' 40 | 41 | If matplotlib fails to install via pip for the same reason, please use: 42 | 43 | conda install -c conda-forge matplotlib 44 | 45 | ## How to use cellPLATO: 46 | 47 | cellPLATO is made to be used downstream of cell segmentation and tracking, and can currently be used with several tracking methodologies. The default is btrack. 48 | 49 | ### Step 1: 50 | 51 | Organize your data into the following hierarchical format: 52 | 53 | - 📁 **Master folder** `[Folder_path]` 54 | - 🌿 **Condition_1** `[Experimental condition 1]` 55 | - 🔄 **Rep_1** `[Experimental repeat 1]` 56 | - 📄 `Replicate_1.h5` 57 | - 🔄 **Rep_2** 58 | - 📄 `Replicate_2.h5` 59 | - 🔄 **Rep_n** 60 | - 📄 `Replicate_n.h5` 61 | - 🌿 **Condition_2** 62 | - 🔄 **Rep_1** 63 | - 🔄 **Rep_2** 64 | - 🔄 **Rep_n** 65 | - 🌿 **Condition_n** 66 | - 🔄 **Rep_n** 67 | 68 | 📁 represents the main folder or directory. 69 | 🌿 represents the condition folders. 70 | 🔄 represents the replicate folders. 71 | 📄 represents the individual H5 files containing the segmentations and tracks. 72 | 73 | ### Step 2: 74 | 75 | Open the config.py file and edit it as directed. 76 | 77 | As a minimum, fill in the master directory, the experiments to include, the pixel size and the sampling interval. 78 | 79 | Experiments to include gets filled with the folder names of the conditions you are measuring: 80 | 81 | EXPERIMENTS_TO_INCLUDE = ['Condition_1', 'Condition_2', 'Condition_n'] 82 | 83 | ### Step 3: 84 | 85 | Run cellPLATO through Jupyter Notebooks. Choose the master notebook to run all of the analysis step by step. 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /cellPLATO/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.pyc 4 | 5 | # Distribution / packaging 6 | *.egg-info/ 7 | 8 | # Notebook checkpoints 9 | .ipynb_checkpoints 10 | -------------------------------------------------------------------------------- /cellPLATO/README.md: -------------------------------------------------------------------------------- 1 | # cellPLATO: cell PLasticity Analysis TOol 2 | 3 | A Python data analysis package for time-lapse cell migration experiments. Used in conjunction with Bayesian Tracker for automated cell tracking and segmentation, cellPLATO adds further layers of analysis and visualization. This tool allows users to pool/compare multiple replicates from multiple experimental conditions, perform dimensionality reduction, and explore cell behavioural trajectories through physical and low-dimensional space. 4 | 5 | ## Installation instructions 6 | 7 | 1. Using the Anaconda terminal, cd to a directory where you want to install the software 8 | 2. Clone the repository onto your local machine: git clone 9 | 3. cd to the folder that contains 'environment.yml' and type: conda env create -f environment.yml 10 | 4. Activate the environment: conda activate cellPLATO 11 | 5. Install the rest of the packages: pip install -e . 12 | 13 | ## How to use cellPLATO: 14 | 15 | cellPLATO is made to be used downstream of cell segmentation and tracking. We used Cellpose and then Bayesian Tracker, with files organized as shown in the 'file organization' section below.
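Before running the notebooks, it can help to confirm that your folders actually match that layout. The sketch below is illustrative only: it is not part of cellPLATO's API, and the master-folder path is a placeholder borrowed from the tracking notebook. It simply lists the condition folders, replicate folders and .h5 files that the loader would see:

```python
# Illustrative layout check only -- not cellPLATO code.
import glob
import os

MASTER_FOLDER = "D:/cellPLATO_test_data/TestSet"  # placeholder: your master folder

for cond_dir in sorted(glob.glob(os.path.join(MASTER_FOLDER, "*"))):
    if not os.path.isdir(cond_dir):
        continue
    print("Condition:", os.path.basename(cond_dir))
    for rep_dir in sorted(glob.glob(os.path.join(cond_dir, "*"))):
        if not os.path.isdir(rep_dir):
            continue
        h5_files = [os.path.basename(f) for f in glob.glob(os.path.join(rep_dir, "*.h5"))]
        print("  Replicate:", os.path.basename(rep_dir), "->", h5_files)
```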
16 | 17 | With Jupyter Notebook installed, type jupyter notebook in the terminal, then select one of the notebooks to begin running cellPLATO. 18 | 19 | ## Description: 20 | 21 | A collection of Jupyter notebooks allows users to work through the analysis step by step, or to use pre-assembled pipelines. 22 | 23 | All experimental constants and filepaths are contained within the config.py file. This informs the active Python kernel where to find the data (.h5) files, where to export plots, and the key parameters that control the analysis. Each time the analysis is run, it generates a time-stamped analysis output folder, with a copy of the config file as a record for future verification. 24 | 25 | The experimental conditions and replicates are indicated in the config.py file in the EXPERIMENTS_TO_INCLUDE = [] list. The data_processing module will automatically extract the replicates from the following folder structure: 26 | 27 | my_data_path 28 | Condition 1 29 | Replicate 1 30 | Replicate 1.h5 31 | Replicate 2 32 | Replicate 2.h5 33 | ... 34 | Replicate n 35 | Replicate n.h5 36 | Condition 2 37 | Replicate 1 38 | Replicate 1.h5 39 | Replicate 2 40 | Replicate 2.h5 41 | ... 42 | Replicate n 43 | Replicate n.h5 44 | ... 45 | Condition N 46 | Replicate 1 47 | Replicate 1.h5 48 | ... 49 | Replicate n 50 | Replicate n.h5 51 | 52 | 53 | The data_processing submodule is designed to sequentially process the cell tracks and shape measurements from the btrack-generated h5 files, and combine them into a Pandas dataframe for further processing, filtering and visualization. 54 | 55 | The functionality of the subsequent processing steps is described below: 56 | 57 | Pre-processed data are combined into a single dataframe (comb_df), maintaining labels for the Condition and Replicate_ID. For plotting, Condition_shortlabel can optionally be used to give more succinct plot labels. The comb_df contains both cell shape and cell migration-related factors. 58 | 59 | At this stage, additional measurements are performed, such as the aspect ratio and Ripley's L and K. The factors are calibrated according to the microns-per-pixel ratio (MICRONS_PER_PIXEL) defined in the config.py file. Optionally, data are filtered upstream of dimensionality reduction. 60 | 61 | Next, the combined dataframe undergoes dimensionality reduction: initially PCA, followed by both tSNE and UMAP low-dimension embeddings. The low-dimensional representations contain information about both the cell migration and shape characteristics of each cell at each timepoint, and additional filtering steps following the dimensionality reduction are possible. 62 | 63 | The low-dimensional embeddings are then clustered using HDBSCAN to automatically extract density-based clusters from the selected embedding. Cells at a given timepoint are clustered into distinct groups and given a label for their group.
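To make those last two steps concrete, the sketch below shows the general dimensionality-reduction-plus-clustering idea using scikit-learn, umap-learn and hdbscan directly. It is a minimal illustration, not cellPLATO's own implementation (which lives in data_processing/dimensionality_reduction.py and data_processing/clustering.py and is driven by the constants in config.py); the feature columns and parameter values here are placeholders:

```python
# Conceptual sketch only -- cellPLATO's own pipeline functions differ.
import hdbscan
import pandas as pd
import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Hypothetical subset of the per-timepoint shape/migration factors in comb_df.
FEATURE_COLS = ["area", "perimeter", "eccentricity", "speed"]

def embed_and_cluster(comb_df: pd.DataFrame) -> pd.DataFrame:
    """Scale the features, embed with PCA then UMAP, and label clusters with HDBSCAN."""
    x = StandardScaler().fit_transform(comb_df[FEATURE_COLS].values)
    x_pca = PCA(n_components=len(FEATURE_COLS)).fit_transform(x)
    embedding = umap.UMAP(n_components=2, random_state=0).fit_transform(x_pca)
    labels = hdbscan.HDBSCAN(min_cluster_size=50).fit_predict(embedding)

    out = comb_df.copy()
    out["UMAP_1"] = embedding[:, 0]
    out["UMAP_2"] = embedding[:, 1]
    out["cluster_label"] = labels  # -1 marks points HDBSCAN leaves as noise
    return out
```

Each row (one cell at one timepoint) then carries a 2D embedding and a density-based cluster label, which is the representation that the downstream fingerprinting and trajectory-of-behaviour steps build on.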
64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/Btrack_cellPLATO_2024.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import os\n", 11 | "\n", 12 | "import btrack\n", 13 | "import json\n", 14 | "from skimage import io\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "from skimage.measure import regionprops\n", 17 | "from skimage.io import imread\n", 18 | "from btrack.dataio import HDF5FileHandler\n", 19 | "from btrack.constants import BayesianUpdates\n", 20 | "import numpy as np\n", 21 | "from pathlib import Path\n", 22 | "from skimage.io import imread\n", 23 | "from skimage.io import imread\n", 24 | "from skimage.util import montage" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Define a function to make the files into a numpy array\n", 34 | "\n", 35 | "def segmentation_arr(files):\n", 36 | " \"\"\"Segmentation as numpy array.\"\"\"\n", 37 | " \n", 38 | " stack = []\n", 39 | " for filename in files:\n", 40 | " img = imread(filename)\n", 41 | " stack.append(img)\n", 42 | " return np.stack(stack, axis=0)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "## User changes MASTERPATH only. This the globs through and selects all mask_avg folders in condition folders for process >\n", 52 | "\n", 53 | "MASTERPATH = 'D://cellPLATO_test_data/TestSet/'\n", 54 | "\n", 55 | "PATHTOCONFIG = 'initialization/btrack_config.json'\n", 56 | "\n", 57 | "\n", 58 | "# CONFIG_FILE = datasets.cell_config() # this is the default cell tracking config file\n", 59 | "\n", 60 | "# PATHTOCONFIG =\"D://GitHub_software_forallusers/BayesianTracker/BayesianTracker/models/cell_config_Michael_phase.json\"\n", 61 | "INPUT_FMT = 'cellpose2D' #cellpose #cellpose2D\n", 62 | "\n", 63 | "if INPUT_FMT == 'usiigaci': \n", 64 | " PATHCONDLIST = glob.glob(os.path.join(MASTERPATH, 'Condition*', '*_mask_avg'))\n", 65 | " \n", 66 | "elif INPUT_FMT == 'cellpose':\n", 67 | " PATHCONDLIST = glob.glob(os.path.join(MASTERPATH, 'Condition*'))\n", 68 | " \n", 69 | "elif INPUT_FMT == 'cellpose2D':\n", 70 | " PATHCONDLIST = glob.glob(os.path.join(MASTERPATH, 'Condition*', '*'))\n", 71 | "\n", 72 | "elif INPUT_FMT == 'cellpose3D':\n", 73 | " PATHCONDLIST = glob.glob(os.path.join(MASTERPATH, 'Condition*', '*'))\n", 74 | " \n", 75 | "elif INPUT_FMT == 'cellpose_skrt':\n", 76 | " PATHCONDLIST = glob.glob(os.path.join(MASTERPATH, '*' ))\n", 77 | " \n", 78 | "print(PATHCONDLIST)\n", 79 | "\n", 80 | "# Display the PATHCONDLIST in a nice way\n", 81 | "for i in PATHCONDLIST:\n", 82 | " print(i)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "if INPUT_FMT == 'usiigaci':\n", 92 | " scaling = (1., 1.) 
\n", 93 | "elif INPUT_FMT == 'cellpose3D':\n", 94 | " scaling = (3.45, 1., 1.)\n", 95 | " # scaling = (1., 1.)\n", 96 | "elif INPUT_FMT == 'cellpose2D': \n", 97 | " scaling = (1., 1.)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "FEATURES = [\n", 107 | " 'area',\n", 108 | " 'bbox_area',\n", 109 | " 'eccentricity',\n", 110 | " 'equivalent_diameter',\n", 111 | " 'extent',\n", 112 | " 'filled_area',\n", 113 | " 'major_axis_length',\n", 114 | " 'minor_axis_length',\n", 115 | " 'orientation',\n", 116 | " 'perimeter',\n", 117 | " 'solidity',\n", 118 | "\n", 119 | "]\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "## Then extract the segmentation files iteratively, do tracking on them and save a h5 in place\n", 129 | "\n", 130 | "for PATH in PATHCONDLIST:\n", 131 | " \n", 132 | " ww = os.path.dirname(PATH) #wheretosavethefile\n", 133 | " basenameyeah = os.path.basename(PATH)\n", 134 | " \n", 135 | " # files = glob.glob(os.path.join(PATH, '*.png')) #Use this line for Usiigaci\n", 136 | " # files = glob.glob(os.path.join(PATH, '*_cp_masks_T????.tif')) #\n", 137 | " files = glob.glob(os.path.join(PATH, '*_cp_masks.tif')) #Use this line and the part below for Cellpose\n", 138 | "\n", 139 | " for i in files:\n", 140 | " print(i)\n", 141 | "\n", 142 | " stack = []\n", 143 | " stack = segmentation_arr(files)\n", 144 | " obj_from_arr = []\n", 145 | " obj_from_arr = btrack.utils.segmentation_to_objects(stack, scale=scaling, properties=tuple(FEATURES))\n", 146 | "\n", 147 | " with btrack.BayesianTracker() as tracker:\n", 148 | "\n", 149 | " # configure the tracker using a config file\n", 150 | " tracker.configure_from_file(PATHTOCONFIG)\n", 151 | " tracker.max_search_radius = 50\n", 152 | " tracker.verbose = True\n", 153 | "\n", 154 | " tracker.tracking_updates = [\"MOTION\"] #update 2023\n", 155 | " tracker.features = FEATURES #update 2023 \n", 156 | " tracker.update_method = BayesianUpdates.EXACT #changed from EXACT\n", 157 | " # append the objects to be tracked\n", 158 | " tracker.append(obj_from_arr)\n", 159 | " # set the volume\n", 160 | " tracker.volume=((0, 2030), (0, 2030), (0, 100000.)) #changed for Batya\n", 161 | " # track them (in interactive mode)\n", 162 | " tracker.track_interactive(step_size=100)\n", 163 | " # generate hypotheses and run the global optimizer\n", 164 | " tracker.optimize()\n", 165 | " h5fileexport = os.path.join(ww, (basenameyeah + '_tracks.h5'))\n", 166 | " # export the tracks\n", 167 | " tracker.export(h5fileexport, obj_type='obj_type_1')\n", 168 | " # write the segmentation (you needed to add \"a\" rather than \"w\" here!)\n", 169 | " with HDF5FileHandler(h5fileexport, \"a\") as h:\n", 170 | " h.write_segmentation(stack)\n", 171 | " # make sure that we did this by checking that the data exists\n", 172 | " assert \"segmentation\" in h._hdf\n", 173 | " " 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "btrack-2023", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.7.12" 194 | }, 195 | "orig_nbformat": 4 196 | }, 197 | "nbformat": 4, 
198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/__init__.py: -------------------------------------------------------------------------------- 1 | # from cellPLATO.cellPLATO.initialization.config_trackmate import * 2 | from initialization.config import * 3 | from initialization.initialization import * 4 | 5 | from data_processing.cell_identifier import * 6 | from data_processing.cleaning_formatting_filtering import * 7 | from data_processing.clustering import * 8 | from data_processing.data_io import * 9 | from data_processing.data_wrangling import * 10 | from data_processing.dimensionality_reduction import * 11 | from data_processing.measurements import * 12 | from data_processing.migration_calculations import * 13 | from data_processing.pipelines import * 14 | from data_processing.shape_calculations import * 15 | from data_processing.statistics import * 16 | from data_processing.time_calculations import * 17 | from data_processing.trajectory_clustering import * 18 | from data_processing.load_trackmate import * 19 | 20 | from visualization.comparative_visualization import * 21 | from visualization.cluster_visualization import * 22 | from visualization.filter_visualization import * 23 | from visualization.low_dimension_visualization import * 24 | # from visualization.panel_apps import * 25 | from visualization.plots_of_differences import * 26 | from visualization.small_multiples import * 27 | from visualization.superplots import * 28 | from visualization.timecourse_visualization import * 29 | from visualization.trajectory_visualization import * 30 | 31 | print('Finished initializing cellPLATO') 32 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | # from initialization.config import * 2 | # from initialization.initialization import * 3 | 4 | # from data_processing.cell_identifier import * 5 | # from data_processing.cleaning_formatting_filtering import * 6 | # from data_processing.clustering import * 7 | # from data_processing.data_io import * 8 | # from data_processing.data_wrangling import * 9 | # from data_processing.dimensionality_reduction import * 10 | # from data_processing.measurements import * 11 | # from data_processing.migration_calculations import * 12 | # from data_processing.pipelines import * 13 | # from data_processing.shape_calculations import * 14 | # from data_processing.statistics import * 15 | # from data_processing.time_calculations import * 16 | # from data_processing.trajectory_clustering import * 17 | # 18 | # from visualization.comparative_visualization import * 19 | # from visualization.cluster_visualization import * 20 | # from visualization.filter_visualization import * 21 | # from visualization.low_dimension_visualization import * 22 | # from visualization.panel_apps import * 23 | # from visualization.plots_of_differences import * 24 | # from visualization.small_multiples import * 25 | # from visualization.superplots import * 26 | # from visualization.timecourse_visualization import * 27 | # from visualization.trajectory_visualization import * 28 | 29 | print('Finished initializing data_processing') 30 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/cell_identifier.py: 
-------------------------------------------------------------------------------- 1 | #cell_identifier.py 2 | # Functions for finding cells that meet a criteria, or random ones. 3 | #test# 4 | 5 | # from cellPLATO.cellPLATO.initialization.config_trackmate import * 6 | from initialization.config import * 7 | from initialization.initialization import * 8 | 9 | import os 10 | import numpy as np 11 | import pandas as pd 12 | 13 | def get_random_cell(df): 14 | 15 | # Select random row. 16 | i_row = np.random.randint(len(df)) 17 | row = df.iloc[i_row] 18 | 19 | # Get sub_df for cell from random row 20 | cell_df = df[(df['Condition']==row['Condition']) & 21 | (df['Replicate_ID']==row['Replicate_ID']) & 22 | (df['particle']==row['particle'])] 23 | 24 | return cell_df 25 | 26 | def get_cell_mean_variance(df,factor, sortby='mean'): 27 | 28 | ''' 29 | Rank the cells in df with respect to their standard deviation of a given factor. 30 | Used to find example cells that show large changes in a specific factor over time. 31 | ''' 32 | 33 | avg_list = [] 34 | 35 | for rep in df['Replicate_ID'].unique(): 36 | 37 | for cell_id in df[df['Replicate_ID'] == rep]['particle'].unique(): 38 | 39 | # Create a unique cell identifier 40 | rep_ind = list(df['Replicate_ID'].unique()).index(rep) 41 | 42 | cell_uniq_ident = str(rep_ind) + '_' + str(int(cell_id)) 43 | 44 | cell_df = df[(df['Replicate_ID']==rep) & 45 | (df['particle']==cell_id)] 46 | 47 | avg_list.append((rep, cell_id, cell_uniq_ident, np.mean(cell_df[factor]),np.std(cell_df[factor]))) 48 | 49 | df_inds = list(df.index[(df['Replicate_ID']==rep) 50 | & (df['particle']==cell_id)]) 51 | 52 | # Add unique ID back into the original dataframe 53 | df.at[df_inds,'uniq_id'] = cell_uniq_ident 54 | 55 | 56 | mean_std_df = pd.DataFrame(data=avg_list,columns=['rep', 'cell_id','cell_uniq_ident', 'mean','std']) 57 | 58 | if sortby=='mean': 59 | mean_std_df.sort_values(by='mean', ascending=False, inplace=True) 60 | 61 | elif sortby=='std': 62 | mean_std_df.sort_values(by='std', ascending=False, inplace=True) 63 | 64 | 65 | return mean_std_df 66 | 67 | 68 | def get_cell_variance(df,factor): 69 | 70 | ''' 71 | TEMP - TO DELETE. 72 | ''' 73 | print('() is discontinued, use get_cell_mean_variance() instead.') 74 | 75 | 76 | # Get that cell and confirm it has the same measured value. 77 | def get_specific_cell(sum_df, full_df,nth): 78 | 79 | ''' 80 | Having calculated the average and standard deviation for the factor of interest, find the specific cell from the main dataframe 81 | 82 | Input: 83 | sum_df: The dataframe containing summary measurements (ex: std) 84 | full_df: The full datafrme from which we want to extract an example cell 85 | nth: integer indicating which row of sum_df to extract the cell info. 86 | 87 | returns: 88 | cell_df: Section of full_df corresponding to the selected cell. 89 | ''' 90 | 91 | this_rep = sum_df.iloc[nth]['rep'] 92 | this_cell_id = sum_df.iloc[nth]['cell_id'] 93 | this_std = sum_df.iloc[nth]['std'] 94 | 95 | # Get sub_df for cell from random row 96 | cell_df = full_df[(full_df['Replicate_ID']==this_rep) & 97 | (full_df['particle']==this_cell_id)] 98 | 99 | return cell_df 100 | 101 | 102 | 103 | 104 | def get_cell_id(cell_df): 105 | 106 | ''' 107 | For a given cell dataframe, return a string containing a unique identifier, 108 | accounting for the condition, replicate and cell number. 
109 | ''' 110 | 111 | assert len(np.unique(cell_df['particle'].values)) == 1, 'Should be only one cell in dataframe' 112 | cell_number = cell_df['particle'].values[0] 113 | 114 | rep_label = int(cell_df['Rep_label'].values[0]) 115 | cond_label = cell_df['Cond_label'].values[0] 116 | cid_str = str(cond_label)+ '_' + str(rep_label)+ '_' + str(int(cell_number) ) 117 | 118 | return cid_str 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/cleaning_formatting_filtering_remotefix.py: -------------------------------------------------------------------------------- 1 | #cleaning_Labeling.py 2 | 3 | from initialization.config import * 4 | from initialization.initialization import * 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import itertools 11 | 12 | def clean_comb_df(df_in, deduplicate=True): 13 | 14 | ''' 15 | A function with steps to clean up the comb_df 16 | to standardize the dataframe formatting for all the downstream processing steps 17 | 18 | ''' 19 | 20 | df = df_in.copy() 21 | # Drop all-nan rows. 22 | # df.dropna(how='all', axis=1,inplace=True) 23 | # df.dropna(how='any', axis=0,inplace=True) 24 | 25 | if 'Replicate_ID' not in df.columns: 26 | print('No column Replicate_ID, renaming Experiment column') 27 | 28 | df.rename(columns = {'Experiment': 'Replicate_ID'}, inplace=True) 29 | 30 | df.dropna(subset = ['Condition', 'Replicate_ID'], inplace=True) 31 | 32 | # Create Rep_label column 33 | reps = df['Replicate_ID'].unique() 34 | allreps = df['Replicate_ID'].values 35 | 36 | rep_inds = np.empty([len(df)]) 37 | 38 | for i, rep in enumerate(reps): 39 | rep_inds[np.where(allreps==rep)] = i 40 | 41 | df['Cond_label'] = df['Condition'] 42 | df['Rep_label'] = rep_inds 43 | 44 | 45 | if 'level_0' in df.columns: 46 | 47 | df.drop(columns=['level_0'], inplace=True) 48 | df.reset_index(inplace=True,drop=True) 49 | print('Dropped level_0 column.') 50 | 51 | 52 | if(deduplicate): 53 | 54 | #Prepare the combined dataframe for migration calculations 55 | #be ensuring there will be no overlap in columns 56 | 57 | overlap = list(set(df.columns).intersection(MIG_FACTORS)) 58 | print('Overlap:', overlap) 59 | df.drop(columns=overlap, inplace=True) 60 | 61 | # Remove duplicate coloumns 62 | dedup_df = df.loc[:,~df.columns.duplicated()] 63 | df = dedup_df.copy() 64 | 65 | return df 66 | 67 | def apply_unique_id(df): 68 | 69 | ''' 70 | Add column to dataframe indicating a unique id for each cell, constructed as a concatenation of 71 | a numerical representation of the cells experimental replicate and the particle (cell) if. 72 | Of the form: XX_xx 73 | 74 | Additionally, adds column 'ntpts' to the dataframe, to make it easier to filter by track length. 75 | 76 | Input: 77 | df: DataFrame 78 | 79 | 80 | Returns: 81 | None. (Change is made directly to the passed dataframe.) 
82 | 83 | ''' 84 | 85 | for rep in df['Replicate_ID'].unique(): 86 | 87 | for cell_id in df[df['Replicate_ID'] == rep]['particle'].unique(): 88 | 89 | # Create a unique cell identifier 90 | rep_ind = list(df['Replicate_ID'].unique()).index(rep) 91 | 92 | cell_uniq_ident = str(rep_ind) + '_' + str(int(cell_id)) 93 | 94 | cell_df = df[(df['Replicate_ID']==rep) & 95 | (df['particle']==cell_id)] 96 | 97 | df_inds = list(df.index[(df['Replicate_ID']==rep) 98 | & (df['particle']==cell_id)]) 99 | ntpts = len(cell_df) 100 | # Add unique ID back into the original dataframe 101 | df.at[df_inds,'uniq_id'] = cell_uniq_ident 102 | df.at[df_inds,'ntpts'] = ntpts 103 | 104 | 105 | def replace_labels_shortlabels(df): 106 | 107 | ''' 108 | If shortlabels are used, Replace Condition labels with shortlabels. 109 | 110 | Should work on any dataframe, intended for adding shortlabels to the difference plots. 111 | ''' 112 | 113 | assert USE_SHORTLABELS is True, 'This should only be used if USE_SHORTLABELS is True...' 114 | 115 | full_condition_list = list(df['Condition']) 116 | condition_shortlabels = [] 117 | 118 | # Create a shortlabel per replicate 119 | rep_shortlabel_list = [] 120 | 121 | for this_cond_label in full_condition_list: 122 | 123 | this_cond_ind = CONDITIONS_TO_INCLUDE.index(this_cond_label) 124 | this_shortlabel = CONDITION_SHORTLABELS[this_cond_ind] 125 | condition_shortlabels.append(this_shortlabel) 126 | 127 | df['Condition'] = condition_shortlabels 128 | 129 | 130 | def apply_filters(df, filter_cell=True, how = 'all', filter_dict=DATA_FILTERS): 131 | 132 | ''' 133 | Apply the filters defines as FILTERS dictionary in config.py 134 | Apply in subsequent steps, and visualize the loss. 135 | 136 | Adds the 'included' column to the inputted datafra,me 137 | 138 | Returns: 139 | Filtered dataframe 140 | 141 | 142 | ''' 143 | 144 | print('Applying filters:') 145 | print(filter_dict) 146 | 147 | print('Beginning filtering ...') 148 | print(len(df.index), ' data points from ', len(df['uniq_id'].unique()), ' cells') 149 | 150 | df.to_csv(os.path.join(DATA_OUTPUT,'dr_df-prefilt.csv')) 151 | 152 | filt_counts=[] 153 | 154 | 155 | if(filter_cell is False): 156 | 157 | 158 | print('Applying data filters to individual timepoints:') 159 | print(filter_dict) 160 | print('...') 161 | 162 | for i,factor in enumerate(filter_dict.keys()): 163 | print(factor) 164 | print(filter_dict[factor][0], filter_dict[factor][1]) 165 | 166 | '''Consider adding here the export csv summary step, to export along with plots''' 167 | filt_df = df[(df[factor] > filter_dict[factor][0]) &#]#) 168 | (df[factor] < filter_dict[factor][1])] 169 | 170 | df.to_csv(os.path.join(DATA_OUTPUT,'filt_'+str(i)+'-'+factor+'.csv')) 171 | print(len(df.index), ' data points remaining.') 172 | assert len(df.index) > 0, 'Filtered out all the data.' 173 | filt_counts.append((factor, len(filt_df))) 174 | else: 175 | 176 | # Default filtering of entire cell. 
177 | print('Applying filters to entire cell trajectory:') 178 | print(filter_dict) 179 | print('...') 180 | 181 | for cell_id in df['uniq_id'].unique(): 182 | 183 | cell_df = df[df['uniq_id'] == cell_id] 184 | 185 | # make a list to hold the filter results per factor 186 | incl_list = [] 187 | 188 | for i,factor in enumerate(filter_dict.keys()): 189 | 190 | if how == 'any': 191 | included = cell_df[factor].between(filter_dict[factor][0],filter_dict[factor][1]).any() 192 | elif how == 'all': 193 | included = cell_df[factor].between(filter_dict[factor][0],filter_dict[factor][1]).all() 194 | 195 | incl_list.append(included) 196 | filt_counts.append((factor, np.sum(included))) 197 | 198 | assert len(incl_list) == len(filter_dict.keys()) 199 | 200 | # Get indices in the dataframe for this cell. 201 | df_inds = list(df.index[(df['uniq_id']==cell_id)]) 202 | 203 | # Cell is only included if all of the list of criteria are met. 204 | if all(incl_list): 205 | 206 | # Add included flag if true 207 | df.at[df_inds,'included'] = True 208 | 209 | else: 210 | 211 | # Add unique ID back into the original dataframe 212 | df.at[df_inds,'included'] = False 213 | 214 | filt_df = df[df['included'] == True] 215 | 216 | print(' Finished filtering. Resulting dataframe contains:') 217 | print(len(filt_df.index), ' data points from ', len(filt_df['uniq_id'].unique()), ' cells') 218 | 219 | sum_counts = [(key, sum(num for _, num in value)) 220 | for key, value in itertools.groupby(sorted(filt_counts), lambda x: x[0])] 221 | 222 | # Re-index the filtered dataframe, while keeping index of each row in the unfiltered dataframe. 223 | filt_df.reset_index(inplace=True) 224 | filt_df.rename(columns={'level_0': 'comb_df_row_ind'}, inplace=True) 225 | 226 | return filt_df, sum_counts 227 | 228 | 229 | 230 | def factor_calibration(df, mixed_calibration=False): 231 | 232 | if mixed_calibration: 233 | print('Using mixed_calibration.') 234 | df_list = [] 235 | 236 | # Make sure the lists of calibration factors are the correct length 237 | assert len(CONDITIONS_TO_INCLUDE) == len(MICRONS_PER_PIXEL_LIST), 'MICRONS_PER_PIXEL_LIST must be same sized list as CONDITIONS_TO_INCLUDE' 238 | assert len(CONDITIONS_TO_INCLUDE) == len(SAMPLING_INTERVAL_LIST),'SAMPLING_INTERVAL_LIST must be same sized list as CONDITIONS_TO_INCLUDE' 239 | 240 | for i, cond in enumerate(list(df['Condition'].unique())): 241 | 242 | microns_per_pixel = MICRONS_PER_PIXEL_LIST[i] 243 | sampling_interval = SAMPLING_INTERVAL_LIST[i] 244 | print(cond, microns_per_pixel,sampling_interval) 245 | 246 | sub_df = df[df['Condition'] == cond] 247 | 248 | for factor in FACTORS_TO_CONVERT: 249 | 250 | if(factor == 'area' or factor == 'filled_area' or factor == 'bbox_area'): 251 | sub_df[factor] = sub_df[factor] * microns_per_pixel ** 2 252 | 253 | else: 254 | 255 | sub_df[factor] = sub_df[factor] * microns_per_pixel 256 | 257 | # Special case for speed: 258 | 259 | ''' Be extra careful with speed 260 | May also need a correction relative to the base pixel calibration''' 261 | sub_df['speed'] = sub_df['speed'] * sampling_interval / SAMPLING_INTERVAL 262 | 263 | df_list.append(sub_df) 264 | 265 | df_out = pd.concat(df_list) 266 | 267 | 268 | else: 269 | 270 | df_out = df.copy() 271 | 272 | for factor in FACTORS_TO_CONVERT: 273 | 274 | if(factor == 'area' or factor == 'filled_area' or factor == 'bbox_area'): 275 | 276 | df_out[factor] = df_out[factor] * MICRONS_PER_PIXEL ** 2 277 | 278 | else: 279 | 280 | 281 | df_out[factor] = df_out[factor] * MICRONS_PER_PIXEL 282 | 283 | 
return df_out 284 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/data_wrangling.py: -------------------------------------------------------------------------------- 1 | #data_wrangling.py 2 | 3 | from initialization.initialization import * 4 | from initialization.config import * 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import h5py 11 | 12 | def format_for_superplots(df, metric, t, to_csv=False): 13 | 14 | ''' 15 | Dataframe should contain the combination of all loaded datasets to be included in the superplots 16 | metric: a string relating to one of the dataframe column headers, telling which measurement to include in the superplots. 17 | 18 | t: timepoint for visualization 19 | 20 | ''' 21 | # get sub dataframe at the selected timepoint. 22 | sub_df = df.loc[(df['frame'] == t)] 23 | 24 | # if(DEBUG): 25 | # print(sub_df.head()) 26 | 27 | if(USE_SHORTLABELS): 28 | # Create dataframe from the selected series within the original 29 | frame = { 'Replicate': sub_df['Rep_label'], 'Treatment': sub_df['Condition_shortlabel'], str(metric): sub_df[metric] } 30 | else: 31 | # Create dataframe from the selected series within the original 32 | frame = { 'Replicate': sub_df['Rep_label'], 'Treatment': sub_df['Cond_label'], str(metric): sub_df[metric] } 33 | output_df = pd.DataFrame(frame) 34 | 35 | assert len(df.index) > 0, 'Error with empty dataframe' 36 | 37 | if to_csv: 38 | output_df.to_csv(DATA_OUTPUT+'superplot_fmt_'+metric+'_t_'+str(t)+'.csv') 39 | 40 | return output_df 41 | 42 | def get_data_matrix(df, dr_factors=DR_FACTORS): #can deprecate as it is just a one liner 43 | ''' 44 | input dataframe (df): 45 | 46 | returns x: ndarray, matrix containing numerical values to be considered in 47 | dimensionality reduction methods. 48 | ''' 49 | 50 | sub_df = df[dr_factors] # Filter original dataframe by select factors 51 | 52 | x = sub_df.values # Matrix to be used in the dimensionality reduction 53 | 54 | return x 55 | 56 | 57 | ''' 58 | Spacetime-cube related functions 59 | previously in spacetimecube.py, 60 | used only in blender_visualization_pipeline() 61 | ''' 62 | 63 | 64 | def df2stc(df):#,exp_list): 65 | 66 | ''' 67 | Input: DataFrame containing a data from a number of cells at multiple timepoints. 68 | Note: Should also work on combined dataframes. 69 | 70 | Returns: N*n*t Numpy Array, where: 71 | N = the unique cell index (not ID) 72 | D = the factor extracted from the dataframe (including unique ID) 73 | t = integer timepoint index, 74 | (can be converted to time with experimental parameters) 75 | 76 | ''' 77 | 78 | # Open question whether these should be defined somewhere else, 79 | # or stored with the Object like data.n_cells, data.n_factors, etc. 80 | 81 | 82 | 83 | ''' 84 | Note: Particle numbers are only unique to each experiment. 85 | Cannot assume otherwise. 86 | Maybe need to be sure that this function only run on separate experiments. 87 | OR that it splits them up from the beginning. 88 | i.e. assert len(df['Condition'].unique()) == 1 89 | or: if len(df['Condition'].unique()) > 1: Split them. 90 | 91 | ''' 92 | 93 | 94 | 95 | ''' 96 | For testing/development purposes, use only the first condition 97 | Eventually will loop through each condition, creating an array for each and returning the list of arrays 98 | (Assert that the length of the list of arrays is the same as the length of the list of conditions.) 
99 | ''' 100 | 101 | # conditions = exp_list['Condition'] 102 | 103 | # 104 | # if(DEBUG): 105 | # 106 | # display(df) 107 | # 108 | # ax1 = df.plot.scatter(x='x', 109 | # y='y', 110 | # c='DarkBlue') 111 | 112 | 113 | # Take only the first condition from the list. 114 | # sub_df = df.loc[(df['Condition'] == exp_list.loc[0]['Condition'])] #conditions[0] 115 | 116 | # Override conditional selection above, use full DataFrame 117 | sub_df = df.copy() 118 | 119 | factor_list = list(sub_df.columns) # Otherwise is 120 | n_factors = len(factor_list) 121 | 122 | # Select the first row to know about the data types 123 | row = sub_df.iloc[0] # Select first row of data frame 124 | 125 | 126 | strings = row[row.apply(isinstance, args=(str,))] 127 | non_strings = row[~row.apply(isinstance, args=(str,))] 128 | n_num_cats = len(non_strings)# number of numerical catergoies 129 | 130 | #Get the list of headers for the numerical catergories 131 | headers = non_strings.index.values 132 | 133 | # if(DEBUG): 134 | # display(row) 135 | # display(non_strings) 136 | 137 | # Assertions to catch problematic data input 138 | assert n_num_cats + len(strings) == n_factors, 'Mismatach between categories' 139 | assert len(headers) == n_num_cats, 'Number of headers doesnt match number of non-numerical categories' 140 | 141 | cells = np.sort(sub_df['particle'].unique()) 142 | frames = np.sort(sub_df['frame'].unique()) 143 | 144 | n_cells = len(cells) 145 | n_frames = len(frames) 146 | 147 | 148 | # Build a list of dataframes for each timepoint. 149 | df_list = [] 150 | 151 | for t in frames: 152 | t_df = sub_df.loc[(sub_df['frame'] == t)] 153 | df_list.append(t_df) 154 | 155 | # Built the spacetime cubes with space for the non-string contents only. 156 | stc = np.empty([n_cells, n_num_cats, n_frames]) 157 | 158 | for ind, row in sub_df.iterrows(): 159 | 160 | # Split the row into strings and numbers (non-strings) 161 | ''' 162 | The assumption above should be asserted. 163 | ''' 164 | row_str = row[row.apply(isinstance, args=(str,))] 165 | row_data = row[~row.apply(isinstance, args=(str,))] 166 | 167 | frame = int(row['frame']) 168 | cell = int(row['particle']) - 1 169 | 170 | np_row = row_data.to_numpy(copy=True) # Get the data elements of row in numpy format 171 | 172 | ''' 173 | using to_numpy allowed for strings, but the strings can't go into the array. 174 | Will need to convert them to np.nans or ignore text entries entirely 175 | ''' 176 | 177 | # On the first pass, check that the number of factors is correct. 178 | if(ind == 0): 179 | assert np.shape(np_row)[0] == n_num_cats, ' # rows != n_factors' 180 | assert np.shape(np_row) == np.shape(stc[1,:,1]), ' # rows != shape of stc' 181 | 182 | ''' 183 | If something changes in the labelling pattern from imageJ/Fiji or other 184 | upstream software, the asserts below will throw an error to let us know 185 | the arrays won't be indexed correctly. 186 | ''' 187 | 188 | # Ensure frame is zero indexed. 189 | assert frame == 0, 'Frame not correctly zero-indexed for numpy.' 190 | assert cell == 0, ' Cell not correctly zero-indexed for numpy.' 
191 | 192 | # Data transformation (recall 0 indexing of numpy array) 193 | # Recall spacetime-cube dimensions stc[n_cells, n_factors, n_frames] 194 | stc[cell,:,frame] = np_row 195 | 196 | 197 | assert len(df_list) == np.shape(stc)[2], 'df_list length doesnt match time dimension of array' 198 | 199 | return stc, list(headers), df_list # Or a list of stc's 200 | 201 | 202 | def verify_stc(stc): 203 | 204 | ''' 205 | A testing fiinun to validate that the time-array is create as expected. 206 | Not currently implemented as not working properly: 207 | To Do: 208 | - pass stc, or replace from asserts. 209 | - Repair the ValueError: 210 | The truth value of an array with more than one element is ambiguous. 211 | Use a.any() or a.all() 212 | ''' 213 | 214 | print('Verifying that time-array matches with corresponding dataframe for that time point.') 215 | 216 | for t in range(np.shape(stc)[2]): 217 | 218 | for n in range(np.shape(stc)[0] - 2):# -1 because of cell indexing 219 | 220 | this_df = df_list[t] 221 | sub_df = this_df.loc[(this_df['particle'] == n+1)] # +1 accounts for zero indexing of np array but not cell (particle) #. 222 | 223 | x_ind = int(headers.index('x')) 224 | y_ind = int(headers.index('y')) 225 | 226 | if (len(sub_df['x']) > 0): # This avoids assert errors on empty series of the dataframe. 227 | 228 | # assert statements to check that everything lines up correctly 229 | assert sub_df['x'].values == stc[n,x_ind,t], 'Error' 230 | assert sub_df['y'].values == stc[n,y_ind,t], 'Error' 231 | 232 | 233 | 234 | print("If this is the only text you see, it means it worked") 235 | 236 | def condense_stc(stc,headers, zero_it=False, x_label='x', y_label='y'): 237 | 238 | ''' 239 | Function to Condense the spacetime cube to a 2D + time output. 240 | 241 | Function to 'zero' all of the cell trajectories such that they all 242 | begin at the origin of the graph (0, 0). 243 | 244 | Importantly it also reduces the shape to only the x and y positions. 245 | 246 | Inputs: 247 | stc: spacetime cube (numpy array) where: 248 | ax=0 : cell number 249 | ax=1 : factor, measurement 250 | ax=2 : timepoint 251 | headers: Column headers from original dataframe that are passed 252 | to columns of the ndarray 253 | zero_it: Boolean (optional), controls weather the zeroing operation is 254 | performed. Otherwise, allows this function to format for space-time cube visualization 255 | 256 | x_label, y_label: strings, indicate the name of the column headers to be used in the animation. 257 | Allows us to use the tSNE dimensions in the spacetime cube. 258 | Output: 259 | zerod_stc 260 | 261 | ''' 262 | assert not ((x_label != 'x') and (zero_it == True)), 'Zeroing a non-spatial dimension is not supported.' 
263 | 264 | n_cells = np.shape(stc)[0] 265 | n_frames = np.shape(stc)[2] 266 | zerod_stc = np.empty([n_cells, 2, n_frames]) # Creates a spacetime-cube, formatted like a spreadsheet, cells in rows, columns for X and Y, and t in Z 267 | 268 | x_ind = int(headers.index(x_label)) 269 | y_ind = int(headers.index(y_label)) 270 | 271 | # Convert all zero values of x and y position to NaN 272 | xpos_arr = stc[:,x_ind,:] 273 | ypos_arr = stc[:,y_ind,:] 274 | 275 | # Replace zero values with np.nan 276 | xpos_arr[xpos_arr == 0] = np.nan# or use np.nan 277 | ypos_arr[ypos_arr == 0] = np.nan# or use np.nan 278 | 279 | # Insert the corrected values back into the array 280 | stc[:,x_ind,:] = xpos_arr 281 | stc[:,y_ind,:] = ypos_arr 282 | 283 | for i in range(0,n_cells): 284 | 285 | # For each cell, find the first frame on which the cell appears 286 | # This will be the first non-NaN value 287 | # Solution using x position only 288 | non_nan_inds = np.argwhere(~np.isnan(stc[i,x_ind,:])) 289 | first_ind = non_nan_inds[0] 290 | 291 | 292 | for j in range(0,n_frames): 293 | zerod_stc[i,0,j] = stc[i,x_ind,j] - stc[i,x_ind,first_ind] * zero_it 294 | zerod_stc[i,1,j] = stc[i,y_ind,j] - stc[i,y_ind,first_ind] * zero_it 295 | 296 | 297 | return zerod_stc 298 | 299 | 300 | def zero_stc(stc,headers, zero_it=True): 301 | 302 | print('Warning, this function will be replaced by condense_stc(). ') 303 | 304 | ''' 305 | DELETE THIS FUNCTION ONLY WHEN SURE THAT ALL USES OF zero_stc have been replaced with condense_stc. 306 | ''' 307 | 308 | n_cells = np.shape(stc)[0] 309 | n_frames = np.shape(stc)[2] 310 | zerod_stc = np.empty([n_cells, 2, n_frames]) # Creates a spacetime-cube, formatted like a spreadsheet, cells in rows, columns for X and Y, and t in Z 311 | 312 | x_ind = int(headers.index('x')) 313 | y_ind = int(headers.index('y')) 314 | 315 | # Convert all zero values of x and y position to NaN 316 | xpos_arr = stc[:,x_ind,:] 317 | ypos_arr = stc[:,y_ind,:] 318 | 319 | # Replace zero values with np.nan 320 | xpos_arr[xpos_arr == 0] = np.nan# or use np.nan 321 | ypos_arr[ypos_arr == 0] = np.nan# or use np.nan 322 | 323 | # Insert the corrected values back into the array 324 | stc[:,x_ind,:] = xpos_arr 325 | stc[:,y_ind,:] = ypos_arr 326 | 327 | for i in range(0,n_cells): 328 | 329 | # For each cell, find the first frame on which the cell appears 330 | # This will be the first non-NaN value 331 | # Solution using x position only 332 | non_nan_inds = np.argwhere(~np.isnan(stc[i,x_ind,:])) 333 | first_ind = non_nan_inds[0] 334 | 335 | 336 | for j in range(0,n_frames): 337 | zerod_stc[i,0,j] = stc[i,x_ind,j] - stc[i,x_ind,first_ind] * zero_it 338 | zerod_stc[i,1,j] = stc[i,y_ind,j] - stc[i,y_ind,first_ind] * zero_it 339 | 340 | 341 | return zerod_stc 342 | 343 | 344 | 345 | def stc2df(stc_0d): 346 | 347 | ''' 348 | Transform the origin-corrected ndarray to a format 349 | to be visualized in 3d with plotly. 
350 | 351 | Input: 352 | stc0d: 'zeroed' ndarray (time-array, spacetime-cube) 353 | 354 | Output: 355 | out_df: DataFrame, transposed and reshaped such that 356 | origin-corrected cells are in rows, with columns: 357 | cell, X0, Y0, t (slice) 358 | ''' 359 | 360 | n,m,t = stc_0d.shape 361 | 362 | # Transpose the array upsteam of the reshape 363 | transp_array = np.transpose(stc_0d,(0,2,1)) 364 | out_arr = np.column_stack((np.repeat(np.arange(n),t), 365 | transp_array.reshape(n*t,-1), 366 | np.repeat(np.arange(t),n))) 367 | 368 | out_df = pd.DataFrame(out_arr,columns=['cell', 'X0', 'Y0', 't']) 369 | 370 | return out_df 371 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/load_trackmate.py: -------------------------------------------------------------------------------- 1 | # load_trackmate.py 2 | from initialization.initialization import * 3 | from initialization.config import * 4 | from data_processing.cleaning_formatting_filtering import * 5 | from data_processing.migration_calculations import * 6 | import re 7 | import tqdm 8 | import pandas as pd 9 | import numpy as np 10 | 11 | # Thanks to Guillaume Jacquemet for the following code structure. It does some trackmate loading followed by some cellPLATO formatting. 12 | 13 | def populate_columns(df, filepath): 14 | # Extract the parts of the file path 15 | path_parts = os.path.normpath(filepath).split(os.sep) 16 | 17 | if len(path_parts) < 3: 18 | # if there are not enough parts in the path to extract folder and parent folder 19 | print(f"Error: Cannot extract parent folder and folder from the filepath: {filepath}") 20 | return df 21 | 22 | # Assuming that the file is located at least two levels deep in the directory structure 23 | folder_name = path_parts[-2] # The folder name is the second last part of the path 24 | parent_folder_name = path_parts[-3] # The parent folder name is the third last part of the path 25 | 26 | 27 | filename_without_extension = os.path.splitext(os.path.basename(filepath))[0] 28 | 29 | 30 | 31 | df['File_name'] = remove_suffix(filename_without_extension) 32 | df['Condition'] = parent_folder_name # Populate 'Condition' with the parent folder name 33 | # df['experiment_nb'] = folder_name # Populate 'Repeat' with the folder name 34 | df['Replicate_ID'] = parent_folder_name + folder_name # Populate 'Repeat' with the folder name 35 | 36 | 37 | ############### 38 | return df 39 | 40 | 41 | def load_and_populate(file_pattern, usecols=None, chunksize=100000, Folder_path = DATA_PATH, Results_Folder = SAVED_DATA_PATH): 42 | df_list = [] 43 | pattern = re.compile(file_pattern) # Compile the file pattern to a regex object 44 | files_to_process = [] 45 | 46 | # First, list all the files we'll be processing 47 | for dirpath, dirnames, filenames in os.walk(Folder_path): 48 | # print(f"Dirpath is {dirpath}") 49 | # print(f"Dirnames is {dirnames}") 50 | # print(f"filenames is {filenames}") 51 | for filename in filenames: 52 | if pattern.match(filename): # Check if the filename matches the file pattern 53 | filepath = os.path.join(dirpath, filename) 54 | files_to_process.append(filepath) 55 | 56 | # Metadata list 57 | metadata_list = [] 58 | 59 | # Create a tqdm instance for progress tracking 60 | for filepath in tqdm.tqdm(files_to_process, desc="Processing Files"): 61 | # Get the expected number of rows in the file (subtracting header rows) 62 | expected_rows = sum(1 for row in open(filepath)) - 4 63 | 64 | # Get file size 65 | file_size = 
os.path.getsize(filepath) 66 | 67 | # Add to the metadata list 68 | metadata_list.append({ 69 | 'filename': os.path.basename(filepath), 70 | 'expected_rows': expected_rows, 71 | 'file_size': file_size 72 | }) 73 | 74 | chunked_reader = pd.read_csv(filepath, skiprows=[1, 2, 3], usecols=usecols, chunksize=chunksize) 75 | 76 | for chunk in chunked_reader: 77 | processed_chunk = populate_columns(chunk, filepath) 78 | df_list.append(processed_chunk) 79 | 80 | if not df_list: # if df_list is empty, return an empty DataFrame 81 | print(f"No files found with pattern: {file_pattern}") 82 | return pd.DataFrame() 83 | 84 | merged_df = pd.concat(df_list, ignore_index=True) 85 | # Verify the total rows in the merged dataframe matches the total expected rows from metadata 86 | total_expected_rows = sum(item['expected_rows'] for item in metadata_list) 87 | if len(merged_df) != total_expected_rows: 88 | print(f"Warning: Mismatch in total rows. Expected {total_expected_rows}, found {len(merged_df)} in the merged dataframe.") 89 | else: 90 | print(f"Success: The processed dataframe matches the metadata. Total rows: {len(merged_df)}") 91 | return merged_df, metadata_list 92 | 93 | 94 | 95 | def sort_and_generate_repeat(merged_df): 96 | merged_df.sort_values(['Condition', 'experiment_nb'], inplace=True) 97 | merged_df = merged_df.groupby('Condition', group_keys=False).apply(generate_repeat) 98 | return merged_df 99 | 100 | def generate_repeat(group): 101 | unique_experiment_nbs = sorted(group['experiment_nb'].unique()) 102 | experiment_nb_to_repeat = {experiment_nb: i+1 for i, experiment_nb in enumerate(unique_experiment_nbs)} 103 | group['Repeat'] = group['experiment_nb'].map(experiment_nb_to_repeat) 104 | return group 105 | 106 | def remove_suffix(filename): 107 | suffixes_to_remove = ["-tracks", "-spots"] 108 | for suffix in suffixes_to_remove: 109 | if filename.endswith(suffix): 110 | filename = filename[:-len(suffix)] 111 | break 112 | return filename 113 | 114 | 115 | def validate_tracks_df(df): 116 | """Validate the tracks dataframe for necessary columns and data types.""" 117 | required_columns = ['TRACK_ID'] 118 | for col in required_columns: 119 | if col not in df.columns: 120 | print(f"Error: Column '{col}' missing in tracks dataframe.") 121 | return False 122 | 123 | # Additional data type checks or value ranges can be added here 124 | return True 125 | 126 | def validate_spots_df(df): 127 | """Validate the spots dataframe for necessary columns and data types.""" 128 | required_columns = ['TRACK_ID', 'POSITION_X', 'POSITION_Y', 'POSITION_Z', 'POSITION_T'] 129 | for col in required_columns: 130 | if col not in df.columns: 131 | print(f"Error: Column '{col}' missing in spots dataframe.") 132 | return False 133 | 134 | # Additional data type checks or value ranges can be added here 135 | return True 136 | 137 | def check_unique_id_match(df1, df2): 138 | df1_ids = set(df1['Unique_ID']) 139 | df2_ids = set(df2['Unique_ID']) 140 | 141 | # Check if the IDs in the two dataframes match 142 | if df1_ids == df2_ids: 143 | print("The Unique_ID values in both dataframes match perfectly!") 144 | else: 145 | missing_in_df1 = df2_ids - df1_ids 146 | missing_in_df2 = df1_ids - df2_ids 147 | 148 | if missing_in_df1: 149 | print(f"There are {len(missing_in_df1)} Unique_ID values present in the second dataframe but missing in the first.") 150 | print("Examples of these IDs are:", list(missing_in_df1)[:5]) 151 | 152 | if missing_in_df2: 153 | print(f"There are {len(missing_in_df2)} Unique_ID values present in the 
first dataframe but missing in the second.") 154 | print("Examples of these IDs are:", list(missing_in_df2)[:5]) 155 | 156 | ##### 157 | 158 | # Function to calculate Cohen's d 159 | def cohen_d(group1, group2): 160 | diff = group1.mean() - group2.mean() 161 | n1, n2 = len(group1), len(group2) 162 | var1 = group1.var() 163 | var2 = group2.var() 164 | pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2) 165 | d = diff / np.sqrt(pooled_var) 166 | return d 167 | 168 | def save_dataframe_with_progress(df, path, desc="Saving", chunk_size=50000): 169 | """Save a DataFrame with a progress bar.""" 170 | 171 | # Estimating the number of chunks based on the provided chunk size 172 | num_chunks = int(len(df) / chunk_size) + 1 173 | 174 | # Create a tqdm instance for progress tracking 175 | with tqdm(total=len(df), unit="rows", desc=desc) as pbar: 176 | # Open the file for writing 177 | with open(path, "w") as f: 178 | # Write the header once at the beginning 179 | df.head(0).to_csv(f, index=False) 180 | 181 | for chunk in np.array_split(df, num_chunks): 182 | chunk.to_csv(f, mode="a", header=False, index=False) 183 | pbar.update(len(chunk)) 184 | 185 | def check_for_nans(df, df_name): 186 | """ 187 | Checks the given DataFrame for NaN values and prints the count for each column containing NaNs. 188 | 189 | Args: 190 | df (pd.DataFrame): DataFrame to be checked for NaN values. 191 | df_name (str): The name of the DataFrame as a string, used for printing. 192 | """ 193 | # Check if the DataFrame has any NaN values and print a warning if it does. 194 | nan_columns = df.columns[df.isna().any()].tolist() 195 | 196 | if nan_columns: 197 | for col in nan_columns: 198 | nan_count = df[col].isna().sum() 199 | print(f"Column '{col}' in {df_name} contains {nan_count} NaN values.") 200 | else: 201 | print(f"No NaN values found in {df_name}.") 202 | 203 | 204 | def trackmate_to_cellPLATO(df): 205 | # This will become the function to make the comb_df 206 | 207 | input_df=df.copy() 208 | 209 | '''This part renames a lot of columns to match cellPLATO''' 210 | 211 | # rename LABEL to trackmate_label 212 | input_df = input_df.rename(columns={'LABEL':'trackmate_label'}) 213 | # ID to particle 214 | input_df = input_df.rename(columns={'ID':'particle'}) 215 | # change the data type of particle to float 216 | input_df['particle'] = input_df['particle'].astype(float) 217 | # rename POSITION_X to x, POSITION_Y to y, POSITION_Z to z, FRAME to t 218 | input_df = input_df.rename(columns={'POSITION_X':'x', 'POSITION_Y':'y', 'POSITION_Z':'z', 'FRAME':'frame'}) 219 | 220 | # Convert the values in frame column to float 221 | input_df['frame'] = input_df['frame'].astype(float) 222 | 223 | '''This part makes the x_um, y_um, z_um columns just by replicating the existing ones''' 224 | # copy the x column to a new x_um column, and the y column to a new y_um column, and z to z_um 225 | input_df['x_um'] = input_df['x'] 226 | input_df['y_um'] = input_df['y'] 227 | input_df['z_um'] = input_df['z'] 228 | # Same with the x_pix, y_pix, z_pix 229 | input_df['x_pix'] = input_df['x'] 230 | input_df['y_pix'] = input_df['y'] 231 | input_df['z_pix'] = input_df['z'] 232 | 233 | '''This part makes the Rep_label column''' 234 | # Make a column of floats that corresponds to the 'Replicate_ID' column and call it 'Rep_label' 235 | # To do this, extract the 'Replicate_ID' columns from the merged_spots_df 236 | Replicate_ID = input_df['Replicate_ID'] 237 | # Get the unique Replicate_IDs 238 | Replicate_ID_unique = np.unique(Replicate_ID) 239 | 
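# Side note (a sketch, not the original implementation): the enumerate loop below is a plain
# label encoding of Replicate_ID. A compact alternative would be
#     codes, uniques = pd.factorize(input_df['Replicate_ID'], sort=True)
#     input_df['Rep_label'] = codes.astype(float)
# where sort=True makes the integer codes follow the same sorted order as np.unique above.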
# Make a dictionary of the unique Replicate_IDs and a number (float) that corresponds to them 240 | Replicate_ID_dict = {} 241 | for i, ID in enumerate(Replicate_ID_unique): 242 | Replicate_ID_dict[ID] = i 243 | # Make a new column called 'Rep_label' and populate it with the float values from the dictionary 244 | input_df['Rep_label'] = input_df['Replicate_ID'].map(Replicate_ID_dict) 245 | # make those floats 246 | input_df['Rep_label'] = input_df['Rep_label'].astype(float) 247 | 248 | # Make a new column called 'Condition_shortlabel' which has the same value as 'Condition' 249 | input_df['Condition_shortlabel'] = input_df['Condition'] 250 | 251 | ########################## 252 | 253 | # Then, add the ntpts and the uniq_id to the df 254 | 255 | apply_unique_id_trackmate(input_df) 256 | #sort by frame 257 | input_df = input_df.sort_values(by=['uniq_id', 'frame']) 258 | 259 | # display(input_df) 260 | 261 | # Then, do the cellPLATO migration calculations 262 | 263 | if DO_CP_METRICS_FOR_TRACKMATE: 264 | 265 | proto_comb_list = [] 266 | proto_comb_df = pd.DataFrame() 267 | 268 | ######################################################## 269 | 270 | for replicate in np.unique(input_df['Replicate_ID']): 271 | # extract the replicate 272 | replicate_df = input_df[input_df['Replicate_ID'] == replicate] 273 | # sort that df by uniq_id and frame 274 | replicate_df = replicate_df.sort_values(by=['uniq_id', 'frame']) 275 | 276 | 277 | # print('For this replciate df, the replicated is ', replicate_df['Replicate_ID'].unique()) 278 | # print('And the rep_label is ', replicate_df['Rep_label'].unique()) 279 | # print('And the unique ID is ', replicate_df['uniq_id'].unique()) 280 | # print('And finally the file name is ', replicate_df['File_name'].unique()) 281 | # print('And the condition is ', replicate_df['Condition'].unique()) 282 | 283 | # do the migration measurements 284 | mig_df = migration_calcs(replicate_df) 285 | 286 | mig_df.reset_index(inplace=True, drop=True) 287 | # # add it to the proto_comb_df list 288 | proto_comb_list.append(mig_df) 289 | 290 | proto_comb_df = pd.concat(proto_comb_list, ignore_index=True) 291 | 292 | # proto_comb_df = pd.concat([proto_comb_df,mig_df]) 293 | proto_comb_df.reset_index(inplace=True, drop=True) 294 | else: 295 | proto_comb_df = input_df 296 | 297 | 298 | 299 | ############# 300 | return proto_comb_df 301 | ##### 302 | 303 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/measurements.py: -------------------------------------------------------------------------------- 1 | #measurements.py 2 | 3 | from initialization.initialization import * 4 | from initialization.config import * 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from tqdm import tqdm 11 | 12 | def calc_aspect_ratio(df, drop=False): 13 | 14 | df['aspect'] = df['major_axis_length']/df['minor_axis_length'] 15 | 16 | # Remove NaNs 17 | if (drop): 18 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 19 | df.dropna(subset=["aspect"], how="all", inplace=True) 20 | 21 | 22 | 23 | def ripley_K(X,r): 24 | 25 | ''' 26 | Calculate Ripleys K for a given radius r 27 | 28 | ''' 29 | 30 | # Extract the number of other points, p, within a distance r. 31 | rip = [] 32 | 33 | for i,x in enumerate(X): 34 | 35 | # Get the distance matrix for this point. 36 | Xd = np.sqrt((X[:,0]-x[0])**2 + (X[:,1]-x[1])**2) 37 | 38 | Xd = np.delete(Xd, (i), axis=0) # Delete self. 
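# Performance note (illustrative sketch, assuming SciPy is available): the per-point distance
# vector above could be computed for all points at once, e.g.
#     from scipy.spatial.distance import cdist
#     D = cdist(X, X)                    # (N, N) pairwise distance matrix
#     counts = (D < r).sum(axis=1) - 1   # points within radius r, excluding self
# The original per-point loop is left unchanged here.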
39 | 40 | # Count the number of points within radius 41 | n = len(X) # Number of points total 42 | A = np.pi * r ** 2 # Area of circle with radius r 43 | p = sum(Xd < r) # Number of points within radius r 44 | K = p * A / n # Ripley's K - number of points within radius r per unit area 45 | L = (K / np.pi) ** 0.5 # Ripley's L - radius of circle with same density as K 46 | 47 | rip.append([p,K,L]) # Append tuple containing count, Ripley's K and L 48 | 49 | rip = np.asarray(rip) 50 | 51 | return rip 52 | 53 | 54 | 55 | 56 | def calc_ripleys_xy(df_in, r=RIP_R, plot=False, inplace=False): 57 | 58 | ''' 59 | Calculate ripleys p, K and L for a given radius r. 60 | Create a dataframe with these measurements. 61 | 62 | ''' 63 | 64 | print('Calculating ripleys p, K and L with radius: ', r, ' (pixels)') 65 | df = df_in.copy() 66 | 67 | df_list = [] 68 | 69 | for rep in df['Replicate_ID'].unique(): 70 | 71 | rep_df = df[df['Replicate_ID'] == rep] 72 | 73 | 74 | for frame in rep_df['frame'].unique(): 75 | 76 | t_df = rep_df[rep_df['frame'] == frame] 77 | pos = t_df[['x_um', 'y_um']].values 78 | rip = ripley_K(pos,r) 79 | 80 | 81 | t_df['rip_p'] = rip[:,0] # Number of points within radius 82 | t_df['rip_K'] = rip[:,1] 83 | t_df['rip_L'] = rip[:,2] 84 | 85 | df_list.append(t_df) 86 | 87 | 88 | if plot: 89 | 90 | ''' 91 | Plot should be made to create animation, gif?? 92 | ''' 93 | 94 | plt.scatter(pos[:, 0], pos[:, 1], c=rip[:,2], s=t_df['area']/5) # Colormap by ripleys L 95 | 96 | plt.show() 97 | 98 | df_out = pd.concat(df_list) 99 | df_out.sort_index(inplace=True) 100 | 101 | return df_out 102 | 103 | 104 | 105 | def standardize_factors_per_cell(df_in, factor_list=['area', 'perimeter']): 106 | 107 | from sklearn.preprocessing import StandardScaler 108 | 109 | df = df_in.copy() 110 | cell_df_list = [] 111 | 112 | unique_id = 0 # Create a unique cell id 113 | rep_list = df['Replicate_ID'].unique() 114 | 115 | # For each replicate 116 | for i_rep, this_rep in enumerate(rep_list): 117 | 118 | rep_df = df[df['Replicate_ID']==this_rep] 119 | cell_ids = rep_df['particle'].unique() # Particle ids only unique for replicate, not between. 120 | 121 | # For each cell, calculate the average value and add to new DataFrame 122 | print('Replicate ', i_rep+1, ' out of ', len(rep_list)) 123 | for cid in tqdm(cell_ids): 124 | 125 | cell_df = rep_df[rep_df['particle'] == cid] 126 | 127 | # A test to ensure there is only one replicate label included. 128 | assert len(cell_df['Rep_label'].unique()) == 1, 'check reps' 129 | 130 | # x = get_data_matrix(cell_df, dr_factors=factor_list) 131 | x = cell_df[factor_list].values 132 | x_ = StandardScaler().fit_transform(x) 133 | 134 | 135 | cell_df[factor_list] = x_ 136 | cell_df_list.append(cell_df) 137 | 138 | df_out = pd.concat(cell_df_list) 139 | df_out.sort_index(inplace=True) 140 | 141 | return df_out 142 | 143 | 144 | 145 | 146 | def t_window_metrics(df_in, t_window=MIG_T_WIND,min_frames=MIG_T_WIND/2,factor_list=DR_FACTORS): 147 | 148 | ''' 149 | Create measurements average and ratio measurements for each. 150 | ''' 151 | 152 | df = df_in.copy() 153 | df_list = [] 154 | 155 | time_avg_df = pd.DataFrame() 156 | unique_id = 0 # Create a unique cell id 157 | rep_list = df['Replicate_ID'].unique() 158 | new_factor_list = [] 159 | 160 | # For each replicate 161 | for i_rep, this_rep in enumerate(rep_list): 162 | 163 | rep_df = df[df['Replicate_ID']==this_rep] 164 | cell_ids = rep_df['particle'].unique() # Particle ids only unique for replicate, not between. 
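# Orientation note (added comment): for every factor in factor_list, e.g. 'area', the loop
# below is expected to add an 'area_tmean' column (mean of 'area' over the surrounding time
# window) and an 'area_ratio' column (the current value divided by that window mean), and to
# return the augmented dataframe together with the expanded factor list.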
165 | 166 | # For each cell, calculate the average value and add to new DataFrame (akin to making the tavg_df) 167 | print('Replicate ', i_rep, ' out of ', len(rep_list)) 168 | for cid in tqdm(cell_ids): 169 | 170 | cell_df = rep_df[rep_df['particle'] == cid] 171 | 172 | # A test to ensure there is only one replicate label included. 173 | assert len(cell_df['Rep_label'].unique()) == 1, 'check reps' 174 | 175 | # Unique list of frames for this cell 176 | frame_list = cell_df['frame'].unique() 177 | 178 | for frame in frame_list: 179 | 180 | # get a subset of the dataframe across the range of frames 181 | t_wind_df = cell_df[(cell_df['frame']>=frame - t_window/2) & 182 | (cell_df['frame'] < frame + t_window/2)] 183 | 184 | # Row(s) of the dataframe for this single timepoint 185 | tpt_df = cell_df[cell_df['frame'] == frame] 186 | 187 | 188 | # Only process time windows that contain enough frames 189 | if len(t_wind_df) >= min_frames: 190 | 191 | # Do the measurements for each factor 192 | for factor in factor_list: 193 | 194 | mean_str = factor + '_tmean' 195 | ratio_str = factor + '_ratio' 196 | 197 | # Mean value for factor across time window 198 | tpt_df[mean_str] = np.nanmean(t_wind_df[factor]) #adds new col to df called 'area_tmean' for example 199 | 200 | # Ratio 201 | tpt_df[ratio_str] = tpt_df[factor] / tpt_df[mean_str] 202 | 203 | # Keep a list of the factors in order to make DR methods easier to implement 204 | new_factor_list.append(factor) 205 | new_factor_list.append(mean_str) 206 | new_factor_list.append(ratio_str) 207 | 208 | df_list.append(tpt_df) # Append the row of new calculations to a list of dataframes 209 | 210 | # Increase the unique id given to each cell 211 | unique_id += 1 212 | 213 | # Assemble the df_list into a dataframe and reorder by index. 214 | df_out = pd.concat(df_list) 215 | df_out.sort_index(inplace=True) 216 | 217 | new_factor_list=np.unique(new_factor_list) 218 | 219 | 220 | 221 | return df_out, new_factor_list 222 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/migration_calculations.py: -------------------------------------------------------------------------------- 1 | #migration_calculations.py 2 | 3 | from initialization.config import * 4 | from initialization.initialization import * 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from tqdm import tqdm 11 | 12 | def cell_calcs(cell_tarray, t_window=MIG_T_WIND):#, calibrate): 13 | 14 | ''' 15 | Cell migration calculations for a given cell through time. 16 | This function is passed a numpy array corresponding to the timecourse of a single cell, 17 | (from a single experimental replicate) 18 | 19 | 20 | Migration calcs accessory function that is optimized to use NumPy only, instead 21 | of pandas. 22 | 23 | Input: 24 | cell_tarray: [T * 4] NumPy array, where T is the number of frames over which this cell was tracked 25 | [frame, x_um, y_um, index]: 26 | 27 | t_window = int; width of the time window in # of frames. 28 | 29 | Returns: 30 | cell_calcs: list; 31 | 32 | UPDATED: This version of the function calculates certain values across a time window. 33 | 34 | 35 | ''' 36 | 37 | cell_calcs = [] 38 | mig_calcs = [] 39 | 40 | if(cell_tarray.shape[0] > 0): 41 | 42 | # Find the first and last frame in which this cell was tracked.
43 | init_f = int(np.min(cell_tarray[:,0])) 44 | final_f = int(np.max(cell_tarray[:,0])) 45 | 46 | # Enumerate across the range of frames 47 | for i, t in enumerate(range(init_f, final_f)): # Because we need a count and an index, for cases where cells arent included throughout 48 | 49 | # Adding actual window size 50 | # actual_window_size = min(t - init_f + 1, final_f - t, t_window) #trackmate 51 | 52 | # Extract separate arrays for the timepoints and window of interest 53 | prev_frame_arr = np.squeeze(cell_tarray[np.where(cell_tarray[:,0] == t-1)]) 54 | this_frame_arr = np.squeeze(cell_tarray[np.where(cell_tarray[:,0] == t)]) 55 | 56 | # if INPUT_FMT == 'trackmate': 57 | 58 | # #### trackmate 59 | # # Extract the time window array considering the actual window size 60 | # t_window_arr = np.squeeze(cell_tarray[np.where((cell_tarray[:,0] >= t - actual_window_size//2) & 61 | # (cell_tarray[:,0] < t + actual_window_size//2))]) 62 | # size_of_window = actual_window_size 63 | 64 | 65 | # # Check if the t_window_arr is not empty 66 | # if t_window_arr.size > 0 and t_window_arr.shape[0] == actual_window_size: 67 | # # Access the first row of the window 68 | # init_frame_arr = t_window_arr[0,:] 69 | 70 | # # ... [rest of your calculations] 71 | # else: 72 | # # Handle the case where t_window_arr is empty 73 | # # For example, you can continue to the next iteration of the loop 74 | # continue 75 | 76 | 77 | 78 | # else: 79 | # t_window_arr = np.squeeze(cell_tarray[np.where((cell_tarray[:,0] >= t - t_window/2) & 80 | # (cell_tarray[:,0] < t + t_window/2))]) 81 | # size_of_window = t_window 82 | t_window_arr = np.squeeze(cell_tarray[np.where((cell_tarray[:,0] >= t - t_window/2) & 83 | (cell_tarray[:,0] < t + t_window/2))]) 84 | # size_of_window = t_window # Redundant, equivalent to t_window 85 | 86 | ##### 87 | init_frame_arr = t_window_arr[0,:] # MOVED THIS INTO LOOP Use the first row of the window 88 | 89 | # segment_length = np.nan # default value 90 | 91 | 92 | # Only process calculations for which we have the entire window 93 | if(t_window_arr.shape[0] == t_window): 94 | 95 | # Extract the critical coordinates for making mnigration calculations 96 | x0, y0 = init_frame_arr[1:3] 97 | xi, yi = prev_frame_arr[1:3] 98 | xf, yf = this_frame_arr[1:3] 99 | 100 | # Extract the xy-track across the window 101 | window_traj = t_window_arr[:,1:3] 102 | 103 | # Use the index of the row of the subdf to insert value into original df 104 | ind = this_frame_arr[3] 105 | 106 | # Decide which one to keep 107 | segment_length = np.sqrt((xf-xi)**2 + (yf-yi)**2) 108 | # dist = np.sqrt((xf-xi)**2 + (yf-yi)**2) # Redundant, equivalent to segment_length 109 | 110 | '''Decide which one to keep''' 111 | euc_dist = np.sqrt((xf-x0)**2 + (yf-y0)**2) 112 | # net_dist = np.sqrt((xf-x0)**2 + (yf-y0)**2) # Redundant, equivalent to euc_dist 113 | 114 | speed = segment_length / SAMPLING_INTERVAL # Units will be in microns per unit of time of T_INC 115 | 116 | 117 | # Efficient cumulative euclidean distance calculation: 118 | diff = np.diff(window_traj, axis=0, prepend=window_traj[-1].reshape((1, -1))) 119 | ss = np.power(diff, 2).sum(axis=1) 120 | cumul_euc_dist = np.sqrt(ss).sum() 121 | # 122 | # Calculate the cumulative path length across the window 123 | ''' 124 | Would be nice to replace with a more efficient implementation 125 | as for cumulative euclidean above 126 | ''' 127 | 128 | 129 | # Calculations to be made across the window 130 | 131 | cumulative_dist_sqrd = 0 # reset for each window 132 | dist_list = [] 133 | 
turn_list = [] 134 | 135 | for n in range(1,len(window_traj)): 136 | 137 | x_, y_ = window_traj[n-1,:] 138 | x__, y__ = window_traj[n,:] 139 | dist = np.sqrt((x__-x_)**2 + (y__-y_)**2) 140 | dist_list.append(dist) 141 | 142 | # Global turn (relative to previous frame) 143 | glob_turn = np.arctan((y__ - y_) / (x__ - x_)) # Equivalent to turn_angle_radians 144 | turn_list.append(glob_turn) 145 | 146 | 147 | if INPUT_FMT == 'trackmate': 148 | actual_window_size = len(window_traj) 149 | assert len(dist_list) == actual_window_size - 1, 'length of computed distances does not match actual window size' 150 | else: 151 | 152 | assert len(dist_list) == t_window-1, 'length of computed distances doesnt match time window' 153 | 154 | # Summary measurements across the time window 155 | cumulative_length = np.sum(dist_list) 156 | max_dist = np.max(dist_list) 157 | 158 | # Mean-squared displacement (MSD) 159 | msd = np.sum(np.power(dist_list,2)) / t_window 160 | 161 | 162 | cumulative_dist_sqrd = cumulative_dist_sqrd + segment_length**2 163 | 164 | # Meandering index 165 | # meandering_ind = net_dist / total_dist 166 | meandering_ind = euc_dist / cumulative_length 167 | # Outreach Ratio 168 | # outreach_ratio = max_dist / total_dist 169 | outreach_ratio = max_dist / cumulative_length 170 | 171 | # Arrest coefficient - proportion of track cell is immobile (speed < x um) 172 | 173 | arrest_coefficient = sum(dist < ARREST_THRESHOLD for dist in dist_list) / len(dist_list) 174 | 175 | 176 | # 177 | # Direction calculations 178 | # 179 | 180 | # Global turn for this frame 181 | glob_turn = np.arctan((yf - y0) / (xf - x0))# change from yi and xi 182 | glob_turn_deg = np.degrees(glob_turn) # 183 | dir_autocorr = np.cos(turn_list[int(t_window/2)-1] - 184 | turn_list[int(t_window/2)-2]) 185 | 186 | ''' 187 | The directional autocorrelation is usually calculated between this and the previous frame 188 | It would be more interesting as compared to the trajectory in the time window./ 189 | ''' 190 | 191 | # Orientation 192 | axis_angle = np.arctan(yf / xf) # Temp 193 | orientation = np.cos(2 * np.radians(axis_angle)) 194 | 195 | # Directedness 196 | directedness = (xf - x0) / euc_dist 197 | 198 | # Turned angle (Between two frames) 199 | turn_angle_radians = np.arctan((yf - yi) / (xf - xi)) 200 | turn_angle = np.degrees(turn_angle_radians) 201 | 202 | # Endpoint directionality ratio 203 | endpoint_dir_ratio = euc_dist / cumulative_length 204 | 205 | 206 | # Combine current calculations into a list for the current timepoint 207 | mig_calcs = [ind, 208 | euc_dist, 209 | segment_length, 210 | cumulative_length, 211 | speed, 212 | orientation, 213 | directedness, 214 | turn_angle, 215 | endpoint_dir_ratio, 216 | # New ones added: 217 | dir_autocorr, 218 | outreach_ratio, 219 | msd, 220 | max_dist, 221 | glob_turn_deg, 222 | arrest_coefficient] 223 | 224 | # Add the current timepoint calculations to the cell-sepecific list of calculations 225 | cell_calcs.append(mig_calcs) 226 | 227 | 228 | return cell_calcs 229 | 230 | 231 | def migration_calcs(df_in):#, calibrate=CALIBRATE_MIG): 232 | 233 | ''' 234 | Re-implementation of the previous Usiigaci function to calculate cell 235 | migration measurements, for the dataframe instead of a numpy array. 236 | 237 | 238 | Function works in two steps: 239 | 1. Calculate any frame-independent measures, i.e. that don't require 240 | comparing to a previous frame. These are applied to the entire sub_df 241 | associated with a given cell. (Orientation) 242 | 2. 
Calculate frame-dependent measures, where the difference of a measurement 243 | is made with a previous frame. These must be done on a further segmented 244 | dataframe. 245 | 246 | Read from df_in, make changes to df_out. 247 | 248 | ''' 249 | # calibrate = CALIBRATE_MIG # Previously an input argument, placed in function so cannot be changed. 250 | 251 | df_out = df_in.copy() # Make a copy so as not to modify the original input 252 | df_out.reset_index(inplace=True, drop=True) 253 | # df_out.drop(columns=['index'],inplace=True) # Dropped already in reset_index 254 | assert len(df_out.index.unique()) == len(df_out.index), 'Dataframe indexes not unique' 255 | 256 | # Determine if dataframe contains a single replicate or multiple 257 | # by seeing if the column Replicate_ID exists. 258 | if 'Replicate_ID' in df_in.columns.values: 259 | print('Processing migration calculations of pooled data') 260 | 261 | else: 262 | 263 | # Add Replicate_ID with arbitrary values to the dataframe 264 | print('Processing single experiment, adding arbitrary Replicate_ID = -1') 265 | df_out['Replicate_ID'] = -1 266 | df_out['Condition'] = 'unknown' 267 | 268 | 269 | calcs_list = [] # Initialize for the whole dataframe 270 | 271 | conditions = df_in['Condition'].unique() 272 | 273 | 274 | for cond in conditions: 275 | 276 | # cond_df = df_in[df_in['Condition'] == cond] 277 | cond_df = df_out[df_out['Condition'] == cond] 278 | 279 | # If a combined dataframe is provided, it will have duplicate particle(cell) 280 | # numbers, therefore we must treat them separately 281 | 282 | exp_reps = cond_df['Replicate_ID'].unique() 283 | 284 | ''' 285 | NOTE: 286 | Important to use Replicate_ID (string of experiment name) instead of 287 | Rep_label (integer), as Rep_label is only unique per condition. 
288 | ''' 289 | print('Processing migration_calcs() for condition: ',cond) 290 | 291 | for exp_rep in exp_reps: 292 | 293 | print('Processing migration_calcs() for experiment: ',exp_rep) 294 | 295 | # Get subset of dataframe corresponding to this replicate 296 | exp_subdf = cond_df[cond_df['Replicate_ID'] == exp_rep] 297 | assert len(exp_subdf.index)==len(exp_subdf.index.unique()), 'exp_subdf indexes not unique' 298 | 299 | # Get the number of frames and cells in this selection 300 | n_frames = int(np.max(exp_subdf['frame'])) 301 | # n_cells = int(np.max(exp_subdf['particle'])) # This 302 | n_cells = len(exp_subdf['particle'].unique()) 303 | # for n in tqdm(range(n_cells)): 304 | 305 | # print('n_frames: ',n_frames ) 306 | # print('n_cells: ',n_cells ) 307 | if INPUT_FMT != 'trackmate': 308 | thing_to_iterate = 'particle' 309 | elif INPUT_FMT == 'trackmate': 310 | thing_to_iterate = 'uniq_id' 311 | 312 | for n in tqdm(exp_subdf[thing_to_iterate].unique()): #put in thing_to_iterate 313 | # For each cell, get another subset of the dataframe 314 | cell_subdf = exp_subdf[exp_subdf[thing_to_iterate] == n] # was 'particle', now thing_to_iterate 315 | assert len(cell_subdf.index)==len(cell_subdf.index.unique()), 'exp_subdf indexes not unique' 316 | 317 | tarray = cell_subdf[['frame', 'x_um', 'y_um']].to_numpy()#cell_subdf['frame', 'x', 'y'] 318 | inds = cell_subdf.index.values 319 | 320 | assert tarray.shape[0] == len(inds), 'indexes doesnt match tarray shape' 321 | assert len(inds) == len(np.unique(inds)), 'indexes not unique' 322 | 323 | tarray = np.c_[tarray,inds] # Append index as 4th column to the array 324 | assert tarray.shape[1] == 4, '' 325 | mig_calcs = cell_calcs(tarray)#, calibrate) 326 | 327 | if len(mig_calcs) > 0: 328 | calcs_list.append(mig_calcs) 329 | 330 | calcs_array = np.vstack(calcs_list) # Arrat from the list 331 | 332 | # Insert back into dataframe 333 | mig_calcs_df = pd.DataFrame(data=calcs_array[:,1:], # values 334 | index=calcs_array[:,0], # 1st column as index 335 | columns=['euclidean_dist', 336 | 'segment_length', 337 | 'cumulative_length', 338 | 'speed', 339 | 'orientedness', 340 | 'directedness', 341 | 'turn_angle', 342 | 'endpoint_dir_ratio', 343 | 'dir_autocorr', 344 | 'outreach_ratio', 345 | 'MSD', 346 | 'max_dist', 347 | 'glob_turn_deg', 348 | 'arrest_coefficient']) 349 | 350 | # The old ones from the previous version of cell_calcs, kept here just in case. 
351 | # columns=['euclidean_dist','segment_length','cumulative_length','speed', 352 | # 'orientedness', 'directedness', 'turn_angle', 'endpoint_dir_ratio'])#, 'dir_autocorr']) 353 | 354 | assert len(mig_calcs_df.index.unique()) == len(np.unique(calcs_array[:,0])), 'Created dataframe indexes don match values from calcs_array' 355 | 356 | df_out = df_out.join(mig_calcs_df) # Add migration calcs to dataframr 357 | 358 | return df_out 359 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/statistics.py: -------------------------------------------------------------------------------- 1 | #statistics.py 2 | 3 | from initialization.initialization import * 4 | from initialization.config import * 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import scipy.stats as st 11 | import scipy.stats as stats 12 | 13 | 14 | def average_per_condition(df, avg_per_rep=False): 15 | 16 | ''' 17 | 18 | Function to calculate average value for each metric in a dataframe, taking a time-averagede dataframe as input 19 | 20 | Input: 21 | df: time-averaged DataFrame [N * X] 22 | 23 | Returns: 24 | avg_df: DataFrame [N] 25 | ''' 26 | 27 | 28 | assert df['frame'].unique()[0] == 'timeaverage', 'This function is intended for a time-averaged dataset.' 29 | 30 | avg_df = pd.DataFrame() 31 | std_df = pd.DataFrame() 32 | n_df = pd.DataFrame() 33 | cond_list = df['Condition'].unique() 34 | 35 | # Find the average value for each of the numerical columns 36 | 37 | for cond in cond_list: 38 | 39 | this_cond_df = df[df['Condition'] == cond] 40 | cond_avg_df = this_cond_df.mean()#skipna=True) 41 | cond_std_df = this_cond_df.std()#skipna=True) 42 | cond_n_df = this_cond_df.count()#skipna=True) 43 | 44 | # Additional nested level of processing if we want to calculate the average per replicate. 45 | if(avg_per_rep): 46 | 47 | rep_list = this_cond_df['Replicate_ID'].unique() 48 | 49 | for this_rep in rep_list: 50 | 51 | 52 | this_rep_df = this_cond_df[this_cond_df['Replicate_ID'] == this_rep] 53 | rep_avg_df = this_rep_df.mean()#skipna=True) 54 | rep_std_df = this_rep_df.std() 55 | rep_n_df = this_rep_df.count() 56 | 57 | # Add back non-numeric data 58 | dropped_cols = list(set(this_rep_df.columns) - set(rep_avg_df.index)) 59 | 60 | for col in dropped_cols: 61 | 62 | assert len(this_rep_df[col].unique()) == 1, 'Invalid assumption: uniqueness of non-numerical column values' 63 | rep_avg_df.loc[col] = this_rep_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 64 | rep_std_df.loc[col] = this_rep_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 65 | rep_n_df.loc[col] = this_rep_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 66 | 67 | 68 | avg_df = avg_df.append(rep_avg_df,ignore_index=True) 69 | std_df = std_df.append(rep_std_df,ignore_index=True) 70 | n_df = n_df.append(rep_n_df,ignore_index=True) 71 | 72 | 73 | else: 74 | 75 | 76 | # Add back non-numeric data 77 | dropped_cols = list(set(this_cond_df.columns) - set(cond_avg_df.index)) 78 | 79 | for col in dropped_cols: 80 | 81 | # Since we are averaging without considering replicates, we expect the list of Replicates_IDs to not be unique. 
82 | if col != 'Replicate_ID' and col != 'Replicate_shortlabel': 83 | assert len(this_cond_df[col].unique()) == 1, 'Invalid assumption: uniqueness of non-numerical column values' 84 | cond_avg_df.loc[col] = this_cond_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 85 | cond_std_df.loc[col] = this_cond_df[col].values[0] 86 | cond_n_df.loc[col] = this_cond_df[col].values[0] 87 | else: 88 | cond_avg_df.loc[col] = 'NA' # Get the non-numerical value from dataframe (assuming all equivalent) 89 | cond_std_df.loc[col] = 'NA' 90 | cond_n_df.loc[col] = 'NA' 91 | 92 | avg_df = avg_df.append(cond_avg_df,ignore_index=True) 93 | std_df = std_df.append(cond_std_df,ignore_index=True) 94 | n_df = n_df.append(cond_n_df,ignore_index=True) 95 | 96 | 97 | avg_std_n = (avg_df, std_df, n_df) 98 | 99 | return avg_std_n 100 | 101 | 102 | def generalized_stats(set1, set2, test=STAT_TEST): 103 | 104 | ''' 105 | Function should work for any test between two datasets, so long as it returns two arguments 106 | the second of which is the P value. 107 | 108 | ''' 109 | 110 | t, P = eval(test+'(set1, set2)') 111 | # print(t,P) 112 | 113 | return P 114 | 115 | def stats_table(df, factor, grouping='Condition', test=STAT_TEST): 116 | 117 | ''' 118 | Create a matrix of P-values for an exhaustive comparison of groupings. 119 | 120 | Inputs: 121 | df: pd.DataFrame 122 | factor: string, column in df. 123 | grouping: default: Condition, alternatively used with label 124 | test: Statistical test to use. Defaut STAT_TEST 125 | 126 | Returns: 127 | stat_table: pd.DataFrame 128 | ''' 129 | 130 | print('Returning stats_table using test: ', test, ' for factor: ', factor) 131 | print('Note: for exploratory purposes only, no multiple comparison correction is being applied.') 132 | 133 | # Create a numpy array to hold the values, fill with NaNs 134 | n_cond = len((df['Condition'].unique())) 135 | stat_mat = np.empty([n_cond, n_cond]) 136 | stat_mat[:] = np.NaN 137 | 138 | # Fill the table with the statistic of choice. 
139 | for i, cond_i in enumerate(df['Condition'].unique()): 140 | for j, cond_j in enumerate(df['Condition'].unique()): 141 | 142 | if cond_i == cond_j: 143 | stat_mat[i,j] = np.NaN 144 | else: 145 | 146 | set1 = df[factor][df[grouping] == cond_i] 147 | set2 = df[factor][df[grouping] == cond_j] 148 | 149 | P = generalized_stats(set1, set2, test) 150 | 151 | stat_mat[i,j] = P 152 | 153 | # Turn the filled numpy array into a dataframe 154 | stat_table = pd.DataFrame(data=stat_mat, 155 | index=df['Condition'].unique(), 156 | columns=df['Condition'].unique()) 157 | 158 | stat_table.to_csv(DATA_OUTPUT+factor+'_P_table.csv') 159 | 160 | return stat_table 161 | 162 | 163 | # Bootstrapping function 164 | def bootstrap_sample(df, n_samples=1000): 165 | 166 | measurements = df.values 167 | medians = [] 168 | 169 | for i in range(n_samples): 170 | 171 | samples = np.random.choice(measurements, size = len(measurements)) 172 | medians.append(np.median(samples)) 173 | 174 | medians = np.asarray(medians) 175 | 176 | return medians 177 | 178 | 179 | 180 | 181 | def bootstrap_sample_df(df,factor,ctl_label): 182 | 183 | ''' 184 | Generate bootstrapped sample and return as dataframe, to be plotted with seaborn 185 | ''' 186 | 187 | # Calculate the differences for each category and save them into dataframes for visualizing in Seaborn or Matplotlib 188 | bootstrap_diff_df = pd.DataFrame() 189 | 190 | # Get the control bootstrap 191 | ctl_bootstrap = bootstrap_sample(df[factor][df['Condition'] == ctl_label]) 192 | 193 | for i in range(0,len(pd.unique(df['Condition']))): 194 | 195 | # Use the ctl_bootstrap if we're now on that condition, otherwise will create a new bootstrap sample that won't be the same. 196 | if(pd.unique(df['Condition'])[i] == ctl_label): 197 | bootstrap = ctl_bootstrap 198 | else: 199 | bootstrap = bootstrap_sample(df[factor][df['Condition'] == pd.unique(df['Condition'])[i]]) 200 | 201 | difference = bootstrap - ctl_bootstrap 202 | this_cond = pd.unique(df['Condition'])[i] 203 | this_diff_df = pd.DataFrame(data={'Difference':difference, 'Condition':this_cond}) 204 | bootstrap_diff_df = bootstrap_diff_df.append(this_diff_df) 205 | 206 | 207 | # Calculate and print mean effect size for each condition 208 | mean_effect_sizes = bootstrap_diff_df.groupby('Condition')['Difference'].mean() 209 | print("Mean Effect Size for Each Condition Compared to Control:") 210 | for condition, mean_effect_size in mean_effect_sizes.items(): 211 | print(f"The effect size for {condition} compared with control is: {mean_effect_size}") 212 | 213 | return bootstrap_diff_df 214 | 215 | # Function to calculate median and mean for each condition per factor and save results to CSV 216 | def calculate_median_mean_and_save(df, factors): 217 | for factor_name in factors: 218 | result_df = df.groupby('Condition_shortlabel')[factor_name].agg(['median', 'mean']).reset_index() 219 | output_file = f'{DATA_OUTPUT}/{factor_name}_median_mean_results.csv' 220 | result_df.to_csv(output_file, index=False) 221 | 222 | # Function to perform statistical testing between two conditions for each factor and save results to CSV 223 | def perform_statistical_testing_and_save(df, factors): # , output_folder, 224 | for factor_name in factors: 225 | conditions = df['Condition_shortlabel'].unique() 226 | condition1, condition2 = conditions[:2] # Assuming only two conditions for simplicity 227 | 228 | data1 = df[df['Condition_shortlabel'] == condition1][factor_name] 229 | data2 = df[df['Condition_shortlabel'] == condition2][factor_name] 230 | 231 | 
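# Note (illustrative): stats.mannwhitneyu and stats.ttest_ind below each return a
# (statistic, p-value) pair. If normality should be checked rather than assumed, a quick
# pre-test could be added here, for example
#     normal = stats.shapiro(data1)[1] > 0.05 and stats.shapiro(data2)[1] > 0.05
# (a sketch only; the original code simply reports both tests).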
# Perform Mann-Whitney U test for non-normal data 232 | stat_mw, p_value_mw = stats.mannwhitneyu(data1, data2) 233 | 234 | # Perform t-test for normal data (assuming normality for simplicity) 235 | stat_t, p_value_t = stats.ttest_ind(data1, data2) 236 | 237 | result_df = pd.DataFrame({ 238 | 'Factor': [factor_name], 239 | 'Condition1': [condition1], 240 | 'Condition2': [condition2], 241 | 'Mann-Whitney U Statistic': [stat_mw], 242 | 'Mann-Whitney U P-Value': [p_value_mw], 243 | 't-test Statistic': [stat_t], 244 | 't-test P-Value': [p_value_t] 245 | }) 246 | 247 | output_file_mw = f'{DATA_OUTPUT}/{factor_name}_mannwhitneyu_results.csv' 248 | output_file_t = f'{DATA_OUTPUT}/{factor_name}_ttest_results.csv' 249 | # DATA_OUTPUT 250 | result_df.to_csv(output_file_mw, index=False) 251 | result_df.to_csv(output_file_t, index=False) 252 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/data_processing/time_calculations.py: -------------------------------------------------------------------------------- 1 | # time_calculations.py 2 | 3 | from initialization.initialization import * 4 | from initialization.config import * 5 | 6 | from data_processing.clustering import cluster_purity 7 | 8 | import os 9 | import numpy as np 10 | import pandas as pd 11 | 12 | def cluster_composition_timecourse(df): 13 | 14 | df_list = [] 15 | 16 | for frame in df['frame'].unique(): 17 | 18 | # Get dataframe at this timepoint 19 | tpt_sub_df = df[df['frame'] == frame] 20 | 21 | clust_sum_df = cluster_purity(tpt_sub_df) 22 | clust_sum_df['frame'] = frame 23 | 24 | df_list.append(clust_sum_df) 25 | 26 | df_out = pd.concat(df_list) 27 | df_out['Time (min)'] = df_out['frame'] * SAMPLING_INTERVAL 28 | df_out.reset_index(inplace=True) 29 | 30 | return df_out 31 | 32 | # def time_average(df): 33 | 34 | # ''' 35 | # Needs a more descriptive name? 36 | # average_across_time()? 37 | 38 | # Function to generate a time-averaged dataframe, 39 | # where the average value for each factor across all timepoints 40 | # is calculated for each cell. 41 | 42 | # Input: 43 | # df: DataFrame [N * T * X] 44 | 45 | 46 | # Returns: 47 | # avg_df: DataFrame [N * X] 48 | # ''' 49 | 50 | # time_avg_df = pd.DataFrame() 51 | # unique_id = 0 # Create a unique cell id 52 | # rep_list = df['Replicate_ID'].unique() 53 | 54 | 55 | # for this_rep in rep_list: 56 | 57 | # rep_df = df[df['Replicate_ID']==this_rep] 58 | # print(f'Replicate: {this_rep}') 59 | # cell_ids = rep_df['particle'].unique() # Particle ids only unique for replicate, not between. 60 | # print(f'cell_ids: {cell_ids} ') 61 | 62 | # # For each cell, calculate the average value and add to new DataFrame 63 | # for cid in cell_ids: 64 | 65 | # cell_df = rep_df[rep_df['particle'] == cid] 66 | 67 | # # A test to ensure there is only one replicate label included. 68 | # assert len(cell_df['Rep_label'].unique()) == 1, 'check reps' 69 | 70 | # avg_df = cell_df.mean() # Returns a series that is the mean value for each numerical column. Non-numerical columns are dropped. 
71 | 72 | # # Add back non-numeric data 73 | # dropped_cols = list(set(cell_df.columns) - set(avg_df.index)) 74 | 75 | # for col in dropped_cols: 76 | 77 | # assert len(cell_df[col].unique()) == 1, 'Invalid assumption: uniqueness of non-numerical column values' 78 | # avg_df.loc[col] = cell_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 79 | 80 | # avg_df.loc['unique_id'] = unique_id # Add Unique cell ID for the analysis 81 | # time_avg_df = time_avg_df.append(avg_df,ignore_index=True) 82 | # unique_id += 1 83 | 84 | # time_avg_df['frame'] = 'timeaverage' # Replace the meaningless average frame values with a string desciption 85 | 86 | # return time_avg_df 87 | 88 | 89 | def time_average(df): 90 | """ 91 | Function to generate a time-averaged dataframe, 92 | where the average value for each factor across all timepoints 93 | is calculated for each unique `uniq_id`. 94 | 95 | Input: 96 | df: DataFrame with a `uniq_id` column 97 | 98 | Returns: 99 | time_avg_df: DataFrame with averaged values for each `uniq_id` 100 | """ 101 | 102 | time_avg_df = pd.DataFrame() 103 | unique_ids = df['uniq_id'].unique() 104 | 105 | for uid in unique_ids: 106 | cell_df = df[df['uniq_id'] == uid] 107 | 108 | # Calculate the mean value for each numerical column 109 | avg_df = cell_df.mean() # Returns a series 110 | 111 | # Add back non-numeric data (assuming they are consistent across the unique_id) 112 | non_numeric_cols = list(set(cell_df.columns) - set(avg_df.index)) 113 | for col in non_numeric_cols: 114 | # Check if the column is indeed non-numeric 115 | if cell_df[col].dtype == 'object' or cell_df[col].dtype == 'category': 116 | # Make sure there's only one unique value for this column in the filtered dataframe 117 | assert len(cell_df[col].unique()) == 1, f"Non-unique values found in column {col} for uniq_id {uid}" 118 | avg_df.loc[col] = cell_df[col].values[0] 119 | 120 | avg_df.loc['uniq_id'] = uid # Add the unique_id back to the dataframe 121 | time_avg_df = time_avg_df.append(avg_df, ignore_index=True) 122 | 123 | time_avg_df['frame'] = 'timeaverage' # Replace the meaningless average frame values with a string description 124 | 125 | return time_avg_df 126 | 127 | 128 | def time_average_trackmate(df): 129 | 130 | ''' 131 | Needs a more descriptive name? 132 | average_across_time()? 133 | 134 | Function to generate a time-averaged dataframe, 135 | where the average value for each factor across all timepoints 136 | is calculated for each cell. 137 | 138 | Input: 139 | df: DataFrame [N * T * X] 140 | 141 | 142 | Returns: 143 | avg_df: DataFrame [N * X] 144 | ''' 145 | 146 | time_avg_df = pd.DataFrame() 147 | unique_id = 0 # Create a unique cell id 148 | 149 | 150 | cell_ids = df['uniq_id'].unique() # Just use unique ids 151 | 152 | # For each cell, calculate the average value and add to new DataFrame 153 | for cid in cell_ids: 154 | 155 | cell_df = df[df['uniq_id'] == cid] 156 | 157 | # A test to ensure there is only one replicate label included. 158 | assert len(cell_df['Rep_label'].unique()) == 1, 'check reps' 159 | 160 | avg_df = cell_df.mean() # Returns a series that is the mean value for each numerical column. Non-numerical columns are dropped. 
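# Compatibility note (a sketch, assuming a recent pandas may be in use): DataFrame.append(),
# used below to accumulate time_avg_df, was removed in pandas 2.0. An equivalent pattern is to
# collect the per-cell Series in a list and concatenate once at the end, e.g.
#     rows.append(avg_df.to_frame().T)                    # inside the loop (hypothetical 'rows' list)
#     time_avg_df = pd.concat(rows, ignore_index=True)    # after the loop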
161 | 162 | # Add back non-numeric data 163 | dropped_cols = list(set(cell_df.columns) - set(avg_df.index)) 164 | 165 | for col in dropped_cols: 166 | 167 | # assert len(cell_df[col].unique()) == 1, 'Invalid assumption: uniqueness of non-numerical column values' 168 | # print the number of columns with the same column name in cell_df 169 | print(f'The column named {col} has this number of occurrences in cell_df: {len(cell_df[col].unique())}') 170 | 171 | avg_df.loc[col] = cell_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 172 | 173 | avg_df.loc['unique_id'] = unique_id # Add Unique cell ID for the analysis 174 | time_avg_df = time_avg_df.append(avg_df,ignore_index=True) 175 | unique_id += 1 176 | 177 | time_avg_df['frame'] = 'timeaverage' # Replace the meaningless average frame values with a string desciption 178 | 179 | return time_avg_df 180 | 181 | 182 | 183 | def average_per_timepoint(df, t_window=None): 184 | 185 | ''' 186 | For each timepoint, calculate the average across cells 187 | 188 | Note: this works for single timepoints or time windows, but 189 | doing these calculations at the level of the dataframe 190 | wont easily permit stdev and sem calculations 191 | 192 | Input: 193 | df: DataFrame [N * T * X] 194 | #poolreps: Boolean, default=False 195 | 196 | Returns: 197 | tpt_avg_df: DataFrame [T * X] 198 | 199 | ''' 200 | 201 | tptavg_df = pd.DataFrame() 202 | 203 | frame_list = df['frame'].unique() 204 | cond_list = df['Condition'].unique() 205 | rep_list = df['Replicate_ID'].unique() 206 | 207 | ''' 208 | Do we instead want to use FRAME_END? 209 | More user-controlled vs data-driven: 210 | frame_list = range(FRAME_END) 211 | ''' 212 | 213 | for frame in frame_list: 214 | 215 | if t_window is not None: 216 | # get a subset of the dataframe across the range of frames 217 | frame_df = df[(df['frame']>=frame - t_window/2) & 218 | (df['frame'] MIN_CELLS_PER_TPT): 234 | 235 | avg_df = rep_df.mean() # Returns a series that is the mean value for each numerical column. Non-numerical columns are dropped. 236 | 237 | # Add back non-numeric data 238 | dropped_cols = list(set(frame_df.columns) - set(avg_df.index)) 239 | 240 | for col in dropped_cols: 241 | 242 | # Validate assumption that sub_df has only one rep/condition, then use this value in new frame 243 | assert len(rep_df[col].unique()) == 1, 'Invalid assumption: uniqueness of non-numerical column values' 244 | avg_df.loc[col] = rep_df[col].values[0] # Get the non-numerical value from dataframe (assuming all equivalent) 245 | 246 | if t_window is None: # assertion only works when no window is used. 247 | assert avg_df.loc['frame'] == frame, 'Frame mismatch' 248 | 249 | tptavg_df = tptavg_df.append(avg_df,ignore_index=True) 250 | else: 251 | if(DEBUG): 252 | print('Skipping: ',rep, ' N = ', len(rep_df)) 253 | 254 | return tptavg_df 255 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/initialization/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import initialization 2 | # from . 
import config 3 | print('Finished running cellPLATO initialization and loaded config.') 4 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/initialization/btrack_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "TrackerConfig": 3 | { 4 | "MotionModel": 5 | { 6 | "name": "cell_motion", 7 | "dt": 1.0, 8 | "measurements": 3, 9 | "states": 6, 10 | "accuracy": 7.5, 11 | "prob_not_assign": 0.001, 12 | "max_lost": 5, 13 | "A": { 14 | "matrix": [1,0,0,1,0,0, 15 | 0,1,0,0,1,0, 16 | 0,0,1,0,0,1, 17 | 0,0,0,1,0,0, 18 | 0,0,0,0,1,0, 19 | 0,0,0,0,0,1] 20 | }, 21 | "H": { 22 | "matrix": [1,0,0,0,0,0, 23 | 0,1,0,0,0,0, 24 | 0,0,1,0,0,0] 25 | }, 26 | "P": { 27 | "sigma": 150.0, 28 | "matrix": [0.1,0,0,0,0,0, 29 | 0,0.1,0,0,0,0, 30 | 0,0,0.1,0,0,0, 31 | 0,0,0,1,0,0, 32 | 0,0,0,0,1,0, 33 | 0,0,0,0,0,1] 34 | }, 35 | "G": { 36 | "sigma": 15.0, 37 | "matrix": [0.5,0.5,0.5,1,1,1] 38 | 39 | }, 40 | "R": { 41 | "sigma": 5.0, 42 | "matrix": [1,0,0, 43 | 0,1,0, 44 | 0,0,1] 45 | } 46 | }, 47 | "ObjectModel": 48 | {}, 49 | "HypothesisModel": 50 | { 51 | "name": "cell_hypothesis", 52 | "hypotheses": ["P_FP", "P_init", "P_term", "P_link", "P_branch", "P_dead"], 53 | "lambda_time": 5.0, 54 | "lambda_dist": 3.0, 55 | "lambda_link": 10.0, 56 | "lambda_branch": 50.0, 57 | "eta": 1e-10, 58 | "theta_dist": 50.0, 59 | "theta_time": 5.0, 60 | "dist_thresh": 75, 61 | "time_thresh": 2, 62 | "apop_thresh": 5, 63 | "segmentation_miss_rate": 0.1, 64 | "apoptosis_rate": 0.001, 65 | "relax": true 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/initialization/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Configuration file. 
3 | Fill out this file then run the jupyter notebook to analyze your data 4 | 5 | ''' 6 | 7 | ''' 8 | Experiment-specific constants to be filled by user 9 | ''' 10 | 11 | DATA_PATH = 'D:/PATH/' # Input the path to the folder containing the data 12 | OUTPUT_PATH = 'D:/PATH_OUTPUT/' # Input the path to the folder where the output will be saved 13 | CTL_LABEL = 'CONTROL_CONDITION' # Input the name of the control condition here 14 | 15 | # Input here the folder names of the conditions you want to include in the analysis 16 | # Note: the order of the conditions here will be the order of the conditions in the plots 17 | 18 | CONDITIONS_TO_INCLUDE = ['CONTROL_CONDITION', 19 | 'CONDITION_2', 20 | 'CONDITION_3', 21 | 'CONDITION_4'] 22 | 23 | CONDITION_SHORTLABELS = ['Ctrl','One','Two','Three',] # Short labels for the conditions, for plotting purposes 24 | DATASET_SHORTNAME = 'EXAMPLE_DATASET_NAME' # give the data a nickname 25 | 26 | INPUT_FMT = 'btrack' # 'usiigaci'#btrack 27 | TRACK_FILENAME = '.h5' 28 | 29 | MICRONS_PER_PIXEL = 0.537 30 | # MICRONS_PER_PIXEL_LIST = [0.537,0.537,0.537, 0.537,] # For mixed spatial scaling 31 | # MICRONS_PER_PIXEL = MICRONS_PER_PIXEL_LIST[0] 32 | 33 | SAMPLING_INTERVAL = 40/60 # time between frames in minutes 34 | # SAMPLING_INTERVAL_LIST= [40/60,40/60,40/60,40/60,] # For mixed temporal scaling 35 | # SAMPLING_INTERVAL = SAMPLING_INTERVAL_LIST[0] 36 | 37 | IMAGE_HEIGHT = 1024 # pixels 38 | IMAGE_WIDTH = 1024 # pixels 39 | Z_SCALE = 1.00 40 | 41 | MigrationTimeWindow_minutes = 5 # Here, set the length of the time window in minutes 42 | MIG_T_WIND = round(MigrationTimeWindow_minutes / SAMPLING_INTERVAL) 43 | T_WINDOW_MULTIPLIER = 1.0 # 6.0 # For plasticity plots, to potentially increase the time window size for those calculations 44 | 45 | CLUSTER_CMAP = 'tab20' # Define colormap used for clustering plots 46 | CONDITION_CMAP = 'Dark2' #'Define colormap used for condition maps. Dark2 is good for 7 conditions, tab20 > 20 conditions. 47 | # Note: use paired for groups of 2 48 | 49 | ARREST_THRESHOLD = 3 * SAMPLING_INTERVAL # Here, user can define threshold in MICRONS PER MINUTE, because we multiply by the sampling interval to convert it to microns per frame. 50 | RIP_R = 140 # Radius to search when calculating Ripleys L in pixels. 1.5 * the size of a cell = 12+6=18 51 | 52 | DATA_FILTERS = { 53 | "area": (50, 10000), # Debris removal 54 | "ntpts": (8,1800) # Remove cells that are tracked for less than 8 frames 55 | 56 | } 57 | 58 | # Booleans to draw or not specific plots. 
59 | DRAW_SUPERPLOTS = True 60 | DRAW_DIFFPLOTS = True 61 | DRAW_MARGSCAT = True 62 | DRAW_TIMEPLOTS = True 63 | DRAW_BARPLOTS = True 64 | DRAW_SUPERPLOTS_grays = True 65 | DRAW_SNS_BARPLOTS = True 66 | 67 | 68 | ''' 69 | Measurements to make 70 | ''' 71 | 72 | # Cell migration factors calculated in migration_calcs() 73 | 74 | MIG_FACTORS = ['euclidean_dist', 75 | 'cumulative_length', 76 | 'speed', 77 | 'orientedness', 78 | 'directedness', 79 | 'turn_angle', 80 | 'endpoint_dir_ratio', 81 | 'dir_autocorr', 82 | 'outreach_ratio', 83 | 'MSD', 84 | 'max_dist', 85 | 'glob_turn_deg', 86 | 'arrest_coefficient'] 87 | 88 | # Region property factors to be extracted from the cell contours 89 | # This list must match with props from regionprops 90 | 91 | REGIONPROPS_LIST = ['area', 92 | 'bbox_area', 93 | 'eccentricity', 94 | 'equivalent_diameter', 95 | 'extent', 96 | 'filled_area', 97 | 'major_axis_length', 98 | 'minor_axis_length', 99 | 'orientation', 100 | 'perimeter', 101 | 'solidity'] 102 | 103 | SHAPE_FACTORS = ['area', 104 | 'bbox_area', 105 | 'eccentricity', 106 | 'equivalent_diameter', 107 | 'extent', 108 | 'filled_area', 109 | 'major_axis_length', 110 | 'minor_axis_length', 111 | 'orientation', 112 | 'perimeter', 113 | 'solidity'] 114 | 115 | ADDITIONAL_FACTORS = ['aspect', 'rip_p', 'rip_K', 'rip_L'] 116 | 117 | DR_FACTORS = REGIONPROPS_LIST + MIG_FACTORS + ADDITIONAL_FACTORS 118 | ALL_FACTORS = REGIONPROPS_LIST + MIG_FACTORS + ADDITIONAL_FACTORS 119 | 120 | 121 | NUM_FACTORS = DR_FACTORS + ['tSNE1', 'tSNE2', 'PC1', 'PC2'] 122 | 123 | ''' 124 | Advanced parameters (can stay default) 125 | ''' 126 | 127 | MIXED_SCALING = False # Not used yet, for futureproofing 128 | SELF_STANDARDIZE = False #STANDARDIZES ACROSS factors within a cell df. 129 | AVERAGE_TIME_WINDOWS = False #This does two things. 1) provides a time window averaged value for every metric (_tmean). 130 | # 2) gives also a ratio of the time window averaged value to the first timepoint in the time window (_tmean_ratio). 131 | 132 | CALIBRATED_POS = False # Does the data need to be calibrated? 133 | OVERWRITE = True # Overwrite the pre-processed data. 
134 | USE_INPUT_REGIONPROPS = True 135 | CALCULATE_REGIONPROPS = False 136 | USE_SHORTLABELS = True 137 | PERFORM_RIPLEYS = True 138 | ARCHIVE_CONFIG = True 139 | 140 | ''' 141 | Everything below does not need to be changed by the user 142 | ''' 143 | 144 | N_COMPONENTS = 3 #this is for UMAP 145 | UMAPS = ['UMAP1','UMAP2','UMAP3'] 146 | FRAME_START = 0 # Start frame for analysis (deprecated) 147 | FRAME_END = 180 # End frame for analysis (deprecated) 148 | MIN_CELLS_PER_TPT = 1 # used in: average_per_timepoint() 149 | 150 | CLUSTER_BY = 'umap' # temp 151 | PALETTE = 'colorblind' 152 | PX_COLORS = 'px.colors.qualitative.Safe' # Choose between discrete colors from https://plotly.com/python/discrete-color/ 153 | 154 | STATIC_PLOTS = True 155 | PLOTS_IN_BROWSER = False 156 | 157 | ANIMATE_TRAJECTORIES = True 158 | DEBUG = False 159 | 160 | # Booleans for Analysis components: 161 | '''(Only run pipelines if true)''' 162 | DIMENSION_REDUCTION = True 163 | PARAM_SWEEP = True 164 | CLUSTERING = True 165 | 166 | CLUSTER_TSNE = True 167 | CLUSTER_PCA = True 168 | CLUSTER_XY = True 169 | 170 | ############################################### 171 | # tSNE/UMAP parameters and embedding: 172 | ############################################### 173 | 174 | SCALING_METHOD = 'choice' # minmax powertransformer log2minmax choice 175 | TSNE_PERP = 185#230 # Perplexity 176 | TSNE_R_S = 11 # Random seed 177 | USE_SAVED_EMBEDDING = False#True 178 | EMBEDDING_FILENAME = 'saved_embedding.npy' 179 | TRAINX_FILENAME = 'saved_x_train.npy' 180 | UMAP_NN = 10 # Nearest-neighbors 181 | UMAP_MIN_DIST = 0.2 #0.5 182 | MIN_SAMPLES = 10 # DBScan 183 | EPS = 0.06 # DBScan 184 | 185 | ############################################### 186 | # Factor wrangling - no need to change these 187 | ############################################### 188 | 189 | # Factors to display on the animated plots 190 | MIG_DISPLAY_FACTORS=['speed', 'euclidean_dist', 'arrest_coefficient', 'turn_angle','directedness', 'dir_autocorr','orientedness'] 191 | SHAPE_DISPLAY_FACTORS = ['area','aspect','orientation'] 192 | 193 | # Factor to standardize to themselves over time (to look at self-relative instead of absolute values.) 194 | FACTORS_TO_STANDARDIZE = ['area', 195 | 'bbox_area', 196 | 'equivalent_diameter', 197 | 'filled_area', 198 | 'major_axis_length', 199 | 'minor_axis_length', 200 | 'perimeter'] 201 | 202 | FACTORS_TO_CONVERT = ['area', 'bbox_area', 'equivalent_diameter', 'extent', 'filled_area', 203 | 'major_axis_length', 'minor_axis_length', 'perimeter'] 204 | 205 | ############################################### 206 | # Plotting parameters 207 | ############################################### 208 | 209 | AXES_LIMITS = '2-sigma' #'min-max' #'2-sigma' # Currently only implemented in marginal_xy contour plots. 
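# Note on the statistical test set just below: STAT_TEST is a string naming a scipy.stats function
# ('st' is the scipy.stats alias imported in the visualization modules), so 'st.ttest_ind' refers to
# scipy.stats.ttest_ind. Other scipy.stats tests with the same call signature could plausibly be
# substituted, but only 'st.ttest_ind' is used in this template.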
210 | STAT_TEST = 'st.ttest_ind' 211 | # Plot display Parameters 212 | PLOT_TEXT_SIZE = 30 213 | DIFF_PLOT_TYPE = 'violin' # 'swarm', 'violin', 'box' 214 | 215 | # Pre-defined pairs of factors for generating comparison plots 216 | FACTOR_PAIRS = [['tSNE1', 'tSNE2'], 217 | ['area', 'speed'], 218 | ['directedness', 'speed'], 219 | ['orientedness', 'speed'], 220 | ['endpoint_dir_ratio', 'speed'], 221 | ['orientation', 'speed'], 222 | ['turn_angle', 'speed'], # These are identical 223 | ['major_axis_length', 'speed'], 224 | ['major_axis_length', 'minor_axis_length'], 225 | ['euclidean_dist','cumulative_length'], 226 | ['euclidean_dist','speed'], 227 | ['PC1', 'PC2']] 228 | 229 | # No need to change these # 230 | 231 | DIS_REGIONPROPS_LIST = ['area', 232 | # 'bbox_area', 233 | 'eccentricity', 234 | 'equivalent_diameter', 235 | # 'extent', 236 | # 'filled_area', 237 | 'major_axis_length', 238 | 'minor_axis_length', 239 | 'orientation', 240 | 'perimeter', 241 | 'solidity'] 242 | DIS_MIG_FACTORS = ['euclidean_dist', # Valid? 243 | 'cumulative_length', # Valid? 244 | 'speed', 245 | # 'orientedness', # name changed from orientation 246 | # 'directedness', 247 | # 'turn_angle', 248 | 'endpoint_dir_ratio', 249 | 'dir_autocorr', 250 | 'outreach_ratio', 251 | 'MSD', # Valid? 252 | # 'max_dist', # Valid? 253 | 'glob_turn_deg', 254 | 'arrest_coefficient'] 255 | 256 | DIS_ADDITIONAL_FACTORS = ['aspect', 'rip_L'] 257 | 258 | T_WIND_DR_FACTORS = ['MSD', 259 | 260 | # 'MSD_ratio', 261 | # 'MSD_tmean', 262 | 'area', 263 | # 'area_ratio', # Doesn't work in DR if using self-standardized because min (0) becomes inf. 264 | 'area_tmean', 265 | 'arrest_coefficient', 266 | # 'arrest_coefficient_ratio', 267 | 'arrest_coefficient_tmean', 268 | 'aspect', 269 | 'aspect_ratio', 270 | 'aspect_tmean', 271 | 'bbox_area', 272 | # 'bbox_area_ratio', 273 | 'bbox_area_tmean', 274 | 'cumulative_length', 275 | # 'cumulative_length_ratio', 276 | # 'cumulative_length_tmean', 277 | 'dir_autocorr', 278 | 'dir_autocorr_ratio', 279 | 'dir_autocorr_tmean', 280 | 'directedness', 281 | 'directedness_ratio', 282 | 'directedness_tmean', 283 | 'eccentricity', 284 | 'eccentricity_ratio', 285 | 'eccentricity_tmean', 286 | 'endpoint_dir_ratio', 287 | 'endpoint_dir_ratio_ratio', 288 | 'endpoint_dir_ratio_tmean', 289 | 'equivalent_diameter', 290 | 'equivalent_diameter_ratio', 291 | 'equivalent_diameter_tmean', 292 | 'euclidean_dist', 293 | 'euclidean_dist_ratio', 294 | 'euclidean_dist_tmean', 295 | 'extent', 296 | 'extent_ratio', 297 | 'extent_tmean', 298 | 'filled_area', 299 | # 'filled_area_ratio', # Doesn't work in DR if using self-standardized because min (0) becomes inf. 
300 | 'filled_area_tmean', 301 | 'glob_turn_deg', 302 | # 'glob_turn_deg_ratio', 303 | # 'glob_turn_deg_tmean', 304 | 'major_axis_length', 305 | # 'major_axis_length_ratio', 306 | # 'major_axis_length_tmean', 307 | 'max_dist', 308 | 'max_dist_ratio', 309 | 'max_dist_tmean', 310 | 'minor_axis_length', 311 | # 'minor_axis_length_ratio', 312 | # 'minor_axis_length_tmean', 313 | 'orientation', 314 | 'orientation_ratio', 315 | 'orientation_tmean', 316 | 'orientedness', 317 | 'orientedness_ratio', 318 | 'orientedness_tmean', 319 | 'outreach_ratio', 320 | 'outreach_ratio_ratio', 321 | 'outreach_ratio_tmean', 322 | 'perimeter', 323 | 'perimeter_ratio', 324 | 'perimeter_tmean', 325 | 'rip_K', 326 | # 'rip_K_ratio', 327 | # 'rip_K_tmean', 328 | 'rip_L', 329 | # 'rip_L_ratio', 330 | # 'rip_L_tmean', 331 | 'rip_p', 332 | # 'rip_p_ratio', 333 | # 'rip_p_tmean', 334 | 'solidity', 335 | 'solidity_ratio', 336 | 'solidity_tmean', 337 | 'speed', 338 | 'speed_ratio', 339 | 'speed_tmean', 340 | 'turn_angle', 341 | 'turn_angle_ratio', 342 | 'turn_angle_tmean'] -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/initialization/initialization.py: -------------------------------------------------------------------------------- 1 | #initialization.py 2 | 3 | from initialization.config import * 4 | 5 | import os 6 | import shutil 7 | import datetime 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | 11 | TIMESTAMP = str(datetime.datetime.now()).replace(':', '-').replace('.', '-').replace(' ', '_') 12 | 13 | # print('Dataset in current notebook: ',DATASET_SHORTNAME) 14 | 15 | print('Initializing: ', DATASET_SHORTNAME) 16 | print('Hypothesis testing using: ',STAT_TEST) 17 | 18 | 19 | ''' 20 | Make the folders for exporting 21 | ''' 22 | 23 | TEMP_OUTPUT = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,TIMESTAMP,'tmp/') 24 | ANIM_OUTPUT = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,TIMESTAMP,'animations/') 25 | 26 | SAVED_DATA_PATH = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,'saved_data/') 27 | 28 | # Create timestamped folders to contain data and plots from this analysis 29 | # Main level: 30 | DATA_OUTPUT = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,TIMESTAMP,'data/') 31 | PLOT_OUTPUT = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,TIMESTAMP,'plots/') 32 | print('Plots will be exported to: ', PLOT_OUTPUT) 33 | 34 | 35 | if not os.path.exists(TEMP_OUTPUT): 36 | os.makedirs(TEMP_OUTPUT) 37 | 38 | if not os.path.exists(ANIM_OUTPUT): 39 | os.makedirs(ANIM_OUTPUT) 40 | 41 | if not os.path.exists(SAVED_DATA_PATH): 42 | os.makedirs(SAVED_DATA_PATH) 43 | 44 | if not os.path.exists(PLOT_OUTPUT): 45 | os.makedirs(PLOT_OUTPUT) 46 | 47 | if not os.path.exists(DATA_OUTPUT): 48 | os.makedirs(DATA_OUTPUT) 49 | 50 | if not os.path.exists(SAVED_DATA_PATH): 51 | os.makedirs(SAVED_DATA_PATH) 52 | 53 | 54 | print('Using unique embedding per dataset shortname: ',DATASET_SHORTNAME) 55 | EMBEDDING_PATH = os.path.join(OUTPUT_PATH,DATASET_SHORTNAME,'tsne_embedding/') 56 | 57 | 58 | 59 | # Sub folders for analysis components: 60 | COMP_DIR = os.path.join(PLOT_OUTPUT,'Comparative_analysis/') 61 | DR_DIR = os.path.join(PLOT_OUTPUT,'Dimensionality_Reduction/') 62 | CLUST_DIR = os.path.join(PLOT_OUTPUT,'Clustering/') 63 | 64 | # Sub-directories for parameter sweeping: 65 | DR_PARAMS_DIR = os.path.join(DR_DIR,'Parameter_sweep/') 66 | CLUST_PARAMS_DIR = os.path.join(CLUST_DIR,'Parameter_sweep/') 67 | 68 | if not os.path.exists(COMP_DIR): 69 | os.makedirs(COMP_DIR) 70 | 71 | if
DIMENSION_REDUCTION and not os.path.exists(DR_DIR): 72 | os.makedirs(DR_DIR) 73 | 74 | if CLUSTERING and not os.path.exists(CLUST_DIR): 75 | os.makedirs(CLUST_DIR) 76 | 77 | if PARAM_SWEEP and not os.path.exists(DR_PARAMS_DIR): 78 | os.makedirs(DR_PARAMS_DIR) 79 | 80 | if PARAM_SWEEP and not os.path.exists(CLUST_PARAMS_DIR): 81 | os.makedirs(CLUST_PARAMS_DIR) 82 | 83 | 84 | # Sub folders for plot types (Comparative) 85 | SUPERPLOT_DIR = os.path.join(COMP_DIR,'Superplots/') 86 | SUPERPLOT_grays_DIR = os.path.join(COMP_DIR,'Superplots_grays/') 87 | DIFFPLOT_DIR = os.path.join(COMP_DIR,'Plots_of_differences/') 88 | MARGSCAT_DIR = os.path.join(COMP_DIR,'Marginal_scatterplots/') 89 | TIMEPLOT_DIR = os.path.join(COMP_DIR,'Timeplots/') 90 | BAR_DIR = os.path.join(COMP_DIR,'Bar_plots/') 91 | BAR_SNS_DIR = os.path.join(COMP_DIR,'SNS_Gray_Bar_plots/') 92 | 93 | 94 | if DRAW_SUPERPLOTS and not os.path.exists(SUPERPLOT_DIR): 95 | print('Exporting static Superplots') 96 | os.makedirs(SUPERPLOT_DIR) 97 | 98 | if DRAW_SUPERPLOTS_grays and not os.path.exists(SUPERPLOT_grays_DIR): 99 | print('Exporting static Superplots') 100 | os.makedirs(SUPERPLOT_grays_DIR) 101 | 102 | if DRAW_DIFFPLOTS and not os.path.exists(DIFFPLOT_DIR): 103 | print('Exporting static Plots of Differences') 104 | os.makedirs(DIFFPLOT_DIR) 105 | 106 | if DRAW_MARGSCAT and not os.path.exists(MARGSCAT_DIR): 107 | print('Exporting static Marginal scatterplots') 108 | os.makedirs(MARGSCAT_DIR) 109 | 110 | if DRAW_TIMEPLOTS and not os.path.exists(TIMEPLOT_DIR): 111 | print('Exporting static Timeplots') 112 | os.makedirs(TIMEPLOT_DIR) 113 | 114 | if DRAW_BARPLOTS and not os.path.exists(BAR_DIR): 115 | print('Exporting Bar plots') 116 | os.makedirs(BAR_DIR) 117 | 118 | if DRAW_SNS_BARPLOTS and not os.path.exists(BAR_SNS_DIR): 119 | print('Exporting SNS Bar plots') 120 | os.makedirs(BAR_SNS_DIR) 121 | 122 | 123 | # Create the folder where the subgroup cluster outputs will go: 124 | 125 | CLUST_TSNE_DIR = os.path.join(CLUST_DIR,'tSNE/') 126 | CLUST_PCA_DIR = os.path.join(CLUST_DIR,'PCA/') 127 | CLUST_XY_DIR = os.path.join(CLUST_DIR,'xy/') 128 | CLUST_DISAMBIG_DIR = os.path.join(CLUST_DIR,'Cluster_Disambiguation/') 129 | TRAJECTORY_DISAMBIG_DIR = os.path.join(CLUST_DIR,'Trajectory_Cluster_Disambiguation/') 130 | CLUST_DISAMBIG_DIR_TAVG = os.path.join(CLUST_DIR,'Cluster_Disambiguation_tavg/') 131 | CLUSTERING_DIR = os.path.join(CLUST_DIR,'Clustering/') 132 | 133 | if not os.path.exists(CLUSTERING_DIR): 134 | os.makedirs(CLUSTERING_DIR) 135 | 136 | if not os.path.exists(CLUST_DISAMBIG_DIR): 137 | os.makedirs(CLUST_DISAMBIG_DIR) 138 | 139 | if not os.path.exists(TRAJECTORY_DISAMBIG_DIR): 140 | os.makedirs(TRAJECTORY_DISAMBIG_DIR) 141 | 142 | if not os.path.exists(CLUST_DISAMBIG_DIR_TAVG): 143 | os.makedirs(CLUST_DISAMBIG_DIR_TAVG) 144 | 145 | if CLUSTERING and CLUSTER_TSNE and not os.path.exists(CLUST_TSNE_DIR): 146 | os.makedirs(CLUST_TSNE_DIR) 147 | 148 | if CLUSTERING and CLUSTER_PCA and not os.path.exists(CLUST_PCA_DIR): 149 | os.makedirs(CLUST_PCA_DIR) 150 | 151 | if CLUSTERING and CLUSTER_XY and not os.path.exists(CLUST_XY_DIR): 152 | os.makedirs(CLUST_XY_DIR) 153 | 154 | 155 | 156 | # Some assert statements as sanity checks: 157 | assert CTL_LABEL in CONDITIONS_TO_INCLUDE, 'Be sure that CTL_LABEL in config is within the CONDITIONS_TO_INCLUDE list' 158 | 159 | if(USE_SHORTLABELS): 160 | this_cond_ind = CONDITIONS_TO_INCLUDE.index(CTL_LABEL) 161 | CTL_SHORTLABEL = CONDITION_SHORTLABELS[this_cond_ind] 162 | print('Using corresponding 
CTL_SHORTLABEL: ',CTL_SHORTLABEL, 163 | ' for condition: ', CTL_LABEL) 164 | 165 | # Archive a copy of this config file for future reference 166 | if(ARCHIVE_CONFIG): 167 | 168 | # Save a copy of config.py (exported with a .txt extension) so the run is easy to reproduce later 169 | path_to_config = 'initialization/config.py' 170 | export_path = DATA_OUTPUT + 'config_' + '.txt' #+ TIMESTAMP (removed because folder already created with timestamp in name) 171 | shutil.copyfile(path_to_config, export_path) 172 | 173 | 174 | print('Dataset in current notebook: ',DATASET_SHORTNAME) 175 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # from initialization.config import * 2 | # from initialization.initialization import * 3 | # 4 | # from data_processing.cell_identifier import * 5 | # from data_processing.cleaning_formatting_filtering import * 6 | # from data_processing.clustering import * 7 | # from data_processing.data_io import * 8 | # from data_processing.data_wrangling import * 9 | # from data_processing.dimensionality_reduction import * 10 | # from data_processing.measurements import * 11 | # from data_processing.migration_calculations import * 12 | # from data_processing.pipelines import * 13 | # from data_processing.shape_calculations import * 14 | # from data_processing.statistics import * 15 | # from data_processing.time_calculations import * 16 | # from data_processing.trajectory_clustering import * 17 | # 18 | # from visualization.comparative_visualization import * 19 | # from visualization.cluster_visualization import * 20 | # from visualization.filter_visualization import * 21 | # from visualization.low_dimension_visualization import * 22 | # from visualization.panel_apps import * 23 | # from visualization.plots_of_differences import * 24 | # from visualization.small_multiples import * 25 | # from visualization.superplots import * 26 | # from visualization.timecourse_visualization import * 27 | # from visualization.trajectory_visualization import * 28 | 29 | print("Finished initializing visualizations") 30 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/visualization/comparative_visualization.py: -------------------------------------------------------------------------------- 1 | # comparative_visualization.py 2 | 3 | from initialization.initialization import * 4 | from initialization.config import * 5 | 6 | from data_processing.data_wrangling import * 7 | from data_processing.statistics import * 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import os 12 | 13 | import scipy 14 | import scipy.stats as st 15 | 16 | import seaborn as sns 17 | import matplotlib.pyplot as plt 18 | 19 | import plotly 20 | import plotly.graph_objects as go 21 | from plotly.subplots import make_subplots 22 | import plotly.express as px 23 | 24 | 25 | 26 | def scatter2dplotly_compare(comb_df, factors): 27 | 28 | ''' 29 | 2D scatter plot built with plotly graph objects, intended to visualize the 30 | results of the dimensionality reduction operations. 31 | This version is explicitly intended for comparing between conditions on the 32 | same scatter axis. 33 | 34 | 35 | Input: 36 | comb_df: DataFrame, contains combined data from multiple conditions, and/or replicates 37 | factors: 38 | color_by: Indicates what factor should be used to color the points.
39 | default='Condition' 40 | 41 | Returns: 42 | fig_data: 43 | Note: can be visualized normally by using: 44 | fig = go.Figure(fig_data) 45 | fig.show() 46 | 47 | ''' 48 | 49 | # Extract the data to be used to color-code 50 | cmaps = ['Viridis', 'inferno'] 51 | 52 | ''' 53 | For each of the conditions to be plotted, assign them a colormap. 54 | Create trace_data for each, 55 | ''' 56 | cond_list = comb_df['Condition'].unique() 57 | trace_list = [] # Keep traces in list to return, instead of fig object. 58 | for i, condition in enumerate(cond_list): 59 | 60 | sub_df = comb_df.loc[comb_df['Condition'] == condition] 61 | 62 | x = sub_df[factors[0]] 63 | y = sub_df[factors[1]] 64 | 65 | trace_data = go.Scatter( 66 | x=x, 67 | y=y, 68 | mode='markers', 69 | marker=dict( 70 | size=5, 71 | color=sub_df['frame'], # set color to an array/list of desired values 72 | colorscale=cmaps[i], # choose a colorscale 73 | opacity=0.5)) 74 | 75 | trace_list.append(trace_data) 76 | 77 | # After all replicates are drawn, THEN draw the summary stats fig_data 78 | # fig.update_layout(showlegend=False, 79 | # yaxis_title=factor) 80 | 81 | # Define fig layout as dict, to return and apply in the pipeline 82 | fig_layout={ 83 | 'xaxis_title': factors[0], 84 | 'yaxis_title': factors[1], 85 | 'showlegend': False, 86 | 'title': 'Low-dimension scatterplot' 87 | } 88 | 89 | # Create the Plotly figure from the accumulated traces and layout. 90 | scatter_comp = go.Figure() 91 | for trace in trace_list: 92 | scatter_comp.add_trace(trace) 93 | scatter_comp.update_layout(fig_layout) 94 | 95 | if STATIC_PLOTS: 96 | scatter_comp.write_image(PLOT_OUTPUT+str(factors)+"_comparative_scatter_plotly.png") 97 | 98 | if PLOTS_IN_BROWSER: 99 | scatter_comp.show() 100 | 101 | 102 | return trace_list, fig_layout #fig_data 103 | 104 | 105 | 106 | 107 | def scatter3dplotly_compare(comb_df, factors): 108 | 109 | ''' 110 | 3D scatter plot built with plotly graph objects, intended to visualize the 111 | results of the dimensionality reduction operations. 112 | This version is explicitly intended for comparing between conditions on the 113 | same scatter axis. 114 | 115 | 116 | 117 | Input: 118 | comb_df: DataFrame, contains combined data from multiple conditions, and/or replicates 119 | factors: 120 | color_by: Indicates what factor should be used to color the points. 121 | default='Condition' 122 | 123 | Returns: 124 | 125 | trace_list: 126 | fig_layout: 127 | OR 128 | fig_data: 129 | Note: can be visualized normally by using: 130 | fig = go.Figure(fig_data) 131 | fig.show() 132 | 133 | ''' 134 | 135 | # Extract the data to be used to color-code 136 | cmaps = ['Viridis', 'inferno'] 137 | 138 | ''' 139 | For each of the conditions to be plotted, assign them a colormap. 140 | Create trace_data for each, 141 | ''' 142 | cond_list = comb_df['Condition'].unique() 143 | trace_list = [] # Keep traces in list to return, instead of fig object.
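# The loop below builds one trace per condition so that each condition keeps its own colorscale
# from cmaps; note that cmaps holds only two colormaps, so this comparison view assumes two conditions.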
144 | for i, condition in enumerate(cond_list): 145 | 146 | sub_df = comb_df.loc[comb_df['Condition'] == condition] 147 | 148 | x = sub_df[factors[0]] 149 | y = sub_df[factors[1]] 150 | z = sub_df[factors[2]] 151 | 152 | trace_data = go.Scatter3d( 153 | x=x, 154 | y=y, 155 | z=z, 156 | mode='markers', 157 | marker=dict( 158 | size=5, 159 | color=sub_df['frame'], # set color to an array/list of desired values 160 | colorscale=cmaps[i], # choose a colorscale 161 | opacity=1)) 162 | 163 | trace_list.append(trace_data) 164 | 165 | 166 | # Define fig layout as dict, to return and apply in the pipeline 167 | fig_layout={ 168 | 'xaxis_title': factors[0], 169 | 'yaxis_title': factors[1], 170 | 'showlegend': False, 171 | 'title': 'Low-dimension scatterplot' 172 | } 173 | 174 | # Create the Plotly figure from the accumulated traces and layout. 175 | scatter_comp = go.Figure() 176 | for trace in trace_list: 177 | scatter_comp.add_trace(trace) 178 | scatter_comp.update_layout(fig_layout) 179 | 180 | if STATIC_PLOTS: 181 | scatter_comp.write_image(PLOT_OUTPUT+str(factors)+"_comparative_scatter_plotly.png") 182 | 183 | if PLOTS_IN_BROWSER: 184 | scatter_comp.show() 185 | 186 | return trace_list, fig_layout #fig_data 187 | 188 | 189 | 190 | def plotly_marginal_scatter(df, pair, save_path=MARGSCAT_DIR): 191 | 192 | ''' 193 | Create a plotly express scatterplot comparing multiple conditions, for a user-provided 194 | pair of factors. 195 | 196 | input: 197 | df: DataFrame 198 | pair: list of factors to compare. 199 | ''' 200 | 201 | assert len(pair) == 2, 'Marginal scatter requires 2 factors as input' 202 | 203 | fig = px.scatter(df, x=pair[0], y=pair[1], color="Condition", 204 | marginal_x="violin", marginal_y="violin", 205 | title="Comparative marginal scatter: "+str(pair)) 206 | 207 | if STATIC_PLOTS: 208 | fig.write_image(save_path+'marginal_scatter_'+str(pair)+'.png') 209 | 210 | if PLOTS_IN_BROWSER: 211 | fig.show() 212 | 213 | 214 | 215 | def marginal_xy(df, pair, plot_type='scatter', renderer='plotly', save_path=MARGSCAT_DIR, bounds=None,supp_label=''): 216 | 217 | ''' 218 | Create a scatter, contour, or hex plot with marginal distributions comparing multiple conditions, 219 | for a user-provided pair of factors. 220 | 221 | input: 222 | df: DataFrame 223 | pair: list of factors to compare.
224 | plot_type: scatter, contour, hex 225 | renderer: plotly or seaborn 226 | 227 | ''' 228 | 229 | assert len(pair) == 2, 'Marginal scatter requires 2 factors as input' 230 | 231 | cond_grouping = 'Condition' 232 | # rep_grouping = 'Replicate_ID' 233 | 234 | if(USE_SHORTLABELS): 235 | cond_grouping = 'Condition_shortlabel' 236 | # rep_grouping = 'Replicate_shortlabel' 237 | 238 | # Unpack the bounds if they exist 239 | if bounds is not None: 240 | x_min, x_max,y_min, y_max = bounds 241 | 242 | if renderer == 'plotly': 243 | 244 | if plot_type == 'scatter': 245 | fig = px.scatter(df, x=pair[0], y=pair[1], color=cond_grouping, 246 | marginal_x="violin", marginal_y="violin", 247 | title='marginal_xy_'+plot_type+ '_'+str(pair)) 248 | 249 | elif plot_type == 'contour': 250 | 251 | fig = px.density_contour(df, x=pair[0], y=pair[1], color=cond_grouping, 252 | marginal_x="violin", marginal_y="violin", 253 | title='marginal_xy_'+plot_type+ '_'+str(pair)) 254 | 255 | elif plot_type == 'hex': 256 | 257 | print('No hexbin plot type in plotly.') 258 | fig = go.Figure() 259 | 260 | if STATIC_PLOTS: 261 | fig.write_image(save_path+'marginal_xy_plotly_'+plot_type+ '_'+str(pair)+'_'+supp_label+'.png') 262 | 263 | if PLOTS_IN_BROWSER: 264 | fig.show() 265 | 266 | elif renderer == 'seaborn': 267 | 268 | if plot_type == 'scatter': 269 | 270 | # If only one condition: 271 | if len(df[cond_grouping].unique())==1: 272 | 273 | fig = sns.jointplot(data=df, x=pair[0], y=pair[1], color='black', 274 | joint_kws={'s': 1}, alpha=0.5) 275 | else: 276 | fig = sns.jointplot(data=df, x=pair[0], y=pair[1], hue = df[cond_grouping], 277 | joint_kws={'s': 1}, alpha=0.5) 278 | plt.legend(loc='best', bbox_to_anchor=(1.05, 1), borderaxespad=0.) 279 | 280 | elif plot_type == 'contour': 281 | 282 | # Use a single black KDE if there's only one condition in the provided dataset. 283 | if len(df[cond_grouping].unique())==1: 284 | 285 | fig = sns.jointplot(data=df, x=pair[0], y=pair[1], color = 'black', kind="kde", palette='magma') 286 | plt.suptitle(supp_label, y=1.05, fontsize = 16) 287 | 288 | else: 289 | fig = sns.jointplot(data=df, x=pair[0], y=pair[1], hue = df[cond_grouping],kind="kde", palette=PALETTE) 290 | plt.suptitle(supp_label, y=1.05, fontsize = 16) 291 | sns.color_palette(PALETTE, as_cmap=True) 292 | 293 | elif plot_type == 'hex': 294 | print('No multi-condition hexplot available, consider making small multiples.') 295 | fig = sns.jointplot(data=df, x=pair[0], y=pair[1],kind="hex",palette='magma') 296 | plt.suptitle(supp_label, y=1.05, fontsize = 16) 297 | sns.color_palette("magma", as_cmap=True) 298 | 299 | if bounds is not None: 300 | fig.ax_marg_x.set_xlim(x_min, x_max) 301 | fig.ax_marg_y.set_ylim(y_min, y_max) 302 | 303 | 304 | if STATIC_PLOTS: 305 | 306 | fig.savefig(save_path+'marginal_xy_sns_'+plot_type+ '_'+str(pair)+'_'+supp_label+'.png', dpi=300)#plt. 307 | 308 | 309 | 310 | def comparative_bar(df_tup, x, y, title='', height=400, to_plot='avg',error='SEM', save_path=BAR_DIR): #color='Condition' 311 | # print('THIS IS THE INPUT DF') 312 | # display(df_tup) 313 | # print(df_tup.columns) 314 | 315 | widthmultiplier = len(df_tup) 316 | print("widthmultiplier: ", widthmultiplier) 317 | 318 | ''' 319 | Simple bar plot convenience function that allows plotting of color-coded conditions either on a per-condition 320 | or per-replicate basis.
321 | Eventually to be replaced by a bar plot that includes a measure of variance plotted as error bars 322 | 323 | Inputs: 324 | df_tup: tuple of DataFrames (avg, std, n) to be plotted 325 | x: Grouping, 'Condition' 'Replicate_ID' 326 | y: factor to be visualized 327 | title: str, additional label for the saved plot. 328 | color: factor to color by, default: 'Condition' 329 | height: plot height, default: 400px 330 | to_plot: str, what to plot: 'avg' or 'n' 331 | error: Measure of variance for error bars, str: SEM or STD 332 | ''' 333 | # This part extracts an sns colormap for use in plotly express ### 334 | 335 | pal = sns.color_palette(CONDITION_CMAP) # Extract a colormap from seaborn 336 | cmap=pal.as_hex()[:] # Output it as hex colors, which plotly express accepts below 337 | 338 | # Split up the input tuple:(avg, std, n) 339 | df = df_tup[0] 340 | std_df = df_tup[1] 341 | n_df = df_tup[2] 342 | 343 | if(USE_SHORTLABELS): 344 | 345 | # df = add_shortlabels(df) 346 | grouping = 'Condition_shortlabel' 347 | 348 | # Sort the dataframe by custom category list to set draw order 349 | df[grouping] = pd.Categorical(df[grouping], CONDITION_SHORTLABELS) 350 | 351 | # Also replace the x-labels on the plot and legend. 352 | if(x=='Condition'): 353 | x = 'Condition_shortlabel' 354 | elif(x=='Replicate_ID'): 355 | x = 'Replicate_shortlabel' 356 | 357 | else: 358 | 359 | grouping = 'Condition' 360 | 361 | # Sort the dataframe by custom category list to set draw order 362 | df[grouping] = pd.Categorical(df[grouping], CONDITIONS_TO_INCLUDE) 363 | 364 | color = grouping 365 | 366 | 367 | df.sort_values(by=grouping, inplace=True, ascending=True) 368 | 369 | if error == 'SEM': 370 | y_error = std_df[y] / np.sqrt(n_df[y]) # Estimate of SEM (std / square root of n) 371 | 372 | elif error == 'STD': 373 | y_error = std_df[y] # Standard deviation 374 | 375 | if(to_plot == 'avg'): 376 | # Plot the means between groups for this factor, between conditions and between replicates.
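# (The error_y bars attached below use y_error computed above: std/sqrt(n) when error='SEM', or the
# raw standard deviation when error='STD', taken from the std and n DataFrames of the input tuple.)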
377 | fig = px.bar(df, x=x, y=y, color=color, height=height, 378 | # color_discrete_sequence=eval(PX_COLORS),#cmap 379 | color_discrete_sequence=cmap, 380 | error_y = y_error) 381 | 382 | elif(to_plot == 'n'): 383 | 384 | fig = px.bar(n_df, x=x, y=y, color=color, height=height, 385 | # color_discrete_sequence=eval(PX_COLORS), 386 | color_discrete_sequence=cmap, 387 | labels = dict(y="Number of cells")) 388 | 389 | widthofplot = 220* widthmultiplier 390 | #change the font size of the axis labels 391 | fig.update_layout(showlegend=False, 392 | # plot_bgcolor = 'white', 393 | autosize=False, 394 | width = widthofplot, 395 | height = 650, 396 | font=dict( 397 | #family="Courier New, monospace", 398 | size=PLOT_TEXT_SIZE, 399 | color="Black")) 400 | # fig.update_xaxes(tickangle=90) 401 | # Remove the x axis label 402 | fig.update_xaxes(title_text='', tickangle=45) 403 | 404 | 405 | # change the font size of the y and x axis tick labels 406 | 407 | 408 | 409 | 410 | if STATIC_PLOTS: 411 | fig.write_image(save_path+y+'_'+to_plot+'_'+title + '.png') 412 | 413 | if PLOTS_IN_BROWSER: 414 | fig.show() 415 | 416 | return fig 417 | 418 | def comparative_SNS_bar(df, save_path=BAR_SNS_DIR): 419 | import seaborn as sns 420 | whattoplot=ALL_FACTORS 421 | CLUSTER_CMAP = 'tab20' 422 | # CONDITION_CMAP = 'dark' 423 | 424 | colors = np.asarray(sns.color_palette('Greys', n_colors=6)) 425 | timestorepeat_in=(len(df['Condition'].unique()))/2 426 | timestorepeat = (np.ceil(timestorepeat_in)).astype(int) 427 | colors2=colors[2] 428 | colors3=colors[4] 429 | colors4=np.stack((colors2,colors3)) 430 | colors5 = np.tile(colors4,(timestorepeat,1)) 431 | colors=colors5 432 | 433 | import seaborn as sns 434 | sns.set_theme(style="ticks") 435 | # sns.set_palette(CONDITION_CMAP) 436 | 437 | x_lab = whattoplot 438 | plottitle = "" 439 | 440 | for factor in np.arange(len(whattoplot)): 441 | # f, ax = plt.subplots(1, 1, figsize=(10, 10)) #sharex=True 442 | f, ax = plt.subplots() #sharex=True 443 | sns.barplot(ax=ax, x="Condition_shortlabel", y=whattoplot[factor], data=df, palette=colors,capsize=.2, dodge=False) #ci=85, # estimator=np.mean, 444 | # sns.catplot(ax=ax, x="Condition_shortlabel", y=whattoplot[factor], data=df, palette=colors, kind="boxen") #errorbar=('ci', 95) 445 | sns.stripplot(ax=ax, x="Condition_shortlabel", y=whattoplot[factor], data=df, size=5, color=".1",alpha = 0.6, linewidth=0, jitter=0.2) 446 | 447 | ax.xaxis.grid(True) 448 | ax.set(xlabel="") 449 | ax.set_ylabel(whattoplot[factor], fontsize=PLOT_TEXT_SIZE) 450 | ax.set_title("", fontsize=PLOT_TEXT_SIZE) 451 | # ax.tick_params(axis='both', labelsize=36) 452 | ax.tick_params(axis='y', labelsize=PLOT_TEXT_SIZE) 453 | ax.tick_params(axis='x', labelsize=PLOT_TEXT_SIZE, rotation = 45) 454 | # f.tight_layout() 455 | plt.setp(ax.patches, linewidth=3, edgecolor='k') 456 | # fig.write_image(CLUST_DISAMBIG_DIR+'\cluster_label_counts.png') 457 | f.savefig(save_path+str(whattoplot[factor])+'_gray_barplot.png', dpi=300)#plt. 
458 | if PLOTS_IN_BROWSER: 459 | f.show() 460 | 461 | return 462 | 463 | def getaveragevalues(df_in, factorstoinclude, savepath = SAVED_DATA_PATH): 464 | cols = factorstoinclude + ['Condition_shortlabel'] 465 | df_vals=df_in[cols] 466 | 467 | df_averaged_mean = df_vals.groupby('Condition_shortlabel').mean().reset_index() 468 | df_averaged_median = df_vals.groupby('Condition_shortlabel').median().reset_index() 469 | # Save the dfs as csv files 470 | df_averaged_mean.to_csv(savepath + 'Mean_Values_per_condition.csv', index=False) 471 | df_averaged_median.to_csv(savepath + 'Median_Values_per_condition.csv', index=False) 472 | 473 | if 'label' in df_in.columns: 474 | cols = factorstoinclude + ['label'] 475 | df_labels=df_in[cols] 476 | df_cluster_labels_averaged_mean = df_labels.groupby('label').mean().reset_index() 477 | df_cluster_labels_averaged_median = df_labels.groupby('label').median().reset_index() 478 | df_cluster_labels_averaged_mean.to_csv(savepath + 'Mean_Values_per_cluster.csv', index=False) 479 | df_cluster_labels_averaged_median.to_csv(savepath + 'Median_Values_per_cluster.csv', index=False) 480 | # Can potentially include a multi groupby on the label and condition_shortlabel to get the mean and median values per cluster per condition 481 | 482 | return -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/visualization/filter_visualization.py: -------------------------------------------------------------------------------- 1 | from initialization.initialization import * 2 | from initialization.config import * 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | 8 | import matplotlib.pyplot as plt 9 | import plotly.graph_objects as go 10 | from plotly.subplots import make_subplots 11 | 12 | plt.rcParams.update({ 13 | "figure.facecolor": (1.0, 1.0, 1.0, 1.0), 14 | "axes.facecolor": (1.0, 1.0, 1.0, 1.0), 15 | "savefig.facecolor": (1.0, 1.0, 1.0, 1.), 16 | "figure.figsize": (10,10), 17 | "font.size": 12 18 | }) 19 | 20 | def visualize_filtering(df, filt_counts, plot_by='xy'): 21 | 22 | assert 'included' in df.columns, 'visualize_filtering() must be run on filtered dataframe' 23 | 24 | if plot_by == 'xy': 25 | x_name = 'x_um' 26 | y_name = 'y_um' 27 | color_by = 'rip_L' 28 | x_label='x position (microns)' 29 | y_label='y position (microns)' 30 | 31 | elif plot_by == 'pca': 32 | x_name = 'PC1' 33 | y_name = 'PC2' 34 | color_by = 'label' 35 | x_label='PC1' 36 | y_label='PC2' 37 | 38 | elif (plot_by == 'tsne' or plot_by == 'tSNE'): 39 | 40 | x_name = 'tSNE1' 41 | y_name = 'tSNE2' 42 | color_by = 'label' 43 | x_label='tSNE1' 44 | y_label='tSNE2' 45 | 46 | elif plot_by == 'umap': 47 | 48 | x_name = 'UMAP1' 49 | y_name = 'UMAP2' 50 | color_by = 'label' 51 | x_label = 'UMAP1' 52 | y_label = 'UMAP2' 53 | 54 | fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=[20,10]) 55 | 56 | df_filt = df[df['included'] == True] 57 | 58 | ax1.scatter(x=df[x_name], y=df[y_name], color='gray', s=0.5) 59 | ax1.scatter(x=df_filt[x_name], y=df_filt[y_name], c=df_filt[color_by], s=5) # 60 | ax1.set_xlabel(x_label) 61 | ax1.set_ylabel(y_label) 62 | 63 | filt_cond = ['Pre-filtering'] 64 | counts = [len(df['uniq_id'].unique())] 65 | 66 | for filt in filt_counts: 67 | filt_cond.append(filt[0]) 68 | counts.append(filt[1]) 69 | 70 | filt_cond.append('Post-filtering') 71 | counts.append(len(df_filt['uniq_id'].unique())) 72 | 73 | ax2.bar(filt_cond,counts) 74 | ax2.set_ylabel('Number of cells') 75 | 76 | return fig 77 | 78 | 79 | def visualize_filt_loss(): 80 | 81 | # Labels as names of
exported dataframes 82 | labels = ['comb_df', 83 | 'mig_df', 84 | 'dr_df-prefilt', 85 | 'dr_df_filt'] 86 | 87 | # Add the programatically generated names for the filtered outputs 88 | # From the DATA_FILTERS dictionary 89 | for i,factor in enumerate(DATA_FILTERS.keys()): 90 | labels.append('filt_'+str(i)+'-'+factor) 91 | 92 | # Load each of the DataFrames into a list 93 | df_list = [] 94 | for label in labels: 95 | df_list.append(pd.read_csv(DATA_OUTPUT + label+'.csv')) 96 | 97 | # Set up the subplot figure. 98 | fig = make_subplots( 99 | rows=2, cols=len(df_list), 100 | # subplot_titles=(labels), 101 | specs=[[{} for _ in range(len(df_list))], 102 | [{'colspan': len(df_list)}, *[None for _ in range(len(df_list)-1)]]]) 103 | 104 | count = [] 105 | 106 | for i, df in enumerate(df_list): #enumerate here to get access to i 107 | label=labels[i] 108 | count.append(len(df.index)) 109 | 110 | fig.add_trace(go.Scatter(x=df['x'], 111 | y=df['y'], 112 | opacity=0.5), 113 | row=1, 114 | col=i+1) 115 | 116 | fig.add_trace(go.Scatter(x=labels, y=count), 117 | row=2, col=1) 118 | 119 | fig.update_yaxes(rangemode="tozero") 120 | fig.update_xaxes(tickangle=-90) 121 | fig.update_layout(showlegend=False) 122 | 123 | if STATIC_PLOTS: 124 | fig.write_image(PLOT_OUTPUT+'filter_loss.png') 125 | 126 | if PLOTS_IN_BROWSER: 127 | fig.show() 128 | -------------------------------------------------------------------------------- /cellPLATO/cellPLATO/visualization/low_dimension_visualization.py: -------------------------------------------------------------------------------- 1 | #low_dimension_visualization.py 2 | 3 | from initialization.config import * 4 | from initialization.initialization import * 5 | 6 | from data_processing.clustering import hdbscan_clustering 7 | from data_processing.dimensionality_reduction import * 8 | from data_processing.shape_calculations import * 9 | 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import os 14 | import imageio 15 | 16 | import plotly 17 | import plotly.graph_objects as go 18 | from plotly.subplots import make_subplots 19 | import seaborn as sns 20 | 21 | # matplotlib imports 22 | import matplotlib.pyplot as plt 23 | plt.rcParams['image.cmap'] = 'viridis' 24 | plt.rcParams.update({ 25 | "figure.facecolor": (1.0, 1.0, 1.0, 1.0), 26 | "axes.facecolor": (1.0, 1.0, 1.0, 1.0), 27 | "savefig.facecolor": (1.0, 1.0, 1.0, 1.), 28 | "figure.figsize": (10,10), 29 | "font.size": 12 30 | }) 31 | 32 | 33 | from mpl_toolkits.mplot3d.art3d import Line3DCollection 34 | from matplotlib.collections import LineCollection 35 | from matplotlib import colors as mcolors 36 | from matplotlib.colors import ListedColormap, BoundaryNorm 37 | 38 | plt.rcParams.update({ 39 | "figure.facecolor": (1.0, 1.0, 1.0, 1.0), 40 | "axes.facecolor": (1.0, 1.0, 1.0, 1.0), 41 | "savefig.facecolor": (1.0, 1.0, 1.0, 1.), 42 | "figure.figsize": (10,10), 43 | "font.size": 12 44 | }) 45 | 46 | # Datashader imports 47 | import datashader as ds 48 | import datashader.transfer_functions as tf 49 | from datashader.utils import export_image 50 | from matplotlib import cm 51 | 52 | from sklearn.preprocessing import MinMaxScaler 53 | 54 | import math 55 | import ternary 56 | 57 | def correlation_matrix(df): 58 | 59 | f = plt.figure(figsize=(19, 15)) 60 | plt.matshow(df.corr(), fignum=f.number, cmap='viridis') 61 | plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=14, rotation=90) 62 | plt.yticks(range(df.select_dtypes(['number']).shape[1]), 
df.select_dtypes(['number']).columns, fontsize=14) 63 | cb = plt.colorbar() 64 | 65 | return f 66 | 67 | def pca_factor_vis(df, pca_tuple, dr_factors=DR_FACTORS): 68 | 69 | pca_df, components, expl = pca_tuple#do_pca(df[dr_factors]) 70 | 71 | fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2,figsize=(15, 15)) 72 | 73 | ax1.imshow(components) 74 | ax1.set_yticklabels(dr_factors) 75 | ax1.set_yticks(np.arange(len(dr_factors))) 76 | ax1.set_xticklabels(range(1,len(components))) 77 | ax1.set_xticks(range(0,len(components[0]))) 78 | ax1.title.set_text('Principal components') 79 | ax1.set_xlabel('Principal Component') 80 | ax1.set_ylabel('Factor') 81 | 82 | ax2.imshow(components*expl) 83 | ax2.set_yticklabels(dr_factors) 84 | ax2.set_yticks(np.arange(len(dr_factors))) 85 | ax2.set_xticklabels(range(1,len(components))) 86 | ax2.set_xticks(range(0,len(components[0]))) 87 | ax2.title.set_text('Components * variance explained') 88 | ax2.set_xlabel('Principal Component') 89 | ax2.set_ylabel('Factor') 90 | 91 | ax3.imshow(np.expand_dims(np.sum(components*expl,axis=1), axis=1)) 92 | ax3.set_yticklabels(dr_factors) 93 | ax3.set_yticks(np.arange(len(dr_factors))) 94 | ax3.set_xticks(range(0)) 95 | ax3.set_ylabel('Factor') 96 | ax3.title.set_text('Sum variance contribution per factor') 97 | 98 | ax4.plot(expl) 99 | ax4.title.set_text('Variance accounted for') 100 | ax4.set_xlabel('Principal Component') 101 | factor_variance = np.sum(components*expl,axis=1) 102 | 103 | 104 | if STATIC_PLOTS: 105 | 106 | plt.savefig(DR_DIR + 'pca_variance.png', format='png', dpi=600) 107 | 108 | return fig 109 | 110 | def pca_factor_matrix(df,pca_tuple, dr_factors=DR_FACTORS, ax=None): 111 | 112 | # If no axis is supplied, then create a simple fig, ax and default to drawing the points.
113 | if ax is None: 114 | fig, ax = plt.subplots() 115 | fig.patch.set_facecolor('white') 116 | 117 | x = df[dr_factors].values 118 | 119 | pca_df, components, expl = pca_tuple#do_pca(x) 120 | dr_df = pd.concat([df,pca_df], axis=1) 121 | 122 | # Make the matrix plot 123 | # fig, ax = plt.subplots(1, 1,figsize=(15, 20)) 124 | # fig.patch.set_facecolor('white') 125 | ax.imshow(components) 126 | ax.set_yticklabels(dr_factors) 127 | ax.set_yticks(np.arange(len(dr_factors))) 128 | ax.set_xticklabels(range(1,len(components))) 129 | ax.set_xticks(range(0,len(components[0]))) 130 | ax.title.set_text('Principal components') 131 | ax.set_xlabel('Principal Component') 132 | ax.set_ylabel('Factor') 133 | 134 | # if ax is None: 135 | 136 | return ax, dr_df 137 | 138 | def colormap_pcs(dr_df, cmap = 'rgb'): 139 | 140 | pcs = np.asarray(dr_df[['PC1','PC2','PC3']]) 141 | scaler = MinMaxScaler() 142 | scaler.fit(pcs) 143 | pc_colors = scaler.transform(pcs) 144 | 145 | if cmap != 'rgb': 146 | 147 | if cmap == 'cmy': 148 | 149 | pc_colors = rgb2cmy(pc_colors) 150 | 151 | pc_colors = np.clip(pc_colors, 0, 1) 152 | 153 | return pc_colors 154 | 155 | 156 | def rgb2cmy(rgb_arr): 157 | 158 | ''' 159 | Allows recoloring the 3-factor rgb array into cyan, magenta, yellow 160 | ''' 161 | 162 | x = rgb_arr[:,0] 163 | y = rgb_arr[:,1] 164 | z = rgb_arr[:,2] 165 | 166 | w = 255 167 | x_color = x * w #/ float(scale) 168 | y_color = y * w #/ float(scale) 169 | z_color = z * w #/ float(scale) 170 | 171 | r = np.abs(w - y_color) / w 172 | g = np.abs(w - x_color) / w 173 | b = np.abs(w - z_color) / w 174 | 175 | color_arr = np.c_[r,g,b] 176 | print(color_arr.shape) 177 | 178 | 179 | return color_arr 180 | 181 | 182 | def datashader_lines(df_in, x,y,color_by='Condition', output_res=500, aspect=1,categorical=False, export=False, identifier = ''): 183 | 184 | df = df_in.copy() 185 | 186 | # Need to add conditions as category datatype to use multi-color datashader 187 | width = output_res 188 | height = int(output_res / aspect) 189 | 190 | cvs = ds.Canvas(plot_width=width, plot_height=height)#,x_range=x_range, y_range=y_range) 191 | 192 | if categorical: 193 | 194 | df['Cat'] = df[color_by].astype('category') 195 | # Multicolor categorical 196 | agg = cvs.line(df, x, y, agg=ds.count_cat('Cat')) 197 | img = tf.set_background(tf.shade(agg, how='eq_hist'),"black") 198 | 199 | else: 200 | 201 | agg = cvs.line(df, x, y, agg=ds.count()) 202 | img = tf.set_background(tf.shade(agg, cmap=cm.inferno, how='linear'),"black") 203 | 204 | if STATIC_PLOTS: 205 | 206 | # plt.savefig(CLUST_DIR+label+'.png', dpi=300) 207 | figname = CLUST_DIR+identifier+'_datashaderlines.png' 208 | export_image(img, figname, background="black") 209 | 210 | return img 211 | 212 | 213 | def spatial_img_coloc(df_in, xy='tSNE',thresh=2,n_bins=50): 214 | 215 | ''' 216 | Visualize dimensionally reduced space as a histogram and perfrom image 217 | collocalization between the images. 218 | 219 | TO DO: Update this to work for the inputted conditions... 220 | By Default it assumes the first is the control and the second is for comparison. 221 | ''' 222 | 223 | 224 | if(xy == 'tsne' or xy == 'tSNE'): 225 | 226 | x_lab = 'tSNE1' 227 | y_lab = 'tSNE2' 228 | 229 | elif xy == 'PCA': 230 | 231 | x_lab = 'PC1' 232 | y_lab = 'PC2' 233 | 234 | elif (xy == 'umap' or xy == 'UMAP'): 235 | 236 | x_lab = 'UMAP1' 237 | y_lab = 'UMAP2' 238 | 239 | df = df_in.copy() 240 | 241 | # Get the list of conditions included in the dataframe. By default show the first two. 
242 | cond_list = df['Condition_shortlabel'].unique() 243 | print(cond_list) 244 | 245 | fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10,10)) 246 | 247 | ax1.title.set_text('Combined set') 248 | # ax2.title.set_text(CTL_LABEL) 249 | # ax3.title.set_text(CONDITIONS_TO_INCLUDE[1]) 250 | # ax2.title.set_text(CONDITION_SHORTLABELS[0]) 251 | # ax3.title.set_text(CONDITION_SHORTLABELS[1]) 252 | ax2.title.set_text(cond_list[0]) 253 | ax3.title.set_text(cond_list[1]) 254 | ax4.title.set_text('Colocalization') 255 | 256 | 257 | # xy_range = [[-60, 60], [-40, 40]] 258 | xy_range = [[np.min(df[x_lab]), np.max(df[x_lab])], [np.min(df[y_lab]), np.max(df[y_lab])]] 259 | 260 | H, xedges, yedges = np.histogram2d(df[x_lab], df[y_lab], bins=n_bins, range=xy_range, normed=None, weights=None, density=None) 261 | H = H.T 262 | ax1.imshow(H) 263 | # ctl_df = df[df['Condition']==CTL_LABEL] 264 | ctl_df = df[df['Condition_shortlabel']==cond_list[0]]#CONDITION_SHORTLABELS[0]] 265 | H_ctl, xedges, yedges = np.histogram2d(ctl_df[x_lab], ctl_df[y_lab], bins=n_bins, range=xy_range, normed=None, weights=None, density=None) 266 | H_ctl = H_ctl.T 267 | ax2.imshow(H_ctl) 268 | 269 | comp_df = df[df['Condition_shortlabel']==cond_list[1]]#CONDITION_SHORTLABELS[1]] 270 | H_comp, xedges, yedges = np.histogram2d(comp_df[x_lab], comp_df[y_lab], bins=n_bins, range=xy_range, normed=None, weights=None, density=None) 271 | H_comp = H_comp.T 272 | ax3.imshow(H_comp) 273 | 274 | # Image Colocalization 275 | 276 | # Inds that will be max value 277 | thresh_1 = thresh 278 | thresh_2 = thresh 279 | inds = (H_comp > thresh_1) & (H_ctl > thresh_2) 280 | 281 | # Convert inds to white 282 | 283 | # im=H_ctl 284 | im = np.zeros(H_ctl.shape) 285 | im[inds] = 1000.0 # An arbitrarily high intensity value so you'll effectively only see this in the plot 286 | 287 | ax4.imshow(im) 288 | 289 | # Invert axes to be consistent with the scatter plots 290 | ax1.invert_yaxis() 291 | ax2.invert_yaxis() 292 | ax3.invert_yaxis() 293 | ax4.invert_yaxis() 294 | 295 | return fig 296 | 297 | 298 | 299 | def dr_contour_matrix(df_in,n_grid_pts=10, dr_method='tSNE', t_window=None): 300 | 301 | ''' 302 | 303 | n_grid_pts 304 | ''' 305 | 306 | df = df_in.copy() 307 | 308 | if(dr_method == 'tsne' or dr_method == 'tSNE'): 309 | 310 | x_lab = 'tSNE1' 311 | y_lab = 'tSNE2' 312 | 313 | elif dr_method == 'PCA': 314 | 315 | x_lab = 'PC1' 316 | y_lab = 'PC2' 317 | 318 | elif dr_method == 'umap': 319 | 320 | x_lab = 'UMAP1' 321 | y_lab = 'UMAP2' 322 | 323 | 324 | # Make the figure 325 | fig, ax = plt.subplots(1, 1, figsize=(10,10))#(ax1, ax2), (ax3, ax4) 326 | 327 | # ax.scatter(x=df[x_lab],y=df[y_lab],c='gray', alpha=0.1, s=1) 328 | pc_colors = colormap_pcs(df, cmap='rgb') # cmap='cmy' 329 | ax.scatter(x=df[x_lab],y=df[y_lab], alpha=0.5, s=1, c=pc_colors) 330 | 331 | 332 | # Create a meshgrid covering the area of DR space 333 | x_bounds = [np.min(df[x_lab]), np.max(df[x_lab])] 334 | y_bounds = [np.min(df[y_lab]), np.max(df[y_lab])] 335 | 336 | xs = np.linspace(x_bounds[0], x_bounds[1], n_grid_pts) 337 | ys = np.linspace(y_bounds[0], y_bounds[1], n_grid_pts) 338 | 339 | xx, yy = np.meshgrid(xs,ys, indexing='ij') 340 | 341 | # Lists to store the shapes 342 | df_list = [] 343 | traj_list = [] 344 | 345 | for i in range(n_grid_pts): 346 | 347 | for j in range(n_grid_pts): 348 | 349 | grid_x = xx[i,j] 350 | grid_y = yy[i,j] 351 | 352 | plt.scatter(x=grid_x,y=grid_y,c='black', alpha=0.3, s=2) 353 | 354 | # Find the closest cell to this. 
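# (Euclidean distances from this grid point to every cell's embedding coordinates are computed below,
# and the nearest row is taken as the representative cell for this grid position.)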
355 | dr_arr = df[[x_lab,y_lab]].values 356 | 357 | # Calculate the distance between grid points and DR points 358 | distances = np.sqrt((dr_arr[:,0] - grid_x)**2 + (dr_arr[:,1] - grid_y)**2) 359 | 360 | # Sort, but keep indices 361 | dist_inds = np.argsort(distances) 362 | row_ind = dist_inds[0] # The first is the closest point 363 | 364 | # Gee the sub dataframe of this cell 365 | row_df = df.loc[row_ind].to_frame().transpose() 366 | 367 | df_list.append(row_df) 368 | 369 | # if distances[row_ind] < 5: 370 | 371 | plt.scatter(x=row_df[x_lab],y=row_df[y_lab],c='red', alpha=0.3, s=5) 372 | 373 | # Get a dataframe containing the cells that fall closest to the grid points 374 | grid_cell_df = pd.concat(df_list) 375 | grid_cell_df.sort_index(inplace=True) 376 | 377 | 378 | # For each of these cells, extract their track. 379 | for i,row in grid_cell_df.iterrows(): 380 | 381 | this_rep = row['Replicate_ID'] 382 | this_cell_id = row['particle'] 383 | frame = row['frame'] 384 | 385 | # Get sub_df for cell from row 386 | cell_df = df[(df['Replicate_ID']==this_rep) & 387 | (df['particle']==this_cell_id)] 388 | 389 | 390 | if t_window is not None: 391 | 392 | # get a subset of the dataframe across the range of frames 393 | cell_df = cell_df[(cell_df['frame']>=frame - t_window/2) & 394 | (cell_df['frame'] 0): 74 | y_low = 0 75 | 76 | y_high = np.mean(data[factor]) + 3 * np.std(data[factor]) 77 | 78 | filtered_df = data[(data[factor].values>y_low) & (data[factor].values < y_high)] 79 | n_dropped = len(data[~data.isin(filtered_df)].dropna()) 80 | ax.set(ylim=(y_low, y_high)) 81 | print('Custom axis using 3 sigma rule, axis bounds not showing ' + str(n_dropped) + ' point(s): ') 82 | 83 | if STATIC_PLOTS and DRAW_SUPERPLOTS: 84 | 85 | plt.savefig(save_path + factor +'_superplots_sns_t_'+str(t)+'.png', format='png', dpi=600) 86 | 87 | 88 | def superplots_plotly(df_in, factor, t=FRAME_END, grid=False, save_path=SUPERPLOT_DIR): 89 | 90 | ''' 91 | A function to implement the 'superplots' from Lord et al 2020, 92 | where eperimental replicates within pooled conditions are plotted such that they can be distinguished. 
93 | 94 | df_in: a pandas DataFrame with the column headers:Replicate,Treatment,Speed 95 | 96 | This plot started its life as a boxplot: 97 | https://plotly.com/python/reference/box/ 98 | 99 | ''' 100 | df = df_in.copy() 101 | 102 | # Get a colormap the length of unique replicates 103 | replicates = df['Replicate_ID'].unique() 104 | colors = np.asarray(sns.color_palette(PALETTE, n_colors=len(replicates))) 105 | 106 | sp_df = format_for_superplots(df, factor,t) 107 | # print(sp_df) 108 | # print(sp_df['Replicate'].unique()) 109 | # print('Check this is correct!') 110 | 111 | if(USE_SHORTLABELS): # Must instead sort by shortlabel list order 112 | # Sort the dataframe by custom category list to set draw order 113 | sp_df['Treatment'] = pd.Categorical(sp_df['Treatment'], CONDITION_SHORTLABELS) 114 | else: 115 | # Sort the dataframe by custom category list to set draw order 116 | sp_df['Treatment'] = pd.Categorical(sp_df['Treatment'], CONDITIONS_TO_INCLUDE) 117 | 118 | sp_df.sort_values(by='Treatment', inplace=True, ascending=True) 119 | # sp_df.reset_index(inplace=True, drop=True) 120 | 121 | 122 | # Extract the actual treatment names 123 | treatment_list = list(pd.unique(sp_df['Treatment'])) 124 | # print(len(colors)) 125 | # print(treatment_list) 126 | # assert len(colors) == len(treatment_list), 'Color range should equal the number of conditons (treatments)' 127 | 128 | fig = go.Figure() 129 | 130 | # For each condition, 131 | for treatment in treatment_list: 132 | 133 | stat_list = [] # List to contain the sumary statistic plotted on top. 134 | treat_subdf = sp_df.loc[sp_df['Treatment'] == treatment] 135 | rep_list = pd.unique(treat_subdf['Replicate']) 136 | n_reps = len(rep_list) 137 | # For each replicate 138 | for i, rep in enumerate(rep_list): # Using enumerate to keep track of the # of reps 139 | ''' 140 | Note: This is needed to manually force the summary points to spread out along 141 | the x-dimension. Need to specify their position, based on the number of replicates. 142 | Important: 143 | rep gives the id of the replicate, which determines the color. This may 144 | or may not be shared with the other condition, depending on input data. 145 | i is the index relative to len(rep_list), used to distinguish between replicates 146 | of the same group. 147 | ''' 148 | # Use seperate index to choose colors 149 | if(i < len(colors)): 150 | ci = i 151 | else: 152 | ci = i - len(colors) 153 | 154 | rel_pos = -0.5 + i / n_reps 155 | rep_subdf = treat_subdf.loc[treat_subdf['Replicate'] == rep] 156 | 157 | # Draw the swarm plots 158 | fig.add_trace(go.Box(y=rep_subdf[factor].values,#y0, 159 | name=treatment,#treatment_list[0], 160 | opacity=1, 161 | marker={ 162 | 'color':'rgb' + str(tuple(colors[ci,:])),# tuple(colors[ci,:])#rep] 163 | }, 164 | fillcolor='rgba(0,0,0,0)', 165 | boxpoints='all', 166 | jitter=0.8, 167 | line={ 168 | 'width': 0 169 | }, 170 | pointpos=0)) 171 | 172 | # Save trace data to a list to draw summary stats on top. 
173 | trace_data = go.Box(y=[np.mean(rep_subdf[factor].values)],#y0)], 174 | name=treatment, 175 | opacity=1, 176 | marker={ 177 | 'size':20, 178 | 'color': 'rgb' + str(tuple(colors[ci,:])),#colors[ci,:],#rep], 179 | 'line': { 180 | 'color': 'black', 181 | 'width': 2 182 | } 183 | }, 184 | fillcolor='rgba(0,0,0,0)', 185 | boxpoints='all', 186 | jitter=0, 187 | line={ 188 | 'width': 0 189 | }, 190 | pointpos=rel_pos) 191 | 192 | stat_list.append(trace_data) 193 | 194 | # After all replicates are drawn, THEN draw the summary stats fig_data 195 | for stat in stat_list: 196 | fig.add_trace(stat) 197 | 198 | fig.update_layout(showlegend=False, 199 | plot_bgcolor = 'white', 200 | yaxis_title=factor, 201 | title_text="Superplots: "+factor, 202 | font=dict( 203 | #family="Courier New, monospace", 204 | size=PLOT_TEXT_SIZE, #CHANGED BY MJS 205 | # size=PLOT_TEXT_SIZE, 206 | color="Black")) 207 | 208 | # Show the axis frame, and optionally the grid 209 | fig.update_xaxes(showline=True, linewidth=1, linecolor='black') 210 | fig.update_yaxes(showline=True, linewidth=1, linecolor='black') 211 | 212 | if(grid): 213 | fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='black') 214 | 215 | 216 | if STATIC_PLOTS and DRAW_SUPERPLOTS: 217 | 218 | fig.write_image(save_path + factor +'_superplots_plotly_t_'+str(t)+'.png') 219 | 220 | if PLOTS_IN_BROWSER: 221 | fig.show() 222 | 223 | # Superplots retuns the figure object, not to be added to subplot figure 224 | return fig# graphJSON = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) 225 | 226 | def superplots_plotly_grays(df_in, factor, t=FRAME_END, grid=False, save_path=SUPERPLOT_grays_DIR): 227 | 228 | ''' 229 | A function to implement the 'superplots' from Lord et al 2020, 230 | where eperimental replicates within pooled conditions are plotted such that they can be distinguished. 
231 | 232 | df_in: a pandas DataFrame with the column headers:Replicate,Treatment,Speed 233 | 234 | This plot started its life as a boxplot: 235 | https://plotly.com/python/reference/box/ 236 | 237 | ''' 238 | df = df_in.copy() 239 | 240 | # Get a colormap the length of unique replicates 241 | replicates = df['Replicate_ID'].unique() 242 | # colors = np.asarray(sns.color_palette(PALETTE, n_colors=len(replicates))) 243 | # colors = np.asarray(sns.color_palette('Greys', n_colors=len(replicates))) 244 | colors = np.asarray(sns.color_palette('Greys', n_colors=6)) 245 | timestorepeat_in=(len(replicates))/2 246 | timestorepeat = (np.ceil(timestorepeat_in)).astype(int) 247 | colors2=colors[2] 248 | colors3=colors[4] 249 | colors4=np.stack((colors2,colors3)) 250 | colors5 = np.tile(colors4,(timestorepeat,1)) 251 | colors=colors5 252 | 253 | sp_df = format_for_superplots(df, factor,t) 254 | 255 | if(USE_SHORTLABELS): # Must instead sort by shortlabel list order 256 | # Sort the dataframe by custom category list to set draw order 257 | sp_df['Treatment'] = pd.Categorical(sp_df['Treatment'], CONDITION_SHORTLABELS) 258 | else: 259 | # Sort the dataframe by custom category list to set draw order 260 | sp_df['Treatment'] = pd.Categorical(sp_df['Treatment'], CONDITIONS_TO_INCLUDE) 261 | 262 | sp_df.sort_values(by='Treatment', inplace=True, ascending=True) 263 | # sp_df.reset_index(inplace=True, drop=True) 264 | 265 | 266 | # Extract the actual treatment names 267 | treatment_list = list(pd.unique(sp_df['Treatment'])) 268 | # print(len(colors)) 269 | # print(treatment_list) 270 | # assert len(colors) == len(treatment_list), 'Color range should equal the number of conditons (treatments)' 271 | 272 | fig = go.Figure() 273 | 274 | # For each condition, 275 | for treatment in treatment_list: 276 | 277 | stat_list = [] # List to contain the sumary statistic plotted on top. 278 | treat_subdf = sp_df.loc[sp_df['Treatment'] == treatment] 279 | rep_list = pd.unique(treat_subdf['Replicate']) 280 | n_reps = len(rep_list) 281 | # For each replicate 282 | for i, rep in enumerate(rep_list): # Using enumerate to keep track of the # of reps 283 | ''' 284 | Note: This is needed to manually force the summary points to spread out along 285 | the x-dimension. Need to specify their position, based on the number of replicates. 286 | Important: 287 | rep gives the id of the replicate, which determines the color. This may 288 | or may not be shared with the other condition, depending on input data. 289 | i is the index relative to len(rep_list), used to distinguish between replicates 290 | of the same group. 291 | ''' 292 | # Use seperate index to choose colors 293 | if(i < len(colors)): 294 | ci = i 295 | else: 296 | ci = i - len(colors) 297 | 298 | rel_pos = -0.5 + i / n_reps 299 | rep_subdf = treat_subdf.loc[treat_subdf['Replicate'] == rep] 300 | 301 | # Draw the swarm plots 302 | fig.add_trace(go.Box(y=rep_subdf[factor].values,#y0, 303 | name=treatment,#treatment_list[0], 304 | opacity=1, 305 | marker={ 306 | 'color':'rgb' + str(tuple(colors[ci,:])),# tuple(colors[ci,:])#rep] 307 | }, 308 | fillcolor='rgba(0,0,0,0)', 309 | boxpoints='all', 310 | jitter=0.8, 311 | line={ 312 | 'width': 0 313 | }, 314 | pointpos=0)) 315 | 316 | # Save trace data to a list to draw summary stats on top. 
317 | trace_data = go.Box(y=[np.mean(rep_subdf[factor].values)],#y0)], 318 | name=treatment, 319 | opacity=1, 320 | marker={ 321 | 'size':20, 322 | 'color': 'rgb' + str(tuple(colors[ci,:])),#colors[ci,:],#rep], 323 | 'line': { 324 | 'color': 'black', 325 | 'width': 2 326 | } 327 | }, 328 | fillcolor='rgba(0,0,0,0)', 329 | boxpoints='all', 330 | jitter=0, 331 | line={ 332 | 'width': 0 333 | }, 334 | pointpos=rel_pos) 335 | 336 | stat_list.append(trace_data) 337 | 338 | # After all replicates are drawn, THEN draw the summary stats fig_data 339 | for stat in stat_list: 340 | fig.add_trace(stat) 341 | 342 | fig.update_layout(showlegend=False, 343 | plot_bgcolor = 'white', 344 | yaxis_title=factor, 345 | title_text="Superplots: "+factor, 346 | font=dict( 347 | #family="Courier New, monospace", 348 | size=PLOT_TEXT_SIZE, #CHANGED BY MJS 349 | # size=PLOT_TEXT_SIZE, 350 | color="Black")) 351 | 352 | # Show the axis frame, and optionally the grid 353 | fig.update_xaxes(showline=True, linewidth=1, linecolor='black') 354 | fig.update_yaxes(showline=True, linewidth=1, linecolor='black') 355 | 356 | if(grid): 357 | fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='black') 358 | 359 | 360 | if STATIC_PLOTS and DRAW_SUPERPLOTS: 361 | 362 | fig.write_image(save_path + factor +'_superplots_plotly_t_'+str(t)+'.png') 363 | 364 | if PLOTS_IN_BROWSER: 365 | fig.show() 366 | 367 | # Superplots retuns the figure object, not to be added to subplot figure 368 | return fig# graphJSON = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) 369 | -------------------------------------------------------------------------------- /cellPLATO/environment.yml: -------------------------------------------------------------------------------- 1 | name: cellplato 2 | channels: 3 | - conda-forge 4 | - david_baddeley 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - ca-certificates=2022.9.24=h5b45459_0 9 | - nodejs=18.11.0=h57928b3_0 10 | - openssl=3.0.5=h8ffe710_1 11 | - pip=22.1.2=pyhd8ed1ab_0 12 | - python=3.7.12=h900ac77_100_cpython 13 | - python_abi=3.7=2_cp37m 14 | - setuptools=63.1.0=py37h03978a9_0 15 | - sqlite=3.39.0=h8ffe710_0 16 | - ucrt=10.0.20348.0=h57928b3_0 17 | - vc=14.2=hb210afc_6 18 | - vs2015_runtime=14.29.30037=h902a5da_6 19 | - wheel=0.37.1=pyhd8ed1ab_0 20 | - pip: 21 | - alabaster==0.7.12 22 | - anyio==3.6.1 23 | - appdirs==1.4.4 24 | - argon2-cffi==21.3.0 25 | - argon2-cffi-bindings==21.2.0 26 | - astunparse==1.6.3 27 | - attrdict==2.0.1 28 | - attrs==22.1.0 29 | - babel==2.10.3 30 | - backcall==0.2.0 31 | - beautifulsoup4==4.11.1 32 | - bleach==5.0.1 33 | - bokeh==2.3.3 34 | - btrack==0.4.0 35 | - cachetools==4.2.4 36 | - cachey==0.2.1 37 | - certifi==2022.6.15 38 | - cffi==1.15.1 39 | - chardet==3.0.4 40 | - click==8.1.3 41 | - cloudpickle==2.1.0 42 | - colorama==0.4.5 43 | - colorcet==3.0.0 44 | - cvxopt==1.3.0 45 | - cycler==0.11.0 46 | - cython==0.29.30 47 | - cytoolz==0.11.2 48 | - dask==2.30.0 49 | - datashader==0.13.0 50 | - datashape==0.5.2 51 | - debugpy==1.6.2 52 | - decorator==5.1.1 53 | - defusedxml==0.7.1 54 | - distributed==2.30.1 55 | - docstring-parser==0.14.1 56 | - docutils==0.18.1 57 | - entrypoints==0.4 58 | - et-xmlfile==1.1.0 59 | - eth-abi==1.3.0 60 | - eth-account==0.3.0 61 | - eth-hash==0.3.3 62 | - eth-keyfile==0.5.1 63 | - eth-keys==0.2.4 64 | - eth-rlp==0.2.1 65 | - eth-typing==2.3.0 66 | - eth-utils==1.10.0 67 | - fastjsonschema==2.15.3 68 | - flask==1.0.2 69 | - fonttools==4.38.0 70 | - freetype-py==2.3.0 71 | - google-api-core==1.31.6 72 | - google-auth==1.35.0 73 
| - google-cloud==0.34.0 74 | - google-cloud-core==1.7.2 75 | - google-cloud-storage==1.23.0 76 | - google-resumable-media==0.5.1 77 | - googleapis-common-protos==1.56.3 78 | - h5py==3.7.0 79 | - hdbscan==0.8.28 80 | - heapdict==1.0.1 81 | - hexbytes==0.1.0 82 | - hsluv==5.0.3 83 | - idna==2.8 84 | - imageio==2.21.1 85 | - imagesize==1.4.1 86 | - importlib-metadata==4.12.0 87 | - importlib-resources==5.9.0 88 | - ipykernel==6.15.1 89 | - ipython==7.34.0 90 | - ipython-genutils==0.2.0 91 | - ipywidgets==7.7.1 92 | - itsdangerous==2.1.2 93 | - jedi==0.18.1 94 | - jinja2==3.0.1 95 | - joblib==1.1.0 96 | - json5==0.9.8 97 | - jsonschema==4.9.1 98 | - jupyter==1.0.0 99 | - jupyter-client==7.3.4 100 | - jupyter-console==6.4.4 101 | - jupyter-core==4.10.0 102 | - jupyter-server==1.18.0 103 | - jupyterlab==3.4.3 104 | - jupyterlab-pygments==0.2.2 105 | - jupyterlab-server==2.10.3 106 | - jupyterlab-widgets==1.1.1 107 | - kaleido==0.1.0.post1 108 | - kiwisolver==1.4.4 109 | - llvmlite==0.34.0 110 | - locket==1.0.0 111 | - lru-dict==1.1.7 112 | - magicgui==0.5.1 113 | - markdown==3.3.7 114 | - markupsafe==2.1.1 115 | - matplotlib==3.5.3 116 | - matplotlib-inline==0.1.3 117 | - mistune==0.8.4 118 | - msgpack==1.0.4 119 | - multipledispatch==0.6.0 120 | - napari==0.4.12 121 | - napari-console==0.0.4 122 | - napari-plugin-engine==0.2.0 123 | - napari-svg==0.1.6 124 | - nbclassic==0.4.0 125 | - nbclient==0.6.6 126 | - nbconvert==6.5.0 127 | - nbformat==5.4.0 128 | - nest-asyncio==1.5.5 129 | - networkx==2.6.3 130 | - notebook==6.4.12 131 | - notebook-shim==0.1.0 132 | - numba==0.51.2 133 | - numpy==1.21.0 134 | - numpydoc==1.4.0 135 | - openpyxl==3.0.7 136 | - opentsne==0.5.1 137 | - packaging==21.3 138 | - pandas==1.2.3 139 | - pandocfilters==1.5.0 140 | - panel==0.11.0 141 | - param==1.12.2 142 | - parsimonious==0.8.1 143 | - parso==0.8.3 144 | - partd==1.2.0 145 | - patsy==0.5.2 146 | - pdoc==12.0.2 147 | - pickleshare==0.7.5 148 | - pillow==9.2.0 149 | - pint==0.18 150 | - pkgutil-resolve-name==1.3.10 151 | - plotly==4.14.1 152 | - prometheus-client==0.14.1 153 | - prompt-toolkit==3.0.30 154 | - protobuf==3.20.1 155 | - psutil==5.9.1 156 | - psygnal==0.3.5 157 | - pyasn1-modules==0.2.8 158 | - pycparser==2.21 159 | - pycryptodome==3.15.0 160 | - pyct==0.4.8 161 | - pydantic==1.9.1 162 | - pygments==2.12.0 163 | - pynndescent==0.5.7 164 | - pyopengl==3.1.6 165 | - pyparsing==3.0.9 166 | - pypiwin32==223 167 | - pyrsistent==0.18.1 168 | - python-dateutil==2.8.2 169 | - python-ternary==1.0.8 170 | - pyviz-comms==2.2.0 171 | - pywavelets==1.3.0 172 | - pywin32==304 173 | - pywinpty==2.0.7 174 | - pyyaml==6.0 175 | - pyzmq==23.2.0 176 | - qtconsole==5.3.1 177 | - qtpy==2.1.0 178 | - requests==2.21.0 179 | - retrying==1.3.3 180 | - rlp==2.0.1 181 | - rsa==4.8 182 | - scikit-image==0.16.1 183 | - scikit-learn==0.23.2 184 | - scipy==1.7.3 185 | - seaborn==0.12.2 186 | - send2trash==1.8.0 187 | - similaritymeasures==0.4.4 188 | - simplification==0.6.1 189 | - sniffio==1.2.0 190 | - snowballstemmer==2.2.0 191 | - sortedcontainers==2.4.0 192 | - soupsieve==2.3.2.post1 193 | - sphinx==5.0.2 194 | - sphinxcontrib-applehelp==1.0.2 195 | - sphinxcontrib-devhelp==1.0.2 196 | - sphinxcontrib-htmlhelp==2.0.0 197 | - sphinxcontrib-jsmath==1.0.1 198 | - sphinxcontrib-qthelp==1.0.3 199 | - sphinxcontrib-serializinghtml==1.1.5 200 | - statsmodels==0.12.2 201 | - superqt==0.3.2 202 | - tblib==1.7.0 203 | - terminado==0.15.0 204 | - threadpoolctl==3.1.0 205 | - tifffile==2021.11.2 206 | - tinycss2==1.1.1 207 | - 
toolz==0.11.2 208 | - tornado==6.2 209 | - tqdm==4.60.0 210 | - traitlets==5.3.0 211 | - typing-extensions==4.3.0 212 | - umap-learn==0.5.2 213 | - urllib3==1.24.3 214 | - vispy==0.11.0 215 | - wcwidth==0.2.5 216 | - web3==4.8.2 217 | - webencodings==0.5.1 218 | - websocket-client==1.3.3 219 | - websockets==6.0 220 | - werkzeug==2.1.2 221 | - widgetsnbextension==3.6.1 222 | - wrapt==1.14.1 223 | - xarray==0.14.1 224 | - zict==2.2.0 225 | prefix: C:\ProgramData\Anaconda3\envs\cellplato_gitversion 226 | -------------------------------------------------------------------------------- /cellPLATO/environment_oldversion.yml: -------------------------------------------------------------------------------- 1 | name: cellPLATO 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ca-certificates=2022.6.15=h5b45459_0 7 | - openssl=3.0.4=h8ffe710_2 8 | - pip=22.1.2=pyhd8ed1ab_0 9 | - python=3.7.12=h900ac77_100_cpython 10 | - python_abi=3.7=2_cp37m 11 | - setuptools=63.1.0=py37h03978a9_0 12 | - sqlite=3.39.0=h8ffe710_0 13 | - ucrt=10.0.20348.0=h57928b3_0 14 | - vc=14.2=hb210afc_6 15 | - vs2015_runtime=14.29.30037=h902a5da_6 16 | - wheel=0.37.1=pyhd8ed1ab_0 17 | - pip: 18 | - alabaster==0.7.12 19 | - anyio==3.6.1 20 | - appdirs==1.4.4 21 | - astunparse==1.6.3 22 | - attrdict==2.0.1 23 | - babel==2.10.3 24 | - beautifulsoup4==4.11.1 25 | - bokeh==2.3.3 26 | - btrack==0.4.0 27 | - cachetools==4.2.4 28 | - cachey==0.2.1 29 | - certifi==2022.6.15 30 | - chardet==3.0.4 31 | - click==8.1.3 32 | - cloudpickle==2.1.0 33 | - colorcet==3.0.0 34 | - cvxopt==1.3.0 35 | - cython==0.29.30 36 | - cytoolz==0.11.2 37 | - dask==2.30.0 38 | - datashader==0.13.0 39 | - datashape==0.5.2 40 | - distributed==2.30.1 41 | - docstring-parser==0.14.1 42 | - docutils==0.18.1 43 | - et-xmlfile==1.1.0 44 | - eth-abi==1.3.0 45 | - eth-account==0.3.0 46 | - eth-hash==0.3.3 47 | - eth-keyfile==0.5.1 48 | - eth-keys==0.2.4 49 | - eth-rlp==0.2.1 50 | - eth-typing==2.3.0 51 | - eth-utils==1.10.0 52 | - fastjsonschema==2.15.3 53 | - flask==1.0.2 54 | - freetype-py==2.3.0 55 | - google-api-core==1.31.6 56 | - google-auth==1.35.0 57 | - google-cloud==0.34.0 58 | - google-cloud-core==1.7.2 59 | - google-cloud-storage==1.23.0 60 | - google-resumable-media==0.5.1 61 | - googleapis-common-protos==1.56.3 62 | - hdbscan==0.8.28 63 | - heapdict==1.0.1 64 | - hexbytes==0.1.0 65 | - hsluv==5.0.3 66 | - idna==2.8 67 | - imagesize==1.4.1 68 | - importlib-metadata==4.12.0 69 | - itsdangerous==2.1.2 70 | - jinja2==3.0.1 71 | - joblib==1.1.0 72 | - json5==0.9.8 73 | - jupyter==1.0.0 74 | - jupyter-client==7.3.4 75 | - jupyter-console==6.4.4 76 | - jupyter-core==4.10.0 77 | - jupyter-server==1.18.0 78 | - jupyterlab==3.4.3 79 | - jupyterlab-server==2.10.3 80 | - kaleido==0.1.0.post1 81 | - llvmlite==0.34.0 82 | - locket==1.0.0 83 | - lru-dict==1.1.7 84 | - magicgui==0.5.1 85 | - markdown==3.3.7 86 | - markupsafe==2.1.1 87 | - matplotlib==3.1.1 88 | - msgpack==1.0.4 89 | - multipledispatch==0.6.0 90 | - napari==0.4.12 91 | - napari-console==0.0.4 92 | - napari-plugin-engine==0.2.0 93 | - napari-svg==0.1.6 94 | - nbclassic==0.4.0 95 | - nbconvert==6.5.0 96 | - nbformat==5.4.0 97 | - nest-asyncio==1.5.5 98 | - notebook-shim==0.1.0 99 | - numba==0.51.2 100 | - numpy==1.21.0 101 | - numpydoc==1.4.0 102 | - openpyxl==3.0.7 103 | - opentsne==0.5.1 104 | - pandas==1.2.3 105 | - panel==0.11.0 106 | - param==1.12.2 107 | - parsimonious==0.8.1 108 | - partd==1.2.0 109 | - patsy==0.5.2 110 | - pdoc==12.0.2 111 | - pillow==9.2.0 112 | - pint==0.18 113 
| - plotly==4.14.1 114 | - protobuf==3.20.1 115 | - psutil==5.9.1 116 | - psygnal==0.3.5 117 | - pyasn1==0.4.8 118 | - pyasn1-modules==0.2.8 119 | - pycryptodome==3.15.0 120 | - pyct==0.4.8 121 | - pydantic==1.9.1 122 | - pygments==2.12.0 123 | - pynndescent==0.5.7 124 | - pyopengl==3.1.6 125 | - pypiwin32==223 126 | - python-dateutil==2.8.2 127 | - python-ternary==1.0.8 128 | - pyviz-comms==2.2.0 129 | - pyyaml==6.0 130 | - pyzmq==23.2.0 131 | - qtconsole==5.3.1 132 | - qtpy==2.1.0 133 | - requests==2.21.0 134 | - rlp==2.0.1 135 | - rsa==4.8 136 | - scikit-image==0.16.1 137 | - scikit-learn==0.23.2 138 | - scipy==1.7.3 139 | - seaborn==0.11.0 140 | - send2trash==1.8.0 141 | - similaritymeasures==0.4.4 142 | - simplification==0.6.1 143 | - sniffio==1.2.0 144 | - snowballstemmer==2.2.0 145 | - sortedcontainers==2.4.0 146 | - soupsieve==2.3.2.post1 147 | - sphinx==5.0.2 148 | - sphinxcontrib-applehelp==1.0.2 149 | - sphinxcontrib-devhelp==1.0.2 150 | - sphinxcontrib-htmlhelp==2.0.0 151 | - sphinxcontrib-jsmath==1.0.1 152 | - sphinxcontrib-qthelp==1.0.3 153 | - sphinxcontrib-serializinghtml==1.1.5 154 | - statsmodels==0.12.2 155 | - superqt==0.3.2 156 | - tblib==1.7.0 157 | - threadpoolctl==3.1.0 158 | - tifffile==2021.11.2 159 | - tinycss2==1.1.1 160 | - toolz==0.11.2 161 | - tornado==6.2 162 | - tqdm==4.60.0 163 | - traitlets==5.3.0 164 | - umap-learn==0.5.2 165 | - urllib3==1.24.3 166 | - vispy==0.11.0 167 | - web3==4.8.2 168 | - websocket-client==1.3.3 169 | - websockets==6.0 170 | - werkzeug==2.1.2 171 | - wrapt==1.14.1 172 | - xarray==0.14.1 173 | - zict==2.2.0 174 | prefix: C:\Users\tyler\Anaconda3\envs\cellPLATO 175 | -------------------------------------------------------------------------------- /cellPLATO/images/cellPLATOlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Michael-shannon/cellPLATO/8d19af543653479bf34c8833da041ae195ce03dd/cellPLATO/images/cellPLATOlogo.png -------------------------------------------------------------------------------- /cellPLATO/requirements.txt: -------------------------------------------------------------------------------- 1 | #btrack==0.4.5 2 | requests==2.21.0 3 | #pandas==0.25.3 4 | 5 | datashader==0.13.0 6 | Flask==1.0.2 7 | google-cloud==0.34.0 8 | google-cloud-storage==1.23.0 9 | h5py==3.1.0 10 | hdbscan==0.8.28 11 | hexbytes==0.1.0 12 | imageio==2.6.1 13 | ipykernel 14 | jinja2==3.0.1 15 | jupyter==1.0.0 16 | jupyterlab 17 | kaleido==0.1.0.post1 18 | llvmlite==0.38 19 | matplotlib==3.1.1 20 | #napari 21 | notebook 22 | #numpy==1.19.5 23 | numpy==1.21 24 | openpyxl==3.0.7 25 | openTSNE==0.5.1 26 | pandas==1.2.3 27 | panel==0.11.0 28 | plotly==4.14.1 29 | python-ternary==1.0.8 30 | scikit-image==0.16.1 31 | scikit-learn==0.23.2 32 | scipy==1.7.3 33 | simplification==0.6.1 34 | statsmodels==0.12.2 35 | seaborn==0.11.0 36 | similaritymeasures==0.4.4 37 | tqdm==4.60 38 | umap-learn==0.5.2 39 | web3==4.8.2 40 | -------------------------------------------------------------------------------- /cellPLATO/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='cellPLATO', 5 | packages=find_packages(), 6 | ) 7 | -------------------------------------------------------------------------------- /cellPLATO/tests/testing.py: -------------------------------------------------------------------------------- 1 | #testing.py 2 | 3 | 4 | ''' 5 | Old Module: 6 | ''' 7 | # 8 | 9 | # # 
Import everything:
10 | # import sys
11 | # sys.path.append("..") # Adds higher directory to python modules path.
12 | 
13 | 
14 | # from old_module.config import *
15 | # from old_module.comparative_visualization import *
16 | # from old_module.spacetimecube import *
17 | # from old_module.data_visualization import *
18 | # from old_module.data_processing import time_average, average_per_condition, clean_comb_df, migration_calcs, format_for_superplots
19 | # from old_module.data_processing import get_data_matrix, do_tsne, do_pca, dbscan_clustering, get_label_counts
20 | # from old_module.data_processing import factor_calibration, stats_table
21 | # from old_module.combine_compare import load_data, get_experiments, combine_dataframes, csv_summary
22 | # from old_module.tsne_embedding import do_open_tsne
23 | # from old_module.pipelines import process_ind_exp
24 | # from old_module.panel_app import *
25 | # from old_module.param_sweep import *
26 | # from old_module.segmentations import *
27 | #
28 | # from old_module.dev_funcs_uncategorized import *
29 | 
30 | 
31 | '''
32 | New Module:
33 | '''
34 | 
35 | from initialization.config import *
36 | 
37 | from data_processing.cell_identifier import *
38 | from data_processing.cleaning_formatting_filtering import *
39 | from data_processing.clustering import *
40 | from data_processing.data_io import *
41 | from data_processing.data_wrangling import *
42 | from data_processing.dimensionality_reduction import *
43 | from data_processing.measurements import *
44 | from data_processing.migration_calculations import *
45 | from data_processing.pipelines import *
46 | from data_processing.shape_calculations import *
47 | from data_processing.time_calculations import *
48 | from data_processing.trajectory_clustering import *
49 | 
50 | from visualization.cluster_visualization import *
51 | from visualization.filter_visualization import *
52 | from visualization.low_dimension_visualization import *
53 | from visualization.panel_apps import *
54 | from visualization.timecourse_visualization import *
55 | from visualization.trajectory_visualization import *
56 | 
57 | print('Successfully imported all modules without error.')
58 | 
--------------------------------------------------------------------------------
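
Note: tests/testing.py above is a bare import smoke test; it star-imports each cellPLATO subpackage and prints a confirmation message. A minimal sketch of reproducing that check from the repository root is shown below. It assumes an environment built from cellPLATO/environment.yml (or cellPLATO/requirements.txt) is active, and that the bare imports resolve once the inner cellPLATO/cellPLATO directory is on sys.path, as the import style in testing.py implies. The three imports are a representative subset, not the full list; the script name is illustrative only.

    # smoke_check.py -- illustrative sketch, not part of the repository
    import sys

    # testing.py imports subpackages without a top-level "cellPLATO." prefix,
    # so the inner package directory must be importable directly.
    sys.path.insert(0, "cellPLATO/cellPLATO")

    from initialization.config import *                    # experiment settings
    from data_processing.pipelines import *                # processing pipelines
    from visualization.trajectory_visualization import *   # plotting helpers

    print("Representative cellPLATO modules imported without error.")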