├── tests ├── __init__.py ├── data │ ├── file2.txt │ ├── file.txt │ ├── ccc-example-coef.pkl │ ├── ccc-example-data.pkl │ ├── ccc-random_data-coef.pkl │ ├── ccc-random_data-data.pkl │ └── README.md ├── README.md ├── test_log.py ├── test_methods.py ├── test_pytorch_core.py ├── test_scipy_stats.py └── test_conf.py ├── libs └── ccc │ ├── numpy │ └── __init__.py │ ├── pytorch │ └── __init__.py │ ├── scipy │ └── __init__.py │ ├── sklearn │ └── __init__.py │ ├── utils │ └── __init__.py │ ├── __init__.py │ ├── coef │ └── __init__.py │ ├── methods.py │ ├── log_config.yaml │ ├── log.py │ ├── settings.py │ └── corr.py ├── .gitattributes ├── misc └── logo │ └── ccc.png ├── scripts ├── styler.r ├── jupytext_sync.sh ├── touch_pys.sh ├── env.sh ├── create_docker_image.sh ├── run_nbs_server.sh ├── convert_ipynb_to_py.sh ├── rsync.sh └── run_docker.sh ├── nbs ├── .jupytext ├── others │ └── 05_clustermatch_profiling │ │ ├── 10_cm_optimized │ │ ├── 07-n_samples_large.txt │ │ ├── 07-n_samples_small.txt │ │ ├── 04-n_samples_large.txt │ │ ├── 04-n_samples_small.txt │ │ ├── 00-n_samples_large.txt │ │ ├── 00-n_samples_small.txt │ │ ├── 01-n_samples_large.txt │ │ ├── 01-n_samples_small.txt │ │ ├── 02-n_samples_large.txt │ │ ├── 02-n_samples_small.txt │ │ ├── 03-n_samples_large.txt │ │ ├── 03-n_samples_small.txt │ │ ├── 08-n_samples_small_50.txt │ │ ├── 08-n_samples_large_50000.txt │ │ ├── 09-n_samples_large_50000.txt │ │ ├── 09-n_samples_small_1000.txt │ │ ├── 08-n_samples_small_100.txt │ │ ├── 08-n_samples_small_500.txt │ │ ├── 09-n_samples_small_100.txt │ │ ├── 09-n_samples_small_50.txt │ │ ├── 09-n_samples_small_500.txt │ │ ├── 08-n_samples_large_100000.txt │ │ ├── 09-n_samples_large_100000.txt │ │ ├── 08-n_samples_small_1000.txt │ │ ├── 10-n_samples_small_50.txt │ │ ├── 10-n_samples_large_50000.txt │ │ ├── 10-n_samples_small_100.txt │ │ ├── 10-n_samples_small_1000.txt │ │ ├── 10-n_samples_small_500.txt │ │ ├── 10-n_samples_large_100000.txt │ │ ├── 06-n_samples_large.txt │ │ ├── 05-n_samples_small.txt │ │ ├── 05-n_samples_large.txt │ │ ├── 06-n_samples_small.txt │ │ └── py │ │ │ ├── 01-cdist_parts_v00.py │ │ │ ├── 00-run_reference.py │ │ │ ├── 04-get_parts_v00.py │ │ │ ├── 02-cdist_parts_v01.py │ │ │ ├── 03-cdist_parts_v02.py │ │ │ ├── 05-get_parts_v01.py │ │ │ ├── 07-get_parts_v03.py │ │ │ └── 06-get_parts_v02.py │ │ ├── README.md │ │ ├── 05_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 07-many_samples.py │ │ │ ├── 05-compare_precomputing_of_parts.py │ │ │ └── 04-compare_numba_ari.py │ │ ├── 04-cm_ari_numba.txt │ │ ├── 04-cm_ari_sklearn.txt │ │ ├── 05-cm_precompute_parts_false.txt │ │ ├── 05-cm_precompute_parts_true.txt │ │ ├── 06-cm_many_genes.txt │ │ ├── 07-cm_many_samples-less_internal_n_clusters.txt │ │ └── 07-cm_many_samples-default_internal_n_clusters.txt │ │ ├── 06_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 04-compare_numba_ari.py │ │ │ └── 07-many_samples.py │ │ └── 06-cm_many_genes.txt │ │ ├── 07_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 04-compare_numba_ari.py │ │ │ └── 07-many_samples.py │ │ └── 04-cm_ari_numba.txt │ │ ├── 11_cm_optimized │ │ └── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 08-many_genes.py │ │ │ ├── 07-many_samples.py │ │ │ └── 09-many_samples.py │ │ └── 12_cm_optimized │ │ └── py │ │ ├── 06-many_genes.py │ │ ├── 10-many_genes.py │ │ ├── 08-many_genes.py │ │ ├── 07-many_samples.py │ │ ├── 11-many_samples.py │ │ └── 09-many_samples.py ├── run_nbs.sh ├── 25_pvalue │ └── py │ │ ├── 00-ccc_pvalue_dist-generate-data_matrix.py │ │ └── 
01-ccc_pvalue_dist-generate-gene_pairs.py ├── 99_manuscript │ ├── k_max │ │ └── py │ │ │ └── 01-k_max-runs.py │ └── giant │ │ └── py │ │ └── 03_00-giant-get_gene_info.py └── 20_comparison_others │ └── py │ ├── 60-time_test-1_cpu_core.py │ ├── 61-time_test-3_cpu_cores.py │ └── 62-time_test-6_cpu_cores.py ├── .dockerignore ├── setup.cfg ├── entrypoint.sh ├── environment ├── scripts │ ├── install_other_packages.sh │ ├── environment_base.yml │ └── install_r_packages.r └── environment.yml ├── LICENSE_bundled ├── .github └── workflows │ └── lint.yaml ├── setup.py ├── Dockerfile ├── .gitignore └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/scipy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/data/* -text 2 | -------------------------------------------------------------------------------- /tests/data/file2.txt: -------------------------------------------------------------------------------- 1 | another file 2 | -------------------------------------------------------------------------------- /tests/data/file.txt: -------------------------------------------------------------------------------- 1 | a file with some content 2 | -------------------------------------------------------------------------------- /misc/logo/ccc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/misc/logo/ccc.png -------------------------------------------------------------------------------- /libs/ccc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from ccc.utils.utility_functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /libs/ccc/__init__.py: -------------------------------------------------------------------------------- 1 | # Remember to also update the version in setup.py when changing it here 2 | __version__ = "0.2.2" 3 | -------------------------------------------------------------------------------- /tests/data/ccc-example-coef.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-example-coef.pkl -------------------------------------------------------------------------------- /tests/data/ccc-example-data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-example-data.pkl
-------------------------------------------------------------------------------- /tests/data/ccc-random_data-coef.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-random_data-coef.pkl -------------------------------------------------------------------------------- /tests/data/ccc-random_data-data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-random_data-data.pkl -------------------------------------------------------------------------------- /scripts/styler.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args <- commandArgs(trailingOnly = TRUE) 4 | file_name <- args[1L] 5 | styler::style_file(file_name) 6 | -------------------------------------------------------------------------------- /nbs/.jupytext: -------------------------------------------------------------------------------- 1 | cell_metadata_filter = "all,-execution,-papermill,-trusted" 2 | notebook_metadata_filter = "-jupytext.text_representation.jupytext_version" 3 | formats = "ipynb,py//auto:percent" 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | data/ 3 | images/ 4 | base/ 5 | 6 | # git 7 | .git/ 8 | .gitignore 9 | .github/ 10 | 11 | # python 12 | .idea/ 13 | .pytest_cache/ 14 | **/__pycache__ 15 | *.py[cod] 16 | 17 | # other 18 | *.swp 19 | -------------------------------------------------------------------------------- /libs/ccc/coef/__init__.py: -------------------------------------------------------------------------------- 1 | from ccc.coef.impl import * # noqa: F403, F401 2 | 3 | # Run CCC once to initialize/compile its functions with numba 4 | from ccc.coef.impl import ccc 5 | import numpy as np 6 | 7 | ccc(np.random.rand(10), np.random.rand(10)) 8 | -------------------------------------------------------------------------------- /scripts/jupytext_sync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is used to go through all Jupyter notebooks, run black on the text 4 | # representation of the code, and sync it with the ipynb file. 5 | 6 | parallel 'jupytext --sync --pipe black {}' ::: nbs/**/*.ipynb 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | E501, 4 | W503 5 | exclude = 6 | # No need to traverse our git directory 7 | .git, 8 | setup.py, 9 | max-line-length = 88 10 | per-file-ignores = 11 | nbs/**/py/*.py:E302,E305,E402,F821 -------------------------------------------------------------------------------- /scripts/touch_pys.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Updates the timestamps of all python scripts (.py) converted from notebooks. 4 | # This is sometimes needed when git updates files after a pull; otherwise 5 | # Jupyter won't load the notebooks in the browser. 6 | 7 | find . 
-type f -wholename "**/py/*.py" -exec touch {} + 8 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash --login 2 | # Taken from here with modifications: https://pythonspeed.com/articles/activate-conda-dockerfile/ 3 | # The --login ensures the bash configuration is loaded, 4 | # enabling Conda. 5 | 6 | set +eu 7 | conda activate ccc 8 | set -euo pipefail 9 | 10 | # load environment variables 11 | eval `python libs/ccc/conf.py` 12 | 13 | exec "$@" 14 | 15 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests 2 | 3 | ## Run 4 | 5 | These are the instructions to run the unit tests. It is assumed that you have already 6 | followed the steps to set up the environment and download the needed data, and that 7 | your `PYTHONPATH` and `CM_ROOT_DIR` variables are set appropriately. 8 | 9 | Execute this command to run the unit tests: 10 | 11 | ```bash 12 | pytest -rs --color=yes tests/ 13 | ``` 14 | -------------------------------------------------------------------------------- /libs/ccc/methods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains other correlation methods. 3 | """ 4 | import warnings 5 | 6 | from minepy.mine import MINE 7 | 8 | 9 | def mic(x, y, estimator="mic_approx"): 10 | """ 11 | Given two arrays (x and y), it computes MIC with the given estimator and default parameters. 12 | """ 13 | with warnings.catch_warnings(): 14 | warnings.filterwarnings("ignore", category=DeprecationWarning) 15 | 16 | mine = MINE(alpha=0.6, c=15, est=estimator) 17 | mine.compute_score(x, y) 18 | return mine.mic() 19 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the log.py module. 3 | """ 4 | 5 | 6 | def test_log_module_load(): 7 | from ccc import log 8 | 9 | assert log is not None 10 | assert log.__file__ is not None 11 | 12 | 13 | def test_log_get_logger(): 14 | from ccc import log 15 | 16 | logger = log.get_logger("testing") 17 | assert logger is not None 18 | assert hasattr(logger, "info") 19 | assert hasattr(logger, "debug") 20 | assert hasattr(logger, "error") 21 | 22 | logger.info("test") 23 | logger.warning("test warn") 24 | -------------------------------------------------------------------------------- /scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file exports some common environment variables needed to run the code. It 4 | # has to be customized for your needs by changing the BASE_DIR and CM_N_JOBS 5 | # below. 6 | 7 | # Your settings here 8 | # BASE_DIR is the parent directory where the code and manuscript repos are 9 | # located. 
10 | BASE_DIR=/home/miltondp/projects/ccc/greenelab/ 11 | export CM_N_JOBS=20 12 | 13 | export CM_ROOT_DIR=${BASE_DIR}/ccc/base 14 | export CM_MANUSCRIPT_DIR=${BASE_DIR}/ccc-manuscript/ 15 | 16 | export PYTHONPATH=${BASE_DIR}/ccc/libs/ 17 | 18 | -------------------------------------------------------------------------------- /environment/scripts/install_other_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script installs other dependencies that cannot be directly installed using conda. 4 | 5 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | 7 | # Fix tqdm with JupyterLab: https://github.com/tqdm/tqdm/issues/394#issuecomment-384743637 8 | # jupyter nbextension enable --py widgetsnbextension 9 | 10 | # jupyter labextension install @jupyter-widgets/jupyterlab-manager 11 | 12 | # 13 | # R dependencies 14 | # 15 | TAR=$(which tar) Rscript ${SCRIPT_DIR}/install_r_packages.r 16 | 17 | -------------------------------------------------------------------------------- /libs/ccc/log_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | 3 | formatters: 4 | simple: 5 | format: "[%(asctime)s - %(name)s] %(levelname)s: %(message)s" 6 | 7 | handlers: 8 | empty: 9 | class: logging.NullHandler 10 | 11 | console: 12 | class: logging.StreamHandler 13 | level: INFO 14 | formatter: simple 15 | 16 | file_handler: 17 | class: logging.FileHandler 18 | level: DEBUG 19 | filename: logging.txt 20 | formatter: simple 21 | delay: true 22 | 23 | loggers: 24 | none: 25 | handlers: [empty] 26 | propagate: false 27 | 28 | root: 29 | handlers: [console] 30 | level: INFO 31 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/07-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 14 function calls in 5.605 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 5.605 5.605 {built-in method builtins.exec} 7 | 1 0.000 0.000 5.605 5.605 :1() 8 | 1 0.000 0.000 5.605 5.605 691993785.py:1(func) 9 | 10 5.605 0.560 5.605 0.560 coef.py:254(_cm) 10 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/07-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 14 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 691993785.py:1(func) 9 | 10 0.033 0.003 0.033 0.003 coef.py:254(_cm) 10 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /LICENSE_bundled: -------------------------------------------------------------------------------- 1 | The Clustermatch Correlation Coefficient (CCC) repository and source 2 | distributions bundle a number of libraries that are compatibly 3 | licensed. We list these here. 
4 | 5 | Name: scikit-learn 6 | Files: libs/ccc/sklearn/* 7 | License: BSD 3-Clause License 8 | For details, see the header inside libs/ccc/sklearn/metrics.py 9 | 10 | Name: SciPy 11 | Files: libs/ccc/scipy/* 12 | License: BSD 3-Clause License 13 | For details, see the header inside libs/ccc/scipy/stats.py 14 | 15 | Name: PyTorch 16 | Files: libs/ccc/pytorch/* 17 | License: BSD License 18 | For details, see the header inside libs/ccc/pytorch/core.py 19 | -------------------------------------------------------------------------------- /environment/scripts/environment_base.yml: -------------------------------------------------------------------------------- 1 | name: ccc 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ipython 7 | - ipywidgets 8 | - jupyterlab 9 | - jupytext 10 | - matplotlib 11 | - minepy 12 | - numba 13 | - numpy 14 | - openpyxl 15 | - pandas 16 | - papermill 17 | - pip 18 | - pytables 19 | - pytest 20 | - python=3.9.* 21 | - pyyaml 22 | - requests 23 | - r-base 24 | - r-devtools 25 | - r-essentials 26 | - r-reticulate 27 | - r-svglite 28 | - rpy2 29 | - scikit-learn 30 | - scipy 31 | - seaborn 32 | - svgutils 33 | - tabulate 34 | - tqdm 35 | - upsetplot 36 | 37 | -------------------------------------------------------------------------------- /libs/ccc/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides logging functions. 3 | """ 4 | import logging 5 | import logging.config 6 | import yaml 7 | 8 | from ccc import conf 9 | 10 | 11 | def _get_logger_config(): 12 | """Reads the logging config file in YAML format.""" 13 | with open(conf.GENERAL["LOG_CONFIG_FILE"], "r") as f: 14 | return yaml.safe_load(f.read()) 15 | 16 | 17 | logging.config.dictConfig(_get_logger_config()) 18 | 19 | 20 | def get_logger(log_name: str = None) -> logging.Logger: 21 | """ 22 | Returns a Logger instance. 23 | 24 | Args: 25 | log_name: logger name. 26 | 27 | Returns: 28 | A Logger instance configured with default settings. 29 | """ 30 | return logging.getLogger(log_name) 31 | -------------------------------------------------------------------------------- /environment/environment.yml: -------------------------------------------------------------------------------- 1 | name: ccc 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ipython=7.* 7 | - ipywidgets 8 | - jupyterlab=3.3.* 9 | - jupytext=1.11.* 10 | - matplotlib=3.4.* 11 | - minepy=1.2.* 12 | - numba=0.53.* 13 | - numpy=1.21.* 14 | - openpyxl=3.0.* 15 | - pandas=1.3.* 16 | - papermill=2.3.* 17 | - pip 18 | - pytables=3.7.* 19 | - pytest=6.* 20 | - python=3.9.* 21 | - pyyaml=5.4.* 22 | - requests=2.* 23 | - r-base=4.1.* 24 | - r-devtools 25 | - r-essentials 26 | - r-reticulate=1.* 27 | - r-svglite=2.* 28 | - rpy2=3.4.* 29 | - scikit-learn=0.24.* 30 | - scipy=1.7.* 31 | - seaborn=0.11.* 32 | - svgutils=0.3.* 33 | - tabulate=0.8.* 34 | - tqdm=4.* 35 | - upsetplot=0.6.* -------------------------------------------------------------------------------- /scripts/create_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_NAME="ccc" 4 | VERSION="1.0" 5 | 6 | CURRENT_IMAGE_ID=$(docker images --filter=reference=miltondp/${PROJECT_NAME}:latest --format "{{.ID}}") 7 | 8 | docker build -t miltondp/${PROJECT_NAME}:${VERSION} -t miltondp/${PROJECT_NAME}:latest . 9 | 10 | read -p "'docker push' new image and retag? 
" -r 11 | echo # (optional) move to a new line 12 | if [[ $REPLY =~ ^[Yy]$ ]]; then 13 | # push version label 14 | echo "Pushing new image to miltondp/${PROJECT_NAME}:${VERSION}" 15 | docker push miltondp/${PROJECT_NAME}:${VERSION} 16 | 17 | # push latest label 18 | echo "Pushing new image as latest" 19 | docker push miltondp/${PROJECT_NAME}:latest 20 | 21 | # retag previous version 22 | docker tag ${CURRENT_IMAGE_ID} miltondp/${PROJECT_NAME}:prev 23 | fi 24 | 25 | -------------------------------------------------------------------------------- /scripts/run_nbs_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The script allows to run a JupyterLab server, listening to local connections 4 | # only by default. 5 | # It accepts only one argument, which could be: 6 | # * "--container-mode": it sets some parameters when starting the jupyter server 7 | # to make it work inside a Docker container. 8 | # * any other value: it is the token that the server will request from users; 9 | # in addition, it will listen to any address (*). 10 | 11 | PORT=8893 12 | 13 | IP="127.0.0.1" 14 | TOKEN="" 15 | EXTRA_ARGS="" 16 | 17 | if [ "$1" = "--container-mode" ]; then 18 | IP="*" 19 | # EXTRA_ARGS="--allow-root" 20 | elif [ ! -z "$1" ]; then 21 | IP="*" 22 | TOKEN="${1}" 23 | fi 24 | 25 | exec jupyter lab \ 26 | --ip="${IP}" \ 27 | --port="${PORT}" \ 28 | --ContentsManager.allow_hidden=True \ 29 | --no-browser \ 30 | --ServerApp.token="${TOKEN}" ${EXTRA_ARGS} 31 | 32 | -------------------------------------------------------------------------------- /scripts/convert_ipynb_to_py.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # show commands being executed (for debugging purposes) 4 | #set -x 5 | 6 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 7 | 8 | NOTEBOOK="${1}" 9 | if [ -z "${NOTEBOOK}" ]; then 10 | echo "Provide the notebook path" 11 | exit 1 12 | fi 13 | 14 | # capture whether notebook has a python or R kernel 15 | regex="\"file_extension\": \"(\.[a-zA-Z]+)\"\," 16 | value=`cat ${NOTEBOOK} | grep "file_extension"` 17 | if [[ $value =~ $regex ]]; then 18 | fext="${BASH_REMATCH[1]}" 19 | else 20 | echo "ERROR: file extension not found" 21 | exit 1 22 | fi 23 | 24 | # select code formatter according to file extension 25 | PIPE_CMD=("black {}") 26 | if [ "$fext" = ".r" ] || [ "$fext" = ".R" ]; then 27 | PIPE_CMD=("${SCRIPT_DIR}/styler.r {}") 28 | fi 29 | 30 | jupytext \ 31 | --sync \ 32 | --pipe "${PIPE_CMD[@]}" \ 33 | ${NOTEBOOK} 34 | 35 | -------------------------------------------------------------------------------- /environment/scripts/install_r_packages.r: -------------------------------------------------------------------------------- 1 | # This script installs R packages. When installing BiocManager, the script updates all R packages 2 | # currently installed (options update=TRUE, ask=FALSE in BiocManager::install). 
3 | 4 | 5 | default_repo <- "http://cran.us.r-project.org" 6 | 7 | # install BiocManager but do not update R packages so we keep those installed 8 | # with conda 9 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 10 | install.packages("BiocManager", repos = default_repo) 11 | } 12 | BiocManager::install(version = "3.13", update = FALSE, ask = FALSE) 13 | 14 | # styler 15 | BiocManager::install("styler", update = FALSE, ask = FALSE) 16 | 17 | # org.Hs.eg.db 18 | BiocManager::install("org.Hs.eg.db", update = FALSE, ask = FALSE) 19 | 20 | # clusterProfiler 21 | # BiocManager::install("clusterProfiler", update = FALSE, ask = FALSE) 22 | 23 | # ReactomePA 24 | # BiocManager::install("ReactomePA", update = FALSE, ask = FALSE) 25 | 26 | # library(devtools) 27 | 28 | # fgsea 29 | # install_github("ctlab/fgsea", ref="v1.17.0") -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | pull_request: 5 | types: [opened, reopened] 6 | jobs: 7 | run-linters: 8 | name: Run linters 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Check out Git repository 13 | uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: 3.9 19 | 20 | - name: Install Python dependencies 21 | run: pip install black flake8 22 | 23 | - name: Run linters 24 | uses: wearerequired/lint-action@v1 25 | with: 26 | github_token: ${{ secrets.github_token }} 27 | # Enable linters 28 | black: true 29 | flake8: true 30 | # Mark the following line true if you want linters to attempt to 31 | # autocorrect your code 32 | auto_fix: true 33 | git_name: "Greene Lab Linter" 34 | git_email: "miltondp@gmail.com" 35 | commit_message: "fix code style issues with ${linter}" 36 | 37 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/README.md: -------------------------------------------------------------------------------- 1 | # Clustermatch profiling 2 | 3 | This folder contains profiling results (with cProfile) of different 4 | optimizations of the clustermatch code. A brief description of each subfolder is 5 | below. 6 | 7 | * `05_cm_optimized`: 8 | * ARI implementation with numba 9 | * precomputing of internal partitions 10 | 11 | * `06_cm_optimized`: 12 | * cm function fully implemented in numba 13 | 14 | * `07_cm_optimized`: 15 | * cm function now supports parallelization (from numba) 16 | 17 | * `10_cm_optimized`: 18 | * optimization for computing ARI in parallel (function `cdist_parts`; a sketch of this numba pattern is shown below) 19 | * many optimizations in other functions associated with `_get_parts`, such as rank, run_quantile_clustering, etc. 20 | * the idea here is to optimize the processing of a single variable pair 21 | 22 | * `11_cm_optimized`: 23 | * after all the optimizations in `10_cm_optimized`, this is a copy of `07_cm_optimized` to check 24 | that the matrix data input still works correctly. 25 | 26 | * `12_cm_optimized`: 27 | * a copy of `11_cm_optimized` with some other optimizations. 
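28 | 
29 | The numba pattern behind these optimizations looks roughly like the sketch below.
30 | This is a minimal illustration with made-up names (`compare_parts` and
31 | `cdist_parts_sketch` are placeholders), not the actual CCC implementation, which
32 | lives in `libs/ccc/coef`:
33 | 
34 | ```python
35 | import numpy as np
36 | from numba import njit, prange
37 | 
38 | 
39 | @njit(cache=True)
40 | def compare_parts(part_i, part_j):
41 |     # stand-in for the ARI computation between two partitions
42 |     return float((part_i == part_j).sum()) / part_i.shape[0]
43 | 
44 | 
45 | @njit(cache=True, parallel=True)
46 | def cdist_parts_sketch(parts_i, parts_j):
47 |     # compare all pairs of partitions; prange runs the outer loop in
48 |     # parallel, similar in spirit to the cdist_parts function mentioned above
49 |     res = np.zeros((parts_i.shape[0], parts_j.shape[0]))
50 |     for i in prange(parts_i.shape[0]):
51 |         for j in range(parts_j.shape[0]):
52 |             res[i, j] = compare_parts(parts_i[i], parts_j[j])
53 |     return res
54 | 
55 | 
56 | # example: compare 9 partitions of 100 objects each against themselves
57 | parts = np.random.randint(0, 3, size=(9, 100))
58 | cdist_parts_sketch(parts, parts)
59 | ```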
60 | -------------------------------------------------------------------------------- /tests/test_methods.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ccc.methods import mic 4 | 5 | 6 | def test_mic_basic(): 7 | # Prepare 8 | np.random.seed(123) 9 | 10 | # two features on 100 objects (random data) 11 | feature0 = np.random.rand(100) 12 | feature1 = np.random.rand(100) 13 | 14 | # Run 15 | mic_value = mic(feature0, feature1) 16 | assert mic_value is not None 17 | assert isinstance(mic_value, float) 18 | assert 1.0 > mic_value > 0.0 19 | 20 | 21 | def test_mic_use_estimator_mic_e(): 22 | # Prepare 23 | np.random.seed(123) 24 | 25 | # two features on 100 objects (random data) 26 | feature0 = np.random.rand(100) 27 | feature1 = np.random.rand(100) 28 | 29 | # Run default estimator 30 | mic_value = mic(feature0, feature1) 31 | 32 | # Run with mic_e estimator 33 | mic_e_value = mic(feature0, feature1, estimator="mic_e") 34 | 35 | assert mic_e_value is not None 36 | assert isinstance(mic_e_value, float) 37 | assert 1.0 > mic_e_value > 0.0 38 | 39 | # make sure the estimator parameter is being used 40 | assert mic_value != mic_e_value 41 | -------------------------------------------------------------------------------- /scripts/rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Intended for internal use only, with very personalized settings. 5 | # 6 | # This script runs rsync with some common parameters to sync with a remote 7 | # machine. For instance, it checks files' hashes instead of timestamps, and 8 | # excludes some huge files that are not needed. 9 | # 10 | # It accepts one argument: the absolute path of the remote directory where 11 | # the base directory is located. 12 | 13 | GIT_ROOT_DIR=$(git rev-parse --show-toplevel) 14 | LOCAL_DIR="${GIT_ROOT_DIR}/base/" 15 | 16 | REMOTE_DIR="${1}" 17 | if [ -z "${REMOTE_DIR}" ]; then 18 | # if the remote dir is not given, use the same path as the local one 19 | REMOTE_DIR=${LOCAL_DIR} 20 | fi 21 | 22 | rsync \ 23 | -chavzP \ 24 | --stats \ 25 | --exclude 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz' \ 26 | --exclude 'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt' \ 27 | --exclude 'recount_data_prep_PLIER.*' \ 28 | --exclude 'recount2_PLIER_data.zip' \ 29 | pcgreene:${REMOTE_DIR} \ 30 | ${LOCAL_DIR} 31 | 32 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] 23 | # # Modules 24 | 25 | # %% 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 52 | 53 | 54 | # %% tags=[] 55 | # %%timeit func() 56 | func() 57 | 58 | # %% tags=[] 59 | # %%prun -s cumulative -l 20 -T 06-cm_many_genes.txt 60 | func() 61 | 62 | # %% tags=[] 63 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/04-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 134 function calls in 15.269 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.269 15.269 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.269 15.269 :1() 8 | 1 0.007 0.007 15.269 15.269 1556911885.py:1(func) 9 | 10 0.026 0.003 15.262 1.526 coef.py:266(_cm) 10 | 20 11.375 0.569 11.375 0.569 coef.py:169(_get_parts) 11 | 10 3.860 0.386 3.860 0.386 coef.py:199(cdist_parts) 12 | 30 0.001 0.000 0.001 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 15 | 10 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 16 | 10 0.000 0.000 0.000 0.000 coef.py:249(unravel_index_2d) 17 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 18 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 19 | 10 0.000 0.000 0.000 0.000 coef.py:225(get_coords_from_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/04-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 134 function calls in 0.028 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.028 0.028 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.028 0.028 :1() 8 | 1 0.000 0.000 0.028 0.028 1556911885.py:1(func) 9 | 10 0.001 0.000 0.027 0.003 coef.py:266(_cm) 10 | 10 0.019 0.002 0.019 0.002 coef.py:199(cdist_parts) 11 | 20 0.008 0.000 0.008 0.000 coef.py:169(_get_parts) 12 | 30 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 10 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:249(unravel_index_2d) 17 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 18 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 19 | 10 0.000 0.000 0.000 0.000 coef.py:225(get_coords_from_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # 
text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/00-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 18.817 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 18.817 18.817 {built-in method builtins.exec} 7 | 1 0.000 0.000 18.817 18.817 :1() 8 | 1 0.004 0.004 18.817 18.817 1556911885.py:1(func) 9 | 10 0.008 0.001 18.813 1.881 coef.py:265(_cm) 10 | 20 10.568 0.528 10.568 0.528 coef.py:169(_get_parts) 11 | 10 8.237 0.824 8.237 0.824 coef.py:198(cdist_parts) 12 | 20 0.001 0.000 0.001 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 17 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/00-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 1556911885.py:1(func) 9 | 10 0.001 0.000 0.034 0.003 coef.py:265(_cm) 10 | 10 0.024 0.002 0.024 0.002 coef.py:198(cdist_parts) 11 | 20 0.009 0.000 0.009 0.000 coef.py:169(_get_parts) 12 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 19 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/01-n_samples_large.txt: 
-------------------------------------------------------------------------------- 1 | 154 function calls in 14.819 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 14.819 14.819 {built-in method builtins.exec} 7 | 1 0.000 0.000 14.819 14.819 :1() 8 | 1 0.005 0.005 14.819 14.819 1556911885.py:1(func) 9 | 10 0.009 0.001 14.815 1.481 coef.py:265(_cm) 10 | 20 11.105 0.555 11.105 0.555 coef.py:169(_get_parts) 11 | 10 3.700 0.370 3.700 0.370 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/01-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.032 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.032 0.032 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.032 0.032 :1() 8 | 1 0.000 0.000 0.032 0.032 1556911885.py:1(func) 9 | 10 0.001 0.000 0.032 0.003 coef.py:265(_cm) 10 | 10 0.021 0.002 0.021 0.002 coef.py:198(cdist_parts) 11 | 20 0.010 0.000 0.010 0.000 coef.py:169(_get_parts) 12 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/02-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 15.669 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.669 15.669 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.669 15.669 :1() 8 | 1 0.004 0.004 15.669 15.669 1556911885.py:1(func) 9 | 10 0.010 0.001 15.665 1.566 coef.py:265(_cm) 10 | 20 11.799 0.590 11.799 0.590 coef.py:169(_get_parts) 11 | 10 3.854 0.385 3.854 0.385 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 17 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 18 | 20 
0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/02-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 1556911885.py:1(func) 9 | 10 0.001 0.000 0.034 0.003 coef.py:265(_cm) 10 | 10 0.024 0.002 0.024 0.002 coef.py:198(cdist_parts) 11 | 20 0.009 0.000 0.009 0.000 coef.py:169(_get_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/03-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 15.245 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.245 15.245 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.245 15.245 :1() 8 | 1 0.004 0.004 15.245 15.245 1556911885.py:1(func) 9 | 10 0.011 0.001 15.241 1.524 coef.py:265(_cm) 10 | 20 11.407 0.570 11.407 0.570 coef.py:169(_get_parts) 11 | 10 3.823 0.382 3.823 0.382 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/03-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.032 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.032 0.032 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.032 0.032 :1() 8 | 1 0.000 0.000 0.032 0.032 1556911885.py:1(func) 9 | 10 0.001 0.000 0.032 0.003 coef.py:265(_cm) 10 | 10 0.021 0.002 0.021 0.002 coef.py:198(cdist_parts) 11 | 20 0.010 0.001 0.010 0.001 coef.py:169(_get_parts) 12 | 10 0.000 0.000 0.000 0.000 {method 
'argmax' of 'numpy.ndarray' objects} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 19 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/04-compare_numba_ari.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # This file actually does not compare different ARI implementations. The name is kept to ease comparison with the previous runs from `05_cm_optimized` 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return coef.ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 04-cm_ari_numba.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/04-compare_numba_ari.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # This file actually does not compare different ari implementations. The name is kept to ease comparison with the previous runs from `05_cm_optimized` and `06_cm_optimized`. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return coef.ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 04-cm_ari_numba.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # Commands to publish a new package: 4 | # 5 | # rm -rf dist/ 6 | # python setup.py sdist 7 | # twine upload dist/* 8 | 9 | with open("README.md", "r") as fh: 10 | long_description = fh.read() 11 | 12 | setuptools.setup( 13 | name="ccc-coef", 14 | version="0.2.2", # remember to also update the version in libs/ccc/__init__.py 15 | author="Milton Pividori", 16 | author_email="miltondp@gmail.com", 17 | description="The Clustermatch Correlation Coefficient (CCC) is a highly-efficient, next-generation not-only-linear correlation coefficient that can work on numerical and categorical data types.", 18 | license="BSD-2-Clause Plus Patent", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/greenelab/ccc", 22 | package_dir={"": "libs"}, 23 | packages=[ 24 | "ccc/coef", 25 | "ccc/numpy", 26 | "ccc/pytorch", 27 | "ccc/scipy", 28 | "ccc/sklearn", 29 | "ccc/utils", 30 | ], 31 | python_requires=">=3.9", 32 | install_requires=[ 33 | # numpy.typing is only available in numpy>=1.21.0 34 | "numpy>=1.21.0", 35 | "scipy", 36 | "numba", 37 | ], 38 | classifiers=[ 39 | "Programming Language :: Python :: 3", 40 | "License :: OSI Approved :: BSD License", 41 | "Operating System :: OS Independent", 42 | "Development Status :: 5 - Production/Stable", 43 | "Environment :: Console", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /libs/ccc/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | General settings. This file is intended to be modified by the user. Each entry 3 | also provides an alternative way to specify its value using an environment 4 | variable. 5 | """ 6 | 7 | # Instead of changing this file, you can also use the environment variable name 8 | # specified for each entry (environment variables supersede these settings). 9 | 10 | # Specifies the main directory where all data and results generated are stored. 11 | # When setting up the environment for the first time, input data will be 12 | # automatically downloaded into a subfolder of ROOT_DIR. 13 | # 14 | # Default: if not specified (None), it defaults to the 'cm_gene_expr' subfolder 15 | # in the temporary directory of the operating system (i.e. '/tmp/cm_gene_expr' 16 | # in Unix systems). 17 | # Environment variable: CM_ROOT_DIR 18 | ROOT_DIR = None 19 | 20 | # Specifies the directory where the manuscript git repository was 21 | # cloned/downloaded. 
If None, manuscript figures and other related files will 22 | # not be generated. 23 | # 24 | # Default: None 25 | # Environment variable: CM_MANUSCRIPT_DIR 26 | MANUSCRIPT_DIR = None 27 | 28 | 29 | # 30 | # CPU usage 31 | # 32 | 33 | # Number of cores to use for general tasks. 34 | # 35 | # Default: half of available cores. 36 | # Environment variable: CM_N_JOBS 37 | N_JOBS = None 38 | 39 | # Number of cores to use for low-computational tasks (IO, etc). This number 40 | # can be greater than N_JOBS. 41 | # 42 | # Default: same as N_JOBS. 43 | # Environment variable: CM_N_JOBS_LOW 44 | N_JOBS_LOW = None 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | EXPOSE 8893/tcp 4 | 5 | ENV CODE_DIR=/opt/code 6 | ENV CM_CONDA_ENV_NAME="ccc" 7 | ENV CM_N_JOBS=1 8 | ENV CM_ROOT_DIR=/opt/data 9 | ENV CM_USER_HOME=${CM_ROOT_DIR}/user_home 10 | ENV CM_MANUSCRIPT_DIR=/opt/manuscript 11 | 12 | VOLUME ${CM_ROOT_DIR} 13 | VOLUME ${CM_MANUSCRIPT_DIR} 14 | 15 | # install GNU parallel 16 | RUN DEBIAN_FRONTEND=noninteractive apt-get update \ 17 | && apt-get install -y --no-install-recommends parallel \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # set up the ccc conda environment 22 | COPY environment/environment.yml environment/scripts/install_other_packages.sh environment/scripts/install_r_packages.r /tmp/ 23 | RUN conda env create --name ${CM_CONDA_ENV_NAME} --file /tmp/environment.yml \ 24 | && conda run -n ${CM_CONDA_ENV_NAME} --no-capture-output /bin/bash /tmp/install_other_packages.sh \ 25 | && conda clean --all --yes 26 | 27 | # activate the environment when starting bash 28 | RUN echo "conda activate ${CM_CONDA_ENV_NAME}" >> ~/.bashrc 29 | SHELL ["/bin/bash", "--login", "-c"] 30 | 31 | ENV PYTHONPATH=${CODE_DIR}/libs:${PYTHONPATH} 32 | 33 | RUN echo "Make sure packages can be loaded" 34 | RUN python -c "import papermill" 35 | 36 | COPY . ${CODE_DIR} 37 | WORKDIR ${CODE_DIR} 38 | 39 | RUN echo "Make sure modules can be loaded" 40 | RUN python -c "from ccc import conf" 41 | 42 | # setup user home directory 43 | RUN mkdir ${CM_USER_HOME} && chmod -R 0777 ${CM_USER_HOME} 44 | ENV HOME=${CM_USER_HOME} 45 | 46 | ENTRYPOINT ["/opt/code/entrypoint.sh"] 47 | CMD ["scripts/run_nbs_server.sh", "--container-mode"] 48 | 49 | -------------------------------------------------------------------------------- /nbs/run_nbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | GIT_ROOT_DIR=$(git rev-parse --show-toplevel) 5 | 6 | # This script runs a Jupyter notebook (.ipynb) from the command line using 7 | # papermill. 8 | # 9 | # This script must be run within the nbs/ folder. 10 | 11 | if [ -z "${1}" ]; then 12 | echo "Specify notebook to run" 13 | exit 1 14 | fi 15 | 16 | # If the notebook is an "output notebook" (*.run.ipynb), such as those generated 17 | # by papermill, then do not run it. 
18 | pattern="*.run.ipynb" 19 | 20 | input_notebook=$1 21 | shift 22 | 23 | if [[ $input_notebook == $pattern ]]; then 24 | echo "Not running output notebook" 25 | exit 0 26 | fi 27 | 28 | override_nbs=${CM_RUN_NBS_OVERRIDE} 29 | 30 | # if the second argument is a notebook, then it is the output 31 | # notebook filename 32 | if [[ $1 == *.ipynb ]]; then 33 | output_notebook=${input_notebook%/*}/$1 34 | shift 35 | 36 | # do not override if output was specified 37 | override_nbs=0 38 | else 39 | output_notebook="${input_notebook%.*}.run.ipynb" 40 | fi 41 | 42 | # run papermill 43 | papermill \ 44 | --log-output \ 45 | --request-save-on-cell-execute \ 46 | $@ \ 47 | $input_notebook \ 48 | $output_notebook 49 | 50 | # Convert to notebook 51 | # 52 | # This is to reduce the notebook's final size, which is huge after 53 | # running it with papermill. 54 | jupyter nbconvert --to notebook ${output_notebook} --output ${output_notebook##*/} 55 | 56 | if [ "${override_nbs}" != "0" ]; then 57 | mv $output_notebook $input_notebook 58 | bash ${GIT_ROOT_DIR}/scripts/convert_ipynb_to_py.sh ${input_notebook} 59 | fi 60 | 61 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 6815 function calls in 0.028 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.028 0.028 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.028 0.028 :1() 9 | 1 0.000 0.000 0.028 0.028 1517976664.py:1(func) 10 | 10 0.001 0.000 0.028 0.003 coef.py:275(cm) 11 | 10 0.001 0.000 0.020 0.002 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.019 0.002 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.019 0.002 coef.py:168(cdist_parts_parallel) 14 | 132 0.001 0.000 0.015 0.000 threading.py:280(wait) 15 | 540 0.015 0.000 0.015 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 65 0.000 0.000 0.014 0.000 threading.py:556(wait) 17 | 70 0.000 0.000 0.012 0.000 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.007 0.000 thread.py:155(submit) 19 | 80 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 21 | 10 0.000 0.000 0.004 0.000 coef.py:186() 22 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.004 0.000 _base.py:597() 24 | 80 0.000 0.000 0.001 0.000 _base.py:417(result) 25 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) 26 | 10 0.000 0.000 0.001 0.000 thread.py:210(shutdown) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9633 function calls in 2.469 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.469 2.469 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.469 2.469 :1() 9 | 1 0.000 0.000 2.469 2.469 1517976664.py:1(func) 10 | 10 0.003 0.000 2.469 0.247 coef.py:275(cm) 11 | 220 0.001 0.000 2.448 0.011 threading.py:280(wait) 12 | 890 2.448 0.003 2.448 0.003 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.001 0.000 1.676 0.168 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.675 0.168 
coef.py:407(cdist_func) 15 | 10 0.001 0.000 1.675 0.167 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.668 0.014 threading.py:556(wait) 17 | 100 0.001 0.000 1.667 0.017 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.781 0.007 _base.py:417(result) 19 | 30 0.000 0.000 0.780 0.026 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9632 function calls in 2.252 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.252 2.252 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.252 2.252 :1() 9 | 1 0.000 0.000 2.252 2.252 1517976664.py:1(func) 10 | 10 0.003 0.000 2.252 0.225 coef.py:275(cm) 11 | 890 2.231 0.003 2.231 0.003 {method 'acquire' of '_thread.lock' objects} 12 | 220 0.001 0.000 2.231 0.010 threading.py:280(wait) 13 | 10 0.001 0.000 1.547 0.155 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.546 0.155 coef.py:407(cdist_func) 15 | 10 0.001 0.000 1.546 0.155 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.538 0.013 threading.py:556(wait) 17 | 100 0.001 0.000 1.537 0.015 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.693 0.006 _base.py:417(result) 19 | 30 0.000 0.000 0.693 0.023 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 10 0.002 0.000 0.007 0.001 coef.py:186() 22 | 110 0.001 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.003 0.000 _base.py:597() 26 | 10 0.000 0.000 0.003 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9576 function calls in 0.069 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.069 0.069 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.068 0.068 :1() 9 | 1 0.000 0.000 0.068 0.068 1517976664.py:1(func) 10 | 10 0.001 0.000 0.068 0.007 coef.py:275(cm) 11 | 884 0.055 0.000 0.055 0.000 {method 'acquire' of '_thread.lock' objects} 12 | 223 0.001 0.000 0.054 0.000 threading.py:280(wait) 13 | 10 0.000 0.000 0.051 0.005 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.051 0.005 coef.py:407(cdist_func) 15 | 10 0.001 0.000 0.051 0.005 coef.py:168(cdist_parts_parallel) 16 | 116 0.000 0.000 0.045 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.042 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.010 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.010 0.000 _base.py:601(result_iterator) 20 | 110 0.000 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 
thread.py:174(_adjust_thread_count) 22 | 10 0.001 0.000 0.007 0.001 coef.py:186() 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.003 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9175 function calls in 0.046 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.046 0.046 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.046 0.046 :1() 9 | 1 0.000 0.000 0.046 0.046 1517976664.py:1(func) 10 | 10 0.001 0.000 0.045 0.005 coef.py:275(cm) 11 | 10 0.001 0.000 0.037 0.004 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.036 0.004 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.036 0.004 coef.py:168(cdist_parts_parallel) 14 | 203 0.001 0.000 0.030 0.000 threading.py:280(wait) 15 | 810 0.029 0.000 0.029 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 100 0.000 0.000 0.028 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.027 0.000 _base.py:201(as_completed) 18 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.001 0.000 0.006 0.001 coef.py:186() 21 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.004 0.000 _base.py:597() 24 | 110 0.000 0.000 0.002 0.000 _base.py:417(result) 25 | 110 0.000 0.000 0.002 0.000 threading.py:404(acquire) 26 | 30 0.000 0.000 0.001 0.000 _base.py:601(result_iterator) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9391 function calls in 0.062 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.062 0.062 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.062 0.062 :1() 9 | 1 0.000 0.000 0.062 0.062 1517976664.py:1(func) 10 | 10 0.001 0.000 0.062 0.006 coef.py:275(cm) 11 | 10 0.001 0.000 0.048 0.005 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.047 0.005 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.047 0.005 coef.py:168(cdist_parts_parallel) 14 | 215 0.001 0.000 0.045 0.000 threading.py:280(wait) 15 | 850 0.045 0.000 0.045 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 108 0.000 0.000 0.040 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.039 0.000 _base.py:201(as_completed) 18 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 20 | 110 0.000 0.000 0.006 0.000 _base.py:417(result) 21 | 30 0.000 0.000 0.006 0.000 _base.py:601(result_iterator) 22 | 10 0.001 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) -------------------------------------------------------------------------------- 
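These .txt listings are cProfile tables saved by IPython's %%prun cell magic; the profiling notebooks later in this document write them with, e.g., `%%prun -s cumulative -l 20 -T <output file>`. A minimal sketch of how one such listing is produced (the data shape, the body of `func`, and the output filename below are illustrative placeholders, not the notebooks' exact code):

import numpy as np
from ccc.coef import ccc

np.random.seed(0)
data = np.random.rand(10, 50)  # n_genes x n_samples (placeholder sizes)

def func():
    # same call pattern the profiling notebooks use
    return ccc(data, internal_n_clusters=range(2, 10 + 1))

# in a Jupyter/IPython cell: profile func(), sort by cumulative time,
# keep the top 20 entries, and save the table to a text file
# %%prun -s cumulative -l 20 -T profile-output.txt
# func()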
/nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9212 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.034 0.034 :1() 9 | 1 0.000 0.000 0.034 0.034 1517976664.py:1(func) 10 | 10 0.001 0.000 0.034 0.003 coef.py:275(cm) 11 | 10 0.000 0.000 0.028 0.003 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.027 0.003 coef.py:407(cdist_func) 13 | 10 0.003 0.000 0.027 0.003 coef.py:168(cdist_parts_parallel) 14 | 199 0.001 0.000 0.022 0.000 threading.py:280(wait) 15 | 802 0.022 0.000 0.022 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 106 0.000 0.000 0.021 0.000 threading.py:556(wait) 17 | 100 0.000 0.000 0.020 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.007 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.001 0.000 0.004 0.000 coef.py:186() 21 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.003 0.000 _base.py:597() 24 | 110 0.000 0.000 0.001 0.000 _base.py:417(result) 25 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) 26 | 30 0.000 0.000 0.001 0.000 _base.py:601(result_iterator) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 6936 function calls in 0.020 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.020 0.020 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.020 0.020 :1() 9 | 1 0.000 0.000 0.020 0.020 1517976664.py:1(func) 10 | 10 0.001 0.000 0.020 0.002 coef.py:275(cm) 11 | 10 0.000 0.000 0.014 0.001 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.013 0.001 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.013 0.001 coef.py:168(cdist_parts_parallel) 14 | 136 0.000 0.000 0.009 0.000 threading.py:280(wait) 15 | 554 0.009 0.000 0.009 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 71 0.000 0.000 0.009 0.000 threading.py:556(wait) 17 | 70 0.000 0.000 0.007 0.000 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.005 0.000 thread.py:155(submit) 19 | 80 0.000 0.000 0.005 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.000 0.000 0.003 0.000 coef.py:186() 21 | 30 0.000 0.000 0.003 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.003 0.000 _base.py:597() 24 | 10 0.001 0.000 0.001 0.000 parallel.py:596(get_num_threads) 25 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) 26 | 10 0.000 0.000 0.001 0.000 thread.py:210(shutdown) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9477 function calls in 0.044 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.044 0.044 {built-in 
method builtins.exec} 8 | 1 0.000 0.000 0.044 0.044 :1() 9 | 1 0.000 0.000 0.044 0.044 1517976664.py:1(func) 10 | 10 0.001 0.000 0.044 0.004 coef.py:275(cm) 11 | 10 0.000 0.000 0.034 0.003 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.033 0.003 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.033 0.003 coef.py:168(cdist_parts_parallel) 14 | 217 0.001 0.000 0.032 0.000 threading.py:280(wait) 15 | 858 0.031 0.000 0.031 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 113 0.000 0.000 0.027 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.026 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.006 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.005 0.000 _base.py:417(result) 20 | 110 0.000 0.000 0.005 0.000 thread.py:174(_adjust_thread_count) 21 | 30 0.000 0.000 0.005 0.000 _base.py:601(result_iterator) 22 | 10 0.001 0.000 0.004 0.000 coef.py:186() 23 | 30 0.000 0.000 0.003 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.002 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.002 0.000 _base.py:597() 26 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 
21 | 
22 | # %% [markdown] tags=[]
23 | # # Modules
24 | 
25 | # %% tags=[]
26 | import numpy as np
27 | 
28 | from ccc.coef import ccc
29 | 
30 | # %% [markdown] tags=[]
31 | # # Data
32 | 
33 | # %% tags=[]
34 | n_genes, n_samples = 10, 30000
35 | 
36 | # %% tags=[]
37 | np.random.seed(0)
38 | 
39 | # %% tags=[]
40 | data = np.random.rand(n_genes, n_samples)
41 | 
42 | # %% tags=[]
43 | data.shape
44 | 
45 | 
46 | # %% [markdown] tags=[]
47 | # # With default `internal_n_clusters`
48 | 
49 | # %% tags=[]
50 | def func():
51 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True)
52 | 
53 | 
54 | # %% tags=[]
55 | # %%timeit func()
56 | func()
57 | 
58 | # %% tags=[]
59 | # %%prun -s cumulative -l 20 -T 07-cm_many_samples-default_internal_n_clusters.txt
60 | func()
61 | 
62 | 
63 | # %% [markdown] tags=[]
64 | # # With reduced `internal_n_clusters`
65 | 
66 | # %% tags=[]
67 | def func():
68 |     return ccc(data, internal_n_clusters=range(2, 5 + 1), precompute_parts=True)
69 | 
70 | 
71 | # %% tags=[]
72 | # %%timeit func()
73 | func()
74 | 
75 | # %% tags=[]
76 | # %%prun -s cumulative -l 20 -T 07-cm_many_samples-less_internal_n_clusters.txt
77 | func()
78 | 
79 | # %% tags=[]
80 | 
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_large_100000.txt:
--------------------------------------------------------------------------------
1 | 9647 function calls in 5.917 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 120 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 5.917 5.917 {built-in method builtins.exec}
8 | 1 0.000 0.000 5.917 5.917 <string>:1(<module>)
9 | 1 0.000 0.000 5.917 5.917 1517976664.py:1(func)
10 | 10 0.005 0.001 5.917 0.592 coef.py:275(cm)
11 | 222 0.001 0.000 5.890 0.027 threading.py:280(wait)
12 | 894 5.889 0.007 5.889 0.007 {method 'acquire' of '_thread.lock' objects}
13 | 10 0.001 0.000 4.013 0.401 coef.py:414(compute_coef)
14 | 10 0.000 0.000 4.011 0.401 coef.py:407(cdist_func)
15 | 10 0.002 0.000 4.011 0.401 coef.py:168(cdist_parts_parallel)
16 | 120 0.000 0.000 4.002 0.033 threading.py:556(wait)
17 | 100 0.001 0.000 4.001 0.040 _base.py:201(as_completed)
18 | 110 0.000 0.000 1.888 0.017 _base.py:417(result)
19 | 30 0.000 0.000 1.888 0.063 _base.py:601(result_iterator)
20 | 110 0.001 0.000 0.010 0.000 thread.py:155(submit)
21 | 110 0.001 0.000 0.008 0.000 thread.py:174(_adjust_thread_count)
22 | 10 0.003 0.000 0.008 0.001 coef.py:186()
23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start)
24 | 10 0.000 0.000 0.005 0.000 _base.py:572(map)
25 | 10 0.000 0.000 0.005 0.000 _base.py:597()
26 | 50 0.002 0.000 0.002 0.000 {built-in method numpy.zeros}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_large_100000.txt:
--------------------------------------------------------------------------------
1 | 9634 function calls in 4.652 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 120 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 4.652 4.652 {built-in method builtins.exec}
8 | 1 0.000 0.000 4.652 4.652 <string>:1(<module>)
9 | 1 0.000 0.000 4.652 4.652 1517976664.py:1(func)
10 | 10 0.005 0.001 4.652 0.465 coef.py:275(cm)
11 | 221 0.001 0.000 4.626 0.021 threading.py:280(wait)
12 | 892 4.626 0.005 4.626 0.005 {method 'acquire' of
'_thread.lock' objects} 13 | 10 0.001 0.000 3.121 0.312 coef.py:414(compute_coef) 14 | 10 0.000 0.000 3.120 0.312 coef.py:407(cdist_func) 15 | 10 0.001 0.000 3.119 0.312 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 3.110 0.026 threading.py:556(wait) 17 | 100 0.001 0.000 3.110 0.031 _base.py:201(as_completed) 18 | 110 0.000 0.000 1.516 0.014 _base.py:417(result) 19 | 30 0.000 0.000 1.516 0.051 _base.py:601(result_iterator) 20 | 10 0.004 0.000 0.008 0.001 coef.py:186() 21 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 22 | 110 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9577 function calls in 0.083 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.083 0.083 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.083 0.083 :1() 9 | 1 0.000 0.000 0.083 0.083 1517976664.py:1(func) 10 | 10 0.001 0.000 0.083 0.008 coef.py:275(cm) 11 | 223 0.001 0.000 0.066 0.000 threading.py:280(wait) 12 | 882 0.065 0.000 0.065 0.000 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.001 0.000 0.064 0.006 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.063 0.006 coef.py:407(cdist_func) 15 | 10 0.002 0.000 0.062 0.006 coef.py:168(cdist_parts_parallel) 16 | 115 0.000 0.000 0.055 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.054 0.001 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.012 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.011 0.000 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 185 0.001 0.000 0.002 0.000 _base.py:179(_yield_finished_futures) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 7071 function calls (7061 primitive calls) in 0.123 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.123 0.123 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.123 0.123 :1() 9 | 1 0.000 0.000 0.123 0.123 1517976664.py:1(func) 10 | 10 0.001 0.000 0.123 0.012 coef.py:275(cm) 11 | 10 0.000 0.000 0.103 0.010 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.102 0.010 coef.py:407(cdist_func) 13 | 10 0.015 0.002 0.102 0.010 coef.py:168(cdist_parts_parallel) 14 | 131 0.000 0.000 0.099 0.001 threading.py:280(wait) 15 | 534 0.099 0.000 0.099 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 72 0.000 0.000 0.095 0.001 threading.py:556(wait) 17 | 70 0.000 0.000 0.074 0.001 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.025 0.000 
thread.py:155(submit) 19 | 80 0.000 0.000 0.024 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.023 0.001 threading.py:873(start) 21 | 10 0.000 0.000 0.013 0.001 coef.py:186() 22 | 10 0.000 0.000 0.013 0.001 _base.py:572(map) 23 | 10 0.000 0.000 0.013 0.001 _base.py:597() 24 | 80 0.000 0.000 0.004 0.000 _base.py:417(result) 25 | 30 0.000 0.000 0.004 0.000 _base.py:601(result_iterator) 26 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9867 function calls (9857 primitive calls) in 2.227 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.227 2.227 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.227 2.227 :1() 9 | 1 0.001 0.001 2.227 2.227 1517976664.py:1(func) 10 | 10 0.003 0.000 2.227 0.223 coef.py:275(cm) 11 | 225 0.001 0.000 2.204 0.010 threading.py:280(wait) 12 | 900 2.203 0.002 2.203 0.002 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 1.502 0.150 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.501 0.150 coef.py:407(cdist_func) 15 | 10 0.003 0.000 1.501 0.150 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.491 0.012 threading.py:556(wait) 17 | 100 0.001 0.000 1.490 0.015 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.713 0.006 _base.py:417(result) 19 | 30 0.000 0.000 0.713 0.024 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 21 | 10 0.003 0.000 0.008 0.001 coef.py:186() 22 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9146 function calls (9136 primitive calls) in 0.359 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.359 0.359 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.359 0.359 :1() 9 | 1 0.000 0.000 0.359 0.359 1517976664.py:1(func) 10 | 10 0.001 0.000 0.359 0.036 coef.py:275(cm) 11 | 10 0.000 0.000 0.331 0.033 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.330 0.033 coef.py:407(cdist_func) 13 | 10 0.050 0.005 0.330 0.033 coef.py:168(cdist_parts_parallel) 14 | 744 0.299 0.000 0.299 0.000 {method 'acquire' of '_thread.lock' objects} 15 | 184 0.001 0.000 0.298 0.002 threading.py:280(wait) 16 | 98 0.000 0.000 0.284 0.003 threading.py:556(wait) 17 | 100 0.001 0.000 0.270 0.003 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.020 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.019 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.017 0.001 threading.py:873(start) 21 | 110 0.000 0.000 0.014 0.000 _base.py:417(result) 22 | 30 0.000 0.000 0.014 0.000 _base.py:601(result_iterator) 23 | 10 0.000 0.000 0.010 0.001 _base.py:572(map) 24 | 10 0.000 0.000 0.010 0.001 _base.py:597() 25 | 10 
0.000 0.000 0.010 0.001 coef.py:186() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9875 function calls (9865 primitive calls) in 0.867 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.867 0.867 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.867 0.867 :1() 9 | 1 0.000 0.000 0.867 0.867 1517976664.py:1(func) 10 | 10 0.001 0.000 0.866 0.087 coef.py:275(cm) 11 | 226 0.001 0.000 0.845 0.004 threading.py:280(wait) 12 | 902 0.845 0.001 0.845 0.001 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 0.831 0.083 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.830 0.083 coef.py:407(cdist_func) 15 | 10 0.007 0.001 0.830 0.083 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 0.818 0.007 threading.py:556(wait) 17 | 100 0.001 0.000 0.815 0.008 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.028 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.027 0.001 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 21 | 110 0.001 0.000 0.008 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.007 0.001 coef.py:186() 23 | 30 0.000 0.000 0.006 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9462 function calls (9452 primitive calls) in 0.392 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.392 0.392 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.392 0.392 :1() 9 | 1 0.000 0.000 0.392 0.392 1517976664.py:1(func) 10 | 10 0.001 0.000 0.392 0.039 coef.py:275(cm) 11 | 10 0.000 0.000 0.363 0.036 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.362 0.036 coef.py:407(cdist_func) 13 | 10 0.043 0.004 0.362 0.036 coef.py:168(cdist_parts_parallel) 14 | 201 0.001 0.000 0.338 0.002 threading.py:280(wait) 15 | 810 0.338 0.000 0.338 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 108 0.000 0.000 0.317 0.003 threading.py:556(wait) 17 | 100 0.001 0.000 0.312 0.003 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.021 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.021 0.001 _base.py:601(result_iterator) 20 | 110 0.000 0.000 0.011 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.010 0.000 thread.py:174(_adjust_thread_count) 22 | 30 0.000 0.000 0.008 0.000 threading.py:873(start) 23 | 10 0.001 0.000 0.007 0.001 coef.py:186() 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # 
jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 10, 30000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # With default `internal_n_clusters` 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 61 | func() 62 | 63 | 64 | # %% [markdown] tags=[] 65 | # # With reduced `internal_n_clusters` 66 | 67 | # %% tags=[] 68 | def func(): 69 | n_clust = list(range(2, 5 + 1)) 70 | return ccc(data, internal_n_clusters=n_clust) 71 | 72 | 73 | # %% tags=[] 74 | # %%timeit func() 75 | func() 76 | 77 | # %% tags=[] 78 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 79 | func() 80 | 81 | # %% tags=[] 82 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 10, 30000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # With default `internal_n_clusters` 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 61 | func() 62 | 63 | 64 | # %% [markdown] tags=[] 65 | # # With reduced `internal_n_clusters` 66 | 67 | # %% tags=[] 68 | def func(): 69 | n_clust = list(range(2, 5 + 1)) 70 | return ccc(data, internal_n_clusters=n_clust) 71 | 72 | 73 | # %% tags=[] 74 | # %%timeit func() 75 | func() 76 | 77 | # %% tags=[] 78 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 79 | func() 80 | 81 | # %% tags=[] 82 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_large_100000.txt: -------------------------------------------------------------------------------- 1 | 9838 function calls (9828 primitive calls) in 4.263 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 4.263 4.263 {built-in method builtins.exec} 8 | 1 0.000 0.000 4.263 4.263 :1() 9 | 1 0.000 0.000 4.263 4.263 1517976664.py:1(func) 10 | 10 0.005 0.001 4.263 0.426 coef.py:275(cm) 11 | 221 0.001 0.000 4.234 0.019 threading.py:280(wait) 12 | 892 4.234 0.005 4.234 0.005 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 2.689 0.269 coef.py:414(compute_coef) 14 | 10 0.000 0.000 2.688 0.269 coef.py:407(cdist_func) 15 | 10 0.003 0.000 2.688 0.269 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 2.676 0.022 threading.py:556(wait) 17 | 100 0.001 0.000 2.675 0.027 _base.py:201(as_completed) 18 | 110 0.000 0.000 1.559 0.014 _base.py:417(result) 19 | 30 0.000 0.000 1.559 0.052 _base.py:601(result_iterator) 20 | 10 0.004 0.000 0.009 0.001 coef.py:186() 21 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 22 | 110 0.001 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of 
genes. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 80 | func() 81 | 82 | # %% tags=[] 83 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 80 | func() 81 | 82 | # %% tags=[] 83 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/06-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 21654 function calls in 7.834 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 64 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 7.834 7.834 {built-in method builtins.exec} 8 | 1 0.000 0.000 7.834 7.834 :1() 9 | 1 0.000 0.000 7.834 7.834 691993785.py:1(func) 10 | 10 0.016 0.002 7.834 0.783 coef.py:251(_cm) 11 | 20 0.015 0.001 3.914 0.196 coef.py:154(_get_parts) 12 | 10 3.902 0.390 3.902 0.390 coef.py:183(cdist_parts) 13 | 180 0.134 0.001 3.602 0.020 coef.py:63(run_quantile_clustering) 14 | 360 3.045 0.008 3.045 0.008 {method 'argsort' of 'numpy.ndarray' objects} 15 | 180 0.300 0.002 1.943 0.011 stats.py:8631(rankdata) 16 | 1620 0.019 0.000 1.858 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 540 0.001 0.000 1.543 0.003 fromnumeric.py:51(_wrapfunc) 18 | 180 0.000 0.000 1.528 0.008 <__array_function__ internals>:2(argsort) 19 | 180 0.000 0.000 1.528 0.008 fromnumeric.py:1006(argsort) 20 | 20 0.001 0.000 0.296 0.015 coef.py:177() 21 | 180 0.000 0.000 0.295 0.002 <__array_function__ internals>:2(unique) 22 | 180 0.000 0.000 0.295 0.002 arraysetops.py:138(unique) 23 | 180 0.007 0.000 0.294 0.002 arraysetops.py:320(_unique1d) 24 | 180 0.284 0.002 0.284 0.002 {method 'sort' of 'numpy.ndarray' objects} 25 | 180 0.052 0.000 0.052 0.000 {method 'cumsum' of 'numpy.ndarray' objects} 26 | 360 0.005 0.000 0.031 0.000 index_tricks.py:323(__getitem__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/04-cm_ari_numba.txt: -------------------------------------------------------------------------------- 1 | 592106 function calls in 53.048 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 53.048 53.048 {built-in method builtins.exec} 8 | 1 0.000 0.000 53.048 53.048 :1() 
9 | 1 0.000 0.000 53.048 53.048 4139949497.py:1(func) 10 | 1 0.024 0.024 53.048 53.048 coef.py:163(cm) 11 | 4950 0.020 0.000 52.745 0.011 distance.py:2616(cdist) 12 | 4950 0.954 0.000 52.720 0.011 distance.py:2606(_cdist_callable) 13 | 400950 51.761 0.000 51.761 0.000 metrics.py:46(adjusted_rand_index) 14 | 100 0.003 0.000 0.239 0.002 coef.py:113(_get_parts) 15 | 900 0.021 0.000 0.199 0.000 coef.py:29(run_quantile_clustering) 16 | 13950 0.032 0.000 0.158 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 900 0.026 0.000 0.125 0.000 stats.py:8631(rankdata) 18 | 3600 0.003 0.000 0.086 0.000 fromnumeric.py:51(_wrapfunc) 19 | 1800 0.002 0.000 0.083 0.000 <__array_function__ internals>:2(argsort) 20 | 1800 0.001 0.000 0.080 0.000 fromnumeric.py:1006(argsort) 21 | 1800 0.077 0.000 0.077 0.000 {method 'argsort' of 'numpy.ndarray' objects} 22 | 1800 0.015 0.000 0.039 0.000 index_tricks.py:323(__getitem__) 23 | 900 0.001 0.000 0.037 0.000 <__array_function__ internals>:2(unique) 24 | 900 0.001 0.000 0.035 0.000 arraysetops.py:138(unique) 25 | 900 0.012 0.000 0.033 0.000 arraysetops.py:320(_unique1d) 26 | 4950 0.005 0.000 0.029 0.000 <__array_function__ internals>:2(unravel_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/04-cm_ari_sklearn.txt: -------------------------------------------------------------------------------- 1 | 365055656 function calls (362649956 primitive calls) in 431.008 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 209 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 431.008 431.008 {built-in method builtins.exec} 8 | 1 0.000 0.000 431.008 431.008 :1() 9 | 1 0.000 0.000 431.008 431.008 4139949497.py:1(func) 10 | 1 0.028 0.028 431.008 431.008 coef.py:163(cm) 11 | 4950 0.022 0.000 430.705 0.087 distance.py:2616(cdist) 12 | 4950 0.881 0.000 430.677 0.087 distance.py:2606(_cdist_callable) 13 | 400950 5.395 0.000 429.789 0.001 _supervised.py:302(adjusted_rand_score) 14 | 400950 6.795 0.000 424.394 0.001 _supervised.py:154(pair_confusion_matrix) 15 | 1202850 2.658 0.000 201.502 0.000 validation.py:59(inner_f) 16 | 400950 3.032 0.000 161.605 0.000 _supervised.py:87(contingency_matrix) 17 | 15651000/14047200 17.349 0.000 126.910 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 18 | 400950 1.945 0.000 121.473 0.000 _supervised.py:32(check_clusterings) 19 | 2005650 1.842 0.000 91.816 0.000 <__array_function__ internals>:2(unique) 20 | 801900 3.315 0.000 88.057 0.000 compressed.py:588(sum) 21 | 2005650 3.214 0.000 87.180 0.000 arraysetops.py:138(unique) 22 | 2005650 30.384 0.000 81.455 0.000 arraysetops.py:320(_unique1d) 23 | 1202850 7.100 0.000 78.486 0.000 compressed.py:27(__init__) 24 | 400950 5.167 0.000 71.729 0.000 coo.py:372(tocsr) 25 | 801900 9.618 0.000 68.168 0.000 multiclass.py:186(type_of_target) 26 | 400950 2.239 0.000 61.496 0.000 base.py:968(sum) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/05-compare_precomputing_of_parts.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 
| # kernelspec:
11 | # display_name: Python 3 (ipykernel)
12 | # language: python
13 | # name: python3
14 | # ---
15 | 
16 | # %% [markdown] tags=[]
17 | # # Description
18 | 
19 | # %% [markdown]
20 | # Compares two ccc implementations: one that precomputes the internal clusterings (partitions), and the original one, which does not precompute them.
21 | 
22 | # %% [markdown]
23 | # # Modules
24 | 
25 | # %% tags=[]
26 | import numpy as np
27 | 
28 | from ccc.coef import ccc
29 | 
30 | # %% [markdown]
31 | # # Data
32 | 
33 | # %% tags=[]
34 | n_genes, n_samples = 100, 1000
35 | 
36 | # %% tags=[]
37 | np.random.seed(0)
38 | 
39 | # %% tags=[]
40 | data = np.random.rand(n_genes, n_samples)
41 | 
42 | # %% tags=[]
43 | data.shape
44 | 
45 | 
46 | # %% [markdown] tags=[]
47 | # # Improved implementation (`precompute_parts=True`)
48 | 
49 | # %% tags=[]
50 | def func():
51 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True)
52 | 
53 | 
54 | # %% tags=[]
55 | # %%timeit func()
56 | func()
57 | 
58 | # %% tags=[]
59 | # %%prun -s cumulative -l 20 -T 05-cm_precompute_parts_true.txt
60 | func()
61 | 
62 | 
63 | # %% [markdown] tags=[]
64 | # # Original implementation (`precompute_parts=False`)
65 | 
66 | # %% tags=[]
67 | def func():
68 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=False)
69 | 
70 | 
71 | # %% tags=[]
72 | # %%timeit func()
73 | func()
74 | 
75 | # %% tags=[]
76 | # %%prun -s cumulative -l 20 -T 05-cm_precompute_parts_false.txt
77 | func()
78 | 
79 | # %% tags=[]
80 | 
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/05-n_samples_small.txt:
--------------------------------------------------------------------------------
1 | 5094 function calls in 0.047 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 36 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 0.047 0.047 {built-in method builtins.exec}
8 | 1 0.000 0.000 0.047 0.047 <string>:1(<module>)
9 | 1 0.000 0.000 0.047 0.047 1556911885.py:1(func)
10 | 10 0.001 0.000 0.047 0.005 coef.py:266(_cm)
11 | 20 0.001 0.000 0.026 0.001 coef.py:170(_get_parts)
12 | 10 0.020 0.002 0.020 0.002 coef.py:199(cdist_parts)
13 | 180 0.007 0.000 0.020 0.000 coef.py:81(run_quantile_clustering)
14 | 180 0.009 0.000 0.009 0.000 coef.py:32(rank)
15 | 360 0.001 0.000 0.006 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
16 | 20 0.000 0.000 0.005 0.000 coef.py:193()
17 | 180 0.000 0.000 0.004 0.000 <__array_function__ internals>:2(unique)
18 | 180 0.000 0.000 0.004 0.000 arraysetops.py:138(unique)
19 | 180 0.002 0.000 0.003 0.000 arraysetops.py:320(_unique1d)
20 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(searchsorted)
21 | 180 0.000 0.000 0.002 0.000 fromnumeric.py:1283(searchsorted)
22 | 180 0.000 0.000 0.001 0.000 fromnumeric.py:51(_wrapfunc)
23 | 180 0.001 0.000 0.001 0.000 {method 'sort' of 'numpy.ndarray' objects}
24 | 180 0.001 0.000 0.001 0.000 {method 'searchsorted' of 'numpy.ndarray' objects}
25 | 180 0.001 0.000 0.001 0.000 {method 'argsort' of 'numpy.ndarray' objects}
26 | 230 0.000 0.000 0.000 0.000 {built-in method numpy.zeros}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/05-n_samples_large.txt:
--------------------------------------------------------------------------------
1 | 
5094 function calls in 19.355 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 36 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 19.355 19.355 {built-in method builtins.exec} 8 | 1 0.000 0.000 19.355 19.355 :1() 9 | 1 0.009 0.009 19.355 19.355 1556911885.py:1(func) 10 | 10 0.032 0.003 19.346 1.935 coef.py:266(_cm) 11 | 20 0.013 0.001 14.474 0.724 coef.py:170(_get_parts) 12 | 180 0.210 0.001 14.050 0.078 coef.py:81(run_quantile_clustering) 13 | 180 11.764 0.065 11.764 0.065 coef.py:32(rank) 14 | 10 4.839 0.484 4.839 0.484 coef.py:199(cdist_parts) 15 | 180 2.066 0.011 2.066 0.011 {method 'argsort' of 'numpy.ndarray' objects} 16 | 360 0.001 0.000 0.412 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 20 0.001 0.000 0.408 0.020 coef.py:193() 18 | 180 0.000 0.000 0.407 0.002 <__array_function__ internals>:2(unique) 19 | 180 0.001 0.000 0.406 0.002 arraysetops.py:138(unique) 20 | 180 0.013 0.000 0.405 0.002 arraysetops.py:320(_unique1d) 21 | 180 0.387 0.002 0.387 0.002 {method 'sort' of 'numpy.ndarray' objects} 22 | 180 0.001 0.000 0.007 0.000 <__array_function__ internals>:2(searchsorted) 23 | 230 0.005 0.000 0.005 0.000 {built-in method numpy.zeros} 24 | 180 0.001 0.000 0.005 0.000 fromnumeric.py:1283(searchsorted) 25 | 180 0.000 0.000 0.004 0.000 fromnumeric.py:51(_wrapfunc) 26 | 180 0.004 0.000 0.004 0.000 {method 'searchsorted' of 'numpy.ndarray' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/05-cm_precompute_parts_false.txt: -------------------------------------------------------------------------------- 1 | 91218606 function calls (81318606 primitive calls) in 121.734 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 82 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 121.734 121.734 {built-in method builtins.exec} 8 | 1 0.000 0.000 121.734 121.734 :1() 9 | 1 0.000 0.000 121.734 121.734 1745201673.py:1(func) 10 | 1 0.086 0.086 121.734 121.734 coef.py:163(cm) 11 | 4950 0.021 0.000 49.955 0.010 distance.py:2616(cdist) 12 | 4950 0.988 0.000 49.929 0.010 distance.py:2606(_cdist_callable) 13 | 400950 48.935 0.000 48.935 0.000 metrics.py:46(adjusted_rand_index) 14 | 4950 0.028 0.000 48.169 0.010 coef.py:153(_get_common_features) 15 | 9900 0.038 0.000 48.098 0.005 coef.py:149(_isempty) 16 | 9900 19.278 0.002 47.668 0.005 coef.py:150() 17 | 20695950/10795950 12.181 0.000 36.403 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 18 | 9900000 4.311 0.000 28.390 0.000 <__array_function__ internals>:2(isreal) 19 | 9900 0.247 0.000 23.440 0.002 coef.py:113(_get_parts) 20 | 89100 2.080 0.000 19.487 0.000 coef.py:29(run_quantile_clustering) 21 | 9900000 4.959 0.000 17.102 0.000 type_check.py:247(isreal) 22 | 89100 2.589 0.000 12.277 0.000 stats.py:8631(rankdata) 23 | 9900000 3.887 0.000 12.144 0.000 <__array_function__ internals>:2(imag) 24 | 356400 0.272 0.000 8.465 0.000 fromnumeric.py:51(_wrapfunc) 25 | 178200 0.152 0.000 8.200 0.000 <__array_function__ internals>:2(argsort) 26 | 178200 0.149 0.000 7.904 0.000 fromnumeric.py:1006(argsort) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/05-cm_precompute_parts_true.txt: 
-------------------------------------------------------------------------------- 1 | 592106 function calls in 50.073 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 50.073 50.073 {built-in method builtins.exec} 8 | 1 0.000 0.000 50.073 50.073 :1() 9 | 1 0.000 0.000 50.073 50.073 674090675.py:1(func) 10 | 1 0.023 0.023 50.073 50.073 coef.py:163(cm) 11 | 4950 0.018 0.000 49.776 0.010 distance.py:2616(cdist) 12 | 4950 0.953 0.000 49.753 0.010 distance.py:2606(_cdist_callable) 13 | 400950 48.794 0.000 48.794 0.000 metrics.py:46(adjusted_rand_index) 14 | 100 0.003 0.000 0.238 0.002 coef.py:113(_get_parts) 15 | 900 0.021 0.000 0.198 0.000 coef.py:29(run_quantile_clustering) 16 | 13950 0.030 0.000 0.156 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 900 0.026 0.000 0.125 0.000 stats.py:8631(rankdata) 18 | 3600 0.003 0.000 0.086 0.000 fromnumeric.py:51(_wrapfunc) 19 | 1800 0.002 0.000 0.083 0.000 <__array_function__ internals>:2(argsort) 20 | 1800 0.002 0.000 0.080 0.000 fromnumeric.py:1006(argsort) 21 | 1800 0.077 0.000 0.077 0.000 {method 'argsort' of 'numpy.ndarray' objects} 22 | 1800 0.015 0.000 0.039 0.000 index_tricks.py:323(__getitem__) 23 | 900 0.001 0.000 0.037 0.000 <__array_function__ internals>:2(unique) 24 | 900 0.002 0.000 0.035 0.000 arraysetops.py:138(unique) 25 | 900 0.012 0.000 0.032 0.000 arraysetops.py:320(_unique1d) 26 | 4950 0.005 0.000 0.027 0.000 <__array_function__ internals>:2(unravel_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/06-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 21654 function calls in 0.059 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 64 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.059 0.059 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.059 0.059 :1() 9 | 1 0.000 0.000 0.059 0.059 691993785.py:1(func) 10 | 10 0.001 0.000 0.059 0.006 coef.py:251(_cm) 11 | 20 0.001 0.000 0.036 0.002 coef.py:154(_get_parts) 12 | 180 0.006 0.000 0.032 0.000 coef.py:63(run_quantile_clustering) 13 | 180 0.005 0.000 0.022 0.000 stats.py:8631(rankdata) 14 | 10 0.022 0.002 0.022 0.002 coef.py:183(cdist_parts) 15 | 360 0.004 0.000 0.012 0.000 index_tricks.py:323(__getitem__) 16 | 1620 0.002 0.000 0.010 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 20 0.000 0.000 0.004 0.000 coef.py:177() 18 | 540 0.001 0.000 0.003 0.000 fromnumeric.py:51(_wrapfunc) 19 | 180 0.000 0.000 0.003 0.000 <__array_function__ internals>:2(unique) 20 | 180 0.000 0.000 0.003 0.000 arraysetops.py:138(unique) 21 | 360 0.001 0.000 0.003 0.000 numerictypes.py:599(find_common_type) 22 | 360 0.002 0.000 0.002 0.000 {method 'argsort' of 'numpy.ndarray' objects} 23 | 180 0.001 0.000 0.002 0.000 arraysetops.py:320(_unique1d) 24 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(searchsorted) 25 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(argsort) 26 | 360 0.001 0.000 0.002 0.000 <__array_function__ internals>:2(concatenate) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/06-cm_many_genes.txt: 
-------------------------------------------------------------------------------- 1 | 12560506 function calls in 1263.543 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 1263.543 1263.543 {built-in method builtins.exec} 8 | 1 0.000 0.000 1263.543 1263.543 :1() 9 | 1 0.000 0.000 1263.543 1263.543 674090675.py:1(func) 10 | 1 0.538 0.538 1263.543 1263.543 coef.py:163(cm) 11 | 124750 0.457 0.000 1260.921 0.010 distance.py:2616(cdist) 12 | 124750 25.373 0.000 1260.340 0.010 distance.py:2606(_cdist_callable) 13 | 10104750 1234.826 0.000 1234.826 0.000 metrics.py:46(adjusted_rand_index) 14 | 169750 0.576 0.000 1.202 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 15 | 500 0.012 0.000 1.184 0.002 coef.py:113(_get_parts) 16 | 4500 0.106 0.000 0.983 0.000 coef.py:29(run_quantile_clustering) 17 | 124750 0.126 0.000 0.681 0.000 <__array_function__ internals>:2(unravel_index) 18 | 4500 0.130 0.000 0.616 0.000 stats.py:8631(rankdata) 19 | 18000 0.013 0.000 0.429 0.000 fromnumeric.py:51(_wrapfunc) 20 | 9000 0.008 0.000 0.415 0.000 <__array_function__ internals>:2(argsort) 21 | 9000 0.007 0.000 0.401 0.000 fromnumeric.py:1006(argsort) 22 | 9000 0.384 0.000 0.384 0.000 {method 'argsort' of 'numpy.ndarray' objects} 23 | 124750 0.216 0.000 0.216 0.000 {method 'argmax' of 'numpy.ndarray' objects} 24 | 9000 0.073 0.000 0.193 0.000 index_tricks.py:323(__getitem__) 25 | 4500 0.004 0.000 0.183 0.000 <__array_function__ internals>:2(unique) 26 | 4500 0.007 0.000 0.173 0.000 arraysetops.py:138(unique) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/07-cm_many_samples-less_internal_n_clusters.txt: -------------------------------------------------------------------------------- 1 | 6641 function calls in 2.164 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.164 2.164 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.164 2.164 :1() 9 | 1 0.001 0.001 2.164 2.164 3897795364.py:1(func) 10 | 1 0.001 0.001 2.163 2.163 coef.py:163(cm) 11 | 45 0.000 0.000 1.908 0.042 distance.py:2616(cdist) 12 | 45 0.004 0.000 1.908 0.042 distance.py:2606(_cdist_callable) 13 | 720 1.904 0.003 1.904 0.003 metrics.py:46(adjusted_rand_index) 14 | 10 0.000 0.000 0.253 0.025 coef.py:113(_get_parts) 15 | 40 0.010 0.000 0.222 0.006 coef.py:29(run_quantile_clustering) 16 | 445 0.002 0.000 0.213 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 160 0.000 0.000 0.181 0.001 fromnumeric.py:51(_wrapfunc) 18 | 80 0.000 0.000 0.180 0.002 <__array_function__ internals>:2(argsort) 19 | 80 0.000 0.000 0.180 0.002 fromnumeric.py:1006(argsort) 20 | 80 0.179 0.002 0.179 0.002 {method 'argsort' of 'numpy.ndarray' objects} 21 | 40 0.022 0.001 0.121 0.003 stats.py:8631(rankdata) 22 | 40 0.000 0.000 0.030 0.001 <__array_function__ internals>:2(unique) 23 | 40 0.000 0.000 0.030 0.001 arraysetops.py:138(unique) 24 | 40 0.002 0.000 0.030 0.001 arraysetops.py:320(_unique1d) 25 | 40 0.028 0.001 0.028 0.001 {method 'sort' of 'numpy.ndarray' objects} 26 | 40 0.003 0.000 0.003 0.000 {method 'cumsum' of 'numpy.ndarray' objects} -------------------------------------------------------------------------------- 
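A consistency check on this pair of profiles (the `less_internal_n_clusters` listing above and the `default_internal_n_clusters` listing that follows): with 10 genes there are 10 * 9 / 2 = 45 gene pairs, matching the 45 `cdist` calls in both listings, and each pair compares every partition of one feature against every partition of the other. The default `internal_n_clusters` (2..10) therefore costs 9 * 9 = 81 ARI evaluations per pair, while the reduced setting (2..5) costs 4 * 4 = 16 — roughly a 5x drop in ARI calls. The measured time in `adjusted_rand_index` falls even more (15.371 s to 1.904 s), since partitions with more clusters are also individually costlier to compare. In Python:

# call counts implied by the loop structure, matching the two listings
n_pairs = 10 * 9 // 2            # 45 gene pairs
assert n_pairs * 9 * 9 == 3645   # ARI calls with internal_n_clusters = 2..10
assert n_pairs * 4 * 4 == 720    # ARI calls with internal_n_clusters = 2..5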
/nbs/others/05_clustermatch_profiling/05_cm_optimized/07-cm_many_samples-default_internal_n_clusters.txt:
--------------------------------------------------------------------------------
1 | 16016 function calls in 15.982 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 70 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 15.982 15.982 {built-in method builtins.exec}
8 | 1 0.000 0.000 15.982 15.982 <string>:1(<module>)
9 | 1 0.002 0.002 15.982 15.982 674090675.py:1(func)
10 | 1 0.001 0.001 15.980 15.980 coef.py:163(cm)
11 | 45 0.000 0.000 15.391 0.342 distance.py:2616(cdist)
12 | 45 0.020 0.000 15.390 0.342 distance.py:2606(_cdist_callable)
13 | 3645 15.371 0.004 15.371 0.004 metrics.py:46(adjusted_rand_index)
14 | 10 0.001 0.000 0.587 0.059 coef.py:113(_get_parts)
15 | 90 0.024 0.000 0.503 0.006 coef.py:29(run_quantile_clustering)
16 | 945 0.003 0.000 0.493 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function}
17 | 360 0.000 0.000 0.413 0.001 fromnumeric.py:51(_wrapfunc)
18 | 180 0.000 0.000 0.410 0.002 <__array_function__ internals>:2(argsort)
19 | 180 0.000 0.000 0.410 0.002 fromnumeric.py:1006(argsort)
20 | 180 0.409 0.002 0.409 0.002 {method 'argsort' of 'numpy.ndarray' objects}
21 | 90 0.047 0.001 0.271 0.003 stats.py:8631(rankdata)
22 | 90 0.000 0.000 0.076 0.001 <__array_function__ internals>:2(unique)
23 | 90 0.000 0.000 0.076 0.001 arraysetops.py:138(unique)
24 | 90 0.004 0.000 0.076 0.001 arraysetops.py:320(_unique1d)
25 | 90 0.071 0.001 0.071 0.001 {method 'sort' of 'numpy.ndarray' objects}
26 | 90 0.008 0.000 0.008 0.000 {method 'cumsum' of 'numpy.ndarray' objects}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/05_cm_optimized/py/04-compare_numba_ari.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | #   jupytext:
4 | #     cell_metadata_filter: all,-execution,-papermill,-trusted
5 | #     text_representation:
6 | #       extension: .py
7 | #       format_name: percent
8 | #       format_version: '1.3'
9 | #     jupytext_version: 1.11.5
10 | #   kernelspec:
11 | #     display_name: Python 3 (ipykernel)
12 | #     language: python
13 | #     name: python3
14 | # ---
15 | 
16 | # %% [markdown] tags=[]
17 | # # Description
18 | 
19 | # %% [markdown]
20 | # Compares two ccc implementations: one using the new numba-optimized adjusted Rand index (ARI), and the other using the ARI from scikit-learn.
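# %% [markdown] tags=[]
# For reference, a numba-jitted ARI boils down to compiling the classic
# contingency-table formulation of the index. The sketch below is illustrative
# only — it is not the implementation in ccc's `metrics.py`, and the names are
# invented for this example — but it computes the same quantity:

# %% tags=[]
import numpy as np
from numba import njit


@njit(cache=True)
def ari_sketch(part0, part1):
    n = part0.shape[0]
    k0, k1 = part0.max() + 1, part1.max() + 1
    # contingency table of the two integer-labeled partitions
    cont = np.zeros((k0, k1), dtype=np.int64)
    for idx in range(n):
        cont[part0[idx], part1[idx]] += 1
    # sum of C(n_ij, 2) over the cells and over the two marginals
    sum_comb = 0.0
    for i in range(k0):
        for j in range(k1):
            sum_comb += cont[i, j] * (cont[i, j] - 1) / 2.0
    sum_a = 0.0
    for i in range(k0):
        ai = cont[i, :].sum()
        sum_a += ai * (ai - 1) / 2.0
    sum_b = 0.0
    for j in range(k1):
        bj = cont[:, j].sum()
        sum_b += bj * (bj - 1) / 2.0
    expected = sum_a * sum_b / (n * (n - 1) / 2.0)
    max_index = (sum_a + sum_b) / 2.0
    if max_index == expected:  # degenerate case (e.g., single-cluster partitions)
        return 1.0
    return (sum_comb - expected) / (max_index - expected)


# %% tags=[]
# sanity check of the sketch against scikit-learn's ARI on random labelings
from sklearn.metrics import adjusted_rand_score

p0 = np.random.randint(0, 3, 100)
p1 = np.random.randint(0, 4, 100)
assert np.isclose(ari_sketch(p0, p1), adjusted_rand_score(p0, p1))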
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | return coef.ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 52 | 53 | 54 | # %% tags=[] 55 | # %%timeit func() 56 | func() 57 | 58 | # %% tags=[] 59 | # %%prun -s cumulative -l 20 -T 04-cm_ari_numba.txt 60 | func() 61 | 62 | # %% [markdown] tags=[] 63 | # # Original implementation (ARI from sklearn) 64 | 65 | # %% tags=[] 66 | from sklearn.metrics import adjusted_rand_score 67 | 68 | # %% tags=[] 69 | coef.ari = adjusted_rand_score 70 | 71 | 72 | # %% tags=[] 73 | def func(): 74 | return coef.ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 75 | 76 | 77 | # %% tags=[] 78 | # %%timeit func() 79 | func() 80 | 81 | # %% tags=[] 82 | # %%prun -s cumulative -l 20 -T 04-cm_ari_sklearn.txt 83 | func() 84 | 85 | # %% tags=[] 86 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/10-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but it computes across gene pairs instead of data matrix. 
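# %% [markdown] tags=[]
# (Added sketch) the profiled loop below fills a condensed upper-triangular vector; a hypothetical helper (not part of ccc) makes the pair-to-index mapping explicit:

# %% tags=[]
def condensed_index(n, i, j):
    # index of pair (i, j), with i < j, in the row-major upper-triangular order used below
    return n * i - i * (i + 1) // 2 + (j - i - 1)


assert condensed_index(4, 0, 1) == 0
assert condensed_index(4, 2, 3) == 5  # last of the 4 * 3 / 2 = 6 pairs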
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 71 | 72 | n_clust = list(range(2, 10 + 1)) 73 | idx = 0 74 | for i in range(data.shape[0] - 1): 75 | for j in range(i + 1, data.shape[0]): 76 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 77 | idx += 1 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 10-cm_many_genes.txt 86 | func() 87 | 88 | # %% tags=[] 89 | -------------------------------------------------------------------------------- /nbs/25_pvalue/py/00-ccc_pvalue_dist-generate-data_matrix.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Generates a distribution of pvalues under the null hypothesis of no association. 21 | # 22 | # This notebook uses a data matrix as input for CCC and parallelizes computation across gene pairs. 
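# %% [markdown] tags=[]
# (Added sketch, assuming the standard add-one permutation scheme; the internals of `pvalue_n_perms` may differ in details) conceptually, the null p-value for a pair is estimated by re-computing the coefficient on permuted copies of one feature:

# %% tags=[]
import numpy as np
from ccc.coef import ccc


def ccc_perm_pvalue(x, y, n_perms, rs):
    # add-one smoothing so the estimated p-value is never exactly zero
    obs = ccc(x, y)
    null = np.array([ccc(x, rs.permutation(y)) for _ in range(n_perms)])
    return obs, (np.sum(null >= obs) + 1) / (n_perms + 1)


ccc_perm_pvalue(np.random.rand(100), np.random.rand(100), 100, np.random.RandomState(0))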
23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | import numpy as np 29 | 30 | from ccc.coef import ccc 31 | from ccc import conf 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | rs = np.random.RandomState(0) 38 | 39 | # %% tags=[] 40 | DATA_N_OBJS, DATA_N_FEATURES = 100, 1000 41 | PVALUE_N_PERMS = 1000 42 | 43 | # %% [markdown] tags=[] 44 | # # Paths 45 | 46 | # %% tags=[] 47 | OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues" 48 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 49 | 50 | # %% tags=[] 51 | OUTPUT_DIR 52 | 53 | # %% [markdown] tags=[] 54 | # # Generate random data 55 | 56 | # %% tags=[] 57 | data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES) 58 | 59 | # %% tags=[] 60 | data.shape 61 | 62 | # %% [markdown] tags=[] 63 | # # Run CCC 64 | 65 | # %% tags=[] 66 | res = ccc( 67 | data, 68 | n_jobs=conf.GENERAL["N_JOBS"], 69 | pvalue_n_perms=PVALUE_N_PERMS, 70 | ) 71 | 72 | # %% tags=[] 73 | cm_values, cm_pvalues = res 74 | 75 | # %% tags=[] 76 | cm_values.shape 77 | 78 | # %% tags=[] 79 | cm_pvalues.shape 80 | 81 | # %% [markdown] tags=[] 82 | # # Save 83 | 84 | # %% tags=[] 85 | output_file = OUTPUT_DIR / "data_matrix-cm_values.npy" 86 | display(output_file) 87 | 88 | np.save(output_file, cm_values) 89 | 90 | # %% tags=[] 91 | output_file = OUTPUT_DIR / "data_matrix-cm_pvalues.npy" 92 | display(output_file) 93 | 94 | np.save(output_file, cm_pvalues) 95 | 96 | # %% tags=[] 97 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/08-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but with numba disabled to compare with a pure Python implementation. 21 | # 22 | # Here I had to reduce the number of `n_genes`, since it takes too much otherwise. 
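# %% [markdown] tags=[]
# (Added check) once the `%env NUMBA_DISABLE_JIT=1` cell below has run, numba should report JIT as disabled; note that numba must be imported only after the variable is set for the switch to take effect:

# %% tags=[]
# import numba
# assert numba.config.DISABLE_JIT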
23 | 24 | # %% [markdown] tags=[] 25 | # # Disable numba 26 | 27 | # %% tags=[] 28 | # %env NUMBA_DISABLE_JIT=1 29 | 30 | # %% [markdown] tags=[] 31 | # # Remove pycache dir 32 | 33 | # %% tags=[] 34 | # !echo ${CODE_DIR} 35 | 36 | # %% tags=[] 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 38 | 39 | # %% tags=[] 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 41 | 42 | # %% tags=[] 43 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 44 | 45 | # %% [markdown] tags=[] 46 | # # Modules 47 | 48 | # %% tags=[] 49 | import numpy as np 50 | 51 | from ccc.coef import ccc 52 | 53 | # %% tags=[] 54 | # let numba compile all the code before profiling 55 | ccc(np.random.rand(10), np.random.rand(10)) 56 | 57 | # %% [markdown] tags=[] 58 | # # Data 59 | 60 | # %% tags=[] 61 | n_genes, n_samples = 50, 1000 62 | 63 | # %% tags=[] 64 | np.random.seed(0) 65 | 66 | # %% tags=[] 67 | data = np.random.rand(n_genes, n_samples) 68 | 69 | # %% tags=[] 70 | data.shape 71 | 72 | 73 | # %% [markdown] tags=[] 74 | # # Profile 75 | 76 | # %% tags=[] 77 | def func(): 78 | n_clust = list(range(2, 10 + 1)) 79 | return ccc(data, internal_n_clusters=n_clust) 80 | 81 | 82 | # %% tags=[] 83 | # %%timeit func() 84 | func() 85 | 86 | # %% tags=[] 87 | # %%prun -s cumulative -l 50 -T 08-cm_many_genes.txt 88 | func() 89 | 90 | # %% [markdown] tags=[] 91 | # **CONCLUSIONS:** compared with notebook `06` (which uses 500 rows (`n_genes`) instead of the 50 used here), this run extrapolates to roughly 2.80 hours for 500 rows based on these results, whereas the numba-compiled version took ~7 minutes. 92 | 93 | # %% tags=[] 94 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of samples.
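# %% [markdown] tags=[]
# (Added sketch) the `adjusted_rand_index` call counts in the saved profiles can be predicted from the setup: every feature pair compares each clustering of one feature against each clustering of the other. The two asserted values match the `ncalls` reported in the `05_cm_optimized` profile outputs shown earlier.

# %% tags=[]
def expected_ari_calls(n_features, n_clusterings):
    n_pairs = n_features * (n_features - 1) // 2
    return n_pairs * n_clusterings**2


assert expected_ari_calls(10, 9) == 3645  # k in 2..10 -> 9 clusterings per feature
assert expected_ari_calls(10, 4) == 720  # k in 2..5 -> 4 clusterings per feature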
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 80 | func() 81 | 82 | 83 | # %% [markdown] tags=[] 84 | # # With reduced `internal_n_clusters` 85 | 86 | # %% tags=[] 87 | def func(): 88 | n_clust = list(range(2, 5 + 1)) 89 | return ccc(data, internal_n_clusters=n_clust) 90 | 91 | 92 | # %% tags=[] 93 | # %%timeit func() 94 | func() 95 | 96 | # %% tags=[] 97 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 98 | func() 99 | 100 | # %% tags=[] 101 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/08-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but with numba disabled to compare with a pure Python implementation. 21 | # 22 | # Here I had to reduce the number of `n_genes`, since it takes too much otherwise. 
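# %% [markdown] tags=[]
# (Added sketch) the reduction in `n_genes` is justified by the quadratic growth in the number of feature pairs; see the extrapolation in the conclusions at the end of this notebook.

# %% tags=[]
def n_pairs(n):
    return n * (n - 1) // 2


assert n_pairs(50) == 1225
assert n_pairs(500) == 124750  # ~101.8x more pairs than with 50 genes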
23 | 24 | # %% [markdown] tags=[] 25 | # # Disable numba 26 | 27 | # %% tags=[] 28 | # %env NUMBA_DISABLE_JIT=1 29 | 30 | # %% [markdown] tags=[] 31 | # # Remove pycache dir 32 | 33 | # %% tags=[] 34 | # !echo ${CODE_DIR} 35 | 36 | # %% tags=[] 37 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 38 | 39 | # %% tags=[] 40 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 41 | 42 | # %% tags=[] 43 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 44 | 45 | # %% [markdown] tags=[] 46 | # # Modules 47 | 48 | # %% tags=[] 49 | import numpy as np 50 | 51 | from ccc.coef import ccc 52 | 53 | # %% tags=[] 54 | # let numba compile all the code before profiling 55 | ccc(np.random.rand(10), np.random.rand(10)) 56 | 57 | # %% [markdown] tags=[] 58 | # # Data 59 | 60 | # %% tags=[] 61 | n_genes, n_samples = 50, 1000 62 | 63 | # %% tags=[] 64 | np.random.seed(0) 65 | 66 | # %% tags=[] 67 | data = np.random.rand(n_genes, n_samples) 68 | 69 | # %% tags=[] 70 | data.shape 71 | 72 | 73 | # %% [markdown] tags=[] 74 | # # Profile 75 | 76 | # %% tags=[] 77 | def func(): 78 | n_clust = list(range(2, 10 + 1)) 79 | return ccc(data, internal_n_clusters=n_clust) 80 | 81 | 82 | # %% tags=[] 83 | # %%timeit func() 84 | func() 85 | 86 | # %% tags=[] 87 | # %%prun -s cumulative -l 50 -T 08-cm_many_genes.txt 88 | func() 89 | 90 | # %% [markdown] tags=[] 91 | # **CONCLUSIONS:** compared with notebook `06` (which uses 500 rows (`n_genes`) instead of the 50 used here), this run extrapolates to roughly 2.80 hours for 500 rows based on these results, whereas the numba-compiled version took ~7 minutes. 92 | 93 | # %% tags=[] 94 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of samples.
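# %% [markdown] tags=[]
# (Added sketch) with many samples the cost shifts toward ranking and partitioning; below is a minimal, assumed-equivalent stand-in for `run_quantile_clustering` (the real one is numba-compiled inside `ccc.coef`):

# %% tags=[]
import numpy as np
from scipy import stats


def quantile_clustering_sketch(x, k):
    # rank the values, then cut the ranks into k equal-frequency bins (labels 0..k-1)
    ranks = stats.rankdata(x, "average")
    return (np.ceil(ranks / (len(x) / k)) - 1).astype(int)


quantile_clustering_sketch(np.random.rand(12), 3)  # -> three bins of four samples each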
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 80 | func() 81 | 82 | 83 | # %% [markdown] tags=[] 84 | # # With reduced `internal_n_clusters` 85 | 86 | # %% tags=[] 87 | def func(): 88 | n_clust = list(range(2, 5 + 1)) 89 | return ccc(data, internal_n_clusters=n_clust) 90 | 91 | 92 | # %% tags=[] 93 | # %%timeit func() 94 | func() 95 | 96 | # %% tags=[] 97 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 98 | func() 99 | 100 | # %% tags=[] 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD-2-Clause Plus Patent License 2 | 3 | Copyright (c) 2020-2021, Contributors & the Greene Laboratory at the University of Pennsylvania 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted 6 | provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions 9 | and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 12 | and the following disclaimer in the documentation and/or other materials provided with the 13 | distribution. 14 | 15 | Subject to the terms and conditions of this license, each copyright holder and contributor hereby 16 | grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, 17 | no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) 18 | patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this 19 | software, where such license applies only to those patent claims, already acquired or hereafter 20 | acquired, licensable by such copyright holder or contributor that are necessarily infringed by: 21 | 22 | (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable 23 | additions of contributors, in source or binary form) alone; or 24 | 25 | (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) 26 | was added by such copyright holder or contributor, if, at the time the Contribution is added, 27 | such addition causes such combination to be necessarily infringed. The patent license shall not 28 | apply to any other combinations which include the Contribution. 29 | 30 | Except as expressly stated above, no rights or licenses from any copyright holder or contributor is 31 | granted under this license, whether expressly, by implication, estoppel or otherwise. 32 | 33 | DISCLAIMER 34 | 35 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 36 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 37 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR 38 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 39 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 40 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 41 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 42 | THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/09-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `07` but with numba disabled to compare with a pure Python implementation. 21 | 22 | # %% [markdown] tags=[] 23 | # # Disable numba 24 | 25 | # %% tags=[] 26 | # %env NUMBA_DISABLE_JIT=1 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% tags=[] 32 | # !echo ${CODE_DIR} 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% tags=[] 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 39 | 40 | # %% tags=[] 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import ccc 50 | 51 | # %% tags=[] 52 | # let numba compile all the code before profiling 53 | ccc(np.random.rand(10), np.random.rand(10)) 54 | 55 | # %% [markdown] tags=[] 56 | # # Data 57 | 58 | # %% tags=[] 59 | n_genes, n_samples = 10, 30000 60 | 61 | # %% tags=[] 62 | np.random.seed(0) 63 | 64 | # %% tags=[] 65 | data = np.random.rand(n_genes, n_samples) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # With default `internal_n_clusters` 73 | 74 | # %% tags=[] 75 | def func(): 76 | n_clust = list(range(2, 10 + 1)) 77 | return ccc(data, internal_n_clusters=n_clust) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # These results are just slightly worse than the numba-compiled version (notebook `07`). 91 | 92 | # %% [markdown] tags=[] 93 | # # With reduced `internal_n_clusters` 94 | 95 | # %% tags=[] 96 | def func(): 97 | n_clust = list(range(2, 5 + 1)) 98 | return ccc(data, internal_n_clusters=n_clust) 99 | 100 | 101 | # %% tags=[] 102 | # %%timeit func() 103 | func() 104 | 105 | # %% tags=[] 106 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-less_internal_n_clusters.txt 107 | func() 108 | 109 | # %% [markdown] tags=[] 110 | # These results are slightly better than the numba-compiled version (notebook `07`), which is surprising. In the future, it would be interesting to disable threading here to get accurate profiling results to debug this issue. 
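# %% [markdown] tags=[]
# (Added sketch, untested) one way to follow this up: pin all parallelism before re-profiling, so cProfile's per-function times are not spread across worker threads.

# %% tags=[]
# n_jobs is ccc's own parameter; set_num_threads caps numba's thread pool when JIT is enabled
# from numba import set_num_threads
# set_num_threads(1)
# ccc(data, internal_n_clusters=list(range(2, 5 + 1)), n_jobs=1)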
111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /tests/test_pytorch_core.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from ccc.pytorch.core import unravel_index_2d 5 | 6 | 7 | def test_unravel_index_2d_square_simple(): 8 | shape = (2, 2) 9 | assert unravel_index_2d(0, shape) == (0, 0) 10 | assert unravel_index_2d(1, shape) == (0, 1) 11 | assert unravel_index_2d(2, shape) == (1, 0) 12 | assert unravel_index_2d(3, shape) == (1, 1) 13 | 14 | 15 | def test_unravel_index_2d_rect_simple(): 16 | shape = (2, 3) 17 | assert unravel_index_2d(0, shape) == (0, 0) 18 | assert unravel_index_2d(1, shape) == (0, 1) 19 | assert unravel_index_2d(2, shape) == (0, 2) 20 | assert unravel_index_2d(3, shape) == (1, 0) 21 | assert unravel_index_2d(4, shape) == (1, 1) 22 | assert unravel_index_2d(5, shape) == (1, 2) 23 | 24 | shape = (1, 4) 25 | assert unravel_index_2d(0, shape) == (0, 0) 26 | assert unravel_index_2d(1, shape) == (0, 1) 27 | assert unravel_index_2d(2, shape) == (0, 2) 28 | assert unravel_index_2d(3, shape) == (0, 3) 29 | 30 | shape = (4, 1) 31 | assert unravel_index_2d(0, shape) == (0, 0) 32 | assert unravel_index_2d(1, shape) == (1, 0) 33 | assert unravel_index_2d(2, shape) == (2, 0) 34 | assert unravel_index_2d(3, shape) == (3, 0) 35 | 36 | 37 | def test_unravel_index_2d_square0(): 38 | x = np.array([[0, 7], [-5, 6.999]]) 39 | x_max_idx = np.argmax(x, axis=None) 40 | assert x_max_idx == 1 41 | 42 | expected_idx = np.unravel_index(x_max_idx, x.shape) 43 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 44 | 45 | assert expected_idx == observed_idx == (0, 1) 46 | 47 | 48 | def test_unravel_index_2d_square1(): 49 | x = np.array([[0, 7], [-5, 7.01]]) 50 | x_max_idx = np.argmax(x, axis=None) 51 | assert x_max_idx == 3 52 | 53 | expected_idx = np.unravel_index(x_max_idx, x.shape) 54 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 55 | 56 | assert expected_idx == observed_idx == (1, 1) 57 | 58 | 59 | def test_unravel_index_2d_square_all_equal(): 60 | x = np.array([[7.0, 7.0], [7.0, 7.0]]) 61 | x_max_idx = np.argmax(x, axis=None) 62 | assert x_max_idx == 0 63 | 64 | expected_idx = np.unravel_index(x_max_idx, x.shape) 65 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 66 | 67 | assert expected_idx == observed_idx == (0, 0) 68 | 69 | 70 | def test_unravel_index_2d_rect(): 71 | x = np.array([[0, 7, -5.6], [8.1, 6.999, 0]]) 72 | x_max_idx = np.argmax(x, axis=None) 73 | assert x_max_idx == 3 74 | 75 | expected_idx = np.unravel_index(x_max_idx, x.shape) 76 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 77 | 78 | assert expected_idx == observed_idx == (1, 0) 79 | 80 | 81 | def test_unravel_index_index_out_of_bounds(): 82 | with pytest.raises(ValueError): 83 | unravel_index_2d(6, (2, 3)) 84 | 85 | 86 | def test_unravel_index_non_2d(): 87 | with pytest.raises(ValueError): 88 | unravel_index_2d(0, (2, 3, 4)) 89 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/11-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # 
kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but it computes across gene pairs instead of data matrix. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 71 | 72 | n_clust = list(range(2, 10 + 1)) 73 | idx = 0 74 | for i in range(data.shape[0] - 1): 75 | for j in range(i + 1, data.shape[0]): 76 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 77 | idx += 1 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 11-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # # With reduced `internal_n_clusters` 91 | 92 | # %% tags=[] 93 | def func(): 94 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 95 | 96 | n_clust = list(range(2, 5 + 1)) 97 | idx = 0 98 | for i in range(data.shape[0] - 1): 99 | for j in range(i + 1, data.shape[0]): 100 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 101 | idx += 1 102 | 103 | 104 | # %% tags=[] 105 | # %%timeit func() 106 | func() 107 | 108 | # %% tags=[] 109 | # %%prun -s cumulative -l 50 -T 11-cm_many_samples-less_internal_n_clusters.txt 110 | func() 111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/09-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `07` but with numba disabled to compare with a pure Python implementation. 
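# %% [markdown] tags=[]
# (Added sketch) `NUMBA_DISABLE_JIT=1` makes every `@njit` function run as plain Python, so cProfile can see inside it; the per-function equivalent, used by the `10_cm_optimized` notebooks, is the `.py_func` attribute of a jitted function:

# %% tags=[]
from numba import njit


@njit
def _double(v):
    return v * 2


_double(21)  # compiled path (when JIT is enabled)
assert _double.py_func(21) == 42  # original Python body, visible to profilers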
21 | 22 | # %% [markdown] tags=[] 23 | # # Disable numba 24 | 25 | # %% tags=[] 26 | # %env NUMBA_DISABLE_JIT=1 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% tags=[] 32 | # !echo ${CODE_DIR} 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% tags=[] 38 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 39 | 40 | # %% tags=[] 41 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import ccc 50 | 51 | # %% tags=[] 52 | # let numba compile all the code before profiling 53 | ccc(np.random.rand(10), np.random.rand(10)) 54 | 55 | # %% [markdown] tags=[] 56 | # # Data 57 | 58 | # %% tags=[] 59 | n_genes, n_samples = 10, 30000 60 | 61 | # %% tags=[] 62 | np.random.seed(0) 63 | 64 | # %% tags=[] 65 | data = np.random.rand(n_genes, n_samples) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # With default `internal_n_clusters` 73 | 74 | # %% tags=[] 75 | def func(): 76 | n_clust = list(range(2, 10 + 1)) 77 | return ccc(data, internal_n_clusters=n_clust) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # These results are just slightly worse than the numba-compiled version (notebook `07`). 91 | 92 | # %% [markdown] tags=[] 93 | # # With reduced `internal_n_clusters` 94 | 95 | # %% tags=[] 96 | def func(): 97 | n_clust = list(range(2, 5 + 1)) 98 | return ccc(data, internal_n_clusters=n_clust) 99 | 100 | 101 | # %% tags=[] 102 | # %%timeit func() 103 | func() 104 | 105 | # %% tags=[] 106 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-less_internal_n_clusters.txt 107 | func() 108 | 109 | # %% [markdown] tags=[] 110 | # These results are slightly better than the numba-compiled version (notebook `07`), which is surprising. In the future, it would be interesting to disable threading here to get accurate profiling results to debug this issue. 
111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/01-cdist_parts_v00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:3ca43189-f499-4016-a6b7-e0b476fcac1b.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 01-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 01-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 
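# %% [markdown] tags=[]
# (Added sketch) for context, `cdist_parts` computes a cross-matrix of ARI values between two sets of partitions; the slow reference path visible in the profiles is equivalent to scipy's `cdist` with a Python callable:

# %% tags=[]
from scipy.spatial.distance import cdist
from sklearn.metrics import adjusted_rand_score

parts_x = np.random.randint(0, 3, size=(4, 100))  # 4 candidate partitions of 100 objects
parts_y = np.random.randint(0, 3, size=(4, 100))
ari_matrix = cdist(parts_x, parts_y, metric=adjusted_rand_score)
ari_matrix.max()  # ccc keeps the maximum over all partition pairs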
125 | 126 | # %% 127 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/00-run_reference.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Creates a point of reference/comparison with non-optimized version of ccc. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% 26 | # !echo ${CODE_DIR} 27 | 28 | # %% 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import _cm 44 | 45 | # %% [markdown] tags=[] 46 | # # Settings 47 | 48 | # %% 49 | N_REPS = 10 50 | 51 | # %% tags=[] 52 | np.random.seed(0) 53 | 54 | # %% [markdown] tags=[] 55 | # # Setup 56 | 57 | # %% 58 | # let numba compile all the code before profiling 59 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 60 | 61 | # %% [markdown] tags=[] 62 | # # Run with `n_samples` small 63 | 64 | # %% 65 | N_SAMPLES = 100 66 | 67 | # %% 68 | x = np.random.rand(N_SAMPLES) 69 | y = np.random.rand(N_SAMPLES) 70 | 71 | 72 | # %% tags=[] 73 | def func(): 74 | for i in range(N_REPS): 75 | # py_func accesses the original python function, not the numba-optimized one 76 | # this is needed to be able to profile the function 77 | _cm.py_func(x, y) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit -n1 -r1 func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 20 -T 00-n_samples_small.txt 86 | func() 87 | 88 | # %% [markdown] tags=[] 89 | # The bottleneck functions are, in order of importance: 90 | # 1. `cdist_parts` 91 | # 1. `_get_parts` 92 | 93 | # %% [markdown] tags=[] 94 | # # Run with `n_samples` large 95 | 96 | # %% 97 | N_SAMPLES = 100000 98 | 99 | # %% 100 | x = np.random.rand(N_SAMPLES) 101 | y = np.random.rand(N_SAMPLES) 102 | 103 | 104 | # %% tags=[] 105 | def func(): 106 | for i in range(N_REPS): 107 | # py_func accesses the original python function, not the numba-optimized one 108 | # this is needed to be able to profile the function 109 | _cm.py_func(x, y) 110 | 111 | 112 | # %% tags=[] 113 | # %%timeit -n1 -r1 func() 114 | func() 115 | 116 | # %% tags=[] 117 | # %%prun -s cumulative -l 20 -T 00-n_samples_large.txt 118 | func() 119 | 120 | # %% [markdown] tags=[] 121 | # The bottleneck functions now are **different**, in order of importance: 122 | # 1. `_get_parts` 123 | # 1. 
`cdist_parts` 124 | 125 | # %% 126 | -------------------------------------------------------------------------------- /nbs/99_manuscript/k_max/py/01-k_max-runs.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Runs CCC with different values for parameter $k_{\mathrm{max}}$ to assess the constant baseline property empirically. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules loading 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | import pandas as pd 28 | from tqdm import tqdm 29 | 30 | from ccc import conf 31 | from ccc.coef import ccc 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | display(conf.GENERAL["N_JOBS"]) 38 | 39 | # %% tags=[] 40 | DATA_SIZES = [ 41 | 200, 42 | 600, 43 | 1800, 44 | ] 45 | 46 | # split data size in this many points 47 | K_MAX_N_SPLITS = 10 48 | 49 | # always include this value since it is the default we use in CCC 50 | DEFAULT_K_MAX = 10 51 | 52 | # N_REPS = 10 53 | 54 | # %% tags=[] 55 | np.random.seed(0) 56 | 57 | # %% [markdown] tags=[] 58 | # # Paths 59 | 60 | # %% tags=[] 61 | OUTPUT_DIR = conf.RESULTS_DIR / "k_max_test" 62 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 63 | display(OUTPUT_DIR) 64 | 65 | # %% [markdown] tags=[] 66 | # # Run 67 | 68 | # %% tags=[] 69 | # initialize (i.e., compile with numba) 70 | ccc(np.random.rand(100), np.random.rand(100)) 71 | 72 | # %% tags=[] 73 | results = pd.DataFrame(columns=["data_size", "k_max", "k_max_as_n_fraction", "coef"]) 74 | 75 | idx = 0 76 | for data_size in tqdm(DATA_SIZES): 77 | # get the values for k_max to try... 78 | k_max_splits = np.linspace(2, data_size, K_MAX_N_SPLITS) 79 | # ... but always add the default k_max used by CCC 80 | k_max_splits = [int(i) for i in np.sort(np.append(k_max_splits, DEFAULT_K_MAX))] 81 | 82 | # generate random data 83 | # TODO: if I generate normal data, what happens? 
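    # (added note, editorial) the commented-out `rand` calls below are the uniform version;
    # this run answers the TODO by drawing normally distributed inputs instead, so the
    # empirical baseline check does not hinge on a uniform marginal distribution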
84 | # d1 = np.random.rand(data_size) 85 | # d2 = np.random.rand(data_size) 86 | d1 = np.random.normal(size=data_size) 87 | d2 = np.random.normal(size=data_size) 88 | 89 | for k_max in tqdm(k_max_splits): 90 | c = ccc(d1, d2, internal_n_clusters=k_max, n_jobs=conf.GENERAL["N_JOBS"]) 91 | 92 | results.loc[idx] = [data_size, k_max, k_max / data_size, c] 93 | idx += 1 94 | 95 | # save 96 | results.to_pickle(OUTPUT_DIR / "k_max-results.pkl") 97 | 98 | # %% [markdown] tags=[] 99 | # # Check 100 | 101 | # %% tags=[] 102 | results.shape 103 | 104 | # %% tags=[] 105 | assert results.shape[0] == int(len(DATA_SIZES) * (K_MAX_N_SPLITS + 1)) 106 | 107 | # %% tags=[] 108 | results.head() 109 | 110 | # %% tags=[] 111 | -------------------------------------------------------------------------------- /tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Data used in unit tests 2 | 3 | ## Clustermatch data 4 | 5 | The `clustermatch-example-*.pkl` files were generated using the original clustermatch 6 | code (https://github.com/sinc-lab/clustermatch - Commit 8b66b3d7) plus the patch below: 7 | 8 | ```patch 9 | $ git diff 10 | diff --git a/clustermatch/cluster.py b/clustermatch/cluster.py 11 | index 9f7d06c..07e8192 100644 12 | --- a/clustermatch/cluster.py 13 | +++ b/clustermatch/cluster.py 14 | @@ -160,7 +160,7 @@ def _get_range_n_clusters(n_common_features, **kwargs): 15 | if internal_n_clusters is None: 16 | estimated_k = int(np.floor(np.sqrt(n_common_features))) 17 | estimated_k = np.min((estimated_k, 10)) 18 | - range_n_clusters = range(2, np.max((estimated_k, 3))) 19 | + range_n_clusters = range(2, np.max((estimated_k, 3))+1) 20 | elif isinstance(internal_n_clusters, (tuple, list, range)): 21 | range_n_clusters = internal_n_clusters 22 | elif isinstance(internal_n_clusters, int): 23 | @@ -211,7 +211,7 @@ def row_col_from_condensed_index(d,i): 24 | 25 | 26 | def _compute_ari(part1, part2): 27 | - if np.isnan(part1).any() or len(part1) == 0: 28 | + if np.isnan(part1).any() or np.isnan(part2).any() or len(part1) == 0 or len(part2) == 0: 29 | return 0.0 30 | 31 | return ari(part1, part2) 32 | ``` 33 | 34 | Then I moved to the git root directory and executed the following commands in ipython: 35 | 36 | ### Random data without NaN 37 | ```python 38 | from pathlib import Path 39 | 40 | import numpy as np 41 | import pandas as pd 42 | 43 | from clustermatch.cluster import calculate_simmatrix 44 | 45 | np.random.seed(0) 46 | random_data = pd.DataFrame(np.random.rand(20, 100)) 47 | 48 | OUTPUT_DIR = Path("/home/miltondp/projects/ccc/ccc/tests/data/") 49 | 50 | random_data.to_pickle(OUTPUT_DIR / "ccc-random_data-data.pkl") 51 | 52 | int_n_clusters = range(2, 10+1) 53 | cm_sim_matrix = calculate_simmatrix(random_data, internal_n_clusters=int_n_clusters, n_jobs=3) 54 | cm_sim_matrix.to_pickle(OUTPUT_DIR / "ccc-random_data-coef.pkl") 55 | ``` 56 | 57 | 58 | THIS IS WITH THE ORIGINAL DATA WITH NANS 59 | ### Tomato dataset used in the original clustermatch implementation (contains NaN) 60 | ```python 61 | from pathlib import Path 62 | 63 | import pandas as pd 64 | 65 | from clustermatch.cluster import calculate_simmatrix 66 | from clustermatch.utils.data import merge_sources 67 | 68 | data_files = ['experiments/tomato/data/real_sample.xlsx'] 69 | merged_sources, feature_names, sources_names = merge_sources(data_files) 70 | 71 | OUTPUT_DIR = Path("/home/miltondp/projects/ccc/ccc/tests/data/") 72 | 73 | merged_sources_final = merged_sources.apply(lambda x: 
pd.to_numeric(x, errors="coerce"), axis=1) 74 | merged_sources_final = merged_sources_final.dropna(how="all") 75 | merged_sources_final.to_pickle(OUTPUT_DIR / "ccc-example-data.pkl") 76 | 77 | int_n_clusters = range(2, 5) 78 | cm_sim_matrix = calculate_simmatrix(merged_sources_final, internal_n_clusters=int_n_clusters, n_jobs=3) 79 | cm_sim_matrix.to_pickle(OUTPUT_DIR / "ccc-example-coef.pkl") 80 | ``` 81 | -------------------------------------------------------------------------------- /tests/test_scipy_stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | 4 | from ccc.scipy.stats import rank 5 | 6 | 7 | def test_rank_no_duplicates(): 8 | data = np.array([0, 10, 1, 5, 7, 8, -5, -2]) 9 | 10 | expected_ranks = stats.rankdata(data, "average") 11 | observed_ranks = rank(data) 12 | 13 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 14 | 15 | 16 | def test_rank_one_duplicate_group(): 17 | data = np.array([0, 10, 1, 5, 7, 8, 1, -2]) 18 | 19 | expected_ranks = stats.rankdata(data, "average") 20 | observed_ranks = rank(data) 21 | 22 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 23 | 24 | 25 | def test_rank_one_duplicate_group_with_more_elements(): 26 | data = np.array([0, 10, 1, 1, 7, 8, 1, -2]) 27 | 28 | expected_ranks = stats.rankdata(data, "average") 29 | observed_ranks = rank(data) 30 | 31 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 32 | 33 | 34 | def test_rank_one_duplicate_group_at_beginning(): 35 | data = np.array([0, 0, 1, -10, 7, 8, 9.4, -2]) 36 | 37 | expected_ranks = stats.rankdata(data, "average") 38 | observed_ranks = rank(data) 39 | 40 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 41 | 42 | 43 | def test_rank_one_duplicate_group_at_beginning_with_more_elements(): 44 | data = np.array([0.13, 0.13, 0.13, 1, -10, 7, 8, 9.4, -2]) 45 | 46 | expected_ranks = stats.rankdata(data, "average") 47 | observed_ranks = rank(data) 48 | 49 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 50 | 51 | 52 | def test_rank_one_duplicate_group_at_beginning_are_smallest(): 53 | data = np.array([0, 10, 1.5, -99.5, -99.5, -99.5, 5, 7, 8, -5, -2]) 54 | 55 | expected_ranks = stats.rankdata(data, "average") 56 | observed_ranks = rank(data) 57 | 58 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 59 | 60 | 61 | def test_rank_one_duplicate_group_at_end(): 62 | data = np.array([0, 1, -10, 7, 8, 9.4, -2.5, -2.5]) 63 | 64 | expected_ranks = stats.rankdata(data, "average") 65 | observed_ranks = rank(data) 66 | 67 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 68 | 69 | 70 | def test_rank_one_duplicate_group_at_end_with_more_elements(): 71 | data = np.array([0, 1, -10, 7, 8, 9.4, -12.5, -12.5, -12.5]) 72 | 73 | expected_ranks = stats.rankdata(data, "average") 74 | observed_ranks = rank(data) 75 | 76 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 77 | 78 | 79 | def test_rank_one_duplicate_group_at_end_is_the_largest(): 80 | data = np.array([0, 1, -10, 7, 8, 9.4, 120.5, 120.5, 120.5]) 81 | 82 | expected_ranks = stats.rankdata(data, "average") 83 | observed_ranks = rank(data) 84 | 85 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 86 | 87 | 88 | def test_rank_all_are_duplicates(): 89 | data = np.array([1.5, 1.5, 1.5, 1.5]) 90 | 91 | expected_ranks = stats.rankdata(data, "average") 92 | observed_ranks = rank(data) 93 | 94 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 
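

def test_rank_random_ties_match_scipy():
    # Added sketch (not part of the original suite): property-style check on random draws
    # with forced ties, assuming rank() keeps scipy's "average" tie-handling as above
    rng = np.random.RandomState(0)
    for _ in range(10):
        data = rng.randint(0, 5, size=20).astype(float)
        np.testing.assert_array_equal(rank(data), stats.rankdata(data, "average"))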
95 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/04-get_parts_v00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile function `_get_parts`. 23 | # 24 | # USING _cm not _cm.py_func 25 | 26 | # %% [markdown] 27 | # 28 | 29 | # %% [markdown] tags=[] 30 | # # Remove pycache dir 31 | 32 | # %% 33 | # !echo ${CODE_DIR} 34 | 35 | # %% 36 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 37 | 38 | # %% 39 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 40 | 41 | # %% 42 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 43 | 44 | # %% [markdown] tags=[] 45 | # # Modules 46 | 47 | # %% tags=[] 48 | import numpy as np 49 | 50 | from ccc.coef import _cm 51 | 52 | # %% [markdown] tags=[] 53 | # # Settings 54 | 55 | # %% 56 | N_REPS = 10 57 | 58 | # %% tags=[] 59 | np.random.seed(0) 60 | 61 | # %% [markdown] tags=[] 62 | # # Setup 63 | 64 | # %% 65 | # let numba compile all the code before profiling 66 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 67 | 68 | # %% [markdown] tags=[] 69 | # # Run with `n_samples` small 70 | 71 | # %% 72 | N_SAMPLES = 100 73 | 74 | # %% 75 | x = np.random.rand(N_SAMPLES) 76 | y = np.random.rand(N_SAMPLES) 77 | 78 | 79 | # %% tags=[] 80 | def func(): 81 | for i in range(N_REPS): 82 | # py_func accesses the original python function, not the numba-optimized one 83 | # this is needed to be able to profile the function 84 | _cm.py_func(x, y) 85 | 86 | 87 | # %% tags=[] 88 | # %%timeit -n1 -r1 func() 89 | func() 90 | 91 | # %% tags=[] 92 | # %%prun -s cumulative -l 20 -T 04-n_samples_small.txt 93 | func() 94 | 95 | # %% [markdown] tags=[] 96 | # **No improvement** for this case with respect to reference. 97 | 98 | # %% [markdown] tags=[] 99 | # # Run with `n_samples` large 100 | 101 | # %% 102 | N_SAMPLES = 100000 103 | 104 | # %% 105 | x = np.random.rand(N_SAMPLES) 106 | y = np.random.rand(N_SAMPLES) 107 | 108 | 109 | # %% tags=[] 110 | def func(): 111 | for i in range(N_REPS): 112 | # py_func accesses the original python function, not the numba-optimized one 113 | # this is needed to be able to profile the function 114 | _cm.py_func(x, y) 115 | 116 | 117 | # %% tags=[] 118 | # %%timeit -n1 -r1 func() 119 | func() 120 | 121 | # %% tags=[] 122 | # %%prun -s cumulative -l 20 -T 04-n_samples_large.txt 123 | func() 124 | 125 | # %% [markdown] tags=[] 126 | # **No improvement** for this case. In fact, it's a bit worse compared with reference (10.568 tottime). 
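# %% [markdown] tags=[]
# (Added sketch) what `_get_parts` produces, conceptually: one quantile-based partition of the feature per candidate k (assumed behavior; the real implementation is numba-compiled):

# %% tags=[]
def get_parts_sketch(x, ks=range(2, 10 + 1)):
    # k-1 internal quantile cut points per k; np.digitize assigns labels 0..k-1
    return np.array(
        [np.digitize(x, np.quantile(x, np.linspace(0, 1, k + 1)[1:-1])) for k in ks]
    )


get_parts_sketch(x).shape  # -> (9, N_SAMPLES)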
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/02-cdist_parts_v01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:0b015079-ce2b-4e6c-b2ea-22980d3c2f7d.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 02-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 02-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 125 | # 126 | # **However**, compared with `v00` (0.370 per call), this one is slightly worse. 
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/03-cdist_parts_v02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:bee5d958-22e0-4cd2-8667-9b29973604f7.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 03-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 03-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 125 | # 126 | # **However**, compared with `v00` (0.370 per call) or `v01` (0.385), this one does not change. 
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/05-get_parts_v01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I disabled njit in `_get_parts` and `run_quantile_clustering` to be able to profile. 25 | 26 | # %% [markdown] 27 | # 28 | 29 | # %% [markdown] tags=[] 30 | # # Remove pycache dir 31 | 32 | # %% 33 | # !echo ${CODE_DIR} 34 | 35 | # %% 36 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 37 | 38 | # %% 39 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 40 | 41 | # %% 42 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 43 | 44 | # %% [markdown] tags=[] 45 | # # Modules 46 | 47 | # %% tags=[] 48 | import numpy as np 49 | 50 | from ccc.coef import _cm 51 | 52 | # %% [markdown] tags=[] 53 | # # Settings 54 | 55 | # %% 56 | N_REPS = 10 57 | 58 | # %% tags=[] 59 | np.random.seed(0) 60 | 61 | # %% [markdown] tags=[] 62 | # # Setup 63 | 64 | # %% 65 | # let numba compile all the code before profiling 66 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 67 | 68 | # %% [markdown] tags=[] 69 | # # Run with `n_samples` small 70 | 71 | # %% 72 | N_SAMPLES = 100 73 | 74 | # %% 75 | x = np.random.rand(N_SAMPLES) 76 | y = np.random.rand(N_SAMPLES) 77 | 78 | 79 | # %% tags=[] 80 | def func(): 81 | for i in range(N_REPS): 82 | # py_func accesses the original python function, not the numba-optimized one 83 | # this is needed to be able to profile the function 84 | _cm.py_func(x, y) 85 | 86 | 87 | # %% tags=[] 88 | # %%timeit -n1 -r1 func() 89 | func() 90 | 91 | # %% tags=[] 92 | # %%prun -s cumulative -l 20 -T 05-n_samples_small.txt 93 | func() 94 | 95 | # %% [markdown] tags=[] 96 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`). 97 | 98 | # %% [markdown] tags=[] 99 | # # Run with `n_samples` large 100 | 101 | # %% 102 | N_SAMPLES = 100000 103 | 104 | # %% 105 | x = np.random.rand(N_SAMPLES) 106 | y = np.random.rand(N_SAMPLES) 107 | 108 | 109 | # %% tags=[] 110 | def func(): 111 | for i in range(N_REPS): 112 | # py_func accesses the original python function, not the numba-optimized one 113 | # this is needed to be able to profile the function 114 | _cm.py_func(x, y) 115 | 116 | 117 | # %% tags=[] 118 | # %%timeit -n1 -r1 func() 119 | func() 120 | 121 | # %% tags=[] 122 | # %%prun -s cumulative -l 20 -T 05-n_samples_large.txt 123 | func() 124 | 125 | # %% [markdown] tags=[] 126 | # `rank` is the function that needs optimization.
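# %% [markdown] tags=[]
# A sketch of the kind of replacement explored in the next notebook: a simple argsort-based `rank` (illustrative, not the repo's exact implementation) against `scipy.stats.rankdata`. With continuous random data, ties are essentially absent and both agree:

# %% tags=[]
import numpy as np
from scipy.stats import rankdata


def argsort_rank(data):
    # place rank i+1 at the position of the i-th smallest value;
    # unlike rankdata's default "average" method, this does not handle ties
    ranks = np.empty(data.shape[0], dtype=np.float64)
    ranks[np.argsort(data)] = np.arange(1, data.shape[0] + 1)
    return ranks


values = np.random.rand(100000)
assert np.allclose(argsort_rank(values), rankdata(values))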
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Generates a distribution of pvalues under the null hypothesis of no association. 21 | # 22 | # This notebook uses individual gene pairs as input for CCC and parallelizes permutations. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | import numpy as np 29 | from joblib import Parallel, delayed 30 | 31 | from ccc.coef import ccc 32 | from ccc import conf 33 | 34 | # %% [markdown] tags=[] 35 | # # Settings 36 | 37 | # %% tags=[] 38 | rs = np.random.RandomState(0) 39 | 40 | # %% tags=[] 41 | N_JOBS = 1 42 | display(N_JOBS) 43 | 44 | PVALUE_N_JOBS = conf.GENERAL["N_JOBS"] 45 | display(PVALUE_N_JOBS) 46 | 47 | # %% tags=[] 48 | DATA_N_OBJS, DATA_N_FEATURES = 100, 1000 49 | PVALUE_N_PERMS = 1000 50 | 51 | # %% [markdown] tags=[] 52 | # # Paths 53 | 54 | # %% tags=[] 55 | OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues" 56 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 57 | 58 | # %% tags=[] 59 | OUTPUT_DIR 60 | 61 | # %% [markdown] tags=[] 62 | # # Generate random data 63 | 64 | # %% tags=[] 65 | data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # Run CCC 73 | 74 | # %% tags=[] 75 | def ccc_single(x, y): 76 | return ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=PVALUE_N_JOBS) 77 | 78 | 79 | # %% tags=[] 80 | results = Parallel(n_jobs=N_JOBS)( 81 | delayed(ccc_single)(data[i], data[j]) 82 | for i in range(data.shape[0] - 1) 83 | for j in range(i + 1, data.shape[0]) 84 | ) 85 | 86 | # %% tags=[] 87 | assert len(results) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2 88 | 89 | # %% tags=[] 90 | results[0] 91 | 92 | # %% tags=[] 93 | cm_values = [x[0] for x in results] 94 | 95 | # %% tags=[] 96 | cm_pvalues = [x[1] for x in results] 97 | 98 | # %% tags=[] 99 | assert len(cm_values) == len(cm_pvalues) 100 | assert len(cm_values) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2 101 | 102 | # %% tags=[] 103 | cm_values = np.array(cm_values) 104 | cm_pvalues = np.array(cm_pvalues) 105 | 106 | # %% tags=[] 107 | cm_values.shape 108 | 109 | # %% tags=[] 110 | cm_values 111 | 112 | # %% tags=[] 113 | cm_pvalues.shape 114 | 115 | # %% tags=[] 116 | cm_pvalues 117 | 118 | # %% [markdown] tags=[] 119 | # # Save 120 | 121 | # %% tags=[] 122 | output_file = OUTPUT_DIR / "gene_pairs-cm_values.npy" 123 | display(output_file) 124 | 125 | np.save(output_file, cm_values) 126 | 127 | # %% tags=[] 128 | output_file = OUTPUT_DIR / "gene_pairs-cm_pvalues.npy" 129 | display(output_file) 130 | 131 | np.save(output_file, cm_pvalues) 132 | 133 | # %% tags=[] 134 | -------------------------------------------------------------------------------- /libs/ccc/corr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions to compute different 
correlation coefficients. 3 | 4 | All correlation functions in this module are expected to have the same input and output 5 | structure: 6 | 7 | * The input is a pandas DataFrame with genes in rows (Ensembl IDs) and samples 8 | in columns. The values are gene expression data normalized with some technique, 9 | but that should not be relevant for the correlation method. No empty values 10 | are allowed. 11 | 12 | * The output is a pandas DataFrame, a symmetric correlation matrix with genes 13 | in rows and columns (Ensembl IDs), and the values are the correlation 14 | coefficients. Diagonal values are expected to be ones. 15 | """ 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.metrics import pairwise_distances 19 | 20 | 21 | def pearson(data: pd.DataFrame) -> pd.DataFrame: 22 | """ 23 | Compute the Pearson correlation coefficient. 24 | """ 25 | corr_mat = 1 - pairwise_distances(data.to_numpy(), metric="correlation", n_jobs=1) 26 | 27 | np.fill_diagonal(corr_mat, 1.0) 28 | 29 | return pd.DataFrame( 30 | corr_mat, 31 | index=data.index.copy(), 32 | columns=data.index.copy(), 33 | ) 34 | 35 | 36 | def spearman(data: pd.DataFrame) -> pd.DataFrame: 37 | """ 38 | Compute the Spearman correlation coefficient. 39 | """ 40 | # compute ranks 41 | data = data.rank(axis=1) 42 | 43 | corr_mat = 1 - pairwise_distances(data.to_numpy(), metric="correlation", n_jobs=1) 44 | 45 | np.fill_diagonal(corr_mat, 1.0) 46 | 47 | return pd.DataFrame( 48 | corr_mat, 49 | index=data.index.copy(), 50 | columns=data.index.copy(), 51 | ) 52 | 53 | 54 | def mic(data: pd.DataFrame, estimator="mic_approx", n_jobs=None) -> pd.DataFrame: 55 | """ 56 | Compute the Maximal Information Coefficient (MIC). 57 | """ 58 | from scipy.spatial.distance import squareform 59 | from minepy import pstats 60 | from ccc.methods import mic as mic_single 61 | 62 | if n_jobs is None: 63 | corr_mat = pstats( 64 | data.to_numpy(), 65 | est=estimator, 66 | )[0] 67 | 68 | corr_mat = squareform(corr_mat) 69 | else: 70 | corr_mat = pairwise_distances(data.to_numpy(), metric=mic_single, n_jobs=n_jobs) 71 | 72 | np.fill_diagonal(corr_mat, 1.0) 73 | 74 | return pd.DataFrame( 75 | corr_mat, 76 | index=data.index.copy(), 77 | columns=data.index.copy(), 78 | ) 79 | 80 | 81 | def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=1) -> pd.DataFrame: 82 | """ 83 | Compute the Clustermatch Correlation Coefficient (CCC).
84 | """ 85 | from scipy.spatial.distance import squareform 86 | from ccc.coef import ccc 87 | 88 | corr_mat = ccc( 89 | data.to_numpy(), 90 | internal_n_clusters=internal_n_clusters, 91 | n_jobs=n_jobs, 92 | ) 93 | 94 | corr_mat = squareform(corr_mat) 95 | np.fill_diagonal(corr_mat, 1.0) 96 | 97 | return pd.DataFrame( 98 | corr_mat, 99 | index=data.index.copy(), 100 | columns=data.index.copy(), 101 | ) 102 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/07-get_parts_v03.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I try a completely new `rank` function. 25 | # I'm also trying a slightly different `run_quantile_clustering`, given the changes to `rank`. 26 | # I'm also parallelizing `_get_parts` inside `_cm`. 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% 32 | # !echo ${CODE_DIR} 33 | 34 | # %% 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 39 | 40 | # %% 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import _cm 50 | 51 | # %% [markdown] tags=[] 52 | # # Settings 53 | 54 | # %% 55 | N_REPS = 10 56 | 57 | # %% tags=[] 58 | np.random.seed(0) 59 | 60 | # %% [markdown] tags=[] 61 | # # Setup 62 | 63 | # %% 64 | # let numba compile all the code before profiling 65 | _cm(np.random.rand(10), np.random.rand(10)) 66 | 67 | # %% [markdown] tags=[] 68 | # # Run with `n_samples` small 69 | 70 | # %% 71 | N_SAMPLES = 100 72 | 73 | # %% 74 | x = np.random.rand(N_SAMPLES) 75 | y = np.random.rand(N_SAMPLES) 76 | 77 | 78 | # %% tags=[] 79 | def func(): 80 | for i in range(N_REPS): 81 | # unlike the earlier notebooks in this series, `_cm` is called 82 | # directly here instead of through `py_func` 83 | _cm(x, y) 84 | 85 | 86 | # %% tags=[] 87 | # %%timeit -n1 -r4 func() 88 | func() 89 | 90 | # %% tags=[] 91 | # %%prun -s cumulative -l 20 -T 07-n_samples_small.txt 92 | func() 93 | 94 | # %% [markdown] tags=[] 95 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`).
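# %% [markdown] tags=[]
# The parallelization mentioned in the description follows numba's standard pattern: decorate with `parallel=True` and replace the outer loop's `range` with `prange`. A toy sketch of that pattern (illustrative only, not the actual `_cm`/`_get_parts` code):

# %% tags=[]
import numpy as np
from numba import njit, prange


@njit(parallel=True)
def row_stats(data):
    # each row is processed by a different thread, the same structure
    # used to compute the partitions for all features at once
    out = np.empty(data.shape[0])
    for i in prange(data.shape[0]):
        out[i] = data[i].sum()
    return out


row_stats(np.random.rand(100, 1000))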
96 | 97 | # %% [markdown] tags=[] 98 | # # Run with `n_samples` large 99 | 100 | # %% 101 | N_SAMPLES = 100000 102 | 103 | # %% 104 | x = np.random.rand(N_SAMPLES) 105 | y = np.random.rand(N_SAMPLES) 106 | 107 | 108 | # %% tags=[] 109 | def func(): 110 | for i in range(N_REPS): 111 | # unlike the earlier notebooks in this series, `_cm` is called 112 | # directly here instead of through `py_func` 113 | _cm(x, y) 114 | 115 | 116 | # %% tags=[] 117 | # %%timeit -n1 -r4 func() 118 | func() 119 | 120 | # %% tags=[] 121 | # %%prun -s cumulative -l 20 -T 07-n_samples_large.txt 122 | func() 123 | 124 | # %% [markdown] tags=[] 125 | # **Large improvement** using a new `rank` function and parallelizing the calls to `_get_parts` from `_cm`. 126 | 127 | # %% 128 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/06-get_parts_v02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I disabled njit in `_get_parts` and `run_quantile_clustering` to be able to profile. 25 | # 26 | # Here I tried `scipy.stats.rankdata` instead of the `rank` function I wrote. 27 | 28 | # %% [markdown] 29 | # 30 | 31 | # %% [markdown] tags=[] 32 | # # Remove pycache dir 33 | 34 | # %% 35 | # !echo ${CODE_DIR} 36 | 37 | # %% 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 39 | 40 | # %% 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 42 | 43 | # %% 44 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 45 | 46 | # %% [markdown] tags=[] 47 | # # Modules 48 | 49 | # %% tags=[] 50 | import numpy as np 51 | 52 | from ccc.coef import _cm 53 | 54 | # %% [markdown] tags=[] 55 | # # Settings 56 | 57 | # %% 58 | N_REPS = 10 59 | 60 | # %% tags=[] 61 | np.random.seed(0) 62 | 63 | # %% [markdown] tags=[] 64 | # # Setup 65 | 66 | # %% 67 | # let numba compile all the code before profiling 68 | _cm(np.random.rand(10), np.random.rand(10)) 69 | 70 | # %% [markdown] tags=[] 71 | # # Run with `n_samples` small 72 | 73 | # %% 74 | N_SAMPLES = 100 75 | 76 | # %% 77 | x = np.random.rand(N_SAMPLES) 78 | y = np.random.rand(N_SAMPLES) 79 | 80 | 81 | # %% tags=[] 82 | def func(): 83 | for i in range(N_REPS): 84 | # njit was disabled in `_get_parts` and `run_quantile_clustering`, so 85 | # calling `_cm` directly still lets the profiler see inside them 86 | _cm(x, y) 87 | 88 | 89 | # %% tags=[] 90 | # %%timeit -n1 -r1 func() 91 | func() 92 | 93 | # %% tags=[] 94 | # %%prun -s cumulative -l 20 -T 06-n_samples_small.txt 95 | func() 96 | 97 | # %% [markdown] tags=[] 98 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`).
99 | 100 | # %% [markdown] tags=[] 101 | # # Run with `n_samples` large 102 | 103 | # %% 104 | N_SAMPLES = 100000 105 | 106 | # %% 107 | x = np.random.rand(N_SAMPLES) 108 | y = np.random.rand(N_SAMPLES) 109 | 110 | 111 | # %% tags=[] 112 | def func(): 113 | for i in range(N_REPS): 114 | # njit was disabled in `_get_parts` and `run_quantile_clustering`, so 115 | # calling `_cm` directly still lets the profiler see inside them 116 | _cm(x, y) 117 | 118 | 119 | # %% tags=[] 120 | # %%timeit -n1 -r1 func() 121 | func() 122 | 123 | # %% tags=[] 124 | # %%prun -s cumulative -l 20 -T 06-n_samples_large.txt 125 | func() 126 | 127 | # %% [markdown] tags=[] 128 | # **Large improvement** using the `scipy.stats.rankdata` function. The current `rank` function needs optimization. 129 | 130 | # %% 131 | -------------------------------------------------------------------------------- /scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # It runs the Docker container of this project by mounting the code and 4 | # manuscript directories inside the container. This ensures that any file created 5 | # during the execution is locally available and ready to be pushed to the repo. 6 | # Plus, the code is always run inside the same environment (including the full 7 | # operating system). 8 | # 9 | # We assume the repo code is in the current directory, so the user has to make 10 | # sure this is right. 11 | 12 | # general settings 13 | DOCKER_IMAGE_NAMESPACE="miltondp" 14 | DOCKER_IMAGE_NAME="ccc" 15 | DOCKER_TAG="${CM_DOCKER_IMAGE_TAG:-latest}" 16 | DOCKER_PUBLISH_HOST="127.0.0.1" 17 | DOCKER_CONTAINER_PORT="8893" 18 | DOCKER_HOST_PORT="8893" 19 | 20 | # project-specific environment variables 21 | ROOT_DIR="${CM_ROOT_DIR}" 22 | MANUSCRIPT_DIR="${CM_MANUSCRIPT_DIR}" 23 | N_JOBS_VARNAME="CM_N_JOBS" 24 | N_JOBS=${!N_JOBS_VARNAME} 25 | 26 | # parameters parsing 27 | # read arguments 28 | POSITIONAL_ARGS=() 29 | 30 | while [[ $# -gt 0 ]]; do 31 | case $1 in 32 | --docker-args) 33 | DOCKER_ARGS="$2" 34 | shift # past argument 35 | shift # past value 36 | ;; 37 | *) 38 | POSITIONAL_ARGS+=("$1") # save positional arg 39 | shift # past argument 40 | ;; 41 | esac 42 | done 43 | 44 | set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters 45 | 46 | 47 | 48 | CODE_DIR=`pwd` 49 | 50 | # root dir 51 | if [ -z "${ROOT_DIR}" ]; then 52 | ROOT_DIR="${CODE_DIR}/base" 53 | fi 54 | 55 | # manuscript dir 56 | if [ -z "${MANUSCRIPT_DIR}" ]; then 57 | MANUSCRIPT_DIR="/tmp/${DOCKER_IMAGE_NAME}_manuscript" 58 | mkdir -p ${MANUSCRIPT_DIR} 59 | fi 60 | 61 | if [ -z "${N_JOBS}" ]; then 62 | N_JOBS=1 63 | fi 64 | 65 | echo "Configuration:" 66 | echo "  Code dir: ${CODE_DIR}" 67 | echo "  Root dir: ${ROOT_DIR}" 68 | echo "  Manuscript dir: ${MANUSCRIPT_DIR}" 69 | echo "  CPU cores: ${N_JOBS}" 70 | echo "  Docker image tag: ${DOCKER_TAG}" 71 | 72 | echo 73 | echo "Waiting 2 seconds before starting" 74 | echo 75 | sleep 2 76 | 77 | # always create data directory before running Docker 78 | mkdir -p ${ROOT_DIR} 79 | 80 | COMMAND="$@" 81 | PORT_ARG="-p ${DOCKER_PUBLISH_HOST}:${DOCKER_HOST_PORT}:${DOCKER_CONTAINER_PORT}" 82 | if [ -z "${COMMAND}" ]; then 83 | FULL_COMMAND=() 84 | else 85 | FULL_COMMAND=(/bin/bash -c "${COMMAND}") 86 | PORT_ARG="" 87 | fi 88 | 89 | echo "Full command: ${FULL_COMMAND[*]}" 90 | 91 | if [ -z "${DOCKER_ARGS}" ]; then 92 | # by default, use interactive mode (enables cancelling the run with Ctrl+C from the console) 93 | DOCKER_ARGS="-ti" 94 | fi 95 |
96 | # show commands being executed 97 | echo 98 | set -x 99 | 100 | # run 101 | docker run --rm ${PORT_ARG} ${DOCKER_ARGS} \ 102 | -e ${N_JOBS_VARNAME}=${N_JOBS} \ 103 | -e NUMBA_NUM_THREADS=${N_JOBS} \ 104 | -e MKL_NUM_THREADS=${N_JOBS} \ 105 | -e OPEN_BLAS_NUM_THREADS=${N_JOBS} \ 106 | -e NUMEXPR_NUM_THREADS=${N_JOBS} \ 107 | -e OMP_NUM_THREADS=${N_JOBS} \ 108 | -e CM_RUN_NBS_OVERRIDE=${CM_RUN_NBS_OVERRIDE:-0} \ 109 | -v "${CODE_DIR}:/opt/code" \ 110 | -v "${ROOT_DIR}:/opt/data" \ 111 | -v "${MANUSCRIPT_DIR}:/opt/manuscript" \ 112 | --user "$(id -u):$(id -g)" \ 113 | ${DOCKER_IMAGE_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_TAG} "${FULL_COMMAND[@]}" 114 | 115 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/60-time_test-1_cpu_core.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 1 CPU core. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% [markdown] tags=[] 28 | # Make sure only one core is used everywhere. 29 | 30 | # %% tags=[] 31 | # %env CM_N_JOBS=1 32 | # %env NUMBA_NUM_THREADS=1 33 | # %env MKL_NUM_THREADS=1 34 | # %env OPEN_BLAS_NUM_THREADS=1 35 | # %env NUMEXPR_NUM_THREADS=1 36 | # %env OMP_NUM_THREADS=1 37 | 38 | # %% tags=[] 39 | from time import time 40 | 41 | import numpy as np 42 | import pandas as pd 43 | from scipy.stats import pearsonr, spearmanr 44 | 45 | from ccc import conf 46 | from ccc.coef import ccc 47 | from ccc.methods import mic 48 | 49 | # %% [markdown] tags=[] 50 | # # Settings 51 | 52 | # %% tags=[] 53 | OUTPUT_FILENAME = "time_test.pkl" 54 | 55 | # %% tags=[] 56 | DATA_SIZES = [ 57 | 100, 58 | 500, 59 | 1000, 60 | 5000, 61 | 10000, 62 | 50000, 63 | 100000, 64 | 1000000, 65 | 10000000, 66 | ] 67 | 68 | N_REPS = 10 69 | 70 | # %% tags=[] 71 | np.random.seed(0) 72 | 73 | # %% [markdown] tags=[] 74 | # # Paths 75 | 76 | # %% tags=[] 77 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 78 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 79 | display(OUTPUT_DIR) 80 | 81 | # %% [markdown] tags=[] 82 | # # Functions 83 | 84 | # %% tags=[] 85 | time_results = pd.DataFrame(columns=["data_size", "method", "time", "sim"]) 86 | 87 | 88 | # %% tags=[] 89 | def run_method(func, method_name, size): 90 | n_reps = N_REPS 91 | if size < 500: 92 | n_reps = 1000 93 | 94 | for r in range(n_reps): 95 | d1 = np.random.rand(size) 96 | d2 = np.random.rand(size) 97 | 98 | start_time = time() 99 | sim = func(d1, d2) 100 | end_time = time() 101 | met_time = end_time - start_time 102 | 103 | idx = time_results.shape[0] 104 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 105 | 106 | 107 | # %% [markdown] tags=[] 108 | # # Run 109 | 110 | # %% tags=[] 111 | # initialize methods 112 | ccc(np.random.rand(100), np.random.rand(100)) 113 | 114 | # %% tags=[] 115 | for s in DATA_SIZES: 116 | print(f"Size: {s}") 117 | 118 | print(" p") 119 |
run_method(lambda x, y: pearsonr(x, y)[0], "p-1", s) 120 | 121 | print(" s") 122 | run_method(lambda x, y: spearmanr(x, y)[0], "s-1", s) 123 | 124 | print(" cm") 125 | run_method(lambda x, y: ccc(x, y), "cm-1", s) 126 | 127 | if s <= 50000: 128 | print(" mic_e") 129 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-1", s) 130 | 131 | if s <= 10000: 132 | print(" mic") 133 | run_method(lambda x, y: mic(x, y), "mic-1", s) 134 | 135 | print("Saving to pickle") 136 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 137 | 138 | print("\n") 139 | 140 | # %% [markdown] tags=[] 141 | # # Summary of results 142 | 143 | # %% tags=[] 144 | time_results.shape 145 | 146 | # %% tags=[] 147 | time_results.head() 148 | 149 | # %% tags=[] 150 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/61-time_test-3_cpu_cores.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 3 CPU cores. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | # %env CM_N_JOBS=3 29 | # %env NUMBA_NUM_THREADS=3 30 | # %env MKL_NUM_THREADS=3 31 | # %env OPEN_BLAS_NUM_THREADS=3 32 | # %env NUMEXPR_NUM_THREADS=3 33 | # %env OMP_NUM_THREADS=3 34 | 35 | # %% tags=[] 36 | import os 37 | from time import time 38 | 39 | import numpy as np 40 | import pandas as pd 41 | from scipy.stats import pearsonr, spearmanr 42 | 43 | from ccc import conf 44 | from ccc.coef import ccc 45 | from ccc.methods import mic 46 | 47 | # %% [markdown] tags=[] 48 | # # Settings 49 | 50 | # %% tags=[] 51 | N_JOBS = int(os.environ["CM_N_JOBS"]) 52 | display(N_JOBS) 53 | 54 | # %% tags=[] 55 | OUTPUT_FILENAME = "time_test.pkl" 56 | 57 | # %% tags=[] 58 | DATA_SIZES = [ 59 | 100, 60 | 500, 61 | 1000, 62 | 5000, 63 | 10000, 64 | 50000, 65 | 100000, 66 | 1000000, 67 | 10000000, 68 | ] 69 | 70 | N_REPS = 10 71 | 72 | # %% tags=[] 73 | np.random.seed(0) 74 | 75 | # %% [markdown] tags=[] 76 | # # Paths 77 | 78 | # %% tags=[] 79 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 80 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 81 | display(OUTPUT_DIR) 82 | 83 | # %% [markdown] tags=[] 84 | # # Functions 85 | 86 | # %% tags=[] 87 | # append to previous run 88 | time_results = pd.read_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 89 | 90 | # %% tags=[] 91 | time_results.shape 92 | 93 | 94 | # %% tags=[] 95 | def run_method(func, method_name, size): 96 | n_reps = N_REPS 97 | if size < 500: 98 | n_reps = 1000 99 | 100 | for r in range(n_reps): 101 | d1 = np.random.rand(size) 102 | d2 = np.random.rand(size) 103 | 104 | start_time = time() 105 | sim = func(d1, d2) 106 | end_time = time() 107 | met_time = end_time - start_time 108 | 109 | idx = time_results.shape[0] 110 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 111 | 112 | 113 | # %% [markdown] tags=[] 114 | # # Run 115 | 116 | # %% tags=[] 117 | # initialize methods
118 | ccc(np.random.rand(100), np.random.rand(100)) 119 | 120 | # %% tags=[] 121 | for s in DATA_SIZES: 122 | print(f"Size: {s}") 123 | 124 | print(" p") 125 | run_method(lambda x, y: pearsonr(x, y)[0], "p-3", s) 126 | 127 | print(" s") 128 | run_method(lambda x, y: spearmanr(x, y)[0], "s-3", s) 129 | 130 | print(" cm") 131 | run_method(lambda x, y: ccc(x, y, n_jobs=N_JOBS), "cm-3", s) 132 | 133 | if s <= 50000: 134 | print(" mic_e") 135 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-3", s) 136 | 137 | if s <= 10000: 138 | print(" mic") 139 | run_method(lambda x, y: mic(x, y), "mic-3", s) 140 | 141 | print("Saving to pickle") 142 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 143 | 144 | print("\n") 145 | 146 | # %% [markdown] tags=[] 147 | # # Summary of results 148 | 149 | # %% tags=[] 150 | time_results.shape 151 | 152 | # %% tags=[] 153 | time_results.head() 154 | 155 | # %% tags=[] 156 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/62-time_test-6_cpu_cores.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 6 CPU cores.
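# %% [markdown] tags=[]
# The `%env` pins in the next section must be set before numba and numpy are imported. A sanity check that could be added after the imports (a sketch; it assumes the installed numba version provides `get_num_threads`):

# %% tags=[]
import os

import numba

# hypothetical check cell: confirm the thread pinning took effect
assert os.environ["NUMBA_NUM_THREADS"] == "6"
assert numba.get_num_threads() == 6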
23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | # %env CM_N_JOBS=6 29 | # %env NUMBA_NUM_THREADS=6 30 | # %env MKL_NUM_THREADS=6 31 | # %env OPEN_BLAS_NUM_THREADS=6 32 | # %env NUMEXPR_NUM_THREADS=6 33 | # %env OMP_NUM_THREADS=6 34 | 35 | # %% tags=[] 36 | import os 37 | from time import time 38 | 39 | import numpy as np 40 | import pandas as pd 41 | from scipy.stats import pearsonr, spearmanr 42 | 43 | from ccc import conf 44 | from ccc.coef import ccc 45 | from ccc.methods import mic 46 | 47 | # %% [markdown] tags=[] 48 | # # Settings 49 | 50 | # %% tags=[] 51 | N_JOBS = int(os.environ["CM_N_JOBS"]) 52 | display(N_JOBS) 53 | 54 | # %% tags=[] 55 | OUTPUT_FILENAME = "time_test.pkl" 56 | 57 | # %% tags=[] 58 | DATA_SIZES = [ 59 | 100, 60 | 500, 61 | 1000, 62 | 5000, 63 | 10000, 64 | 50000, 65 | 100000, 66 | 1000000, 67 | 10000000, 68 | ] 69 | 70 | N_REPS = 10 71 | 72 | # %% tags=[] 73 | np.random.seed(0) 74 | 75 | # %% [markdown] tags=[] 76 | # # Paths 77 | 78 | # %% tags=[] 79 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 80 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 81 | display(OUTPUT_DIR) 82 | 83 | # %% [markdown] tags=[] 84 | # # Functions 85 | 86 | # %% tags=[] 87 | # append to previous run 88 | time_results = pd.read_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 89 | 90 | # %% tags=[] 91 | time_results.shape 92 | 93 | 94 | # %% tags=[] 95 | def run_method(func, method_name, size): 96 | n_reps = N_REPS 97 | if size < 500: 98 | n_reps = 1000 99 | 100 | for r in range(n_reps): 101 | d1 = np.random.rand(size) 102 | d2 = np.random.rand(size) 103 | 104 | start_time = time() 105 | sim = func(d1, d2) 106 | end_time = time() 107 | met_time = end_time - start_time 108 | 109 | idx = time_results.shape[0] 110 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 111 | 112 | 113 | # %% [markdown] tags=[] 114 | # # Run 115 | 116 | # %% tags=[] 117 | # initialize methods 118 | ccc(np.random.rand(100), np.random.rand(100)) 119 | 120 | # %% tags=[] 121 | for s in DATA_SIZES: 122 | print(f"Size: {s}") 123 | 124 | print(" p") 125 | run_method(lambda x, y: pearsonr(x, y)[0], "p-6", s) 126 | 127 | print(" s") 128 | run_method(lambda x, y: spearmanr(x, y)[0], "s-6", s) 129 | 130 | print(" cm") 131 | run_method(lambda x, y: ccc(x, y, n_jobs=N_JOBS), "cm-6", s) 132 | 133 | if s <= 50000: 134 | print(" mic_e") 135 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-6", s) 136 | 137 | if s <= 10000: 138 | print(" mic") 139 | run_method(lambda x, y: mic(x, y), "mic-6", s) 140 | 141 | print("Saving to pickle") 142 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 143 | 144 | print("\n") 145 | 146 | # %% [markdown] tags=[] 147 | # # Summary of results 148 | 149 | # %% tags=[] 150 | time_results.shape 151 | 152 | # %% tags=[] 153 | time_results.head() 154 | 155 | # %% tags=[] 156 | -------------------------------------------------------------------------------- /nbs/99_manuscript/giant/py/03_00-giant-get_gene_info.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 
18 | 19 | # %% [markdown] tags=[] 20 | # It gets all the gene pairs prioritized by different correlation coefficients and writes a file with gene ID mappings (symbols and Entrez IDs). 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | # %load_ext rpy2.ipython 27 | 28 | # %% tags=[] 29 | import pandas as pd 30 | 31 | from ccc import conf 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | DATASET_CONFIG = conf.GTEX 38 | 39 | # %% [markdown] tags=[] 40 | # # Paths 41 | 42 | # %% tags=[] 43 | INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] 44 | display(INPUT_DIR) 45 | 46 | assert INPUT_DIR.exists() 47 | 48 | # %% tags=[] 49 | OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes" 50 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 51 | display(OUTPUT_DIR) 52 | 53 | # %% [markdown] tags=[] 54 | # # Get gene entrez ids 55 | 56 | # %% tags=[] 57 | genes = set() 58 | 59 | # %% tags=[] 60 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson.pkl") 61 | _tmp0 = set(data.index.get_level_values(0)) 62 | _tmp1 = set(data.index.get_level_values(1)) 63 | genes.update(_tmp0.union(_tmp1)) 64 | display(len(genes)) 65 | 66 | # %% tags=[] 67 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl") 68 | _tmp0 = set(data.index.get_level_values(0)) 69 | _tmp1 = set(data.index.get_level_values(1)) 70 | genes.update(_tmp0.union(_tmp1)) 71 | display(len(genes)) 72 | 73 | # %% tags=[] 74 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl") 75 | _tmp0 = set(data.index.get_level_values(0)) 76 | _tmp1 = set(data.index.get_level_values(1)) 77 | genes.update(_tmp0.union(_tmp1)) 78 | display(len(genes)) 79 | 80 | # %% tags=[] 81 | data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl") 82 | _tmp0 = set(data.index.get_level_values(0)) 83 | _tmp1 = set(data.index.get_level_values(1)) 84 | genes.update(_tmp0.union(_tmp1)) 85 | display(len(genes)) 86 | 87 | # %% tags=[] 88 | data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl") 89 | _tmp0 = set(data.index.get_level_values(0)) 90 | _tmp1 = set(data.index.get_level_values(1)) 91 | genes.update(_tmp0.union(_tmp1)) 92 | display(len(genes)) 93 | 94 | # %% tags=[] 95 | genes = list(genes) 96 | assert not pd.Series(genes).isna().any() 97 | 98 | # %% tags=[] magic_args="-i genes -o symbol_to_entrezid" language="R" 99 | # library(org.Hs.eg.db) 100 | # hs <- org.Hs.eg.db 101 | # 102 | # symbol_to_entrezid <- select(hs, 103 | # keys = unlist(genes), 104 | # columns = c("ENTREZID", "SYMBOL"), 105 | # keytype = "SYMBOL") 106 | 107 | # %% tags=[] 108 | symbol_to_entrezid.shape 109 | 110 | # %% tags=[] 111 | assert symbol_to_entrezid.shape[0] == len(genes) 112 | 113 | # %% tags=[] 114 | symbol_to_entrezid.head() 115 | 116 | # %% tags=[] 117 | symbol_to_entrezid.isna().any().any() 118 | 119 | # %% tags=[] 120 | symbol_to_entrezid = symbol_to_entrezid.dropna() 121 | 122 | # %% tags=[] 123 | symbol_to_entrezid.shape 124 | 125 | # %% tags=[] 126 | assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "IFNG"].shape[0] == 1 127 | assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "RASSF2"].shape[0] == 1 128 | 129 | # %% [markdown] tags=[] 130 | # # Save 131 | 132 | # %% tags=[] 133 | symbol_to_entrezid.to_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl") 134 | 135 | # %% tags=[] 136 | -------------------------------------------------------------------------------- /tests/test_conf.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Tests the conf.py module. 3 | """ 4 | import os 5 | import sys 6 | import runpy 7 | from unittest import mock 8 | 9 | import pytest 10 | 11 | 12 | def test_conf_module_load(): 13 | from ccc import conf 14 | 15 | assert conf is not None 16 | assert conf.__file__ is not None 17 | 18 | 19 | @mock.patch.dict(os.environ, {}, clear=True) 20 | def test_conf_entries(): 21 | from ccc import conf 22 | import importlib 23 | 24 | importlib.reload(conf) 25 | 26 | assert conf.ROOT_DIR is not None 27 | assert conf.ROOT_DIR != "" 28 | 29 | assert conf.DATA_DIR is not None 30 | assert conf.DATA_DIR != "" 31 | 32 | assert conf.RESULTS_DIR is not None 33 | assert conf.RESULTS_DIR != "" 34 | 35 | assert conf.GENERAL is not None 36 | assert len(conf.GENERAL) > 0 37 | assert conf.GENERAL["N_JOBS"] is not None 38 | assert conf.GENERAL["N_JOBS"] > 0 39 | assert conf.GENERAL["N_JOBS_LOW"] is not None 40 | assert conf.GENERAL["N_JOBS_LOW"] > 0 41 | 42 | assert conf.MANUSCRIPT is not None 43 | assert "CONTENT_DIR" not in conf.MANUSCRIPT 44 | 45 | 46 | def test_conf_main(): 47 | t = runpy.run_module("ccc.conf", run_name="__main__") 48 | assert t is not None 49 | assert "print_vars" in t 50 | assert "CM_ROOT_DIR" in t["print_vars"] 51 | assert "CM_RESULTS_DIR" in t["print_vars"] 52 | assert "CM_GENERAL_N_JOBS" in t["print_vars"] 53 | 54 | 55 | @pytest.mark.skipif( 56 | sys.platform.startswith("win"), 57 | reason="exporting variables is only supported in non-Windows platforms", 58 | ) 59 | def test_conf_export_variables(): 60 | from pathlib import Path 61 | import subprocess 62 | from ccc import conf 63 | 64 | conf_filepath = Path(conf.__file__).resolve() 65 | assert conf_filepath is not None 66 | assert conf_filepath.exists() 67 | 68 | # check output 69 | r = subprocess.run(["python", conf_filepath], stdout=subprocess.PIPE) 70 | assert r is not None 71 | assert r.returncode == 0 72 | r_output = r.stdout.decode("utf-8") 73 | assert r_output is not None 74 | assert len(r_output) > 8, r_output 75 | assert r_output.count("export ") > 5 76 | 77 | # check variable 78 | r = subprocess.run( 79 | f"eval `python {conf_filepath}` && echo $CM_ROOT_DIR", 80 | shell=True, 81 | stdout=subprocess.PIPE, 82 | ) 83 | assert r is not None 84 | assert r.returncode == 0 85 | r_output = r.stdout.decode("utf-8").strip() 86 | assert r_output is not None 87 | assert len(r_output) > 8, r_output 88 | assert r_output.startswith("/") 89 | 90 | # check dict variable 91 | r = subprocess.run( 92 | f"eval `python {conf_filepath}` && echo $CM_GENERAL_N_JOBS", 93 | shell=True, 94 | stdout=subprocess.PIPE, 95 | ) 96 | assert r is not None 97 | assert r.returncode == 0 98 | r_output = r.stdout.decode("utf-8").strip() 99 | assert r_output is not None 100 | assert r_output.isdigit() 101 | assert int(r_output) > 0 102 | 103 | 104 | @mock.patch.dict(os.environ, {"CM_MANUSCRIPT_DIR": "/tmp/some/dir"}) 105 | def test_conf_with_manuscript_dir(): 106 | from ccc import conf 107 | import importlib 108 | 109 | importlib.reload(conf) 110 | 111 | assert conf.MANUSCRIPT is not None 112 | assert "CONTENT_DIR" in conf.MANUSCRIPT 113 | assert conf.MANUSCRIPT["CONTENT_DIR"] is not None 114 | assert conf.MANUSCRIPT["CONTENT_DIR"] != "" 115 | 116 | 117 | @mock.patch.dict(os.environ, {"CM_N_JOBS": ""}) 118 | def test_conf_cm_n_jobs_is_empty_string(): 119 | from ccc import conf 120 | import importlib 121 | 122 | importlib.reload(conf) 123 | 124 | assert conf.GENERAL is not None 125 | 
assert len(conf.GENERAL) > 0 126 | assert conf.GENERAL["N_JOBS"] is not None 127 | assert conf.GENERAL["N_JOBS"] > 0 128 | assert conf.GENERAL["N_JOBS_LOW"] is not None 129 | assert conf.GENERAL["N_JOBS_LOW"] > 0 130 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/04-cm_ari_numba.txt: -------------------------------------------------------------------------------- 1 | 149 function calls (143 primitive calls) in 16.228 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 52 to 50 due to restriction <50> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 16.228 16.228 {built-in method builtins.exec} 8 | 1 0.000 0.000 16.228 16.228 :1() 9 | 1 0.000 0.000 16.228 16.228 130967321.py:1(func) 10 | 1 0.000 0.000 16.228 16.228 coef.py:276(cm) 11 | 1 16.227 16.227 16.227 16.227 coef.py:208(_cm) 12 | 9 0.000 0.000 0.001 0.000 typedlist.py:341(append) 13 | 1 0.000 0.000 0.000 0.000 typedlist.py:298(_initialise_list) 14 | 2 0.000 0.000 0.000 0.000 abstract.py:60(__call__) 15 | 2 0.000 0.000 0.000 0.000 typeof.py:25(typeof) 16 | 1 0.000 0.000 0.000 0.000 typedlist.py:270(_parse_arg) 17 | 2 0.000 0.000 0.000 0.000 functools.py:872(wrapper) 18 | 1 0.000 0.000 0.000 0.000 dispatcher.py:677(typeof_pyval) 19 | 2 0.000 0.000 0.000 0.000 abstract.py:48(_intern) 20 | 1 0.000 0.000 0.000 0.000 typeof.py:257(_typeof_nb_type) 21 | 9 0.000 0.000 0.000 0.000 typedlist.py:81(_append) 22 | 2 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 23 | 1 0.000 0.000 0.000 0.000 containers.py:618(__init__) 24 | 6 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance} 25 | 2 0.000 0.000 0.000 0.000 functools.py:816(dispatch) 26 | 4/2 0.000 0.000 0.000 0.000 abstract.py:117(__hash__) 27 | 7/5 0.000 0.000 0.000 0.000 abstract.py:120(__eq__) 28 | 1 0.000 0.000 0.000 0.000 typeof.py:121(_typeof_int) 29 | 1 0.000 0.000 0.000 0.000 typedlist.py:228(__init__) 30 | 1 0.000 0.000 0.000 0.000 typedlist.py:202(__new__) 31 | 4 0.000 0.000 0.000 0.000 abc.py:117(__instancecheck__) 32 | 1 0.000 0.000 0.000 0.000 utils.py:294(bit_length) 33 | 10 0.000 0.000 0.000 0.000 serialize.py:140(_numba_unpickle) 34 | 20 0.000 0.000 0.000 0.000 typedlist.py:280(_numba_type_) 35 | 4/2 0.000 0.000 0.000 0.000 {built-in method builtins.hash} 36 | 1 0.000 0.000 0.000 0.000 functions.py:660(__init__) 37 | 1 0.000 0.000 0.000 0.000 {method 'format' of 'str' objects} 38 | 1 0.000 0.000 0.000 0.000 misc.py:47(unliteral) 39 | 4 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck} 40 | 1 0.000 0.000 0.000 0.000 typedlist.py:50(_make_list) 41 | 2 0.000 0.000 0.000 0.000 weakref.py:415(__getitem__) 42 | 2 0.000 0.000 0.000 0.000 :1() 43 | 9 0.000 0.000 0.000 0.000 typedlist.py:286(_typed) 44 | 3 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55a47df0e300} 45 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.any} 46 | 6 0.000 0.000 0.000 0.000 abstract.py:95(key) 47 | 2 0.000 0.000 0.000 0.000 abstract.py:92(__init__) 48 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr} 49 | 3 0.000 0.000 0.000 0.000 containers.py:630(key) 50 | 2 0.000 0.000 0.000 0.000 abstract.py:114(__repr__) 51 | 3 0.000 0.000 0.000 0.000 functions.py:672(key) 52 | 2 0.000 0.000 0.000 0.000 {built-in method _abc.get_cache_token} 53 | 1 0.000 0.000 0.000 0.000 typedlist.py:244() 54 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 55 | 1 0.000 0.000 0.000 0.000 {built-in method 
builtins.bin} 56 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.len} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/06-cm_many_genes.txt: -------------------------------------------------------------------------------- 1 | 149 function calls (143 primitive calls) in 934.834 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 52 to 50 due to restriction <50> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 934.834 934.834 {built-in method builtins.exec} 8 | 1 0.000 0.000 934.834 934.834 :1() 9 | 1 0.000 0.000 934.834 934.834 1750096170.py:1(func) 10 | 1 0.000 0.000 934.834 934.834 coef.py:272(cm) 11 | 1 934.834 934.834 934.834 934.834 coef.py:197(_cm) 12 | 9 0.000 0.000 0.000 0.000 typedlist.py:341(append) 13 | 1 0.000 0.000 0.000 0.000 typedlist.py:298(_initialise_list) 14 | 2 0.000 0.000 0.000 0.000 abstract.py:60(__call__) 15 | 1 0.000 0.000 0.000 0.000 typedlist.py:270(_parse_arg) 16 | 2 0.000 0.000 0.000 0.000 typeof.py:25(typeof) 17 | 2 0.000 0.000 0.000 0.000 functools.py:872(wrapper) 18 | 1 0.000 0.000 0.000 0.000 dispatcher.py:677(typeof_pyval) 19 | 2 0.000 0.000 0.000 0.000 abstract.py:48(_intern) 20 | 2 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 21 | 1 0.000 0.000 0.000 0.000 typeof.py:257(_typeof_nb_type) 22 | 9 0.000 0.000 0.000 0.000 typedlist.py:81(_append) 23 | 1 0.000 0.000 0.000 0.000 containers.py:618(__init__) 24 | 4/2 0.000 0.000 0.000 0.000 abstract.py:117(__hash__) 25 | 4/2 0.000 0.000 0.000 0.000 {built-in method builtins.hash} 26 | 6 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance} 27 | 2 0.000 0.000 0.000 0.000 functools.py:816(dispatch) 28 | 1 0.000 0.000 0.000 0.000 typeof.py:121(_typeof_int) 29 | 7/5 0.000 0.000 0.000 0.000 abstract.py:120(__eq__) 30 | 4 0.000 0.000 0.000 0.000 abc.py:117(__instancecheck__) 31 | 1 0.000 0.000 0.000 0.000 typedlist.py:228(__init__) 32 | 1 0.000 0.000 0.000 0.000 typedlist.py:202(__new__) 33 | 1 0.000 0.000 0.000 0.000 utils.py:294(bit_length) 34 | 10 0.000 0.000 0.000 0.000 serialize.py:140(_numba_unpickle) 35 | 4 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck} 36 | 1 0.000 0.000 0.000 0.000 typedlist.py:50(_make_list) 37 | 1 0.000 0.000 0.000 0.000 functions.py:660(__init__) 38 | 1 0.000 0.000 0.000 0.000 {method 'format' of 'str' objects} 39 | 20 0.000 0.000 0.000 0.000 typedlist.py:280(_numba_type_) 40 | 1 0.000 0.000 0.000 0.000 misc.py:47(unliteral) 41 | 2 0.000 0.000 0.000 0.000 weakref.py:415(__getitem__) 42 | 2 0.000 0.000 0.000 0.000 :1() 43 | 9 0.000 0.000 0.000 0.000 typedlist.py:286(_typed) 44 | 3 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55c573e05300} 45 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.any} 46 | 2 0.000 0.000 0.000 0.000 abstract.py:92(__init__) 47 | 6 0.000 0.000 0.000 0.000 abstract.py:95(key) 48 | 3 0.000 0.000 0.000 0.000 containers.py:630(key) 49 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 50 | 2 0.000 0.000 0.000 0.000 abstract.py:114(__repr__) 51 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr} 52 | 2 0.000 0.000 0.000 0.000 {built-in method _abc.get_cache_token} 53 | 3 0.000 0.000 0.000 0.000 functions.py:672(key) 54 | 1 0.000 0.000 0.000 0.000 typedlist.py:244() 55 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 56 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.bin} 
--------------------------------------------------------------------------------